Commit | Line | Data |
---|---|---|
b19b3632 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import be.nikiroo.gofetch.data.Story; | |
4 | import be.nikiroo.gofetch.data.Comment; | |
5 | ||
6 | import java.io.IOException; | |
7 | import java.io.UnsupportedEncodingException; | |
8 | import java.net.URL; | |
9 | import java.net.URLDecoder; | |
10 | import java.util.AbstractMap; | |
11 | import java.util.ArrayList; | |
12 | import java.util.List; | |
13 | import java.util.LinkedList; | |
14 | import java.util.Map.Entry; | |
15 | import java.util.Map; | |
16 | import java.util.HashMap; | |
17 | import java.util.Date; | |
1197ec1a | 18 | import java.text.SimpleDateFormat; |
b19b3632 NR |
19 | |
20 | import org.jsoup.nodes.Document; | |
21 | import org.jsoup.nodes.Element; | |
22 | import org.jsoup.nodes.Node; | |
23 | import org.jsoup.select.Elements; | |
24 | ||
25 | /** | |
26 | * Support <a href="https://www.reddit.com/">https://www.reddit.com/</a>. | |
27 | * | |
28 | * @author niki | |
29 | */ | |
30 | public class Reddit extends BasicSupport { | |
31 | @Override | |
32 | public String getDescription() { | |
33 | return "Reddit: The front page of the internet"; | |
34 | } | |
35 | ||
36 | @Override | |
37 | protected List<Entry<URL, String>> getUrls() throws IOException { | |
38 | List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>(); | |
39 | String base = "https://www.reddit.com/r/"; | |
40 | urls.add(new AbstractMap.SimpleEntry<URL, String>( | |
41 | new URL(base + "linux_gaming" + "/new/"), "linux_gaming" | |
42 | )); | |
43 | ||
44 | return urls; | |
45 | } | |
46 | ||
47 | @Override | |
48 | protected List<Element> getArticles(Document doc) { | |
1197ec1a NR |
49 | List<Element> list = doc.getElementsByClass("thing"); |
50 | if (list.isEmpty()) { | |
51 | list = doc.getElementsByClass("Post"); | |
52 | } | |
53 | if (list.isEmpty()) { | |
54 | list = doc.getElementsByClass("scrollerItem"); | |
55 | } | |
56 | ||
57 | return list; | |
b19b3632 NR |
58 | } |
59 | ||
60 | @Override | |
61 | protected String getArticleId(Document doc, Element article) { | |
7273fd58 NR |
62 | String date = getArticleDate(doc, article); |
63 | String title = getArticleTitle(doc, article); | |
64 | ||
65 | String id = (date + "_" + title).replaceAll("[^a-zA-Z0-9_-]", "_"); | |
66 | if (id.length() > 40) { | |
67 | id = id.substring(0, 40); | |
68 | } | |
69 | ||
70 | return id; | |
b19b3632 NR |
71 | } |
72 | ||
73 | @Override | |
74 | protected String getArticleTitle(Document doc, Element article) { | |
1197ec1a NR |
75 | Elements els = article.getElementsByAttributeValue( |
76 | "data-event-action", "title"); | |
77 | if (els == null || els.isEmpty()) { | |
78 | els = article.getElementsByTag("h2"); | |
79 | } | |
80 | ||
81 | return els.first().text().trim(); | |
b19b3632 NR |
82 | } |
83 | ||
84 | @Override | |
85 | protected String getArticleAuthor(Document doc, Element article) { | |
86 | return article.getElementsByAttributeValueStarting( | |
87 | "href", "/user/" | |
88 | ).text().trim(); | |
89 | } | |
90 | ||
91 | @Override | |
92 | protected String getArticleDate(Document doc, Element article) { | |
1197ec1a NR |
93 | Element el = article.getElementsByClass("live-timestamp").first(); |
94 | if (el == null) { | |
95 | el = article.getElementsByAttributeValue( | |
96 | "data-click-id", "timestamp").first(); | |
97 | } | |
98 | ||
99 | String dateAgo = el.text().trim(); | |
60acdaf9 | 100 | return new SimpleDateFormat("yyyy-MM-dd_HH-mm").format(getDate(dateAgo)); |
b19b3632 NR |
101 | } |
102 | ||
103 | @Override | |
104 | protected String getArticleCategory(Document doc, Element article, | |
105 | String currentCategory) { | |
106 | Elements categEls = article.getElementsByAttributeValueStarting( | |
107 | "href", "/r/" + currentCategory + "/search=?q=flair_name" | |
108 | ); | |
109 | ||
110 | if (categEls.size() > 0) { | |
111 | return currentCategory + ", " | |
112 | + categEls.first().text().trim(); | |
113 | } | |
114 | ||
115 | return currentCategory; | |
116 | } | |
117 | ||
118 | @Override | |
119 | protected String getArticleDetails(Document doc, Element article) { | |
120 | return ""; | |
121 | } | |
122 | ||
123 | @Override | |
124 | protected String getArticleIntUrl(Document doc, Element article) { | |
1197ec1a NR |
125 | String url = article.absUrl("data-permalink"); |
126 | if (url == null || url.isEmpty()) { | |
127 | url = article.getElementsByAttributeValue( | |
128 | "data-click-id", "timestamp").first().absUrl("href"); | |
129 | } | |
130 | ||
131 | return url; | |
b19b3632 NR |
132 | } |
133 | ||
134 | @Override | |
135 | protected String getArticleExtUrl(Document doc, Element article) { | |
1197ec1a NR |
136 | Elements els = article.getElementsByAttributeValue( |
137 | "data-event-action", "title"); | |
138 | if (els == null || els.isEmpty()) { | |
139 | els = article.getElementsByAttributeValue( | |
140 | "data-click-id", "body"); | |
141 | } | |
142 | ||
143 | Element url = els.first(); | |
b19b3632 NR |
144 | if (!url.attr("href").trim().startsWith("/")) { |
145 | return url.absUrl("href"); | |
146 | } | |
147 | ||
148 | return ""; | |
149 | } | |
150 | ||
151 | @Override | |
152 | protected String getArticleContent(Document doc, Element article) { | |
60acdaf9 | 153 | Elements els = article.getElementsByClass("h2"); |
1197ec1a NR |
154 | if (els != null && !els.isEmpty()) { |
155 | return els.first().text().trim(); | |
156 | } | |
157 | ||
b19b3632 NR |
158 | return ""; |
159 | } | |
160 | ||
161 | @Override | |
162 | protected Element getFullArticle(Document doc) { | |
60acdaf9 NR |
163 | Element element = doc.getElementsByAttributeValue( |
164 | "data-click-id", "body").first(); | |
165 | if (element == null) { | |
166 | element = doc.getElementsByClass("ckueCN").first(); | |
167 | } | |
168 | ||
169 | return element; | |
b19b3632 NR |
170 | } |
171 | ||
172 | @Override | |
173 | protected ElementProcessor getElementProcessorFullArticle() { | |
174 | return new BasicElementProcessor(); | |
175 | } | |
176 | ||
177 | @Override | |
178 | protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { | |
60acdaf9 NR |
179 | Elements posts = doc.getElementsByClass("jHfOJm"); |
180 | if (posts.isEmpty()) { | |
181 | posts = doc.getElementsByClass("eCeBkc"); | |
182 | } | |
183 | ||
184 | return posts; | |
b19b3632 NR |
185 | } |
186 | ||
187 | @Override | |
188 | protected List<Element> getCommentCommentPosts(Document doc, | |
189 | Element container) { | |
190 | List<Element> elements = new LinkedList<Element>(); | |
191 | for (Element el : container.children()) { | |
192 | elements.addAll(el.getElementsByClass("jHfOJm")); | |
193 | } | |
194 | ||
195 | return elements; | |
196 | } | |
197 | ||
198 | @Override | |
199 | protected String getCommentId(Element post) { | |
200 | int level = 1; | |
201 | Elements els = post.getElementsByClass("imyGpC"); | |
202 | if (els.size() > 0) { | |
203 | String l = els.first().text().trim() | |
204 | .replace("level ", ""); | |
205 | try { | |
206 | level = Integer.parseInt(l); | |
207 | } catch(NumberFormatException e) { | |
208 | } | |
209 | } | |
210 | ||
211 | return Integer.toString(level); | |
212 | } | |
213 | ||
214 | @Override | |
215 | protected String getCommentAuthor(Element post) { | |
216 | // Since we have no title, we switch with author | |
217 | return ""; | |
218 | } | |
219 | ||
220 | @Override | |
221 | protected String getCommentTitle(Element post) { | |
222 | // Since we have no title, we switch with author | |
223 | Elements els = post.getElementsByClass("RVnoX"); | |
224 | if (els.size() > 0) { | |
225 | return els.first().text().trim(); | |
226 | } | |
227 | ||
228 | els = post.getElementsByClass("kzePTH"); | |
229 | if (els.size() > 0) { | |
230 | return els.first().text().trim(); | |
231 | } | |
232 | ||
233 | return ""; | |
234 | } | |
235 | ||
236 | @Override | |
237 | protected String getCommentDate(Element post) { | |
60acdaf9 | 238 | String dateAgo = post.getElementsByClass("hJDlLH") |
b19b3632 | 239 | .first().text().trim(); |
60acdaf9 | 240 | return new SimpleDateFormat("yyyy-MM-dd_HH-mm").format(getDate(dateAgo)); |
b19b3632 NR |
241 | } |
242 | ||
243 | @Override | |
244 | protected Element getCommentContentElement(Element post) { | |
245 | return post.getElementsByClass("ckueCN") | |
246 | .first(); | |
247 | } | |
248 | ||
249 | @Override | |
250 | protected ElementProcessor getElementProcessorComment() { | |
251 | return new BasicElementProcessor(); | |
252 | } | |
253 | ||
254 | @Override | |
255 | public void fetch(Story story) throws IOException { | |
256 | super.fetch(story); | |
257 | ||
258 | List<Comment> comments = new LinkedList<Comment>(); | |
259 | Map<Integer, Comment> lastOfLevel = | |
260 | new HashMap<Integer, Comment>(); | |
261 | ||
262 | for (Comment c : story.getComments()) { | |
263 | int level = Integer.parseInt(c.getId()); | |
264 | lastOfLevel.put(level, c); | |
265 | if (level <= 1) { | |
266 | comments.add(c); | |
267 | } else { | |
268 | Comment parent = lastOfLevel.get(level - 1); | |
269 | if (parent != null ){ | |
270 | parent.add(c); | |
271 | } else { | |
272 | // bad data | |
273 | comments.add(c); | |
274 | } | |
275 | } | |
276 | } | |
277 | ||
278 | story.setComments(comments); | |
279 | } | |
60acdaf9 NR |
280 | |
281 | // 2 hours ago -> 18/10/2018 21:00 | |
282 | private Date getDate(String dateAgo) { | |
283 | int h = 0; | |
284 | if (dateAgo.endsWith("hour ago")) { | |
285 | h = 1; | |
286 | } else if (dateAgo.endsWith("hours ago")) { | |
287 | dateAgo = dateAgo.replace("hours ago", "").trim(); | |
288 | h = Integer.parseInt(dateAgo); | |
289 | } else if (dateAgo.endsWith("day ago")) { | |
290 | h = 24; | |
291 | } else if (dateAgo.endsWith("days ago")) { | |
292 | dateAgo = dateAgo.replace("days ago", "").trim(); | |
293 | h = Integer.parseInt(dateAgo) * 24; | |
294 | } | |
295 | ||
296 | long now = new Date().getTime(); // in ms since 1970 | |
297 | now = now / (1000l * 60l * 60l); // in hours since 1970 | |
298 | long then = now - h; // in hours since 1970 | |
299 | then = then * (1000l * 60l * 60l); // in ms since 1970 | |
300 | ||
301 | return new Date(then); | |
302 | } | |
b19b3632 | 303 | } |