Commit | Line | Data |
---|---|---|
b19b3632 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
b19b3632 | 3 | import java.io.IOException; |
b19b3632 | 4 | import java.net.URL; |
aacd7f07 | 5 | import java.text.SimpleDateFormat; |
b19b3632 NR |
6 | import java.util.AbstractMap; |
7 | import java.util.ArrayList; | |
aacd7f07 NR |
8 | import java.util.Date; |
9 | import java.util.HashMap; | |
b19b3632 | 10 | import java.util.LinkedList; |
aacd7f07 | 11 | import java.util.List; |
b19b3632 | 12 | import java.util.Map; |
aacd7f07 | 13 | import java.util.Map.Entry; |
b19b3632 NR |
14 | |
15 | import org.jsoup.nodes.Document; | |
16 | import org.jsoup.nodes.Element; | |
b19b3632 NR |
17 | import org.jsoup.select.Elements; |
18 | ||
aacd7f07 NR |
19 | import be.nikiroo.gofetch.data.Comment; |
20 | import be.nikiroo.gofetch.data.Story; | |
21 | ||
b19b3632 NR |
22 | /** |
23 | * Support <a href="https://www.reddit.com/">https://www.reddit.com/</a>. | |
24 | * | |
25 | * @author niki | |
26 | */ | |
27 | public class Reddit extends BasicSupport { | |
28 | @Override | |
29 | public String getDescription() { | |
30 | return "Reddit: The front page of the internet"; | |
31 | } | |
32 | ||
33 | @Override | |
34 | protected List<Entry<URL, String>> getUrls() throws IOException { | |
35 | List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>(); | |
36 | String base = "https://www.reddit.com/r/"; | |
aacd7f07 NR |
37 | urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(base |
38 | + "linux_gaming" + "/new/"), "linux_gaming")); | |
b19b3632 NR |
39 | |
40 | return urls; | |
41 | } | |
42 | ||
43 | @Override | |
44 | protected List<Element> getArticles(Document doc) { | |
1197ec1a NR |
45 | List<Element> list = doc.getElementsByClass("thing"); |
46 | if (list.isEmpty()) { | |
47 | list = doc.getElementsByClass("Post"); | |
48 | } | |
49 | if (list.isEmpty()) { | |
50 | list = doc.getElementsByClass("scrollerItem"); | |
51 | } | |
aacd7f07 | 52 | |
1197ec1a | 53 | return list; |
b19b3632 NR |
54 | } |
55 | ||
56 | @Override | |
57 | protected String getArticleId(Document doc, Element article) { | |
7273fd58 NR |
58 | String date = getArticleDate(doc, article); |
59 | String title = getArticleTitle(doc, article); | |
aacd7f07 | 60 | |
7273fd58 NR |
61 | String id = (date + "_" + title).replaceAll("[^a-zA-Z0-9_-]", "_"); |
62 | if (id.length() > 40) { | |
63 | id = id.substring(0, 40); | |
64 | } | |
aacd7f07 | 65 | |
7273fd58 | 66 | return id; |
b19b3632 NR |
67 | } |
68 | ||
69 | @Override | |
70 | protected String getArticleTitle(Document doc, Element article) { | |
aacd7f07 NR |
71 | Elements els = article.getElementsByAttributeValue("data-event-action", |
72 | "title"); | |
1197ec1a NR |
73 | if (els == null || els.isEmpty()) { |
74 | els = article.getElementsByTag("h2"); | |
75 | } | |
aacd7f07 | 76 | |
1197ec1a | 77 | return els.first().text().trim(); |
b19b3632 | 78 | } |
aacd7f07 | 79 | |
b19b3632 NR |
80 | @Override |
81 | protected String getArticleAuthor(Document doc, Element article) { | |
aacd7f07 NR |
82 | return article.getElementsByAttributeValueStarting("href", "/user/") |
83 | .text().trim(); | |
b19b3632 NR |
84 | } |
85 | ||
86 | @Override | |
87 | protected String getArticleDate(Document doc, Element article) { | |
1197ec1a NR |
88 | Element el = article.getElementsByClass("live-timestamp").first(); |
89 | if (el == null) { | |
aacd7f07 NR |
90 | el = article.getElementsByAttributeValue("data-click-id", |
91 | "timestamp").first(); | |
1197ec1a | 92 | } |
aacd7f07 | 93 | |
1197ec1a | 94 | String dateAgo = el.text().trim(); |
aacd7f07 NR |
95 | return new SimpleDateFormat("yyyy-MM-dd_HH-mm") |
96 | .format(getDate(dateAgo)); | |
b19b3632 NR |
97 | } |
98 | ||
99 | @Override | |
100 | protected String getArticleCategory(Document doc, Element article, | |
101 | String currentCategory) { | |
aacd7f07 NR |
102 | Elements categEls = article.getElementsByAttributeValueStarting("href", |
103 | "/r/" + currentCategory + "/search=?q=flair_name"); | |
104 | ||
b19b3632 | 105 | if (categEls.size() > 0) { |
aacd7f07 | 106 | return currentCategory + ", " + categEls.first().text().trim(); |
b19b3632 | 107 | } |
aacd7f07 | 108 | |
b19b3632 NR |
109 | return currentCategory; |
110 | } | |
111 | ||
112 | @Override | |
113 | protected String getArticleDetails(Document doc, Element article) { | |
114 | return ""; | |
115 | } | |
116 | ||
117 | @Override | |
118 | protected String getArticleIntUrl(Document doc, Element article) { | |
1197ec1a NR |
119 | String url = article.absUrl("data-permalink"); |
120 | if (url == null || url.isEmpty()) { | |
aacd7f07 NR |
121 | url = article |
122 | .getElementsByAttributeValue("data-click-id", "timestamp") | |
123 | .first().absUrl("href"); | |
1197ec1a | 124 | } |
aacd7f07 | 125 | |
1197ec1a | 126 | return url; |
b19b3632 NR |
127 | } |
128 | ||
129 | @Override | |
130 | protected String getArticleExtUrl(Document doc, Element article) { | |
aacd7f07 NR |
131 | Elements els = article.getElementsByAttributeValue("data-event-action", |
132 | "title"); | |
1197ec1a | 133 | if (els == null || els.isEmpty()) { |
aacd7f07 | 134 | els = article.getElementsByAttributeValue("data-click-id", "body"); |
1197ec1a | 135 | } |
aacd7f07 | 136 | |
1197ec1a | 137 | Element url = els.first(); |
b19b3632 NR |
138 | if (!url.attr("href").trim().startsWith("/")) { |
139 | return url.absUrl("href"); | |
140 | } | |
aacd7f07 | 141 | |
b19b3632 NR |
142 | return ""; |
143 | } | |
144 | ||
145 | @Override | |
146 | protected String getArticleContent(Document doc, Element article) { | |
60acdaf9 | 147 | Elements els = article.getElementsByClass("h2"); |
1197ec1a NR |
148 | if (els != null && !els.isEmpty()) { |
149 | return els.first().text().trim(); | |
150 | } | |
aacd7f07 | 151 | |
b19b3632 NR |
152 | return ""; |
153 | } | |
154 | ||
155 | @Override | |
156 | protected Element getFullArticle(Document doc) { | |
aacd7f07 NR |
157 | Element element = doc.getElementsByAttributeValue("data-click-id", |
158 | "body").first(); | |
60acdaf9 NR |
159 | if (element == null) { |
160 | element = doc.getElementsByClass("ckueCN").first(); | |
161 | } | |
aacd7f07 | 162 | |
60acdaf9 | 163 | return element; |
b19b3632 NR |
164 | } |
165 | ||
166 | @Override | |
167 | protected ElementProcessor getElementProcessorFullArticle() { | |
168 | return new BasicElementProcessor(); | |
169 | } | |
170 | ||
171 | @Override | |
172 | protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { | |
60acdaf9 NR |
173 | Elements posts = doc.getElementsByClass("jHfOJm"); |
174 | if (posts.isEmpty()) { | |
175 | posts = doc.getElementsByClass("eCeBkc"); | |
176 | } | |
aacd7f07 NR |
177 | if (posts.isEmpty()) { |
178 | posts = doc.getElementsByClass("gxtxxZ"); | |
179 | } | |
180 | ||
60acdaf9 | 181 | return posts; |
b19b3632 NR |
182 | } |
183 | ||
184 | @Override | |
185 | protected List<Element> getCommentCommentPosts(Document doc, | |
186 | Element container) { | |
ff49bc76 | 187 | |
b19b3632 NR |
188 | List<Element> elements = new LinkedList<Element>(); |
189 | for (Element el : container.children()) { | |
ff49bc76 NR |
190 | // elements.addAll(el.getElementsByClass("jHfOJm")); |
191 | elements.addAll(el.getElementsByClass("emJXdb")); | |
b19b3632 | 192 | } |
aacd7f07 | 193 | |
b19b3632 NR |
194 | return elements; |
195 | } | |
196 | ||
197 | @Override | |
198 | protected String getCommentId(Element post) { | |
199 | int level = 1; | |
200 | Elements els = post.getElementsByClass("imyGpC"); | |
aacd7f07 NR |
201 | |
202 | if (!els.isEmpty()) { | |
203 | String l = els.first().text().trim().replace("level ", ""); | |
b19b3632 NR |
204 | try { |
205 | level = Integer.parseInt(l); | |
aacd7f07 | 206 | } catch (NumberFormatException e) { |
b19b3632 NR |
207 | } |
208 | } | |
aacd7f07 | 209 | |
b19b3632 NR |
210 | return Integer.toString(level); |
211 | } | |
212 | ||
213 | @Override | |
214 | protected String getCommentAuthor(Element post) { | |
215 | // Since we have no title, we switch with author | |
216 | return ""; | |
217 | } | |
218 | ||
219 | @Override | |
220 | protected String getCommentTitle(Element post) { | |
221 | // Since we have no title, we switch with author | |
aacd7f07 NR |
222 | |
223 | Element authorEl = post.getElementsByClass("RVnoX").first(); | |
224 | if (authorEl == null) | |
225 | authorEl = post.getElementsByClass("kzePTH").first(); | |
226 | if (authorEl == null) | |
227 | authorEl = post.getElementsByClass("jczTlv").first(); | |
228 | ||
229 | if (authorEl != null) | |
230 | return authorEl.text().trim(); | |
231 | ||
b19b3632 NR |
232 | return ""; |
233 | } | |
234 | ||
235 | @Override | |
236 | protected String getCommentDate(Element post) { | |
aacd7f07 NR |
237 | Element elAgo = post.getElementsByClass("hJDlLH").first(); |
238 | if (elAgo == null) | |
239 | elAgo = post.getElementsByClass("hDplaG").first(); | |
240 | ||
241 | if (elAgo != null) { | |
242 | String dateAgo = elAgo.text().trim(); | |
243 | return new SimpleDateFormat("yyyy-MM-dd_HH-mm") | |
244 | .format(getDate(dateAgo)); | |
245 | } | |
246 | ||
247 | return ""; | |
b19b3632 NR |
248 | } |
249 | ||
250 | @Override | |
251 | protected Element getCommentContentElement(Element post) { | |
aacd7f07 | 252 | return post.getElementsByClass("ckueCN").first(); |
b19b3632 NR |
253 | } |
254 | ||
255 | @Override | |
256 | protected ElementProcessor getElementProcessorComment() { | |
257 | return new BasicElementProcessor(); | |
258 | } | |
aacd7f07 | 259 | |
b19b3632 NR |
260 | @Override |
261 | public void fetch(Story story) throws IOException { | |
262 | super.fetch(story); | |
aacd7f07 | 263 | |
b19b3632 | 264 | List<Comment> comments = new LinkedList<Comment>(); |
aacd7f07 NR |
265 | Map<Integer, Comment> lastOfLevel = new HashMap<Integer, Comment>(); |
266 | ||
ff49bc76 NR |
267 | if (!story.getComments().isEmpty()) { |
268 | // comments are saved under a main ID (which is a copy of comment 1) | |
269 | // TODO: fix the cause instead of working around it here | |
270 | for (Comment c : story.getComments().get(0)) { | |
271 | int level = Integer.parseInt(c.getId()); | |
272 | lastOfLevel.put(level, c); | |
273 | if (level <= 1) { | |
b19b3632 | 274 | comments.add(c); |
ff49bc76 NR |
275 | } else { |
276 | Comment parent = lastOfLevel.get(level - 1); | |
277 | if (parent != null) { | |
278 | parent.add(c); | |
279 | } else { | |
280 | // bad data | |
281 | comments.add(c); | |
282 | } | |
b19b3632 NR |
283 | } |
284 | } | |
285 | } | |
aacd7f07 | 286 | |
b19b3632 NR |
287 | story.setComments(comments); |
288 | } | |
aacd7f07 | 289 | |
60acdaf9 NR |
290 | // 2 hours ago -> 18/10/2018 21:00 |
291 | private Date getDate(String dateAgo) { | |
292 | int h = 0; | |
293 | if (dateAgo.endsWith("hour ago")) { | |
294 | h = 1; | |
295 | } else if (dateAgo.endsWith("hours ago")) { | |
296 | dateAgo = dateAgo.replace("hours ago", "").trim(); | |
297 | h = Integer.parseInt(dateAgo); | |
298 | } else if (dateAgo.endsWith("day ago")) { | |
299 | h = 24; | |
300 | } else if (dateAgo.endsWith("days ago")) { | |
301 | dateAgo = dateAgo.replace("days ago", "").trim(); | |
302 | h = Integer.parseInt(dateAgo) * 24; | |
303 | } | |
aacd7f07 NR |
304 | |
305 | long now = new Date().getTime(); // in ms since 1970 | |
306 | now = now / (1000l * 60l * 60l); // in hours since 1970 | |
307 | long then = now - h; // in hours since 1970 | |
60acdaf9 | 308 | then = then * (1000l * 60l * 60l); // in ms since 1970 |
aacd7f07 | 309 | |
60acdaf9 NR |
310 | return new Date(then); |
311 | } | |
b19b3632 | 312 | } |