Reddit: fix problem with new kind of html (wip)
[gofetch.git] / src / be / nikiroo / gofetch / support / Reddit.java
CommitLineData
b19b3632
NR
1package be.nikiroo.gofetch.support;
2
3import be.nikiroo.gofetch.data.Story;
4import be.nikiroo.gofetch.data.Comment;
5
6import java.io.IOException;
7import java.io.UnsupportedEncodingException;
8import java.net.URL;
9import java.net.URLDecoder;
10import java.util.AbstractMap;
11import java.util.ArrayList;
12import java.util.List;
13import java.util.LinkedList;
14import java.util.Map.Entry;
15import java.util.Map;
16import java.util.HashMap;
17import java.util.Date;
1197ec1a 18import java.text.SimpleDateFormat;
b19b3632
NR
19
20import org.jsoup.nodes.Document;
21import org.jsoup.nodes.Element;
22import org.jsoup.nodes.Node;
23import org.jsoup.select.Elements;
24
25/**
26 * Support <a href="https://www.reddit.com/">https://www.reddit.com/</a>.
27 *
28 * @author niki
29 */
30public class Reddit extends BasicSupport {
31 @Override
32 public String getDescription() {
33 return "Reddit: The front page of the internet";
34 }
35
36 @Override
37 protected List<Entry<URL, String>> getUrls() throws IOException {
38 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
39 String base = "https://www.reddit.com/r/";
40 urls.add(new AbstractMap.SimpleEntry<URL, String>(
41 new URL(base + "linux_gaming" + "/new/"), "linux_gaming"
42 ));
43
44 return urls;
45 }
46
47 @Override
48 protected List<Element> getArticles(Document doc) {
1197ec1a
NR
49 List<Element> list = doc.getElementsByClass("thing");
50 if (list.isEmpty()) {
51 list = doc.getElementsByClass("Post");
52 }
53 if (list.isEmpty()) {
54 list = doc.getElementsByClass("scrollerItem");
55 }
56
57 return list;
b19b3632
NR
58 }
59
60 @Override
61 protected String getArticleId(Document doc, Element article) {
62 // Use the date, Luke
63 return "";
64 }
65
66 @Override
67 protected String getArticleTitle(Document doc, Element article) {
1197ec1a
NR
68 Elements els = article.getElementsByAttributeValue(
69 "data-event-action", "title");
70 if (els == null || els.isEmpty()) {
71 els = article.getElementsByTag("h2");
72 }
73
74 return els.first().text().trim();
b19b3632
NR
75 }
76
77 @Override
78 protected String getArticleAuthor(Document doc, Element article) {
79 return article.getElementsByAttributeValueStarting(
80 "href", "/user/"
81 ).text().trim();
82 }
83
84 @Override
85 protected String getArticleDate(Document doc, Element article) {
1197ec1a
NR
86 Element el = article.getElementsByClass("live-timestamp").first();
87 if (el == null) {
88 el = article.getElementsByAttributeValue(
89 "data-click-id", "timestamp").first();
90 }
91
92 String dateAgo = el.text().trim();
93 int h = 0;
94 if (dateAgo.endsWith("hour ago")) {
95 h = 1;
96 } else if (dateAgo.endsWith("hours ago")) {
97 dateAgo = dateAgo.replace("hours ago", "").trim();
98 h = Integer.parseInt(dateAgo);
99 } else if (dateAgo.endsWith("day ago")) {
100 h = 24;
101 } else if (dateAgo.endsWith("days ago")) {
102 dateAgo = dateAgo.replace("days ago", "").trim();
103 h = Integer.parseInt(dateAgo) * 24;
104 }
105
106 long now = new Date().getTime(); // in ms since 1970
107 now = now / (1000l * 60l * 60l); // in hours
108 long then = now - h; // in hours
109 then = then * (60l * 60l); // in seconds
110
111 return Long.toString(then);
b19b3632
NR
112 }
113
114 @Override
115 protected String getArticleCategory(Document doc, Element article,
116 String currentCategory) {
117 Elements categEls = article.getElementsByAttributeValueStarting(
118 "href", "/r/" + currentCategory + "/search=?q=flair_name"
119 );
120
121 if (categEls.size() > 0) {
122 return currentCategory + ", "
123 + categEls.first().text().trim();
124 }
125
126 return currentCategory;
127 }
128
129 @Override
130 protected String getArticleDetails(Document doc, Element article) {
131 return "";
132 }
133
134 @Override
135 protected String getArticleIntUrl(Document doc, Element article) {
1197ec1a
NR
136 String url = article.absUrl("data-permalink");
137 if (url == null || url.isEmpty()) {
138 url = article.getElementsByAttributeValue(
139 "data-click-id", "timestamp").first().absUrl("href");
140 }
141
142 return url;
b19b3632
NR
143 }
144
145 @Override
146 protected String getArticleExtUrl(Document doc, Element article) {
1197ec1a
NR
147 Elements els = article.getElementsByAttributeValue(
148 "data-event-action", "title");
149 if (els == null || els.isEmpty()) {
150 els = article.getElementsByAttributeValue(
151 "data-click-id", "body");
152 }
153
154 Element url = els.first();
b19b3632
NR
155 if (!url.attr("href").trim().startsWith("/")) {
156 return url.absUrl("href");
157 }
158
159 return "";
160 }
161
162 @Override
163 protected String getArticleContent(Document doc, Element article) {
1197ec1a
NR
164 Elements els = article.getElementsByClass("md");
165 if (els != null && !els.isEmpty()) {
166 return els.first().text().trim();
167 }
168
b19b3632
NR
169 return "";
170 }
171
172 @Override
173 protected Element getFullArticle(Document doc) {
174 return doc.getElementsByClass("ckueCN").first();
175 }
176
177 @Override
178 protected ElementProcessor getElementProcessorFullArticle() {
179 return new BasicElementProcessor();
180 }
181
182 @Override
183 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
184 return doc.getElementsByClass("jHfOJm");
185 }
186
187 @Override
188 protected List<Element> getCommentCommentPosts(Document doc,
189 Element container) {
190 List<Element> elements = new LinkedList<Element>();
191 for (Element el : container.children()) {
192 elements.addAll(el.getElementsByClass("jHfOJm"));
193 }
194
195 return elements;
196 }
197
198 @Override
199 protected String getCommentId(Element post) {
200 int level = 1;
201 Elements els = post.getElementsByClass("imyGpC");
202 if (els.size() > 0) {
203 String l = els.first().text().trim()
204 .replace("level ", "");
205 try {
206 level = Integer.parseInt(l);
207 } catch(NumberFormatException e) {
208 }
209 }
210
211 return Integer.toString(level);
212 }
213
214 @Override
215 protected String getCommentAuthor(Element post) {
216 // Since we have no title, we switch with author
217 return "";
218 }
219
220 @Override
221 protected String getCommentTitle(Element post) {
222 // Since we have no title, we switch with author
223 Elements els = post.getElementsByClass("RVnoX");
224 if (els.size() > 0) {
225 return els.first().text().trim();
226 }
227
228 els = post.getElementsByClass("kzePTH");
229 if (els.size() > 0) {
230 return els.first().text().trim();
231 }
232
233 return "";
234 }
235
236 @Override
237 protected String getCommentDate(Element post) {
238 return post.getElementsByClass("hJDlLH")
239 .first().text().trim();
240 }
241
242 @Override
243 protected Element getCommentContentElement(Element post) {
244 return post.getElementsByClass("ckueCN")
245 .first();
246 }
247
248 @Override
249 protected ElementProcessor getElementProcessorComment() {
250 return new BasicElementProcessor();
251 }
252
253 @Override
254 public void fetch(Story story) throws IOException {
255 super.fetch(story);
256
257 List<Comment> comments = new LinkedList<Comment>();
258 Map<Integer, Comment> lastOfLevel =
259 new HashMap<Integer, Comment>();
260
261 for (Comment c : story.getComments()) {
262 int level = Integer.parseInt(c.getId());
263 lastOfLevel.put(level, c);
264 if (level <= 1) {
265 comments.add(c);
266 } else {
267 Comment parent = lastOfLevel.get(level - 1);
268 if (parent != null ){
269 parent.add(c);
270 } else {
271 // bad data
272 comments.add(c);
273 }
274 }
275 }
276
277 story.setComments(comments);
278 }
279}