X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FReddit.java;h=c89feff671e1f963948f14780e3cf725d9f1220f;hb=ff49bc765089d37d20ad950350d02876435c73f8;hp=278a1a21a18898975d2a377ab4b6e26e2df7bf5d;hpb=60acdaf963614095692d2c3b59915c6d299eafb7;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/Reddit.java b/src/be/nikiroo/gofetch/support/Reddit.java index 278a1a2..c89feff 100644 --- a/src/be/nikiroo/gofetch/support/Reddit.java +++ b/src/be/nikiroo/gofetch/support/Reddit.java @@ -1,27 +1,24 @@ package be.nikiroo.gofetch.support; -import be.nikiroo.gofetch.data.Story; -import be.nikiroo.gofetch.data.Comment; - import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.net.URL; -import java.net.URLDecoder; +import java.text.SimpleDateFormat; import java.util.AbstractMap; import java.util.ArrayList; -import java.util.List; +import java.util.Date; +import java.util.HashMap; import java.util.LinkedList; -import java.util.Map.Entry; +import java.util.List; import java.util.Map; -import java.util.HashMap; -import java.util.Date; -import java.text.SimpleDateFormat; +import java.util.Map.Entry; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; import org.jsoup.select.Elements; +import be.nikiroo.gofetch.data.Comment; +import be.nikiroo.gofetch.data.Story; + /** * Support https://www.reddit.com/. * @@ -37,9 +34,8 @@ public class Reddit extends BasicSupport { protected List> getUrls() throws IOException { List> urls = new ArrayList>(); String base = "https://www.reddit.com/r/"; - urls.add(new AbstractMap.SimpleEntry( - new URL(base + "linux_gaming" + "/new/"), "linux_gaming" - )); + urls.add(new AbstractMap.SimpleEntry(new URL(base + + "linux_gaming" + "/new/"), "linux_gaming")); return urls; } @@ -53,58 +49,63 @@ public class Reddit extends BasicSupport { if (list.isEmpty()) { list = doc.getElementsByClass("scrollerItem"); } - + return list; } @Override protected String getArticleId(Document doc, Element article) { - // Use the date, Luke - return ""; + String date = getArticleDate(doc, article); + String title = getArticleTitle(doc, article); + + String id = (date + "_" + title).replaceAll("[^a-zA-Z0-9_-]", "_"); + if (id.length() > 40) { + id = id.substring(0, 40); + } + + return id; } @Override protected String getArticleTitle(Document doc, Element article) { - Elements els = article.getElementsByAttributeValue( - "data-event-action", "title"); + Elements els = article.getElementsByAttributeValue("data-event-action", + "title"); if (els == null || els.isEmpty()) { els = article.getElementsByTag("h2"); } - + return els.first().text().trim(); } - + @Override protected String getArticleAuthor(Document doc, Element article) { - return article.getElementsByAttributeValueStarting( - "href", "/user/" - ).text().trim(); + return article.getElementsByAttributeValueStarting("href", "/user/") + .text().trim(); } @Override protected String getArticleDate(Document doc, Element article) { Element el = article.getElementsByClass("live-timestamp").first(); if (el == null) { - el = article.getElementsByAttributeValue( - "data-click-id", "timestamp").first(); + el = article.getElementsByAttributeValue("data-click-id", + "timestamp").first(); } - + String dateAgo = el.text().trim(); - return new SimpleDateFormat("yyyy-MM-dd_HH-mm").format(getDate(dateAgo)); + return new SimpleDateFormat("yyyy-MM-dd_HH-mm") + .format(getDate(dateAgo)); } @Override protected String getArticleCategory(Document doc, Element article, String currentCategory) { - Elements categEls = article.getElementsByAttributeValueStarting( - "href", "/r/" + currentCategory + "/search=?q=flair_name" - ); - + Elements categEls = article.getElementsByAttributeValueStarting("href", + "/r/" + currentCategory + "/search=?q=flair_name"); + if (categEls.size() > 0) { - return currentCategory + ", " - + categEls.first().text().trim(); + return currentCategory + ", " + categEls.first().text().trim(); } - + return currentCategory; } @@ -117,27 +118,27 @@ public class Reddit extends BasicSupport { protected String getArticleIntUrl(Document doc, Element article) { String url = article.absUrl("data-permalink"); if (url == null || url.isEmpty()) { - url = article.getElementsByAttributeValue( - "data-click-id", "timestamp").first().absUrl("href"); + url = article + .getElementsByAttributeValue("data-click-id", "timestamp") + .first().absUrl("href"); } - + return url; } @Override protected String getArticleExtUrl(Document doc, Element article) { - Elements els = article.getElementsByAttributeValue( - "data-event-action", "title"); + Elements els = article.getElementsByAttributeValue("data-event-action", + "title"); if (els == null || els.isEmpty()) { - els = article.getElementsByAttributeValue( - "data-click-id", "body"); + els = article.getElementsByAttributeValue("data-click-id", "body"); } - + Element url = els.first(); if (!url.attr("href").trim().startsWith("/")) { return url.absUrl("href"); } - + return ""; } @@ -147,18 +148,18 @@ public class Reddit extends BasicSupport { if (els != null && !els.isEmpty()) { return els.first().text().trim(); } - + return ""; } @Override protected Element getFullArticle(Document doc) { - Element element = doc.getElementsByAttributeValue( - "data-click-id", "body").first(); + Element element = doc.getElementsByAttributeValue("data-click-id", + "body").first(); if (element == null) { element = doc.getElementsByClass("ckueCN").first(); } - + return element; } @@ -173,18 +174,23 @@ public class Reddit extends BasicSupport { if (posts.isEmpty()) { posts = doc.getElementsByClass("eCeBkc"); } - + if (posts.isEmpty()) { + posts = doc.getElementsByClass("gxtxxZ"); + } + return posts; } @Override protected List getCommentCommentPosts(Document doc, Element container) { + List elements = new LinkedList(); for (Element el : container.children()) { - elements.addAll(el.getElementsByClass("jHfOJm")); + // elements.addAll(el.getElementsByClass("jHfOJm")); + elements.addAll(el.getElementsByClass("emJXdb")); } - + return elements; } @@ -192,15 +198,15 @@ public class Reddit extends BasicSupport { protected String getCommentId(Element post) { int level = 1; Elements els = post.getElementsByClass("imyGpC"); - if (els.size() > 0) { - String l = els.first().text().trim() - .replace("level ", ""); + + if (!els.isEmpty()) { + String l = els.first().text().trim().replace("level ", ""); try { level = Integer.parseInt(l); - } catch(NumberFormatException e) { + } catch (NumberFormatException e) { } } - + return Integer.toString(level); } @@ -213,64 +219,74 @@ public class Reddit extends BasicSupport { @Override protected String getCommentTitle(Element post) { // Since we have no title, we switch with author - Elements els = post.getElementsByClass("RVnoX"); - if (els.size() > 0) { - return els.first().text().trim(); - } - - els = post.getElementsByClass("kzePTH"); - if (els.size() > 0) { - return els.first().text().trim(); - } - + + Element authorEl = post.getElementsByClass("RVnoX").first(); + if (authorEl == null) + authorEl = post.getElementsByClass("kzePTH").first(); + if (authorEl == null) + authorEl = post.getElementsByClass("jczTlv").first(); + + if (authorEl != null) + return authorEl.text().trim(); + return ""; } @Override protected String getCommentDate(Element post) { - String dateAgo = post.getElementsByClass("hJDlLH") - .first().text().trim(); - return new SimpleDateFormat("yyyy-MM-dd_HH-mm").format(getDate(dateAgo)); + Element elAgo = post.getElementsByClass("hJDlLH").first(); + if (elAgo == null) + elAgo = post.getElementsByClass("hDplaG").first(); + + if (elAgo != null) { + String dateAgo = elAgo.text().trim(); + return new SimpleDateFormat("yyyy-MM-dd_HH-mm") + .format(getDate(dateAgo)); + } + + return ""; } @Override protected Element getCommentContentElement(Element post) { - return post.getElementsByClass("ckueCN") - .first(); + return post.getElementsByClass("ckueCN").first(); } @Override protected ElementProcessor getElementProcessorComment() { return new BasicElementProcessor(); } - + @Override public void fetch(Story story) throws IOException { super.fetch(story); - + List comments = new LinkedList(); - Map lastOfLevel = - new HashMap(); - - for (Comment c : story.getComments()) { - int level = Integer.parseInt(c.getId()); - lastOfLevel.put(level, c); - if (level <= 1) { - comments.add(c); - } else { - Comment parent = lastOfLevel.get(level - 1); - if (parent != null ){ - parent.add(c); - } else { - // bad data + Map lastOfLevel = new HashMap(); + + if (!story.getComments().isEmpty()) { + // comments are saved under a main ID (which is a copy of comment 1) + // TODO: fix the cause instead of working around it here + for (Comment c : story.getComments().get(0)) { + int level = Integer.parseInt(c.getId()); + lastOfLevel.put(level, c); + if (level <= 1) { comments.add(c); + } else { + Comment parent = lastOfLevel.get(level - 1); + if (parent != null) { + parent.add(c); + } else { + // bad data + comments.add(c); + } } } } - + story.setComments(comments); } - + // 2 hours ago -> 18/10/2018 21:00 private Date getDate(String dateAgo) { int h = 0; @@ -285,12 +301,12 @@ public class Reddit extends BasicSupport { dateAgo = dateAgo.replace("days ago", "").trim(); h = Integer.parseInt(dateAgo) * 24; } - - long now = new Date().getTime(); // in ms since 1970 - now = now / (1000l * 60l * 60l); // in hours since 1970 - long then = now - h; // in hours since 1970 + + long now = new Date().getTime(); // in ms since 1970 + now = now / (1000l * 60l * 60l); // in hours since 1970 + long then = now - h; // in hours since 1970 then = then * (1000l * 60l * 60l); // in ms since 1970 - + return new Date(then); } }