X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FSlashdot.java;h=b3a779da62d229469346f8a9455b1e01b160ab9d;hb=c9cffa913fe4ebc5cbe483cc5afe676e6cb54abd;hp=6a5395498de5894657e39311cc68ee3b8f9529e2;hpb=5c056aade2e020276e039f81acba7bcb2b12e87f;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/Slashdot.java b/src/be/nikiroo/gofetch/support/Slashdot.java index 6a53954..b3a779d 100644 --- a/src/be/nikiroo/gofetch/support/Slashdot.java +++ b/src/be/nikiroo/gofetch/support/Slashdot.java @@ -9,10 +9,12 @@ import java.util.List; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; +import be.nikiroo.utils.StringUtils; /** * Support https://slashdot.org/. @@ -30,11 +32,11 @@ public class Slashdot extends BasicSupport { List list = new ArrayList(); URL url = new URL("https://slashdot.org/"); - InputStream in = open(url); + InputStream in = downloader.open(url); Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements stories = doc.getElementsByTag("header"); - for (Element story : stories) { - Elements titles = story.getElementsByClass("story-title"); + Elements articles = doc.getElementsByTag("header"); + for (Element article : articles) { + Elements titles = article.getElementsByClass("story-title"); if (titles.size() == 0) { continue; } @@ -47,8 +49,8 @@ public class Slashdot extends BasicSupport { } Elements links = title.getElementsByTag("a"); - String intUrl = null; - String extUrl = null; + String intUrl = ""; + String extUrl = ""; if (links.size() > 0) { intUrl = links.get(0).absUrl("href"); } @@ -57,19 +59,46 @@ public class Slashdot extends BasicSupport { } String details = ""; - Elements detailsElements = story.getElementsByClass("details"); + Elements detailsElements = article.getElementsByClass("details"); if (detailsElements.size() > 0) { details = detailsElements.get(0).text(); } + // details: + // "Posted by AUTHOR on DATE from the further-crackdown dept." + String author = ""; + int pos = details.indexOf(" on "); + if (details.startsWith("Posted by ") && pos >= 0) { + author = details.substring("Posted by ".length(), pos).trim(); + } + pos = details.indexOf(" from the "); + if (pos >= 0) { + details = details.substring(pos).trim(); + } + String body = ""; Element bodyElement = doc.getElementById("text-" + id); if (bodyElement != null) { body = bodyElement.text(); } - list.add(new Story(getType(), id, title.text(), details, intUrl, - extUrl, body)); + String categ = ""; + Element categElement = doc.getElementsByClass("topic").first(); + if (categElement != null) { + categ = StringUtils.unhtml(categElement.text()).trim(); + } + + String date = ""; + Element dateElement = doc.getElementsByTag("time").first(); + if (dateElement != null) { + date = StringUtils.unhtml(dateElement.text()).trim(); + if (date.startsWith("on ")) { + date = date.substring("on ".length()); + } + } + + list.add(new Story(getType(), id, title.text(), author, date, + categ, details, intUrl, extUrl, body)); } return list; @@ -80,7 +109,7 @@ public class Slashdot extends BasicSupport { List comments = new ArrayList(); URL url = new URL(story.getUrlInternal()); - InputStream in = open(url); + InputStream in = downloader.open(url); Document doc = DataUtil.load(in, "UTF-8", url.toString()); Element listing = doc.getElementById("commentlisting"); if (listing != null) { @@ -92,41 +121,77 @@ public class Slashdot extends BasicSupport { private List getComments(Element listing) { List comments = new ArrayList(); + Comment lastComment = null; for (Element commentElement : listing.children()) { if (commentElement.hasClass("comment")) { - Comment comment = getComment(commentElement); - if (!comment.isEmpty()) { - comments.add(comment); + if (!commentElement.hasClass("hidden")) { + lastComment = getComment(commentElement); + comments.add(lastComment); + } + + List subComments = new ArrayList(); + for (Element child : commentElement.children()) { + if (child.id().contains("commtree_")) { + subComments.addAll(getComments(child)); + } + } + + if (lastComment == null) { + comments.addAll(subComments); + } else { + lastComment.addAll(subComments); } } } + return comments; } + /** + * Get a comment from the given element. + * + * @param commentElement + * the element to get the comment of. + * + * @return the comment, NOT including sub-comments + */ private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "title"); - String author = firstOrEmpty(commentElement, "by"); - String content = firstOrEmpty(commentElement, "commentBody"); - String date = firstOrEmpty(commentElement, "otherdetails"); + String title = firstOrEmpty(commentElement, "title").text(); + String author = firstOrEmpty(commentElement, "by").text(); + String date = firstOrEmpty(commentElement, "otherdetails").text(); + Element content = firstOrEmpty(commentElement, "commentBody"); - Comment comment = new Comment(commentElement.id(), author, title, date, - content); + return new Comment(commentElement.id(), author, title, date, + toLines(content)); + } - for (Element child : commentElement.children()) { - if (child.id().contains("commtree_")) { - comment.addAll(getComments(child)); - } - } + private List toLines(Element element) { + return toLines(element, new BasicElementProcessor() { + @Override + public String processText(String text) { + while (text.startsWith(">")) { // comment in one-liners + text = text.substring(1).trim(); + } - return comment; - } + return text; + } - private String firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + @Override + public boolean detectQuote(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.tagName().equals("blockquote") + || elementNode.hasClass("quote") + || (elementNode.tagName().equals("p") + && elementNode.textNodes().size() == 1 && elementNode + .textNodes().get(0).getWholeText() + .startsWith(">"))) { + return true; + } + } - return ""; + return false; + } + }); } }