From 27008a8782c0ed96e07c8dc39ff0ed1f5163a9d0 Mon Sep 17 00:00:00 2001 From: Niki Roo Date: Tue, 22 Aug 2017 18:56:51 +0200 Subject: [PATCH] Version 0.2.0: supports LWN, quotes,
s --- VERSION | 2 +- changelog.md | 4 +- src/be/nikiroo/gofetch/data/Comment.java | 14 +- src/be/nikiroo/gofetch/output/Gopher.java | 21 ++- src/be/nikiroo/gofetch/output/Html.java | 13 +- .../nikiroo/gofetch/support/BasicSupport.java | 149 ++++++++++++++++++ src/be/nikiroo/gofetch/support/LWN.java | 91 +++++------ src/be/nikiroo/gofetch/support/Pipedot.java | 66 ++++---- src/be/nikiroo/gofetch/support/Slashdot.java | 84 +++++++--- 9 files changed, 315 insertions(+), 129 deletions(-) diff --git a/VERSION b/VERSION index 17e51c3..0ea3a94 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.1 +0.2.0 diff --git a/changelog.md b/changelog.md index a3ec4a2..13ebf3a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,8 +1,10 @@ # Gofetch -## Version WIP +## Version 0.2.0 - Add Linux Weekly News support +- Correctly handle BR tags +- Supports quotes ## Version 0.1.1 diff --git a/src/be/nikiroo/gofetch/data/Comment.java b/src/be/nikiroo/gofetch/data/Comment.java index 44c0de1..963d6aa 100644 --- a/src/be/nikiroo/gofetch/data/Comment.java +++ b/src/be/nikiroo/gofetch/data/Comment.java @@ -9,16 +9,16 @@ public class Comment implements Iterable { private String author; private String title; private String date; - private String content; + private List lines; private List children; public Comment(String id, String author, String title, String date, - String content) { + List lines) { this.id = id; this.author = author; this.title = title; this.date = date; - this.content = content; + this.lines = lines; this.children = new ArrayList(); } @@ -61,13 +61,13 @@ public class Comment implements Iterable { /** * @return the content */ - public String getContent() { - return content; + public List getContentLines() { + return lines; } public boolean isEmpty() { - return children.isEmpty() - && ("" + author + title + content).trim().isEmpty(); + return children.isEmpty() && lines.isEmpty() + && ("" + author + title).trim().isEmpty(); } @Override diff --git a/src/be/nikiroo/gofetch/output/Gopher.java b/src/be/nikiroo/gofetch/output/Gopher.java index 3fa6035..6dcb4aa 100644 --- a/src/be/nikiroo/gofetch/output/Gopher.java +++ b/src/be/nikiroo/gofetch/output/Gopher.java @@ -65,13 +65,30 @@ public class Gopher extends Output { space = space.substring(0, LINE_SIZE - 20); } - appendLeft(builder, comment.getTitle(), ">> ", " ", space); + appendLeft(builder, comment.getTitle(), "** ", " ", space); appendLeft(builder, "(" + comment.getAuthor() + ")", " ", " ", space); builder.append("i\r\n"); - appendLeft(builder, comment.getContent(), " ", " ", space); + for (String line : comment.getContentLines()) { + int depth = 0; + while (line.length() > depth && line.charAt(depth) == '>') { + depth++; + } + line = line.substring(depth).trim(); + + String prep = " "; + for (int i = 0; i < depth; i++) { + prep += ">"; + } + + if (depth > 0) { + prep += " "; + } + + appendLeft(builder, line, prep, prep, space); + } builder.append("i\r\n"); for (Comment subComment : comment) { diff --git a/src/be/nikiroo/gofetch/output/Html.java b/src/be/nikiroo/gofetch/output/Html.java index cdc77a4..33c99c8 100644 --- a/src/be/nikiroo/gofetch/output/Html.java +++ b/src/be/nikiroo/gofetch/output/Html.java @@ -17,7 +17,7 @@ public class Html extends Output { } String gopherUrl = "gopher://" + hostname + sel + ":" + port; - + StringBuilder builder = new StringBuilder(); appendPre(builder); @@ -28,9 +28,9 @@ public class Html extends Output { + ".

\n"// + "

They are simply scrapped from their associated webpage and updated a few times a day.

\n"// ); - + appendPost(builder); - + return builder.toString(); } @@ -101,8 +101,11 @@ public class Html extends Output { .append("\n"); builder.append(space).append("
") .append(comment.getAuthor()).append("
\n"); - builder.append(space).append("
") - .append(comment.getContent()).append("
\n"); + builder.append(space).append("
"); + for (String line : comment.getContentLines()) { + builder.append("

" + line + "

"); + } + builder.append("
\n"); for (Comment subComment : comment) { appendHtml(builder, subComment, space + " "); } diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java index 7a1d0ea..1db066b 100644 --- a/src/be/nikiroo/gofetch/support/BasicSupport.java +++ b/src/be/nikiroo/gofetch/support/BasicSupport.java @@ -4,9 +4,18 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; +import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.Elements; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; + import be.nikiroo.gofetch.data.Story; public abstract class BasicSupport { @@ -14,6 +23,14 @@ public abstract class BasicSupport { SLASHDOT, PIPEDOT, LWN, } + public interface QuoteProcessor { + public boolean detectQuote(Node node); + + public String processText(String text); + + public boolean ignoreNode(Node node); + } + static private String preselector; private Type type; @@ -93,4 +110,136 @@ public abstract class BasicSupport { return in; } + + /** + * Get the first {@link Element} of the given class, or an empty span + * {@link Element} if none found. + * + * @param element + * the element to look in + * @param className + * the class to look for + * + * @return the value or an empty span {@link Element} + */ + static protected Element firstOrEmpty(Element element, String className) { + Elements subElements = element.getElementsByClass(className); + if (subElements.size() > 0) { + return subElements.get(0); + } + + return new Element("span"); + } + + /** + * Get the first {@link Element} of the given tag, or an empty span + * {@link Element} if none found. + * + * @param element + * the element to look in + * @param tagName + * the tag to look for + * + * @return the value or an empty span {@link Element} + */ + static protected Element firstOrEmptyTag(Element element, String tagName) { + Elements subElements = element.getElementsByTag(tagName); + if (subElements.size() > 0) { + return subElements.get(0); + } + + return new Element("span"); + } + + static protected List toLines(Element element, + final QuoteProcessor quoteProcessor) { + final List lines = new ArrayList(); + final StringBuilder currentLine = new StringBuilder(); + final List quoted = new ArrayList(); + final List ignoredNodes = new ArrayList(); + + if (element != null) { + new NodeTraversor(new NodeVisitor() { + @Override + public void head(Node node, int depth) { + if (quoteProcessor.ignoreNode(node) + || ignoredNodes.contains(node.parentNode())) { + ignoredNodes.add(node); + return; + } + + String prep = ""; + for (int i = 0; i < quoted.size(); i++) { + prep += ">"; + } + prep += " "; + + boolean enterQuote = quoteProcessor.detectQuote(node); + boolean leaveQuote = quoted.contains(depth); + + if (enterQuote) { + quoted.add(depth); + } + + if (leaveQuote) { + quoted.remove(Integer.valueOf(depth)); + } + + if (enterQuote || leaveQuote) { + if (currentLine.length() > 0) { + if (currentLine.charAt(currentLine.length() - 1) == '\n') { + currentLine.setLength(currentLine.length() - 1); + } + for (String l : currentLine.toString().split("\n")) { + lines.add(prep + l); + } + } + currentLine.setLength(0); + } + + if (node instanceof Element) { + Element element = (Element) node; + boolean block = element.isBlock() + || element.tagName().equalsIgnoreCase("br"); + if (block && currentLine.length() > 0) { + currentLine.append("\n"); + } + } else if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + String line = StringUtil.normaliseWhitespace(textNode + .getWholeText()); + + currentLine.append(quoteProcessor.processText(line)); + currentLine.append(" "); + } + } + + @Override + public void tail(Node node, int depth) { + } + }).traverse(element); + } + + if (currentLine.length() > 0) { + String prep = ""; + for (int i = 0; i < quoted.size(); i++) { + prep += ">"; + } + prep += " "; + if (currentLine.length() > 0) { + if (currentLine.charAt(currentLine.length() - 1) == '\n') { + currentLine.setLength(currentLine.length() - 1); + } + for (String l : currentLine.toString().split("\n")) { + lines.add(prep + l); + } + } + } + + for (int i = 0; i < lines.size(); i++) { + lines.set(i, lines.get(i).replace(" ", " ").trim()); + } + + return lines; + } } diff --git a/src/be/nikiroo/gofetch/support/LWN.java b/src/be/nikiroo/gofetch/support/LWN.java index 2fea78a..dba4c3b 100644 --- a/src/be/nikiroo/gofetch/support/LWN.java +++ b/src/be/nikiroo/gofetch/support/LWN.java @@ -139,8 +139,8 @@ public class LWN extends BasicSupport { } private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "CommentTitle"); - String author = firstOrEmpty(commentElement, "CommentPoster"); + String title = firstOrEmpty(commentElement, "CommentTitle").text(); + String author = firstOrEmpty(commentElement, "CommentPoster").text(); String date = ""; int pos = author.lastIndexOf(" by "); @@ -153,69 +153,54 @@ public class LWN extends BasicSupport { } } - String content = ""; + Element content = null; Elements commentBodyElements = commentElement .getElementsByClass("CommentBody"); if (commentBodyElements.size() > 0) { - for (Node contentNode : commentBodyElements.get(0).childNodes()) { - if (contentNode instanceof Element) { - Element contentElement = (Element) contentNode; - if (!contentElement.hasClass("CommentPoster")) { - content = content.trim() + " " - + contentElement.text().trim(); - } - } else { - content = content.trim() + " " - + contentNode.outerHtml().trim(); - } - - } - content = content.trim(); + content = commentBodyElements.get(0); } Comment comment = new Comment(commentElement.id(), author, title, date, - content); + toLines(content)); return comment; } - /** - * Get the first element of the given class, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param className - * the class to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + private List toLines(Element element) { + return toLines(element, new QuoteProcessor() { + @Override + public String processText(String text) { + while (text.startsWith(">")) { // comments + text = text.substring(1).trim(); + } - return ""; - } + return text; + } - /** - * Get the first element of the given tag, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param tagName - * the tag to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmptyTag(Element element, String tagName) { - Elements subElements = element.getElementsByTag(tagName); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + @Override + public boolean detectQuote(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.tagName().equals("blockquote") + || elementNode.hasClass("QuotedText")) { + return true; + } + } + + return false; + } - return ""; + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.hasClass("CommentPoster")) { + return true; + } + } + + return false; + } + }); } } diff --git a/src/be/nikiroo/gofetch/support/Pipedot.java b/src/be/nikiroo/gofetch/support/Pipedot.java index 2436540..1bd5173 100644 --- a/src/be/nikiroo/gofetch/support/Pipedot.java +++ b/src/be/nikiroo/gofetch/support/Pipedot.java @@ -9,6 +9,7 @@ import java.util.List; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; @@ -117,9 +118,9 @@ public class Pipedot extends BasicSupport { } private Comment getComment(Element commentElement) { - String title = firstOrEmptyTag(commentElement, "h3"); - String author = firstOrEmpty(commentElement, "h4"); - String content = firstOrEmpty(commentElement, "comment-body"); + String title = firstOrEmptyTag(commentElement, "h3").text(); + String author = firstOrEmpty(commentElement, "h4").text(); + Element content = firstOrEmpty(commentElement, "comment-body"); String date = ""; int pos = author.lastIndexOf(" on "); @@ -129,7 +130,7 @@ public class Pipedot extends BasicSupport { } Comment comment = new Comment(commentElement.id(), author, title, date, - content); + toLines(content)); Elements commentOutline = commentElement .getElementsByClass("comment-outline"); @@ -140,43 +141,30 @@ public class Pipedot extends BasicSupport { return comment; } - /** - * Get the first element of the given class, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param className - * the class to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + private List toLines(Element element) { + return toLines(element, new QuoteProcessor() { + @Override + public String processText(String text) { + return text; + } - return ""; - } + @Override + public boolean detectQuote(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.tagName().equals("blockquote") + || elementNode.hasClass("quote")) { + return true; + } + } - /** - * Get the first element of the given tag, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param tagName - * the tag to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmptyTag(Element element, String tagName) { - Elements subElements = element.getElementsByTag(tagName); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + return false; + } - return ""; + @Override + public boolean ignoreNode(Node node) { + return false; + } + }); } } diff --git a/src/be/nikiroo/gofetch/support/Slashdot.java b/src/be/nikiroo/gofetch/support/Slashdot.java index 6a53954..8776e35 100644 --- a/src/be/nikiroo/gofetch/support/Slashdot.java +++ b/src/be/nikiroo/gofetch/support/Slashdot.java @@ -9,6 +9,7 @@ import java.util.List; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; @@ -92,41 +93,82 @@ public class Slashdot extends BasicSupport { private List getComments(Element listing) { List comments = new ArrayList(); + Comment lastComment = null; for (Element commentElement : listing.children()) { if (commentElement.hasClass("comment")) { - Comment comment = getComment(commentElement); - if (!comment.isEmpty()) { - comments.add(comment); + if (!commentElement.hasClass("hidden")) { + lastComment = getComment(commentElement); + comments.add(lastComment); + } + + List subComments = new ArrayList(); + for (Element child : commentElement.children()) { + if (child.id().contains("commtree_")) { + subComments.addAll(getComments(child)); + } + } + + if (lastComment == null) { + comments.addAll(subComments); + } else { + lastComment.addAll(subComments); } } } + return comments; } + /** + * Get a comment from the given element. + * + * @param commentElement + * the element to get the comment of. + * + * @return the comment, NOT including sub-comments + */ private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "title"); - String author = firstOrEmpty(commentElement, "by"); - String content = firstOrEmpty(commentElement, "commentBody"); - String date = firstOrEmpty(commentElement, "otherdetails"); + String title = firstOrEmpty(commentElement, "title").text(); + String author = firstOrEmpty(commentElement, "by").text(); + String date = firstOrEmpty(commentElement, "otherdetails").text(); + Element content = firstOrEmpty(commentElement, "commentBody"); + + return new Comment(commentElement.id(), author, title, date, + toLines(content)); + } - Comment comment = new Comment(commentElement.id(), author, title, date, - content); + private List toLines(Element element) { + return toLines(element, new QuoteProcessor() { + @Override + public String processText(String text) { + while (text.startsWith(">")) { // comment in one-liners + text = text.substring(1).trim(); + } - for (Element child : commentElement.children()) { - if (child.id().contains("commtree_")) { - comment.addAll(getComments(child)); + return text; } - } - return comment; - } + @Override + public boolean detectQuote(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.tagName().equals("blockquote") + || elementNode.hasClass("quote") + || (elementNode.tagName().equals("p") + && elementNode.textNodes().size() == 1 && elementNode + .textNodes().get(0).getWholeText() + .startsWith(">"))) { + return true; + } + } - private String firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + return false; + } - return ""; + @Override + public boolean ignoreNode(Node node) { + return false; + } + }); } } -- 2.27.0