From 3e62b034c1981ae6329f06b3f8c0ee25c3683789 Mon Sep 17 00:00:00 2001 From: Niki Roo Date: Sun, 25 Mar 2018 21:39:01 +0200 Subject: [PATCH] Bug fixes + rework of BasicSupport --- src/be/nikiroo/gofetch/Fetcher.java | 2 +- src/be/nikiroo/gofetch/Main.java | 2 +- src/be/nikiroo/gofetch/data/Comment.java | 2 +- src/be/nikiroo/gofetch/data/Story.java | 2 +- src/be/nikiroo/gofetch/output/Gopher.java | 2 +- src/be/nikiroo/gofetch/output/Html.java | 19 +- src/be/nikiroo/gofetch/output/Output.java | 2 +- .../support/BasicElementProcessor.java | 36 + .../nikiroo/gofetch/support/BasicSupport.java | 613 +++++++++++++----- .../gofetch/support/ElementProcessor.java | 67 ++ .../nikiroo/gofetch/support/EreNumerique.java | 345 +++++----- src/be/nikiroo/gofetch/support/LWN.java | 324 +++++---- src/be/nikiroo/gofetch/support/LeMonde.java | 233 ++++--- src/be/nikiroo/gofetch/support/Pipedot.java | 298 +++++---- src/be/nikiroo/gofetch/support/Slashdot.java | 312 ++++++--- .../nikiroo/gofetch/support/TheRegister.java | 363 ++++++----- src/be/nikiroo/gofetch/support/TooLinux.java | 217 ++++--- src/be/nikiroo/gofetch/support/Type.java | 23 + 18 files changed, 1851 insertions(+), 1011 deletions(-) create mode 100644 src/be/nikiroo/gofetch/support/BasicElementProcessor.java create mode 100644 src/be/nikiroo/gofetch/support/ElementProcessor.java create mode 100644 src/be/nikiroo/gofetch/support/Type.java diff --git a/src/be/nikiroo/gofetch/Fetcher.java b/src/be/nikiroo/gofetch/Fetcher.java index 258f196..6c86c13 100644 --- a/src/be/nikiroo/gofetch/Fetcher.java +++ b/src/be/nikiroo/gofetch/Fetcher.java @@ -14,7 +14,7 @@ import be.nikiroo.gofetch.output.Gopher; import be.nikiroo.gofetch.output.Html; import be.nikiroo.gofetch.output.Output; import be.nikiroo.gofetch.support.BasicSupport; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; import be.nikiroo.utils.IOUtils; /** diff --git a/src/be/nikiroo/gofetch/Main.java 
b/src/be/nikiroo/gofetch/Main.java index 7aa9405..e4078d8 100644 --- a/src/be/nikiroo/gofetch/Main.java +++ b/src/be/nikiroo/gofetch/Main.java @@ -3,7 +3,7 @@ package be.nikiroo.gofetch; import java.io.File; import java.io.IOException; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; /** * This class is tha main entry point of the program. It will parse the diff --git a/src/be/nikiroo/gofetch/data/Comment.java b/src/be/nikiroo/gofetch/data/Comment.java index bbd648a..da07482 100644 --- a/src/be/nikiroo/gofetch/data/Comment.java +++ b/src/be/nikiroo/gofetch/data/Comment.java @@ -91,7 +91,7 @@ public class Comment implements Iterable { public boolean isEmpty() { return children.isEmpty() && lines.isEmpty() - && ("" + author + title).trim().isEmpty(); + && ("" + author + title).isEmpty(); } @Override diff --git a/src/be/nikiroo/gofetch/data/Story.java b/src/be/nikiroo/gofetch/data/Story.java index c0719d2..9a2e68d 100644 --- a/src/be/nikiroo/gofetch/data/Story.java +++ b/src/be/nikiroo/gofetch/data/Story.java @@ -4,7 +4,7 @@ import java.net.URL; import java.util.List; import be.nikiroo.gofetch.support.BasicSupport; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; /** * A news story. 
diff --git a/src/be/nikiroo/gofetch/output/Gopher.java b/src/be/nikiroo/gofetch/output/Gopher.java index f0c6f6d..12a420c 100644 --- a/src/be/nikiroo/gofetch/output/Gopher.java +++ b/src/be/nikiroo/gofetch/output/Gopher.java @@ -2,7 +2,7 @@ package be.nikiroo.gofetch.output; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; import be.nikiroo.utils.StringUtils; import be.nikiroo.utils.StringUtils.Alignment; diff --git a/src/be/nikiroo/gofetch/output/Html.java b/src/be/nikiroo/gofetch/output/Html.java index 50fe2d7..385df8b 100644 --- a/src/be/nikiroo/gofetch/output/Html.java +++ b/src/be/nikiroo/gofetch/output/Html.java @@ -2,7 +2,8 @@ package be.nikiroo.gofetch.output; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; +import be.nikiroo.utils.StringUtils; public class Html extends Output { public Html(Type type, String hostname, String preselector, int port) { @@ -99,7 +100,8 @@ public class Html extends Output { .append("
\n"); builder.append(space).append("

").append(comment.getTitle()) .append("

\n"); - builder.append(space).append("
") + builder.append(space) + .append("
") .append(comment.getAuthor()).append("
\n"); builder.append(space).append("
"); for (String line : comment.getContentLines()) { @@ -123,7 +125,9 @@ public class Html extends Output { builder.append("
"); if (story.getDetails() != null && !story.getDetails().isEmpty()) { - builder.append("(").append(story.getDetails()).append(")"); + builder.append("(") + .append(StringUtils.xmlEscape(story.getDetails())) + .append(")"); } builder.append("
\n"); builder.append("
\n"); @@ -142,12 +146,13 @@ public class Html extends Output { builder.append("
\n"); if (resume) { - builder.append(" " + story.getContent() + "\n"); + builder.append(" " + StringUtils.xmlEscape(story.getContent()) + + "\n"); } else { builder.append(" " - + story.getFullContent().replace("\n", "
") - .replace("[ ", "

").replace(" ]", "

") - + "\n"); + + StringUtils.xmlEscape(story.getFullContent()) + .replace("\n", "
").replace("[ ", "

") + .replace(" ]", "

") + "\n"); } builder.append("
\n"); diff --git a/src/be/nikiroo/gofetch/output/Output.java b/src/be/nikiroo/gofetch/output/Output.java index db6554b..1166879 100644 --- a/src/be/nikiroo/gofetch/output/Output.java +++ b/src/be/nikiroo/gofetch/output/Output.java @@ -1,7 +1,7 @@ package be.nikiroo.gofetch.output; import be.nikiroo.gofetch.data.Story; -import be.nikiroo.gofetch.support.BasicSupport.Type; +import be.nikiroo.gofetch.support.Type; /** * Base class for output operations. diff --git a/src/be/nikiroo/gofetch/support/BasicElementProcessor.java b/src/be/nikiroo/gofetch/support/BasicElementProcessor.java new file mode 100644 index 0000000..83d7c8b --- /dev/null +++ b/src/be/nikiroo/gofetch/support/BasicElementProcessor.java @@ -0,0 +1,36 @@ +package be.nikiroo.gofetch.support; + +import org.jsoup.nodes.Node; + +/** + * A default {@link ElementProcessor} (will not detect or process anything + * manually). + * + * @author niki + */ +class BasicElementProcessor implements ElementProcessor { + @Override + public boolean detectQuote(Node node) { + return false; + } + + @Override + public String processText(String text) { + return text; + } + + @Override + public boolean ignoreNode(Node node) { + return false; + } + + @Override + public String manualProcessing(Node node) { + return null; + } + + @Override + public String isSubtitle(Node node) { + return null; + } +} diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java index b15fac7..a59ae31 100644 --- a/src/be/nikiroo/gofetch/support/BasicSupport.java +++ b/src/be/nikiroo/gofetch/support/BasicSupport.java @@ -1,22 +1,29 @@ package be.nikiroo.gofetch.support; import java.io.IOException; +import java.io.InputStream; +import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Date; import java.util.List; +import java.util.Map.Entry; +import org.jsoup.helper.DataUtil; import 
org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; -import org.jsoup.select.Elements; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; +import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.Downloader; +import be.nikiroo.utils.StringUtils; /** * Base class for website support. @@ -27,140 +34,226 @@ public abstract class BasicSupport { /** The downloader to use for all websites. */ protected static Downloader downloader = new Downloader("gofetcher"); + static private String preselector; + + private Type type; + + /** + * The website textual description, to add in the dispatcher page. + *

+ * Should be short. + * + * @return the description + */ + abstract public String getDescription(); + /** - * The support type (each website we support has a single type). - * - * @author niki - */ - public enum Type { - /** EN: Any, but mostly IT/Sci */ - SLASHDOT, - /** EN: Clone of Slashdot, mostly abandoned */ - PIPEDOT, - /** EN: Linux */ - LWN, - /** FR: Any */ - LEMONDE, - /** EN: IT */ - REGISTER, - /** FR: Linux */ - TOO_LINUX, - /** FR: IT */ - ERE_NUMERIQUE, + * The gopher "selector" to use for output. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @return the selector + */ + public String getSelector() { + return getSelector(type); } /** - * Used to process an element into lines. - * - * @author niki - */ - public interface ElementProcessor { - /** - * Detect if this node is a quote and should be trated as such. - * - * @param node - * the node to check - * @return TRUE if it is - */ - public boolean detectQuote(Node node); - - /** - * Process text content (will be called on each text element, allowing - * you to modify it if needed). - * - * @param text - * the text to process - * - * @return the resulting text - */ - public String processText(String text); - - /** - * Ignore this node. - * - * @param node - * the node to ignore - * @return TRUE if it has to be ignored - */ - public boolean ignoreNode(Node node); - - /** - * Manually process this node (and return the manual processing value) - * if so desired. - *

- * If the node is manually processed, it and its children will not be - * automatically processed. - * - * @param node - * the node to optionally process - * - * @return NULL if not processed (will thus be automatically processed - * as usual), a {@link String} (may be empty) if we process it - * manually -- the given {@link String} will be used instead of - * the usual automatic processing if not NULL - */ - public String manualProcessing(Node node); - - /** - * This {@link Node} is a subtitle and should be treated as such - * (highlighted). - * - * @param node - * the node to check - * - * @return NULL if it is not a subtitle, the subtitle to use if it is - */ - public String isSubtitle(Node node); + * The support type. + * + * @return the type + */ + public Type getType() { + return type; } /** - * A default {@link ElementProcessor} (will not detect or process anything - * manually). + * List all the recent items, but only assure the ID and internal URL to + * fetch it later on (until it has been fetched, the rest of the + * {@link Story} is not confirmed). 
* - * @author niki + * @return the list of new stories + * + * @throws IOException + * in case of I/O */ - public class BasicElementProcessor implements ElementProcessor { - @Override - public boolean detectQuote(Node node) { - return false; - } + public List list() throws IOException { + List list = new ArrayList(); + + for (Entry entry : getUrls()) { + URL url = entry.getKey(); + String defaultCateg = entry.getValue(); + if (defaultCateg == null) { + defaultCateg = ""; + } - @Override - public String processText(String text) { - return text; - } + InputStream in = downloader.open(url); + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + List articles = getArticles(doc); + for (Element article : articles) { + String id = getArticleId(doc, article).trim(); + String title = getArticleTitle(doc, article).trim(); + String author = getArticleAuthor(doc, article).trim(); + String date = getArticleDate(doc, article).trim(); + String categ = getArticleCategory(doc, article, defaultCateg) + .trim(); + String details = getArticleDetails(doc, article).trim(); + String intUrl = getArticleIntUrl(doc, article).trim(); + String extUrl = getArticleExtUrl(doc, article).trim(); + String content = getArticleContent(doc, article).trim(); + + if (id.isEmpty() && date.isEmpty()) { + continue; + } - @Override - public boolean ignoreNode(Node node) { - return false; - } + if (id.isEmpty()) { + id = date.replace(":", "_").replace("+", "_"); + } - @Override - public String manualProcessing(Node node) { - return null; - } + date = date(date); - @Override - public String isSubtitle(Node node) { - return null; + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, content)); + } } + + return list; } - static private String preselector; + /** + * The {@link URL}s to process for this website. 
+ * + * @return the list of {@link URL}s + * + * @throws IOException + * in case of I/O error + */ + abstract protected List> getUrls() throws IOException; - private Type type; + /** + * The article {@link Element}s of this document. + * + * @param doc + * the main document for the current category + * + * @return the articles + */ + abstract protected List getArticles(Document doc); /** - * List all the recent items, but only assure the ID and internal URL to - * fetch it later on (until it has been fetched, the rest of the - * {@link Story} is not confirmed). + * The ID of the article (defaults to the date element if empty). * - * @return the list of new stories + * @param doc + * the main document for the current category + * @param article + * the article to look into * - * @throws IOException - * in case of I/O + * @return the ID + */ + abstract protected String getArticleId(Document doc, Element article); + + /** + * The article title to display. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the title + */ + abstract protected String getArticleTitle(Document doc, Element article); + + /** + * The optional article author. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the author + */ + abstract protected String getArticleAuthor(Document doc, Element article); + + /** + * The optional article date. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the date + */ + abstract protected String getArticleDate(Document doc, Element article); + + /** + * the optional article category. 
+ * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * @param currentCategory + * the currently listed category if any (can be NULL) + * + * @return the category */ - abstract public List list() throws IOException; + abstract protected String getArticleCategory(Document doc, Element article, + String currentCategory); + + /** + * the optional details of the article (can replace the date, author and + * category, for instance). + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the details + */ + abstract protected String getArticleDetails(Document doc, Element article); + + /** + * The (required) {@link URL} that points to the news page on the supported + * website. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the internal {@link URL} + */ + abstract protected String getArticleIntUrl(Document doc, Element article); + + /** + * the optional {@link URL} that points to an external website for more + * information. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the external {@link URL} + */ + abstract protected String getArticleExtUrl(Document doc, Element article); + + /** + * The optional article short-content (not the full content, that will be + * fetched by {@link BasicSupport#fetch(Story)}). 
+ * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the short content + */ + abstract protected String getArticleContent(Document doc, Element article); /** * Fetch the full article content as well as all the comments associated to @@ -172,37 +265,211 @@ public abstract class BasicSupport { * @throws IOException * in case of I/O error */ - abstract public void fetch(Story story) throws IOException; + public void fetch(Story story) throws IOException { + String fullContent = ""; + + URL url = new URL(story.getUrlInternal()); + InputStream in = downloader.open(url); + try { + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + Element article = getFullArticle(doc); + if (article != null) { + StringBuilder builder = new StringBuilder(); + ElementProcessor eProc = getElementProcessorFullArticle(); + if (eProc != null) { + for (String line : toLines(article, eProc)) { + builder.append(line + "\n"); + } + } else { + builder.append(article.text()); + } + + // Content is too tight with a single break per line: + fullContent = builder.toString().replace("\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .trim(); + } + + if (fullContent.isEmpty()) { + fullContent = story.getContent(); + } + + story.setFullContent(fullContent); + story.setComments(getComments(doc, + getFullArticleCommentPosts(doc, url))); + } finally { + if (in != null) { + in.close(); + } + } + } /** - * The website textual description, to add in the dispatcher page. - *

- * Should be short. + * Return the full article if available. * - * @return the description + * @param doc + * the (full article) document to work on + * + * @return the article or NULL */ - abstract public String getDescription(); + abstract protected Element getFullArticle(Document doc); /** - * The gopher "selector" to use for output. + * Return the list of comment {@link Element}s from this optional container + * -- must NOT return the "container" as a comment {@link Element}. + * + * @param doc + * the (full article) document to work on + * @param intUrl + * the internal {@link URL} this article wa taken from (the + * {@link URL} from the supported website) + * + * @return the list of comment posts + */ + abstract protected List getFullArticleCommentPosts(Document doc, + URL intUrl); + + /** + * The {@link ElementProcessor} to use to convert the main article element + * (see {@link BasicSupport#getFullArticle(Document)}) into text. *

- * A kind of "URL path", like "/news/" or "/misc/news/" or... + * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. * - * @return the selector + * @return the processor, or NULL */ - public String getSelector() { - return getSelector(type); - } + abstract protected ElementProcessor getElementProcessorFullArticle(); /** - * The support type. + * Convert the comment elements into {@link Comment}s * - * @return the type + * @param doc + * the document we work on + * @param posts + * the comment elements + * + * @return the converted {@link Comment}s */ - public Type getType() { - return type; + private List getComments(Document doc, List posts) { + List comments = new ArrayList(); + if (posts != null) { + for (Element post : posts) { + String id = getCommentId(post).trim(); + String author = getCommentAuthor(post).trim(); + String title = getCommentTitle(post).trim(); + String date = getCommentDate(post).trim(); + + List content = new ArrayList(); + + if (id.isEmpty()) { + id = date; + } + + date = date(date); + + Element contentE = getCommentContentElement(post); + if (contentE != null) { + ElementProcessor eProc = getElementProcessorComment(); + if (eProc != null) { + for (String line : toLines(contentE, eProc)) { + content.add(line); + } + } else { + content = Arrays.asList(contentE.text().split("\n")); + } + } + + Comment comment = new Comment(id, author, title, date, content); + comment.addAll(getComments(doc, + getCommentCommentPosts(doc, post))); + + if (!comment.isEmpty()) { + comments.add(comment); + } + } + } + + return comments; } + /** + * Return the list of subcomment {@link Element}s from this comment element + * -- must NOT return the "container" as a comment {@link Element}. + * + * @param doc + * the (full article) document to work on + * @param container + * the container (a comment {@link Element}) + * + * @return the list of comment posts + */ + abstract protected List getCommentCommentPosts(Document doc, + Element container); + + /** + * Compute the ID of the given comment element. 
+ * + * @param post + * the comment element + * + * @return the ID + */ + abstract protected String getCommentId(Element post); + + /** + * Compute the author of the given comment element. + * + * @param post + * the comment element + * + * @return the author + */ + abstract protected String getCommentAuthor(Element post); + + /** + * Compute the title of the given comment element. + * + * @param post + * the comment element + * + * @return the title + */ + abstract protected String getCommentTitle(Element post); + + /** + * Compute the date of the given comment element. + * + * @param post + * the comment element + * + * @return the date + */ + abstract protected String getCommentDate(Element post); + + /** + * Get the main of the given comment element, which can be NULL. + * + * @param post + * the comment element + * + * @return the element + */ + abstract protected Element getCommentContentElement(Element post); + + /** + * The {@link ElementProcessor} to use to convert the main comment element + * (see {@link BasicSupport#getCommentContentElement(Element)}) into text. + *

+ * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. + * + * @return the processor + */ + abstract protected ElementProcessor getElementProcessorComment(); + /** * The support type. * @@ -284,46 +551,6 @@ public abstract class BasicSupport { return preselector + "/" + type + "/"; } - /** - * Get the first {@link Element} of the given class, or an empty span - * {@link Element} if none found. - * - * @param element - * the element to look in - * @param className - * the class to look for - * - * @return the value or an empty span {@link Element} - */ - static protected Element firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0); - } - - return new Element("span"); - } - - /** - * Get the first {@link Element} of the given tag, or an empty span - * {@link Element} if none found. - * - * @param element - * the element to look in - * @param tagName - * the tag to look for - * - * @return the value or an empty span {@link Element} - */ - static protected Element firstOrEmptyTag(Element element, String tagName) { - Elements subElements = element.getElementsByTag(tagName); - if (subElements.size() > 0) { - return subElements.get(0); - } - - return new Element("span"); - } - /** * Process the given element into text (each line is a text paragraph and * can be prepended with ">" signs to indicate a quote or sub-quote or @@ -342,6 +569,7 @@ public abstract class BasicSupport { final StringBuilder currentLine = new StringBuilder(); final List quoted = new ArrayList(); final List ignoredNodes = new ArrayList(); + final List footnotes = new ArrayList(); if (element != null) { new NodeTraversor(new NodeVisitor() { @@ -369,6 +597,18 @@ public abstract class BasicSupport { } } + //

<pre> check
+					if (!ignore) {
+						if (node instanceof Element) {
+							Element el = (Element) node;
+							if ("pre".equals(el.tagName())) {
+								currentLine.append(StringUtils
+										.unhtml(el.text()).trim());
+								ignore = true;
+							}
+						}
+					}
+
 					if (ignore) {
 						ignoredNodes.add(node);
 						return;
@@ -410,6 +650,11 @@ public abstract class BasicSupport {
 						if (block && currentLine.length() > 0) {
 							currentLine.append("\n");
 						}
+
+						if (!element.absUrl("href").trim().isEmpty()) {
+							footnotes.add(element.absUrl("href"));
+							currentLine.append("[" + footnotes.size() + "]");
+						}
 					} else if (node instanceof TextNode) {
 						TextNode textNode = (TextNode) node;
 						String line = StringUtil.normaliseWhitespace(textNode
@@ -442,11 +687,35 @@ public abstract class BasicSupport {
 			}
 		}
 
+		// Fix spaces and nbsp, collapse runs of consecutive blank lines into one
+		List linesCopy = new ArrayList(lines.size());
+		long blanks = 0;
 		for (int i = 0; i < lines.size(); i++) {
-			lines.set(i, lines.get(i).replace("  ", " ").trim());
+			String line = lines.get(i).replace(" ", " ") // nbsp -> space
+					.replace("  ", " ").trim();
+			if (line.isEmpty()) {
+				blanks++;
+			} else {
+				blanks = 0;
+			}
+
+			if (blanks < 2) {
+				linesCopy.add(line);
+			}
+		}
+
+		// Footnotes insertion
+		if (footnotes.size() > 0) {
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			for (int i = 0; i < footnotes.size(); i++) {
+				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+			}
 		}
 
-		return lines;
+		return linesCopy;
 	}
 
 	/**
@@ -457,7 +726,7 @@ public abstract class BasicSupport {
 	 * 
 	 * @return the reformated date, or the same value if it was not parsable
 	 */
-	static protected String date(String date) {
+	static private String date(String date) {
 		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 
 		long epoch = 0;
diff --git a/src/be/nikiroo/gofetch/support/ElementProcessor.java b/src/be/nikiroo/gofetch/support/ElementProcessor.java
new file mode 100644
index 0000000..69e291d
--- /dev/null
+++ b/src/be/nikiroo/gofetch/support/ElementProcessor.java
@@ -0,0 +1,67 @@
+package be.nikiroo.gofetch.support;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * Used to process an element into lines.
+ * 
+ * @author niki
+ */
+interface ElementProcessor {
+	/**
+	 * Detect if this node is a quote and should be treated as such.
+	 * 
+	 * @param node
+	 *            the node to check
+	 * @return TRUE if it is
+	 */
+	public boolean detectQuote(Node node);
+
+	/**
+	 * Process text content (will be called on each text element, allowing you
+	 * to modify it if needed).
+	 * 
+	 * @param text
+	 *            the text to process
+	 * 
+	 * @return the resulting text
+	 */
+	public String processText(String text);
+
+	/**
+	 * Ignore this node.
+	 * 
+	 * @param node
+	 *            the node to ignore
+	 * @return TRUE if it has to be ignored
+	 */
+	public boolean ignoreNode(Node node);
+
+	/**
+	 * Manually process this node (and return the manual processing value) if so
+	 * desired.
+	 * 

+ * If the node is manually processed, it and its children will not be + * automatically processed. + * + * @param node + * the node to optionally process + * + * @return NULL if not processed (will thus be automatically processed as + * usual), a {@link String} (may be empty) if we process it manually + * -- the given {@link String} will be used instead of the usual + * automatic processing if not NULL + */ + public String manualProcessing(Node node); + + /** + * This {@link Node} is a subtitle and should be treated as such + * (highlighted). + * + * @param node + * the node to check + * + * @return NULL if it is not a subtitle, the subtitle to use if it is + */ + public String isSubtitle(Node node); +} \ No newline at end of file diff --git a/src/be/nikiroo/gofetch/support/EreNumerique.java b/src/be/nikiroo/gofetch/support/EreNumerique.java index b6a7598..0b3efcf 100644 --- a/src/be/nikiroo/gofetch/support/EreNumerique.java +++ b/src/be/nikiroo/gofetch/support/EreNumerique.java @@ -1,20 +1,15 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; - -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; /** * Support list() throws IOException { - List list = new ArrayList(); - - for (String categ : new String[] { "informatique" }) { - URL url = new URL("https://www.erenumerique.fr/" + categ); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("item-details"); - for (Element article : articles) { - String id = ""; - String intUrl = ""; - String extUrl = ""; // nope - String 
title = ""; - String date = ""; - String author = ""; - String details = ""; - String body = ""; - - // MUST NOT fail: - Element dateElement = article // - .getElementsByTag("time").first(); - if (dateElement == null) { - continue; - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + for (String categ : new String[] { "Informatique" }) { + URL url = new URL("https://www.erenumerique.fr/" + + categ.toLowerCase()); + urls.add(new AbstractMap.SimpleEntry(url, categ)); + } - Element urlElement = article.getElementsByTag("a").first(); - if (urlElement != null) { - intUrl = urlElement.absUrl("href"); - } + return urls; + } - id = dateElement.attr("datetime").replace(":", "_") - .replace("+", "_"); - date = date(dateElement.attr("datetime")); + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("item-details"); + } - Element titleElement = article.getElementsByTag("h2").first(); - if (titleElement != null) { - title = StringUtils.unhtml(titleElement.text()).trim(); - } + @Override + protected String getArticleId(Document doc, Element article) { + return ""; // will use the date + } - Element authorElement = article.getElementsByClass( - "td-post-author-name").first(); - if (authorElement != null) { - authorElement = authorElement.getElementsByTag("a").first(); - } - if (authorElement != null) { - author = StringUtils.unhtml(authorElement.text()).trim(); - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h2").first(); + if (titleElement != null) { + return titleElement.text(); + } - Element contentElement = article.getElementsByClass( - "td-excerpt").first(); - if (contentElement != null) { - body = StringUtils.unhtml(contentElement.text()).trim(); - } + return ""; + } - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); - } + @Override + protected String 
getArticleAuthor(Document doc, Element article) { + Element authorElement = article.getElementsByClass( + "td-post-author-name").first(); + if (authorElement != null) { + authorElement = authorElement.getElementsByTag("a").first(); + } + if (authorElement != null) { + return authorElement.text(); } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article // + .getElementsByTag("time").first(); + if (dateElement != null) { + return dateElement.attr("datetime"); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - try { - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementsByTag("article").first(); - if (article != null) { - article = article.getElementsByAttributeValue("itemprop", - "articleBody").first(); - } - if (article != null) { - for (String line : toLines(article, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - return node.attr("class").contains("chapo"); - } - - @Override - public String isSubtitle(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.tagName().startsWith("h") - && element.tagName().length() == 2) { - return element.text(); - } - } - return null; - } - })) { - fullContent += line + "\n"; - } + return ""; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); - } + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + return currentCategory; + } - // Get comments URL then parse it, if possible - Element posts = doc.getElementsByClass("comment-list").first(); + @Override + protected String getArticleDetails(Document 
doc, Element article) { + return ""; + } - story.setFullContent(fullContent); - story.setComments(getComments(posts)); - } finally { - if (in != null) { - in.close(); - } + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element urlElement = article.getElementsByTag("a").first(); + if (urlElement != null) { + return urlElement.absUrl("href"); } + + return ""; } - private List getComments(Element posts) { - List comments = new ArrayList(); - if (posts != null) { - for (Element post : posts.children()) { - if (!post.hasClass("comment")) { - continue; - } + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + Element contentElement = article.getElementsByClass("td-excerpt") + .first(); + if (contentElement != null) { + return contentElement.text(); + } - String id = ""; - String author = ""; - String title = ""; - String date = ""; - List content = new ArrayList(); + return ""; + } - Element authorE = post.getElementsByTag("footer").first(); - if (authorE != null) { - authorE = authorE.getElementsByTag("cite").first(); - } - if (authorE != null) { - author = StringUtils.unhtml(authorE.text()).trim(); - } + @Override + protected Element getFullArticle(Document doc) { + Element article = doc.getElementsByTag("article").first(); + if (article != null) { + article = article.getElementsByAttributeValue("itemprop", + "articleBody").first(); + } + + return article; + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return getSubCommentElements(doc.getElementsByClass("comment-list") + .first()); + } - Element idE = post.getElementsByTag("a").first(); - if (idE != null) { - id = idE.attr("id"); - Element dateE = idE.getElementsByTag("span").first(); - if (dateE != null) { - date = date(dateE.attr("data-epoch")); + @Override + protected ElementProcessor 
getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + return node.attr("class").contains("chapo"); + } + + @Override + public String isSubtitle(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.tagName().startsWith("h") + && element.tagName().length() == 2) { + return element.text(); } } + return null; + } + }; + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return getSubCommentElements(container.getElementsByClass("children") + .first()); + } + + @Override + protected String getCommentId(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + return idE.attr("id"); + } + + return ""; + } + + @Override + protected String getCommentAuthor(Element post) { + // Since we have no title, we switch with author + return ""; + } + + @Override + protected String getCommentTitle(Element post) { + // Since we have no title, we switch with author + Element authorE = post.getElementsByTag("footer").first(); + if (authorE != null) { + authorE = authorE.getElementsByTag("cite").first(); + } + if (authorE != null) { + return authorE.text(); + } + + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + Element dateE = idE.getElementsByTag("span").first(); + if (dateE != null) { + return dateE.attr("data-epoch"); + } + } - Element contentE = post.getElementsByClass("comment-content") - .first(); - if (contentE != null) { - for (String line : toLines(contentE, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - // TODO: ignore headlines/pub - if (node instanceof Element) { - Element el = (Element) node; - if ("h4".equals(el.tagName())) { - return true; - } - } - - return false; - } - })) { - content.add(line); + return ""; + } + + @Override + 
protected Element getCommentContentElement(Element post) { + Element contentE = post.getElementsByClass("comment-content").first(); + return contentE; + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element el = (Element) node; + if ("h4".equals(el.tagName())) { + return true; } } - // Since we have no title but still an author, let's switch: - title = author; - author = ""; - Comment comment = new Comment(id, author, title, date, content); - comments.add(comment); + return false; + } + }; + } - Element children = post.getElementsByClass("children").first(); - comment.addAll(getComments(children)); + private List getSubCommentElements(Element posts) { + List commentElements = new ArrayList(); + if (posts != null) { + for (Element possibleCommentElement : posts.children()) { + if (possibleCommentElement.hasClass("comment")) { + commentElements.add(possibleCommentElement); + } } } - return comments; + return commentElements; } } diff --git a/src/be/nikiroo/gofetch/support/LWN.java b/src/be/nikiroo/gofetch/support/LWN.java index c033104..144fdc9 100644 --- a/src/be/nikiroo/gofetch/support/LWN.java +++ b/src/be/nikiroo/gofetch/support/LWN.java @@ -1,16 +1,16 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; +import org.jsoup.nodes.TextNode; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; @@ -27,162 +27,236 @@ public class LWN extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); + public void 
fetch(Story story) throws IOException { + // Do not try the paid-for stories... + if (!story.getTitle().startsWith("[$]")) { + super.fetch(story); + } else { + String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; + story.setFullContent(fullContent); + story.setComments(new ArrayList()); + } + } - URL url = new URL("https://lwn.net/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("pure-u-1"); - for (Element article : articles) { - Elements titles = article.getElementsByClass("Headline"); - Elements listings = article.getElementsByClass("BlurbListing"); - if (titles.size() == 0) { - continue; - } - if (listings.size() == 0) { - continue; - } + @Override + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://lwn.net/"), "")); + return urls; + } - Element listing = listings.get(0); - if (listing.children().size() < 2) { - continue; - } + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("pure-u-1"); + } - String title = titles.get(0).text(); - String details = listing.children().get(0).text(); - String body = ""; - // All but the first and two last children - for (int i = 1; i < listing.children().size() - 2; i++) { - Element e = listing.children().get(i); - body = body.trim() + " " + e.text().trim(); - } - body = body.trim(); + @Override + protected String getArticleId(Document doc, Element article) { + return getArticleIntUrl(doc, article).replaceAll("[^0-9]", ""); + } - int pos; + @Override + protected String getArticleTitle(Document doc, Element article) { + Element title = article.getElementsByClass("Headline").first(); + if (title != null) { + return title.text(); + } - String categ = ""; - pos = details.indexOf("]"); - if (pos >= 0) { - categ = details.substring(1, 
pos).trim(); - } + return ""; + } - String author = ""; - pos = details.indexOf(" by "); - if (pos >= 0) { - author = details.substring(pos + " by ".length()).trim(); - } + @Override + protected String getArticleAuthor(Document doc, Element article) { + String author = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf(" by "); + if (pos >= 0) { + author = details.substring(pos + " by ".length()).trim(); + } + + return author; + } - String date = ""; - pos = details.indexOf(" Posted "); + @Override + protected String getArticleDate(Document doc, Element article) { + String date = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf(" Posted "); + if (pos >= 0) { + date = details.substring(pos + " Posted ".length()).trim(); + pos = date.indexOf(" by "); if (pos >= 0) { - date = details.substring(pos + " Posted ".length()).trim(); - pos = date.indexOf(" by "); - if (pos >= 0) { - date = date.substring(0, pos).trim(); - } + date = date.substring(0, pos).trim(); } + } - // We extracted everything from details so... 
- details = ""; - - String id = ""; - String intUrl = ""; - String extUrl = ""; - for (Element idElem : article.getElementsByTag("a")) { - // Last link is the story link - intUrl = idElem.absUrl("href"); - pos = intUrl.indexOf("#Comments"); - if (pos >= 0) { - intUrl = intUrl.substring(0, pos - 1); - } - id = intUrl.replaceAll("[^0-9]", ""); - } + return date; + } - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + String categ = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf("]"); + if (pos >= 0) { + categ = details.substring(1, pos).trim(); } - return list; + return categ; } @Override - public void fetch(Story story) throws IOException { - List comments = new ArrayList(); - String fullContent = story.getContent(); + protected String getArticleDetails(Document doc, Element article) { + return ""; // We actually extract all the values + } - // Do not try the paid-for stories... 
- if (!story.getTitle().startsWith("[$]")) { - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements fullContentElements = doc - .getElementsByClass("ArticleText"); - if (fullContentElements.size() > 0) { - // comments.addAll(getComments(listing.get(0))); - fullContent = fullContentElements.get(0).text(); + @Override + protected String getArticleIntUrl(Document doc, Element article) { + String intUrl = ""; + for (Element idElem : article.getElementsByTag("a")) { + // Last link is the story link + intUrl = idElem.absUrl("href"); + int pos = intUrl.indexOf("#Comments"); + if (pos >= 0) { + intUrl = intUrl.substring(0, pos - 1); } + } - Elements listing = doc.getElementsByClass("lwn-u-1"); - if (listing.size() > 0) { - comments.addAll(getComments(listing.get(0))); + return intUrl; + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + Element listing = article.getElementsByClass("BlurbListing").first(); + if (listing != null && listing.children().size() >= 2) { + String content = ""; + + // All but the first and two last children + for (int i = 1; i < listing.children().size() - 2; i++) { + Element e = listing.children().get(i); + content = content.trim() + " " + e.text().trim(); } - } else { - fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; + + return content; } - story.setFullContent(fullContent); - story.setComments(comments); + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementsByClass("ArticleText").first(); } - private List getComments(Element listing) { - List comments = new ArrayList(); - for (Element commentElement : listing.children()) { - if (commentElement.hasClass("CommentBox")) { - Comment comment = 
getComment(commentElement); - if (!comment.isEmpty()) { - comments.add(comment); + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return doc.getElementsByClass("lwn-u-1"); + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element el = (Element) node; + if ("Log in".equals(el.text().trim())) { + return true; + } + } else if (node instanceof TextNode) { + TextNode text = (TextNode) node; + String t = text.text().trim(); + if (t.equals("(") || t.equals("to post comments)")) { + return true; + } } - } else if (commentElement.hasClass("Comment")) { - if (comments.size() > 0) { - comments.get(comments.size() - 1).addAll( - getComments(commentElement)); + + return false; + } + }; + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + List commentElements = new ArrayList(); + if (container != null) { + for (Element possibleCommentElement : container.children()) { + if (possibleCommentElement.hasClass("CommentBox")) { + commentElements.add(possibleCommentElement); + } else if (possibleCommentElement.hasClass("Comment")) { + commentElements.add(possibleCommentElement); } } } - return comments; + + return commentElements; } - private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "CommentTitle").text(); - String author = firstOrEmpty(commentElement, "CommentPoster").text(); + @Override + protected String getCommentId(Element post) { + return post.id(); + } - String date = ""; - int pos = author.lastIndexOf(" by "); - if (pos >= 0) { - date = author.substring(0, pos).trim(); - author = author.substring(pos + " by ".length()).trim(); + @Override + protected String getCommentAuthor(Element post) { + Element detailsE = post.getElementsByClass("CommentPoster").first(); + if (detailsE != null) { + String 
details = detailsE.text(); + + int pos = details.lastIndexOf(" by "); + if (pos >= 0) { + details = details.substring(pos + " by ".length()).trim(); - if (author.startsWith("Posted ")) { - author = author.substring("Posted ".length()).trim(); + if (details.startsWith("Posted ")) { + return details.substring("Posted ".length()).trim(); + } } } - Element content = null; - Elements commentBodyElements = commentElement - .getElementsByClass("CommentBody"); - if (commentBodyElements.size() > 0) { - content = commentBodyElements.get(0); + return ""; + } + + @Override + protected String getCommentTitle(Element post) { + Element title = post.getElementsByClass("CommentTitle").first(); + if (title != null) { + return title.text(); + } + + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element detailsE = post.getElementsByClass("CommentPoster").first(); + if (detailsE != null) { + String details = detailsE.text(); + + int pos = details.lastIndexOf(" by "); + if (pos >= 0) { + return details.substring(0, pos).trim(); + } } - Comment comment = new Comment(commentElement.id(), author, title, date, - toLines(content)); + return ""; + } - return comment; + @Override + protected Element getCommentContentElement(Element post) { + return post.getElementsByClass("CommentBody").first(); } - private List toLines(Element element) { - return toLines(element, new BasicElementProcessor() { + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { @Override public String processText(String text) { while (text.startsWith(">")) { // comments @@ -216,6 +290,16 @@ public class LWN extends BasicSupport { return false; } - }); + }; + } + + private String getArticleDetailsReal(Element article) { + Element listing = article.getElementsByClass("BlurbListing").first(); + // Valid articles have 2+ listings + if (listing != null && listing.children().size() >= 2) { + return listing.children().get(0).text(); + } + + 
return ""; } } diff --git a/src/be/nikiroo/gofetch/support/LeMonde.java b/src/be/nikiroo/gofetch/support/LeMonde.java index 235f7ee..1f7aea7 100644 --- a/src/be/nikiroo/gofetch/support/LeMonde.java +++ b/src/be/nikiroo/gofetch/support/LeMonde.java @@ -1,19 +1,15 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; - -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; /** * Support http://www.lemonde.fr/. @@ -27,98 +23,171 @@ public class LeMonde extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); - - for (String topic : new String[] { "international", "politique", - "societe", "sciences" }) { - URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByTag("article"); - for (Element article : articles) { - Elements times = article.getElementsByTag("time"); - Elements titleElements = article.getElementsByTag("h3"); - Elements contentElements = article.getElementsByClass("txt3"); - if (times.size() > 0 && titleElements.size() > 0 - && contentElements.size() > 0) { - String id = times.get(0).attr("datetime").replace(":", "_") - .replace("+", "_"); - String title = titleElements.get(0).text(); - String date = date(titleElements.get(0).text()); - String content = contentElements.get(0).text(); - String intUrl = ""; - String extUrl = ""; - String author = ""; - String details = ""; - - Elements detailsElements = article - .getElementsByClass("signature"); - if (detailsElements.size() > 0) { - author = 
detailsElements.get(0).text(); - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + for (String topic : new String[] { "International", "Politique", + "Société", "Sciences" }) { + URL url = new URL("http://www.lemonde.fr/" + + topic.toLowerCase().replace("é", "e") + "/1.html"); + urls.add(new AbstractMap.SimpleEntry(url, topic)); + } - Elements links = titleElements.get(0).getElementsByTag("a"); - if (links.size() > 0) { - intUrl = links.get(0).absUrl("href"); - list.add(new Story(getType(), id, title, author, date, - topic, details, intUrl, extUrl, content)); - } - } + return urls; + } + + @Override + protected List getArticles(Document doc) { + return doc.getElementsByTag("article"); + } + + @Override + protected String getArticleId(Document doc, Element article) { + return ""; // will use the date + } + + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h3").first(); + if (titleElement != null) { + return titleElement.text(); + } + + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + Element detailsElement = article.getElementsByClass("signature") + .first(); + if (detailsElement != null) { + return detailsElement.text(); + } + + return ""; + } + + @Override + protected String getArticleDate(Document doc, Element article) { + Element timeElement = article.getElementsByTag("time").first(); + if (timeElement != null) { + return timeElement.attr("datetime"); + } + + return ""; + } + + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + return currentCategory; + } + + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h3").first(); + if (titleElement != null) { + Element 
link = titleElement.getElementsByTag("a").first(); + if (link != null) { + return link.absUrl("href"); } } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); - List comments = new ArrayList(); + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } - // Note: no comments on this site as far as I can see (or maybe with - // some javascript, I need to check...) + @Override + protected String getArticleContent(Document doc, Element article) { + Element contentElement = article.getElementsByClass("txt3").first(); + if (contentElement != null) { + return contentElement.text(); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementById("articleBody"); - if (article != null) { - for (String line : toLines(article, new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.hasClass("lire")) { - return true; - } - } + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementById("articleBody"); + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return null; + } - return false; + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.hasClass("lire")) { + return true; + } } - @Override - public String isSubtitle(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.hasClass("intertitre")) { - return element.text(); - } + return false; + } + + @Override + public String isSubtitle(Node node) { + if (node 
instanceof Element) { + Element element = (Element) node; + if (element.hasClass("intertitre")) { + return element.text(); } - return null; } - })) { - fullContent += line + "\n"; + return null; } + }; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); - } + // No comment on this site, horrible javascript system - story.setFullContent(fullContent); - story.setComments(comments); + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return null; + } + + @Override + protected String getCommentId(Element post) { + return null; + } + + @Override + protected String getCommentAuthor(Element post) { + return null; + } + + @Override + protected String getCommentTitle(Element post) { + return null; + } + + @Override + protected String getCommentDate(Element post) { + return null; + } + + @Override + protected Element getCommentContentElement(Element post) { + return null; + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return null; } } diff --git a/src/be/nikiroo/gofetch/support/Pipedot.java b/src/be/nikiroo/gofetch/support/Pipedot.java index 9ea70ff..149a20c 100644 --- a/src/be/nikiroo/gofetch/support/Pipedot.java +++ b/src/be/nikiroo/gofetch/support/Pipedot.java @@ -1,20 +1,17 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.Elements; -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; - /** * Support https://pipedot.org/. 
* @@ -27,151 +24,207 @@ public class Pipedot extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://pipedot.org/"), "")); + return urls; + } - URL url = new URL("https://pipedot.org/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("story"); - for (Element article : articles) { - Elements titles = article.getElementsByTag("h1"); - if (titles.size() == 0) { - continue; - } + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("story"); + } - Element title = titles.get(0); + @Override + protected String getArticleId(Document doc, Element article) { + // Don't try on bad articles + if (getArticleTitle(doc, article).isEmpty()) { + return ""; + } - String id = ""; - for (Element idElem : article.getElementsByTag("a")) { - if (idElem.attr("href").startsWith("/pipe/")) { - id = idElem.attr("href").substring("/pipe/".length()); - break; - } + for (Element idElem : article.getElementsByTag("a")) { + if (idElem.attr("href").startsWith("/pipe/")) { + return idElem.attr("href").substring("/pipe/".length()); } + } - String intUrl = null; - String extUrl = null; - - Elements links = article.getElementsByTag("a"); - if (links.size() > 0) { - intUrl = links.get(0).absUrl("href"); - } + return ""; + } - // Take first ext URL as original source - for (Element link : links) { - String uuu = link.absUrl("href"); - if (!uuu.isEmpty() && !uuu.contains("pipedot.org/")) { - extUrl = uuu; - break; - } - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element title = article.getElementsByTag("h1").first(); + if (title != null) { + return title.text(); + } - String details = ""; - Elements detailsElements = 
article.getElementsByTag("div"); - if (detailsElements.size() > 0) { - details = detailsElements.get(0).text().trim(); - } + return ""; + } - String author = ""; - int pos = details.indexOf("by "); + @Override + protected String getArticleAuthor(Document doc, Element article) { + String value = getArticleDetailsReal(article); + int pos = value.indexOf("by "); + if (pos >= 0) { + value = value.substring(pos + "by ".length()).trim(); + pos = value.indexOf(" in "); if (pos >= 0) { - author = details.substring(pos + "by ".length()).trim(); - pos = author.indexOf(" in "); - if (pos >= 0) { - author = author.substring(0, pos).trim(); - } + value = value.substring(0, pos).trim(); } - String categ = ""; - pos = details.indexOf(" in "); + return value; + } + + return ""; + } + + @Override + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article.getElementsByTag("time").first(); + if (dateElement != null) { + return dateElement.attr("datetime"); + } + + return ""; + } + + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + String value = getArticleDetailsReal(article); + int pos = value.indexOf(" in "); + if (pos >= 0) { + value = value.substring(pos + " in ".length()).trim(); + pos = value.indexOf(" on "); if (pos >= 0) { - categ = details.substring(pos + " in ".length()).trim(); - pos = categ.indexOf(" on "); - if (pos >= 0) { - categ = categ.substring(0, pos).trim(); - } + value = value.substring(0, pos).trim(); } - String date = ""; - Element dateElement = article.getElementsByTag("time").first(); - if (dateElement != null) { - date = date(dateElement.attr("datetime")); - } + return value; + } - // We already have all the details (date, author, id, categ) - details = ""; + return ""; + } - String body = ""; - for (Element elem : article.children()) { - String tag = elem.tag().toString(); - if (!tag.equals("header") && !tag.equals("footer")) { - body = elem.text(); - break; 
- } + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; // We already extracted all the info + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element link = article.getElementsByTag("a").first(); + if (link != null) { + return link.absUrl("href"); + } + + return ""; + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + Element link = article.getElementsByTag("a").first(); + if (link != null) { + String possibleExtLink = link.absUrl("href").trim(); + if (!possibleExtLink.isEmpty() + && !possibleExtLink.contains("pipedot.org/")) { + return possibleExtLink; } + } - list.add(new Story(getType(), id, title.text(), author, date, - categ, details, intUrl, extUrl, body)); + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + for (Element elem : article.children()) { + String tag = elem.tagName(); + if (!tag.equals("header") && !tag.equals("footer")) { + return elem.text(); + } } - return list; + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return null; + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return getCommentElements(doc.getElementsByTag("main").first()); + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor(); } @Override - public void fetch(Story story) throws IOException { - List comments = new ArrayList(); + protected List getCommentCommentPosts(Document doc, + Element container) { - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements listing = doc.getElementsByTag("main"); - if (listing.size() > 0) { - comments.addAll(getComments(listing.get(0))); + if (container != null) { + container = container.getElementsByClass("comment-outline").first(); } - 
story.setComments(comments); + return getCommentElements(container); } - private List getComments(Element listing) { - List comments = new ArrayList(); - for (Element commentElement : listing.children()) { - if (commentElement.hasClass("comment")) { - Comment comment = getComment(commentElement); - if (!comment.isEmpty()) { - comments.add(comment); - } + @Override + protected String getCommentId(Element post) { + return post.id(); + } + + @Override + protected String getCommentAuthor(Element post) { + Element authorDateE = post.getElementsByTag("h3").first(); + if (authorDateE != null) { + String authorDate = authorDateE.text(); + int pos = authorDate.lastIndexOf(" on "); + if (pos >= 0) { + return authorDate.substring(0, pos).trim(); } } - return comments; - } - private Comment getComment(Element commentElement) { - String title = firstOrEmptyTag(commentElement, "h3").text(); - String author = firstOrEmpty(commentElement, "h4").text(); - Element content = firstOrEmpty(commentElement, "comment-body"); + return ""; + } - String date = ""; - int pos = author.lastIndexOf(" on "); - if (pos >= 0) { - date = author.substring(pos + " on ".length()).trim(); - author = author.substring(0, pos).trim(); + @Override + protected String getCommentTitle(Element post) { + Element title = post.getElementsByTag("h3").first(); + if (title != null) { + return title.text(); } - Comment comment = new Comment(commentElement.id(), author, title, date, - toLines(content)); + return ""; + } - Elements commentOutline = commentElement - .getElementsByClass("comment-outline"); - if (commentOutline.size() > 0) { - comment.addAll(getComments(commentOutline.get(0))); + @Override + protected String getCommentDate(Element post) { + Element authorDateE = post.getElementsByTag("h3").first(); + if (authorDateE != null) { + String authorDate = authorDateE.text(); + int pos = authorDate.lastIndexOf(" on "); + if (pos >= 0) { + return authorDate.substring(pos + " on ".length()).trim(); + } } - return 
comment; + return ""; + } + + @Override + protected Element getCommentContentElement(Element post) { + return post.getElementsByClass("comment-body").first(); } - private List toLines(Element element) { - return toLines(element, new BasicElementProcessor() { + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { @Override public boolean detectQuote(Node node) { if (node instanceof Element) { @@ -184,6 +237,27 @@ public class Pipedot extends BasicSupport { return false; } - }); + }; + } + + private String getArticleDetailsReal(Element article) { + Elements detailsElements = article.getElementsByTag("div"); + if (detailsElements.size() > 0) { + return detailsElements.get(0).text().trim(); + } + + return ""; + } + + private List getCommentElements(Element container) { + List commentElements = new ArrayList(); + if (container != null) { + for (Element commentElement : container.children()) { + if (commentElement.hasClass("comment")) { + commentElements.add(commentElement); + } + } + } + return commentElements; } } diff --git a/src/be/nikiroo/gofetch/support/Slashdot.java b/src/be/nikiroo/gofetch/support/Slashdot.java index b3a779d..43ce13d 100644 --- a/src/be/nikiroo/gofetch/support/Slashdot.java +++ b/src/be/nikiroo/gofetch/support/Slashdot.java @@ -1,21 +1,17 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.Elements; -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; - /** * Support https://slashdot.org/. 
* @@ -28,145 +24,238 @@ public class Slashdot extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); - - URL url = new URL("https://slashdot.org/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByTag("header"); - for (Element article : articles) { - Elements titles = article.getElementsByClass("story-title"); - if (titles.size() == 0) { - continue; - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://slashdot.org/"), "")); + return urls; + } - Element title = titles.get(0); + @Override + protected List getArticles(Document doc) { + return doc.getElementsByTag("header"); + } - String id = "" + title.attr("id"); + @Override + protected String getArticleId(Document doc, Element article) { + Element title = article.getElementsByClass("story-title").first(); + if (title != null) { + String id = title.attr("id"); if (id.startsWith("title-")) { id = id.substring("title-".length()); } + return id; + } + + return ""; + } + + @Override + protected String getArticleTitle(Document doc, Element article) { + Element title = article.getElementsByClass("story-title").first(); + if (title != null) { + return title.text(); + } + + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + // details: "Posted by AUTHOR on DATE from the further-crackdown dept." 
+ String details = getArticleDetailsReal(article); + int pos = details.indexOf(" on "); + if (details.startsWith("Posted by ") && pos >= 0) { + return details.substring("Posted by ".length(), pos).trim(); + } + + return ""; + } + + @Override + protected String getArticleDate(Document doc, Element article) { + // Do not try bad articles + if (getArticleId(doc, article).isEmpty()) { + return ""; + } + + Element dateElement = doc.getElementsByTag("time").first(); + if (dateElement != null) { + String date = dateElement.text().trim(); + if (date.startsWith("on ")) { + date = date.substring("on ".length()); + } + + return date; + } + + return ""; + } + + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + Element categElement = doc.getElementsByClass("topic").first(); + if (categElement != null) { + return categElement.text(); + } + + return ""; + } + + @Override + protected String getArticleDetails(Document doc, Element article) { + // details: "Posted by AUTHOR on DATE from the further-crackdown dept." 
+ String details = getArticleDetailsReal(article); + int pos = details.indexOf(" from the "); + if (pos >= 0) { + return details.substring(pos).trim(); + } + + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element title = article.getElementsByClass("story-title").first(); + if (title != null) { Elements links = title.getElementsByTag("a"); - String intUrl = ""; - String extUrl = ""; if (links.size() > 0) { - intUrl = links.get(0).absUrl("href"); + return links.get(0).absUrl("href"); } + } + return ""; + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + Element title = article.getElementsByClass("story-title").first(); + if (title != null) { + Elements links = title.getElementsByTag("a"); if (links.size() > 1) { - extUrl = links.get(1).absUrl("href"); + return links.get(1).absUrl("href"); } + } + return ""; + } - String details = ""; - Elements detailsElements = article.getElementsByClass("details"); - if (detailsElements.size() > 0) { - details = detailsElements.get(0).text(); - } + @Override + protected String getArticleContent(Document doc, Element article) { + Element contentElement = doc // + .getElementById("text-" + getArticleId(doc, article)); + if (contentElement != null) { + return contentElement.text(); + } - // details: - // "Posted by AUTHOR on DATE from the further-crackdown dept." 
- String author = ""; - int pos = details.indexOf(" on "); - if (details.startsWith("Posted by ") && pos >= 0) { - author = details.substring("Posted by ".length(), pos).trim(); - } - pos = details.indexOf(" from the "); - if (pos >= 0) { - details = details.substring(pos).trim(); - } + return ""; + } - String body = ""; - Element bodyElement = doc.getElementById("text-" + id); - if (bodyElement != null) { - body = bodyElement.text(); - } + @Override + protected Element getFullArticle(Document doc) { + return null; + } - String categ = ""; - Element categElement = doc.getElementsByClass("topic").first(); - if (categElement != null) { - categ = StringUtils.unhtml(categElement.text()).trim(); + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + List commentElements = new ArrayList(); + Element listing = doc.getElementById("commentlisting"); + if (listing != null) { + for (Element commentElement : listing.children()) { + if (commentElement.hasClass("comment")) { + commentElements.add(commentElement); + } } + } + + return commentElements; + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return null; + } - String date = ""; - Element dateElement = doc.getElementsByTag("time").first(); - if (dateElement != null) { - date = StringUtils.unhtml(dateElement.text()).trim(); - if (date.startsWith("on ")) { - date = date.substring("on ".length()); + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + List commentElements = new ArrayList(); + for (Element child : container.children()) { + if (child.id().contains("commtree_")) { + for (Element sub : child.children()) { + if (sub.hasClass("comment")) { + commentElements.add(sub); + } } } + } + + return commentElements; + } - list.add(new Story(getType(), id, title.text(), author, date, - categ, details, intUrl, extUrl, body)); + @Override + protected String getCommentId(Element post) { + if (post.hasClass("hidden")) { + return 
""; } - return list; + return post.id(); } @Override - public void fetch(Story story) throws IOException { - List comments = new ArrayList(); + protected String getCommentAuthor(Element post) { + if (post.hasClass("hidden")) { + return ""; + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element listing = doc.getElementById("commentlisting"); - if (listing != null) { - comments.addAll(getComments(listing)); + Element author = post.getElementsByClass("by").first(); + if (author != null) { + return author.text(); } - story.setComments(comments); + return ""; } - private List getComments(Element listing) { - List comments = new ArrayList(); - Comment lastComment = null; - for (Element commentElement : listing.children()) { - if (commentElement.hasClass("comment")) { - if (!commentElement.hasClass("hidden")) { - lastComment = getComment(commentElement); - comments.add(lastComment); - } + @Override + protected String getCommentTitle(Element post) { + if (post.hasClass("hidden")) { + return ""; + } - List subComments = new ArrayList(); - for (Element child : commentElement.children()) { - if (child.id().contains("commtree_")) { - subComments.addAll(getComments(child)); - } - } + Element title = post.getElementsByClass("title").first(); + if (title != null) { + return title.text(); + } - if (lastComment == null) { - comments.addAll(subComments); - } else { - lastComment.addAll(subComments); - } - } + return ""; + } + + @Override + protected String getCommentDate(Element post) { + if (post.hasClass("hidden")) { + return ""; } - return comments; + Element date = post.getElementsByClass("otherdetails").first(); + if (date != null) { + return date.text(); + } + + return ""; } - /** - * Get a comment from the given element. - * - * @param commentElement - * the element to get the comment of. 
- * - * @return the comment, NOT including sub-comments - */ - private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "title").text(); - String author = firstOrEmpty(commentElement, "by").text(); - String date = firstOrEmpty(commentElement, "otherdetails").text(); - Element content = firstOrEmpty(commentElement, "commentBody"); + @Override + protected Element getCommentContentElement(Element post) { + if (post.hasClass("hidden")) { + return null; + } - return new Comment(commentElement.id(), author, title, date, - toLines(content)); + return post.getElementsByClass("commentBody").first(); } - private List toLines(Element element) { - return toLines(element, new BasicElementProcessor() { + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { @Override public String processText(String text) { while (text.startsWith(">")) { // comment in one-liners @@ -192,6 +281,15 @@ public class Slashdot extends BasicSupport { return false; } - }); + }; + } + + private String getArticleDetailsReal(Element article) { + Element detailsElement = article.getElementsByClass("details").first(); + if (detailsElement != null) { + return detailsElement.text(); + } + + return ""; } } diff --git a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java index 7fb1524..1195d3d 100644 --- a/src/be/nikiroo/gofetch/support/TheRegister.java +++ b/src/be/nikiroo/gofetch/support/TheRegister.java @@ -3,18 +3,20 @@ package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; import 
be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; /** * Support commentReplies = new HashMap(); + @Override public String getDescription() { return "The Register: Biting the hand that feeds IT"; } @Override - public List list() throws IOException { - List list = new ArrayList(); + public void fetch(Story story) throws IOException { + super.fetch(story); - URL url = new URL("https://www.theregister.co.uk/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("story_link"); - for (Element article : articles) { - if (article.getElementsByClass("time_stamp").isEmpty()) { - // Some articles are doubled, - // but the second copy without the time info - continue; + // Update comment replies + List comments = new ArrayList(); + for (Comment comment : story.getComments()) { + if (commentReplies.containsKey(comment.getId())) { + String inReplyToId = commentReplies.get(comment.getId()); + Comment inReplyTo = story.getCommentById(inReplyToId); + if (inReplyTo != null) { + inReplyTo.add(comment); + } else { + comments.add(comment); + } + } else { + comments.add(comment); } + } + story.setComments(comments); + } - String id = ""; - String intUrl = article.absUrl("href"); - String extUrl = ""; // nope - String title = ""; - String date = ""; - String details = ""; - String body = ""; - String categ = ""; - String author = ""; // nope - - Element categElement = article.previousElementSibling(); - if (categElement != null) { - categ = categElement.text().trim(); - } + @Override + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://www.theregister.co.uk/"), "")); + return urls; + } - Element titleElement = article.getElementsByTag("h4").first(); - if (titleElement != null) { - title = StringUtils.unhtml(titleElement.text()).trim(); - } + 
@Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("story_link"); + } - Element dateElement = article.getElementsByClass("time_stamp") - .first(); - if (dateElement != null) { - String epochS = dateElement.attr("data-epoch"); - if (epochS != null && !epochS.isEmpty()) { - id = epochS; - date = date(epochS); - } - } + @Override + protected String getArticleId(Document doc, Element article) { + return ""; + } - if (id.isEmpty()) { - // fallback - id = article.attr("href").replace("/", "_"); - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h4").first(); + if (titleElement != null) { + return titleElement.text(); + } - Element detailsElement = article.getElementsByClass("standfirst") - .first(); - details = "(" + date + ") "; - if (detailsElement != null) { - details += StringUtils.unhtml(detailsElement.text()).trim(); - } + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + return ""; + } - // We have some "details" but no content, so we switch them: - body = details; - details = ""; - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); + @Override + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article.getElementsByClass("time_stamp").first(); + if (dateElement != null) { + return dateElement.attr("data-epoch"); } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); - List comments = new ArrayList(); - story.setComments(comments); + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + Element categElement = article.previousElementSibling(); + if (categElement != null) { + return categElement.text(); + } + + return ""; + } + + @Override + protected String getArticleDetails(Document doc, 
Element article) { + // We have some "details" but no content, so we switch them: + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + return article.absUrl("href"); + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + // We have some "details" but no content, so we switch them: + Element detailsElement = article.getElementsByClass("standfirst") + .first(); + if (detailsElement != null) { + return detailsElement.text(); + } + + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementById("body"); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + List commentElements = new ArrayList(); + + // Get comments URL then parse it try { - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementById("body"); - if (article != null) { - for (String line : toLines(article, - new BasicElementProcessor() { - // TODO: ignore headlines/pub - })) { - fullContent += line + "\n"; + URL url = new URL("https://forums.theregister.co.uk/forum/1" + + intUrl.getPath()); + InputStream in = downloader.open(url); + try { + doc = DataUtil.load(in, "UTF-8", url.toString()); + Element posts = doc.getElementById("forum_posts"); + if (posts != null) { + for (Element post : posts.getElementsByClass("post")) { + commentElements.add(post); + Element inReplyTo = post.getElementsByClass( + "in-reply-to").first(); + if (inReplyTo != null) { + String parentId = inReplyTo.absUrl("href"); + if (parentId != null && parentId.contains("/")) { + int i = parentId.lastIndexOf('/'); + parentId = parentId.substring(i + 1); + + commentReplies + .put(getCommentId(post), parentId); + } + } + } } + } finally { + in.close(); + } + 
} catch (IOException e) { + } + + return commentElements; + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor(); + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return null; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); + @Override + protected String getCommentId(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + String id = idE.attr("id"); + if (id.startsWith("c_")) { + id = id.substring(2); } - story.setFullContent(fullContent); - - // Get comments URL then parse it - in.close(); - in = null; - in = downloader - .open(new URL("https://forums.theregister.co.uk/forum/1" - + url.getPath())); - doc = DataUtil.load(in, "UTF-8", url.toString()); - Element posts = doc.getElementById("forum_posts"); - if (posts != null) { - for (Element post : posts.getElementsByClass("post")) { - String id = ""; - String author = ""; - String title = ""; - String date = ""; - List content = new ArrayList(); - - Element idE = post.getElementsByTag("a").first(); - if (idE != null) { - id = idE.attr("id"); - if (id.startsWith("c_")) { - id = id.substring(2); - } + return id; + } - Element dateE = idE.getElementsByTag("span").first(); - if (dateE != null) { - date = date(dateE.attr("data-epoch")); - } - } + return ""; + } - Element authorE = post.getElementsByClass("author").first(); - if (authorE != null) { - author = StringUtils.unhtml(authorE.text()).trim(); - } + @Override + protected String getCommentAuthor(Element post) { + Element author = post.getElementsByClass("author").first(); + if (author != null) { + return author.text(); + } - Element titleE = post.getElementsByTag("h4").first(); - if (titleE != null) { - title = StringUtils.unhtml(titleE.text()).trim(); - } + return ""; + } - 
Element contentE = post.getElementsByClass("body").first(); - if (contentE != null) { - for (String line : toLines(contentE, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - // TODO: ignore headlines/pub - - // Remove the comment title (which has - // already been processed earlier) - if (node instanceof Element) { - Element el = (Element) node; - if ("h4".equals(el.tagName())) { - return true; - } - } - - return false; - } - })) { - content.add(line); - } - } + @Override + protected String getCommentTitle(Element post) { + Element title = post.getElementsByTag("h4").first(); + if (title != null) { + return title.text(); + } - Comment comment = new Comment(id, author, title, date, - content); - Comment parent = null; - - Element inReplyTo = post.getElementsByClass("in-reply-to") - .first(); - if (inReplyTo != null) { - String parentId = inReplyTo.absUrl("href"); - if (parentId != null && parentId.contains("/")) { - int i = parentId.lastIndexOf('/'); - parentId = parentId.substring(i + 1); - parent = story.getCommentById(parentId); - } - } + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element id = post.getElementsByTag("a").first(); + if (id != null) { + Element date = id.getElementsByTag("span").first(); + if (date != null) { + return date.attr("data-epoch"); + } + } + + return ""; + } + + @Override + protected Element getCommentContentElement(Element post) { + return post.getElementsByClass("body").first(); + } - if (parent == null) { - comments.add(comment); - } else { - parent.add(comment); + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + // Remove the comment title (which has + // already been processed earlier) + if (node instanceof Element) { + Element el = (Element) node; + if ("h4".equals(el.tagName())) { + return true; } } + + return false; } - } finally { - if (in != 
null) { - in.close(); - } - } + }; } } diff --git a/src/be/nikiroo/gofetch/support/TooLinux.java b/src/be/nikiroo/gofetch/support/TooLinux.java index 0cc4c6c..ba909cf 100644 --- a/src/be/nikiroo/gofetch/support/TooLinux.java +++ b/src/be/nikiroo/gofetch/support/TooLinux.java @@ -1,20 +1,15 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; - -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; /** * Support https://www.toolinux.com/. @@ -28,97 +23,141 @@ public class TooLinux extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); - - URL url = new URL("https://www.toolinux.com/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("hentry"); - for (Element article : articles) { - String id = ""; - String intUrl = ""; - String extUrl = ""; // nope - String title = ""; - String date = ""; - String details = ""; - String body = ""; - String author = ""; // nope - String categ = ""; // nope - - Element urlElement = article.getElementsByTag("a").first(); - if (urlElement != null) { - intUrl = urlElement.absUrl("href"); - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://www.toolinux.com/"), "")); + return urls; + } - Element titleElement = article.getElementsByClass("entry-title") - .first(); - if (titleElement != null) { - title = StringUtils.unhtml(titleElement.text()).trim(); - } + @Override + protected List getArticles(Document 
doc) { + return doc.getElementsByClass("hentry"); + } - Element dateElement = article.getElementsByClass("published") - .first(); - if (dateElement != null) { - date = StringUtils.unhtml(dateElement.text()).trim(); - id = dateElement.attr("title").trim(); - } + @Override + protected String getArticleId(Document doc, Element article) { + return ""; // We use the date + } - if (id.isEmpty()) { - // fallback - id = intUrl.replace("/", "_"); - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByClass("entry-title") + .first(); + if (titleElement != null) { + return titleElement.text(); + } - Element bodyElement = article.getElementsByClass("introduction") - .first(); - if (bodyElement != null) { - body = StringUtils.unhtml(bodyElement.text()).trim(); - } + return ""; + } - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); + @Override + protected String getArticleAuthor(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article.getElementsByClass("published").first(); + if (dateElement != null) { + return dateElement.text(); } - return list; - } - - @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); - List comments = new ArrayList(); - story.setComments(comments); - - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - try { - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementById("content"); - if (article != null) { - for (String line : toLines(article, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - if ("notes".equals(node.attr("class"))) { - return true; - } - return false; - } - })) { - fullContent += line + "\n"; - } + return ""; + } - // Content is too tight with a single 
break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); - } + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + return ""; + } - story.setFullContent(fullContent); - } finally { - if (in != null) { - in.close(); - } + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element urlElement = article.getElementsByTag("a").first(); + if (urlElement != null) { + return urlElement.absUrl("href"); } + + return ""; + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + Element content = article.getElementsByClass("introduction").first(); + if (content != null) { + return content.text(); + } + + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementById("content"); + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return null; + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if ("notes".equals(node.attr("class"))) { + return true; + } + return false; + } + }; + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return null; + } + + @Override + protected String getCommentId(Element post) { + return null; + } + + @Override + protected String getCommentAuthor(Element post) { + return null; + } + + @Override + protected String getCommentTitle(Element post) { + return null; + } + + @Override + protected String getCommentDate(Element post) { + return null; + } + + @Override + protected Element 
getCommentContentElement(Element post) { + return null; + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return null; } } diff --git a/src/be/nikiroo/gofetch/support/Type.java b/src/be/nikiroo/gofetch/support/Type.java new file mode 100644 index 0000000..dadbec1 --- /dev/null +++ b/src/be/nikiroo/gofetch/support/Type.java @@ -0,0 +1,23 @@ +package be.nikiroo.gofetch.support; + +/** + * The support type (each website we support has a single type). + * + * @author niki + */ +public enum Type { + /** EN: Any, but mostly IT/Sci */ + SLASHDOT, + /** EN: Clone of Slashdot, mostly abandoned */ + PIPEDOT, + /** EN: Linux */ + LWN, + /** FR: Any */ + LEMONDE, + /** EN: IT */ + REGISTER, + /** FR: Linux */ + TOO_LINUX, + /** FR: IT */ + ERE_NUMERIQUE, +} -- 2.27.0