Bug fixes + rework of BasicSupport
authorNiki Roo <niki@nikiroo.be>
Sun, 25 Mar 2018 19:39:01 +0000 (21:39 +0200)
committerNiki Roo <niki@nikiroo.be>
Sun, 25 Mar 2018 19:39:01 +0000 (21:39 +0200)
18 files changed:
src/be/nikiroo/gofetch/Fetcher.java
src/be/nikiroo/gofetch/Main.java
src/be/nikiroo/gofetch/data/Comment.java
src/be/nikiroo/gofetch/data/Story.java
src/be/nikiroo/gofetch/output/Gopher.java
src/be/nikiroo/gofetch/output/Html.java
src/be/nikiroo/gofetch/output/Output.java
src/be/nikiroo/gofetch/support/BasicElementProcessor.java [new file with mode: 0644]
src/be/nikiroo/gofetch/support/BasicSupport.java
src/be/nikiroo/gofetch/support/ElementProcessor.java [new file with mode: 0644]
src/be/nikiroo/gofetch/support/EreNumerique.java
src/be/nikiroo/gofetch/support/LWN.java
src/be/nikiroo/gofetch/support/LeMonde.java
src/be/nikiroo/gofetch/support/Pipedot.java
src/be/nikiroo/gofetch/support/Slashdot.java
src/be/nikiroo/gofetch/support/TheRegister.java
src/be/nikiroo/gofetch/support/TooLinux.java
src/be/nikiroo/gofetch/support/Type.java [new file with mode: 0644]

index 258f1961e58b8746b8286f7131726d2e9de659d3..6c86c136266b53f8c79e0d73cc3626fd7b9da495 100644 (file)
@@ -14,7 +14,7 @@ import be.nikiroo.gofetch.output.Gopher;
 import be.nikiroo.gofetch.output.Html;
 import be.nikiroo.gofetch.output.Output;
 import be.nikiroo.gofetch.support.BasicSupport;
 import be.nikiroo.gofetch.output.Html;
 import be.nikiroo.gofetch.output.Output;
 import be.nikiroo.gofetch.support.BasicSupport;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
 import be.nikiroo.utils.IOUtils;
 
 /**
 import be.nikiroo.utils.IOUtils;
 
 /**
index 7aa94053d10236932897c872e34c236e93560b53..e4078d8d1009e454dfcea46ea3907ce45212a7ca 100644 (file)
@@ -3,7 +3,7 @@ package be.nikiroo.gofetch;
 import java.io.File;
 import java.io.IOException;
 
 import java.io.File;
 import java.io.IOException;
 
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
 
 /**
  * This class is tha main entry point of the program. It will parse the
 
 /**
  * This class is tha main entry point of the program. It will parse the
index bbd648a36a81be826af4b574b87f4a8db9137f53..da074820149f4b1fa555036f7890bb1293eafe98 100644 (file)
@@ -91,7 +91,7 @@ public class Comment implements Iterable<Comment> {
 
        public boolean isEmpty() {
                return children.isEmpty() && lines.isEmpty()
 
        public boolean isEmpty() {
                return children.isEmpty() && lines.isEmpty()
-                               && ("" + author + title).trim().isEmpty();
+                               && ("" + author + title).isEmpty();
        }
 
        @Override
        }
 
        @Override
index c0719d2d6b43bcd6812984fa7a59f8b2a8ea8306..9a2e68d2525db8d98e486158018028b07c8fc33e 100644 (file)
@@ -4,7 +4,7 @@ import java.net.URL;
 import java.util.List;
 
 import be.nikiroo.gofetch.support.BasicSupport;
 import java.util.List;
 
 import be.nikiroo.gofetch.support.BasicSupport;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
 
 /**
  * A news story.
 
 /**
  * A news story.
index f0c6f6dad69d90d2dd09d20baad1b4fec82fca67..12a420cd84482b988cde4775c6b29f3962fe574b 100644 (file)
@@ -2,7 +2,7 @@ package be.nikiroo.gofetch.output;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
 import be.nikiroo.utils.StringUtils;
 import be.nikiroo.utils.StringUtils.Alignment;
 
 import be.nikiroo.utils.StringUtils;
 import be.nikiroo.utils.StringUtils.Alignment;
 
index 50fe2d7fac8848e50c2efe41929567e2fa92c15c..385df8bf8da919416ccf672a623fa9288c0ab585 100644 (file)
@@ -2,7 +2,8 @@ package be.nikiroo.gofetch.output;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
+import be.nikiroo.utils.StringUtils;
 
 public class Html extends Output {
        public Html(Type type, String hostname, String preselector, int port) {
 
 public class Html extends Output {
        public Html(Type type, String hostname, String preselector, int port) {
@@ -99,7 +100,8 @@ public class Html extends Output {
                                .append("<div class='comment' style='display: block; margin-left: 80px'>\n");
                builder.append(space).append("  <h2>").append(comment.getTitle())
                                .append("</h2>\n");
                                .append("<div class='comment' style='display: block; margin-left: 80px'>\n");
                builder.append(space).append("  <h2>").append(comment.getTitle())
                                .append("</h2>\n");
-               builder.append(space).append("  <div class='by' style='font-style: italic;'>")
+               builder.append(space)
+                               .append("  <div class='by' style='font-style: italic;'>")
                                .append(comment.getAuthor()).append("</div>\n");
                builder.append(space).append("  <div class='comment_content'>");
                for (String line : comment.getContentLines()) {
                                .append(comment.getAuthor()).append("</div>\n");
                builder.append(space).append("  <div class='comment_content'>");
                for (String line : comment.getContentLines()) {
@@ -123,7 +125,9 @@ public class Html extends Output {
 
                builder.append("        <div class='details'>");
                if (story.getDetails() != null && !story.getDetails().isEmpty()) {
 
                builder.append("        <div class='details'>");
                if (story.getDetails() != null && !story.getDetails().isEmpty()) {
-                       builder.append("(").append(story.getDetails()).append(")");
+                       builder.append("(")
+                                       .append(StringUtils.xmlEscape(story.getDetails()))
+                                       .append(")");
                }
                builder.append("</div>\n");
                builder.append("        <br/>\n");
                }
                builder.append("</div>\n");
                builder.append("        <br/>\n");
@@ -142,12 +146,13 @@ public class Html extends Output {
 
                builder.append("        <div class='content' style='text-align: justify'>\n");
                if (resume) {
 
                builder.append("        <div class='content' style='text-align: justify'>\n");
                if (resume) {
-                       builder.append("                " + story.getContent() + "\n");
+                       builder.append("                " + StringUtils.xmlEscape(story.getContent())
+                                       + "\n");
                } else {
                        builder.append("                "
                } else {
                        builder.append("                "
-                                       + story.getFullContent().replace("\n", "<br/>")
-                                                       .replace("[ ", "<h2>").replace(" ]", "</h2>")
-                                       + "\n");
+                                       + StringUtils.xmlEscape(story.getFullContent())
+                                                       .replace("\n", "<br/>").replace("[ ", "<h2>")
+                                                       .replace(" ]", "</h2>") + "\n");
                }
                builder.append("        </div>\n");
 
                }
                builder.append("        </div>\n");
 
index db6554b074587ff9d398c8cc6bbc596467913ac1..1166879ea36de11a9eba1be26200f594a1c9dbd7 100644 (file)
@@ -1,7 +1,7 @@
 package be.nikiroo.gofetch.output;
 
 import be.nikiroo.gofetch.data.Story;
 package be.nikiroo.gofetch.output;
 
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
 
 /**
  * Base class for output operations.
 
 /**
  * Base class for output operations.
diff --git a/src/be/nikiroo/gofetch/support/BasicElementProcessor.java b/src/be/nikiroo/gofetch/support/BasicElementProcessor.java
new file mode 100644 (file)
index 0000000..83d7c8b
--- /dev/null
@@ -0,0 +1,36 @@
+package be.nikiroo.gofetch.support;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * A default {@link ElementProcessor} (will not detect or process anything
+ * manually).
+ * 
+ * @author niki
+ */
+class BasicElementProcessor implements ElementProcessor {
+       @Override
+       public boolean detectQuote(Node node) {
+               return false;
+       }
+
+       @Override
+       public String processText(String text) {
+               return text;
+       }
+
+       @Override
+       public boolean ignoreNode(Node node) {
+               return false;
+       }
+
+       @Override
+       public String manualProcessing(Node node) {
+               return null;
+       }
+
+       @Override
+       public String isSubtitle(Node node) {
+               return null;
+       }
+}
index b15fac7e5e2598d0d67c3bcf493c6dae03a0a8a1..a59ae313fb1f1fda8979020d7e6315d81ba6592e 100644 (file)
@@ -1,22 +1,29 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 import java.util.Date;
 import java.util.List;
+import java.util.Map.Entry;
 
 
+import org.jsoup.helper.DataUtil;
 import org.jsoup.helper.StringUtil;
 import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
-import org.jsoup.select.Elements;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 
+import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 import be.nikiroo.utils.Downloader;
 import be.nikiroo.gofetch.data.Story;
 import be.nikiroo.utils.Downloader;
+import be.nikiroo.utils.StringUtils;
 
 /**
  * Base class for website support.
 
 /**
  * Base class for website support.
@@ -27,140 +34,226 @@ public abstract class BasicSupport {
        /** The downloader to use for all websites. */
        protected static Downloader downloader = new Downloader("gofetcher");
 
        /** The downloader to use for all websites. */
        protected static Downloader downloader = new Downloader("gofetcher");
 
+       static private String preselector;
+
+       private Type type;
+
+       /**
+        * The website textual description, to add in the dispatcher page.
+        * <p>
+        * Should be short.
+        * 
+        * @return the description
+        */
+       abstract public String getDescription();
+
        /**
        /**
-        * The support type (each website we support has a single type).
-        * 
-        * @author niki
-        */
-       public enum Type {
-               /** EN: Any, but mostly IT/Sci */
-               SLASHDOT,
-               /** EN: Clone of Slashdot, mostly abandoned */
-               PIPEDOT,
-               /** EN: Linux */
-               LWN,
-               /** FR: Any */
-               LEMONDE,
-               /** EN: IT */
-               REGISTER,
-               /** FR: Linux */
-               TOO_LINUX,
-               /** FR: IT */
-               ERE_NUMERIQUE,
+        * The gopher "selector" to use for output.
+        * <p>
+        * A kind of "URL path", like "/news/" or "/misc/news/" or...
+        * 
+        * @return the selector
+        */
+       public String getSelector() {
+               return getSelector(type);
        }
 
        /**
        }
 
        /**
-        * Used to process an element into lines.
-        * 
-        * @author niki
-        */
-       public interface ElementProcessor {
-               /**
-                * Detect if this node is a quote and should be trated as such.
-                * 
-                * @param node
-                *            the node to check
-                * @return TRUE if it is
-                */
-               public boolean detectQuote(Node node);
-
-               /**
-                * Process text content (will be called on each text element, allowing
-                * you to modify it if needed).
-                * 
-                * @param text
-                *            the text to process
-                * 
-                * @return the resulting text
-                */
-               public String processText(String text);
-
-               /**
-                * Ignore this node.
-                * 
-                * @param node
-                *            the node to ignore
-                * @return TRUE if it has to be ignored
-                */
-               public boolean ignoreNode(Node node);
-
-               /**
-                * Manually process this node (and return the manual processing value)
-                * if so desired.
-                * <p>
-                * If the node is manually processed, it and its children will not be
-                * automatically processed.
-                * 
-                * @param node
-                *            the node to optionally process
-                * 
-                * @return NULL if not processed (will thus be automatically processed
-                *         as usual), a {@link String} (may be empty) if we process it
-                *         manually -- the given {@link String} will be used instead of
-                *         the usual automatic processing if not NULL
-                */
-               public String manualProcessing(Node node);
-
-               /**
-                * This {@link Node} is a subtitle and should be treated as such
-                * (highlighted).
-                * 
-                * @param node
-                *            the node to check
-                * 
-                * @return NULL if it is not a subtitle, the subtitle to use if it is
-                */
-               public String isSubtitle(Node node);
+        * The support type.
+        * 
+        * @return the type
+        */
+       public Type getType() {
+               return type;
        }
 
        /**
        }
 
        /**
-        * A default {@link ElementProcessor} (will not detect or process anything
-        * manually).
+        * List all the recent items, but only assure the ID and internal URL to
+        * fetch it later on (until it has been fetched, the rest of the
+        * {@link Story} is not confirmed).
         * 
         * 
-        * @author niki
+        * @return the list of new stories
+        * 
+        * @throws IOException
+        *             in case of I/O error
         */
         */
-       public class BasicElementProcessor implements ElementProcessor {
-               @Override
-               public boolean detectQuote(Node node) {
-                       return false;
-               }
+       public List<Story> list() throws IOException {
+               List<Story> list = new ArrayList<Story>();
+
+               for (Entry<URL, String> entry : getUrls()) {
+                       URL url = entry.getKey();
+                       String defaultCateg = entry.getValue();
+                       if (defaultCateg == null) {
+                               defaultCateg = "";
+                       }
 
 
-               @Override
-               public String processText(String text) {
-                       return text;
-               }
+                       InputStream in = downloader.open(url);
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       List<Element> articles = getArticles(doc);
+                       for (Element article : articles) {
+                               String id = getArticleId(doc, article).trim();
+                               String title = getArticleTitle(doc, article).trim();
+                               String author = getArticleAuthor(doc, article).trim();
+                               String date = getArticleDate(doc, article).trim();
+                               String categ = getArticleCategory(doc, article, defaultCateg)
+                                               .trim();
+                               String details = getArticleDetails(doc, article).trim();
+                               String intUrl = getArticleIntUrl(doc, article).trim();
+                               String extUrl = getArticleExtUrl(doc, article).trim();
+                               String content = getArticleContent(doc, article).trim();
+
+                               if (id.isEmpty() && date.isEmpty()) {
+                                       continue;
+                               }
 
 
-               @Override
-               public boolean ignoreNode(Node node) {
-                       return false;
-               }
+                               if (id.isEmpty()) {
+                                       id = date.replace(":", "_").replace("+", "_");
+                               }
 
 
-               @Override
-               public String manualProcessing(Node node) {
-                       return null;
-               }
+                               date = date(date);
 
 
-               @Override
-               public String isSubtitle(Node node) {
-                       return null;
+                               list.add(new Story(getType(), id, title, author, date, categ,
+                                               details, intUrl, extUrl, content));
+                       }
                }
                }
+
+               return list;
        }
 
        }
 
-       static private String preselector;
+       /**
+        * The {@link URL}s to process for this website.
+        * 
+        * @return the list of {@link URL}s
+        * 
+        * @throws IOException
+        *             in case of I/O error
+        */
+       abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 
 
-       private Type type;
+       /**
+        * The article {@link Element}s of this document.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * 
+        * @return the articles
+        */
+       abstract protected List<Element> getArticles(Document doc);
 
        /**
 
        /**
-        * List all the recent items, but only assure the ID and internal URL to
-        * fetch it later on (until it has been fetched, the rest of the
-        * {@link Story} is not confirmed).
+        * The ID of the article (defaults to the date element if empty).
         * 
         * 
-        * @return the list of new stories
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
         * 
         * 
-        * @throws IOException
-        *             in case of I/O
+        * @return the ID
+        */
+       abstract protected String getArticleId(Document doc, Element article);
+
+       /**
+        * The article title to display.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the title
+        */
+       abstract protected String getArticleTitle(Document doc, Element article);
+
+       /**
+        * The optional article author.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the author
+        */
+       abstract protected String getArticleAuthor(Document doc, Element article);
+
+       /**
+        * The optional article date.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the date
+        */
+       abstract protected String getArticleDate(Document doc, Element article);
+
+       /**
+        * The optional article category.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * @param currentCategory
+        *            the currently listed category if any (can be NULL)
+        * 
+        * @return the category
         */
         */
-       abstract public List<Story> list() throws IOException;
+       abstract protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory);
+
+       /**
+        * The optional details of the article (can replace the date, author and
+        * category, for instance).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the details
+        */
+       abstract protected String getArticleDetails(Document doc, Element article);
+
+       /**
+        * The (required) {@link URL} that points to the news page on the supported
+        * website.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the internal {@link URL}
+        */
+       abstract protected String getArticleIntUrl(Document doc, Element article);
+
+       /**
+        * The optional {@link URL} that points to an external website for more
+        * information.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the external {@link URL}
+        */
+       abstract protected String getArticleExtUrl(Document doc, Element article);
+
+       /**
+        * The optional article short-content (not the full content, that will be
+        * fetched by {@link BasicSupport#fetch(Story)}).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the short content
+        */
+       abstract protected String getArticleContent(Document doc, Element article);
 
        /**
         * Fetch the full article content as well as all the comments associated to
 
        /**
         * Fetch the full article content as well as all the comments associated to
@@ -172,37 +265,211 @@ public abstract class BasicSupport {
         * @throws IOException
         *             in case of I/O error
         */
         * @throws IOException
         *             in case of I/O error
         */
-       abstract public void fetch(Story story) throws IOException;
+       public void fetch(Story story) throws IOException {
+               String fullContent = "";
+
+               URL url = new URL(story.getUrlInternal());
+               InputStream in = downloader.open(url);
+               try {
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       Element article = getFullArticle(doc);
+                       if (article != null) {
+                               StringBuilder builder = new StringBuilder();
+                               ElementProcessor eProc = getElementProcessorFullArticle();
+                               if (eProc != null) {
+                                       for (String line : toLines(article, eProc)) {
+                                               builder.append(line + "\n");
+                                       }
+                               } else {
+                                       builder.append(article.text());
+                               }
+
+                               // Content is too tight with a single break per line:
+                               fullContent = builder.toString().replace("\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .trim();
+                       }
+
+                       if (fullContent.isEmpty()) {
+                               fullContent = story.getContent();
+                       }
+
+                       story.setFullContent(fullContent);
+                       story.setComments(getComments(doc,
+                                       getFullArticleCommentPosts(doc, url)));
+               } finally {
+                       if (in != null) {
+                               in.close();
+                       }
+               }
+       }
 
        /**
 
        /**
-        * The website textual description, to add in the dispatcher page.
-        * <p>
-        * Should be short.
+        * Return the full article if available.
         * 
         * 
-        * @return the description
+        * @param doc
+        *            the (full article) document to work on
+        * 
+        * @return the article or NULL
         */
         */
-       abstract public String getDescription();
+       abstract protected Element getFullArticle(Document doc);
 
        /**
 
        /**
-        * The gopher "selector" to use for output.
+        * Return the list of comment {@link Element}s from this optional container
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param intUrl
+        *            the internal {@link URL} this article was taken from (the
+        *            {@link URL} from the supported website)
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getFullArticleCommentPosts(Document doc,
+                       URL intUrl);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main article element
+        * (see {@link BasicSupport#getFullArticle(Document)}) into text.
         * <p>
         * <p>
-        * A kind of "URL path", like "/news/" or "/misc/news/" or...
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
         * 
         * 
-        * @return the selector
+        * @return the processor, or NULL
         */
         */
-       public String getSelector() {
-               return getSelector(type);
-       }
+       abstract protected ElementProcessor getElementProcessorFullArticle();
 
        /**
 
        /**
-        * The support type.
+        * Convert the comment elements into {@link Comment}s.
         * 
         * 
-        * @return the type
+        * @param doc
+        *            the document we work on
+        * @param posts
+        *            the comment elements
+        * 
+        * @return the converted {@link Comment}s
         */
         */
-       public Type getType() {
-               return type;
+       private List<Comment> getComments(Document doc, List<Element> posts) {
+               List<Comment> comments = new ArrayList<Comment>();
+               if (posts != null) {
+                       for (Element post : posts) {
+                               String id = getCommentId(post).trim();
+                               String author = getCommentAuthor(post).trim();
+                               String title = getCommentTitle(post).trim();
+                               String date = getCommentDate(post).trim();
+
+                               List<String> content = new ArrayList<String>();
+
+                               if (id.isEmpty()) {
+                                       id = date;
+                               }
+
+                               date = date(date);
+
+                               Element contentE = getCommentContentElement(post);
+                               if (contentE != null) {
+                                       ElementProcessor eProc = getElementProcessorComment();
+                                       if (eProc != null) {
+                                               for (String line : toLines(contentE, eProc)) {
+                                                       content.add(line);
+                                               }
+                                       } else {
+                                               content = Arrays.asList(contentE.text().split("\n"));
+                                       }
+                               }
+
+                               Comment comment = new Comment(id, author, title, date, content);
+                               comment.addAll(getComments(doc,
+                                               getCommentCommentPosts(doc, post)));
+
+                               if (!comment.isEmpty()) {
+                                       comments.add(comment);
+                               }
+                       }
+               }
+
+               return comments;
        }
 
        }
 
+       /**
+        * Return the list of subcomment {@link Element}s from this comment element
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param container
+        *            the container (a comment {@link Element})
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container);
+
+       /**
+        * Compute the ID of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the ID
+        */
+       abstract protected String getCommentId(Element post);
+
+       /**
+        * Compute the author of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the author
+        */
+       abstract protected String getCommentAuthor(Element post);
+
+       /**
+        * Compute the title of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the title
+        */
+       abstract protected String getCommentTitle(Element post);
+
+       /**
+        * Compute the date of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the date
+        */
+       abstract protected String getCommentDate(Element post);
+
+       /**
+        * Get the main content element of the given comment element, which can be NULL.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the element
+        */
+       abstract protected Element getCommentContentElement(Element post);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main comment element
+        * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
+        * <p>
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
+        * 
+        * @return the processor
+        */
+       abstract protected ElementProcessor getElementProcessorComment();
+
        /**
         * The support type.
         * 
        /**
         * The support type.
         * 
@@ -284,46 +551,6 @@ public abstract class BasicSupport {
                return preselector + "/" + type + "/";
        }
 
                return preselector + "/" + type + "/";
        }
 
-       /**
-        * Get the first {@link Element} of the given class, or an empty span
-        * {@link Element} if none found.
-        * 
-        * @param element
-        *            the element to look in
-        * @param className
-        *            the class to look for
-        * 
-        * @return the value or an empty span {@link Element}
-        */
-       static protected Element firstOrEmpty(Element element, String className) {
-               Elements subElements = element.getElementsByClass(className);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
-       }
-
-       /**
-        * Get the first {@link Element} of the given tag, or an empty span
-        * {@link Element} if none found.
-        * 
-        * @param element
-        *            the element to look in
-        * @param tagName
-        *            the tag to look for
-        * 
-        * @return the value or an empty span {@link Element}
-        */
-       static protected Element firstOrEmptyTag(Element element, String tagName) {
-               Elements subElements = element.getElementsByTag(tagName);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
-       }
-
        /**
         * Process the given element into text (each line is a text paragraph and
         * can be prepended with ">" signs to indicate a quote or sub-quote or
        /**
         * Process the given element into text (each line is a text paragraph and
         * can be prepended with ">" signs to indicate a quote or sub-quote or
@@ -342,6 +569,7 @@ public abstract class BasicSupport {
                final StringBuilder currentLine = new StringBuilder();
                final List<Integer> quoted = new ArrayList<Integer>();
                final List<Node> ignoredNodes = new ArrayList<Node>();
                final StringBuilder currentLine = new StringBuilder();
                final List<Integer> quoted = new ArrayList<Integer>();
                final List<Node> ignoredNodes = new ArrayList<Node>();
+               final List<String> footnotes = new ArrayList<String>();
 
                if (element != null) {
                        new NodeTraversor(new NodeVisitor() {
 
                if (element != null) {
                        new NodeTraversor(new NodeVisitor() {
@@ -369,6 +597,18 @@ public abstract class BasicSupport {
                                                }
                                        }
 
                                                }
                                        }
 
+                                       // <pre> check
+                                       if (!ignore) {
+                                               if (node instanceof Element) {
+                                                       Element el = (Element) node;
+                                                       if ("pre".equals(el.tagName())) {
+                                                               currentLine.append(StringUtils
+                                                                               .unhtml(el.text()).trim());
+                                                               ignore = true;
+                                                       }
+                                               }
+                                       }
+
                                        if (ignore) {
                                                ignoredNodes.add(node);
                                                return;
                                        if (ignore) {
                                                ignoredNodes.add(node);
                                                return;
@@ -410,6 +650,11 @@ public abstract class BasicSupport {
                                                if (block && currentLine.length() > 0) {
                                                        currentLine.append("\n");
                                                }
                                                if (block && currentLine.length() > 0) {
                                                        currentLine.append("\n");
                                                }
+
+                                               if (!element.absUrl("href").trim().isEmpty()) {
+                                                       footnotes.add(element.absUrl("href"));
+                                                       currentLine.append("[" + footnotes.size() + "]");
+                                               }
                                        } else if (node instanceof TextNode) {
                                                TextNode textNode = (TextNode) node;
                                                String line = StringUtil.normaliseWhitespace(textNode
                                        } else if (node instanceof TextNode) {
                                                TextNode textNode = (TextNode) node;
                                                String line = StringUtil.normaliseWhitespace(textNode
@@ -442,11 +687,35 @@ public abstract class BasicSupport {
                        }
                }
 
                        }
                }
 
+               // Fix spaces and nbsp, remove multiple following blank lines
+               List<String> linesCopy = new ArrayList<String>(lines.size());
+               long blanks = 0;
                for (int i = 0; i < lines.size(); i++) {
                for (int i = 0; i < lines.size(); i++) {
-                       lines.set(i, lines.get(i).replace("  ", " ").trim());
+                       String line = lines.get(i).replace(" ", " ") // nbsp -> space
+                                       .replace("  ", " ").trim();
+                       if (line.isEmpty()) {
+                               blanks++;
+                       } else {
+                               blanks = 0;
+                       }
+
+                       if (blanks < 2) {
+                               linesCopy.add(line);
+                       }
+               }
+
+               // Footnotes insertion
+               if (footnotes.size() > 0) {
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       for (int i = 0; i < footnotes.size(); i++) {
+                               linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+                       }
                }
 
                }
 
-               return lines;
+               return linesCopy;
        }
 
        /**
        }
 
        /**
@@ -457,7 +726,7 @@ public abstract class BasicSupport {
         * 
         * @return the reformated date, or the same value if it was not parsable
         */
         * 
         * @return the reformated date, or the same value if it was not parsable
         */
-       static protected String date(String date) {
+       static private String date(String date) {
                SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 
                long epoch = 0;
                SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 
                long epoch = 0;
diff --git a/src/be/nikiroo/gofetch/support/ElementProcessor.java b/src/be/nikiroo/gofetch/support/ElementProcessor.java
new file mode 100644 (file)
index 0000000..69e291d
--- /dev/null
@@ -0,0 +1,67 @@
+package be.nikiroo.gofetch.support;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * Used to process an element into lines.
+ * 
+ * @author niki
+ */
+interface ElementProcessor {
+       /**
+        * Detect if this node is a quote and should be treated as such.
+        * 
+        * @param node
+        *            the node to check
+        * @return TRUE if it is
+        */
+       public boolean detectQuote(Node node);
+
+       /**
+        * Process text content (will be called on each text element, allowing you
+        * to modify it if needed).
+        * 
+        * @param text
+        *            the text to process
+        * 
+        * @return the resulting text
+        */
+       public String processText(String text);
+
+       /**
+        * Ignore this node.
+        * 
+        * @param node
+        *            the node to ignore
+        * @return TRUE if it has to be ignored
+        */
+       public boolean ignoreNode(Node node);
+
+       /**
+        * Manually process this node (and return the manual processing value) if so
+        * desired.
+        * <p>
+        * If the node is manually processed, it and its children will not be
+        * automatically processed.
+        * 
+        * @param node
+        *            the node to optionally process
+        * 
+        * @return NULL if not processed (will thus be automatically processed as
+        *         usual), a {@link String} (may be empty) if we process it manually
+        *         -- the given {@link String} will be used instead of the usual
+        *         automatic processing if not NULL
+        */
+       public String manualProcessing(Node node);
+
+       /**
+        * This {@link Node} is a subtitle and should be treated as such
+        * (highlighted).
+        * 
+        * @param node
+        *            the node to check
+        * 
+        * @return NULL if it is not a subtitle, the subtitle to use if it is
+        */
+       public String isSubtitle(Node node);
+}
\ No newline at end of file
index b6a7598027c9b632cb52fb50f22677e4a1a314b4..0b3efcf4c07596b4f0a1e3076ca115a31a5021c7 100644 (file)
@@ -1,20 +1,15 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
 
 /**
  * Support <a
 
 /**
  * Support <a
@@ -29,187 +24,221 @@ public class EreNumerique extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
-
-               for (String categ : new String[] { "informatique" }) {
-                       URL url = new URL("https://www.erenumerique.fr/" + categ);
-                       InputStream in = downloader.open(url);
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Elements articles = doc.getElementsByClass("item-details");
-                       for (Element article : articles) {
-                               String id = "";
-                               String intUrl = "";
-                               String extUrl = ""; // nope
-                               String title = "";
-                               String date = "";
-                               String author = "";
-                               String details = "";
-                               String body = "";
-
-                               // MUST NOT fail:
-                               Element dateElement = article //
-                                               .getElementsByTag("time").first();
-                               if (dateElement == null) {
-                                       continue;
-                               }
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               for (String categ : new String[] { "Informatique" }) {
+                       URL url = new URL("https://www.erenumerique.fr/"
+                                       + categ.toLowerCase());
+                       urls.add(new AbstractMap.SimpleEntry<URL, String>(url, categ));
+               }
 
 
-                               Element urlElement = article.getElementsByTag("a").first();
-                               if (urlElement != null) {
-                                       intUrl = urlElement.absUrl("href");
-                               }
+               return urls;
+       }
 
 
-                               id = dateElement.attr("datetime").replace(":", "_")
-                                               .replace("+", "_");
-                               date = date(dateElement.attr("datetime"));
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByClass("item-details");
+       }
 
 
-                               Element titleElement = article.getElementsByTag("h2").first();
-                               if (titleElement != null) {
-                                       title = StringUtils.unhtml(titleElement.text()).trim();
-                               }
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return ""; // will use the date
+       }
 
 
-                               Element authorElement = article.getElementsByClass(
-                                               "td-post-author-name").first();
-                               if (authorElement != null) {
-                                       authorElement = authorElement.getElementsByTag("a").first();
-                               }
-                               if (authorElement != null) {
-                                       author = StringUtils.unhtml(authorElement.text()).trim();
-                               }
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element titleElement = article.getElementsByTag("h2").first();
+               if (titleElement != null) {
+                       return titleElement.text();
+               }
 
 
-                               Element contentElement = article.getElementsByClass(
-                                               "td-excerpt").first();
-                               if (contentElement != null) {
-                                       body = StringUtils.unhtml(contentElement.text()).trim();
-                               }
+               return "";
+       }
 
 
-                               list.add(new Story(getType(), id, title, author, date, categ,
-                                               details, intUrl, extUrl, body));
-                       }
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               Element authorElement = article.getElementsByClass(
+                               "td-post-author-name").first();
+               if (authorElement != null) {
+                       authorElement = authorElement.getElementsByTag("a").first();
+               }
+               if (authorElement != null) {
+                       return authorElement.text();
                }
 
                }
 
-               return list;
+               return "";
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               String fullContent = story.getContent();
+       protected String getArticleDate(Document doc, Element article) {
+               Element dateElement = article //
+                               .getElementsByTag("time").first();
+               if (dateElement != null) {
+                       return dateElement.attr("datetime");
+               }
 
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
-               try {
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element article = doc.getElementsByTag("article").first();
-                       if (article != null) {
-                               article = article.getElementsByAttributeValue("itemprop",
-                                               "articleBody").first();
-                       }
-                       if (article != null) {
-                               for (String line : toLines(article,
-                                               new BasicElementProcessor() {
-                                                       @Override
-                                                       public boolean ignoreNode(Node node) {
-                                                               return node.attr("class").contains("chapo");
-                                                       }
-
-                                                       @Override
-                                                       public String isSubtitle(Node node) {
-                                                               if (node instanceof Element) {
-                                                                       Element element = (Element) node;
-                                                                       if (element.tagName().startsWith("h")
-                                                                                       && element.tagName().length() == 2) {
-                                                                               return element.text();
-                                                                       }
-                                                               }
-                                                               return null;
-                                                       }
-                                               })) {
-                                       fullContent += line + "\n";
-                               }
+               return "";
+       }
 
 
-                               // Content is too tight with a single break per line:
-                               fullContent = fullContent.replace("\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .trim();
-                       }
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               return currentCategory;
+       }
 
 
-                       // Get comments URL then parse it, if possible
-                       Element posts = doc.getElementsByClass("comment-list").first();
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               return "";
+       }
 
 
-                       story.setFullContent(fullContent);
-                       story.setComments(getComments(posts));
-               } finally {
-                       if (in != null) {
-                               in.close();
-                       }
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               Element urlElement = article.getElementsByTag("a").first();
+               if (urlElement != null) {
+                       return urlElement.absUrl("href");
                }
                }
+
+               return "";
        }
 
        }
 
-       private List<Comment> getComments(Element posts) {
-               List<Comment> comments = new ArrayList<Comment>();
-               if (posts != null) {
-                       for (Element post : posts.children()) {
-                               if (!post.hasClass("comment")) {
-                                       continue;
-                               }
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               Element contentElement = article.getElementsByClass("td-excerpt")
+                               .first();
+               if (contentElement != null) {
+                       return contentElement.text();
+               }
 
 
-                               String id = "";
-                               String author = "";
-                               String title = "";
-                               String date = "";
-                               List<String> content = new ArrayList<String>();
+               return "";
+       }
 
 
-                               Element authorE = post.getElementsByTag("footer").first();
-                               if (authorE != null) {
-                                       authorE = authorE.getElementsByTag("cite").first();
-                               }
-                               if (authorE != null) {
-                                       author = StringUtils.unhtml(authorE.text()).trim();
-                               }
+       @Override
+       protected Element getFullArticle(Document doc) {
+               Element article = doc.getElementsByTag("article").first();
+               if (article != null) {
+                       article = article.getElementsByAttributeValue("itemprop",
+                                       "articleBody").first();
+               }
+
+               return article;
+       }
+
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               return getSubCommentElements(doc.getElementsByClass("comment-list")
+                               .first());
+       }
 
 
-                               Element idE = post.getElementsByTag("a").first();
-                               if (idE != null) {
-                                       id = idE.attr("id");
-                                       Element dateE = idE.getElementsByTag("span").first();
-                                       if (dateE != null) {
-                                               date = date(dateE.attr("data-epoch"));
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               return node.attr("class").contains("chapo");
+                       }
+
+                       @Override
+                       public String isSubtitle(Node node) {
+                               if (node instanceof Element) {
+                                       Element element = (Element) node;
+                                       if (element.tagName().startsWith("h")
+                                                       && element.tagName().length() == 2) {
+                                               return element.text();
                                        }
                                }
                                        }
                                }
+                               return null;
+                       }
+               };
+       }
+
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               return getSubCommentElements(container.getElementsByClass("children")
+                               .first());
+       }
+
+       @Override
+       protected String getCommentId(Element post) {
+               Element idE = post.getElementsByTag("a").first();
+               if (idE != null) {
+                       return idE.attr("id");
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getCommentAuthor(Element post) {
+               // Since we have no title, we switch with author
+               return "";
+       }
+
+       @Override
+       protected String getCommentTitle(Element post) {
+               // Since we have no title, we switch with author
+               Element authorE = post.getElementsByTag("footer").first();
+               if (authorE != null) {
+                       authorE = authorE.getElementsByTag("cite").first();
+               }
+               if (authorE != null) {
+                       return authorE.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               Element idE = post.getElementsByTag("a").first();
+               if (idE != null) {
+                       Element dateE = idE.getElementsByTag("span").first();
+                       if (dateE != null) {
+                               return dateE.attr("data-epoch");
+                       }
+               }
 
 
-                               Element contentE = post.getElementsByClass("comment-content")
-                                               .first();
-                               if (contentE != null) {
-                                       for (String line : toLines(contentE,
-                                                       new BasicElementProcessor() {
-                                                               @Override
-                                                               public boolean ignoreNode(Node node) {
-                                                                       // TODO: ignore headlines/pub
-                                                                       if (node instanceof Element) {
-                                                                               Element el = (Element) node;
-                                                                               if ("h4".equals(el.tagName())) {
-                                                                                       return true;
-                                                                               }
-                                                                       }
-
-                                                                       return false;
-                                                               }
-                                                       })) {
-                                               content.add(line);
+               return "";
+       }
+
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               Element contentE = post.getElementsByClass("comment-content").first();
+               return contentE;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               if (node instanceof Element) {
+                                       Element el = (Element) node;
+                                       if ("h4".equals(el.tagName())) {
+                                               return true;
                                        }
                                }
 
                                        }
                                }
 
-                               // Since we have no title but still an author, let's switch:
-                               title = author;
-                               author = "";
-                               Comment comment = new Comment(id, author, title, date, content);
-                               comments.add(comment);
+                               return false;
+                       }
+               };
+       }
 
 
-                               Element children = post.getElementsByClass("children").first();
-                               comment.addAll(getComments(children));
+       private List<Element> getSubCommentElements(Element posts) {
+               List<Element> commentElements = new ArrayList<Element>();
+               if (posts != null) {
+                       for (Element possibleCommentElement : posts.children()) {
+                               if (possibleCommentElement.hasClass("comment")) {
+                                       commentElements.add(possibleCommentElement);
+                               }
                        }
                }
 
                        }
                }
 
-               return comments;
+               return commentElements;
        }
 }
        }
 }
index c033104f92fed7e7a81a5673894c0463a8d0608e..144fdc90514039656168bf41fef8b850a027d4fe 100644 (file)
@@ -1,16 +1,16 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
+import org.jsoup.nodes.TextNode;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
@@ -27,162 +27,236 @@ public class LWN extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
+       public void fetch(Story story) throws IOException {
+               // Paid-for stories carry a "[$]" title prefix: do not download them,
+               // store an explanatory placeholder instead.
+               if (!story.getTitle().startsWith("[$]")) {
+                       super.fetch(story);
+               } else {
+                       String fullContent = "[$] Sorry, this article is currently available to LWN subscribers only [https://lwn.net/subscribe/].";
+                       story.setFullContent(fullContent);
+                       // Nothing was downloaded, so the comment list stays empty
+                       story.setComments(new ArrayList<Comment>());
+               }
+       }
 
 
-               URL url = new URL("https://lwn.net/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByClass("pure-u-1");
-               for (Element article : articles) {
-                       Elements titles = article.getElementsByClass("Headline");
-                       Elements listings = article.getElementsByClass("BlurbListing");
-                       if (titles.size() == 0) {
-                               continue;
-                       }
-                       if (listings.size() == 0) {
-                               continue;
-                       }
+       /**
+        * List the pages to scan for stories: only the LWN front page, paired
+        * with an empty {@link String} value.
+        */
+       @Override
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               URL frontPage = new URL("https://lwn.net/");
+               Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+                               frontPage, "");
+
+               List<Entry<URL, String>> pages = new ArrayList<Entry<URL, String>>();
+               pages.add(entry);
+               return pages;
+       }
 
 
-                       Element listing = listings.get(0);
-                       if (listing.children().size() < 2) {
-                               continue;
-                       }
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByClass("pure-u-1");
+       }
 
 
-                       String title = titles.get(0).text();
-                       String details = listing.children().get(0).text();
-                       String body = "";
-                       // All but the first and two last children
-                       for (int i = 1; i < listing.children().size() - 2; i++) {
-                               Element e = listing.children().get(i);
-                               body = body.trim() + " " + e.text().trim();
-                       }
-                       body = body.trim();
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
+       }
 
 
-                       int pos;
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element title = article.getElementsByClass("Headline").first();
+               if (title != null) {
+                       return title.text();
+               }
 
 
-                       String categ = "";
-                       pos = details.indexOf("]");
-                       if (pos >= 0) {
-                               categ = details.substring(1, pos).trim();
-                       }
+               return "";
+       }
 
 
-                       String author = "";
-                       pos = details.indexOf(" by ");
-                       if (pos >= 0) {
-                               author = details.substring(pos + " by ".length()).trim();
-                       }
+       /**
+        * Extract the author from the article details line: everything that
+        * follows the first " by ", trimmed.
+        *
+        * @return the author, or an empty {@link String} if there is no " by "
+        */
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               final String marker = " by ";
+               String blurb = getArticleDetailsReal(article);
+               int at = blurb.indexOf(marker);
+               if (at < 0) {
+                       return "";
+               }
+
+               return blurb.substring(at + marker.length()).trim();
+       }
 
 
-                       String date = "";
-                       pos = details.indexOf(" Posted ");
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               String date = "";
+               String details = getArticleDetailsReal(article);
+               int pos = details.indexOf(" Posted ");
+               if (pos >= 0) {
+                       date = details.substring(pos + " Posted ".length()).trim();
+                       pos = date.indexOf(" by ");
                        if (pos >= 0) {
                        if (pos >= 0) {
-                               date = details.substring(pos + " Posted ".length()).trim();
-                               pos = date.indexOf(" by ");
-                               if (pos >= 0) {
-                                       date = date.substring(0, pos).trim();
-                               }
+                               date = date.substring(0, pos).trim();
                        }
                        }
+               }
 
 
-                       // We extracted everything from details so...
-                       details = "";
-
-                       String id = "";
-                       String intUrl = "";
-                       String extUrl = "";
-                       for (Element idElem : article.getElementsByTag("a")) {
-                               // Last link is the story link
-                               intUrl = idElem.absUrl("href");
-                               pos = intUrl.indexOf("#Comments");
-                               if (pos >= 0) {
-                                       intUrl = intUrl.substring(0, pos - 1);
-                               }
-                               id = intUrl.replaceAll("[^0-9]", "");
-                       }
+               return date;
+       }
 
 
-                       list.add(new Story(getType(), id, title, author, date, categ,
-                                       details, intUrl, extUrl, body));
+       /**
+        * Extract the category from the article details line, i.e. the text
+        * between its first character and the first "]".
+        *
+        * @param doc
+        *            not used by this implementation
+        * @param article
+        *            the article element to inspect
+        * @param currentCategory
+        *            not used by this implementation
+        *
+        * @return the trimmed category, or an empty {@link String} if none
+        */
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               String categ = "";
+               String details = getArticleDetailsReal(article);
+               int pos = details.indexOf("]");
+               // pos must be at least 1: substring(1, 0) would throw on a leading "]"
+               if (pos >= 1) {
+                       categ = details.substring(1, pos).trim();
                }
 
                }
 
-               return list;
+               return categ;
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               List<Comment> comments = new ArrayList<Comment>();
-               String fullContent = story.getContent();
+       protected String getArticleDetails(Document doc, Element article) {
+               return ""; // We actually extract all the values
+       }
 
 
-               // Do not try the paid-for stories...
-               if (!story.getTitle().startsWith("[$]")) {
-                       URL url = new URL(story.getUrlInternal());
-                       InputStream in = downloader.open(url);
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Elements fullContentElements = doc
-                                       .getElementsByClass("ArticleText");
-                       if (fullContentElements.size() > 0) {
-                               // comments.addAll(getComments(listing.get(0)));
-                               fullContent = fullContentElements.get(0).text();
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               String intUrl = "";
+               for (Element idElem : article.getElementsByTag("a")) {
+                       // Last link is the story link
+                       intUrl = idElem.absUrl("href");
+                       int pos = intUrl.indexOf("#Comments");
+                       if (pos >= 0) {
+                               intUrl = intUrl.substring(0, pos - 1);
                        }
                        }
+               }
 
 
-                       Elements listing = doc.getElementsByClass("lwn-u-1");
-                       if (listing.size() > 0) {
-                               comments.addAll(getComments(listing.get(0)));
+               return intUrl;
+       }
+
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
+
+       /**
+        * Build the short content of an article from its "BlurbListing" element:
+        * the texts of all its children but the first and the two last ones,
+        * joined by single spaces.
+        *
+        * @return the trimmed content, or an empty {@link String} when the listing
+        *         is missing or has fewer than 2 children
+        */
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               Element listing = article.getElementsByClass("BlurbListing").first();
+               if (listing != null && listing.children().size() >= 2) {
+                       String content = "";
+
+                       // All but the first and two last children
+                       for (int i = 1; i < listing.children().size() - 2; i++) {
+                               Element e = listing.children().get(i);
+                               content = content.trim() + " " + e.text().trim();
                        }
                        }
-               } else {
-                       fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
+
+                       // The loop leaves a leading (or trailing) space behind: trim once
+                       // more, as the previous implementation did
+                       return content.trim();
                }
 
                }
 
-               story.setFullContent(fullContent);
-               story.setComments(comments);
+               return "";
+       }
+
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return doc.getElementsByClass("ArticleText").first();
        }
 
        }
 
-       private List<Comment> getComments(Element listing) {
-               List<Comment> comments = new ArrayList<Comment>();
-               for (Element commentElement : listing.children()) {
-                       if (commentElement.hasClass("CommentBox")) {
-                               Comment comment = getComment(commentElement);
-                               if (!comment.isEmpty()) {
-                                       comments.add(comment);
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               return doc.getElementsByClass("lwn-u-1");
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               if (node instanceof Element) {
+                                       Element el = (Element) node;
+                                       if ("Log in".equals(el.text().trim())) {
+                                               return true;
+                                       }
+                               } else if (node instanceof TextNode) {
+                                       TextNode text = (TextNode) node;
+                                       String t = text.text().trim();
+                                       if (t.equals("(") || t.equals("to post comments)")) {
+                                               return true;
+                                       }
                                }
                                }
-                       } else if (commentElement.hasClass("Comment")) {
-                               if (comments.size() > 0) {
-                                       comments.get(comments.size() - 1).addAll(
-                                                       getComments(commentElement));
+
+                               return false;
+                       }
+               };
+       }
+
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               List<Element> commentElements = new ArrayList<Element>();
+               if (container != null) {
+                       for (Element possibleCommentElement : container.children()) {
+                               if (possibleCommentElement.hasClass("CommentBox")) {
+                                       commentElements.add(possibleCommentElement);
+                               } else if (possibleCommentElement.hasClass("Comment")) {
+                                       commentElements.add(possibleCommentElement);
                                }
                        }
                }
                                }
                        }
                }
-               return comments;
+
+               return commentElements;
        }
 
        }
 
-       private Comment getComment(Element commentElement) {
-               String title = firstOrEmpty(commentElement, "CommentTitle").text();
-               String author = firstOrEmpty(commentElement, "CommentPoster").text();
+       @Override
+       protected String getCommentId(Element post) {
+               return post.id();
+       }
 
 
-               String date = "";
-               int pos = author.lastIndexOf(" by ");
-               if (pos >= 0) {
-                       date = author.substring(0, pos).trim();
-                       author = author.substring(pos + " by ".length()).trim();
+       /**
+        * Extract the author of a comment from its "CommentPoster" element: the
+        * text that follows the last " by ", without a leading "Posted " if any.
+        *
+        * @param post
+        *            the comment element to inspect
+        *
+        * @return the author, or an empty {@link String} when the element is
+        *         missing or contains no " by "
+        */
+       @Override
+       protected String getCommentAuthor(Element post) {
+               Element detailsE = post.getElementsByClass("CommentPoster").first();
+               if (detailsE != null) {
+                       String details = detailsE.text();
+
+                       int pos = details.lastIndexOf(" by ");
+                       if (pos >= 0) {
+                               details = details.substring(pos + " by ".length()).trim();
 
 
-                       if (author.startsWith("Posted ")) {
-                               author = author.substring("Posted ".length()).trim();
+                               if (details.startsWith("Posted ")) {
+                                       details = details.substring("Posted ".length()).trim();
+                               }
+
+                               // Return the author whether or not the prefix was present
+                               // (it used to be dropped when there was no "Posted ")
+                               return details;
                        }
                }
 
                        }
                }
 
-               Element content = null;
-               Elements commentBodyElements = commentElement
-                               .getElementsByClass("CommentBody");
-               if (commentBodyElements.size() > 0) {
-                       content = commentBodyElements.get(0);
+               return "";
+       }
+
+       /**
+        * The comment title is the text of the first "CommentTitle" element.
+        *
+        * @return the title, or an empty {@link String} when there is none
+        */
+       @Override
+       protected String getCommentTitle(Element post) {
+               Element heading = post.getElementsByClass("CommentTitle").first();
+               return (heading == null) ? "" : heading.text();
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               Element detailsE = post.getElementsByClass("CommentPoster").first();
+               if (detailsE != null) {
+                       String details = detailsE.text();
+
+                       int pos = details.lastIndexOf(" by ");
+                       if (pos >= 0) {
+                               return details.substring(0, pos).trim();
+                       }
                }
 
                }
 
-               Comment comment = new Comment(commentElement.id(), author, title, date,
-                               toLines(content));
+               return "";
+       }
 
 
-               return comment;
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               return post.getElementsByClass("CommentBody").first();
        }
 
        }
 
-       private List<String> toLines(Element element) {
-               return toLines(element, new BasicElementProcessor() {
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
                        @Override
                        public String processText(String text) {
                                while (text.startsWith(">")) { // comments
                        @Override
                        public String processText(String text) {
                                while (text.startsWith(">")) { // comments
@@ -216,6 +290,16 @@ public class LWN extends BasicSupport {
 
                                return false;
                        }
 
                                return false;
                        }
-               });
+               };
+       }
+
+       /**
+        * Return the raw details line of an article: the text of the first child
+        * of its "BlurbListing" element.
+        *
+        * @param article
+        *            the article element to inspect
+        *
+        * @return the details text, or an empty {@link String} when the listing
+        *         is missing or has fewer than 2 children
+        */
+       private String getArticleDetailsReal(Element article) {
+               Element blurb = article.getElementsByClass("BlurbListing").first();
+               if (blurb == null) {
+                       return "";
+               }
+
+               // Only listings with at least 2 children are treated as valid
+               List<Element> parts = blurb.children();
+               if (parts.size() < 2) {
+                       return "";
+               }
+
+               return parts.get(0).text();
        }
 }
        }
 }
index 235f7ee2ce2985738401cb45b6d6e8a1b25ec750..1f7aea7d633eda4ee567f9a981dc0d961e36908d 100644 (file)
@@ -1,19 +1,15 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
 
 /**
  * Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
 
 /**
  * Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
@@ -27,98 +23,171 @@ public class LeMonde extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
-
-               for (String topic : new String[] { "international", "politique",
-                               "societe", "sciences" }) {
-                       URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
-                       InputStream in = downloader.open(url);
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Elements articles = doc.getElementsByTag("article");
-                       for (Element article : articles) {
-                               Elements times = article.getElementsByTag("time");
-                               Elements titleElements = article.getElementsByTag("h3");
-                               Elements contentElements = article.getElementsByClass("txt3");
-                               if (times.size() > 0 && titleElements.size() > 0
-                                               && contentElements.size() > 0) {
-                                       String id = times.get(0).attr("datetime").replace(":", "_")
-                                                       .replace("+", "_");
-                                       String title = titleElements.get(0).text();
-                                       String date = date(titleElements.get(0).text());
-                                       String content = contentElements.get(0).text();
-                                       String intUrl = "";
-                                       String extUrl = "";
-                                       String author = "";
-                                       String details = "";
-
-                                       Elements detailsElements = article
-                                                       .getElementsByClass("signature");
-                                       if (detailsElements.size() > 0) {
-                                               author = detailsElements.get(0).text();
-                                       }
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               for (String topic : new String[] { "International", "Politique",
+                               "Société", "Sciences" }) {
+                       // The URL path is the lower-cased, unaccented topic name; use a
+                       // fixed locale so the result does not depend on the JVM default
+                       // (a Turkish locale would turn "I" into a dotless "ı")
+                       String path = topic.toLowerCase(java.util.Locale.ROOT).replace(
+                                       "é", "e");
+                       URL url = new URL("http://www.lemonde.fr/" + path + "/1.html");
+                       // Each page is paired with the display name of its topic
+                       urls.add(new AbstractMap.SimpleEntry<URL, String>(url, topic));
+               }
 
 
-                                       Elements links = titleElements.get(0).getElementsByTag("a");
-                                       if (links.size() > 0) {
-                                               intUrl = links.get(0).absUrl("href");
-                                               list.add(new Story(getType(), id, title, author, date,
-                                                               topic, details, intUrl, extUrl, content));
-                                       }
-                               }
+               return urls;
+       }
+
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByTag("article");
+       }
+
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return ""; // will use the date
+       }
+
+       /**
+        * The title is the text of the first "h3" tag found in the article.
+        *
+        * @return the title, or an empty {@link String} when there is no "h3" tag
+        */
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element heading = article.getElementsByTag("h3").first();
+               return (heading == null) ? "" : heading.text();
+       }
+
+       /**
+        * The author is the text of the first element of class "signature".
+        *
+        * @return the author, or an empty {@link String} when there is no such
+        *         element
+        */
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               Element signature = article.getElementsByClass("signature").first();
+               if (signature == null) {
+                       return "";
+               }
+
+               return signature.text();
+       }
+
+       /**
+        * The date is the "datetime" attribute of the first "time" tag of the
+        * article.
+        *
+        * @return the attribute value, or an empty {@link String} when there is no
+        *         "time" tag
+        */
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               Element stamp = article.getElementsByTag("time").first();
+               if (stamp == null) {
+                       return "";
+               }
+
+               return stamp.attr("datetime");
+       }
+
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               return currentCategory;
+       }
+
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               Element titleElement = article.getElementsByTag("h3").first();
+               if (titleElement != null) {
+                       Element link = titleElement.getElementsByTag("a").first();
+                       if (link != null) {
+                               return link.absUrl("href");
                        }
                }
 
                        }
                }
 
-               return list;
+               return "";
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               String fullContent = story.getContent();
-               List<Comment> comments = new ArrayList<Comment>();
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
 
 
-               // Note: no comments on this site as far as I can see (or maybe with
-               // some javascript, I need to check...)
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               Element contentElement = article.getElementsByClass("txt3").first();
+               if (contentElement != null) {
+                       return contentElement.text();
+               }
 
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Element article = doc.getElementById("articleBody");
-               if (article != null) {
-                       for (String line : toLines(article, new BasicElementProcessor() {
-                               @Override
-                               public boolean ignoreNode(Node node) {
-                                       if (node instanceof Element) {
-                                               Element element = (Element) node;
-                                               if (element.hasClass("lire")) {
-                                                       return true;
-                                               }
-                                       }
+               return "";
+       }
+
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return doc.getElementById("articleBody");
+       }
+
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               return null;
+       }
 
 
-                                       return false;
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               if (node instanceof Element) {
+                                       Element element = (Element) node;
+                                       if (element.hasClass("lire")) {
+                                               return true;
+                                       }
                                }
 
                                }
 
-                               @Override
-                               public String isSubtitle(Node node) {
-                                       if (node instanceof Element) {
-                                               Element element = (Element) node;
-                                               if (element.hasClass("intertitre")) {
-                                                       return element.text();
-                                               }
+                               return false;
+                       }
+
+                       @Override
+                       public String isSubtitle(Node node) {
+                               if (node instanceof Element) {
+                                       Element element = (Element) node;
+                                       if (element.hasClass("intertitre")) {
+                                               return element.text();
                                        }
                                        }
-                                       return null;
                                }
                                }
-                       })) {
-                               fullContent += line + "\n";
+                               return null;
                        }
                        }
+               };
+       }
 
 
-                       // Content is too tight with a single break per line:
-                       fullContent = fullContent.replace("\n", "\n\n") //
-                                       .replace("\n\n\n\n", "\n\n") //
-                                       .replace("\n\n\n\n", "\n\n") //
-                                       .trim();
-               }
+       // No comment on this site, horrible javascript system
 
 
-               story.setFullContent(fullContent);
-               story.setComments(comments);
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentId(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentAuthor(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentTitle(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               return null;
+       }
+
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               return null;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return null;
        }
 }
        }
 }
index 9ea70ff4bfcb37f2a6c08648857332dfd44282af..149a20cb510ed333bb7b8e4307c8b16273c16810 100644 (file)
@@ -1,20 +1,17 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.Elements;
 
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.Elements;
 
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-
 /**
  * Support <a href='https://pipedot.org/'>https://pipedot.org/</a>.
  * 
 /**
  * Support <a href='https://pipedot.org/'>https://pipedot.org/</a>.
  * 
@@ -27,151 +24,207 @@ public class Pipedot extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+                               "https://pipedot.org/"), ""));
+               return urls;
+       }
 
 
-               URL url = new URL("https://pipedot.org/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByClass("story");
-               for (Element article : articles) {
-                       Elements titles = article.getElementsByTag("h1");
-                       if (titles.size() == 0) {
-                               continue;
-                       }
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByClass("story");
+       }
 
 
-                       Element title = titles.get(0);
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               // Don't try on bad articles
+               if (getArticleTitle(doc, article).isEmpty()) {
+                       return "";
+               }
 
 
-                       String id = "";
-                       for (Element idElem : article.getElementsByTag("a")) {
-                               if (idElem.attr("href").startsWith("/pipe/")) {
-                                       id = idElem.attr("href").substring("/pipe/".length());
-                                       break;
-                               }
+               for (Element idElem : article.getElementsByTag("a")) {
+                       if (idElem.attr("href").startsWith("/pipe/")) {
+                               return idElem.attr("href").substring("/pipe/".length());
                        }
                        }
+               }
 
 
-                       String intUrl = null;
-                       String extUrl = null;
-
-                       Elements links = article.getElementsByTag("a");
-                       if (links.size() > 0) {
-                               intUrl = links.get(0).absUrl("href");
-                       }
+               return "";
+       }
 
 
-                       // Take first ext URL as original source
-                       for (Element link : links) {
-                               String uuu = link.absUrl("href");
-                               if (!uuu.isEmpty() && !uuu.contains("pipedot.org/")) {
-                                       extUrl = uuu;
-                                       break;
-                               }
-                       }
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element title = article.getElementsByTag("h1").first();
+               if (title != null) {
+                       return title.text();
+               }
 
 
-                       String details = "";
-                       Elements detailsElements = article.getElementsByTag("div");
-                       if (detailsElements.size() > 0) {
-                               details = detailsElements.get(0).text().trim();
-                       }
+               return "";
+       }
 
 
-                       String author = "";
-                       int pos = details.indexOf("by ");
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               String value = getArticleDetailsReal(article);
+               int pos = value.indexOf("by ");
+               if (pos >= 0) {
+                       value = value.substring(pos + "by ".length()).trim();
+                       pos = value.indexOf(" in ");
                        if (pos >= 0) {
                        if (pos >= 0) {
-                               author = details.substring(pos + "by ".length()).trim();
-                               pos = author.indexOf(" in ");
-                               if (pos >= 0) {
-                                       author = author.substring(0, pos).trim();
-                               }
+                               value = value.substring(0, pos).trim();
                        }
 
                        }
 
-                       String categ = "";
-                       pos = details.indexOf(" in ");
+                       return value;
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               Element dateElement = article.getElementsByTag("time").first();
+               if (dateElement != null) {
+                       return dateElement.attr("datetime");
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               String value = getArticleDetailsReal(article);
+               int pos = value.indexOf(" in ");
+               if (pos >= 0) {
+                       value = value.substring(pos + " in ".length()).trim();
+                       pos = value.indexOf(" on ");
                        if (pos >= 0) {
                        if (pos >= 0) {
-                               categ = details.substring(pos + " in ".length()).trim();
-                               pos = categ.indexOf(" on ");
-                               if (pos >= 0) {
-                                       categ = categ.substring(0, pos).trim();
-                               }
+                               value = value.substring(0, pos).trim();
                        }
 
                        }
 
-                       String date = "";
-                       Element dateElement = article.getElementsByTag("time").first();
-                       if (dateElement != null) {
-                               date = date(dateElement.attr("datetime"));
-                       }
+                       return value;
+               }
 
 
-                       // We already have all the details (date, author, id, categ)
-                       details = "";
+               return "";
+       }
 
 
-                       String body = "";
-                       for (Element elem : article.children()) {
-                               String tag = elem.tag().toString();
-                               if (!tag.equals("header") && !tag.equals("footer")) {
-                                       body = elem.text();
-                                       break;
-                               }
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               return ""; // We already extracted all the info
+       }
+
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               Element link = article.getElementsByTag("a").first();
+               if (link != null) {
+                       return link.absUrl("href");
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               Element link = article.getElementsByTag("a").first();
+               if (link != null) {
+                       String possibleExtLink = link.absUrl("href").trim();
+                       if (!possibleExtLink.isEmpty()
+                                       && !possibleExtLink.contains("pipedot.org/")) {
+                               return possibleExtLink;
                        }
                        }
+               }
 
 
-                       list.add(new Story(getType(), id, title.text(), author, date,
-                                       categ, details, intUrl, extUrl, body));
+               return "";
+       }
+
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               for (Element elem : article.children()) {
+                       String tag = elem.tagName();
+                       if (!tag.equals("header") && !tag.equals("footer")) {
+                               return elem.text();
+                       }
                }
 
                }
 
-               return list;
+               return "";
+       }
+
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return null;
+       }
+
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               return getCommentElements(doc.getElementsByTag("main").first());
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor();
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               List<Comment> comments = new ArrayList<Comment>();
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
 
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements listing = doc.getElementsByTag("main");
-               if (listing.size() > 0) {
-                       comments.addAll(getComments(listing.get(0)));
+               if (container != null) {
+                       container = container.getElementsByClass("comment-outline").first();
                }
 
                }
 
-               story.setComments(comments);
+               return getCommentElements(container);
        }
 
        }
 
-       private List<Comment> getComments(Element listing) {
-               List<Comment> comments = new ArrayList<Comment>();
-               for (Element commentElement : listing.children()) {
-                       if (commentElement.hasClass("comment")) {
-                               Comment comment = getComment(commentElement);
-                               if (!comment.isEmpty()) {
-                                       comments.add(comment);
-                               }
+       @Override
+       protected String getCommentId(Element post) {
+               return post.id();
+       }
+
+       @Override
+       protected String getCommentAuthor(Element post) {
+               Element authorDateE = post.getElementsByTag("h3").first();
+               if (authorDateE != null) {
+                       String authorDate = authorDateE.text();
+                       int pos = authorDate.lastIndexOf(" on ");
+                       if (pos >= 0) {
+                               return authorDate.substring(0, pos).trim();
                        }
                }
                        }
                }
-               return comments;
-       }
 
 
-       private Comment getComment(Element commentElement) {
-               String title = firstOrEmptyTag(commentElement, "h3").text();
-               String author = firstOrEmpty(commentElement, "h4").text();
-               Element content = firstOrEmpty(commentElement, "comment-body");
+               return "";
+       }
 
 
-               String date = "";
-               int pos = author.lastIndexOf(" on ");
-               if (pos >= 0) {
-                       date = author.substring(pos + " on ".length()).trim();
-                       author = author.substring(0, pos).trim();
+       @Override
+       protected String getCommentTitle(Element post) {
+               Element title = post.getElementsByTag("h3").first();
+               if (title != null) {
+                       return title.text();
                }
 
                }
 
-               Comment comment = new Comment(commentElement.id(), author, title, date,
-                               toLines(content));
+               return "";
+       }
 
 
-               Elements commentOutline = commentElement
-                               .getElementsByClass("comment-outline");
-               if (commentOutline.size() > 0) {
-                       comment.addAll(getComments(commentOutline.get(0)));
+       @Override
+       protected String getCommentDate(Element post) {
+               Element authorDateE = post.getElementsByTag("h3").first();
+               if (authorDateE != null) {
+                       String authorDate = authorDateE.text();
+                       int pos = authorDate.lastIndexOf(" on ");
+                       if (pos >= 0) {
+                               return authorDate.substring(pos + " on ".length()).trim();
+                       }
                }
 
                }
 
-               return comment;
+               return "";
+       }
+
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               return post.getElementsByClass("comment-body").first();
        }
 
        }
 
-       private List<String> toLines(Element element) {
-               return toLines(element, new BasicElementProcessor() {
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
                        @Override
                        public boolean detectQuote(Node node) {
                                if (node instanceof Element) {
                        @Override
                        public boolean detectQuote(Node node) {
                                if (node instanceof Element) {
@@ -184,6 +237,27 @@ public class Pipedot extends BasicSupport {
 
                                return false;
                        }
 
                                return false;
                        }
-               });
+               };
+       }
+
+       private String getArticleDetailsReal(Element article) {
+               Elements detailsElements = article.getElementsByTag("div");
+               if (detailsElements.size() > 0) {
+                       return detailsElements.get(0).text().trim();
+               }
+
+               return "";
+       }
+
+       private List<Element> getCommentElements(Element container) {
+               List<Element> commentElements = new ArrayList<Element>();
+               if (container != null) {
+                       for (Element commentElement : container.children()) {
+                               if (commentElement.hasClass("comment")) {
+                                       commentElements.add(commentElement);
+                               }
+                       }
+               }
+               return commentElements;
        }
 }
        }
 }
index b3a779da62d229469346f8a9455b1e01b160ab9d..43ce13d023ff498d3a759e9eac884f351d10e79a 100644 (file)
@@ -1,21 +1,17 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.Elements;
 
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.Elements;
 
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
-
 /**
  * Support <a href='https://slashdot.org/'>https://slashdot.org/</a>.
  * 
 /**
  * Support <a href='https://slashdot.org/'>https://slashdot.org/</a>.
  * 
@@ -28,145 +24,238 @@ public class Slashdot extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
-
-               URL url = new URL("https://slashdot.org/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByTag("header");
-               for (Element article : articles) {
-                       Elements titles = article.getElementsByClass("story-title");
-                       if (titles.size() == 0) {
-                               continue;
-                       }
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+                               "https://slashdot.org/"), ""));
+               return urls;
+       }
 
 
-                       Element title = titles.get(0);
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByTag("header");
+       }
 
 
-                       String id = "" + title.attr("id");
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               Element title = article.getElementsByClass("story-title").first();
+               if (title != null) {
+                       String id = title.attr("id");
                        if (id.startsWith("title-")) {
                                id = id.substring("title-".length());
                        }
 
                        if (id.startsWith("title-")) {
                                id = id.substring("title-".length());
                        }
 
+                       return id;
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element title = article.getElementsByClass("story-title").first();
+               if (title != null) {
+                       return title.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               // details: "Posted by AUTHOR on DATE from the further-crackdown dept."
+               String details = getArticleDetailsReal(article);
+               int pos = details.indexOf(" on ");
+               if (details.startsWith("Posted by ") && pos >= 0) {
+                       return details.substring("Posted by ".length(), pos).trim();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               // Do not try bad articles
+               if (getArticleId(doc, article).isEmpty()) {
+                       return "";
+               }
+
+               Element dateElement = doc.getElementsByTag("time").first();
+               if (dateElement != null) {
+                       String date = dateElement.text().trim();
+                       if (date.startsWith("on ")) {
+                               date = date.substring("on ".length());
+                       }
+
+                       return date;
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               Element categElement = doc.getElementsByClass("topic").first();
+               if (categElement != null) {
+                       return categElement.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               // details: "Posted by AUTHOR on DATE from the further-crackdown dept."
+               String details = getArticleDetailsReal(article);
+               int pos = details.indexOf(" from the ");
+               if (pos >= 0) {
+                       return details.substring(pos).trim();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               Element title = article.getElementsByClass("story-title").first();
+               if (title != null) {
                        Elements links = title.getElementsByTag("a");
                        Elements links = title.getElementsByTag("a");
-                       String intUrl = "";
-                       String extUrl = "";
                        if (links.size() > 0) {
                        if (links.size() > 0) {
-                               intUrl = links.get(0).absUrl("href");
+                               return links.get(0).absUrl("href");
                        }
                        }
+               }
+               return "";
+       }
+
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               Element title = article.getElementsByClass("story-title").first();
+               if (title != null) {
+                       Elements links = title.getElementsByTag("a");
                        if (links.size() > 1) {
                        if (links.size() > 1) {
-                               extUrl = links.get(1).absUrl("href");
+                               return links.get(1).absUrl("href");
                        }
                        }
+               }
+               return "";
+       }
 
 
-                       String details = "";
-                       Elements detailsElements = article.getElementsByClass("details");
-                       if (detailsElements.size() > 0) {
-                               details = detailsElements.get(0).text();
-                       }
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               Element contentElement = doc //
+                               .getElementById("text-" + getArticleId(doc, article));
+               if (contentElement != null) {
+                       return contentElement.text();
+               }
 
 
-                       // details:
-                       // "Posted by AUTHOR on DATE from the further-crackdown dept."
-                       String author = "";
-                       int pos = details.indexOf(" on ");
-                       if (details.startsWith("Posted by ") && pos >= 0) {
-                               author = details.substring("Posted by ".length(), pos).trim();
-                       }
-                       pos = details.indexOf(" from the ");
-                       if (pos >= 0) {
-                               details = details.substring(pos).trim();
-                       }
+               return "";
+       }
 
 
-                       String body = "";
-                       Element bodyElement = doc.getElementById("text-" + id);
-                       if (bodyElement != null) {
-                               body = bodyElement.text();
-                       }
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return null;
+       }
 
 
-                       String categ = "";
-                       Element categElement = doc.getElementsByClass("topic").first();
-                       if (categElement != null) {
-                               categ = StringUtils.unhtml(categElement.text()).trim();
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               List<Element> commentElements = new ArrayList<Element>();
+               Element listing = doc.getElementById("commentlisting");
+               if (listing != null) {
+                       for (Element commentElement : listing.children()) {
+                               if (commentElement.hasClass("comment")) {
+                                       commentElements.add(commentElement);
+                               }
                        }
                        }
+               }
+
+               return commentElements;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return null;
+       }
 
 
-                       String date = "";
-                       Element dateElement = doc.getElementsByTag("time").first();
-                       if (dateElement != null) {
-                               date = StringUtils.unhtml(dateElement.text()).trim();
-                               if (date.startsWith("on ")) {
-                                       date = date.substring("on ".length());
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               List<Element> commentElements = new ArrayList<Element>();
+               for (Element child : container.children()) {
+                       if (child.id().contains("commtree_")) {
+                               for (Element sub : child.children()) {
+                                       if (sub.hasClass("comment")) {
+                                               commentElements.add(sub);
+                                       }
                                }
                        }
                                }
                        }
+               }
+
+               return commentElements;
+       }
 
 
-                       list.add(new Story(getType(), id, title.text(), author, date,
-                                       categ, details, intUrl, extUrl, body));
+       @Override
+       protected String getCommentId(Element post) {
+               if (post.hasClass("hidden")) {
+                       return "";
                }
 
                }
 
-               return list;
+               return post.id();
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               List<Comment> comments = new ArrayList<Comment>();
+       protected String getCommentAuthor(Element post) {
+               if (post.hasClass("hidden")) {
+                       return "";
+               }
 
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Element listing = doc.getElementById("commentlisting");
-               if (listing != null) {
-                       comments.addAll(getComments(listing));
+               Element author = post.getElementsByClass("by").first();
+               if (author != null) {
+                       return author.text();
                }
 
                }
 
-               story.setComments(comments);
+               return "";
        }
 
        }
 
-       private List<Comment> getComments(Element listing) {
-               List<Comment> comments = new ArrayList<Comment>();
-               Comment lastComment = null;
-               for (Element commentElement : listing.children()) {
-                       if (commentElement.hasClass("comment")) {
-                               if (!commentElement.hasClass("hidden")) {
-                                       lastComment = getComment(commentElement);
-                                       comments.add(lastComment);
-                               }
+       @Override
+       protected String getCommentTitle(Element post) {
+               if (post.hasClass("hidden")) {
+                       return "";
+               }
 
 
-                               List<Comment> subComments = new ArrayList<Comment>();
-                               for (Element child : commentElement.children()) {
-                                       if (child.id().contains("commtree_")) {
-                                               subComments.addAll(getComments(child));
-                                       }
-                               }
+               Element title = post.getElementsByClass("title").first();
+               if (title != null) {
+                       return title.text();
+               }
 
 
-                               if (lastComment == null) {
-                                       comments.addAll(subComments);
-                               } else {
-                                       lastComment.addAll(subComments);
-                               }
-                       }
+               return "";
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               if (post.hasClass("hidden")) {
+                       return "";
                }
 
                }
 
-               return comments;
+               Element date = post.getElementsByClass("otherdetails").first();
+               if (date != null) {
+                       return date.text();
+               }
+
+               return "";
        }
 
        }
 
-       /**
-        * Get a comment from the given element.
-        * 
-        * @param commentElement
-        *            the element to get the comment of.
-        * 
-        * @return the comment, <b>NOT</b> including sub-comments
-        */
-       private Comment getComment(Element commentElement) {
-               String title = firstOrEmpty(commentElement, "title").text();
-               String author = firstOrEmpty(commentElement, "by").text();
-               String date = firstOrEmpty(commentElement, "otherdetails").text();
-               Element content = firstOrEmpty(commentElement, "commentBody");
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               if (post.hasClass("hidden")) {
+                       return null;
+               }
 
 
-               return new Comment(commentElement.id(), author, title, date,
-                               toLines(content));
+               return post.getElementsByClass("commentBody").first();
        }
 
        }
 
-       private List<String> toLines(Element element) {
-               return toLines(element, new BasicElementProcessor() {
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
                        @Override
                        public String processText(String text) {
                                while (text.startsWith(">")) { // comment in one-liners
                        @Override
                        public String processText(String text) {
                                while (text.startsWith(">")) { // comment in one-liners
@@ -192,6 +281,15 @@ public class Slashdot extends BasicSupport {
 
                                return false;
                        }
 
                                return false;
                        }
-               });
+               };
+       }
+
+       private String getArticleDetailsReal(Element article) {
+               Element detailsElement = article.getElementsByClass("details").first();
+               if (detailsElement != null) {
+                       return detailsElement.text();
+               }
+
+               return "";
        }
 }
        }
 }
index 7fb152400f11a641193e47ddd78d2287baef81fb..1195d3d5f9edf7200aef9318b2517af79b892b67 100644 (file)
@@ -3,18 +3,20 @@ package be.nikiroo.gofetch.support;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 
 import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 
 import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
 
 /**
  * Support <a
 
 /**
  * Support <a
@@ -23,195 +25,240 @@ import be.nikiroo.utils.StringUtils;
  * @author niki
  */
 public class TheRegister extends BasicSupport {
  * @author niki
  */
 public class TheRegister extends BasicSupport {
+       private Map<String, String> commentReplies = new HashMap<String, String>();
+
        @Override
        public String getDescription() {
                return "The Register: Biting the hand that feeds IT";
        }
 
        @Override
        @Override
        public String getDescription() {
                return "The Register: Biting the hand that feeds IT";
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
+       public void fetch(Story story) throws IOException {
+               super.fetch(story);
 
 
-               URL url = new URL("https://www.theregister.co.uk/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByClass("story_link");
-               for (Element article : articles) {
-                       if (article.getElementsByClass("time_stamp").isEmpty()) {
-                               // Some articles are doubled,
-                               // but the second copy without the time info
-                               continue;
+               // Update comment replies
+               List<Comment> comments = new ArrayList<Comment>();
+               for (Comment comment : story.getComments()) {
+                       if (commentReplies.containsKey(comment.getId())) {
+                               String inReplyToId = commentReplies.get(comment.getId());
+                               Comment inReplyTo = story.getCommentById(inReplyToId);
+                               if (inReplyTo != null) {
+                                       inReplyTo.add(comment);
+                               } else {
+                                       comments.add(comment);
+                               }
+                       } else {
+                               comments.add(comment);
                        }
                        }
+               }
+               story.setComments(comments);
+       }
 
 
-                       String id = "";
-                       String intUrl = article.absUrl("href");
-                       String extUrl = ""; // nope
-                       String title = "";
-                       String date = "";
-                       String details = "";
-                       String body = "";
-                       String categ = "";
-                       String author = ""; // nope
-
-                       Element categElement = article.previousElementSibling();
-                       if (categElement != null) {
-                               categ = categElement.text().trim();
-                       }
+       @Override
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+                               "https://www.theregister.co.uk/"), ""));
+               return urls;
+       }
 
 
-                       Element titleElement = article.getElementsByTag("h4").first();
-                       if (titleElement != null) {
-                               title = StringUtils.unhtml(titleElement.text()).trim();
-                       }
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByClass("story_link");
+       }
 
 
-                       Element dateElement = article.getElementsByClass("time_stamp")
-                                       .first();
-                       if (dateElement != null) {
-                               String epochS = dateElement.attr("data-epoch");
-                               if (epochS != null && !epochS.isEmpty()) {
-                                       id = epochS;
-                                       date = date(epochS);
-                               }
-                       }
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return "";
+       }
 
 
-                       if (id.isEmpty()) {
-                               // fallback
-                               id = article.attr("href").replace("/", "_");
-                       }
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element titleElement = article.getElementsByTag("h4").first();
+               if (titleElement != null) {
+                       return titleElement.text();
+               }
 
 
-                       Element detailsElement = article.getElementsByClass("standfirst")
-                                       .first();
-                       details = "(" + date + ") ";
-                       if (detailsElement != null) {
-                               details += StringUtils.unhtml(detailsElement.text()).trim();
-                       }
+               return "";
+       }
+
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               return "";
+       }
 
 
-                       // We have some "details" but no content, so we switch them:
-                       body = details;
-                       details = "";
-                       list.add(new Story(getType(), id, title, author, date, categ,
-                                       details, intUrl, extUrl, body));
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               Element dateElement = article.getElementsByClass("time_stamp").first();
+               if (dateElement != null) {
+                       return dateElement.attr("data-epoch");
                }
 
                }
 
-               return list;
+               return "";
        }
 
        @Override
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               String fullContent = story.getContent();
-               List<Comment> comments = new ArrayList<Comment>();
-               story.setComments(comments);
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               Element categElement = article.previousElementSibling();
+               if (categElement != null) {
+                       return categElement.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               // We have some "details" but no content, so we switch them:
+               return "";
+       }
+
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               return article.absUrl("href");
+       }
+
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               // We have some "details" but no content, so we switch them:
+               Element detailsElement = article.getElementsByClass("standfirst")
+                               .first();
+               if (detailsElement != null) {
+                       return detailsElement.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return doc.getElementById("body");
+       }
 
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               List<Element> commentElements = new ArrayList<Element>();
+
+               // Get comments URL then parse it
                try {
                try {
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element article = doc.getElementById("body");
-                       if (article != null) {
-                               for (String line : toLines(article,
-                                               new BasicElementProcessor() {
-                                                       // TODO: ignore headlines/pub
-                                               })) {
-                                       fullContent += line + "\n";
+                       URL url = new URL("https://forums.theregister.co.uk/forum/1"
+                                       + intUrl.getPath());
+                       InputStream in = downloader.open(url);
+                       try {
+                               doc = DataUtil.load(in, "UTF-8", url.toString());
+                               Element posts = doc.getElementById("forum_posts");
+                               if (posts != null) {
+                                       for (Element post : posts.getElementsByClass("post")) {
+                                               commentElements.add(post);
+                                               Element inReplyTo = post.getElementsByClass(
+                                                               "in-reply-to").first();
+                                               if (inReplyTo != null) {
+                                                       String parentId = inReplyTo.absUrl("href");
+                                                       if (parentId != null && parentId.contains("/")) {
+                                                               int i = parentId.lastIndexOf('/');
+                                                               parentId = parentId.substring(i + 1);
+
+                                                               commentReplies
+                                                                               .put(getCommentId(post), parentId);
+                                                       }
+                                               }
+                                       }
                                }
                                }
+                       } finally {
+                               in.close();
+                       }
+               } catch (IOException e) {
+               }
+
+               return commentElements;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor();
+       }
+
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               return null;
+       }
 
 
-                               // Content is too tight with a single break per line:
-                               fullContent = fullContent.replace("\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .trim();
+       @Override
+       protected String getCommentId(Element post) {
+               Element idE = post.getElementsByTag("a").first();
+               if (idE != null) {
+                       String id = idE.attr("id");
+                       if (id.startsWith("c_")) {
+                               id = id.substring(2);
                        }
 
                        }
 
-                       story.setFullContent(fullContent);
-
-                       // Get comments URL then parse it
-                       in.close();
-                       in = null;
-                       in = downloader
-                                       .open(new URL("https://forums.theregister.co.uk/forum/1"
-                                                       + url.getPath()));
-                       doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element posts = doc.getElementById("forum_posts");
-                       if (posts != null) {
-                               for (Element post : posts.getElementsByClass("post")) {
-                                       String id = "";
-                                       String author = "";
-                                       String title = "";
-                                       String date = "";
-                                       List<String> content = new ArrayList<String>();
-
-                                       Element idE = post.getElementsByTag("a").first();
-                                       if (idE != null) {
-                                               id = idE.attr("id");
-                                               if (id.startsWith("c_")) {
-                                                       id = id.substring(2);
-                                               }
+                       return id;
+               }
 
 
-                                               Element dateE = idE.getElementsByTag("span").first();
-                                               if (dateE != null) {
-                                                       date = date(dateE.attr("data-epoch"));
-                                               }
-                                       }
+               return "";
+       }
 
 
-                                       Element authorE = post.getElementsByClass("author").first();
-                                       if (authorE != null) {
-                                               author = StringUtils.unhtml(authorE.text()).trim();
-                                       }
+       @Override
+       protected String getCommentAuthor(Element post) {
+               Element author = post.getElementsByClass("author").first();
+               if (author != null) {
+                       return author.text();
+               }
 
 
-                                       Element titleE = post.getElementsByTag("h4").first();
-                                       if (titleE != null) {
-                                               title = StringUtils.unhtml(titleE.text()).trim();
-                                       }
+               return "";
+       }
 
 
-                                       Element contentE = post.getElementsByClass("body").first();
-                                       if (contentE != null) {
-                                               for (String line : toLines(contentE,
-                                                               new BasicElementProcessor() {
-                                                                       @Override
-                                                                       public boolean ignoreNode(Node node) {
-                                                                               // TODO: ignore headlines/pub
-
-                                                                               // Remove the comment title (which has
-                                                                               // already been processed earlier)
-                                                                               if (node instanceof Element) {
-                                                                                       Element el = (Element) node;
-                                                                                       if ("h4".equals(el.tagName())) {
-                                                                                               return true;
-                                                                                       }
-                                                                               }
-
-                                                                               return false;
-                                                                       }
-                                                               })) {
-                                                       content.add(line);
-                                               }
-                                       }
+       @Override
+       protected String getCommentTitle(Element post) {
+               Element title = post.getElementsByTag("h4").first();
+               if (title != null) {
+                       return title.text();
+               }
 
 
-                                       Comment comment = new Comment(id, author, title, date,
-                                                       content);
-                                       Comment parent = null;
-
-                                       Element inReplyTo = post.getElementsByClass("in-reply-to")
-                                                       .first();
-                                       if (inReplyTo != null) {
-                                               String parentId = inReplyTo.absUrl("href");
-                                               if (parentId != null && parentId.contains("/")) {
-                                                       int i = parentId.lastIndexOf('/');
-                                                       parentId = parentId.substring(i + 1);
-                                                       parent = story.getCommentById(parentId);
-                                               }
-                                       }
+               return "";
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               Element id = post.getElementsByTag("a").first();
+               if (id != null) {
+                       Element date = id.getElementsByTag("span").first();
+                       if (date != null) {
+                               return date.attr("data-epoch");
+                       }
+               }
+
+               return "";
+       }
+
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               return post.getElementsByClass("body").first();
+       }
 
 
-                                       if (parent == null) {
-                                               comments.add(comment);
-                                       } else {
-                                               parent.add(comment);
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               // Remove the comment title (which has
+                               // already been processed earlier)
+                               if (node instanceof Element) {
+                                       Element el = (Element) node;
+                                       if ("h4".equals(el.tagName())) {
+                                               return true;
                                        }
                                }
                                        }
                                }
+
+                               return false;
                        }
                        }
-               } finally {
-                       if (in != null) {
-                               in.close();
-                       }
-               }
+               };
        }
 }
        }
 }
index 0cc4c6cf58839d9646ad5d567f48767dfdff068e..ba909cfdc63fa60b8fdae0beba04efa0e709c075 100644 (file)
@@ -1,20 +1,15 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.URL;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map.Entry;
 
 
-import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
 
 /**
  * Support <a href="https://www.toolinux.com/">https://www.toolinux.com/</a>.
 
 /**
  * Support <a href="https://www.toolinux.com/">https://www.toolinux.com/</a>.
@@ -28,97 +23,141 @@ public class TooLinux extends BasicSupport {
        }
 
        @Override
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
-
-               URL url = new URL("https://www.toolinux.com/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByClass("hentry");
-               for (Element article : articles) {
-                       String id = "";
-                       String intUrl = "";
-                       String extUrl = ""; // nope
-                       String title = "";
-                       String date = "";
-                       String details = "";
-                       String body = "";
-                       String author = ""; // nope
-                       String categ = ""; // nope
-
-                       Element urlElement = article.getElementsByTag("a").first();
-                       if (urlElement != null) {
-                               intUrl = urlElement.absUrl("href");
-                       }
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+                               "https://www.toolinux.com/"), ""));
+               return urls;
+       }
 
 
-                       Element titleElement = article.getElementsByClass("entry-title")
-                                       .first();
-                       if (titleElement != null) {
-                               title = StringUtils.unhtml(titleElement.text()).trim();
-                       }
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               return doc.getElementsByClass("hentry");
+       }
 
 
-                       Element dateElement = article.getElementsByClass("published")
-                                       .first();
-                       if (dateElement != null) {
-                               date = StringUtils.unhtml(dateElement.text()).trim();
-                               id = dateElement.attr("title").trim();
-                       }
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return ""; // We use the date
+       }
 
 
-                       if (id.isEmpty()) {
-                               // fallback
-                               id = intUrl.replace("/", "_");
-                       }
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element titleElement = article.getElementsByClass("entry-title")
+                               .first();
+               if (titleElement != null) {
+                       return titleElement.text();
+               }
 
 
-                       Element bodyElement = article.getElementsByClass("introduction")
-                                       .first();
-                       if (bodyElement != null) {
-                               body = StringUtils.unhtml(bodyElement.text()).trim();
-                       }
+               return "";
+       }
 
 
-                       list.add(new Story(getType(), id, title, author, date, categ,
-                                       details, intUrl, extUrl, body));
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               Element dateElement = article.getElementsByClass("published").first();
+               if (dateElement != null) {
+                       return dateElement.text();
                }
 
                }
 
-               return list;
-       }
-
-       @Override
-       public void fetch(Story story) throws IOException {
-               String fullContent = story.getContent();
-               List<Comment> comments = new ArrayList<Comment>();
-               story.setComments(comments);
-
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
-               try {
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element article = doc.getElementById("content");
-                       if (article != null) {
-                               for (String line : toLines(article,
-                                               new BasicElementProcessor() {
-                                                       @Override
-                                                       public boolean ignoreNode(Node node) {
-                                                               if ("notes".equals(node.attr("class"))) {
-                                                                       return true;
-                                                               }
-                                                               return false;
-                                                       }
-                                               })) {
-                                       fullContent += line + "\n";
-                               }
+               return "";
+       }
 
 
-                               // Content is too tight with a single break per line:
-                               fullContent = fullContent.replace("\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .trim();
-                       }
+       @Override
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               return "";
+       }
 
 
-                       story.setFullContent(fullContent);
-               } finally {
-                       if (in != null) {
-                               in.close();
-                       }
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               Element urlElement = article.getElementsByTag("a").first();
+               if (urlElement != null) {
+                       return urlElement.absUrl("href");
                }
                }
+
+               return "";
+       }
+
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
+
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               Element content = article.getElementsByClass("introduction").first();
+               if (content != null) {
+                       return content.text();
+               }
+
+               return "";
+       }
+
+       @Override
+       protected Element getFullArticle(Document doc) {
+               return doc.getElementById("content");
+       }
+
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               return null;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               if ("notes".equals(node.attr("class"))) {
+                                       return true;
+                               }
+                               return false;
+                       }
+               };
+       }
+
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentId(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentAuthor(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentTitle(Element post) {
+               return null;
+       }
+
+       @Override
+       protected String getCommentDate(Element post) {
+               return null;
+       }
+
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               return null;
+       }
+
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return null;
        }
 }
        }
 }
diff --git a/src/be/nikiroo/gofetch/support/Type.java b/src/be/nikiroo/gofetch/support/Type.java
new file mode 100644 (file)
index 0000000..dadbec1
--- /dev/null
@@ -0,0 +1,23 @@
+package be.nikiroo.gofetch.support;
+
+/**
+ * The support type (each website we support has a single type).
+ * 
+ * @author niki
+ */
+public enum Type {
+       /** EN: Any, but mostly IT/Sci */
+       SLASHDOT,
+       /** EN: Clone of Slashdot, mostly abandoned */
+       PIPEDOT,
+       /** EN: Linux */
+       LWN,
+       /** FR: Any */
+       LEMONDE,
+       /** EN: IT */
+       REGISTER,
+       /** FR: Linux */
+       TOO_LINUX,
+       /** FR: IT */
+       ERE_NUMERIQUE,
+}