Bug fixes + rework of BasicSupport
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
index b15fac7e5e2598d0d67c3bcf493c6dae03a0a8a1..a59ae313fb1f1fda8979020d7e6315d81ba6592e 100644 (file)
@@ -1,22 +1,29 @@
 package be.nikiroo.gofetch.support;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
+import java.util.Map.Entry;
 
+import org.jsoup.helper.DataUtil;
 import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
-import org.jsoup.select.Elements;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 
+import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
 import be.nikiroo.utils.Downloader;
+import be.nikiroo.utils.StringUtils;
 
 /**
  * Base class for website support.
@@ -27,140 +34,226 @@ public abstract class BasicSupport {
        /** The downloader to use for all websites. */
        protected static Downloader downloader = new Downloader("gofetcher");
 
+       static private String preselector;
+
+       private Type type;
+
+       /**
+        * The website textual description, to add in the dispatcher page.
+        * <p>
+        * Should be short.
+        * 
+        * @return the description
+        */
+       abstract public String getDescription();
+
        /**
-        * The support type (each website we support has a single type).
-        * 
-        * @author niki
-        */
-       public enum Type {
-               /** EN: Any, but mostly IT/Sci */
-               SLASHDOT,
-               /** EN: Clone of Slashdot, mostly abandoned */
-               PIPEDOT,
-               /** EN: Linux */
-               LWN,
-               /** FR: Any */
-               LEMONDE,
-               /** EN: IT */
-               REGISTER,
-               /** FR: Linux */
-               TOO_LINUX,
-               /** FR: IT */
-               ERE_NUMERIQUE,
+        * The gopher "selector" to use for output.
+        * <p>
+        * A kind of "URL path", like "/news/" or "/misc/news/" or...
+        * 
+        * @return the selector
+        */
+       public String getSelector() {
+               return getSelector(type);
        }
 
        /**
-        * Used to process an element into lines.
-        * 
-        * @author niki
-        */
-       public interface ElementProcessor {
-               /**
-                * Detect if this node is a quote and should be trated as such.
-                * 
-                * @param node
-                *            the node to check
-                * @return TRUE if it is
-                */
-               public boolean detectQuote(Node node);
-
-               /**
-                * Process text content (will be called on each text element, allowing
-                * you to modify it if needed).
-                * 
-                * @param text
-                *            the text to process
-                * 
-                * @return the resulting text
-                */
-               public String processText(String text);
-
-               /**
-                * Ignore this node.
-                * 
-                * @param node
-                *            the node to ignore
-                * @return TRUE if it has to be ignored
-                */
-               public boolean ignoreNode(Node node);
-
-               /**
-                * Manually process this node (and return the manual processing value)
-                * if so desired.
-                * <p>
-                * If the node is manually processed, it and its children will not be
-                * automatically processed.
-                * 
-                * @param node
-                *            the node to optionally process
-                * 
-                * @return NULL if not processed (will thus be automatically processed
-                *         as usual), a {@link String} (may be empty) if we process it
-                *         manually -- the given {@link String} will be used instead of
-                *         the usual automatic processing if not NULL
-                */
-               public String manualProcessing(Node node);
-
-               /**
-                * This {@link Node} is a subtitle and should be treated as such
-                * (highlighted).
-                * 
-                * @param node
-                *            the node to check
-                * 
-                * @return NULL if it is not a subtitle, the subtitle to use if it is
-                */
-               public String isSubtitle(Node node);
+        * The support type.
+        * 
+        * @return the type
+        */
+       public Type getType() {
+               return type;
        }
 
        /**
-        * A default {@link ElementProcessor} (will not detect or process anything
-        * manually).
+        * List all the recent items, but only assure the ID and internal URL to
+        * fetch it later on (until it has been fetched, the rest of the
+        * {@link Story} is not confirmed).
         * 
-        * @author niki
+        * @return the list of new stories
+        * 
+        * @throws IOException
+        *             in case of I/O
         */
-       public class BasicElementProcessor implements ElementProcessor {
-               @Override
-               public boolean detectQuote(Node node) {
-                       return false;
-               }
+       public List<Story> list() throws IOException {
+               List<Story> list = new ArrayList<Story>();
+
+               for (Entry<URL, String> entry : getUrls()) {
+                       URL url = entry.getKey();
+                       String defaultCateg = entry.getValue();
+                       if (defaultCateg == null) {
+                               defaultCateg = "";
+                       }
 
-               @Override
-               public String processText(String text) {
-                       return text;
-               }
+                       InputStream in = downloader.open(url);
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       List<Element> articles = getArticles(doc);
+                       for (Element article : articles) {
+                               String id = getArticleId(doc, article).trim();
+                               String title = getArticleTitle(doc, article).trim();
+                               String author = getArticleAuthor(doc, article).trim();
+                               String date = getArticleDate(doc, article).trim();
+                               String categ = getArticleCategory(doc, article, defaultCateg)
+                                               .trim();
+                               String details = getArticleDetails(doc, article).trim();
+                               String intUrl = getArticleIntUrl(doc, article).trim();
+                               String extUrl = getArticleExtUrl(doc, article).trim();
+                               String content = getArticleContent(doc, article).trim();
+
+                               if (id.isEmpty() && date.isEmpty()) {
+                                       continue;
+                               }
 
-               @Override
-               public boolean ignoreNode(Node node) {
-                       return false;
-               }
+                               if (id.isEmpty()) {
+                                       id = date.replace(":", "_").replace("+", "_");
+                               }
 
-               @Override
-               public String manualProcessing(Node node) {
-                       return null;
-               }
+                               date = date(date);
 
-               @Override
-               public String isSubtitle(Node node) {
-                       return null;
+                               list.add(new Story(getType(), id, title, author, date, categ,
+                                               details, intUrl, extUrl, content));
+                       }
                }
+
+               return list;
        }
 
-       static private String preselector;
+       /**
+        * The {@link URL}s to process for this website.
+        * 
+        * @return the list of {@link URL}s
+        * 
+        * @throws IOException
+        *             in case of I/O error
+        */
+       abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 
-       private Type type;
+       /**
+        * The article {@link Element}s of this document.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * 
+        * @return the articles
+        */
+       abstract protected List<Element> getArticles(Document doc);
 
        /**
-        * List all the recent items, but only assure the ID and internal URL to
-        * fetch it later on (until it has been fetched, the rest of the
-        * {@link Story} is not confirmed).
+        * The ID of the article (defaults to the date element if empty).
         * 
-        * @return the list of new stories
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
         * 
-        * @throws IOException
-        *             in case of I/O
+        * @return the ID
+        */
+       abstract protected String getArticleId(Document doc, Element article);
+
+       /**
+        * The article title to display.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the title
+        */
+       abstract protected String getArticleTitle(Document doc, Element article);
+
+       /**
+        * The optional article author.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the author
+        */
+       abstract protected String getArticleAuthor(Document doc, Element article);
+
+       /**
+        * The optional article date.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the date
+        */
+       abstract protected String getArticleDate(Document doc, Element article);
+
+       /**
+        * The optional article category.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * @param currentCategory
+        *            the currently listed category if any (can be NULL)
+        * 
+        * @return the category
         */
-       abstract public List<Story> list() throws IOException;
+       abstract protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory);
+
+       /**
+        * The optional details of the article (can replace the date, author and
+        * category, for instance).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the details
+        */
+       abstract protected String getArticleDetails(Document doc, Element article);
+
+       /**
+        * The (required) {@link URL} that points to the news page on the supported
+        * website.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the internal {@link URL}
+        */
+       abstract protected String getArticleIntUrl(Document doc, Element article);
+
+       /**
+        * The optional {@link URL} that points to an external website for more
+        * information.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the external {@link URL}
+        */
+       abstract protected String getArticleExtUrl(Document doc, Element article);
+
+       /**
+        * The optional article short-content (not the full content, that will be
+        * fetched by {@link BasicSupport#fetch(Story)}).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the short content
+        */
+       abstract protected String getArticleContent(Document doc, Element article);
 
        /**
         * Fetch the full article content as well as all the comments associated to
@@ -172,37 +265,211 @@ public abstract class BasicSupport {
         * @throws IOException
         *             in case of I/O error
         */
-       abstract public void fetch(Story story) throws IOException;
+       public void fetch(Story story) throws IOException {
+               String fullContent = "";
+
+               URL url = new URL(story.getUrlInternal());
+               InputStream in = downloader.open(url);
+               try {
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       Element article = getFullArticle(doc);
+                       if (article != null) {
+                               StringBuilder builder = new StringBuilder();
+                               ElementProcessor eProc = getElementProcessorFullArticle();
+                               if (eProc != null) {
+                                       for (String line : toLines(article, eProc)) {
+                                               builder.append(line + "\n");
+                                       }
+                               } else {
+                                       builder.append(article.text());
+                               }
+
+                               // Content is too tight with a single break per line:
+                               fullContent = builder.toString().replace("\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .trim();
+                       }
+
+                       if (fullContent.isEmpty()) {
+                               fullContent = story.getContent();
+                       }
+
+                       story.setFullContent(fullContent);
+                       story.setComments(getComments(doc,
+                                       getFullArticleCommentPosts(doc, url)));
+               } finally {
+                       if (in != null) {
+                               in.close();
+                       }
+               }
+       }
 
        /**
-        * The website textual description, to add in the dispatcher page.
-        * <p>
-        * Should be short.
+        * Return the full article if available.
         * 
-        * @return the description
+        * @param doc
+        *            the (full article) document to work on
+        * 
+        * @return the article or NULL
         */
-       abstract public String getDescription();
+       abstract protected Element getFullArticle(Document doc);
 
        /**
-        * The gopher "selector" to use for output.
+        * Return the list of comment {@link Element}s from this optional container
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param intUrl
+        *            the internal {@link URL} this article was taken from (the
+        *            {@link URL} from the supported website)
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getFullArticleCommentPosts(Document doc,
+                       URL intUrl);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main article element
+        * (see {@link BasicSupport#getFullArticle(Document)}) into text.
         * <p>
-        * A kind of "URL path", like "/news/" or "/misc/news/" or...
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
         * 
-        * @return the selector
+        * @return the processor, or NULL
         */
-       public String getSelector() {
-               return getSelector(type);
-       }
+       abstract protected ElementProcessor getElementProcessorFullArticle();
 
        /**
-        * The support type.
+        * Convert the comment elements into {@link Comment}s
         * 
-        * @return the type
+        * @param doc
+        *            the document we work on
+        * @param posts
+        *            the comment elements
+        * 
+        * @return the converted {@link Comment}s
         */
-       public Type getType() {
-               return type;
+       private List<Comment> getComments(Document doc, List<Element> posts) {
+               List<Comment> comments = new ArrayList<Comment>();
+               if (posts != null) {
+                       for (Element post : posts) {
+                               String id = getCommentId(post).trim();
+                               String author = getCommentAuthor(post).trim();
+                               String title = getCommentTitle(post).trim();
+                               String date = getCommentDate(post).trim();
+
+                               List<String> content = new ArrayList<String>();
+
+                               if (id.isEmpty()) {
+                                       id = date;
+                               }
+
+                               date = date(date);
+
+                               Element contentE = getCommentContentElement(post);
+                               if (contentE != null) {
+                                       ElementProcessor eProc = getElementProcessorComment();
+                                       if (eProc != null) {
+                                               for (String line : toLines(contentE, eProc)) {
+                                                       content.add(line);
+                                               }
+                                       } else {
+                                               content = Arrays.asList(contentE.text().split("\n"));
+                                       }
+                               }
+
+                               Comment comment = new Comment(id, author, title, date, content);
+                               comment.addAll(getComments(doc,
+                                               getCommentCommentPosts(doc, post)));
+
+                               if (!comment.isEmpty()) {
+                                       comments.add(comment);
+                               }
+                       }
+               }
+
+               return comments;
        }
 
+       /**
+        * Return the list of subcomment {@link Element}s from this comment element
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param container
+        *            the container (a comment {@link Element})
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container);
+
+       /**
+        * Compute the ID of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the ID
+        */
+       abstract protected String getCommentId(Element post);
+
+       /**
+        * Compute the author of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the author
+        */
+       abstract protected String getCommentAuthor(Element post);
+
+       /**
+        * Compute the title of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the title
+        */
+       abstract protected String getCommentTitle(Element post);
+
+       /**
+        * Compute the date of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the date
+        */
+       abstract protected String getCommentDate(Element post);
+
+       /**
+        * Get the main content element of the given comment, which can be NULL.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the element
+        */
+       abstract protected Element getCommentContentElement(Element post);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main comment element
+        * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
+        * <p>
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
+        * 
+        * @return the processor
+        */
+       abstract protected ElementProcessor getElementProcessorComment();
+
        /**
         * The support type.
         * 
@@ -284,46 +551,6 @@ public abstract class BasicSupport {
                return preselector + "/" + type + "/";
        }
 
-       /**
-        * Get the first {@link Element} of the given class, or an empty span
-        * {@link Element} if none found.
-        * 
-        * @param element
-        *            the element to look in
-        * @param className
-        *            the class to look for
-        * 
-        * @return the value or an empty span {@link Element}
-        */
-       static protected Element firstOrEmpty(Element element, String className) {
-               Elements subElements = element.getElementsByClass(className);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
-       }
-
-       /**
-        * Get the first {@link Element} of the given tag, or an empty span
-        * {@link Element} if none found.
-        * 
-        * @param element
-        *            the element to look in
-        * @param tagName
-        *            the tag to look for
-        * 
-        * @return the value or an empty span {@link Element}
-        */
-       static protected Element firstOrEmptyTag(Element element, String tagName) {
-               Elements subElements = element.getElementsByTag(tagName);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
-       }
-
        /**
         * Process the given element into text (each line is a text paragraph and
         * can be prepended with ">" signs to indicate a quote or sub-quote or
@@ -342,6 +569,7 @@ public abstract class BasicSupport {
                final StringBuilder currentLine = new StringBuilder();
                final List<Integer> quoted = new ArrayList<Integer>();
                final List<Node> ignoredNodes = new ArrayList<Node>();
+               final List<String> footnotes = new ArrayList<String>();
 
                if (element != null) {
                        new NodeTraversor(new NodeVisitor() {
@@ -369,6 +597,18 @@ public abstract class BasicSupport {
                                                }
                                        }
 
+                                       // <pre> check
+                                       if (!ignore) {
+                                               if (node instanceof Element) {
+                                                       Element el = (Element) node;
+                                                       if ("pre".equals(el.tagName())) {
+                                                               currentLine.append(StringUtils
+                                                                               .unhtml(el.text()).trim());
+                                                               ignore = true;
+                                                       }
+                                               }
+                                       }
+
                                        if (ignore) {
                                                ignoredNodes.add(node);
                                                return;
@@ -410,6 +650,11 @@ public abstract class BasicSupport {
                                                if (block && currentLine.length() > 0) {
                                                        currentLine.append("\n");
                                                }
+
+                                               if (!element.absUrl("href").trim().isEmpty()) {
+                                                       footnotes.add(element.absUrl("href"));
+                                                       currentLine.append("[" + footnotes.size() + "]");
+                                               }
                                        } else if (node instanceof TextNode) {
                                                TextNode textNode = (TextNode) node;
                                                String line = StringUtil.normaliseWhitespace(textNode
@@ -442,11 +687,35 @@ public abstract class BasicSupport {
                        }
                }
 
+               // Fix spaces and nbsp, remove multiple following blank lines
+               List<String> linesCopy = new ArrayList<String>(lines.size());
+               long blanks = 0;
                for (int i = 0; i < lines.size(); i++) {
-                       lines.set(i, lines.get(i).replace("  ", " ").trim());
+                       String line = lines.get(i).replace(" ", " ") // nbsp -> space
+                                       .replace("  ", " ").trim();
+                       if (line.isEmpty()) {
+                               blanks++;
+                       } else {
+                               blanks = 0;
+                       }
+
+                       if (blanks < 2) {
+                               linesCopy.add(line);
+                       }
+               }
+
+               // Footnotes insertion
+               if (footnotes.size() > 0) {
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       for (int i = 0; i < footnotes.size(); i++) {
+                               linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+                       }
                }
 
-               return lines;
+               return linesCopy;
        }
 
        /**
@@ -457,7 +726,7 @@ public abstract class BasicSupport {
         * 
         * @return the reformated date, or the same value if it was not parsable
         */
-       static protected String date(String date) {
+       static private String date(String date) {
                SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 
                long epoch = 0;