Reddit test: add expected files

[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java

index b7eaca3cfe8e6917e3507212cfbb24d29657be5b..17a3c151750e9c7e56169fe4fd8be87b9eb39855 100644 (file)
--- a/src/be/nikiroo/gofetch/support/BasicSupport.java
+++ b/src/be/nikiroo/gofetch/support/BasicSupport.java
@@ -1,110 +1,92 @@
  package be.nikiroo.gofetch.support;
  
  import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
  import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
  import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
  
+import org.jsoup.helper.DataUtil;
  import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
  import org.jsoup.nodes.Element;
  import org.jsoup.nodes.Node;
  import org.jsoup.nodes.TextNode;
-import org.jsoup.select.Elements;
  import org.jsoup.select.NodeTraversor;
  import org.jsoup.select.NodeVisitor;
  
+import be.nikiroo.gofetch.data.Comment;
  import be.nikiroo.gofetch.data.Story;
  import be.nikiroo.utils.Downloader;
+import be.nikiroo.utils.StringUtils;
  
+/**
+ * Base class for website support.
+ * 
+ * @author niki
+ */
  public abstract class BasicSupport {
-       protected static Downloader downloader = new Downloader("gofetcher");
+       /**
+        * The downloader to use for all web sites via
+        * {@link BasicSupport#open(URL)}
+        */
+       static private Downloader downloader = new Downloader("gofetcher");
  
-       public enum Type {
-               SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER, 
-       }
+       static private String preselector;
  
         /**
-        * Used to process an element into lines.
-        * 
-        * @author niki
-        */
-       public interface ElementProcessor {
-               /**
-                * Detect if this node is a quote and should be trated as such.
-                * 
-                * @param node
-                *            the node to check
-                * @return TRUE if it is
-                */
-               public boolean detectQuote(Node node);
-
-               /**
-                * Process text content (will be called on each text element, allowing
-                * you to modify it if needed).
-                * 
-                * @param text
-                *            the text to process
-                * @return
-                */
-               public String processText(String text);
-
-               /**
-                * Ignore this node.
-                * 
-                * @param node
-                *            the node to ignore
-                * @return TRUE if it has to be ignored
-                */
-               public boolean ignoreNode(Node node);
-
-               /**
-                * Manually process this node (and return the manual processing value)
-                * if so desired.
-                * <p>
-                * If the node is manually processed, it and its children will not be
-                * automatically processed.
-                * 
-                * @param node
-                *            the node to optionally process
-                * 
-                * @return NULL if not processed (will thus be automatically processed
-                *         as usual), a {@link String} (may be empty) if we process it
-                *         manually -- the given {@link String} will be used instead of
-                *         the usual automatic processing if not NULL
-                */
-               public String manualProcessing(Node node);
-       }
+        * The optional cookies to use to get the site data.
+        */
+       private Map<String, String> cookies = new HashMap<String, String>();
+
+       private Type type;
  
         /**
-        * A default {@link ElementProcessor} (will not detect or process anything
-        * manually).
+        * Login on the web site (this method does nothing by default, but can be
+        * overridden if needed).
+        * 
+        * @throws IOException
+        *             in case of I/O error
          * 
-        * @author niki
          */
-       public class BasicElementProcessor implements ElementProcessor {
-               @Override
-               public boolean detectQuote(Node node) {
-                       return false;
-               }
-
-               @Override
-               public String processText(String text) {
-                       return text;
-               }
+       public void login() throws IOException {
+       }
  
-               @Override
-               public boolean ignoreNode(Node node) {
-                       return false;
-               }
+       /**
+        * The website textual description, to add in the dispatcher page.
+        * <p>
+        * Should be short.
+        * 
+        * @return the description
+        */
+       abstract public String getDescription();
  
-               @Override
-               public String manualProcessing(Node node) {
-                       return null;
-               }
+       /**
+        * The gopher "selector" to use for output.
+        * <p>
+        * A kind of "URL path", like "/news/" or "/misc/news/" or...
+        * 
+        * @return the selector
+        */
+       public String getSelector() {
+               return getSelector(getType());
         }
  
-       static private String preselector;
-
-       private Type type;
+       /**
+        * The support type.
+        * 
+        * @return the type
+        */
+       public Type getType() {
+               return type;
+       }
  
         /**
          * List all the recent items, but only assure the ID and internal URL to
@@ -116,7 +98,188 @@ public abstract class BasicSupport {
          * @throws IOException
          *             in case of I/O
          */
-       abstract public List<Story> list() throws IOException;
+       public List<Story> list() throws IOException {
+               List<Story> list = new ArrayList<Story>();
+
+               login();
+               for (Entry<URL, String> entry : getUrls()) {
+                       URL url = entry.getKey();
+                       String defaultCateg = entry.getValue();
+                       if (defaultCateg == null) {
+                               defaultCateg = "";
+                       }
+
+                       // Parse the whole page first, then always release the network
+                       // stream (even on error), as fetch(Story) does
+                       Document doc;
+                       InputStream in = open(url);
+                       try {
+                               doc = DataUtil.load(in, "UTF-8", url.toString());
+                       } finally {
+                               if (in != null) {
+                                       in.close();
+                               }
+                       }
+
+                       List<Element> articles = getArticles(doc);
+                       for (Element article : articles) {
+                               String id = getArticleId(doc, article).trim();
+                               String title = getArticleTitle(doc, article).trim();
+                               String author = getArticleAuthor(doc, article).trim();
+                               String date = getArticleDate(doc, article).trim();
+                               String categ = getArticleCategory(doc, article, defaultCateg)
+                                               .trim();
+                               String details = getArticleDetails(doc, article).trim();
+                               String intUrl = getArticleIntUrl(doc, article).trim();
+                               String extUrl = getArticleExtUrl(doc, article).trim();
+                               String content = getArticleContent(doc, article).trim();
+
+                               // Skip the entries that have neither an ID nor a date
+                               if (id.isEmpty() && date.isEmpty()) {
+                                       continue;
+                               }
+
+                               if (!id.isEmpty()) {
+                                       // Left-pad the ID with zeroes up to 10 characters
+                                       while (id.length() < 10) {
+                                               id = "0" + id;
+                                       }
+                               } else {
+                                       // No ID: build one from the raw (not yet reformatted)
+                                       // date, replacing the ':', '+' and '/' characters
+                                       id = date.replace(":", "_").replace("+", "_")
+                                                       .replace("/", "-");
+                               }
+
+                               // The raw date is no longer needed: reformat it if possible
+                               date = date(date);
+
+                               list.add(new Story(getType(), id, title, author, date, categ,
+                                               details, intUrl, extUrl, content));
+                       }
+               }
+
+               return list;
+       }
+
+       /**
+        * The {@link URL}s to process for this website.
+        * 
+        * @return the list of {@link URL}s
+        * 
+        * @throws IOException
+        *             in case of I/O error
+        */
+       abstract protected List<Entry<URL, String>> getUrls() throws IOException;
+
+       /**
+        * The article {@link Element}s of this document.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * 
+        * @return the articles
+        */
+       abstract protected List<Element> getArticles(Document doc);
+
+       /**
+        * The ID of the article (defaults to the date element if empty).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the ID
+        */
+       abstract protected String getArticleId(Document doc, Element article);
+
+       /**
+        * The article title to display.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the title
+        */
+       abstract protected String getArticleTitle(Document doc, Element article);
+
+       /**
+        * The optional article author.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the author
+        */
+       abstract protected String getArticleAuthor(Document doc, Element article);
+
+       /**
+        * The optional article date.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the date
+        */
+       abstract protected String getArticleDate(Document doc, Element article);
+
+       /**
+        * The optional article category.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * @param currentCategory
+        *            the currently listed category if any (can be NULL)
+        * 
+        * @return the category
+        */
+       abstract protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory);
+
+       /**
+        * The optional details of the article (can replace the date, author and
+        * category, for instance).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the details
+        */
+       abstract protected String getArticleDetails(Document doc, Element article);
+
+       /**
+        * The (required) {@link URL} that points to the news page on the supported
+        * website.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the internal {@link URL}
+        */
+       abstract protected String getArticleIntUrl(Document doc, Element article);
+
+       /**
+        * The optional {@link URL} that points to an external website for more
+        * information.
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the external {@link URL}
+        */
+       abstract protected String getArticleExtUrl(Document doc, Element article);
+
+       /**
+        * The optional article short-content (not the full content, that will be
+        * fetched by {@link BasicSupport#fetch(Story)}).
+        * 
+        * @param doc
+        *            the main document for the current category
+        * @param article
+        *            the article to look into
+        * 
+        * @return the short content
+        */
+       abstract protected String getArticleContent(Document doc, Element article);
  
         /**
          * Fetch the full article content as well as all the comments associated to
@@ -128,23 +291,268 @@ public abstract class BasicSupport {
          * @throws IOException
          *             in case of I/O error
          */
-       abstract public void fetch(Story story) throws IOException;
+       public void fetch(Story story) throws IOException {
+               String fullContent = "";
+
+               URL url = new URL(story.getUrlInternal());
+               InputStream in = open(url);
+               // The stream is closed in the finally block below, even on error
+               try {
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       Element article = getFullArticle(doc);
+                       if (article != null) {
+                               fullContent = getArticleText(article);
+                       }
  
-       abstract public String getDescription();
+                       // No full text extracted: fall back to the content already
+                       // stored in the story
+                       if (fullContent.isEmpty()) {
+                               fullContent = story.getContent();
+                       }
  
-       public String getSelector() {
-               return getSelector(type);
+                       story.setFullContent(fullContent);
+                       // The comments are extracted from the same (full article)
+                       // document
+                       story.setComments(getComments(doc,
+                                       getFullArticleCommentPosts(doc, url)));
+               } finally {
+                       if (in != null) {
+                               in.close();
+                       }
+               }
         }
  
-       public Type getType() {
-               return type;
+       /**
+        * Return the text from this {@link Element}, using the
+        * {@link BasicSupport#getElementProcessorFullArticle()} processor logic.
+        * 
+        * @param article
+        *            the element to extract the text from
+        * 
+        * @return the text
+        */
+       protected String getArticleText(Element article) {
+               ElementProcessor processor = getElementProcessorFullArticle();
+
+               StringBuilder text = new StringBuilder();
+               if (processor == null) {
+                       // No processor available: use the plain text of the element
+                       text.append(article.text());
+               } else {
+                       // One processed line per output line
+                       for (String line : toLines(article, processor)) {
+                               text.append(line).append("\n");
+                       }
+               }
+
+               // A single break per line is too tight: double every line break,
+               // then reduce the runs of 4 line breaks back to 2 (in two passes)
+               String spaced = text.toString().replace("\n", "\n\n");
+               spaced = spaced.replace("\n\n\n\n", "\n\n");
+               spaced = spaced.replace("\n\n\n\n", "\n\n");
+
+               return spaced.trim();
         }
  
+       /**
+        * Return the full article if available (this is the article to retrieve
+        * from the newly downloaded page at {@link Story#getUrlInternal()}).
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * 
+        * @return the article or NULL
+        */
+       abstract protected Element getFullArticle(Document doc);
+
+       /**
+        * Return the list of comment {@link Element}s from this optional container
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param intUrl
+        *            the internal {@link URL} this article was taken from (the
+        *            {@link URL} from the supported website)
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getFullArticleCommentPosts(Document doc,
+                       URL intUrl);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main article element
+        * (see {@link BasicSupport#getFullArticle(Document)}) into text.
+        * <p>
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
+        * 
+        * @return the processor, or NULL
+        */
+       abstract protected ElementProcessor getElementProcessorFullArticle();
+
+       /**
+        * Open a network resource.
+        * <p>
+        * You need to close the returned {@link InputStream} when done.
+        * 
+        * @param url
+        *            the source to open
+        * 
+        * @return the content
+        * 
+        * @throws IOException
+        *             in case of I/O error
+        */
+       protected InputStream open(URL url) throws IOException {
+               // The cookies registered via addCookie() are handed over to the
+               // shared downloader on each call.
+               // NOTE(review): the meaning of the second "url" argument and of the
+               // three NULL arguments is defined by Downloader#open, which is not
+               // visible here -- confirm before changing them
+               return downloader.open(url, url, cookies, null, null, null);
+       }
+
+       /**
+        * Convert the comment elements into {@link Comment}s
+        * 
+        * @param doc
+        *            the document we work on
+        * @param posts
+        *            the comment elements
+        * 
+        * @return the converted {@link Comment}s
+        */
+       private List<Comment> getComments(Document doc, List<Element> posts) {
+               List<Comment> result = new ArrayList<Comment>();
+               if (posts == null) {
+                       // No comment elements at all: nothing to convert
+                       return result;
+               }
+
+               for (Element post : posts) {
+                       String id = getCommentId(post).trim();
+                       String author = getCommentAuthor(post).trim();
+                       String title = getCommentTitle(post).trim();
+                       String rawDate = getCommentDate(post).trim();
+
+                       // A comment without ID is identified by its raw date
+                       if (id.isEmpty()) {
+                               id = rawDate;
+                       }
+
+                       String date = date(rawDate);
+
+                       // Extract the text lines of the comment, if it has any content
+                       List<String> lines = new ArrayList<String>();
+                       Element body = getCommentContentElement(post);
+                       if (body != null) {
+                               ElementProcessor processor = getElementProcessorComment();
+                               if (processor == null) {
+                                       lines = Arrays.asList(body.text().split("\n"));
+                               } else {
+                                       for (String line : toLines(body, processor)) {
+                                               lines.add(line);
+                                       }
+                               }
+                       }
+
+                       // The sub-comments are converted recursively
+                       Comment comment = new Comment(id, author, title, date, lines);
+                       comment.addAll(getComments(doc,
+                                       getCommentCommentPosts(doc, post)));
+
+                       // Empty comments are dropped
+                       if (!comment.isEmpty()) {
+                               result.add(comment);
+                       }
+               }
+
+               return result;
+       }
+
+       /**
+        * Return the list of subcomment {@link Element}s from this comment element
+        * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+        * 
+        * @param doc
+        *            the (full article) document to work on
+        * @param container
+        *            the container (a comment {@link Element})
+        * 
+        * @return the list of comment posts
+        */
+       abstract protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container);
+
+       /**
+        * Compute the ID of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the ID
+        */
+       abstract protected String getCommentId(Element post);
+
+       /**
+        * Compute the author of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the author
+        */
+       abstract protected String getCommentAuthor(Element post);
+
+       /**
+        * Compute the title of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the title
+        */
+       abstract protected String getCommentTitle(Element post);
+
+       /**
+        * Compute the date of the given comment element.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the date
+        */
+       abstract protected String getCommentDate(Element post);
+
+       /**
+        * Get the main content element of the given comment, which can be NULL.
+        * 
+        * @param post
+        *            the comment element
+        * 
+        * @return the element
+        */
+       abstract protected Element getCommentContentElement(Element post);
+
+       /**
+        * The {@link ElementProcessor} to use to convert the main comment element
+        * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
+        * <p>
+        * See {@link BasicElementProcessor} for a working, basic implementation.
+        * <p>
+        * Can be NULL to simply use {@link Element#text()}.
+        * 
+        * @return the processor
+        */
+       abstract protected ElementProcessor getElementProcessorComment();
+
+       /**
+        * The support type.
+        * 
+        * @param type
+        *            the new type
+        */
         protected void setType(Type type) {
                 this.type = type;
         }
  
         /**
+        * Add a cookie for all site connections.
+        * 
+        * @param name
+        *            the cookie name
+        * @param value
+        *            the value
+        */
+       protected void addCookie(String name, String value) {
+               cookies.put(name, value);
+       }
+
+       /**
+        * The {@link String} to prepend to the selector (the selector will be
+        * constructed as "this string" then "/type/").
+        * 
          * @param preselector
          *            the preselector to set
          */
@@ -181,6 +589,21 @@ public abstract class BasicSupport {
                         case REGISTER:
                                 support = new TheRegister();
                                 break;
+                       case TOO_LINUX:
+                               support = new TooLinux();
+                               break;
+                       case ERE_NUMERIQUE:
+                               support = new EreNumerique();
+                               break;
+                       case PHORONIX:
+                               support = new Phoronix();
+                               break;
+                       case SEPT_SUR_SEPT:
+                               support = new SeptSurSept();
+                               break;
+                       case REDDIT:
+                               support = new Reddit();
+                               break;
                         }
  
                         if (support != null) {
@@ -191,48 +614,19 @@ public abstract class BasicSupport {
                 return support;
         }
  
-       static public String getSelector(Type type) {
-               return preselector + "/" + type + "/";
-       }
-
-       /**
-        * Get the first {@link Element} of the given class, or an empty span
-        * {@link Element} if none found.
-        * 
-        * @param element
-        *            the element to look in
-        * @param className
-        *            the class to look for
-        * 
-        * @return the value or an empty span {@link Element}
-        */
-       static protected Element firstOrEmpty(Element element, String className) {
-               Elements subElements = element.getElementsByClass(className);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
-       }
-
         /**
-        * Get the first {@link Element} of the given tag, or an empty span
-        * {@link Element} if none found.
+        * The gopher "selector" to use for output for this type, using the
+        * preselector.
+        * <p>
+        * A kind of "URL path", like "/news/" or "/misc/news/" or...
          * 
-        * @param element
-        *            the element to look in
-        * @param tagName
-        *            the tag to look for
+        * @param type
+        *            the type to get the selector of
          * 
-        * @return the value or an empty span {@link Element}
+        * @return the selector
          */
-       static protected Element firstOrEmptyTag(Element element, String tagName) {
-               Elements subElements = element.getElementsByTag(tagName);
-               if (subElements.size() > 0) {
-                       return subElements.get(0);
-               }
-
-               return new Element("span");
+       static public String getSelector(Type type) {
+               return preselector + "/" + type + "/";
         }
  
         /**
@@ -253,6 +647,7 @@ public abstract class BasicSupport {
                 final StringBuilder currentLine = new StringBuilder();
                 final List<Integer> quoted = new ArrayList<Integer>();
                 final List<Node> ignoredNodes = new ArrayList<Node>();
+               final List<String> footnotes = new ArrayList<String>();
  
                 if (element != null) {
                         new NodeTraversor(new NodeVisitor() {
@@ -261,6 +656,7 @@ public abstract class BasicSupport {
                                         String manual = null;
                                         boolean ignore = elementProcessor.ignoreNode(node)
                                                         || ignoredNodes.contains(node.parentNode());
+                                       // Manual processing
                                         if (!ignore) {
                                                 manual = elementProcessor.manualProcessing(node);
                                                 if (manual != null) {
@@ -269,6 +665,28 @@ public abstract class BasicSupport {
                                                 }
                                         }
  
+                                       // Subtitle check
+                                       if (!ignore) {
+                                               String subtitle = elementProcessor.isSubtitle(node);
+                                               if (subtitle != null) {
+                                                       subtitle = subtitle.trim();
+                                                       currentLine.append("\n[ " + subtitle + " ]\n");
+                                                       ignore = true;
+                                               }
+                                       }
+
+                                       // <pre> check
+                                       if (!ignore) {
+                                               if (node instanceof Element) {
+                                                       Element el = (Element) node;
+                                                       if ("pre".equals(el.tagName())) {
+                                                               currentLine.append(StringUtils
+                                                                               .unhtml(el.text()).trim());
+                                                               ignore = true;
+                                                       }
+                                               }
+                                       }
+
                                         if (ignore) {
                                                 ignoredNodes.add(node);
                                                 return;
@@ -310,6 +728,11 @@ public abstract class BasicSupport {
                                                 if (block && currentLine.length() > 0) {
                                                         currentLine.append("\n");
                                                 }
+
+                                               if (!element.absUrl("href").trim().isEmpty()) {
+                                                       footnotes.add(element.absUrl("href"));
+                                                       currentLine.append("[" + footnotes.size() + "]");
+                                               }
                                         } else if (node instanceof TextNode) {
                                                 TextNode textNode = (TextNode) node;
                                                 String line = StringUtil.normaliseWhitespace(textNode
@@ -342,10 +765,65 @@ public abstract class BasicSupport {
                         }
                 }
  
+               // Fix spaces and nbsp, remove multiple following blank lines
+               List<String> linesCopy = new ArrayList<String>(lines.size());
+               long blanks = 0;
                 for (int i = 0; i < lines.size(); i++) {
-                       lines.set(i, lines.get(i).replace("  ", " ").trim());
+                       String line = lines.get(i).replace(" ", " ") // nbsp -> space
+                                       .replace("  ", " ").trim();
+                       if (line.isEmpty()) {
+                               blanks++;
+                       } else {
+                               blanks = 0;
+                       }
+
+                       if (blanks < 2) {
+                               linesCopy.add(line);
+                       }
+               }
+
+               // Footnotes insertion
+               if (footnotes.size() > 0) {
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       linesCopy.add("");
+                       for (int i = 0; i < footnotes.size(); i++) {
+                               linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+                       }
                 }
  
-               return lines;
+               return linesCopy;
+       }
+
+       /**
+        * Reformat the date if possible.
+        * 
+        * @param date
+        *            the input date
+        * 
+        * @return the reformatted date, or the same value if it was not parsable
+        */
+       static private String date(String date) {
+               SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
+
+               // First attempt: the value is a number of seconds since the epoch
+               long seconds;
+               try {
+                       seconds = Long.parseLong(date.trim());
+               } catch (Exception e) {
+                       // Not a number (or NULL): try the textual format instead
+                       seconds = 0;
+               }
+
+               if (seconds <= 0) {
+                       // Second attempt: a date and time with a zone offset, as in
+                       // "yyyy-MM-dd'T'HH:mm:ssXXX"
+                       try {
+                               SimpleDateFormat in = new SimpleDateFormat(
+                                               "yyyy-MM-dd'T'HH:mm:ssXXX");
+                               return out.format(in.parse(date.trim()));
+                       } catch (Exception e) {
+                               // Not parsable: give back the input as it was
+                               return date;
+                       }
+               }
+
+               // Date expects milliseconds, the input is in seconds
+               return out.format(new Date(1000 * seconds));
         }
  }