Bug fixes + rework of BasicSupport
[gofetch.git] / src / be / nikiroo / gofetch / support / TheRegister.java
index 7fb152400f11a641193e47ddd78d2287baef81fb..1195d3d5f9edf7200aef9318b2517af79b892b67 100644 (file)
@@ -3,18 +3,20 @@ package be.nikiroo.gofetch.support;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 
 import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
 
 /**
  * Support <a
@@ -23,195 +25,240 @@ import be.nikiroo.utils.StringUtils;
  * @author niki
  */
 public class TheRegister extends BasicSupport {
+       /**
+        * Link between a comment ID (the key) and the ID of the comment it
+        * replies to (the value); filled while the forum posts are scanned and
+        * read back when the comments of a story are re-threaded.
+        */
+       private final Map<String, String> commentReplies = new HashMap<String, String>();
+
        @Override
        public String getDescription() {
                return "The Register: Biting the hand that feeds IT";
        }
 
        @Override
-       public List<Story> list() throws IOException {
-               List<Story> list = new ArrayList<Story>();
+       public void fetch(Story story) throws IOException {
+               // The reply links are presumably collected in commentReplies by
+               // getFullArticleCommentPosts() during the parent fetch (to be
+               // confirmed against BasicSupport); they are only useful for this
+               // story, so the map is always emptied afterwards instead of growing
+               // with every story fetched by this instance.
+               try {
+                       super.fetch(story);
 
-               URL url = new URL("https://www.theregister.co.uk/");
-               InputStream in = downloader.open(url);
-               Document doc = DataUtil.load(in, "UTF-8", url.toString());
-               Elements articles = doc.getElementsByClass("story_link");
-               for (Element article : articles) {
-                       if (article.getElementsByClass("time_stamp").isEmpty()) {
-                               // Some articles are doubled,
-                               // but the second copy without the time info
-                               continue;
+                       // Update comment replies: a reply is moved under its parent,
+                       // any other comment (no parent, or parent not found) stays
+                       // at the top level
+                       List<Comment> comments = new ArrayList<Comment>();
+                       for (Comment comment : story.getComments()) {
+                               Comment inReplyTo = null;
+                               String inReplyToId = commentReplies.get(comment.getId());
+                               if (inReplyToId != null) {
+                                       inReplyTo = story.getCommentById(inReplyToId);
+                               }
+
+                               if (inReplyTo != null) {
+                                       inReplyTo.add(comment);
+                               } else {
+                                       comments.add(comment);
+                               }
                        }
+                       story.setComments(comments);
+               } finally {
+                       commentReplies.clear();
+               }
+       }
 
-                       String id = "";
-                       String intUrl = article.absUrl("href");
-                       String extUrl = ""; // nope
-                       String title = "";
-                       String date = "";
-                       String details = "";
-                       String body = "";
-                       String categ = "";
-                       String author = ""; // nope
-
-                       Element categElement = article.previousElementSibling();
-                       if (categElement != null) {
-                               categ = categElement.text().trim();
-                       }
+       /**
+        * List the pages to scan for stories: only the front page of The
+        * Register, paired with an empty {@link String}.
+        *
+        * @return a list with a single entry
+        *
+        * @throws IOException
+        *             if the front page URL is not valid
+        */
+       @Override
+       protected List<Entry<URL, String>> getUrls() throws IOException {
+               URL frontPage = new URL("https://www.theregister.co.uk/");
+               Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+                               frontPage, "");
+
+               List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+               urls.add(entry);
+               return urls;
+       }
 
-                       Element titleElement = article.getElementsByTag("h4").first();
-                       if (titleElement != null) {
-                               title = StringUtils.unhtml(titleElement.text()).trim();
-                       }
+       /**
+        * Select the article elements of a listing page: every element that
+        * carries the <tt>story_link</tt> class.
+        *
+        * @param doc
+        *            the listing page
+        *
+        * @return the matching elements
+        */
+       @Override
+       protected List<Element> getArticles(Document doc) {
+               List<Element> storyLinks = doc.getElementsByClass("story_link");
+               return storyLinks;
+       }
 
-                       Element dateElement = article.getElementsByClass("time_stamp")
-                                       .first();
-                       if (dateElement != null) {
-                               String epochS = dateElement.attr("data-epoch");
-                               if (epochS != null && !epochS.isEmpty()) {
-                                       id = epochS;
-                                       date = date(epochS);
-                               }
-                       }
+       /**
+        * No ID is extracted from the page: this always returns an empty
+        * {@link String}.
+        * <p>
+        * NOTE(review): presumably the caller derives an ID by itself when this
+        * one is empty -- to be confirmed against BasicSupport.
+        */
+       @Override
+       protected String getArticleId(Document doc, Element article) {
+               return "";
+       }
 
-                       if (id.isEmpty()) {
-                               // fallback
-                               id = article.attr("href").replace("/", "_");
-                       }
+       /**
+        * Get the title of an article: the text of its first <tt>h4</tt> tag.
+        *
+        * @param doc
+        *            the listing page (not used here)
+        * @param article
+        *            the article element
+        *
+        * @return the title, or an empty {@link String} if there is no
+        *         <tt>h4</tt> tag
+        */
+       @Override
+       protected String getArticleTitle(Document doc, Element article) {
+               Element heading = article.getElementsByTag("h4").first();
+               if (heading == null) {
+                       return "";
+               }
 
-                       Element detailsElement = article.getElementsByClass("standfirst")
-                                       .first();
-                       details = "(" + date + ") ";
-                       if (detailsElement != null) {
-                               details += StringUtils.unhtml(detailsElement.text()).trim();
-                       }
+               return heading.text();
+       }
+
+       /**
+        * No author is extracted from the listing page: this always returns an
+        * empty {@link String}.
+        */
+       @Override
+       protected String getArticleAuthor(Document doc, Element article) {
+               return "";
+       }
 
-                       // We have some "details" but no content, so we switch them:
-                       body = details;
-                       details = "";
-                       list.add(new Story(getType(), id, title, author, date, categ,
-                                       details, intUrl, extUrl, body));
+       /**
+        * Get the date of an article: the raw <tt>data-epoch</tt> attribute of its
+        * first <tt>time_stamp</tt> element.
+        *
+        * @param doc
+        *            the listing page (not used here)
+        * @param article
+        *            the article element
+        *
+        * @return the attribute value, or an empty {@link String} if there is no
+        *         <tt>time_stamp</tt> element
+        */
+       @Override
+       protected String getArticleDate(Document doc, Element article) {
+               Element timeStamp = article.getElementsByClass("time_stamp").first();
+               if (timeStamp == null) {
+                       return "";
                }
 
-               return list;
+               return timeStamp.attr("data-epoch");
        }
 
        @Override
-       public void fetch(Story story) throws IOException {
-               String fullContent = story.getContent();
-               List<Comment> comments = new ArrayList<Comment>();
-               story.setComments(comments);
+       protected String getArticleCategory(Document doc, Element article,
+                       String currentCategory) {
+               // The category is the text of the element right before the article
+               // element (empty if there is none); currentCategory is not used
+               Element previous = article.previousElementSibling();
+               return previous == null ? "" : previous.text();
+       }
+
+       /**
+        * No details for this source: this always returns an empty
+        * {@link String}, the <tt>standfirst</tt> text being returned by
+        * {@link #getArticleContent(Document, Element)} instead.
+        */
+       @Override
+       protected String getArticleDetails(Document doc, Element article) {
+               // We have some "details" but no content, so we switch them:
+               return "";
+       }
+
+       /**
+        * Get the URL of the article on The Register: the absolute form of the
+        * <tt>href</tt> attribute of the article element.
+        */
+       @Override
+       protected String getArticleIntUrl(Document doc, Element article) {
+               String link = article.absUrl("href");
+               return link;
+       }
+
+       /**
+        * No external URL is extracted: this always returns an empty
+        * {@link String}.
+        */
+       @Override
+       protected String getArticleExtUrl(Document doc, Element article) {
+               return "";
+       }
+
+       /**
+        * Get the short content of an article: the text of its first
+        * <tt>standfirst</tt> element.
+        *
+        * @param doc
+        *            the listing page (not used here)
+        * @param article
+        *            the article element
+        *
+        * @return the text, or an empty {@link String} if there is no
+        *         <tt>standfirst</tt> element
+        */
+       @Override
+       protected String getArticleContent(Document doc, Element article) {
+               // The listing only offers "details" and no content, so the details
+               // are used as the content
+               Element standfirst = article.getElementsByClass("standfirst").first();
+               return standfirst == null ? "" : standfirst.text();
+       }
+
+       /**
+        * Locate the full article in its page: the element whose ID is
+        * <tt>body</tt>.
+        *
+        * @param doc
+        *            the article page
+        *
+        * @return the element, or NULL if the page has no such element
+        */
+       @Override
+       protected Element getFullArticle(Document doc) {
+               Element body = doc.getElementById("body");
+               return body;
+       }
 
-               URL url = new URL(story.getUrlInternal());
-               InputStream in = downloader.open(url);
+       /**
+        * Download the forum page of an article and return its posts.
+        * <p>
+        * The forum URL is built from the path of the article URL. For each post
+        * that has an <tt>in-reply-to</tt> link, the ID of the post is associated
+        * with the ID of its parent (the last path segment of the link) in
+        * <tt>commentReplies</tt>.
+        * <p>
+        * This is a best-effort operation: on I/O error, the problem is reported
+        * on stderr and the posts collected so far (possibly none) are returned.
+        *
+        * @param doc
+        *            the article page (not used here)
+        * @param intUrl
+        *            the URL of the article
+        *
+        * @return the post elements, never NULL
+        */
+       @Override
+       protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+               List<Element> commentElements = new ArrayList<Element>();
+
+               // Get comments URL then parse it
                try {
-                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element article = doc.getElementById("body");
-                       if (article != null) {
-                               for (String line : toLines(article,
-                                               new BasicElementProcessor() {
-                                                       // TODO: ignore headlines/pub
-                                               })) {
-                                       fullContent += line + "\n";
+                       URL url = new URL("https://forums.theregister.co.uk/forum/1"
+                                       + intUrl.getPath());
+                       InputStream in = downloader.open(url);
+                       try {
+                               // Use a dedicated variable: the article page passed as
+                               // parameter is left untouched
+                               Document forum = DataUtil.load(in, "UTF-8", url.toString());
+                               Element posts = forum.getElementById("forum_posts");
+                               if (posts != null) {
+                                       for (Element post : posts.getElementsByClass("post")) {
+                                               commentElements.add(post);
+                                               Element inReplyTo = post.getElementsByClass(
+                                                               "in-reply-to").first();
+                                               if (inReplyTo != null) {
+                                                       String parentId = inReplyTo.absUrl("href");
+                                                       if (parentId != null && parentId.contains("/")) {
+                                                               // The parent ID is the last segment of
+                                                               // the link
+                                                               int i = parentId.lastIndexOf('/');
+                                                               parentId = parentId.substring(i + 1);
+
+                                                               commentReplies
+                                                                               .put(getCommentId(post), parentId);
+                                                       }
+                                               }
+                                       }
                                }
+                       } finally {
+                               in.close();
+                       }
+               } catch (IOException e) {
+                       // The comments are optional: the article itself is still
+                       // usable without them, so the error is reported but does not
+                       // abort the fetch
+                       System.err.println("Cannot get the comments of " + intUrl + ": "
+                                       + e);
+               }
+
+               return commentElements;
+       }
+
+       /**
+        * The full article is processed with a plain
+        * {@link BasicElementProcessor}, without any customisation.
+        */
+       @Override
+       protected ElementProcessor getElementProcessorFullArticle() {
+               ElementProcessor processor = new BasicElementProcessor();
+               return processor;
+       }
+
+       /**
+        * No nested posts are looked up inside a comment: this always returns
+        * NULL (in this class, replies are linked to their parent in
+        * {@link #fetch(Story)} instead).
+        * <p>
+        * NOTE(review): presumably the caller accepts NULL as "no sub-comments" --
+        * to be confirmed against BasicSupport.
+        */
+       @Override
+       protected List<Element> getCommentCommentPosts(Document doc,
+                       Element container) {
+               return null;
+       }
 
-                               // Content is too tight with a single break per line:
-                               fullContent = fullContent.replace("\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .replace("\n\n\n\n", "\n\n") //
-                                               .trim();
+       /**
+        * Get the ID of a comment: the <tt>id</tt> attribute of the first
+        * <tt>a</tt> tag of the post, without its <tt>c_</tt> prefix if any.
+        *
+        * @param post
+        *            the post element
+        *
+        * @return the ID, or an empty {@link String} if the post has no
+        *         <tt>a</tt> tag
+        */
+       @Override
+       protected String getCommentId(Element post) {
+               Element anchor = post.getElementsByTag("a").first();
+               if (anchor != null) {
+                       String rawId = anchor.attr("id");
+                       if (!rawId.startsWith("c_")) {
+                               return rawId;
                        }
 
-                       story.setFullContent(fullContent);
-
-                       // Get comments URL then parse it
-                       in.close();
-                       in = null;
-                       in = downloader
-                                       .open(new URL("https://forums.theregister.co.uk/forum/1"
-                                                       + url.getPath()));
-                       doc = DataUtil.load(in, "UTF-8", url.toString());
-                       Element posts = doc.getElementById("forum_posts");
-                       if (posts != null) {
-                               for (Element post : posts.getElementsByClass("post")) {
-                                       String id = "";
-                                       String author = "";
-                                       String title = "";
-                                       String date = "";
-                                       List<String> content = new ArrayList<String>();
-
-                                       Element idE = post.getElementsByTag("a").first();
-                                       if (idE != null) {
-                                               id = idE.attr("id");
-                                               if (id.startsWith("c_")) {
-                                                       id = id.substring(2);
-                                               }
+                       return rawId.substring(2);
+               }
 
-                                               Element dateE = idE.getElementsByTag("span").first();
-                                               if (dateE != null) {
-                                                       date = date(dateE.attr("data-epoch"));
-                                               }
-                                       }
+               return "";
+       }
 
-                                       Element authorE = post.getElementsByClass("author").first();
-                                       if (authorE != null) {
-                                               author = StringUtils.unhtml(authorE.text()).trim();
-                                       }
+       /**
+        * Get the author of a comment: the text of the first <tt>author</tt>
+        * element of the post.
+        *
+        * @param post
+        *            the post element
+        *
+        * @return the author, or an empty {@link String} if there is no such
+        *         element
+        */
+       @Override
+       protected String getCommentAuthor(Element post) {
+               Element authorElement = post.getElementsByClass("author").first();
+               if (authorElement == null) {
+                       return "";
+               }
 
-                                       Element titleE = post.getElementsByTag("h4").first();
-                                       if (titleE != null) {
-                                               title = StringUtils.unhtml(titleE.text()).trim();
-                                       }
+               return authorElement.text();
+       }
 
-                                       Element contentE = post.getElementsByClass("body").first();
-                                       if (contentE != null) {
-                                               for (String line : toLines(contentE,
-                                                               new BasicElementProcessor() {
-                                                                       @Override
-                                                                       public boolean ignoreNode(Node node) {
-                                                                               // TODO: ignore headlines/pub
-
-                                                                               // Remove the comment title (which has
-                                                                               // already been processed earlier)
-                                                                               if (node instanceof Element) {
-                                                                                       Element el = (Element) node;
-                                                                                       if ("h4".equals(el.tagName())) {
-                                                                                               return true;
-                                                                                       }
-                                                                               }
-
-                                                                               return false;
-                                                                       }
-                                                               })) {
-                                                       content.add(line);
-                                               }
-                                       }
+       /**
+        * Get the title of a comment: the text of the first <tt>h4</tt> tag of
+        * the post.
+        *
+        * @param post
+        *            the post element
+        *
+        * @return the title, or an empty {@link String} if there is no
+        *         <tt>h4</tt> tag
+        */
+       @Override
+       protected String getCommentTitle(Element post) {
+               Element heading = post.getElementsByTag("h4").first();
+               if (heading == null) {
+                       return "";
+               }
 
-                                       Comment comment = new Comment(id, author, title, date,
-                                                       content);
-                                       Comment parent = null;
-
-                                       Element inReplyTo = post.getElementsByClass("in-reply-to")
-                                                       .first();
-                                       if (inReplyTo != null) {
-                                               String parentId = inReplyTo.absUrl("href");
-                                               if (parentId != null && parentId.contains("/")) {
-                                                       int i = parentId.lastIndexOf('/');
-                                                       parentId = parentId.substring(i + 1);
-                                                       parent = story.getCommentById(parentId);
-                                               }
-                                       }
+               return heading.text();
+       }
+
+       /**
+        * Get the date of a comment: the raw <tt>data-epoch</tt> attribute of the
+        * first <tt>span</tt> found in the first <tt>a</tt> tag of the post.
+        *
+        * @param post
+        *            the post element
+        *
+        * @return the attribute value, or an empty {@link String} if the
+        *         <tt>a</tt> or <tt>span</tt> tag is missing
+        */
+       @Override
+       protected String getCommentDate(Element post) {
+               Element anchor = post.getElementsByTag("a").first();
+               if (anchor == null) {
+                       return "";
+               }
+
+               Element stamp = anchor.getElementsByTag("span").first();
+               if (stamp == null) {
+                       return "";
+               }
+
+               return stamp.attr("data-epoch");
+       }
+
+       /**
+        * Locate the content of a comment: the first <tt>body</tt> element of the
+        * post.
+        *
+        * @param post
+        *            the post element
+        *
+        * @return the element, or NULL if the post has no such element
+        */
+       @Override
+       protected Element getCommentContentElement(Element post) {
+               Element body = post.getElementsByClass("body").first();
+               return body;
+       }
 
-                                       if (parent == null) {
-                                               comments.add(comment);
-                                       } else {
-                                               parent.add(comment);
+       /**
+        * Get the processor used for the content of a comment: a
+        * {@link BasicElementProcessor} that ignores the <tt>h4</tt> tags.
+        *
+        * @return the processor
+        */
+       @Override
+       protected ElementProcessor getElementProcessorComment() {
+               return new BasicElementProcessor() {
+                       @Override
+                       public boolean ignoreNode(Node node) {
+                               // The <h4> tag holds the comment title, which is
+                               // extracted separately: keep it out of the content
+                               if (node instanceof Element) {
+                                       if ("h4".equals(((Element) node).tagName())) {
+                                               return true;
                                        }
                                }
+
+                               return false;
                        }
-               } finally {
-                       if (in != null) {
-                               in.close();
-                       }
-               }
+               };
        }
 }