Add new supported site: The Register
authorNiki Roo <niki@nikiroo.be>
Mon, 19 Mar 2018 16:13:55 +0000 (17:13 +0100)
committerNiki Roo <niki@nikiroo.be>
Mon, 19 Mar 2018 16:13:55 +0000 (17:13 +0100)
src/be/nikiroo/gofetch/support/BasicSupport.java
src/be/nikiroo/gofetch/support/TheRegister.java [new file with mode: 0644]

index 615c72d6b0cbd81c4f59d8472878ea07ef4b730a..b7eaca3cfe8e6917e3507212cfbb24d29657be5b 100644 (file)
@@ -19,7 +19,7 @@ public abstract class BasicSupport {
        protected static Downloader downloader = new Downloader("gofetcher");
 
        public enum Type {
-               SLASHDOT, PIPEDOT, LWN, LEMONDE,
+               SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER, 
        }
 
        /**
@@ -178,6 +178,9 @@ public abstract class BasicSupport {
                        case LEMONDE:
                                support = new LeMonde();
                                break;
+                       case REGISTER:
+                               support = new TheRegister();
+                               break;
                        }
 
                        if (support != null) {
diff --git a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java
new file mode 100644 (file)
index 0000000..35c619c
--- /dev/null
@@ -0,0 +1,200 @@
+package be.nikiroo.gofetch.support;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import be.nikiroo.gofetch.data.Comment;
+import be.nikiroo.gofetch.data.Story;
+import be.nikiroo.utils.StringUtils;
+
+public class TheRegister extends BasicSupport {
+       @Override
+       public String getDescription() {
+               return "The Register: Biting the hand that feeds IT";
+       }
+
+       @Override
+       public List<Story> list() throws IOException {
+               List<Story> list = new ArrayList<Story>();
+
+               URL url = new URL("https://www.theregister.co.uk/");
+               InputStream in = downloader.open(url);
+               Document doc = DataUtil.load(in, "UTF-8", url.toString());
+               Elements articles = doc.getElementsByClass("story_link");
+               for (Element article : articles) {
+                       if (article.getElementsByClass("time_stamp").isEmpty()) {
+                               // Some articles are doubled,
+                               // but the second copy without the time info
+                               continue;
+                       }
+
+                       String id = "";
+                       String intUrl = article.absUrl("href");
+                       String extUrl = ""; // nope
+                       String title = "";
+                       String date = "";
+                       String details = "";
+                       String body = "";
+
+                       String topic = "";
+                       Element topicElement = article.previousElementSibling();
+                       if (topicElement != null) {
+                               topic = "[" + topicElement.text().trim() + "] ";
+                       }
+                       Element titleElement = article.getElementsByTag("h4").first();
+                       if (titleElement != null) {
+                               title = StringUtils.unhtml(titleElement.text()).trim();
+                       }
+                       title = topic + title;
+
+                       Element dateElement = article.getElementsByClass("time_stamp")
+                                       .first();
+                       if (dateElement != null) {
+                               String epochS = dateElement.attr("data-epoch");
+                               if (epochS != null && !epochS.isEmpty()) {
+                                       id = epochS;
+                                       date = date(epochS);
+                               }
+                       }
+
+                       if (id.isEmpty()) {
+                               // fallback
+                               id = article.attr("href").replace("/", "_");
+                       }
+
+                       Element detailsElement = article.getElementsByClass("standfirst")
+                                       .first();
+                       details = "(" + date + ") ";
+                       if (detailsElement != null) {
+                               details += StringUtils.unhtml(detailsElement.text()).trim();
+                       }
+
+                       list.add(new Story(getType(), id, title, details, intUrl, extUrl,
+                                       body));
+               }
+
+               return list;
+       }
+
+       @Override
+       public void fetch(Story story) throws IOException {
+               String fullContent = story.getContent();
+               List<Comment> comments = new ArrayList<Comment>();
+
+               URL url = new URL(story.getUrlInternal());
+               InputStream in = downloader.open(url);
+               try {
+                       Document doc = DataUtil.load(in, "UTF-8", url.toString());
+                       Element article = doc.getElementById("body");
+                       if (article != null) {
+                               for (String line : toLines(article,
+                                               new BasicElementProcessor() {
+                                                       // TODO: ignore headlines/pub
+                                               })) {
+                                       fullContent += line + "\n";
+                               }
+
+                               // Content is too tight with a single break per line:
+                               fullContent = fullContent.replace("\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .replace("\n\n\n\n", "\n\n") //
+                                               .trim();
+                       }
+
+                       // Get comments URL then parse it
+                       in.close();
+                       in = null;
+                       in = downloader
+                                       .open(new URL("https://forums.theregister.co.uk/forum/1"
+                                                       + url.getPath()));
+                       doc = DataUtil.load(in, "UTF-8", url.toString());
+                       Element posts = doc.getElementById("forum_posts");
+                       if (posts != null) {
+                               for (Element post : posts.getElementsByClass("post")) {
+                                       String id = "";
+                                       String author = "";
+                                       String title = "";
+                                       String date = "";
+                                       List<String> content = new ArrayList<String>();
+
+                                       Element idE = post.getElementsByTag("a").first();
+                                       if (idE != null) {
+                                               id = idE.attr("id");
+                                               Element dateE = idE.getElementsByTag("span").first();
+                                               if (dateE != null) {
+                                                       date = date(dateE.attr("data-epoch"));
+                                               }
+                                       }
+
+                                       Element authorE = post.getElementsByClass("author").first();
+                                       if (authorE != null) {
+                                               author = StringUtils.unhtml(authorE.text()).trim();
+                                       }
+
+                                       Element titleE = post.getElementsByTag("h4").first();
+                                       if (titleE != null) {
+                                               title = StringUtils.unhtml(titleE.text()).trim();
+                                       }
+
+                                       Element contentE = post.getElementsByClass("body").first();
+                                       if (contentE != null) {
+                                               for (String line : toLines(contentE,
+                                                               new BasicElementProcessor() {
+                                                                       @Override
+                                                                       public boolean ignoreNode(Node node) {
+                                                                               // TODO: ignore headlines/pub
+                                                                               if (node instanceof Element) {
+                                                                                       Element el = (Element)node;
+                                                                                       if ("h4".equals(el.tagName())) {
+                                                                                               return true;
+                                                                                       }
+                                                                               }
+                                                                               
+                                                                               return false;
+                                                                       }
+                                                               })) {
+                                                       content.add(line);
+                                               }
+                                       }
+
+                                       comments.add(new Comment(id, author, title, date, content));
+                               }
+                       }
+
+                       story.setFullContent(fullContent);
+                       story.setComments(comments);
+               } finally {
+                       if (in != null) {
+                               in.close();
+                       }
+               }
+       }
+
+       // Return display date from epoch String, or "" if error
+       private static String date(String epochString) {
+               long epoch = 0;
+               try {
+                       epoch = Long.parseLong(epochString);
+               } catch (Exception e) {
+                       epoch = 0;
+               }
+
+               if (epoch > 0) {
+                       return new SimpleDateFormat("dd MMM YYYY").format(new Date(
+                                       1000 * epoch));
+               }
+
+               return "";
+       }
+}