From d28c4aac3f42d9de93e3969e86a7c84e2d2e963a Mon Sep 17 00:00:00 2001
From: Niki Roo
Date: Mon, 19 Mar 2018 17:13:55 +0100
Subject: [PATCH] Add new supported site: The Register

---
 .../nikiroo/gofetch/support/BasicSupport.java |   5 +-
 .../nikiroo/gofetch/support/TheRegister.java  | 243 ++++++++++++++++++
 2 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 src/be/nikiroo/gofetch/support/TheRegister.java

diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java
index 615c72d..b7eaca3 100644
--- a/src/be/nikiroo/gofetch/support/BasicSupport.java
+++ b/src/be/nikiroo/gofetch/support/BasicSupport.java
@@ -19,7 +19,7 @@ public abstract class BasicSupport {
 	protected static Downloader downloader = new Downloader("gofetcher");
 
 	public enum Type {
-		SLASHDOT, PIPEDOT, LWN, LEMONDE,
+		SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER,
 	}
 
 	/**
@@ -178,6 +178,9 @@ public abstract class BasicSupport {
 		case LEMONDE:
 			support = new LeMonde();
 			break;
+		case REGISTER:
+			support = new TheRegister();
+			break;
 		}
 
 		if (support != null) {
diff --git a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java
new file mode 100644
index 0000000..35c619c
--- /dev/null
+++ b/src/be/nikiroo/gofetch/support/TheRegister.java
@@ -0,0 +1,243 @@
+package be.nikiroo.gofetch.support;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import be.nikiroo.gofetch.data.Comment;
+import be.nikiroo.gofetch.data.Story;
+import be.nikiroo.utils.StringUtils;
+
+/**
+ * Support for The Register: lists the stories linked from the front page and
+ * fetches their text and forum comments.
+ */
+public class TheRegister extends BasicSupport {
+	@Override
+	public String getDescription() {
+		return "The Register: Biting the hand that feeds IT";
+	}
+
+	/**
+	 * List the stories linked from the front page.
+	 * <p>
+	 * Only the links that carry a time stamp are kept (the other ones are
+	 * duplicates).
+	 *
+	 * @return the stories found on the page
+	 *
+	 * @throws IOException
+	 *             in case of I/O error while downloading the front page
+	 */
+	@Override
+	public List<Story> list() throws IOException {
+		List<Story> list = new ArrayList<Story>();
+
+		URL url = new URL("https://www.theregister.co.uk/");
+		Document doc = loadPage(url);
+		Elements articles = doc.getElementsByClass("story_link");
+		for (Element article : articles) {
+			Element dateElement = article.getElementsByClass("time_stamp")
+					.first();
+			if (dateElement == null) {
+				// Some articles are doubled,
+				// but the second copy without the time info
+				continue;
+			}
+
+			String id = "";
+			String date = "";
+			String epochS = dateElement.attr("data-epoch");
+			if (epochS != null && !epochS.isEmpty()) {
+				id = epochS;
+				date = date(epochS);
+			}
+
+			if (id.isEmpty()) {
+				// fallback
+				id = article.attr("href").replace("/", "_");
+			}
+
+			String title = "";
+			Element titleElement = article.getElementsByTag("h4").first();
+			if (titleElement != null) {
+				title = StringUtils.unhtml(titleElement.text()).trim();
+			}
+
+			// The previous sibling of the link is used as a topic prefix
+			Element topicElement = article.previousElementSibling();
+			if (topicElement != null) {
+				title = "[" + topicElement.text().trim() + "] " + title;
+			}
+
+			String details = "(" + date + ") ";
+			Element detailsElement = article.getElementsByClass("standfirst")
+					.first();
+			if (detailsElement != null) {
+				details += StringUtils.unhtml(detailsElement.text()).trim();
+			}
+
+			String intUrl = article.absUrl("href");
+			String extUrl = ""; // nope
+			String body = "";
+
+			list.add(new Story(getType(), id, title, details, intUrl, extUrl,
+					body));
+		}
+
+		return list;
+	}
+
+	/**
+	 * Fetch the text and the comments of the given story and store them in it.
+	 *
+	 * @param story
+	 *            the story to complete (its internal URL is downloaded)
+	 *
+	 * @throws IOException
+	 *             in case of I/O error while downloading the article or its
+	 *             forum page
+	 */
+	@Override
+	public void fetch(Story story) throws IOException {
+		URL url = new URL(story.getUrlInternal());
+
+		String fullContent = story.getContent();
+		Document doc = loadPage(url);
+		Element article = doc.getElementById("body");
+		if (article != null) {
+			StringBuilder builder = new StringBuilder();
+			if (fullContent != null) {
+				builder.append(fullContent);
+			}
+
+			for (String line : toLines(article, new BasicElementProcessor() {
+				// TODO: ignore headlines/pub
+			})) {
+				builder.append(line).append("\n");
+			}
+
+			// Content is too tight with a single break per line:
+			fullContent = builder.toString().replace("\n", "\n\n") //
+					.replace("\n\n\n\n", "\n\n") //
+					.replace("\n\n\n\n", "\n\n") //
+					.trim();
+		}
+
+		// The comments are on a forum page built from the article path
+		URL commentsUrl = new URL("https://forums.theregister.co.uk/forum/1"
+				+ url.getPath());
+		Document forum = loadPage(commentsUrl);
+
+		List<Comment> comments = new ArrayList<Comment>();
+		Element posts = forum.getElementById("forum_posts");
+		if (posts != null) {
+			for (Element post : posts.getElementsByClass("post")) {
+				String id = "";
+				String author = "";
+				String title = "";
+				String date = "";
+				List<String> content = new ArrayList<String>();
+
+				Element idE = post.getElementsByTag("a").first();
+				if (idE != null) {
+					id = idE.attr("id");
+					Element dateE = idE.getElementsByTag("span").first();
+					if (dateE != null) {
+						date = date(dateE.attr("data-epoch"));
+					}
+				}
+
+				Element authorE = post.getElementsByClass("author").first();
+				if (authorE != null) {
+					author = StringUtils.unhtml(authorE.text()).trim();
+				}
+
+				Element titleE = post.getElementsByTag("h4").first();
+				if (titleE != null) {
+					title = StringUtils.unhtml(titleE.text()).trim();
+				}
+
+				Element contentE = post.getElementsByClass("body").first();
+				if (contentE != null) {
+					for (String line : toLines(contentE,
+							new BasicElementProcessor() {
+								@Override
+								public boolean ignoreNode(Node node) {
+									// The h4 is already read as the title
+									return node instanceof Element
+											&& "h4".equals(((Element) node)
+													.tagName());
+								}
+							})) {
+						content.add(line);
+					}
+				}
+
+				comments.add(new Comment(id, author, title, date, content));
+			}
+		}
+
+		story.setFullContent(fullContent);
+		story.setComments(comments);
+	}
+
+	/**
+	 * Download and parse the page at the given URL, closing the stream once
+	 * the page is parsed (or if parsing fails).
+	 *
+	 * @param url
+	 *            the page to download, also used as base URI of the document
+	 *
+	 * @return the parsed document
+	 *
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	private static Document loadPage(URL url) throws IOException {
+		InputStream in = downloader.open(url);
+		try {
+			return DataUtil.load(in, "UTF-8", url.toString());
+		} finally {
+			in.close();
+		}
+	}
+
+	/**
+	 * Return the display date for an epoch given in seconds.
+	 *
+	 * @param epochString
+	 *            the epoch in seconds as text (can be NULL or malformed)
+	 *
+	 * @return the date formatted as "dd MMM yyyy", or "" if the input is not
+	 *         a strictly positive number
+	 */
+	private static String date(String epochString) {
+		long epoch;
+		try {
+			epoch = Long.parseLong(epochString);
+		} catch (NumberFormatException e) {
+			// Also covers NULL: no usable time stamp
+			return "";
+		}
+
+		if (epoch <= 0) {
+			return "";
+		}
+
+		// "yyyy" is the calendar year; "YYYY" is the week year, which is
+		// wrong for some days around New Year
+		return new SimpleDateFormat("dd MMM yyyy").format(new Date(
+				1000 * epoch));
+	}
+}
-- 
2.27.0