X-Git-Url: https://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FTheRegister.java;h=1195d3d5f9edf7200aef9318b2517af79b892b67;hb=3e62b034c1981ae6329f06b3f8c0ee25c3683789;hp=7fb152400f11a641193e47ddd78d2287baef81fb;hpb=a81f396bc4bf0f70e4b5f654045f533941d86dc9;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java index 7fb1524..1195d3d 100644 --- a/src/be/nikiroo/gofetch/support/TheRegister.java +++ b/src/be/nikiroo/gofetch/support/TheRegister.java @@ -3,18 +3,20 @@ package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; /** * Support commentReplies = new HashMap(); + @Override public String getDescription() { return "The Register: Biting the hand that feeds IT"; } @Override - public List list() throws IOException { - List list = new ArrayList(); + public void fetch(Story story) throws IOException { + super.fetch(story); - URL url = new URL("https://www.theregister.co.uk/"); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("story_link"); - for (Element article : articles) { - if (article.getElementsByClass("time_stamp").isEmpty()) { - // Some articles are doubled, - // but the second copy without the time info - continue; + // Update comment replies + List comments = new ArrayList(); + for (Comment comment : story.getComments()) { + if (commentReplies.containsKey(comment.getId())) { + String inReplyToId = commentReplies.get(comment.getId()); + Comment inReplyTo = story.getCommentById(inReplyToId); + if (inReplyTo != null) { + inReplyTo.add(comment); + } else { + comments.add(comment); + } + } else { + comments.add(comment); } + } + story.setComments(comments); + } - String id = ""; - String intUrl = article.absUrl("href"); - String extUrl = ""; // nope - String title = ""; - String date = ""; - String details = ""; - String body = ""; - String categ = ""; - String author = ""; // nope - - Element categElement = article.previousElementSibling(); - if (categElement != null) { - categ = categElement.text().trim(); - } + @Override + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://www.theregister.co.uk/"), "")); + return urls; + } - Element titleElement = article.getElementsByTag("h4").first(); - if (titleElement != null) { - title = StringUtils.unhtml(titleElement.text()).trim(); - } + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("story_link"); + } - Element dateElement = article.getElementsByClass("time_stamp") - .first(); - if (dateElement != null) { - String epochS = dateElement.attr("data-epoch"); - if (epochS != null && !epochS.isEmpty()) { - id = epochS; - date = date(epochS); - } - } + @Override + protected String getArticleId(Document doc, Element article) { + return ""; + } - if (id.isEmpty()) { - // fallback - id = article.attr("href").replace("/", "_"); - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h4").first(); + if (titleElement != null) { + return titleElement.text(); + } - Element detailsElement = article.getElementsByClass("standfirst") - .first(); - details = "(" + date + ") "; - if (detailsElement != null) { - details += StringUtils.unhtml(detailsElement.text()).trim(); - } + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + return ""; + } - // We have some "details" but no content, so we switch them: - body = details; - details = ""; - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); + @Override + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article.getElementsByClass("time_stamp").first(); + if (dateElement != null) { + return dateElement.attr("data-epoch"); } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); - List comments = new ArrayList(); - story.setComments(comments); + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + Element categElement = article.previousElementSibling(); + if (categElement != null) { + return categElement.text(); + } + + return ""; + } + + @Override + protected String getArticleDetails(Document doc, Element article) { + // We have some "details" but no content, so we switch them: + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + return article.absUrl("href"); + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + // We have some "details" but no content, so we switch them: + Element detailsElement = article.getElementsByClass("standfirst") + .first(); + if (detailsElement != null) { + return detailsElement.text(); + } + + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementById("body"); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + List commentElements = new ArrayList(); + + // Get comments URL then parse it try { - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementById("body"); - if (article != null) { - for (String line : toLines(article, - new BasicElementProcessor() { - // TODO: ignore headlines/pub - })) { - fullContent += line + "\n"; + URL url = new URL("https://forums.theregister.co.uk/forum/1" + + intUrl.getPath()); + InputStream in = downloader.open(url); + try { + doc = DataUtil.load(in, "UTF-8", url.toString()); + Element posts = doc.getElementById("forum_posts"); + if (posts != null) { + for (Element post : posts.getElementsByClass("post")) { + commentElements.add(post); + Element inReplyTo = post.getElementsByClass( + "in-reply-to").first(); + if (inReplyTo != null) { + String parentId = inReplyTo.absUrl("href"); + if (parentId != null && parentId.contains("/")) { + int i = parentId.lastIndexOf('/'); + parentId = parentId.substring(i + 1); + + commentReplies + .put(getCommentId(post), parentId); + } + } + } } + } finally { + in.close(); + } + } catch (IOException e) { + } + + return commentElements; + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor(); + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return null; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); + @Override + protected String getCommentId(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + String id = idE.attr("id"); + if (id.startsWith("c_")) { + id = id.substring(2); } - story.setFullContent(fullContent); - - // Get comments URL then parse it - in.close(); - in = null; - in = downloader - .open(new URL("https://forums.theregister.co.uk/forum/1" - + url.getPath())); - doc = DataUtil.load(in, "UTF-8", url.toString()); - Element posts = doc.getElementById("forum_posts"); - if (posts != null) { - for (Element post : posts.getElementsByClass("post")) { - String id = ""; - String author = ""; - String title = ""; - String date = ""; - List content = new ArrayList(); - - Element idE = post.getElementsByTag("a").first(); - if (idE != null) { - id = idE.attr("id"); - if (id.startsWith("c_")) { - id = id.substring(2); - } + return id; + } - Element dateE = idE.getElementsByTag("span").first(); - if (dateE != null) { - date = date(dateE.attr("data-epoch")); - } - } + return ""; + } - Element authorE = post.getElementsByClass("author").first(); - if (authorE != null) { - author = StringUtils.unhtml(authorE.text()).trim(); - } + @Override + protected String getCommentAuthor(Element post) { + Element author = post.getElementsByClass("author").first(); + if (author != null) { + return author.text(); + } - Element titleE = post.getElementsByTag("h4").first(); - if (titleE != null) { - title = StringUtils.unhtml(titleE.text()).trim(); - } + return ""; + } - Element contentE = post.getElementsByClass("body").first(); - if (contentE != null) { - for (String line : toLines(contentE, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - // TODO: ignore headlines/pub - - // Remove the comment title (which has - // already been processed earlier) - if (node instanceof Element) { - Element el = (Element) node; - if ("h4".equals(el.tagName())) { - return true; - } - } - - return false; - } - })) { - content.add(line); - } - } + @Override + protected String getCommentTitle(Element post) { + Element title = post.getElementsByTag("h4").first(); + if (title != null) { + return title.text(); + } - Comment comment = new Comment(id, author, title, date, - content); - Comment parent = null; - - Element inReplyTo = post.getElementsByClass("in-reply-to") - .first(); - if (inReplyTo != null) { - String parentId = inReplyTo.absUrl("href"); - if (parentId != null && parentId.contains("/")) { - int i = parentId.lastIndexOf('/'); - parentId = parentId.substring(i + 1); - parent = story.getCommentById(parentId); - } - } + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element id = post.getElementsByTag("a").first(); + if (id != null) { + Element date = id.getElementsByTag("span").first(); + if (date != null) { + return date.attr("data-epoch"); + } + } + + return ""; + } + + @Override + protected Element getCommentContentElement(Element post) { + return post.getElementsByClass("body").first(); + } - if (parent == null) { - comments.add(comment); - } else { - parent.add(comment); + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + // Remove the comment title (which has + // already been processed earlier) + if (node instanceof Element) { + Element el = (Element) node; + if ("h4".equals(el.tagName())) { + return true; } } + + return false; } - } finally { - if (in != null) { - in.close(); - } - } + }; } }