import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
/**
* Support <a href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
* @author niki
*/
public class TheRegister extends BasicSupport {
+ private Map<String, String> commentReplies = new HashMap<String, String>();
+
/**
 * Returns the fixed, human-readable description string of this support.
 */
@Override
public String getDescription() {
return "The Register: Biting the hand that feeds IT";
}
+ /**
+  * Fetches the story through the parent implementation, then rebuilds its
+  * comment list so that every reply recorded in {@code commentReplies} is
+  * attached to its parent comment instead of staying at the top level.
+  * <p>
+  * The reply map is emptied once the story has been processed (even on
+  * failure) so entries from one story cannot leak into the next one.
+  *
+  * @param story
+  *            the story to complete
+  *
+  * @throws IOException
+  *             as declared by the parent implementation
+  */
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
+ public void fetch(Story story) throws IOException {
+     try {
+         super.fetch(story);
- URL url = new URL("https://www.theregister.co.uk/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("story_link");
- for (Element article : articles) {
- if (article.getElementsByClass("time_stamp").isEmpty()) {
- // Some articles are doubled,
- // but the second copy without the time info
- continue;
+
+         // Update comment replies
+         List<Comment> topLevel = new ArrayList<Comment>();
+         for (Comment comment : story.getComments()) {
+             // Only non-null parent IDs are ever stored, so a single lookup
+             // is enough to know whether this comment is a reply
+             String parentId = commentReplies.get(comment.getId());
+             Comment parent = null;
+             if (parentId != null) {
+                 parent = story.getCommentById(parentId);
}
+
+             if (parent != null) {
+                 parent.add(comment);
+             } else {
+                 // Not a reply, or its parent could not be found
+                 topLevel.add(comment);
+             }
+         }
+         story.setComments(topLevel);
+     } finally {
+         // The map only describes the story that was just processed
+         commentReplies.clear();
+     }
+ }
- String id = "";
- String intUrl = article.absUrl("href");
- String extUrl = ""; // nope
- String title = "";
- String date = "";
- String details = "";
- String body = "";
- String categ = "";
- String author = ""; // nope
-
- Element categElement = article.previousElementSibling();
- if (categElement != null) {
- categ = categElement.text().trim();
- }
+ /**
+  * Lists the pages to scan for stories: a single entry made of the site
+  * front page and an empty string.
+  * <p>
+  * NOTE(review): the meaning of the entry value is defined by the parent
+  * class and is not visible here.
+  *
+  * @return a list containing exactly one entry
+  *
+  * @throws IOException
+  *             declared by the signature; the only throwing call visible
+  *             here is the {@link URL} constructor
+  */
+ @Override
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+     URL frontPage = new URL("https://www.theregister.co.uk/");
+     Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+             frontPage, "");
+
+     List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+     urls.add(entry);
+     return urls;
+ }
- Element titleElement = article.getElementsByTag("h4").first();
- if (titleElement != null) {
- title = StringUtils.unhtml(titleElement.text()).trim();
- }
+ /**
+  * Returns the elements of class {@code story_link} found in the given
+  * document; each of them is treated as one article.
+  *
+  * @param doc
+  *            the document to look into
+  *
+  * @return the matching elements
+  */
+ @Override
+ protected List<Element> getArticles(Document doc) {
+     List<Element> storyLinks = doc.getElementsByClass("story_link");
+     return storyLinks;
+ }
- Element dateElement = article.getElementsByClass("time_stamp")
- .first();
- if (dateElement != null) {
- String epochS = dateElement.attr("data-epoch");
- if (epochS != null && !epochS.isEmpty()) {
- id = epochS;
- date = date(epochS);
- }
- }
+ /**
+  * Returns an empty string: no ID is extracted from the article element.
+  * <p>
+  * NOTE(review): presumably the parent class copes with an empty ID --
+  * confirm against BasicSupport.
+  */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+     final String noId = "";
+     return noId;
+ }
- if (id.isEmpty()) {
- // fallback
- id = article.attr("href").replace("/", "_");
- }
+ /**
+  * Returns the text of the first {@code h4} element found in the article
+  * element, or an empty string when there is none.
+  */
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+     Element heading = article.getElementsByTag("h4").first();
- Element detailsElement = article.getElementsByClass("standfirst")
- .first();
- details = "(" + date + ") ";
- if (detailsElement != null) {
- details += StringUtils.unhtml(detailsElement.text()).trim();
- }
+     return heading == null ? "" : heading.text();
+ }
+
+ /**
+  * Returns an empty string: no author is extracted from the article
+  * element.
+  */
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+     final String noAuthor = "";
+     return noAuthor;
+ }
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
+ /**
+  * Returns the {@code data-epoch} attribute of the first element of class
+  * {@code time_stamp} inside the article element, or an empty string when
+  * no such element exists.
+  */
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+     Element stamp = article.getElementsByClass("time_stamp").first();
+     if (stamp == null) {
+         return "";
+     }
+
- return list;
+     return stamp.attr("data-epoch");
}
+ /**
+  * Returns the text of the element that immediately precedes the article
+  * element among its siblings, or an empty string when there is no such
+  * sibling. The {@code currentCategory} argument is not used.
+  */
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
- List<Comment> comments = new ArrayList<Comment>();
- story.setComments(comments);
+ protected String getArticleCategory(Document doc, Element article,
+         String currentCategory) {
+     Element previous = article.previousElementSibling();
+     return previous != null ? previous.text() : "";
+ }
+
+ /**
+  * Returns an empty string: the short text available for an article is
+  * reported as its content (see {@code getArticleContent}) rather than as
+  * its details.
+  */
+ @Override
+ protected String getArticleDetails(Document doc, Element article) {
+     // We have some "details" but no content, so we switch them
+     final String noDetails = "";
+     return noDetails;
+ }
+
+ /**
+  * Returns the absolute form of the {@code href} attribute of the article
+  * element.
+  */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+     String link = article.absUrl("href");
+     return link;
+ }
+
+ /**
+  * Returns an empty string: no external URL is extracted from the article
+  * element.
+  */
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+     final String noExternalUrl = "";
+     return noExternalUrl;
+ }
+
+ /**
+  * Returns the text (as produced by {@code getArticleText}) of the first
+  * element of class {@code standfirst} inside the article element, or an
+  * empty string when there is none.
+  */
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+     // We have some "details" but no content, so we switch them
+     Element standfirst = article.getElementsByClass("standfirst").first();
+     if (standfirst == null) {
+         return "";
+     }
+
+     return getArticleText(standfirst);
+ }
+
+ /**
+  * Returns the element whose ID is {@code body} in the given document
+  * ({@code null} when the document has no such element).
+  */
+ @Override
+ protected Element getFullArticle(Document doc) {
+     Element body = doc.getElementById("body");
+     return body;
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
+ /**
+  * Downloads the forum page of a story and returns its comment posts.
+  * <p>
+  * The forum URL is built by appending the path of {@code intUrl} to
+  * {@code https://forums.theregister.co.uk/forum/1}. Every element of
+  * class {@code post} found under the element with ID {@code forum_posts}
+  * is returned, and the parent of each reply is recorded in
+  * {@code commentReplies}.
+  * <p>
+  * This is best effort: the signature cannot report an I/O failure, so a
+  * download or parsing error yields the posts collected so far (usually
+  * none) instead of an exception.
+  *
+  * @param doc
+  *            the article document (not used, the comments are read from
+  *            a separate page)
+  * @param intUrl
+  *            the internal URL of the story
+  *
+  * @return the comment post elements, never {@code null}
+  */
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+     List<Element> commentElements = new ArrayList<Element>();
+     if (intUrl == null) {
+         return commentElements;
+     }
+
+     // Get comments URL then parse it
try {
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementById("body");
- if (article != null) {
- for (String line : toLines(article,
- new BasicElementProcessor() {
- // TODO: ignore headlines/pub
- })) {
- fullContent += line + "\n";
+         URL url = new URL("https://forums.theregister.co.uk/forum/1"
+                 + intUrl.getPath());
+         InputStream in = open(url);
+         if (in != null) {
+             try {
+                 // Parse into a local instead of overwriting the parameter
+                 Document forum = DataUtil.load(in, "UTF-8", url.toString());
+                 Element posts = forum.getElementById("forum_posts");
+                 if (posts != null) {
+                     for (Element post : posts.getElementsByClass("post")) {
+                         commentElements.add(post);
+                         rememberParent(post);
+                     }
+                 }
+             } finally {
+                 in.close();
+             }
}
+     } catch (IOException ignored) {
+         // Deliberately ignored: comments are optional and this method has
+         // no way to report the failure to its caller
+     }
+
+     return commentElements;
+ }
+
+ /**
+  * Records, for a post carrying an {@code in-reply-to} link, the ID of the
+  * comment it answers (the part of the link after its last slash).
+  *
+  * @param post
+  *            the comment post element
+  */
+ private void rememberParent(Element post) {
+     Element inReplyTo = post.getElementsByClass("in-reply-to").first();
+     if (inReplyTo == null) {
+         return;
+     }
+
+     String parentLink = inReplyTo.absUrl("href");
+     if (parentLink == null || !parentLink.contains("/")) {
+         return;
+     }
+
+     String commentId = getCommentId(post);
+     if (commentId.isEmpty()) {
+         // An empty key would be shared by every post without an ID
+         return;
+     }
+
+     String parentId = parentLink.substring(parentLink.lastIndexOf('/') + 1);
+     commentReplies.put(commentId, parentId);
+ }
+
+ /**
+  * Returns a new, uncustomised {@code BasicElementProcessor} for the full
+  * article.
+  */
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+     ElementProcessor processor = new BasicElementProcessor();
+     return processor;
+ }
+
+ /**
+  * Always returns {@code null}: no nested posts are extracted from a
+  * comment container.
+  * <p>
+  * NOTE(review): presumably replies are linked through
+  * {@code commentReplies} instead -- confirm how the parent class treats
+  * a {@code null} result.
+  */
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+         Element container) {
+     final List<Element> noNestedPosts = null;
+     return noNestedPosts;
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
+ /**
+  * Returns the {@code id} attribute of the first {@code a} element of the
+  * post, without its {@code c_} prefix when it has one, or an empty string
+  * when the post contains no {@code a} element.
+  */
+ @Override
+ protected String getCommentId(Element post) {
+     Element anchor = post.getElementsByTag("a").first();
+     if (anchor == null) {
+         return "";
}
- story.setFullContent(fullContent);
-
- // Get comments URL then parse it
- in.close();
- in = null;
- in = downloader
- .open(new URL("https://forums.theregister.co.uk/forum/1"
- + url.getPath()));
- doc = DataUtil.load(in, "UTF-8", url.toString());
- Element posts = doc.getElementById("forum_posts");
- if (posts != null) {
- for (Element post : posts.getElementsByClass("post")) {
- String id = "";
- String author = "";
- String title = "";
- String date = "";
- List<String> content = new ArrayList<String>();
-
- Element idE = post.getElementsByTag("a").first();
- if (idE != null) {
- id = idE.attr("id");
- if (id.startsWith("c_")) {
- id = id.substring(2);
- }
+
+     String rawId = anchor.attr("id");
+     return rawId.startsWith("c_") ? rawId.substring(2) : rawId;
- Element dateE = idE.getElementsByTag("span").first();
- if (dateE != null) {
- date = date(dateE.attr("data-epoch"));
- }
- }
+ }
- Element authorE = post.getElementsByClass("author").first();
- if (authorE != null) {
- author = StringUtils.unhtml(authorE.text()).trim();
- }
+ /**
+  * Returns the text of the first element of class {@code author} inside
+  * the post, or an empty string when there is none.
+  */
+ @Override
+ protected String getCommentAuthor(Element post) {
+     Element byline = post.getElementsByClass("author").first();
- Element titleE = post.getElementsByTag("h4").first();
- if (titleE != null) {
- title = StringUtils.unhtml(titleE.text()).trim();
- }
+     return byline == null ? "" : byline.text();
+ }
- Element contentE = post.getElementsByClass("body").first();
- if (contentE != null) {
- for (String line : toLines(contentE,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- // TODO: ignore headlines/pub
-
- // Remove the comment title (which has
- // already been processed earlier)
- if (node instanceof Element) {
- Element el = (Element) node;
- if ("h4".equals(el.tagName())) {
- return true;
- }
- }
-
- return false;
- }
- })) {
- content.add(line);
- }
- }
+ /**
+  * Returns the text of the first {@code h4} element inside the post, or an
+  * empty string when there is none.
+  */
+ @Override
+ protected String getCommentTitle(Element post) {
+     Element heading = post.getElementsByTag("h4").first();
- Comment comment = new Comment(id, author, title, date,
- content);
- Comment parent = null;
-
- Element inReplyTo = post.getElementsByClass("in-reply-to")
- .first();
- if (inReplyTo != null) {
- String parentId = inReplyTo.absUrl("href");
- if (parentId != null && parentId.contains("/")) {
- int i = parentId.lastIndexOf('/');
- parentId = parentId.substring(i + 1);
- parent = story.getCommentById(parentId);
- }
- }
+     return heading == null ? "" : heading.text();
+ }
+
+ /**
+  * Returns the {@code data-epoch} attribute of the first {@code span}
+  * found in the first {@code a} element of the post, or an empty string
+  * when either element is missing.
+  */
+ @Override
+ protected String getCommentDate(Element post) {
+     Element anchor = post.getElementsByTag("a").first();
+     Element stamp = (anchor == null) ? null
+             : anchor.getElementsByTag("span").first();
+
+     return (stamp == null) ? "" : stamp.attr("data-epoch");
+ }
+
+ /**
+  * Returns the first element of class {@code body} inside the post
+  * ({@code null} when there is none).
+  */
+ @Override
+ protected Element getCommentContentElement(Element post) {
+     Element body = post.getElementsByClass("body").first();
+     return body;
+ }
- if (parent == null) {
- comments.add(comment);
- } else {
- parent.add(comment);
+ /**
+  * Returns a {@code BasicElementProcessor} whose {@code ignoreNode}
+  * reports {@code true} for {@code h4} elements and {@code false} for
+  * every other node.
+  */
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+     ElementProcessor skipTitles = new BasicElementProcessor() {
+         @Override
+         public boolean ignoreNode(Node node) {
+             // Remove the comment title (which has
+             // already been processed earlier)
+             if (!(node instanceof Element)) {
+                 return false;
}
+
+             String tag = ((Element) node).tagName();
+             if ("h4".equals(tag)) {
+                 return true;
}
+
+             return false;
}
- } finally {
- if (in != null) {
- in.close();
- }
- }
+     };
+
+     return skipTitles;
}
}