X-Git-Url: https://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FTheRegister.java;h=1195d3d5f9edf7200aef9318b2517af79b892b67;hb=3e62b034c1981ae6329f06b3f8c0ee25c3683789;hp=7fb152400f11a641193e47ddd78d2287baef81fb;hpb=a81f396bc4bf0f70e4b5f654045f533941d86dc9;p=gofetch.git

diff --git a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java
index 7fb1524..1195d3d 100644
--- a/src/be/nikiroo/gofetch/support/TheRegister.java
+++ b/src/be/nikiroo/gofetch/support/TheRegister.java
@@ -3,18 +3,20 @@ package be.nikiroo.gofetch.support;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.AbstractMap;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 
 import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
 
 import be.nikiroo.gofetch.data.Comment;
 import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
 
 /**
  * Support <a
@@ -23,195 +25,240 @@ import be.nikiroo.utils.StringUtils;
  * @author niki
  */
 public class TheRegister extends BasicSupport {
+	private Map<String, String> commentReplies = new HashMap<String, String>();
+
 	@Override
 	public String getDescription() {
 		return "The Register: Biting the hand that feeds IT";
 	}
 
 	@Override
-	public List<Story> list() throws IOException {
-		List<Story> list = new ArrayList<Story>();
+	public void fetch(Story story) throws IOException {
+		super.fetch(story);
 
-		URL url = new URL("https://www.theregister.co.uk/");
-		InputStream in = downloader.open(url);
-		Document doc = DataUtil.load(in, "UTF-8", url.toString());
-		Elements articles = doc.getElementsByClass("story_link");
-		for (Element article : articles) {
-			if (article.getElementsByClass("time_stamp").isEmpty()) {
-				// Some articles are doubled,
-				// but the second copy without the time info
-				continue;
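+		// Nest each reply under its parent comment (reply ID -> parent ID pairs are in commentReplies); comments whose parent cannot be found stay top-level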
+		List<Comment> comments = new ArrayList<Comment>();
+		for (Comment comment : story.getComments()) {
+			if (commentReplies.containsKey(comment.getId())) {
+				String inReplyToId = commentReplies.get(comment.getId());
+				Comment inReplyTo = story.getCommentById(inReplyToId);
+				if (inReplyTo != null) {
+					inReplyTo.add(comment);
+				} else {
+					comments.add(comment);
+				}
+			} else {
+				comments.add(comment);
 			}
+		}
+		story.setComments(comments);
+	}
 
-			String id = "";
-			String intUrl = article.absUrl("href");
-			String extUrl = ""; // nope
-			String title = "";
-			String date = "";
-			String details = "";
-			String body = "";
-			String categ = "";
-			String author = ""; // nope
-
-			Element categElement = article.previousElementSibling();
-			if (categElement != null) {
-				categ = categElement.text().trim();
-			}
+	@Override
+	protected List<Entry<URL, String>> getUrls() throws IOException {
+		List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+		urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+				"https://www.theregister.co.uk/"), ""));
+		return urls;
+	}
 
-			Element titleElement = article.getElementsByTag("h4").first();
-			if (titleElement != null) {
-				title = StringUtils.unhtml(titleElement.text()).trim();
-			}
+	@Override
+	protected List<Element> getArticles(Document doc) {
+		return doc.getElementsByClass("story_link");
+	}
 
-			Element dateElement = article.getElementsByClass("time_stamp")
-					.first();
-			if (dateElement != null) {
-				String epochS = dateElement.attr("data-epoch");
-				if (epochS != null && !epochS.isEmpty()) {
-					id = epochS;
-					date = date(epochS);
-				}
-			}
+	@Override
+	protected String getArticleId(Document doc, Element article) {
+		return "";
+	}
 
-			if (id.isEmpty()) {
-				// fallback
-				id = article.attr("href").replace("/", "_");
-			}
+	@Override
+	protected String getArticleTitle(Document doc, Element article) {
+		Element titleElement = article.getElementsByTag("h4").first();
+		if (titleElement != null) {
+			return titleElement.text();
+		}
 
-			Element detailsElement = article.getElementsByClass("standfirst")
-					.first();
-			details = "(" + date + ") ";
-			if (detailsElement != null) {
-				details += StringUtils.unhtml(detailsElement.text()).trim();
-			}
+		return "";
+	}
+
+	@Override
+	protected String getArticleAuthor(Document doc, Element article) {
+		return "";
+	}
 
-			// We have some "details" but no content, so we switch them:
-			body = details;
-			details = "";
-			list.add(new Story(getType(), id, title, author, date, categ,
-					details, intUrl, extUrl, body));
+	@Override
+	protected String getArticleDate(Document doc, Element article) {
+		Element dateElement = article.getElementsByClass("time_stamp").first();
+		if (dateElement != null) {
+			return dateElement.attr("data-epoch");
 		}
 
-		return list;
+		return "";
 	}
 
 	@Override
-	public void fetch(Story story) throws IOException {
-		String fullContent = story.getContent();
-		List<Comment> comments = new ArrayList<Comment>();
-		story.setComments(comments);
+	protected String getArticleCategory(Document doc, Element article,
+			String currentCategory) {
+		Element categElement = article.previousElementSibling();
+		if (categElement != null) {
+			return categElement.text();
+		}
+
+		return "";
+	}
+
+	@Override
+	protected String getArticleDetails(Document doc, Element article) {
+		// We have some "details" but no content, so we switch them: the details are left empty here
+		return "";
+	}
+
+	@Override
+	protected String getArticleIntUrl(Document doc, Element article) {
+		return article.absUrl("href");
+	}
+
+	@Override
+	protected String getArticleExtUrl(Document doc, Element article) {
+		return "";
+	}
+
+	@Override
+	protected String getArticleContent(Document doc, Element article) {
+		// We have some "details" but no content, so we switch them: the "standfirst" text is returned as the content
+		Element detailsElement = article.getElementsByClass("standfirst")
+				.first();
+		if (detailsElement != null) {
+			return detailsElement.text();
+		}
+
+		return "";
+	}
+
+	@Override
+	protected Element getFullArticle(Document doc) {
+		return doc.getElementById("body");
+	}
 
-		URL url = new URL(story.getUrlInternal());
-		InputStream in = downloader.open(url);
+	@Override
+	protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+		List<Element> commentElements = new ArrayList<Element>();
+
+		// Build the forum (comments) URL from the article path, then download and parse it
 		try {
-			Document doc = DataUtil.load(in, "UTF-8", url.toString());
-			Element article = doc.getElementById("body");
-			if (article != null) {
-				for (String line : toLines(article,
-						new BasicElementProcessor() {
-							// TODO: ignore headlines/pub
-						})) {
-					fullContent += line + "\n";
+			URL url = new URL("https://forums.theregister.co.uk/forum/1"
+					+ intUrl.getPath());
+			InputStream in = downloader.open(url);
+			try {
+				doc = DataUtil.load(in, "UTF-8", url.toString());
+				Element posts = doc.getElementById("forum_posts");
+				if (posts != null) {
+					for (Element post : posts.getElementsByClass("post")) {
+						commentElements.add(post);
+						Element inReplyTo = post.getElementsByClass(
+								"in-reply-to").first();
+						if (inReplyTo != null) {
+							String parentId = inReplyTo.absUrl("href");
+							if (parentId != null && parentId.contains("/")) {
+								int i = parentId.lastIndexOf('/');
+								parentId = parentId.substring(i + 1);
+
+								commentReplies
+										.put(getCommentId(post), parentId);
+							}
+						}
+					}
 				}
+			} finally {
+				in.close();
+			}
+		} catch (IOException e) {
+		}
+
+		return commentElements;
+	}
+
+	@Override
+	protected ElementProcessor getElementProcessorFullArticle() {
+		return new BasicElementProcessor();
+	}
+
+	@Override
+	protected List<Element> getCommentCommentPosts(Document doc,
+			Element container) {
+		return null;
+	}
 
-				// Content is too tight with a single break per line:
-				fullContent = fullContent.replace("\n", "\n\n") //
-						.replace("\n\n\n\n", "\n\n") //
-						.replace("\n\n\n\n", "\n\n") //
-						.trim();
+	@Override
+	protected String getCommentId(Element post) {
+		Element idE = post.getElementsByTag("a").first();
+		if (idE != null) {
+			String id = idE.attr("id");
+			if (id.startsWith("c_")) {
+				id = id.substring(2);
 			}
 
-			story.setFullContent(fullContent);
-
-			// Get comments URL then parse it
-			in.close();
-			in = null;
-			in = downloader
-					.open(new URL("https://forums.theregister.co.uk/forum/1"
-							+ url.getPath()));
-			doc = DataUtil.load(in, "UTF-8", url.toString());
-			Element posts = doc.getElementById("forum_posts");
-			if (posts != null) {
-				for (Element post : posts.getElementsByClass("post")) {
-					String id = "";
-					String author = "";
-					String title = "";
-					String date = "";
-					List<String> content = new ArrayList<String>();
-
-					Element idE = post.getElementsByTag("a").first();
-					if (idE != null) {
-						id = idE.attr("id");
-						if (id.startsWith("c_")) {
-							id = id.substring(2);
-						}
+			return id;
+		}
 
-						Element dateE = idE.getElementsByTag("span").first();
-						if (dateE != null) {
-							date = date(dateE.attr("data-epoch"));
-						}
-					}
+		return "";
+	}
 
-					Element authorE = post.getElementsByClass("author").first();
-					if (authorE != null) {
-						author = StringUtils.unhtml(authorE.text()).trim();
-					}
+	@Override
+	protected String getCommentAuthor(Element post) {
+		Element author = post.getElementsByClass("author").first();
+		if (author != null) {
+			return author.text();
+		}
 
-					Element titleE = post.getElementsByTag("h4").first();
-					if (titleE != null) {
-						title = StringUtils.unhtml(titleE.text()).trim();
-					}
+		return "";
+	}
 
-					Element contentE = post.getElementsByClass("body").first();
-					if (contentE != null) {
-						for (String line : toLines(contentE,
-								new BasicElementProcessor() {
-									@Override
-									public boolean ignoreNode(Node node) {
-										// TODO: ignore headlines/pub
-
-										// Remove the comment title (which has
-										// already been processed earlier)
-										if (node instanceof Element) {
-											Element el = (Element) node;
-											if ("h4".equals(el.tagName())) {
-												return true;
-											}
-										}
-
-										return false;
-									}
-								})) {
-							content.add(line);
-						}
-					}
+	@Override
+	protected String getCommentTitle(Element post) {
+		Element title = post.getElementsByTag("h4").first();
+		if (title != null) {
+			return title.text();
+		}
 
-					Comment comment = new Comment(id, author, title, date,
-							content);
-					Comment parent = null;
-
-					Element inReplyTo = post.getElementsByClass("in-reply-to")
-							.first();
-					if (inReplyTo != null) {
-						String parentId = inReplyTo.absUrl("href");
-						if (parentId != null && parentId.contains("/")) {
-							int i = parentId.lastIndexOf('/');
-							parentId = parentId.substring(i + 1);
-							parent = story.getCommentById(parentId);
-						}
-					}
+		return "";
+	}
+
+	@Override
+	protected String getCommentDate(Element post) {
+		Element id = post.getElementsByTag("a").first();
+		if (id != null) {
+			Element date = id.getElementsByTag("span").first();
+			if (date != null) {
+				return date.attr("data-epoch");
+			}
+		}
+
+		return "";
+	}
+
+	@Override
+	protected Element getCommentContentElement(Element post) {
+		return post.getElementsByClass("body").first();
+	}
 
-					if (parent == null) {
-						comments.add(comment);
-					} else {
-						parent.add(comment);
+	@Override
+	protected ElementProcessor getElementProcessorComment() {
+		return new BasicElementProcessor() {
+			@Override
+			public boolean ignoreNode(Node node) {
+				// Remove the comment title (which has
+				// already been processed earlier)
+				if (node instanceof Element) {
+					Element el = (Element) node;
+					if ("h4".equals(el.tagName())) {
+						return true;
 					}
 				}
+
+				return false;
 			}
-		} finally {
-			if (in != null) {
-				in.close();
-			}
-		}
+		};
 	}
 }