src/be/nikiroo/gofetch/support/TheRegister.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.AbstractMap;
   7 import java.util.ArrayList;
   8 import java.util.HashMap;
   9 import java.util.List;
  10 import java.util.Map;
  11 import java.util.Map.Entry;
  12
  13 import org.jsoup.helper.DataUtil;
  14 import org.jsoup.nodes.Document;
  15 import org.jsoup.nodes.Element;
  16 import org.jsoup.nodes.Node;
  17
  18 import be.nikiroo.gofetch.data.Comment;
  19 import be.nikiroo.gofetch.data.Story;
  20
  21 /**
  22  * Support <a
  23  * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
  24  *
  25  * @author niki
  26  */
  27 public class TheRegister extends BasicSupport {
  28         private Map<String, String> commentReplies = new HashMap<String, String>();
  29
  30         @Override
  31         public String getDescription() {
  32                 return "The Register: Biting the hand that feeds IT";
  33         }
  34
  35         @Override
  36         public void fetch(Story story) throws IOException {
  37                 super.fetch(story);
  38
  39                 // Update comment replies
  40                 List<Comment> comments = new ArrayList<Comment>();
  41                 for (Comment comment : story.getComments()) {
  42                         if (commentReplies.containsKey(comment.getId())) {
  43                                 String inReplyToId = commentReplies.get(comment.getId());
  44                                 Comment inReplyTo = story.getCommentById(inReplyToId);
  45                                 if (inReplyTo != null) {
  46                                         inReplyTo.add(comment);
  47                                 } else {
  48                                         comments.add(comment);
  49                                 }
  50                         } else {
  51                                 comments.add(comment);
  52                         }
  53                 }
  54                 story.setComments(comments);
  55         }
  56
  57         @Override
  58         protected List<Entry<URL, String>> getUrls() throws IOException {
  59                 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
  60                 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
  61                                 "https://www.theregister.co.uk/"), ""));
  62                 return urls;
  63         }
  64
  65         @Override
  66         protected List<Element> getArticles(Document doc) {
  67                 return doc.getElementsByClass("story_link");
  68         }
  69
  70         @Override
  71         protected String getArticleId(Document doc, Element article) {
  72                 return "";
  73         }
  74
  75         @Override
  76         protected String getArticleTitle(Document doc, Element article) {
  77                 Element titleElement = article.getElementsByTag("h4").first();
  78                 if (titleElement != null) {
  79                         return titleElement.text();
  80                 }
  81
  82                 return "";
  83         }
  84
  85         @Override
  86         protected String getArticleAuthor(Document doc, Element article) {
  87                 return "";
  88         }
  89
  90         @Override
  91         protected String getArticleDate(Document doc, Element article) {
  92                 Element dateElement = article.getElementsByClass("time_stamp").first();
  93                 if (dateElement != null) {
  94                         return dateElement.attr("data-epoch");
  95                 }
  96
  97                 return "";
  98         }
  99
 100         @Override
 101         protected String getArticleCategory(Document doc, Element article,
 102                         String currentCategory) {
 103                 Element categElement = article.previousElementSibling();
 104                 if (categElement != null) {
 105                         return categElement.text();
 106                 }
 107
 108                 return "";
 109         }
 110
 111         @Override
 112         protected String getArticleDetails(Document doc, Element article) {
 113                 // We have some "details" but no content, so we switch them:
 114                 return "";
 115         }
 116
 117         @Override
 118         protected String getArticleIntUrl(Document doc, Element article) {
 119                 return article.absUrl("href");
 120         }
 121
 122         @Override
 123         protected String getArticleExtUrl(Document doc, Element article) {
 124                 return "";
 125         }
 126
 127         @Override
 128         protected String getArticleContent(Document doc, Element article) {
 129                 // We have some "details" but no content, so we switch them:
 130                 Element detailsElement = article.getElementsByClass("standfirst")
 131                                 .first();
 132                 if (detailsElement != null) {
 133                         return detailsElement.text();
 134                 }
 135
 136                 return "";
 137         }
 138
 139         @Override
 140         protected Element getFullArticle(Document doc) {
 141                 return doc.getElementById("body");
 142         }
 143
 144         @Override
 145         protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
 146                 List<Element> commentElements = new ArrayList<Element>();
 147
 148                 // Get comments URL then parse it
 149                 try {
 150                         URL url = new URL("https://forums.theregister.co.uk/forum/1"
 151                                         + intUrl.getPath());
 152                         InputStream in = open(url);
 153                         try {
 154                                 doc = DataUtil.load(in, "UTF-8", url.toString());
 155                                 Element posts = doc.getElementById("forum_posts");
 156                                 if (posts != null) {
 157                                         for (Element post : posts.getElementsByClass("post")) {
 158                                                 commentElements.add(post);
 159                                                 Element inReplyTo = post.getElementsByClass(
 160                                                                 "in-reply-to").first();
 161                                                 if (inReplyTo != null) {
 162                                                         String parentId = inReplyTo.absUrl("href");
 163                                                         if (parentId != null && parentId.contains("/")) {
 164                                                                 int i = parentId.lastIndexOf('/');
 165                                                                 parentId = parentId.substring(i + 1);
 166
 167                                                                 commentReplies
 168                                                                                 .put(getCommentId(post), parentId);
 169                                                         }
 170                                                 }
 171                                         }
 172                                 }
 173                         } finally {
 174                                 in.close();
 175                         }
 176                 } catch (IOException e) {
 177                 }
 178
 179                 return commentElements;
 180         }
 181
 182         @Override
 183         protected ElementProcessor getElementProcessorFullArticle() {
 184                 return new BasicElementProcessor();
 185         }
 186
 187         @Override
 188         protected List<Element> getCommentCommentPosts(Document doc,
 189                         Element container) {
 190                 return null;
 191         }
 192
 193         @Override
 194         protected String getCommentId(Element post) {
 195                 Element idE = post.getElementsByTag("a").first();
 196                 if (idE != null) {
 197                         String id = idE.attr("id");
 198                         if (id.startsWith("c_")) {
 199                                 id = id.substring(2);
 200                         }
 201
 202                         return id;
 203                 }
 204
 205                 return "";
 206         }
 207
 208         @Override
 209         protected String getCommentAuthor(Element post) {
 210                 Element author = post.getElementsByClass("author").first();
 211                 if (author != null) {
 212                         return author.text();
 213                 }
 214
 215                 return "";
 216         }
 217
 218         @Override
 219         protected String getCommentTitle(Element post) {
 220                 Element title = post.getElementsByTag("h4").first();
 221                 if (title != null) {
 222                         return title.text();
 223                 }
 224
 225                 return "";
 226         }
 227
 228         @Override
 229         protected String getCommentDate(Element post) {
 230                 Element id = post.getElementsByTag("a").first();
 231                 if (id != null) {
 232                         Element date = id.getElementsByTag("span").first();
 233                         if (date != null) {
 234                                 return date.attr("data-epoch");
 235                         }
 236                 }
 237
 238                 return "";
 239         }
 240
 241         @Override
 242         protected Element getCommentContentElement(Element post) {
 243                 return post.getElementsByClass("body").first();
 244         }
 245
 246         @Override
 247         protected ElementProcessor getElementProcessorComment() {
 248                 return new BasicElementProcessor() {
 249                         @Override
 250                         public boolean ignoreNode(Node node) {
 251                                 // Remove the comment title (which has
 252                                 // already been processed earlier)
 253                                 if (node instanceof Element) {
 254                                         Element el = (Element) node;
 255                                         if ("h4".equals(el.tagName())) {
 256                                                 return true;
 257                                         }
 258                                 }
 259
 260                                 return false;
 261                         }
 262                 };
 263         }
 264 }