src/be/nikiroo/gofetch/support/TheRegister.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8
   9 import org.jsoup.helper.DataUtil;
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.select.Elements;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17 import be.nikiroo.utils.StringUtils;
  18
  19 /**
  20  * Support <a
  21  * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
  22  *
  23  * @author niki
  24  */
  25 public class TheRegister extends BasicSupport {
  26         @Override
  27         public String getDescription() {
  28                 return "The Register: Biting the hand that feeds IT";
  29         }
  30
  31         @Override
  32         public List<Story> list() throws IOException {
  33                 List<Story> list = new ArrayList<Story>();
  34
  35                 URL url = new URL("https://www.theregister.co.uk/");
  36                 InputStream in = downloader.open(url);
  37                 Document doc = DataUtil.load(in, "UTF-8", url.toString());
  38                 Elements articles = doc.getElementsByClass("story_link");
  39                 for (Element article : articles) {
  40                         if (article.getElementsByClass("time_stamp").isEmpty()) {
  41                                 // Some articles are doubled,
  42                                 // but the second copy without the time info
  43                                 continue;
  44                         }
  45
  46                         String id = "";
  47                         String intUrl = article.absUrl("href");
  48                         String extUrl = ""; // nope
  49                         String title = "";
  50                         String date = "";
  51                         String details = "";
  52                         String body = "";
  53                         String categ = "";
  54                         String author = ""; // nope
  55
  56                         Element categElement = article.previousElementSibling();
  57                         if (categElement != null) {
  58                                 categ = categElement.text().trim();
  59                         }
  60
  61                         Element titleElement = article.getElementsByTag("h4").first();
  62                         if (titleElement != null) {
  63                                 title = StringUtils.unhtml(titleElement.text()).trim();
  64                         }
  65
  66                         Element dateElement = article.getElementsByClass("time_stamp")
  67                                         .first();
  68                         if (dateElement != null) {
  69                                 String epochS = dateElement.attr("data-epoch");
  70                                 if (epochS != null && !epochS.isEmpty()) {
  71                                         id = epochS;
  72                                         date = date(epochS);
  73                                 }
  74                         }
  75
  76                         if (id.isEmpty()) {
  77                                 // fallback
  78                                 id = article.attr("href").replace("/", "_");
  79                         }
  80
  81                         Element detailsElement = article.getElementsByClass("standfirst")
  82                                         .first();
  83                         details = "(" + date + ") ";
  84                         if (detailsElement != null) {
  85                                 details += StringUtils.unhtml(detailsElement.text()).trim();
  86                         }
  87
  88                         // We have some "details" but no content, so we switch them:
  89                         body = details;
  90                         details = "";
  91                         list.add(new Story(getType(), id, title, author, date, categ,
  92                                         details, intUrl, extUrl, body));
  93                 }
  94
  95                 return list;
  96         }
  97
  98         @Override
  99         public void fetch(Story story) throws IOException {
 100                 String fullContent = story.getContent();
 101                 List<Comment> comments = new ArrayList<Comment>();
 102                 story.setComments(comments);
 103
 104                 URL url = new URL(story.getUrlInternal());
 105                 InputStream in = downloader.open(url);
 106                 try {
 107                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 108                         Element article = doc.getElementById("body");
 109                         if (article != null) {
 110                                 for (String line : toLines(article,
 111                                                 new BasicElementProcessor() {
 112                                                         // TODO: ignore headlines/pub
 113                                                 })) {
 114                                         fullContent += line + "\n";
 115                                 }
 116
 117                                 // Content is too tight with a single break per line:
 118                                 fullContent = fullContent.replace("\n", "\n\n") //
 119                                                 .replace("\n\n\n\n", "\n\n") //
 120                                                 .replace("\n\n\n\n", "\n\n") //
 121                                                 .trim();
 122                         }
 123
 124                         story.setFullContent(fullContent);
 125
 126                         // Get comments URL then parse it
 127                         in.close();
 128                         in = null;
 129                         in = downloader
 130                                         .open(new URL("https://forums.theregister.co.uk/forum/1"
 131                                                         + url.getPath()));
 132                         doc = DataUtil.load(in, "UTF-8", url.toString());
 133                         Element posts = doc.getElementById("forum_posts");
 134                         if (posts != null) {
 135                                 for (Element post : posts.getElementsByClass("post")) {
 136                                         String id = "";
 137                                         String author = "";
 138                                         String title = "";
 139                                         String date = "";
 140                                         List<String> content = new ArrayList<String>();
 141
 142                                         Element idE = post.getElementsByTag("a").first();
 143                                         if (idE != null) {
 144                                                 id = idE.attr("id");
 145                                                 if (id.startsWith("c_")) {
 146                                                         id = id.substring(2);
 147                                                 }
 148
 149                                                 Element dateE = idE.getElementsByTag("span").first();
 150                                                 if (dateE != null) {
 151                                                         date = date(dateE.attr("data-epoch"));
 152                                                 }
 153                                         }
 154
 155                                         Element authorE = post.getElementsByClass("author").first();
 156                                         if (authorE != null) {
 157                                                 author = StringUtils.unhtml(authorE.text()).trim();
 158                                         }
 159
 160                                         Element titleE = post.getElementsByTag("h4").first();
 161                                         if (titleE != null) {
 162                                                 title = StringUtils.unhtml(titleE.text()).trim();
 163                                         }
 164
 165                                         Element contentE = post.getElementsByClass("body").first();
 166                                         if (contentE != null) {
 167                                                 for (String line : toLines(contentE,
 168                                                                 new BasicElementProcessor() {
 169                                                                         @Override
 170                                                                         public boolean ignoreNode(Node node) {
 171                                                                                 // TODO: ignore headlines/pub
 172
 173                                                                                 // Remove the comment title (which has
 174                                                                                 // already been processed earlier)
 175                                                                                 if (node instanceof Element) {
 176                                                                                         Element el = (Element) node;
 177                                                                                         if ("h4".equals(el.tagName())) {
 178                                                                                                 return true;
 179                                                                                         }
 180                                                                                 }
 181
 182                                                                                 return false;
 183                                                                         }
 184                                                                 })) {
 185                                                         content.add(line);
 186                                                 }
 187                                         }
 188
 189                                         Comment comment = new Comment(id, author, title, date,
 190                                                         content);
 191                                         Comment parent = null;
 192
 193                                         Element inReplyTo = post.getElementsByClass("in-reply-to")
 194                                                         .first();
 195                                         if (inReplyTo != null) {
 196                                                 String parentId = inReplyTo.absUrl("href");
 197                                                 if (parentId != null && parentId.contains("/")) {
 198                                                         int i = parentId.lastIndexOf('/');
 199                                                         parentId = parentId.substring(i + 1);
 200                                                         parent = story.getCommentById(parentId);
 201                                                 }
 202                                         }
 203
 204                                         if (parent == null) {
 205                                                 comments.add(comment);
 206                                         } else {
 207                                                 parent.add(comment);
 208                                         }
 209                                 }
 210                         }
 211                 } finally {
 212                         if (in != null) {
 213                                 in.close();
 214                         }
 215                 }
 216         }
 217 }