src/be/nikiroo/gofetch/support/TheRegister.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8
   9 import org.jsoup.helper.DataUtil;
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.select.Elements;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17 import be.nikiroo.utils.StringUtils;
  18
  19 /**
  20  * Support <a
  21  * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
  22  *
  23  * @author niki
  24  */
  25 public class TheRegister extends BasicSupport {
  26         @Override
  27         public String getDescription() {
  28                 return "The Register: Biting the hand that feeds IT";
  29         }
  30
  31         @Override
  32         public List<Story> list() throws IOException {
  33                 List<Story> list = new ArrayList<Story>();
  34
  35                 URL url = new URL("https://www.theregister.co.uk/");
  36                 InputStream in = downloader.open(url);
  37                 Document doc = DataUtil.load(in, "UTF-8", url.toString());
  38                 Elements articles = doc.getElementsByClass("story_link");
  39                 for (Element article : articles) {
  40                         if (article.getElementsByClass("time_stamp").isEmpty()) {
  41                                 // Some articles are doubled,
  42                                 // but the second copy without the time info
  43                                 continue;
  44                         }
  45
  46                         String id = "";
  47                         String intUrl = article.absUrl("href");
  48                         String extUrl = ""; // nope
  49                         String title = "";
  50                         String date = "";
  51                         String details = "";
  52                         String body = "";
  53                         String categ = "";
  54                         String author = ""; // nope
  55
  56                         Element categElement = article.previousElementSibling();
  57                         if (categElement != null) {
  58                                 categ = categElement.text().trim();
  59                         }
  60
  61                         Element titleElement = article.getElementsByTag("h4").first();
  62                         if (titleElement != null) {
  63                                 title = StringUtils.unhtml(titleElement.text()).trim();
  64                         }
  65
  66                         Element dateElement = article.getElementsByClass("time_stamp")
  67                                         .first();
  68                         if (dateElement != null) {
  69                                 String epochS = dateElement.attr("data-epoch");
  70                                 if (epochS != null && !epochS.isEmpty()) {
  71                                         id = epochS;
  72                                         date = date(epochS);
  73                                 }
  74                         }
  75
  76                         if (id.isEmpty()) {
  77                                 // fallback
  78                                 id = article.attr("href").replace("/", "_");
  79                         }
  80
  81                         Element detailsElement = article.getElementsByClass("standfirst")
  82                                         .first();
  83                         details = "(" + date + ") ";
  84                         if (detailsElement != null) {
  85                                 details += StringUtils.unhtml(detailsElement.text()).trim();
  86                         }
  87
  88                         list.add(new Story(getType(), id, title, author, date, categ,
  89                                         details, intUrl, extUrl, body));
  90                 }
  91
  92                 return list;
  93         }
  94
  95         @Override
  96         public void fetch(Story story) throws IOException {
  97                 String fullContent = story.getContent();
  98                 List<Comment> comments = new ArrayList<Comment>();
  99                 story.setComments(comments);
 100
 101                 URL url = new URL(story.getUrlInternal());
 102                 InputStream in = downloader.open(url);
 103                 try {
 104                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 105                         Element article = doc.getElementById("body");
 106                         if (article != null) {
 107                                 for (String line : toLines(article,
 108                                                 new BasicElementProcessor() {
 109                                                         // TODO: ignore headlines/pub
 110                                                 })) {
 111                                         fullContent += line + "\n";
 112                                 }
 113
 114                                 // Content is too tight with a single break per line:
 115                                 fullContent = fullContent.replace("\n", "\n\n") //
 116                                                 .replace("\n\n\n\n", "\n\n") //
 117                                                 .replace("\n\n\n\n", "\n\n") //
 118                                                 .trim();
 119                         }
 120
 121                         story.setFullContent(fullContent);
 122
 123                         // Get comments URL then parse it
 124                         in.close();
 125                         in = null;
 126                         in = downloader
 127                                         .open(new URL("https://forums.theregister.co.uk/forum/1"
 128                                                         + url.getPath()));
 129                         doc = DataUtil.load(in, "UTF-8", url.toString());
 130                         Element posts = doc.getElementById("forum_posts");
 131                         if (posts != null) {
 132                                 for (Element post : posts.getElementsByClass("post")) {
 133                                         String id = "";
 134                                         String author = "";
 135                                         String title = "";
 136                                         String date = "";
 137                                         List<String> content = new ArrayList<String>();
 138
 139                                         Element idE = post.getElementsByTag("a").first();
 140                                         if (idE != null) {
 141                                                 id = idE.attr("id");
 142                                                 if (id.startsWith("c_")) {
 143                                                         id = id.substring(2);
 144                                                 }
 145
 146                                                 Element dateE = idE.getElementsByTag("span").first();
 147                                                 if (dateE != null) {
 148                                                         date = date(dateE.attr("data-epoch"));
 149                                                 }
 150                                         }
 151
 152                                         Element authorE = post.getElementsByClass("author").first();
 153                                         if (authorE != null) {
 154                                                 author = StringUtils.unhtml(authorE.text()).trim();
 155                                         }
 156
 157                                         Element titleE = post.getElementsByTag("h4").first();
 158                                         if (titleE != null) {
 159                                                 title = StringUtils.unhtml(titleE.text()).trim();
 160                                         }
 161
 162                                         Element contentE = post.getElementsByClass("body").first();
 163                                         if (contentE != null) {
 164                                                 for (String line : toLines(contentE,
 165                                                                 new BasicElementProcessor() {
 166                                                                         @Override
 167                                                                         public boolean ignoreNode(Node node) {
 168                                                                                 // TODO: ignore headlines/pub
 169
 170                                                                                 // Remove the comment title (which has
 171                                                                                 // already been processed earlier)
 172                                                                                 if (node instanceof Element) {
 173                                                                                         Element el = (Element) node;
 174                                                                                         if ("h4".equals(el.tagName())) {
 175                                                                                                 return true;
 176                                                                                         }
 177                                                                                 }
 178
 179                                                                                 return false;
 180                                                                         }
 181                                                                 })) {
 182                                                         content.add(line);
 183                                                 }
 184                                         }
 185
 186                                         Comment comment = new Comment(id, author, title, date,
 187                                                         content);
 188                                         Comment parent = null;
 189
 190                                         Element inReplyTo = post.getElementsByClass("in-reply-to")
 191                                                         .first();
 192                                         if (inReplyTo != null) {
 193                                                 String parentId = inReplyTo.absUrl("href");
 194                                                 if (parentId != null && parentId.contains("/")) {
 195                                                         int i = parentId.lastIndexOf('/');
 196                                                         parentId = parentId.substring(i + 1);
 197                                                         parent = story.getCommentById(parentId);
 198                                                 }
 199                                         }
 200
 201                                         if (parent == null) {
 202                                                 comments.add(comment);
 203                                         } else {
 204                                                 parent.add(comment);
 205                                         }
 206                                 }
 207                         }
 208                 } finally {
 209                         if (in != null) {
 210                                 in.close();
 211                         }
 212                 }
 213         }
 214 }