1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.text
.SimpleDateFormat
;
7 import java
.util
.ArrayList
;
11 import org
.jsoup
.helper
.DataUtil
;
12 import org
.jsoup
.nodes
.Document
;
13 import org
.jsoup
.nodes
.Element
;
14 import org
.jsoup
.nodes
.Node
;
15 import org
.jsoup
.select
.Elements
;
17 import be
.nikiroo
.gofetch
.data
.Comment
;
18 import be
.nikiroo
.gofetch
.data
.Story
;
19 import be
.nikiroo
.utils
.StringUtils
;
21 public class TheRegister
extends BasicSupport
{
// Returns the constant, human-readable description string for this support.
// Takes no parameters and reads no state.
23 public String
getDescription() {
24 return "The Register: Biting the hand that feeds IT";
// Builds the list of stories found on the front page at
// https://www.theregister.co.uk/, one Story per "story_link" element.
// Declared to throw IOException (the download and parse calls below are I/O).
// NOTE(review): this view of the method is incomplete -- several statements
// (local declarations, the ends of some call chains and the end of the Story
// constructor call) are not visible here; the comments below only describe
// what is shown.
28 public List
<Story
> list() throws IOException
{
29 List
<Story
> list
= new ArrayList
<Story
>();
// Download the front page and parse it as UTF-8; the page URL is passed as
// the third argument (presumably the base URI used by absUrl() -- confirm
// against the jsoup DataUtil.load documentation).
31 URL url
= new URL("https://www.theregister.co.uk/");
32 InputStream in
= downloader
.open(url
);
33 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// Every element carrying the "story_link" class is a candidate article.
34 Elements articles
= doc
.getElementsByClass("story_link");
35 for (Element article
: articles
) {
// Candidates without a "time_stamp" child get special treatment (the
// statement inside this branch is not visible here).
36 if (article
.getElementsByClass("time_stamp").isEmpty()) {
37 // Some articles are doubled,
38 // but the second copy lacks the time info
// Internal URL: the element's own "href", resolved to an absolute URL.
// No external URL is provided for this site.
43 String intUrl
= article
.absUrl("href");
44 String extUrl
= ""; // nope
// The element just before the link, when present, supplies a topic that is
// rendered as a "[topic] " prefix.
51 Element topicElement
= article
.previousElementSibling();
52 if (topicElement
!= null) {
53 topic
= "[" + topicElement
.text().trim() + "] ";
// The title comes from the first <h4> inside the link, passed through
// StringUtils.unhtml and trimmed.
55 Element titleElement
= article
.getElementsByTag("h4").first();
56 if (titleElement
!= null) {
57 title
= StringUtils
.unhtml(titleElement
.text()).trim();
// The topic prefix is prepended to the title.
59 title
= topic
+ title
;
// The "time_stamp" element carries the publication time in its "data-epoch"
// attribute (what is done with a non-empty value is not visible here).
61 Element dateElement
= article
.getElementsByClass("time_stamp")
63 if (dateElement
!= null) {
64 String epochS
= dateElement
.attr("data-epoch");
65 if (epochS
!= null && !epochS
.isEmpty()) {
// An id derived from the raw "href", with every "/" replaced by "_" (the
// condition guarding this assignment is not visible here).
73 id
= article
.attr("href").replace("/", "_");
// Details text: "(<date>) " followed by the unhtml'ed, trimmed "standfirst"
// text when that element exists. `date` is assigned in statements that are
// not visible here.
76 Element detailsElement
= article
.getElementsByClass("standfirst")
78 details
= "(" + date
+ ") ";
79 if (detailsElement
!= null) {
80 details
+= StringUtils
.unhtml(detailsElement
.text()).trim();
// Append the assembled story to the result (remaining constructor arguments
// are not visible here).
83 list
.add(new Story(getType(), id
, title
, details
, intUrl
, extUrl
,
// Completes the given story in place: sets its full content (built from the
// article page) and fills its comment list (built from the forum page).
// Declared to throw IOException (the download and parse calls below are I/O).
// NOTE(review): this view of the method is incomplete -- the bodies of the
// anonymous BasicElementProcessor classes, the construction of the comments
// URL, the extraction of the comment id and the handling of replies are not
// fully visible here; the comments below only describe what is shown.
91 public void fetch(Story story
) throws IOException
{
// Start from the content already stored on the story and attach a fresh,
// empty comment list to it right away.
92 String fullContent
= story
.getContent();
93 List
<Comment
> comments
= new ArrayList
<Comment
>();
94 story
.setComments(comments
);
// Download the story's internal URL and parse it as UTF-8.
96 URL url
= new URL(story
.getUrlInternal());
97 InputStream in
= downloader
.open(url
);
99 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// The article text lives in the element with id "body"; each line returned
// by toLines() is appended to the content followed by a single "\n".
100 Element article
= doc
.getElementById("body");
101 if (article
!= null) {
102 for (String line
: toLines(article
,
103 new BasicElementProcessor() {
104 // TODO: ignore headlines/pub
106 fullContent
+= line
+ "\n";
// Every line break is doubled, then runs of four breaks are collapsed back
// to two (that collapse is applied twice).
109 // Content is too tight with a single break per line:
110 fullContent
= fullContent
.replace("\n", "\n\n") //
111 .replace("\n\n\n\n", "\n\n") //
112 .replace("\n\n\n\n", "\n\n") //
116 story
.setFullContent(fullContent
);
// The comments page is opened from a URL starting with
// "https://forums.theregister.co.uk/forum/1" (the rest of the URL is not
// visible here).
118 // Get comments URL then parse it
122 .open(new URL("https://forums.theregister.co.uk/forum/1"
// NOTE(review): `url` appears to still be the article URL at this point, so
// the forum page would be parsed with the article URL as third argument --
// confirm whether the forum URL was intended.
124 doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// Each element with class "post" under the "forum_posts" element is one
// comment. NOTE(review): no null check on `posts` is visible here -- confirm
// one exists in the part that is not shown.
125 Element posts
= doc
.getElementById("forum_posts");
127 for (Element post
: posts
.getElementsByClass("post")) {
132 List
<String
> content
= new ArrayList
<String
>();
// The first <a> of the post is used to identify the comment; a leading "c_"
// is stripped from the id (how `id` is first read is not visible here).
134 Element idE
= post
.getElementsByTag("a").first();
137 if (id
.startsWith("c_")) {
138 id
= id
.substring(2);
// The first <span> inside that link carries the post time in "data-epoch",
// which is converted to a display string by date().
141 Element dateE
= idE
.getElementsByTag("span").first();
143 date
= date(dateE
.attr("data-epoch"));
// Author: text of the first "author" element, unhtml'ed and trimmed.
147 Element authorE
= post
.getElementsByClass("author").first();
148 if (authorE
!= null) {
149 author
= StringUtils
.unhtml(authorE
.text()).trim();
// Title: text of the first <h4> of the post, unhtml'ed and trimmed.
152 Element titleE
= post
.getElementsByTag("h4").first();
153 if (titleE
!= null) {
154 title
= StringUtils
.unhtml(titleE
.text()).trim();
// Content: the lines of the first "body" element, produced by toLines()
// with a processor whose ignoreNode() tests for <h4> elements.
157 Element contentE
= post
.getElementsByClass("body").first();
158 if (contentE
!= null) {
159 for (String line
: toLines(contentE
,
160 new BasicElementProcessor() {
162 public boolean ignoreNode(Node node
) {
163 // TODO: ignore headlines/pub
165 // Remove the comment title (which has
166 // already been processed earlier)
167 if (node
instanceof Element
) {
168 Element el
= (Element
) node
;
169 if ("h4".equals(el
.tagName())) {
// Build the comment (remaining constructor arguments are not visible here).
181 Comment comment
= new Comment(id
, author
, title
, date
,
// Look up the parent comment: the "in-reply-to" link's absolute "href" is
// cut down to the part after its last "/" and used as the parent id for
// story.getCommentById().
183 Comment parent
= null;
185 Element inReplyTo
= post
.getElementsByClass("in-reply-to")
187 if (inReplyTo
!= null) {
188 String parentId
= inReplyTo
.absUrl("href");
189 if (parentId
!= null && parentId
.contains("/")) {
190 int i
= parentId
.lastIndexOf('/');
191 parentId
= parentId
.substring(i
+ 1);
192 parent
= story
.getCommentById(parentId
);
// Comments without a resolved parent go to the top-level list (the branch
// for comments that do have a parent is not visible here).
196 if (parent
== null) {
197 comments
.add(comment
);
210 // Return display date from epoch String, or "" if error
// epochString is parsed with Long.parseLong; any exception raised while
// parsing is caught (the handling inside the catch block is not visible
// here).
// NOTE(review): this view of the method is incomplete -- the declaration of
// `epoch`, the catch body and the end of the return statement are not shown.
211 private static String
date(String epochString
) {
214 epoch
= Long
.parseLong(epochString
);
215 } catch (Exception e
) {
// NOTE(review): in SimpleDateFormat patterns, 'Y' is the week year and 'y'
// the calendar year, so "YYYY" can print the wrong year for dates close to
// New Year; "dd MMM yyyy" is probably what was intended -- confirm and fix.
// The pattern is formatted with the default locale and time zone (no
// explicit Locale/TimeZone is passed on this line).
220 return new SimpleDateFormat("dd MMM YYYY").format(new Date(