1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.util
.ArrayList
;
9 import org
.jsoup
.helper
.DataUtil
;
10 import org
.jsoup
.nodes
.Document
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.select
.Elements
;
15 import be
.nikiroo
.gofetch
.data
.Comment
;
16 import be
.nikiroo
.gofetch
.data
.Story
;
17 import be
.nikiroo
.utils
.StringUtils
;
21 * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
25 public class TheRegister
extends BasicSupport
{
/**
 * Gives the description text of this support.
 *
 * @return the fixed string "The Register: Biting the hand that feeds IT"
 */
27 public String
getDescription() {
28 return "The Register: Biting the hand that feeds IT";
/**
 * Builds a list of {@link Story} entries from the front page of The Register.
 * <p>
 * The page at https://www.theregister.co.uk/ is opened through the
 * downloader, parsed as UTF-8, and every element carrying the "story_link"
 * class is inspected to create one story.
 *
 * @return presumably the list filled in here (the return statement is not
 *         visible in this excerpt -- TODO confirm)
 *
 * @throws IOException
 *             declared by this method; presumably raised when the page cannot
 *             be downloaded or parsed -- TODO confirm
 */
32 public List
<Story
> list() throws IOException
{
33 List
<Story
> list
= new ArrayList
<Story
>();
// Open the front page and parse it as UTF-8; the page URL is passed along
// so that relative links can later be made absolute.
35 URL url
= new URL("https://www.theregister.co.uk/");
36 InputStream in
= downloader
.open(url
);
37 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// Article entries are the elements marked with the "story_link" class.
38 Elements articles
= doc
.getElementsByClass("story_link");
39 for (Element article
: articles
) {
// Detect entries that have no "time_stamp" element inside them.
40 if (article
.getElementsByClass("time_stamp").isEmpty()) {
41 // Some articles are doubled,
42 // but the second copy without the time info
// Internal URL: the "href" attribute of the article, as an absolute URL.
47 String intUrl
= article
.absUrl("href");
48 String extUrl
= ""; // nope
54 String author
= ""; // nope
// Category: text of the element that directly precedes the article.
56 Element categElement
= article
.previousElementSibling();
57 if (categElement
!= null) {
58 categ
= categElement
.text().trim();
// Title: text of the first <h4> found inside the article.
61 Element titleElement
= article
.getElementsByTag("h4").first();
62 if (titleElement
!= null) {
63 title
= StringUtils
.unhtml(titleElement
.text()).trim();
// Date: read from the "data-epoch" attribute of the "time_stamp" element
// (only used when that attribute is present and not empty).
66 Element dateElement
= article
.getElementsByClass("time_stamp")
68 if (dateElement
!= null) {
69 String epochS
= dateElement
.attr("data-epoch");
70 if (epochS
!= null && !epochS
.isEmpty()) {
// Id: the raw "href" value with every "/" replaced by "_".
78 id
= article
.attr("href").replace("/", "_");
// Details: the date between parentheses, followed by the "standfirst"
// text when such an element exists.
81 Element detailsElement
= article
.getElementsByClass("standfirst")
83 details
= "(" + date
+ ") ";
84 if (detailsElement
!= null) {
85 details
+= StringUtils
.unhtml(detailsElement
.text()).trim();
88 // We have some "details" but no content, so we switch them:
// Register the story built from the values gathered above.
91 list
.add(new Story(getType(), id
, title
, author
, date
, categ
,
92 details
, intUrl
, extUrl
, body
));
99 public void fetch(Story story
) throws IOException
{
100 String fullContent
= story
.getContent();
101 List
<Comment
> comments
= new ArrayList
<Comment
>();
102 story
.setComments(comments
);
104 URL url
= new URL(story
.getUrlInternal());
105 InputStream in
= downloader
.open(url
);
107 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
108 Element article
= doc
.getElementById("body");
109 if (article
!= null) {
110 for (String line
: toLines(article
,
111 new BasicElementProcessor() {
112 // TODO: ignore headlines/pub
114 fullContent
+= line
+ "\n";
117 // Content is too tight with a single break per line:
118 fullContent
= fullContent
.replace("\n", "\n\n") //
119 .replace("\n\n\n\n", "\n\n") //
120 .replace("\n\n\n\n", "\n\n") //
124 story
.setFullContent(fullContent
);
126 // Get comments URL then parse it
130 .open(new URL("https://forums.theregister.co.uk/forum/1"
132 doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
133 Element posts
= doc
.getElementById("forum_posts");
135 for (Element post
: posts
.getElementsByClass("post")) {
140 List
<String
> content
= new ArrayList
<String
>();
142 Element idE
= post
.getElementsByTag("a").first();
145 if (id
.startsWith("c_")) {
146 id
= id
.substring(2);
149 Element dateE
= idE
.getElementsByTag("span").first();
151 date
= date(dateE
.attr("data-epoch"));
155 Element authorE
= post
.getElementsByClass("author").first();
156 if (authorE
!= null) {
157 author
= StringUtils
.unhtml(authorE
.text()).trim();
160 Element titleE
= post
.getElementsByTag("h4").first();
161 if (titleE
!= null) {
162 title
= StringUtils
.unhtml(titleE
.text()).trim();
165 Element contentE
= post
.getElementsByClass("body").first();
166 if (contentE
!= null) {
167 for (String line
: toLines(contentE
,
168 new BasicElementProcessor() {
170 public boolean ignoreNode(Node node
) {
171 // TODO: ignore headlines/pub
173 // Remove the comment title (which has
174 // already been processed earlier)
175 if (node
instanceof Element
) {
176 Element el
= (Element
) node
;
177 if ("h4".equals(el
.tagName())) {
189 Comment comment
= new Comment(id
, author
, title
, date
,
191 Comment parent
= null;
193 Element inReplyTo
= post
.getElementsByClass("in-reply-to")
195 if (inReplyTo
!= null) {
196 String parentId
= inReplyTo
.absUrl("href");
197 if (parentId
!= null && parentId
.contains("/")) {
198 int i
= parentId
.lastIndexOf('/');
199 parentId
= parentId
.substring(i
+ 1);
200 parent
= story
.getCommentById(parentId
);
204 if (parent
== null) {
205 comments
.add(comment
);