1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.util
.ArrayList
;
9 import org
.jsoup
.helper
.DataUtil
;
10 import org
.jsoup
.nodes
.Document
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.select
.Elements
;
15 import be
.nikiroo
.gofetch
.data
.Comment
;
16 import be
.nikiroo
.gofetch
.data
.Story
;
17 import be
.nikiroo
.utils
.StringUtils
;
21 * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
25 public class TheRegister
extends BasicSupport
{
/**
 * Returns the description of this support, as a constant String naming
 * The Register.
 *
 * @return the fixed description text
 */
27 public String
getDescription() {
28 return "The Register: Biting the hand that feeds IT";
/**
 * Builds the list of stories found on the front page of The Register.
 * <p>
 * The page at https://www.theregister.co.uk/ is downloaded and parsed, the
 * elements carrying the "story_link" class are walked, and a {@link Story}
 * is added to the result list from the values collected for each of them.
 * <p>
 * NOTE(review): several lines of this method are not visible in this
 * listing (among them the declarations of id, title, date, categ, details
 * and body, the epoch conversion and the final return), so the comments
 * below only describe the statements that can be seen.
 *
 * @return presumably the list filled in below (the return statement is not
 *         visible here) -- TODO confirm
 * @throws IOException
 *             declared by this method; presumably raised when the download
 *             or the parsing fails -- TODO confirm
 */
32 public List
<Story
> list() throws IOException
{
33 List
<Story
> list
= new ArrayList
<Story
>();
// Download the front page and parse it as UTF-8, using the page URL as
// base URI
35 URL url
= new URL("https://www.theregister.co.uk/");
36 InputStream in
= downloader
.open(url
);
37 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// Every element of class "story_link" is handled as one article entry
38 Elements articles
= doc
.getElementsByClass("story_link");
39 for (Element article
: articles
) {
40 if (article
.getElementsByClass("time_stamp").isEmpty()) {
41 // Some articles are doubled,
42 // but the second copy lacks the time info
// Internal URL: the absolute form of the entry's href attribute
47 String intUrl
= article
.absUrl("href");
48 String extUrl
= ""; // nope: left empty here
54 String author
= ""; // nope: left empty here
// The category text is read from the element just before the entry
56 Element categElement
= article
.previousElementSibling();
57 if (categElement
!= null) {
58 categ
= categElement
.text().trim();
// The title is the text of the first <h4> found inside the entry
61 Element titleElement
= article
.getElementsByTag("h4").first();
62 if (titleElement
!= null) {
63 title
= StringUtils
.unhtml(titleElement
.text()).trim();
// The date is looked up in the "time_stamp" element, through its
// data-epoch attribute (how the value is converted is not visible here)
66 Element dateElement
= article
.getElementsByClass("time_stamp")
68 if (dateElement
!= null) {
69 String epochS
= dateElement
.attr("data-epoch");
70 if (epochS
!= null && !epochS
.isEmpty()) {
// The id is the raw href attribute with every "/" replaced by "_"
78 id
= article
.attr("href").replace("/", "_");
// Details: the date between parentheses, followed by the text of the
// "standfirst" element when there is one
81 Element detailsElement
= article
.getElementsByClass("standfirst")
83 details
= "(" + date
+ ") ";
84 if (detailsElement
!= null) {
85 details
+= StringUtils
.unhtml(detailsElement
.text()).trim();
// Register the story built from the values collected above
88 list
.add(new Story(getType(), id
, title
, author
, date
, categ
,
89 details
, intUrl
, extUrl
, body
));
/**
 * Fetches the full content and the comments of the given story.
 * <p>
 * The page at the story's internal URL is downloaded and parsed; the lines
 * extracted from the element whose id is "body" are appended to the
 * content already held by the story, and the result is stored with
 * setFullContent. A forum page is then opened, and a {@link Comment} is
 * built for each element of class "post" found under the element whose id
 * is "forum_posts".
 * <p>
 * NOTE(review): this listing omits a number of lines and stops before the
 * end of the method, so only the visible statements are described here.
 *
 * @param story
 *            the story to complete; its comment list is replaced by a new
 *            list created here
 * @throws IOException
 *             declared by this method; presumably raised when a download
 *             or the parsing fails -- TODO confirm
 */
96 public void fetch(Story story
) throws IOException
{
// Start from the content currently stored in the story
97 String fullContent
= story
.getContent();
// Install a new list on the story; comments are added to it further down
98 List
<Comment
> comments
= new ArrayList
<Comment
>();
99 story
.setComments(comments
);
// Download and parse the story page itself
101 URL url
= new URL(story
.getUrlInternal());
102 InputStream in
= downloader
.open(url
);
104 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// The article text is looked up in the element whose id is "body"
105 Element article
= doc
.getElementById("body");
106 if (article
!= null) {
107 for (String line
: toLines(article
,
108 new BasicElementProcessor() {
109 // TODO: ignore headlines/pub
111 fullContent
+= line
+ "\n";
114 // Content is too tight with a single break per line:
// (each newline is doubled, then runs of four newlines are reduced to two)
115 fullContent
= fullContent
.replace("\n", "\n\n") //
116 .replace("\n\n\n\n", "\n\n") //
117 .replace("\n\n\n\n", "\n\n") //
121 story
.setFullContent(fullContent
);
123 // Get comments URL then parse it
127 .open(new URL("https://forums.theregister.co.uk/forum/1"
129 doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// All the posts are searched under the element whose id is "forum_posts"
130 Element posts
= doc
.getElementById("forum_posts");
132 for (Element post
: posts
.getElementsByClass("post")) {
137 List
<String
> content
= new ArrayList
<String
>();
// First <a> of the post; presumably the source of the comment id (the
// lines reading it are not visible here) -- TODO confirm
139 Element idE
= post
.getElementsByTag("a").first();
// Drop the "c_" prefix from the id when it is present
142 if (id
.startsWith("c_")) {
143 id
= id
.substring(2);
// The date comes from the data-epoch attribute of the first <span>
// found inside idE
146 Element dateE
= idE
.getElementsByTag("span").first();
148 date
= date(dateE
.attr("data-epoch"));
// Author: text of the first element of class "author", if any
152 Element authorE
= post
.getElementsByClass("author").first();
153 if (authorE
!= null) {
154 author
= StringUtils
.unhtml(authorE
.text()).trim();
// Title: text of the first <h4> of the post, if any
157 Element titleE
= post
.getElementsByTag("h4").first();
158 if (titleE
!= null) {
159 title
= StringUtils
.unhtml(titleE
.text()).trim();
// Content: the lines extracted from the first element of class "body"
162 Element contentE
= post
.getElementsByClass("body").first();
163 if (contentE
!= null) {
164 for (String line
: toLines(contentE
,
165 new BasicElementProcessor() {
167 public boolean ignoreNode(Node node
) {
168 // TODO: ignore headlines/pub
170 // Remove the comment title (which has
171 // already been processed earlier)
172 if (node
instanceof Element
) {
173 Element el
= (Element
) node
;
174 if ("h4".equals(el
.tagName())) {
186 Comment comment
= new Comment(id
, author
, title
, date
,
// Look for a parent comment when the post has an "in-reply-to" element
188 Comment parent
= null;
190 Element inReplyTo
= post
.getElementsByClass("in-reply-to")
192 if (inReplyTo
!= null) {
193 String parentId
= inReplyTo
.absUrl("href");
194 if (parentId
!= null && parentId
.contains("/")) {
// Only the part after the last '/' of the link is used as the parent id
195 int i
= parentId
.lastIndexOf('/');
196 parentId
= parentId
.substring(i
+ 1);
197 parent
= story
.getCommentById(parentId
);
// No parent found: the comment goes directly into the story's comment list
201 if (parent
== null) {
202 comments
.add(comment
);