1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.util
.AbstractMap
;
7 import java
.util
.ArrayList
;
8 import java
.util
.HashMap
;
11 import java
.util
.Map
.Entry
;
13 import org
.jsoup
.helper
.DataUtil
;
14 import org
.jsoup
.nodes
.Document
;
15 import org
.jsoup
.nodes
.Element
;
16 import org
.jsoup
.nodes
.Node
;
18 import be
.nikiroo
.gofetch
.data
.Comment
;
19 import be
.nikiroo
.gofetch
.data
.Story
;
23 * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
27 public class TheRegister
extends BasicSupport
{
/**
 * Reply links between comments: each key is a comment id and the mapped
 * value is the id of the comment it replies to. {@code fetch} looks the
 * current comment's id up here and resolves the mapped value with
 * {@code Story.getCommentById}.
 */
private Map<String, String> commentReplies = new HashMap<String, String>();
31 public String
getDescription() {
32 return "The Register: Biting the hand that feeds IT";
36 public void fetch(Story story
) throws IOException
{
39 // Update comment replies
40 List
<Comment
> comments
= new ArrayList
<Comment
>();
41 for (Comment comment
: story
.getComments()) {
42 if (commentReplies
.containsKey(comment
.getId())) {
43 String inReplyToId
= commentReplies
.get(comment
.getId());
44 Comment inReplyTo
= story
.getCommentById(inReplyToId
);
45 if (inReplyTo
!= null) {
46 inReplyTo
.add(comment
);
48 comments
.add(comment
);
51 comments
.add(comment
);
54 story
.setComments(comments
);
58 protected List
<Entry
<URL
, String
>> getUrls() throws IOException
{
59 List
<Entry
<URL
, String
>> urls
= new ArrayList
<Entry
<URL
, String
>>();
60 urls
.add(new AbstractMap
.SimpleEntry
<URL
, String
>(new URL(
61 "https://www.theregister.co.uk/"), ""));
66 protected List
<Element
> getArticles(Document doc
) {
67 return doc
.getElementsByClass("story_link");
71 protected String
getArticleId(Document doc
, Element article
) {
76 protected String
getArticleTitle(Document doc
, Element article
) {
77 Element titleElement
= article
.getElementsByTag("h4").first();
78 if (titleElement
!= null) {
79 return titleElement
.text();
86 protected String
getArticleAuthor(Document doc
, Element article
) {
91 protected String
getArticleDate(Document doc
, Element article
) {
92 Element dateElement
= article
.getElementsByClass("time_stamp").first();
93 if (dateElement
!= null) {
94 return dateElement
.attr("data-epoch");
101 protected String
getArticleCategory(Document doc
, Element article
,
102 String currentCategory
) {
103 Element categElement
= article
.previousElementSibling();
104 if (categElement
!= null) {
105 return categElement
.text();
112 protected String
getArticleDetails(Document doc
, Element article
) {
113 // We have some "details" but no content, so we switch them:
118 protected String
getArticleIntUrl(Document doc
, Element article
) {
119 return article
.absUrl("href");
123 protected String
getArticleExtUrl(Document doc
, Element article
) {
128 protected String
getArticleContent(Document doc
, Element article
) {
129 // We have some "details" but no content, so we switch them:
130 Element detailsElement
= article
.getElementsByClass("standfirst")
132 if (detailsElement
!= null) {
133 return detailsElement
.text();
140 protected Element
getFullArticle(Document doc
) {
141 return doc
.getElementById("body");
145 protected List
<Element
> getFullArticleCommentPosts(Document doc
, URL intUrl
) {
146 List
<Element
> commentElements
= new ArrayList
<Element
>();
148 // Get comments URL then parse it
150 URL url
= new URL("https://forums.theregister.co.uk/forum/1"
152 InputStream in
= open(url
);
154 doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
155 Element posts
= doc
.getElementById("forum_posts");
157 for (Element post
: posts
.getElementsByClass("post")) {
158 commentElements
.add(post
);
159 Element inReplyTo
= post
.getElementsByClass(
160 "in-reply-to").first();
161 if (inReplyTo
!= null) {
162 String parentId
= inReplyTo
.absUrl("href");
163 if (parentId
!= null && parentId
.contains("/")) {
164 int i
= parentId
.lastIndexOf('/');
165 parentId
= parentId
.substring(i
+ 1);
168 .put(getCommentId(post
), parentId
);
176 } catch (IOException e
) {
179 return commentElements
;
183 protected ElementProcessor
getElementProcessorFullArticle() {
184 return new BasicElementProcessor();
188 protected List
<Element
> getCommentCommentPosts(Document doc
,
194 protected String
getCommentId(Element post
) {
195 Element idE
= post
.getElementsByTag("a").first();
197 String id
= idE
.attr("id");
198 if (id
.startsWith("c_")) {
199 id
= id
.substring(2);
209 protected String
getCommentAuthor(Element post
) {
210 Element author
= post
.getElementsByClass("author").first();
211 if (author
!= null) {
212 return author
.text();
219 protected String
getCommentTitle(Element post
) {
220 Element title
= post
.getElementsByTag("h4").first();
229 protected String
getCommentDate(Element post
) {
230 Element id
= post
.getElementsByTag("a").first();
232 Element date
= id
.getElementsByTag("span").first();
234 return date
.attr("data-epoch");
242 protected Element
getCommentContentElement(Element post
) {
243 return post
.getElementsByClass("body").first();
247 protected ElementProcessor
getElementProcessorComment() {
248 return new BasicElementProcessor() {
250 public boolean ignoreNode(Node node
) {
251 // Remove the comment title (which has
252 // already been processed earlier)
253 if (node
instanceof Element
) {
254 Element el
= (Element
) node
;
255 if ("h4".equals(el
.tagName())) {