Small fixes in different places
[gofetch.git] / src / be / nikiroo / gofetch / support / TheRegister.java
CommitLineData
d28c4aac
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
d28c4aac 6import java.util.ArrayList;
d28c4aac
NR
7import java.util.List;
8
9import org.jsoup.helper.DataUtil;
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
12import org.jsoup.nodes.Node;
13import org.jsoup.select.Elements;
14
15import be.nikiroo.gofetch.data.Comment;
16import be.nikiroo.gofetch.data.Story;
17import be.nikiroo.utils.StringUtils;
18
b34d1f35
NR
19/**
20 * Support <a
21 * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
22 *
23 * @author niki
24 */
d28c4aac
NR
25public class TheRegister extends BasicSupport {
26 @Override
27 public String getDescription() {
28 return "The Register: Biting the hand that feeds IT";
29 }
30
31 @Override
32 public List<Story> list() throws IOException {
33 List<Story> list = new ArrayList<Story>();
34
35 URL url = new URL("https://www.theregister.co.uk/");
36 InputStream in = downloader.open(url);
37 Document doc = DataUtil.load(in, "UTF-8", url.toString());
38 Elements articles = doc.getElementsByClass("story_link");
39 for (Element article : articles) {
40 if (article.getElementsByClass("time_stamp").isEmpty()) {
41 // Some articles are doubled,
42 // but the second copy without the time info
43 continue;
44 }
45
46 String id = "";
47 String intUrl = article.absUrl("href");
48 String extUrl = ""; // nope
49 String title = "";
50 String date = "";
51 String details = "";
52 String body = "";
b34d1f35
NR
53 String categ = "";
54 String author = ""; // nope
d28c4aac 55
b34d1f35
NR
56 Element categElement = article.previousElementSibling();
57 if (categElement != null) {
58 categ = categElement.text().trim();
d28c4aac 59 }
b34d1f35 60
d28c4aac
NR
61 Element titleElement = article.getElementsByTag("h4").first();
62 if (titleElement != null) {
63 title = StringUtils.unhtml(titleElement.text()).trim();
64 }
d28c4aac
NR
65
66 Element dateElement = article.getElementsByClass("time_stamp")
67 .first();
68 if (dateElement != null) {
69 String epochS = dateElement.attr("data-epoch");
70 if (epochS != null && !epochS.isEmpty()) {
71 id = epochS;
72 date = date(epochS);
73 }
74 }
75
76 if (id.isEmpty()) {
77 // fallback
78 id = article.attr("href").replace("/", "_");
79 }
80
81 Element detailsElement = article.getElementsByClass("standfirst")
82 .first();
83 details = "(" + date + ") ";
84 if (detailsElement != null) {
85 details += StringUtils.unhtml(detailsElement.text()).trim();
86 }
87
c9cffa91
NR
88 // We have some "details" but no content, so we switch them:
89 body = details;
90 details = "";
b34d1f35
NR
91 list.add(new Story(getType(), id, title, author, date, categ,
92 details, intUrl, extUrl, body));
d28c4aac
NR
93 }
94
95 return list;
96 }
97
98 @Override
99 public void fetch(Story story) throws IOException {
100 String fullContent = story.getContent();
101 List<Comment> comments = new ArrayList<Comment>();
7686553a 102 story.setComments(comments);
d28c4aac
NR
103
104 URL url = new URL(story.getUrlInternal());
105 InputStream in = downloader.open(url);
106 try {
107 Document doc = DataUtil.load(in, "UTF-8", url.toString());
108 Element article = doc.getElementById("body");
109 if (article != null) {
110 for (String line : toLines(article,
111 new BasicElementProcessor() {
112 // TODO: ignore headlines/pub
113 })) {
114 fullContent += line + "\n";
115 }
116
117 // Content is too tight with a single break per line:
118 fullContent = fullContent.replace("\n", "\n\n") //
119 .replace("\n\n\n\n", "\n\n") //
120 .replace("\n\n\n\n", "\n\n") //
121 .trim();
122 }
123
7686553a
NR
124 story.setFullContent(fullContent);
125
d28c4aac
NR
126 // Get comments URL then parse it
127 in.close();
128 in = null;
129 in = downloader
130 .open(new URL("https://forums.theregister.co.uk/forum/1"
131 + url.getPath()));
132 doc = DataUtil.load(in, "UTF-8", url.toString());
133 Element posts = doc.getElementById("forum_posts");
134 if (posts != null) {
135 for (Element post : posts.getElementsByClass("post")) {
136 String id = "";
137 String author = "";
138 String title = "";
139 String date = "";
140 List<String> content = new ArrayList<String>();
141
142 Element idE = post.getElementsByTag("a").first();
143 if (idE != null) {
144 id = idE.attr("id");
7686553a
NR
145 if (id.startsWith("c_")) {
146 id = id.substring(2);
147 }
148
d28c4aac
NR
149 Element dateE = idE.getElementsByTag("span").first();
150 if (dateE != null) {
151 date = date(dateE.attr("data-epoch"));
152 }
153 }
154
155 Element authorE = post.getElementsByClass("author").first();
156 if (authorE != null) {
157 author = StringUtils.unhtml(authorE.text()).trim();
158 }
159
160 Element titleE = post.getElementsByTag("h4").first();
161 if (titleE != null) {
162 title = StringUtils.unhtml(titleE.text()).trim();
163 }
164
165 Element contentE = post.getElementsByClass("body").first();
166 if (contentE != null) {
167 for (String line : toLines(contentE,
168 new BasicElementProcessor() {
169 @Override
170 public boolean ignoreNode(Node node) {
171 // TODO: ignore headlines/pub
7686553a
NR
172
173 // Remove the comment title (which has
174 // already been processed earlier)
d28c4aac 175 if (node instanceof Element) {
7686553a 176 Element el = (Element) node;
d28c4aac
NR
177 if ("h4".equals(el.tagName())) {
178 return true;
179 }
180 }
7686553a 181
d28c4aac
NR
182 return false;
183 }
184 })) {
185 content.add(line);
186 }
187 }
188
7686553a
NR
189 Comment comment = new Comment(id, author, title, date,
190 content);
191 Comment parent = null;
192
193 Element inReplyTo = post.getElementsByClass("in-reply-to")
194 .first();
195 if (inReplyTo != null) {
196 String parentId = inReplyTo.absUrl("href");
197 if (parentId != null && parentId.contains("/")) {
198 int i = parentId.lastIndexOf('/');
199 parentId = parentId.substring(i + 1);
200 parent = story.getCommentById(parentId);
201 }
202 }
203
204 if (parent == null) {
205 comments.add(comment);
206 } else {
207 parent.add(comment);
208 }
d28c4aac
NR
209 }
210 }
d28c4aac
NR
211 } finally {
212 if (in != null) {
213 in.close();
214 }
215 }
216 }
d28c4aac 217}