REGISTER: now supports comment replies
[gofetch.git] / src / be / nikiroo / gofetch / support / TheRegister.java
CommitLineData
d28c4aac
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
6import java.text.SimpleDateFormat;
7import java.util.ArrayList;
8import java.util.Date;
9import java.util.List;
10
11import org.jsoup.helper.DataUtil;
12import org.jsoup.nodes.Document;
13import org.jsoup.nodes.Element;
14import org.jsoup.nodes.Node;
15import org.jsoup.select.Elements;
16
17import be.nikiroo.gofetch.data.Comment;
18import be.nikiroo.gofetch.data.Story;
19import be.nikiroo.utils.StringUtils;
20
21public class TheRegister extends BasicSupport {
22 @Override
23 public String getDescription() {
24 return "The Register: Biting the hand that feeds IT";
25 }
26
27 @Override
28 public List<Story> list() throws IOException {
29 List<Story> list = new ArrayList<Story>();
30
31 URL url = new URL("https://www.theregister.co.uk/");
32 InputStream in = downloader.open(url);
33 Document doc = DataUtil.load(in, "UTF-8", url.toString());
34 Elements articles = doc.getElementsByClass("story_link");
35 for (Element article : articles) {
36 if (article.getElementsByClass("time_stamp").isEmpty()) {
37 // Some articles are doubled,
38 // but the second copy without the time info
39 continue;
40 }
41
42 String id = "";
43 String intUrl = article.absUrl("href");
44 String extUrl = ""; // nope
45 String title = "";
46 String date = "";
47 String details = "";
48 String body = "";
49
50 String topic = "";
51 Element topicElement = article.previousElementSibling();
52 if (topicElement != null) {
53 topic = "[" + topicElement.text().trim() + "] ";
54 }
55 Element titleElement = article.getElementsByTag("h4").first();
56 if (titleElement != null) {
57 title = StringUtils.unhtml(titleElement.text()).trim();
58 }
59 title = topic + title;
60
61 Element dateElement = article.getElementsByClass("time_stamp")
62 .first();
63 if (dateElement != null) {
64 String epochS = dateElement.attr("data-epoch");
65 if (epochS != null && !epochS.isEmpty()) {
66 id = epochS;
67 date = date(epochS);
68 }
69 }
70
71 if (id.isEmpty()) {
72 // fallback
73 id = article.attr("href").replace("/", "_");
74 }
75
76 Element detailsElement = article.getElementsByClass("standfirst")
77 .first();
78 details = "(" + date + ") ";
79 if (detailsElement != null) {
80 details += StringUtils.unhtml(detailsElement.text()).trim();
81 }
82
83 list.add(new Story(getType(), id, title, details, intUrl, extUrl,
84 body));
85 }
86
87 return list;
88 }
89
90 @Override
91 public void fetch(Story story) throws IOException {
92 String fullContent = story.getContent();
93 List<Comment> comments = new ArrayList<Comment>();
7686553a 94 story.setComments(comments);
d28c4aac
NR
95
96 URL url = new URL(story.getUrlInternal());
97 InputStream in = downloader.open(url);
98 try {
99 Document doc = DataUtil.load(in, "UTF-8", url.toString());
100 Element article = doc.getElementById("body");
101 if (article != null) {
102 for (String line : toLines(article,
103 new BasicElementProcessor() {
104 // TODO: ignore headlines/pub
105 })) {
106 fullContent += line + "\n";
107 }
108
109 // Content is too tight with a single break per line:
110 fullContent = fullContent.replace("\n", "\n\n") //
111 .replace("\n\n\n\n", "\n\n") //
112 .replace("\n\n\n\n", "\n\n") //
113 .trim();
114 }
115
7686553a
NR
116 story.setFullContent(fullContent);
117
d28c4aac
NR
118 // Get comments URL then parse it
119 in.close();
120 in = null;
121 in = downloader
122 .open(new URL("https://forums.theregister.co.uk/forum/1"
123 + url.getPath()));
124 doc = DataUtil.load(in, "UTF-8", url.toString());
125 Element posts = doc.getElementById("forum_posts");
126 if (posts != null) {
127 for (Element post : posts.getElementsByClass("post")) {
128 String id = "";
129 String author = "";
130 String title = "";
131 String date = "";
132 List<String> content = new ArrayList<String>();
133
134 Element idE = post.getElementsByTag("a").first();
135 if (idE != null) {
136 id = idE.attr("id");
7686553a
NR
137 if (id.startsWith("c_")) {
138 id = id.substring(2);
139 }
140
d28c4aac
NR
141 Element dateE = idE.getElementsByTag("span").first();
142 if (dateE != null) {
143 date = date(dateE.attr("data-epoch"));
144 }
145 }
146
147 Element authorE = post.getElementsByClass("author").first();
148 if (authorE != null) {
149 author = StringUtils.unhtml(authorE.text()).trim();
150 }
151
152 Element titleE = post.getElementsByTag("h4").first();
153 if (titleE != null) {
154 title = StringUtils.unhtml(titleE.text()).trim();
155 }
156
157 Element contentE = post.getElementsByClass("body").first();
158 if (contentE != null) {
159 for (String line : toLines(contentE,
160 new BasicElementProcessor() {
161 @Override
162 public boolean ignoreNode(Node node) {
163 // TODO: ignore headlines/pub
7686553a
NR
164
165 // Remove the comment title (which has
166 // already been processed earlier)
d28c4aac 167 if (node instanceof Element) {
7686553a 168 Element el = (Element) node;
d28c4aac
NR
169 if ("h4".equals(el.tagName())) {
170 return true;
171 }
172 }
7686553a 173
d28c4aac
NR
174 return false;
175 }
176 })) {
177 content.add(line);
178 }
179 }
180
7686553a
NR
181 Comment comment = new Comment(id, author, title, date,
182 content);
183 Comment parent = null;
184
185 Element inReplyTo = post.getElementsByClass("in-reply-to")
186 .first();
187 if (inReplyTo != null) {
188 String parentId = inReplyTo.absUrl("href");
189 if (parentId != null && parentId.contains("/")) {
190 int i = parentId.lastIndexOf('/');
191 parentId = parentId.substring(i + 1);
192 parent = story.getCommentById(parentId);
193 }
194 }
195
196 if (parent == null) {
197 comments.add(comment);
198 } else {
199 parent.add(comment);
200 }
d28c4aac
NR
201 }
202 }
d28c4aac
NR
203 } finally {
204 if (in != null) {
205 in.close();
206 }
207 }
208 }
209
210 // Return display date from epoch String, or "" if error
211 private static String date(String epochString) {
212 long epoch = 0;
213 try {
214 epoch = Long.parseLong(epochString);
215 } catch (Exception e) {
216 epoch = 0;
217 }
218
219 if (epoch > 0) {
220 return new SimpleDateFormat("dd MMM YYYY").format(new Date(
221 1000 * epoch));
222 }
223
224 return "";
225 }
226}