Add new supported site: The Register
[gofetch.git] / src / be / nikiroo / gofetch / support / TheRegister.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.text.SimpleDateFormat;
7 import java.util.ArrayList;
8 import java.util.Date;
9 import java.util.List;
10
11 import org.jsoup.helper.DataUtil;
12 import org.jsoup.nodes.Document;
13 import org.jsoup.nodes.Element;
14 import org.jsoup.nodes.Node;
15 import org.jsoup.select.Elements;
16
17 import be.nikiroo.gofetch.data.Comment;
18 import be.nikiroo.gofetch.data.Story;
19 import be.nikiroo.utils.StringUtils;
20
21 public class TheRegister extends BasicSupport {
22 @Override
23 public String getDescription() {
24 return "The Register: Biting the hand that feeds IT";
25 }
26
27 @Override
28 public List<Story> list() throws IOException {
29 List<Story> list = new ArrayList<Story>();
30
31 URL url = new URL("https://www.theregister.co.uk/");
32 InputStream in = downloader.open(url);
33 Document doc = DataUtil.load(in, "UTF-8", url.toString());
34 Elements articles = doc.getElementsByClass("story_link");
35 for (Element article : articles) {
36 if (article.getElementsByClass("time_stamp").isEmpty()) {
37 // Some articles are doubled,
38 // but the second copy without the time info
39 continue;
40 }
41
42 String id = "";
43 String intUrl = article.absUrl("href");
44 String extUrl = ""; // nope
45 String title = "";
46 String date = "";
47 String details = "";
48 String body = "";
49
50 String topic = "";
51 Element topicElement = article.previousElementSibling();
52 if (topicElement != null) {
53 topic = "[" + topicElement.text().trim() + "] ";
54 }
55 Element titleElement = article.getElementsByTag("h4").first();
56 if (titleElement != null) {
57 title = StringUtils.unhtml(titleElement.text()).trim();
58 }
59 title = topic + title;
60
61 Element dateElement = article.getElementsByClass("time_stamp")
62 .first();
63 if (dateElement != null) {
64 String epochS = dateElement.attr("data-epoch");
65 if (epochS != null && !epochS.isEmpty()) {
66 id = epochS;
67 date = date(epochS);
68 }
69 }
70
71 if (id.isEmpty()) {
72 // fallback
73 id = article.attr("href").replace("/", "_");
74 }
75
76 Element detailsElement = article.getElementsByClass("standfirst")
77 .first();
78 details = "(" + date + ") ";
79 if (detailsElement != null) {
80 details += StringUtils.unhtml(detailsElement.text()).trim();
81 }
82
83 list.add(new Story(getType(), id, title, details, intUrl, extUrl,
84 body));
85 }
86
87 return list;
88 }
89
90 @Override
91 public void fetch(Story story) throws IOException {
92 String fullContent = story.getContent();
93 List<Comment> comments = new ArrayList<Comment>();
94
95 URL url = new URL(story.getUrlInternal());
96 InputStream in = downloader.open(url);
97 try {
98 Document doc = DataUtil.load(in, "UTF-8", url.toString());
99 Element article = doc.getElementById("body");
100 if (article != null) {
101 for (String line : toLines(article,
102 new BasicElementProcessor() {
103 // TODO: ignore headlines/pub
104 })) {
105 fullContent += line + "\n";
106 }
107
108 // Content is too tight with a single break per line:
109 fullContent = fullContent.replace("\n", "\n\n") //
110 .replace("\n\n\n\n", "\n\n") //
111 .replace("\n\n\n\n", "\n\n") //
112 .trim();
113 }
114
115 // Get comments URL then parse it
116 in.close();
117 in = null;
118 in = downloader
119 .open(new URL("https://forums.theregister.co.uk/forum/1"
120 + url.getPath()));
121 doc = DataUtil.load(in, "UTF-8", url.toString());
122 Element posts = doc.getElementById("forum_posts");
123 if (posts != null) {
124 for (Element post : posts.getElementsByClass("post")) {
125 String id = "";
126 String author = "";
127 String title = "";
128 String date = "";
129 List<String> content = new ArrayList<String>();
130
131 Element idE = post.getElementsByTag("a").first();
132 if (idE != null) {
133 id = idE.attr("id");
134 Element dateE = idE.getElementsByTag("span").first();
135 if (dateE != null) {
136 date = date(dateE.attr("data-epoch"));
137 }
138 }
139
140 Element authorE = post.getElementsByClass("author").first();
141 if (authorE != null) {
142 author = StringUtils.unhtml(authorE.text()).trim();
143 }
144
145 Element titleE = post.getElementsByTag("h4").first();
146 if (titleE != null) {
147 title = StringUtils.unhtml(titleE.text()).trim();
148 }
149
150 Element contentE = post.getElementsByClass("body").first();
151 if (contentE != null) {
152 for (String line : toLines(contentE,
153 new BasicElementProcessor() {
154 @Override
155 public boolean ignoreNode(Node node) {
156 // TODO: ignore headlines/pub
157 if (node instanceof Element) {
158 Element el = (Element)node;
159 if ("h4".equals(el.tagName())) {
160 return true;
161 }
162 }
163
164 return false;
165 }
166 })) {
167 content.add(line);
168 }
169 }
170
171 comments.add(new Comment(id, author, title, date, content));
172 }
173 }
174
175 story.setFullContent(fullContent);
176 story.setComments(comments);
177 } finally {
178 if (in != null) {
179 in.close();
180 }
181 }
182 }
183
184 // Return display date from epoch String, or "" if error
185 private static String date(String epochString) {
186 long epoch = 0;
187 try {
188 epoch = Long.parseLong(epochString);
189 } catch (Exception e) {
190 epoch = 0;
191 }
192
193 if (epoch > 0) {
194 return new SimpleDateFormat("dd MMM YYYY").format(new Date(
195 1000 * epoch));
196 }
197
198 return "";
199 }
200 }