Commit | Line | Data |
---|---|---|
d28c4aac NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
4 | import java.io.InputStream; | |
5 | import java.net.URL; | |
6 | import java.text.SimpleDateFormat; | |
7 | import java.util.ArrayList; | |
8 | import java.util.Date; | |
9 | import java.util.List; | |
10 | ||
11 | import org.jsoup.helper.DataUtil; | |
12 | import org.jsoup.nodes.Document; | |
13 | import org.jsoup.nodes.Element; | |
14 | import org.jsoup.nodes.Node; | |
15 | import org.jsoup.select.Elements; | |
16 | ||
17 | import be.nikiroo.gofetch.data.Comment; | |
18 | import be.nikiroo.gofetch.data.Story; | |
19 | import be.nikiroo.utils.StringUtils; | |
20 | ||
21 | public class TheRegister extends BasicSupport { | |
22 | @Override | |
23 | public String getDescription() { | |
24 | return "The Register: Biting the hand that feeds IT"; | |
25 | } | |
26 | ||
27 | @Override | |
28 | public List<Story> list() throws IOException { | |
29 | List<Story> list = new ArrayList<Story>(); | |
30 | ||
31 | URL url = new URL("https://www.theregister.co.uk/"); | |
32 | InputStream in = downloader.open(url); | |
33 | Document doc = DataUtil.load(in, "UTF-8", url.toString()); | |
34 | Elements articles = doc.getElementsByClass("story_link"); | |
35 | for (Element article : articles) { | |
36 | if (article.getElementsByClass("time_stamp").isEmpty()) { | |
37 | // Some articles are doubled, | |
38 | // but the second copy without the time info | |
39 | continue; | |
40 | } | |
41 | ||
42 | String id = ""; | |
43 | String intUrl = article.absUrl("href"); | |
44 | String extUrl = ""; // nope | |
45 | String title = ""; | |
46 | String date = ""; | |
47 | String details = ""; | |
48 | String body = ""; | |
49 | ||
50 | String topic = ""; | |
51 | Element topicElement = article.previousElementSibling(); | |
52 | if (topicElement != null) { | |
53 | topic = "[" + topicElement.text().trim() + "] "; | |
54 | } | |
55 | Element titleElement = article.getElementsByTag("h4").first(); | |
56 | if (titleElement != null) { | |
57 | title = StringUtils.unhtml(titleElement.text()).trim(); | |
58 | } | |
59 | title = topic + title; | |
60 | ||
61 | Element dateElement = article.getElementsByClass("time_stamp") | |
62 | .first(); | |
63 | if (dateElement != null) { | |
64 | String epochS = dateElement.attr("data-epoch"); | |
65 | if (epochS != null && !epochS.isEmpty()) { | |
66 | id = epochS; | |
67 | date = date(epochS); | |
68 | } | |
69 | } | |
70 | ||
71 | if (id.isEmpty()) { | |
72 | // fallback | |
73 | id = article.attr("href").replace("/", "_"); | |
74 | } | |
75 | ||
76 | Element detailsElement = article.getElementsByClass("standfirst") | |
77 | .first(); | |
78 | details = "(" + date + ") "; | |
79 | if (detailsElement != null) { | |
80 | details += StringUtils.unhtml(detailsElement.text()).trim(); | |
81 | } | |
82 | ||
83 | list.add(new Story(getType(), id, title, details, intUrl, extUrl, | |
84 | body)); | |
85 | } | |
86 | ||
87 | return list; | |
88 | } | |
89 | ||
90 | @Override | |
91 | public void fetch(Story story) throws IOException { | |
92 | String fullContent = story.getContent(); | |
93 | List<Comment> comments = new ArrayList<Comment>(); | |
7686553a | 94 | story.setComments(comments); |
d28c4aac NR |
95 | |
96 | URL url = new URL(story.getUrlInternal()); | |
97 | InputStream in = downloader.open(url); | |
98 | try { | |
99 | Document doc = DataUtil.load(in, "UTF-8", url.toString()); | |
100 | Element article = doc.getElementById("body"); | |
101 | if (article != null) { | |
102 | for (String line : toLines(article, | |
103 | new BasicElementProcessor() { | |
104 | // TODO: ignore headlines/pub | |
105 | })) { | |
106 | fullContent += line + "\n"; | |
107 | } | |
108 | ||
109 | // Content is too tight with a single break per line: | |
110 | fullContent = fullContent.replace("\n", "\n\n") // | |
111 | .replace("\n\n\n\n", "\n\n") // | |
112 | .replace("\n\n\n\n", "\n\n") // | |
113 | .trim(); | |
114 | } | |
115 | ||
7686553a NR |
116 | story.setFullContent(fullContent); |
117 | ||
d28c4aac NR |
118 | // Get comments URL then parse it |
119 | in.close(); | |
120 | in = null; | |
121 | in = downloader | |
122 | .open(new URL("https://forums.theregister.co.uk/forum/1" | |
123 | + url.getPath())); | |
124 | doc = DataUtil.load(in, "UTF-8", url.toString()); | |
125 | Element posts = doc.getElementById("forum_posts"); | |
126 | if (posts != null) { | |
127 | for (Element post : posts.getElementsByClass("post")) { | |
128 | String id = ""; | |
129 | String author = ""; | |
130 | String title = ""; | |
131 | String date = ""; | |
132 | List<String> content = new ArrayList<String>(); | |
133 | ||
134 | Element idE = post.getElementsByTag("a").first(); | |
135 | if (idE != null) { | |
136 | id = idE.attr("id"); | |
7686553a NR |
137 | if (id.startsWith("c_")) { |
138 | id = id.substring(2); | |
139 | } | |
140 | ||
d28c4aac NR |
141 | Element dateE = idE.getElementsByTag("span").first(); |
142 | if (dateE != null) { | |
143 | date = date(dateE.attr("data-epoch")); | |
144 | } | |
145 | } | |
146 | ||
147 | Element authorE = post.getElementsByClass("author").first(); | |
148 | if (authorE != null) { | |
149 | author = StringUtils.unhtml(authorE.text()).trim(); | |
150 | } | |
151 | ||
152 | Element titleE = post.getElementsByTag("h4").first(); | |
153 | if (titleE != null) { | |
154 | title = StringUtils.unhtml(titleE.text()).trim(); | |
155 | } | |
156 | ||
157 | Element contentE = post.getElementsByClass("body").first(); | |
158 | if (contentE != null) { | |
159 | for (String line : toLines(contentE, | |
160 | new BasicElementProcessor() { | |
161 | @Override | |
162 | public boolean ignoreNode(Node node) { | |
163 | // TODO: ignore headlines/pub | |
7686553a NR |
164 | |
165 | // Remove the comment title (which has | |
166 | // already been processed earlier) | |
d28c4aac | 167 | if (node instanceof Element) { |
7686553a | 168 | Element el = (Element) node; |
d28c4aac NR |
169 | if ("h4".equals(el.tagName())) { |
170 | return true; | |
171 | } | |
172 | } | |
7686553a | 173 | |
d28c4aac NR |
174 | return false; |
175 | } | |
176 | })) { | |
177 | content.add(line); | |
178 | } | |
179 | } | |
180 | ||
7686553a NR |
181 | Comment comment = new Comment(id, author, title, date, |
182 | content); | |
183 | Comment parent = null; | |
184 | ||
185 | Element inReplyTo = post.getElementsByClass("in-reply-to") | |
186 | .first(); | |
187 | if (inReplyTo != null) { | |
188 | String parentId = inReplyTo.absUrl("href"); | |
189 | if (parentId != null && parentId.contains("/")) { | |
190 | int i = parentId.lastIndexOf('/'); | |
191 | parentId = parentId.substring(i + 1); | |
192 | parent = story.getCommentById(parentId); | |
193 | } | |
194 | } | |
195 | ||
196 | if (parent == null) { | |
197 | comments.add(comment); | |
198 | } else { | |
199 | parent.add(comment); | |
200 | } | |
d28c4aac NR |
201 | } |
202 | } | |
d28c4aac NR |
203 | } finally { |
204 | if (in != null) { | |
205 | in.close(); | |
206 | } | |
207 | } | |
208 | } | |
209 | ||
210 | // Return display date from epoch String, or "" if error | |
211 | private static String date(String epochString) { | |
212 | long epoch = 0; | |
213 | try { | |
214 | epoch = Long.parseLong(epochString); | |
215 | } catch (Exception e) { | |
216 | epoch = 0; | |
217 | } | |
218 | ||
219 | if (epoch > 0) { | |
220 | return new SimpleDateFormat("dd MMM YYYY").format(new Date( | |
221 | 1000 * epoch)); | |
222 | } | |
223 | ||
224 | return ""; | |
225 | } | |
226 | } |