Commit | Line | Data |
---|---|---|
d28c4aac NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
4 | import java.io.InputStream; | |
5 | import java.net.URL; | |
6 | import java.text.SimpleDateFormat; | |
7 | import java.util.ArrayList; | |
8 | import java.util.Date; | |
9 | import java.util.List; | |
10 | ||
11 | import org.jsoup.helper.DataUtil; | |
12 | import org.jsoup.nodes.Document; | |
13 | import org.jsoup.nodes.Element; | |
14 | import org.jsoup.nodes.Node; | |
15 | import org.jsoup.select.Elements; | |
16 | ||
17 | import be.nikiroo.gofetch.data.Comment; | |
18 | import be.nikiroo.gofetch.data.Story; | |
19 | import be.nikiroo.utils.StringUtils; | |
20 | ||
21 | public class TheRegister extends BasicSupport { | |
22 | @Override | |
23 | public String getDescription() { | |
24 | return "The Register: Biting the hand that feeds IT"; | |
25 | } | |
26 | ||
27 | @Override | |
28 | public List<Story> list() throws IOException { | |
29 | List<Story> list = new ArrayList<Story>(); | |
30 | ||
31 | URL url = new URL("https://www.theregister.co.uk/"); | |
32 | InputStream in = downloader.open(url); | |
33 | Document doc = DataUtil.load(in, "UTF-8", url.toString()); | |
34 | Elements articles = doc.getElementsByClass("story_link"); | |
35 | for (Element article : articles) { | |
36 | if (article.getElementsByClass("time_stamp").isEmpty()) { | |
37 | // Some articles are doubled, | |
38 | // but the second copy without the time info | |
39 | continue; | |
40 | } | |
41 | ||
42 | String id = ""; | |
43 | String intUrl = article.absUrl("href"); | |
44 | String extUrl = ""; // nope | |
45 | String title = ""; | |
46 | String date = ""; | |
47 | String details = ""; | |
48 | String body = ""; | |
49 | ||
50 | String topic = ""; | |
51 | Element topicElement = article.previousElementSibling(); | |
52 | if (topicElement != null) { | |
53 | topic = "[" + topicElement.text().trim() + "] "; | |
54 | } | |
55 | Element titleElement = article.getElementsByTag("h4").first(); | |
56 | if (titleElement != null) { | |
57 | title = StringUtils.unhtml(titleElement.text()).trim(); | |
58 | } | |
59 | title = topic + title; | |
60 | ||
61 | Element dateElement = article.getElementsByClass("time_stamp") | |
62 | .first(); | |
63 | if (dateElement != null) { | |
64 | String epochS = dateElement.attr("data-epoch"); | |
65 | if (epochS != null && !epochS.isEmpty()) { | |
66 | id = epochS; | |
67 | date = date(epochS); | |
68 | } | |
69 | } | |
70 | ||
71 | if (id.isEmpty()) { | |
72 | // fallback | |
73 | id = article.attr("href").replace("/", "_"); | |
74 | } | |
75 | ||
76 | Element detailsElement = article.getElementsByClass("standfirst") | |
77 | .first(); | |
78 | details = "(" + date + ") "; | |
79 | if (detailsElement != null) { | |
80 | details += StringUtils.unhtml(detailsElement.text()).trim(); | |
81 | } | |
82 | ||
83 | list.add(new Story(getType(), id, title, details, intUrl, extUrl, | |
84 | body)); | |
85 | } | |
86 | ||
87 | return list; | |
88 | } | |
89 | ||
90 | @Override | |
91 | public void fetch(Story story) throws IOException { | |
92 | String fullContent = story.getContent(); | |
93 | List<Comment> comments = new ArrayList<Comment>(); | |
94 | ||
95 | URL url = new URL(story.getUrlInternal()); | |
96 | InputStream in = downloader.open(url); | |
97 | try { | |
98 | Document doc = DataUtil.load(in, "UTF-8", url.toString()); | |
99 | Element article = doc.getElementById("body"); | |
100 | if (article != null) { | |
101 | for (String line : toLines(article, | |
102 | new BasicElementProcessor() { | |
103 | // TODO: ignore headlines/pub | |
104 | })) { | |
105 | fullContent += line + "\n"; | |
106 | } | |
107 | ||
108 | // Content is too tight with a single break per line: | |
109 | fullContent = fullContent.replace("\n", "\n\n") // | |
110 | .replace("\n\n\n\n", "\n\n") // | |
111 | .replace("\n\n\n\n", "\n\n") // | |
112 | .trim(); | |
113 | } | |
114 | ||
115 | // Get comments URL then parse it | |
116 | in.close(); | |
117 | in = null; | |
118 | in = downloader | |
119 | .open(new URL("https://forums.theregister.co.uk/forum/1" | |
120 | + url.getPath())); | |
121 | doc = DataUtil.load(in, "UTF-8", url.toString()); | |
122 | Element posts = doc.getElementById("forum_posts"); | |
123 | if (posts != null) { | |
124 | for (Element post : posts.getElementsByClass("post")) { | |
125 | String id = ""; | |
126 | String author = ""; | |
127 | String title = ""; | |
128 | String date = ""; | |
129 | List<String> content = new ArrayList<String>(); | |
130 | ||
131 | Element idE = post.getElementsByTag("a").first(); | |
132 | if (idE != null) { | |
133 | id = idE.attr("id"); | |
134 | Element dateE = idE.getElementsByTag("span").first(); | |
135 | if (dateE != null) { | |
136 | date = date(dateE.attr("data-epoch")); | |
137 | } | |
138 | } | |
139 | ||
140 | Element authorE = post.getElementsByClass("author").first(); | |
141 | if (authorE != null) { | |
142 | author = StringUtils.unhtml(authorE.text()).trim(); | |
143 | } | |
144 | ||
145 | Element titleE = post.getElementsByTag("h4").first(); | |
146 | if (titleE != null) { | |
147 | title = StringUtils.unhtml(titleE.text()).trim(); | |
148 | } | |
149 | ||
150 | Element contentE = post.getElementsByClass("body").first(); | |
151 | if (contentE != null) { | |
152 | for (String line : toLines(contentE, | |
153 | new BasicElementProcessor() { | |
154 | @Override | |
155 | public boolean ignoreNode(Node node) { | |
156 | // TODO: ignore headlines/pub | |
157 | if (node instanceof Element) { | |
158 | Element el = (Element)node; | |
159 | if ("h4".equals(el.tagName())) { | |
160 | return true; | |
161 | } | |
162 | } | |
163 | ||
164 | return false; | |
165 | } | |
166 | })) { | |
167 | content.add(line); | |
168 | } | |
169 | } | |
170 | ||
171 | comments.add(new Comment(id, author, title, date, content)); | |
172 | } | |
173 | } | |
174 | ||
175 | story.setFullContent(fullContent); | |
176 | story.setComments(comments); | |
177 | } finally { | |
178 | if (in != null) { | |
179 | in.close(); | |
180 | } | |
181 | } | |
182 | } | |
183 | ||
184 | // Return display date from epoch String, or "" if error | |
185 | private static String date(String epochString) { | |
186 | long epoch = 0; | |
187 | try { | |
188 | epoch = Long.parseLong(epochString); | |
189 | } catch (Exception e) { | |
190 | epoch = 0; | |
191 | } | |
192 | ||
193 | if (epoch > 0) { | |
194 | return new SimpleDateFormat("dd MMM YYYY").format(new Date( | |
195 | 1000 * epoch)); | |
196 | } | |
197 | ||
198 | return ""; | |
199 | } | |
200 | } |