Commit | Line | Data |
---|---|---|
d28c4aac NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
4 | import java.io.InputStream; | |
5 | import java.net.URL; | |
3e62b034 | 6 | import java.util.AbstractMap; |
d28c4aac | 7 | import java.util.ArrayList; |
3e62b034 | 8 | import java.util.HashMap; |
d28c4aac | 9 | import java.util.List; |
3e62b034 NR |
10 | import java.util.Map; |
11 | import java.util.Map.Entry; | |
d28c4aac NR |
12 | |
13 | import org.jsoup.helper.DataUtil; | |
14 | import org.jsoup.nodes.Document; | |
15 | import org.jsoup.nodes.Element; | |
16 | import org.jsoup.nodes.Node; | |
d28c4aac NR |
17 | |
18 | import be.nikiroo.gofetch.data.Comment; | |
19 | import be.nikiroo.gofetch.data.Story; | |
d28c4aac | 20 | |
b34d1f35 NR |
21 | /** |
22 | * Support <a | |
23 | * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>. | |
24 | * | |
25 | * @author niki | |
26 | */ | |
d28c4aac | 27 | public class TheRegister extends BasicSupport { |
3e62b034 NR |
28 | private Map<String, String> commentReplies = new HashMap<String, String>(); |
29 | ||
d28c4aac NR |
30 | @Override |
31 | public String getDescription() { | |
32 | return "The Register: Biting the hand that feeds IT"; | |
33 | } | |
34 | ||
35 | @Override | |
3e62b034 NR |
36 | public void fetch(Story story) throws IOException { |
37 | super.fetch(story); | |
d28c4aac | 38 | |
3e62b034 NR |
39 | // Update comment replies |
40 | List<Comment> comments = new ArrayList<Comment>(); | |
41 | for (Comment comment : story.getComments()) { | |
42 | if (commentReplies.containsKey(comment.getId())) { | |
43 | String inReplyToId = commentReplies.get(comment.getId()); | |
44 | Comment inReplyTo = story.getCommentById(inReplyToId); | |
45 | if (inReplyTo != null) { | |
46 | inReplyTo.add(comment); | |
47 | } else { | |
48 | comments.add(comment); | |
49 | } | |
50 | } else { | |
51 | comments.add(comment); | |
d28c4aac | 52 | } |
3e62b034 NR |
53 | } |
54 | story.setComments(comments); | |
55 | } | |
d28c4aac | 56 | |
3e62b034 NR |
57 | @Override |
58 | protected List<Entry<URL, String>> getUrls() throws IOException { | |
59 | List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>(); | |
60 | urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL( | |
61 | "https://www.theregister.co.uk/"), "")); | |
62 | return urls; | |
63 | } | |
b34d1f35 | 64 | |
3e62b034 NR |
65 | @Override |
66 | protected List<Element> getArticles(Document doc) { | |
67 | return doc.getElementsByClass("story_link"); | |
68 | } | |
d28c4aac | 69 | |
3e62b034 NR |
70 | @Override |
71 | protected String getArticleId(Document doc, Element article) { | |
72 | return ""; | |
73 | } | |
d28c4aac | 74 | |
3e62b034 NR |
75 | @Override |
76 | protected String getArticleTitle(Document doc, Element article) { | |
77 | Element titleElement = article.getElementsByTag("h4").first(); | |
78 | if (titleElement != null) { | |
79 | return titleElement.text(); | |
80 | } | |
d28c4aac | 81 | |
3e62b034 NR |
82 | return ""; |
83 | } | |
84 | ||
85 | @Override | |
86 | protected String getArticleAuthor(Document doc, Element article) { | |
87 | return ""; | |
88 | } | |
d28c4aac | 89 | |
3e62b034 NR |
90 | @Override |
91 | protected String getArticleDate(Document doc, Element article) { | |
92 | Element dateElement = article.getElementsByClass("time_stamp").first(); | |
93 | if (dateElement != null) { | |
94 | return dateElement.attr("data-epoch"); | |
d28c4aac NR |
95 | } |
96 | ||
3e62b034 | 97 | return ""; |
d28c4aac NR |
98 | } |
99 | ||
100 | @Override | |
3e62b034 NR |
101 | protected String getArticleCategory(Document doc, Element article, |
102 | String currentCategory) { | |
103 | Element categElement = article.previousElementSibling(); | |
104 | if (categElement != null) { | |
105 | return categElement.text(); | |
106 | } | |
107 | ||
108 | return ""; | |
109 | } | |
110 | ||
111 | @Override | |
112 | protected String getArticleDetails(Document doc, Element article) { | |
113 | // We have some "details" but no content, so we switch them: | |
114 | return ""; | |
115 | } | |
116 | ||
117 | @Override | |
118 | protected String getArticleIntUrl(Document doc, Element article) { | |
119 | return article.absUrl("href"); | |
120 | } | |
121 | ||
122 | @Override | |
123 | protected String getArticleExtUrl(Document doc, Element article) { | |
124 | return ""; | |
125 | } | |
126 | ||
127 | @Override | |
128 | protected String getArticleContent(Document doc, Element article) { | |
129 | // We have some "details" but no content, so we switch them: | |
130 | Element detailsElement = article.getElementsByClass("standfirst") | |
131 | .first(); | |
132 | if (detailsElement != null) { | |
133 | return detailsElement.text(); | |
134 | } | |
135 | ||
136 | return ""; | |
137 | } | |
138 | ||
139 | @Override | |
140 | protected Element getFullArticle(Document doc) { | |
141 | return doc.getElementById("body"); | |
142 | } | |
d28c4aac | 143 | |
3e62b034 NR |
144 | @Override |
145 | protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { | |
146 | List<Element> commentElements = new ArrayList<Element>(); | |
147 | ||
148 | // Get comments URL then parse it | |
d28c4aac | 149 | try { |
3e62b034 NR |
150 | URL url = new URL("https://forums.theregister.co.uk/forum/1" |
151 | + intUrl.getPath()); | |
152 | InputStream in = downloader.open(url); | |
153 | try { | |
154 | doc = DataUtil.load(in, "UTF-8", url.toString()); | |
155 | Element posts = doc.getElementById("forum_posts"); | |
156 | if (posts != null) { | |
157 | for (Element post : posts.getElementsByClass("post")) { | |
158 | commentElements.add(post); | |
159 | Element inReplyTo = post.getElementsByClass( | |
160 | "in-reply-to").first(); | |
161 | if (inReplyTo != null) { | |
162 | String parentId = inReplyTo.absUrl("href"); | |
163 | if (parentId != null && parentId.contains("/")) { | |
164 | int i = parentId.lastIndexOf('/'); | |
165 | parentId = parentId.substring(i + 1); | |
166 | ||
167 | commentReplies | |
168 | .put(getCommentId(post), parentId); | |
169 | } | |
170 | } | |
171 | } | |
d28c4aac | 172 | } |
3e62b034 NR |
173 | } finally { |
174 | in.close(); | |
175 | } | |
176 | } catch (IOException e) { | |
177 | } | |
178 | ||
179 | return commentElements; | |
180 | } | |
181 | ||
182 | @Override | |
183 | protected ElementProcessor getElementProcessorFullArticle() { | |
184 | return new BasicElementProcessor(); | |
185 | } | |
186 | ||
187 | @Override | |
188 | protected List<Element> getCommentCommentPosts(Document doc, | |
189 | Element container) { | |
190 | return null; | |
191 | } | |
d28c4aac | 192 | |
3e62b034 NR |
193 | @Override |
194 | protected String getCommentId(Element post) { | |
195 | Element idE = post.getElementsByTag("a").first(); | |
196 | if (idE != null) { | |
197 | String id = idE.attr("id"); | |
198 | if (id.startsWith("c_")) { | |
199 | id = id.substring(2); | |
d28c4aac NR |
200 | } |
201 | ||
3e62b034 NR |
202 | return id; |
203 | } | |
7686553a | 204 | |
3e62b034 NR |
205 | return ""; |
206 | } | |
d28c4aac | 207 | |
3e62b034 NR |
208 | @Override |
209 | protected String getCommentAuthor(Element post) { | |
210 | Element author = post.getElementsByClass("author").first(); | |
211 | if (author != null) { | |
212 | return author.text(); | |
213 | } | |
d28c4aac | 214 | |
3e62b034 NR |
215 | return ""; |
216 | } | |
d28c4aac | 217 | |
3e62b034 NR |
218 | @Override |
219 | protected String getCommentTitle(Element post) { | |
220 | Element title = post.getElementsByTag("h4").first(); | |
221 | if (title != null) { | |
222 | return title.text(); | |
223 | } | |
d28c4aac | 224 | |
3e62b034 NR |
225 | return ""; |
226 | } | |
227 | ||
228 | @Override | |
229 | protected String getCommentDate(Element post) { | |
230 | Element id = post.getElementsByTag("a").first(); | |
231 | if (id != null) { | |
232 | Element date = id.getElementsByTag("span").first(); | |
233 | if (date != null) { | |
234 | return date.attr("data-epoch"); | |
235 | } | |
236 | } | |
237 | ||
238 | return ""; | |
239 | } | |
240 | ||
241 | @Override | |
242 | protected Element getCommentContentElement(Element post) { | |
243 | return post.getElementsByClass("body").first(); | |
244 | } | |
7686553a | 245 | |
3e62b034 NR |
246 | @Override |
247 | protected ElementProcessor getElementProcessorComment() { | |
248 | return new BasicElementProcessor() { | |
249 | @Override | |
250 | public boolean ignoreNode(Node node) { | |
251 | // Remove the comment title (which has | |
252 | // already been processed earlier) | |
253 | if (node instanceof Element) { | |
254 | Element el = (Element) node; | |
255 | if ("h4".equals(el.tagName())) { | |
256 | return true; | |
7686553a | 257 | } |
d28c4aac | 258 | } |
3e62b034 NR |
259 | |
260 | return false; | |
d28c4aac | 261 | } |
3e62b034 | 262 | }; |
d28c4aac | 263 | } |
d28c4aac | 264 | } |