Fix layout issues in getContent() text
[gofetch.git] / src / be / nikiroo / gofetch / support / TheRegister.java
... / ...
CommitLineData
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
6import java.util.AbstractMap;
7import java.util.ArrayList;
8import java.util.HashMap;
9import java.util.List;
10import java.util.Map;
11import java.util.Map.Entry;
12
13import org.jsoup.helper.DataUtil;
14import org.jsoup.nodes.Document;
15import org.jsoup.nodes.Element;
16import org.jsoup.nodes.Node;
17
18import be.nikiroo.gofetch.data.Comment;
19import be.nikiroo.gofetch.data.Story;
20
21/**
22 * Support <a
23 * href="https://www.theregister.co.uk/">https://www.theregister.co.uk/</a>.
24 *
25 * @author niki
26 */
27public class TheRegister extends BasicSupport {
28 private Map<String, String> commentReplies = new HashMap<String, String>();
29
30 @Override
31 public String getDescription() {
32 return "The Register: Biting the hand that feeds IT";
33 }
34
35 @Override
36 public void fetch(Story story) throws IOException {
37 super.fetch(story);
38
39 // Update comment replies
40 List<Comment> comments = new ArrayList<Comment>();
41 for (Comment comment : story.getComments()) {
42 if (commentReplies.containsKey(comment.getId())) {
43 String inReplyToId = commentReplies.get(comment.getId());
44 Comment inReplyTo = story.getCommentById(inReplyToId);
45 if (inReplyTo != null) {
46 inReplyTo.add(comment);
47 } else {
48 comments.add(comment);
49 }
50 } else {
51 comments.add(comment);
52 }
53 }
54 story.setComments(comments);
55 }
56
57 @Override
58 protected List<Entry<URL, String>> getUrls() throws IOException {
59 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
60 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
61 "https://www.theregister.co.uk/"), ""));
62 return urls;
63 }
64
65 @Override
66 protected List<Element> getArticles(Document doc) {
67 return doc.getElementsByClass("story_link");
68 }
69
70 @Override
71 protected String getArticleId(Document doc, Element article) {
72 return "";
73 }
74
75 @Override
76 protected String getArticleTitle(Document doc, Element article) {
77 Element titleElement = article.getElementsByTag("h4").first();
78 if (titleElement != null) {
79 return titleElement.text();
80 }
81
82 return "";
83 }
84
85 @Override
86 protected String getArticleAuthor(Document doc, Element article) {
87 return "";
88 }
89
90 @Override
91 protected String getArticleDate(Document doc, Element article) {
92 Element dateElement = article.getElementsByClass("time_stamp").first();
93 if (dateElement != null) {
94 return dateElement.attr("data-epoch");
95 }
96
97 return "";
98 }
99
100 @Override
101 protected String getArticleCategory(Document doc, Element article,
102 String currentCategory) {
103 Element categElement = article.previousElementSibling();
104 if (categElement != null) {
105 return categElement.text();
106 }
107
108 return "";
109 }
110
111 @Override
112 protected String getArticleDetails(Document doc, Element article) {
113 // We have some "details" but no content, so we switch them:
114 return "";
115 }
116
117 @Override
118 protected String getArticleIntUrl(Document doc, Element article) {
119 return article.absUrl("href");
120 }
121
122 @Override
123 protected String getArticleExtUrl(Document doc, Element article) {
124 return "";
125 }
126
127 @Override
128 protected String getArticleContent(Document doc, Element article) {
129 // We have some "details" but no content, so we switch them:
130 Element detailsElement = article.getElementsByClass("standfirst")
131 .first();
132 if (detailsElement != null) {
133 return getArticleText(detailsElement);
134 }
135
136 return "";
137 }
138
139 @Override
140 protected Element getFullArticle(Document doc) {
141 return doc.getElementById("body");
142 }
143
144 @Override
145 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
146 List<Element> commentElements = new ArrayList<Element>();
147
148 // Get comments URL then parse it
149 try {
150 URL url = new URL("https://forums.theregister.co.uk/forum/1"
151 + intUrl.getPath());
152 InputStream in = open(url);
153 try {
154 doc = DataUtil.load(in, "UTF-8", url.toString());
155 Element posts = doc.getElementById("forum_posts");
156 if (posts != null) {
157 for (Element post : posts.getElementsByClass("post")) {
158 commentElements.add(post);
159 Element inReplyTo = post.getElementsByClass(
160 "in-reply-to").first();
161 if (inReplyTo != null) {
162 String parentId = inReplyTo.absUrl("href");
163 if (parentId != null && parentId.contains("/")) {
164 int i = parentId.lastIndexOf('/');
165 parentId = parentId.substring(i + 1);
166
167 commentReplies
168 .put(getCommentId(post), parentId);
169 }
170 }
171 }
172 }
173 } finally {
174 in.close();
175 }
176 } catch (IOException e) {
177 }
178
179 return commentElements;
180 }
181
182 @Override
183 protected ElementProcessor getElementProcessorFullArticle() {
184 return new BasicElementProcessor();
185 }
186
187 @Override
188 protected List<Element> getCommentCommentPosts(Document doc,
189 Element container) {
190 return null;
191 }
192
193 @Override
194 protected String getCommentId(Element post) {
195 Element idE = post.getElementsByTag("a").first();
196 if (idE != null) {
197 String id = idE.attr("id");
198 if (id.startsWith("c_")) {
199 id = id.substring(2);
200 }
201
202 return id;
203 }
204
205 return "";
206 }
207
208 @Override
209 protected String getCommentAuthor(Element post) {
210 Element author = post.getElementsByClass("author").first();
211 if (author != null) {
212 return author.text();
213 }
214
215 return "";
216 }
217
218 @Override
219 protected String getCommentTitle(Element post) {
220 Element title = post.getElementsByTag("h4").first();
221 if (title != null) {
222 return title.text();
223 }
224
225 return "";
226 }
227
228 @Override
229 protected String getCommentDate(Element post) {
230 Element id = post.getElementsByTag("a").first();
231 if (id != null) {
232 Element date = id.getElementsByTag("span").first();
233 if (date != null) {
234 return date.attr("data-epoch");
235 }
236 }
237
238 return "";
239 }
240
241 @Override
242 protected Element getCommentContentElement(Element post) {
243 return post.getElementsByClass("body").first();
244 }
245
246 @Override
247 protected ElementProcessor getElementProcessorComment() {
248 return new BasicElementProcessor() {
249 @Override
250 public boolean ignoreNode(Node node) {
251 // Remove the comment title (which has
252 // already been processed earlier)
253 if (node instanceof Element) {
254 Element el = (Element) node;
255 if ("h4".equals(el.tagName())) {
256 return true;
257 }
258 }
259
260 return false;
261 }
262 };
263 }
264}