Commit | Line | Data |
---|---|---|
eaaeae39 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
eaaeae39 | 4 | import java.net.URL; |
3e62b034 | 5 | import java.util.AbstractMap; |
eaaeae39 NR |
6 | import java.util.ArrayList; |
7 | import java.util.List; | |
3e62b034 | 8 | import java.util.Map.Entry; |
eaaeae39 | 9 | |
eaaeae39 NR |
10 | import org.jsoup.nodes.Document; |
11 | import org.jsoup.nodes.Element; | |
bb0d9eb2 | 12 | import org.jsoup.nodes.Node; |
3e62b034 | 13 | import org.jsoup.nodes.TextNode; |
eaaeae39 NR |
14 | |
15 | import be.nikiroo.gofetch.data.Comment; | |
16 | import be.nikiroo.gofetch.data.Story; | |
17 | ||
18 | /** | |
19 | * Support <a href='https://lwn.net/'>https://lwn.net/</a>. | |
20 | * | |
21 | * @author niki | |
22 | */ | |
23 | public class LWN extends BasicSupport { | |
24 | @Override | |
25 | public String getDescription() { | |
26 | return "LWN: Linux Weekly Newsletter"; | |
27 | } | |
28 | ||
29 | @Override | |
3e62b034 NR |
30 | public void fetch(Story story) throws IOException { |
31 | // Do not try the paid-for stories... | |
32 | if (!story.getTitle().startsWith("[$]")) { | |
33 | super.fetch(story); | |
34 | } else { | |
35 | String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; | |
36 | story.setFullContent(fullContent); | |
37 | story.setComments(new ArrayList<Comment>()); | |
38 | } | |
39 | } | |
eaaeae39 | 40 | |
3e62b034 NR |
41 | @Override |
42 | protected List<Entry<URL, String>> getUrls() throws IOException { | |
43 | List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>(); | |
44 | urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL( | |
45 | "https://lwn.net/"), "")); | |
46 | return urls; | |
47 | } | |
5c056aad | 48 | |
3e62b034 NR |
49 | @Override |
50 | protected List<Element> getArticles(Document doc) { | |
51 | return doc.getElementsByClass("pure-u-1"); | |
52 | } | |
eaaeae39 | 53 | |
3e62b034 NR |
54 | @Override |
55 | protected String getArticleId(Document doc, Element article) { | |
64a785f6 NR |
56 | String id = getArticleIntUrl(doc, article).replaceAll("[^0-9]", ""); |
57 | while (id.length() < 10) { | |
58 | id = "0" + id; | |
59 | } | |
60 | ||
61 | return id; | |
3e62b034 | 62 | } |
5c056aad | 63 | |
3e62b034 NR |
64 | @Override |
65 | protected String getArticleTitle(Document doc, Element article) { | |
66 | Element title = article.getElementsByClass("Headline").first(); | |
67 | if (title != null) { | |
68 | return title.text(); | |
69 | } | |
b34d1f35 | 70 | |
3e62b034 NR |
71 | return ""; |
72 | } | |
b34d1f35 | 73 | |
3e62b034 NR |
74 | @Override |
75 | protected String getArticleAuthor(Document doc, Element article) { | |
76 | String author = ""; | |
77 | String details = getArticleDetailsReal(article); | |
78 | int pos = details.indexOf(" by "); | |
79 | if (pos >= 0) { | |
80 | author = details.substring(pos + " by ".length()).trim(); | |
81 | } | |
82 | ||
83 | return author; | |
84 | } | |
5c056aad | 85 | |
3e62b034 NR |
86 | @Override |
87 | protected String getArticleDate(Document doc, Element article) { | |
88 | String date = ""; | |
89 | String details = getArticleDetailsReal(article); | |
90 | int pos = details.indexOf(" Posted "); | |
91 | if (pos >= 0) { | |
92 | date = details.substring(pos + " Posted ".length()).trim(); | |
93 | pos = date.indexOf(" by "); | |
eaaeae39 | 94 | if (pos >= 0) { |
3e62b034 | 95 | date = date.substring(0, pos).trim(); |
eaaeae39 | 96 | } |
3e62b034 | 97 | } |
eaaeae39 | 98 | |
3e62b034 NR |
99 | return date; |
100 | } | |
eaaeae39 | 101 | |
3e62b034 NR |
102 | @Override |
103 | protected String getArticleCategory(Document doc, Element article, | |
104 | String currentCategory) { | |
105 | String categ = ""; | |
106 | String details = getArticleDetailsReal(article); | |
107 | int pos = details.indexOf("]"); | |
108 | if (pos >= 0) { | |
109 | categ = details.substring(1, pos).trim(); | |
eaaeae39 NR |
110 | } |
111 | ||
3e62b034 | 112 | return categ; |
eaaeae39 NR |
113 | } |
114 | ||
115 | @Override | |
3e62b034 NR |
116 | protected String getArticleDetails(Document doc, Element article) { |
117 | return ""; // We actually extract all the values | |
118 | } | |
bb0d9eb2 | 119 | |
3e62b034 NR |
120 | @Override |
121 | protected String getArticleIntUrl(Document doc, Element article) { | |
122 | String intUrl = ""; | |
123 | for (Element idElem : article.getElementsByTag("a")) { | |
124 | // Last link is the story link | |
125 | intUrl = idElem.absUrl("href"); | |
126 | int pos = intUrl.indexOf("#Comments"); | |
127 | if (pos >= 0) { | |
128 | intUrl = intUrl.substring(0, pos - 1); | |
bb0d9eb2 | 129 | } |
3e62b034 | 130 | } |
bb0d9eb2 | 131 | |
3e62b034 NR |
132 | return intUrl; |
133 | } | |
134 | ||
135 | @Override | |
136 | protected String getArticleExtUrl(Document doc, Element article) { | |
137 | return ""; | |
138 | } | |
139 | ||
140 | @Override | |
141 | protected String getArticleContent(Document doc, Element article) { | |
142 | Element listing = article.getElementsByClass("BlurbListing").first(); | |
143 | if (listing != null && listing.children().size() >= 2) { | |
144 | String content = ""; | |
145 | ||
146 | // All but the first and two last children | |
147 | for (int i = 1; i < listing.children().size() - 2; i++) { | |
148 | Element e = listing.children().get(i); | |
149 | content = content.trim() + " " + e.text().trim(); | |
bb0d9eb2 | 150 | } |
3e62b034 NR |
151 | |
152 | return content; | |
bb0d9eb2 NR |
153 | } |
154 | ||
3e62b034 NR |
155 | return ""; |
156 | } | |
157 | ||
158 | @Override | |
159 | protected Element getFullArticle(Document doc) { | |
160 | return doc.getElementsByClass("ArticleText").first(); | |
eaaeae39 NR |
161 | } |
162 | ||
3e62b034 NR |
163 | @Override |
164 | protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { | |
165 | return doc.getElementsByClass("lwn-u-1"); | |
166 | } | |
167 | ||
168 | @Override | |
169 | protected ElementProcessor getElementProcessorFullArticle() { | |
170 | return new BasicElementProcessor() { | |
171 | @Override | |
172 | public boolean ignoreNode(Node node) { | |
173 | if (node instanceof Element) { | |
174 | Element el = (Element) node; | |
175 | if ("Log in".equals(el.text().trim())) { | |
176 | return true; | |
177 | } | |
178 | } else if (node instanceof TextNode) { | |
179 | TextNode text = (TextNode) node; | |
180 | String t = text.text().trim(); | |
181 | if (t.equals("(") || t.equals("to post comments)")) { | |
182 | return true; | |
183 | } | |
eaaeae39 | 184 | } |
3e62b034 NR |
185 | |
186 | return false; | |
187 | } | |
188 | }; | |
189 | } | |
190 | ||
191 | @Override | |
192 | protected List<Element> getCommentCommentPosts(Document doc, | |
193 | Element container) { | |
194 | List<Element> commentElements = new ArrayList<Element>(); | |
195 | if (container != null) { | |
196 | for (Element possibleCommentElement : container.children()) { | |
197 | if (possibleCommentElement.hasClass("CommentBox")) { | |
198 | commentElements.add(possibleCommentElement); | |
199 | } else if (possibleCommentElement.hasClass("Comment")) { | |
200 | commentElements.add(possibleCommentElement); | |
bb0d9eb2 | 201 | } |
eaaeae39 NR |
202 | } |
203 | } | |
3e62b034 NR |
204 | |
205 | return commentElements; | |
eaaeae39 NR |
206 | } |
207 | ||
3e62b034 NR |
208 | @Override |
209 | protected String getCommentId(Element post) { | |
210 | return post.id(); | |
211 | } | |
eaaeae39 | 212 | |
3e62b034 NR |
213 | @Override |
214 | protected String getCommentAuthor(Element post) { | |
215 | Element detailsE = post.getElementsByClass("CommentPoster").first(); | |
216 | if (detailsE != null) { | |
217 | String details = detailsE.text(); | |
218 | ||
219 | int pos = details.lastIndexOf(" by "); | |
220 | if (pos >= 0) { | |
221 | details = details.substring(pos + " by ".length()).trim(); | |
bb0d9eb2 | 222 | |
3e62b034 NR |
223 | if (details.startsWith("Posted ")) { |
224 | return details.substring("Posted ".length()).trim(); | |
225 | } | |
bb0d9eb2 | 226 | } |
eaaeae39 NR |
227 | } |
228 | ||
3e62b034 NR |
229 | return ""; |
230 | } | |
231 | ||
232 | @Override | |
233 | protected String getCommentTitle(Element post) { | |
234 | Element title = post.getElementsByClass("CommentTitle").first(); | |
235 | if (title != null) { | |
236 | return title.text(); | |
237 | } | |
238 | ||
239 | return ""; | |
240 | } | |
241 | ||
242 | @Override | |
243 | protected String getCommentDate(Element post) { | |
244 | Element detailsE = post.getElementsByClass("CommentPoster").first(); | |
245 | if (detailsE != null) { | |
246 | String details = detailsE.text(); | |
247 | ||
248 | int pos = details.lastIndexOf(" by "); | |
249 | if (pos >= 0) { | |
250 | return details.substring(0, pos).trim(); | |
251 | } | |
eaaeae39 NR |
252 | } |
253 | ||
3e62b034 NR |
254 | return ""; |
255 | } | |
bb0d9eb2 | 256 | |
3e62b034 NR |
257 | @Override |
258 | protected Element getCommentContentElement(Element post) { | |
259 | return post.getElementsByClass("CommentBody").first(); | |
eaaeae39 NR |
260 | } |
261 | ||
3e62b034 NR |
262 | @Override |
263 | protected ElementProcessor getElementProcessorComment() { | |
264 | return new BasicElementProcessor() { | |
27008a87 NR |
265 | @Override |
266 | public String processText(String text) { | |
267 | while (text.startsWith(">")) { // comments | |
268 | text = text.substring(1).trim(); | |
269 | } | |
eaaeae39 | 270 | |
27008a87 NR |
271 | return text; |
272 | } | |
eaaeae39 | 273 | |
27008a87 NR |
274 | @Override |
275 | public boolean detectQuote(Node node) { | |
276 | if (node instanceof Element) { | |
277 | Element elementNode = (Element) node; | |
278 | if (elementNode.tagName().equals("blockquote") | |
279 | || elementNode.hasClass("QuotedText")) { | |
280 | return true; | |
281 | } | |
282 | } | |
283 | ||
284 | return false; | |
285 | } | |
eaaeae39 | 286 | |
27008a87 NR |
287 | @Override |
288 | public boolean ignoreNode(Node node) { | |
289 | if (node instanceof Element) { | |
290 | Element elementNode = (Element) node; | |
291 | if (elementNode.hasClass("CommentPoster")) { | |
292 | return true; | |
293 | } | |
294 | } | |
295 | ||
296 | return false; | |
297 | } | |
3e62b034 NR |
298 | }; |
299 | } | |
300 | ||
301 | private String getArticleDetailsReal(Element article) { | |
302 | Element listing = article.getElementsByClass("BlurbListing").first(); | |
303 | // Valid articles have 2+ listings | |
304 | if (listing != null && listing.children().size() >= 2) { | |
305 | return listing.children().get(0).text(); | |
306 | } | |
307 | ||
308 | return ""; | |
eaaeae39 NR |
309 | } |
310 | } |