Commit | Line | Data |
---|---|---|
eaaeae39 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
eaaeae39 | 4 | import java.net.URL; |
3e62b034 | 5 | import java.util.AbstractMap; |
eaaeae39 NR |
6 | import java.util.ArrayList; |
7 | import java.util.List; | |
3e62b034 | 8 | import java.util.Map.Entry; |
eaaeae39 | 9 | |
eaaeae39 NR |
10 | import org.jsoup.nodes.Document; |
11 | import org.jsoup.nodes.Element; | |
bb0d9eb2 | 12 | import org.jsoup.nodes.Node; |
3e62b034 | 13 | import org.jsoup.nodes.TextNode; |
eaaeae39 NR |
14 | |
15 | import be.nikiroo.gofetch.data.Comment; | |
16 | import be.nikiroo.gofetch.data.Story; | |
17 | ||
18 | /** | |
19 | * Support <a href='https://lwn.net/'>https://lwn.net/</a>. | |
20 | * | |
21 | * @author niki | |
22 | */ | |
23 | public class LWN extends BasicSupport { | |
24 | @Override | |
25 | public String getDescription() { | |
26 | return "LWN: Linux Weekly Newsletter"; | |
27 | } | |
28 | ||
29 | @Override | |
3e62b034 NR |
30 | public void fetch(Story story) throws IOException { |
31 | // Do not try the paid-for stories... | |
32 | if (!story.getTitle().startsWith("[$]")) { | |
33 | super.fetch(story); | |
34 | } else { | |
35 | String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; | |
36 | story.setFullContent(fullContent); | |
37 | story.setComments(new ArrayList<Comment>()); | |
38 | } | |
39 | } | |
eaaeae39 | 40 | |
3e62b034 NR |
41 | @Override |
42 | protected List<Entry<URL, String>> getUrls() throws IOException { | |
43 | List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>(); | |
44 | urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL( | |
45 | "https://lwn.net/"), "")); | |
46 | return urls; | |
47 | } | |
5c056aad | 48 | |
3e62b034 NR |
49 | @Override |
50 | protected List<Element> getArticles(Document doc) { | |
51 | return doc.getElementsByClass("pure-u-1"); | |
52 | } | |
eaaeae39 | 53 | |
3e62b034 NR |
54 | @Override |
55 | protected String getArticleId(Document doc, Element article) { | |
56 | return getArticleIntUrl(doc, article).replaceAll("[^0-9]", ""); | |
57 | } | |
5c056aad | 58 | |
3e62b034 NR |
59 | @Override |
60 | protected String getArticleTitle(Document doc, Element article) { | |
61 | Element title = article.getElementsByClass("Headline").first(); | |
62 | if (title != null) { | |
63 | return title.text(); | |
64 | } | |
b34d1f35 | 65 | |
3e62b034 NR |
66 | return ""; |
67 | } | |
b34d1f35 | 68 | |
3e62b034 NR |
69 | @Override |
70 | protected String getArticleAuthor(Document doc, Element article) { | |
71 | String author = ""; | |
72 | String details = getArticleDetailsReal(article); | |
73 | int pos = details.indexOf(" by "); | |
74 | if (pos >= 0) { | |
75 | author = details.substring(pos + " by ".length()).trim(); | |
76 | } | |
77 | ||
78 | return author; | |
79 | } | |
5c056aad | 80 | |
3e62b034 NR |
81 | @Override |
82 | protected String getArticleDate(Document doc, Element article) { | |
83 | String date = ""; | |
84 | String details = getArticleDetailsReal(article); | |
85 | int pos = details.indexOf(" Posted "); | |
86 | if (pos >= 0) { | |
87 | date = details.substring(pos + " Posted ".length()).trim(); | |
88 | pos = date.indexOf(" by "); | |
eaaeae39 | 89 | if (pos >= 0) { |
3e62b034 | 90 | date = date.substring(0, pos).trim(); |
eaaeae39 | 91 | } |
3e62b034 | 92 | } |
eaaeae39 | 93 | |
3e62b034 NR |
94 | return date; |
95 | } | |
eaaeae39 | 96 | |
3e62b034 NR |
97 | @Override |
98 | protected String getArticleCategory(Document doc, Element article, | |
99 | String currentCategory) { | |
100 | String categ = ""; | |
101 | String details = getArticleDetailsReal(article); | |
102 | int pos = details.indexOf("]"); | |
103 | if (pos >= 0) { | |
104 | categ = details.substring(1, pos).trim(); | |
eaaeae39 NR |
105 | } |
106 | ||
3e62b034 | 107 | return categ; |
eaaeae39 NR |
108 | } |
109 | ||
110 | @Override | |
3e62b034 NR |
111 | protected String getArticleDetails(Document doc, Element article) { |
112 | return ""; // We actually extract all the values | |
113 | } | |
bb0d9eb2 | 114 | |
3e62b034 NR |
115 | @Override |
116 | protected String getArticleIntUrl(Document doc, Element article) { | |
117 | String intUrl = ""; | |
118 | for (Element idElem : article.getElementsByTag("a")) { | |
119 | // Last link is the story link | |
120 | intUrl = idElem.absUrl("href"); | |
121 | int pos = intUrl.indexOf("#Comments"); | |
122 | if (pos >= 0) { | |
123 | intUrl = intUrl.substring(0, pos - 1); | |
bb0d9eb2 | 124 | } |
3e62b034 | 125 | } |
bb0d9eb2 | 126 | |
3e62b034 NR |
127 | return intUrl; |
128 | } | |
129 | ||
130 | @Override | |
131 | protected String getArticleExtUrl(Document doc, Element article) { | |
132 | return ""; | |
133 | } | |
134 | ||
135 | @Override | |
136 | protected String getArticleContent(Document doc, Element article) { | |
137 | Element listing = article.getElementsByClass("BlurbListing").first(); | |
138 | if (listing != null && listing.children().size() >= 2) { | |
139 | String content = ""; | |
140 | ||
141 | // All but the first and two last children | |
142 | for (int i = 1; i < listing.children().size() - 2; i++) { | |
143 | Element e = listing.children().get(i); | |
144 | content = content.trim() + " " + e.text().trim(); | |
bb0d9eb2 | 145 | } |
3e62b034 NR |
146 | |
147 | return content; | |
bb0d9eb2 NR |
148 | } |
149 | ||
3e62b034 NR |
150 | return ""; |
151 | } | |
152 | ||
153 | @Override | |
154 | protected Element getFullArticle(Document doc) { | |
155 | return doc.getElementsByClass("ArticleText").first(); | |
eaaeae39 NR |
156 | } |
157 | ||
3e62b034 NR |
158 | @Override |
159 | protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { | |
160 | return doc.getElementsByClass("lwn-u-1"); | |
161 | } | |
162 | ||
163 | @Override | |
164 | protected ElementProcessor getElementProcessorFullArticle() { | |
165 | return new BasicElementProcessor() { | |
166 | @Override | |
167 | public boolean ignoreNode(Node node) { | |
168 | if (node instanceof Element) { | |
169 | Element el = (Element) node; | |
170 | if ("Log in".equals(el.text().trim())) { | |
171 | return true; | |
172 | } | |
173 | } else if (node instanceof TextNode) { | |
174 | TextNode text = (TextNode) node; | |
175 | String t = text.text().trim(); | |
176 | if (t.equals("(") || t.equals("to post comments)")) { | |
177 | return true; | |
178 | } | |
eaaeae39 | 179 | } |
3e62b034 NR |
180 | |
181 | return false; | |
182 | } | |
183 | }; | |
184 | } | |
185 | ||
186 | @Override | |
187 | protected List<Element> getCommentCommentPosts(Document doc, | |
188 | Element container) { | |
189 | List<Element> commentElements = new ArrayList<Element>(); | |
190 | if (container != null) { | |
191 | for (Element possibleCommentElement : container.children()) { | |
192 | if (possibleCommentElement.hasClass("CommentBox")) { | |
193 | commentElements.add(possibleCommentElement); | |
194 | } else if (possibleCommentElement.hasClass("Comment")) { | |
195 | commentElements.add(possibleCommentElement); | |
bb0d9eb2 | 196 | } |
eaaeae39 NR |
197 | } |
198 | } | |
3e62b034 NR |
199 | |
200 | return commentElements; | |
eaaeae39 NR |
201 | } |
202 | ||
3e62b034 NR |
203 | @Override |
204 | protected String getCommentId(Element post) { | |
205 | return post.id(); | |
206 | } | |
eaaeae39 | 207 | |
3e62b034 NR |
208 | @Override |
209 | protected String getCommentAuthor(Element post) { | |
210 | Element detailsE = post.getElementsByClass("CommentPoster").first(); | |
211 | if (detailsE != null) { | |
212 | String details = detailsE.text(); | |
213 | ||
214 | int pos = details.lastIndexOf(" by "); | |
215 | if (pos >= 0) { | |
216 | details = details.substring(pos + " by ".length()).trim(); | |
bb0d9eb2 | 217 | |
3e62b034 NR |
218 | if (details.startsWith("Posted ")) { |
219 | return details.substring("Posted ".length()).trim(); | |
220 | } | |
bb0d9eb2 | 221 | } |
eaaeae39 NR |
222 | } |
223 | ||
3e62b034 NR |
224 | return ""; |
225 | } | |
226 | ||
227 | @Override | |
228 | protected String getCommentTitle(Element post) { | |
229 | Element title = post.getElementsByClass("CommentTitle").first(); | |
230 | if (title != null) { | |
231 | return title.text(); | |
232 | } | |
233 | ||
234 | return ""; | |
235 | } | |
236 | ||
237 | @Override | |
238 | protected String getCommentDate(Element post) { | |
239 | Element detailsE = post.getElementsByClass("CommentPoster").first(); | |
240 | if (detailsE != null) { | |
241 | String details = detailsE.text(); | |
242 | ||
243 | int pos = details.lastIndexOf(" by "); | |
244 | if (pos >= 0) { | |
245 | return details.substring(0, pos).trim(); | |
246 | } | |
eaaeae39 NR |
247 | } |
248 | ||
3e62b034 NR |
249 | return ""; |
250 | } | |
bb0d9eb2 | 251 | |
3e62b034 NR |
252 | @Override |
253 | protected Element getCommentContentElement(Element post) { | |
254 | return post.getElementsByClass("CommentBody").first(); | |
eaaeae39 NR |
255 | } |
256 | ||
3e62b034 NR |
257 | @Override |
258 | protected ElementProcessor getElementProcessorComment() { | |
259 | return new BasicElementProcessor() { | |
27008a87 NR |
260 | @Override |
261 | public String processText(String text) { | |
262 | while (text.startsWith(">")) { // comments | |
263 | text = text.substring(1).trim(); | |
264 | } | |
eaaeae39 | 265 | |
27008a87 NR |
266 | return text; |
267 | } | |
eaaeae39 | 268 | |
27008a87 NR |
269 | @Override |
270 | public boolean detectQuote(Node node) { | |
271 | if (node instanceof Element) { | |
272 | Element elementNode = (Element) node; | |
273 | if (elementNode.tagName().equals("blockquote") | |
274 | || elementNode.hasClass("QuotedText")) { | |
275 | return true; | |
276 | } | |
277 | } | |
278 | ||
279 | return false; | |
280 | } | |
eaaeae39 | 281 | |
27008a87 NR |
282 | @Override |
283 | public boolean ignoreNode(Node node) { | |
284 | if (node instanceof Element) { | |
285 | Element elementNode = (Element) node; | |
286 | if (elementNode.hasClass("CommentPoster")) { | |
287 | return true; | |
288 | } | |
289 | } | |
290 | ||
291 | return false; | |
292 | } | |
3e62b034 NR |
293 | }; |
294 | } | |
295 | ||
296 | private String getArticleDetailsReal(Element article) { | |
297 | Element listing = article.getElementsByClass("BlurbListing").first(); | |
298 | // Valid articles have 2+ listings | |
299 | if (listing != null && listing.children().size() >= 2) { | |
300 | return listing.children().get(0).text(); | |
301 | } | |
302 | ||
303 | return ""; | |
eaaeae39 NR |
304 | } |
305 | } |