eac12e5c239c37aa921d2945333d1b7fbc4c4e9a
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.net.URL;
5 import java.util.AbstractMap;
6 import java.util.ArrayList;
7 import java.util.List;
8 import java.util.Map.Entry;
9
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.nodes.TextNode;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17
18 /**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23 public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
30 public void fetch(Story story) throws IOException {
31 // Do not try the paid-for stories...
32 if (!story.getTitle().startsWith("[$]")) {
33 super.fetch(story);
34 } else {
35 String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
36 story.setFullContent(fullContent);
37 story.setComments(new ArrayList<Comment>());
38 }
39 }
40
41 @Override
42 protected List<Entry<URL, String>> getUrls() throws IOException {
43 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
44 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
45 "https://lwn.net/"), ""));
46 return urls;
47 }
48
49 @Override
50 protected List<Element> getArticles(Document doc) {
51 return doc.getElementsByClass("pure-u-1");
52 }
53
54 @Override
55 protected String getArticleId(Document doc, Element article) {
56 String id = getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
57 while (id.length() < 10) {
58 id = "0" + id;
59 }
60
61 return id;
62 }
63
64 @Override
65 protected String getArticleTitle(Document doc, Element article) {
66 Element title = article.getElementsByClass("Headline").first();
67 if (title != null) {
68 return title.text();
69 }
70
71 return "";
72 }
73
74 @Override
75 protected String getArticleAuthor(Document doc, Element article) {
76 String author = "";
77 String details = getArticleDetailsReal(article);
78 int pos = details.indexOf(" by ");
79 if (pos >= 0) {
80 author = details.substring(pos + " by ".length()).trim();
81 }
82
83 return author;
84 }
85
86 @Override
87 protected String getArticleDate(Document doc, Element article) {
88 String date = "";
89 String details = getArticleDetailsReal(article);
90 int pos = details.indexOf(" Posted ");
91 if (pos >= 0) {
92 date = details.substring(pos + " Posted ".length()).trim();
93 pos = date.indexOf(" by ");
94 if (pos >= 0) {
95 date = date.substring(0, pos).trim();
96 }
97 }
98
99 return date;
100 }
101
102 @Override
103 protected String getArticleCategory(Document doc, Element article,
104 String currentCategory) {
105 String categ = "";
106 String details = getArticleDetailsReal(article);
107 int pos = details.indexOf("]");
108 if (pos >= 0) {
109 categ = details.substring(1, pos).trim();
110 }
111
112 return categ;
113 }
114
115 @Override
116 protected String getArticleDetails(Document doc, Element article) {
117 return ""; // We actually extract all the values
118 }
119
120 @Override
121 protected String getArticleIntUrl(Document doc, Element article) {
122 String intUrl = "";
123 for (Element idElem : article.getElementsByTag("a")) {
124 // Last link is the story link
125 intUrl = idElem.absUrl("href");
126 int pos = intUrl.indexOf("#Comments");
127 if (pos >= 0) {
128 intUrl = intUrl.substring(0, pos - 1);
129 }
130 }
131
132 return intUrl;
133 }
134
135 @Override
136 protected String getArticleExtUrl(Document doc, Element article) {
137 return "";
138 }
139
140 @Override
141 protected String getArticleContent(Document doc, Element article) {
142 Element listing = article.getElementsByClass("BlurbListing").first();
143 if (listing != null && listing.children().size() >= 2) {
144 String content = "";
145
146 // All but the first and two last children
147 for (int i = 1; i < listing.children().size() - 2; i++) {
148 Element e = listing.children().get(i);
149 content = content.trim() + " " + e.text().trim();
150 }
151
152 return content;
153 }
154
155 return "";
156 }
157
158 @Override
159 protected Element getFullArticle(Document doc) {
160 return doc.getElementsByClass("ArticleText").first();
161 }
162
163 @Override
164 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
165 return doc.getElementsByClass("lwn-u-1");
166 }
167
168 @Override
169 protected ElementProcessor getElementProcessorFullArticle() {
170 return new BasicElementProcessor() {
171 @Override
172 public boolean ignoreNode(Node node) {
173 if (node instanceof Element) {
174 Element el = (Element) node;
175 if ("Log in".equals(el.text().trim())) {
176 return true;
177 }
178 } else if (node instanceof TextNode) {
179 TextNode text = (TextNode) node;
180 String t = text.text().trim();
181 if (t.equals("(") || t.equals("to post comments)")) {
182 return true;
183 }
184 }
185
186 return false;
187 }
188 };
189 }
190
191 @Override
192 protected List<Element> getCommentCommentPosts(Document doc,
193 Element container) {
194 List<Element> commentElements = new ArrayList<Element>();
195 if (container != null) {
196 for (Element possibleCommentElement : container.children()) {
197 if (possibleCommentElement.hasClass("CommentBox")) {
198 commentElements.add(possibleCommentElement);
199 } else if (possibleCommentElement.hasClass("Comment")) {
200 commentElements.add(possibleCommentElement);
201 }
202 }
203 }
204
205 return commentElements;
206 }
207
208 @Override
209 protected String getCommentId(Element post) {
210 return post.id();
211 }
212
213 @Override
214 protected String getCommentAuthor(Element post) {
215 Element detailsE = post.getElementsByClass("CommentPoster").first();
216 if (detailsE != null) {
217 String details = detailsE.text();
218
219 int pos = details.lastIndexOf(" by ");
220 if (pos >= 0) {
221 details = details.substring(pos + " by ".length()).trim();
222
223 if (details.startsWith("Posted ")) {
224 return details.substring("Posted ".length()).trim();
225 }
226 }
227 }
228
229 return "";
230 }
231
232 @Override
233 protected String getCommentTitle(Element post) {
234 Element title = post.getElementsByClass("CommentTitle").first();
235 if (title != null) {
236 return title.text();
237 }
238
239 return "";
240 }
241
242 @Override
243 protected String getCommentDate(Element post) {
244 Element detailsE = post.getElementsByClass("CommentPoster").first();
245 if (detailsE != null) {
246 String details = detailsE.text();
247
248 int pos = details.lastIndexOf(" by ");
249 if (pos >= 0) {
250 return details.substring(0, pos).trim();
251 }
252 }
253
254 return "";
255 }
256
257 @Override
258 protected Element getCommentContentElement(Element post) {
259 return post.getElementsByClass("CommentBody").first();
260 }
261
262 @Override
263 protected ElementProcessor getElementProcessorComment() {
264 return new BasicElementProcessor() {
265 @Override
266 public String processText(String text) {
267 while (text.startsWith(">")) { // comments
268 text = text.substring(1).trim();
269 }
270
271 return text;
272 }
273
274 @Override
275 public boolean detectQuote(Node node) {
276 if (node instanceof Element) {
277 Element elementNode = (Element) node;
278 if (elementNode.tagName().equals("blockquote")
279 || elementNode.hasClass("QuotedText")) {
280 return true;
281 }
282 }
283
284 return false;
285 }
286
287 @Override
288 public boolean ignoreNode(Node node) {
289 if (node instanceof Element) {
290 Element elementNode = (Element) node;
291 if (elementNode.hasClass("CommentPoster")) {
292 return true;
293 }
294 }
295
296 return false;
297 }
298 };
299 }
300
301 private String getArticleDetailsReal(Element article) {
302 Element listing = article.getElementsByClass("BlurbListing").first();
303 // Valid articles have 2+ listings
304 if (listing != null && listing.children().size() >= 2) {
305 return listing.children().get(0).text();
306 }
307
308 return "";
309 }
310 }