Bug fixes + rework of BasicSupport
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.net.URL;
5 import java.util.AbstractMap;
6 import java.util.ArrayList;
7 import java.util.List;
8 import java.util.Map.Entry;
9
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.nodes.TextNode;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17
18 /**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23 public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
30 public void fetch(Story story) throws IOException {
31 // Do not try the paid-for stories...
32 if (!story.getTitle().startsWith("[$]")) {
33 super.fetch(story);
34 } else {
35 String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
36 story.setFullContent(fullContent);
37 story.setComments(new ArrayList<Comment>());
38 }
39 }
40
41 @Override
42 protected List<Entry<URL, String>> getUrls() throws IOException {
43 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
44 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
45 "https://lwn.net/"), ""));
46 return urls;
47 }
48
49 @Override
50 protected List<Element> getArticles(Document doc) {
51 return doc.getElementsByClass("pure-u-1");
52 }
53
54 @Override
55 protected String getArticleId(Document doc, Element article) {
56 return getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
57 }
58
59 @Override
60 protected String getArticleTitle(Document doc, Element article) {
61 Element title = article.getElementsByClass("Headline").first();
62 if (title != null) {
63 return title.text();
64 }
65
66 return "";
67 }
68
69 @Override
70 protected String getArticleAuthor(Document doc, Element article) {
71 String author = "";
72 String details = getArticleDetailsReal(article);
73 int pos = details.indexOf(" by ");
74 if (pos >= 0) {
75 author = details.substring(pos + " by ".length()).trim();
76 }
77
78 return author;
79 }
80
81 @Override
82 protected String getArticleDate(Document doc, Element article) {
83 String date = "";
84 String details = getArticleDetailsReal(article);
85 int pos = details.indexOf(" Posted ");
86 if (pos >= 0) {
87 date = details.substring(pos + " Posted ".length()).trim();
88 pos = date.indexOf(" by ");
89 if (pos >= 0) {
90 date = date.substring(0, pos).trim();
91 }
92 }
93
94 return date;
95 }
96
97 @Override
98 protected String getArticleCategory(Document doc, Element article,
99 String currentCategory) {
100 String categ = "";
101 String details = getArticleDetailsReal(article);
102 int pos = details.indexOf("]");
103 if (pos >= 0) {
104 categ = details.substring(1, pos).trim();
105 }
106
107 return categ;
108 }
109
110 @Override
111 protected String getArticleDetails(Document doc, Element article) {
112 return ""; // We actually extract all the values
113 }
114
115 @Override
116 protected String getArticleIntUrl(Document doc, Element article) {
117 String intUrl = "";
118 for (Element idElem : article.getElementsByTag("a")) {
119 // Last link is the story link
120 intUrl = idElem.absUrl("href");
121 int pos = intUrl.indexOf("#Comments");
122 if (pos >= 0) {
123 intUrl = intUrl.substring(0, pos - 1);
124 }
125 }
126
127 return intUrl;
128 }
129
130 @Override
131 protected String getArticleExtUrl(Document doc, Element article) {
132 return "";
133 }
134
135 @Override
136 protected String getArticleContent(Document doc, Element article) {
137 Element listing = article.getElementsByClass("BlurbListing").first();
138 if (listing != null && listing.children().size() >= 2) {
139 String content = "";
140
141 // All but the first and two last children
142 for (int i = 1; i < listing.children().size() - 2; i++) {
143 Element e = listing.children().get(i);
144 content = content.trim() + " " + e.text().trim();
145 }
146
147 return content;
148 }
149
150 return "";
151 }
152
153 @Override
154 protected Element getFullArticle(Document doc) {
155 return doc.getElementsByClass("ArticleText").first();
156 }
157
158 @Override
159 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
160 return doc.getElementsByClass("lwn-u-1");
161 }
162
163 @Override
164 protected ElementProcessor getElementProcessorFullArticle() {
165 return new BasicElementProcessor() {
166 @Override
167 public boolean ignoreNode(Node node) {
168 if (node instanceof Element) {
169 Element el = (Element) node;
170 if ("Log in".equals(el.text().trim())) {
171 return true;
172 }
173 } else if (node instanceof TextNode) {
174 TextNode text = (TextNode) node;
175 String t = text.text().trim();
176 if (t.equals("(") || t.equals("to post comments)")) {
177 return true;
178 }
179 }
180
181 return false;
182 }
183 };
184 }
185
186 @Override
187 protected List<Element> getCommentCommentPosts(Document doc,
188 Element container) {
189 List<Element> commentElements = new ArrayList<Element>();
190 if (container != null) {
191 for (Element possibleCommentElement : container.children()) {
192 if (possibleCommentElement.hasClass("CommentBox")) {
193 commentElements.add(possibleCommentElement);
194 } else if (possibleCommentElement.hasClass("Comment")) {
195 commentElements.add(possibleCommentElement);
196 }
197 }
198 }
199
200 return commentElements;
201 }
202
203 @Override
204 protected String getCommentId(Element post) {
205 return post.id();
206 }
207
208 @Override
209 protected String getCommentAuthor(Element post) {
210 Element detailsE = post.getElementsByClass("CommentPoster").first();
211 if (detailsE != null) {
212 String details = detailsE.text();
213
214 int pos = details.lastIndexOf(" by ");
215 if (pos >= 0) {
216 details = details.substring(pos + " by ".length()).trim();
217
218 if (details.startsWith("Posted ")) {
219 return details.substring("Posted ".length()).trim();
220 }
221 }
222 }
223
224 return "";
225 }
226
227 @Override
228 protected String getCommentTitle(Element post) {
229 Element title = post.getElementsByClass("CommentTitle").first();
230 if (title != null) {
231 return title.text();
232 }
233
234 return "";
235 }
236
237 @Override
238 protected String getCommentDate(Element post) {
239 Element detailsE = post.getElementsByClass("CommentPoster").first();
240 if (detailsE != null) {
241 String details = detailsE.text();
242
243 int pos = details.lastIndexOf(" by ");
244 if (pos >= 0) {
245 return details.substring(0, pos).trim();
246 }
247 }
248
249 return "";
250 }
251
252 @Override
253 protected Element getCommentContentElement(Element post) {
254 return post.getElementsByClass("CommentBody").first();
255 }
256
257 @Override
258 protected ElementProcessor getElementProcessorComment() {
259 return new BasicElementProcessor() {
260 @Override
261 public String processText(String text) {
262 while (text.startsWith(">")) { // comments
263 text = text.substring(1).trim();
264 }
265
266 return text;
267 }
268
269 @Override
270 public boolean detectQuote(Node node) {
271 if (node instanceof Element) {
272 Element elementNode = (Element) node;
273 if (elementNode.tagName().equals("blockquote")
274 || elementNode.hasClass("QuotedText")) {
275 return true;
276 }
277 }
278
279 return false;
280 }
281
282 @Override
283 public boolean ignoreNode(Node node) {
284 if (node instanceof Element) {
285 Element elementNode = (Element) node;
286 if (elementNode.hasClass("CommentPoster")) {
287 return true;
288 }
289 }
290
291 return false;
292 }
293 };
294 }
295
296 private String getArticleDetailsReal(Element article) {
297 Element listing = article.getElementsByClass("BlurbListing").first();
298 // Valid articles have 2+ listings
299 if (listing != null && listing.children().size() >= 2) {
300 return listing.children().get(0).text();
301 }
302
303 return "";
304 }
305 }