Fix some IDs and update utils
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
CommitLineData
eaaeae39
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
eaaeae39 4import java.net.URL;
3e62b034 5import java.util.AbstractMap;
eaaeae39
NR
6import java.util.ArrayList;
7import java.util.List;
3e62b034 8import java.util.Map.Entry;
eaaeae39 9
eaaeae39
NR
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
bb0d9eb2 12import org.jsoup.nodes.Node;
3e62b034 13import org.jsoup.nodes.TextNode;
eaaeae39
NR
14
15import be.nikiroo.gofetch.data.Comment;
16import be.nikiroo.gofetch.data.Story;
17
18/**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
3e62b034
NR
30 public void fetch(Story story) throws IOException {
31 // Do not try the paid-for stories...
32 if (!story.getTitle().startsWith("[$]")) {
33 super.fetch(story);
34 } else {
35 String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
36 story.setFullContent(fullContent);
37 story.setComments(new ArrayList<Comment>());
38 }
39 }
eaaeae39 40
3e62b034
NR
41 @Override
42 protected List<Entry<URL, String>> getUrls() throws IOException {
43 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
44 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
45 "https://lwn.net/"), ""));
46 return urls;
47 }
5c056aad 48
3e62b034
NR
49 @Override
50 protected List<Element> getArticles(Document doc) {
51 return doc.getElementsByClass("pure-u-1");
52 }
eaaeae39 53
3e62b034
NR
54 @Override
55 protected String getArticleId(Document doc, Element article) {
64a785f6
NR
56 String id = getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
57 while (id.length() < 10) {
58 id = "0" + id;
59 }
60
61 return id;
3e62b034 62 }
5c056aad 63
3e62b034
NR
64 @Override
65 protected String getArticleTitle(Document doc, Element article) {
66 Element title = article.getElementsByClass("Headline").first();
67 if (title != null) {
68 return title.text();
69 }
b34d1f35 70
3e62b034
NR
71 return "";
72 }
b34d1f35 73
3e62b034
NR
74 @Override
75 protected String getArticleAuthor(Document doc, Element article) {
76 String author = "";
77 String details = getArticleDetailsReal(article);
78 int pos = details.indexOf(" by ");
79 if (pos >= 0) {
80 author = details.substring(pos + " by ".length()).trim();
81 }
82
83 return author;
84 }
5c056aad 85
3e62b034
NR
86 @Override
87 protected String getArticleDate(Document doc, Element article) {
88 String date = "";
89 String details = getArticleDetailsReal(article);
90 int pos = details.indexOf(" Posted ");
91 if (pos >= 0) {
92 date = details.substring(pos + " Posted ".length()).trim();
93 pos = date.indexOf(" by ");
eaaeae39 94 if (pos >= 0) {
3e62b034 95 date = date.substring(0, pos).trim();
eaaeae39 96 }
3e62b034 97 }
eaaeae39 98
3e62b034
NR
99 return date;
100 }
eaaeae39 101
3e62b034
NR
102 @Override
103 protected String getArticleCategory(Document doc, Element article,
104 String currentCategory) {
105 String categ = "";
106 String details = getArticleDetailsReal(article);
107 int pos = details.indexOf("]");
108 if (pos >= 0) {
109 categ = details.substring(1, pos).trim();
eaaeae39
NR
110 }
111
3e62b034 112 return categ;
eaaeae39
NR
113 }
114
115 @Override
3e62b034
NR
116 protected String getArticleDetails(Document doc, Element article) {
117 return ""; // We actually extract all the values
118 }
bb0d9eb2 119
3e62b034
NR
120 @Override
121 protected String getArticleIntUrl(Document doc, Element article) {
122 String intUrl = "";
123 for (Element idElem : article.getElementsByTag("a")) {
124 // Last link is the story link
125 intUrl = idElem.absUrl("href");
126 int pos = intUrl.indexOf("#Comments");
127 if (pos >= 0) {
128 intUrl = intUrl.substring(0, pos - 1);
bb0d9eb2 129 }
3e62b034 130 }
bb0d9eb2 131
3e62b034
NR
132 return intUrl;
133 }
134
135 @Override
136 protected String getArticleExtUrl(Document doc, Element article) {
137 return "";
138 }
139
140 @Override
141 protected String getArticleContent(Document doc, Element article) {
142 Element listing = article.getElementsByClass("BlurbListing").first();
143 if (listing != null && listing.children().size() >= 2) {
144 String content = "";
145
146 // All but the first and two last children
147 for (int i = 1; i < listing.children().size() - 2; i++) {
148 Element e = listing.children().get(i);
149 content = content.trim() + " " + e.text().trim();
bb0d9eb2 150 }
3e62b034
NR
151
152 return content;
bb0d9eb2
NR
153 }
154
3e62b034
NR
155 return "";
156 }
157
158 @Override
159 protected Element getFullArticle(Document doc) {
160 return doc.getElementsByClass("ArticleText").first();
eaaeae39
NR
161 }
162
3e62b034
NR
163 @Override
164 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
165 return doc.getElementsByClass("lwn-u-1");
166 }
167
168 @Override
169 protected ElementProcessor getElementProcessorFullArticle() {
170 return new BasicElementProcessor() {
171 @Override
172 public boolean ignoreNode(Node node) {
173 if (node instanceof Element) {
174 Element el = (Element) node;
175 if ("Log in".equals(el.text().trim())) {
176 return true;
177 }
178 } else if (node instanceof TextNode) {
179 TextNode text = (TextNode) node;
180 String t = text.text().trim();
181 if (t.equals("(") || t.equals("to post comments)")) {
182 return true;
183 }
eaaeae39 184 }
3e62b034
NR
185
186 return false;
187 }
188 };
189 }
190
191 @Override
192 protected List<Element> getCommentCommentPosts(Document doc,
193 Element container) {
194 List<Element> commentElements = new ArrayList<Element>();
195 if (container != null) {
196 for (Element possibleCommentElement : container.children()) {
197 if (possibleCommentElement.hasClass("CommentBox")) {
198 commentElements.add(possibleCommentElement);
199 } else if (possibleCommentElement.hasClass("Comment")) {
200 commentElements.add(possibleCommentElement);
bb0d9eb2 201 }
eaaeae39
NR
202 }
203 }
3e62b034
NR
204
205 return commentElements;
eaaeae39
NR
206 }
207
3e62b034
NR
208 @Override
209 protected String getCommentId(Element post) {
210 return post.id();
211 }
eaaeae39 212
3e62b034
NR
213 @Override
214 protected String getCommentAuthor(Element post) {
215 Element detailsE = post.getElementsByClass("CommentPoster").first();
216 if (detailsE != null) {
217 String details = detailsE.text();
218
219 int pos = details.lastIndexOf(" by ");
220 if (pos >= 0) {
221 details = details.substring(pos + " by ".length()).trim();
bb0d9eb2 222
3e62b034
NR
223 if (details.startsWith("Posted ")) {
224 return details.substring("Posted ".length()).trim();
225 }
bb0d9eb2 226 }
eaaeae39
NR
227 }
228
3e62b034
NR
229 return "";
230 }
231
232 @Override
233 protected String getCommentTitle(Element post) {
234 Element title = post.getElementsByClass("CommentTitle").first();
235 if (title != null) {
236 return title.text();
237 }
238
239 return "";
240 }
241
242 @Override
243 protected String getCommentDate(Element post) {
244 Element detailsE = post.getElementsByClass("CommentPoster").first();
245 if (detailsE != null) {
246 String details = detailsE.text();
247
248 int pos = details.lastIndexOf(" by ");
249 if (pos >= 0) {
250 return details.substring(0, pos).trim();
251 }
eaaeae39
NR
252 }
253
3e62b034
NR
254 return "";
255 }
bb0d9eb2 256
3e62b034
NR
257 @Override
258 protected Element getCommentContentElement(Element post) {
259 return post.getElementsByClass("CommentBody").first();
eaaeae39
NR
260 }
261
3e62b034
NR
262 @Override
263 protected ElementProcessor getElementProcessorComment() {
264 return new BasicElementProcessor() {
27008a87
NR
265 @Override
266 public String processText(String text) {
267 while (text.startsWith(">")) { // comments
268 text = text.substring(1).trim();
269 }
eaaeae39 270
27008a87
NR
271 return text;
272 }
eaaeae39 273
27008a87
NR
274 @Override
275 public boolean detectQuote(Node node) {
276 if (node instanceof Element) {
277 Element elementNode = (Element) node;
278 if (elementNode.tagName().equals("blockquote")
279 || elementNode.hasClass("QuotedText")) {
280 return true;
281 }
282 }
283
284 return false;
285 }
eaaeae39 286
27008a87
NR
287 @Override
288 public boolean ignoreNode(Node node) {
289 if (node instanceof Element) {
290 Element elementNode = (Element) node;
291 if (elementNode.hasClass("CommentPoster")) {
292 return true;
293 }
294 }
295
296 return false;
297 }
3e62b034
NR
298 };
299 }
300
301 private String getArticleDetailsReal(Element article) {
302 Element listing = article.getElementsByClass("BlurbListing").first();
303 // Valid articles have 2+ listings
304 if (listing != null && listing.children().size() >= 2) {
305 return listing.children().get(0).text();
306 }
307
308 return "";
eaaeae39
NR
309 }
310}