Fix layout issues in getContent() text
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
CommitLineData
eaaeae39
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
eaaeae39 4import java.net.URL;
3e62b034 5import java.util.AbstractMap;
eaaeae39
NR
6import java.util.ArrayList;
7import java.util.List;
3e62b034 8import java.util.Map.Entry;
eaaeae39 9
eaaeae39
NR
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
bb0d9eb2 12import org.jsoup.nodes.Node;
3e62b034 13import org.jsoup.nodes.TextNode;
eaaeae39
NR
14
15import be.nikiroo.gofetch.data.Comment;
16import be.nikiroo.gofetch.data.Story;
17
18/**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
3e62b034
NR
30 public void fetch(Story story) throws IOException {
31 // Do not try the paid-for stories...
32 if (!story.getTitle().startsWith("[$]")) {
33 super.fetch(story);
34 } else {
35 String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
36 story.setFullContent(fullContent);
37 story.setComments(new ArrayList<Comment>());
38 }
39 }
eaaeae39 40
3e62b034
NR
41 @Override
42 protected List<Entry<URL, String>> getUrls() throws IOException {
43 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
44 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
45 "https://lwn.net/"), ""));
46 return urls;
47 }
5c056aad 48
3e62b034
NR
49 @Override
50 protected List<Element> getArticles(Document doc) {
51 return doc.getElementsByClass("pure-u-1");
52 }
eaaeae39 53
3e62b034
NR
54 @Override
55 protected String getArticleId(Document doc, Element article) {
1ab7ff0a 56 return getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
3e62b034 57 }
5c056aad 58
3e62b034
NR
59 @Override
60 protected String getArticleTitle(Document doc, Element article) {
61 Element title = article.getElementsByClass("Headline").first();
62 if (title != null) {
63 return title.text();
64 }
b34d1f35 65
3e62b034
NR
66 return "";
67 }
b34d1f35 68
3e62b034
NR
69 @Override
70 protected String getArticleAuthor(Document doc, Element article) {
71 String author = "";
72 String details = getArticleDetailsReal(article);
73 int pos = details.indexOf(" by ");
74 if (pos >= 0) {
75 author = details.substring(pos + " by ".length()).trim();
76 }
77
78 return author;
79 }
5c056aad 80
3e62b034
NR
81 @Override
82 protected String getArticleDate(Document doc, Element article) {
83 String date = "";
84 String details = getArticleDetailsReal(article);
85 int pos = details.indexOf(" Posted ");
86 if (pos >= 0) {
87 date = details.substring(pos + " Posted ".length()).trim();
88 pos = date.indexOf(" by ");
eaaeae39 89 if (pos >= 0) {
3e62b034 90 date = date.substring(0, pos).trim();
eaaeae39 91 }
3e62b034 92 }
eaaeae39 93
3e62b034
NR
94 return date;
95 }
eaaeae39 96
3e62b034
NR
97 @Override
98 protected String getArticleCategory(Document doc, Element article,
99 String currentCategory) {
100 String categ = "";
101 String details = getArticleDetailsReal(article);
102 int pos = details.indexOf("]");
103 if (pos >= 0) {
104 categ = details.substring(1, pos).trim();
eaaeae39
NR
105 }
106
3e62b034 107 return categ;
eaaeae39
NR
108 }
109
110 @Override
3e62b034
NR
111 protected String getArticleDetails(Document doc, Element article) {
112 return ""; // We actually extract all the values
113 }
bb0d9eb2 114
3e62b034
NR
115 @Override
116 protected String getArticleIntUrl(Document doc, Element article) {
117 String intUrl = "";
118 for (Element idElem : article.getElementsByTag("a")) {
119 // Last link is the story link
120 intUrl = idElem.absUrl("href");
121 int pos = intUrl.indexOf("#Comments");
122 if (pos >= 0) {
123 intUrl = intUrl.substring(0, pos - 1);
bb0d9eb2 124 }
3e62b034 125 }
bb0d9eb2 126
3e62b034
NR
127 return intUrl;
128 }
129
130 @Override
131 protected String getArticleExtUrl(Document doc, Element article) {
132 return "";
133 }
134
135 @Override
136 protected String getArticleContent(Document doc, Element article) {
137 Element listing = article.getElementsByClass("BlurbListing").first();
138 if (listing != null && listing.children().size() >= 2) {
139 String content = "";
140
141 // All but the first and two last children
142 for (int i = 1; i < listing.children().size() - 2; i++) {
143 Element e = listing.children().get(i);
e818d449 144 content = content.trim() + " " + getArticleText(e);
bb0d9eb2 145 }
3e62b034
NR
146
147 return content;
bb0d9eb2
NR
148 }
149
3e62b034
NR
150 return "";
151 }
152
153 @Override
154 protected Element getFullArticle(Document doc) {
155 return doc.getElementsByClass("ArticleText").first();
eaaeae39
NR
156 }
157
3e62b034
NR
158 @Override
159 protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
160 return doc.getElementsByClass("lwn-u-1");
161 }
162
163 @Override
164 protected ElementProcessor getElementProcessorFullArticle() {
165 return new BasicElementProcessor() {
166 @Override
167 public boolean ignoreNode(Node node) {
168 if (node instanceof Element) {
169 Element el = (Element) node;
170 if ("Log in".equals(el.text().trim())) {
171 return true;
172 }
173 } else if (node instanceof TextNode) {
174 TextNode text = (TextNode) node;
175 String t = text.text().trim();
176 if (t.equals("(") || t.equals("to post comments)")) {
177 return true;
178 }
eaaeae39 179 }
3e62b034
NR
180
181 return false;
182 }
183 };
184 }
185
186 @Override
187 protected List<Element> getCommentCommentPosts(Document doc,
188 Element container) {
189 List<Element> commentElements = new ArrayList<Element>();
190 if (container != null) {
191 for (Element possibleCommentElement : container.children()) {
192 if (possibleCommentElement.hasClass("CommentBox")) {
193 commentElements.add(possibleCommentElement);
194 } else if (possibleCommentElement.hasClass("Comment")) {
195 commentElements.add(possibleCommentElement);
bb0d9eb2 196 }
eaaeae39
NR
197 }
198 }
3e62b034
NR
199
200 return commentElements;
eaaeae39
NR
201 }
202
3e62b034
NR
203 @Override
204 protected String getCommentId(Element post) {
205 return post.id();
206 }
eaaeae39 207
3e62b034
NR
208 @Override
209 protected String getCommentAuthor(Element post) {
210 Element detailsE = post.getElementsByClass("CommentPoster").first();
211 if (detailsE != null) {
212 String details = detailsE.text();
213
214 int pos = details.lastIndexOf(" by ");
215 if (pos >= 0) {
216 details = details.substring(pos + " by ".length()).trim();
bb0d9eb2 217
3e62b034
NR
218 if (details.startsWith("Posted ")) {
219 return details.substring("Posted ".length()).trim();
220 }
bb0d9eb2 221 }
eaaeae39
NR
222 }
223
3e62b034
NR
224 return "";
225 }
226
227 @Override
228 protected String getCommentTitle(Element post) {
229 Element title = post.getElementsByClass("CommentTitle").first();
230 if (title != null) {
231 return title.text();
232 }
233
234 return "";
235 }
236
237 @Override
238 protected String getCommentDate(Element post) {
239 Element detailsE = post.getElementsByClass("CommentPoster").first();
240 if (detailsE != null) {
241 String details = detailsE.text();
242
243 int pos = details.lastIndexOf(" by ");
244 if (pos >= 0) {
245 return details.substring(0, pos).trim();
246 }
eaaeae39
NR
247 }
248
3e62b034
NR
249 return "";
250 }
bb0d9eb2 251
3e62b034
NR
252 @Override
253 protected Element getCommentContentElement(Element post) {
254 return post.getElementsByClass("CommentBody").first();
eaaeae39
NR
255 }
256
3e62b034
NR
257 @Override
258 protected ElementProcessor getElementProcessorComment() {
259 return new BasicElementProcessor() {
27008a87
NR
260 @Override
261 public String processText(String text) {
262 while (text.startsWith(">")) { // comments
263 text = text.substring(1).trim();
264 }
eaaeae39 265
27008a87
NR
266 return text;
267 }
eaaeae39 268
27008a87
NR
269 @Override
270 public boolean detectQuote(Node node) {
271 if (node instanceof Element) {
272 Element elementNode = (Element) node;
273 if (elementNode.tagName().equals("blockquote")
274 || elementNode.hasClass("QuotedText")) {
275 return true;
276 }
277 }
278
279 return false;
280 }
eaaeae39 281
27008a87
NR
282 @Override
283 public boolean ignoreNode(Node node) {
284 if (node instanceof Element) {
285 Element elementNode = (Element) node;
286 if (elementNode.hasClass("CommentPoster")) {
287 return true;
288 }
289 }
290
291 return false;
292 }
3e62b034
NR
293 };
294 }
295
296 private String getArticleDetailsReal(Element article) {
297 Element listing = article.getElementsByClass("BlurbListing").first();
298 // Valid articles have 2+ listings
299 if (listing != null && listing.children().size() >= 2) {
300 return listing.children().get(0).text();
301 }
302
303 return "";
eaaeae39
NR
304 }
305}