Version 0.2.0: supports LWN, quotes, <br>s
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.util.ArrayList;
7 import java.util.List;
8
9 import org.jsoup.helper.DataUtil;
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.select.Elements;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17
18 /**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23 public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
30 public List<Story> list() throws IOException {
31 List<Story> list = new ArrayList<Story>();
32
33 URL url = new URL("https://lwn.net/");
34 InputStream in = open(url);
35 Document doc = DataUtil.load(in, "UTF-8", url.toString());
36 Elements stories = doc.getElementsByClass("pure-u-1");
37 for (Element story : stories) {
38 Elements titles = story.getElementsByClass("Headline");
39 Elements listings = story.getElementsByClass("BlurbListing");
40 if (titles.size() == 0) {
41 continue;
42 }
43 if (listings.size() == 0) {
44 continue;
45 }
46
47 Element listing = listings.get(0);
48 if (listing.children().size() < 2) {
49 continue;
50 }
51
52 String title = titles.get(0).text();
53 String details = listing.children().get(0).text();
54 String body = "";
55 // All but the first and two last children
56 for (int i = 1; i < listing.children().size() - 2; i++) {
57 Element e = listing.children().get(i);
58 body = body.trim() + " " + e.text().trim();
59 }
60 body = body.trim();
61
62 String author = "";
63 int pos = details.indexOf(" by ");
64 if (pos >= 0) {
65 author = details.substring(pos + " by ".length()).trim();
66 }
67
68 String date = "";
69 pos = details.indexOf(" Posted ");
70 if (pos >= 0) {
71 date = details.substring(pos + " Posted ".length()).trim();
72 }
73
74 String id = "";
75 String intUrl = "";
76 String extUrl = "";
77 for (Element idElem : story.getElementsByTag("a")) {
78 // Last link is the story link
79 intUrl = idElem.absUrl("href");
80 pos = intUrl.indexOf("#Comments");
81 if (pos >= 0) {
82 intUrl = intUrl.substring(0, pos - 1);
83 }
84 id = intUrl.replaceAll("[^0-9]", "");
85 }
86
87 list.add(new Story(getType(), id, title, details, intUrl, extUrl,
88 body));
89 }
90
91 return list;
92 }
93
94 @Override
95 public void fetch(Story story) throws IOException {
96 List<Comment> comments = new ArrayList<Comment>();
97 String fullContent = story.getContent();
98
99 // Do not try the paid-for stories...
100 if (!story.getTitle().startsWith("[$]")) {
101 URL url = new URL(story.getUrlInternal());
102 InputStream in = open(url);
103 Document doc = DataUtil.load(in, "UTF-8", url.toString());
104 Elements fullContentElements = doc
105 .getElementsByClass("ArticleText");
106 if (fullContentElements.size() > 0) {
107 // comments.addAll(getComments(listing.get(0)));
108 fullContent = fullContentElements.get(0).text();
109 }
110
111 Elements listing = doc.getElementsByClass("lwn-u-1");
112 if (listing.size() > 0) {
113 comments.addAll(getComments(listing.get(0)));
114 }
115 } else {
116 fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
117 }
118
119 story.setFullContent(fullContent);
120 story.setComments(comments);
121 }
122
123 private List<Comment> getComments(Element listing) {
124 List<Comment> comments = new ArrayList<Comment>();
125 for (Element commentElement : listing.children()) {
126 if (commentElement.hasClass("CommentBox")) {
127 Comment comment = getComment(commentElement);
128 if (!comment.isEmpty()) {
129 comments.add(comment);
130 }
131 } else if (commentElement.hasClass("Comment")) {
132 if (comments.size() > 0) {
133 comments.get(comments.size() - 1).addAll(
134 getComments(commentElement));
135 }
136 }
137 }
138 return comments;
139 }
140
141 private Comment getComment(Element commentElement) {
142 String title = firstOrEmpty(commentElement, "CommentTitle").text();
143 String author = firstOrEmpty(commentElement, "CommentPoster").text();
144
145 String date = "";
146 int pos = author.lastIndexOf(" by ");
147 if (pos >= 0) {
148 date = author.substring(0, pos).trim();
149 author = author.substring(pos + " by ".length()).trim();
150
151 if (author.startsWith("Posted ")) {
152 author = author.substring("Posted ".length()).trim();
153 }
154 }
155
156 Element content = null;
157 Elements commentBodyElements = commentElement
158 .getElementsByClass("CommentBody");
159 if (commentBodyElements.size() > 0) {
160 content = commentBodyElements.get(0);
161 }
162
163 Comment comment = new Comment(commentElement.id(), author, title, date,
164 toLines(content));
165
166 return comment;
167 }
168
169 private List<String> toLines(Element element) {
170 return toLines(element, new QuoteProcessor() {
171 @Override
172 public String processText(String text) {
173 while (text.startsWith(">")) { // comments
174 text = text.substring(1).trim();
175 }
176
177 return text;
178 }
179
180 @Override
181 public boolean detectQuote(Node node) {
182 if (node instanceof Element) {
183 Element elementNode = (Element) node;
184 if (elementNode.tagName().equals("blockquote")
185 || elementNode.hasClass("QuotedText")) {
186 return true;
187 }
188 }
189
190 return false;
191 }
192
193 @Override
194 public boolean ignoreNode(Node node) {
195 if (node instanceof Element) {
196 Element elementNode = (Element) node;
197 if (elementNode.hasClass("CommentPoster")) {
198 return true;
199 }
200 }
201
202 return false;
203 }
204 });
205 }
206 }