Separate story details components
[gofetch.git] / src / be / nikiroo / gofetch / support / LWN.java
... / ...
CommitLineData
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
6import java.util.ArrayList;
7import java.util.List;
8
9import org.jsoup.helper.DataUtil;
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
12import org.jsoup.nodes.Node;
13import org.jsoup.select.Elements;
14
15import be.nikiroo.gofetch.data.Comment;
16import be.nikiroo.gofetch.data.Story;
17
18/**
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
20 *
21 * @author niki
22 */
23public class LWN extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "LWN: Linux Weekly Newsletter";
27 }
28
29 @Override
30 public List<Story> list() throws IOException {
31 List<Story> list = new ArrayList<Story>();
32
33 URL url = new URL("https://lwn.net/");
34 InputStream in = downloader.open(url);
35 Document doc = DataUtil.load(in, "UTF-8", url.toString());
36 Elements articles = doc.getElementsByClass("pure-u-1");
37 for (Element article : articles) {
38 Elements titles = article.getElementsByClass("Headline");
39 Elements listings = article.getElementsByClass("BlurbListing");
40 if (titles.size() == 0) {
41 continue;
42 }
43 if (listings.size() == 0) {
44 continue;
45 }
46
47 Element listing = listings.get(0);
48 if (listing.children().size() < 2) {
49 continue;
50 }
51
52 String title = titles.get(0).text();
53 String details = listing.children().get(0).text();
54 String body = "";
55 // All but the first and two last children
56 for (int i = 1; i < listing.children().size() - 2; i++) {
57 Element e = listing.children().get(i);
58 body = body.trim() + " " + e.text().trim();
59 }
60 body = body.trim();
61
62 int pos;
63
64 String categ = "";
65 pos = details.indexOf("]");
66 if (pos >= 0) {
67 categ = details.substring(1, pos + 1).trim();
68 }
69
70 String author = "";
71 pos = details.indexOf(" by ");
72 if (pos >= 0) {
73 author = details.substring(pos + " by ".length()).trim();
74 }
75
76 String date = "";
77 pos = details.indexOf(" Posted ");
78 if (pos >= 0) {
79 date = details.substring(pos + " Posted ".length()).trim();
80 pos = details.indexOf(" by ");
81 if (pos >= 0) {
82 author = details.substring(0, pos).trim();
83 }
84 }
85
86 // We extracted everything from details so...
87 details = "";
88
89 String id = "";
90 String intUrl = "";
91 String extUrl = "";
92 for (Element idElem : article.getElementsByTag("a")) {
93 // Last link is the story link
94 intUrl = idElem.absUrl("href");
95 pos = intUrl.indexOf("#Comments");
96 if (pos >= 0) {
97 intUrl = intUrl.substring(0, pos - 1);
98 }
99 id = intUrl.replaceAll("[^0-9]", "");
100 }
101
102 list.add(new Story(getType(), id, title, author, date, categ,
103 details, intUrl, extUrl, body));
104 }
105
106 return list;
107 }
108
109 @Override
110 public void fetch(Story story) throws IOException {
111 List<Comment> comments = new ArrayList<Comment>();
112 String fullContent = story.getContent();
113
114 // Do not try the paid-for stories...
115 if (!story.getTitle().startsWith("[$]")) {
116 URL url = new URL(story.getUrlInternal());
117 InputStream in = downloader.open(url);
118 Document doc = DataUtil.load(in, "UTF-8", url.toString());
119 Elements fullContentElements = doc
120 .getElementsByClass("ArticleText");
121 if (fullContentElements.size() > 0) {
122 // comments.addAll(getComments(listing.get(0)));
123 fullContent = fullContentElements.get(0).text();
124 }
125
126 Elements listing = doc.getElementsByClass("lwn-u-1");
127 if (listing.size() > 0) {
128 comments.addAll(getComments(listing.get(0)));
129 }
130 } else {
131 fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
132 }
133
134 story.setFullContent(fullContent);
135 story.setComments(comments);
136 }
137
138 private List<Comment> getComments(Element listing) {
139 List<Comment> comments = new ArrayList<Comment>();
140 for (Element commentElement : listing.children()) {
141 if (commentElement.hasClass("CommentBox")) {
142 Comment comment = getComment(commentElement);
143 if (!comment.isEmpty()) {
144 comments.add(comment);
145 }
146 } else if (commentElement.hasClass("Comment")) {
147 if (comments.size() > 0) {
148 comments.get(comments.size() - 1).addAll(
149 getComments(commentElement));
150 }
151 }
152 }
153 return comments;
154 }
155
156 private Comment getComment(Element commentElement) {
157 String title = firstOrEmpty(commentElement, "CommentTitle").text();
158 String author = firstOrEmpty(commentElement, "CommentPoster").text();
159
160 String date = "";
161 int pos = author.lastIndexOf(" by ");
162 if (pos >= 0) {
163 date = author.substring(0, pos).trim();
164 author = author.substring(pos + " by ".length()).trim();
165
166 if (author.startsWith("Posted ")) {
167 author = author.substring("Posted ".length()).trim();
168 }
169 }
170
171 Element content = null;
172 Elements commentBodyElements = commentElement
173 .getElementsByClass("CommentBody");
174 if (commentBodyElements.size() > 0) {
175 content = commentBodyElements.get(0);
176 }
177
178 Comment comment = new Comment(commentElement.id(), author, title, date,
179 toLines(content));
180
181 return comment;
182 }
183
184 private List<String> toLines(Element element) {
185 return toLines(element, new BasicElementProcessor() {
186 @Override
187 public String processText(String text) {
188 while (text.startsWith(">")) { // comments
189 text = text.substring(1).trim();
190 }
191
192 return text;
193 }
194
195 @Override
196 public boolean detectQuote(Node node) {
197 if (node instanceof Element) {
198 Element elementNode = (Element) node;
199 if (elementNode.tagName().equals("blockquote")
200 || elementNode.hasClass("QuotedText")) {
201 return true;
202 }
203 }
204
205 return false;
206 }
207
208 @Override
209 public boolean ignoreNode(Node node) {
210 if (node instanceof Element) {
211 Element elementNode = (Element) node;
212 if (elementNode.hasClass("CommentPoster")) {
213 return true;
214 }
215 }
216
217 return false;
218 }
219 });
220 }
221}