1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
5 import java
.util
.AbstractMap
;
6 import java
.util
.ArrayList
;
8 import java
.util
.Map
.Entry
;
10 import org
.jsoup
.nodes
.Document
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.nodes
.TextNode
;
15 import be
.nikiroo
.gofetch
.data
.Comment
;
16 import be
.nikiroo
.gofetch
.data
.Story
;
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
23 public class LWN
extends BasicSupport
{
/**
 * Returns the human-readable description of this support.
 *
 * @return the fixed string "LWN: Linux Weekly Newsletter"
 */
25 public String
getDescription() {
26 return "LWN: Linux Weekly Newsletter";
/**
 * Fetches the content of a story, with special handling keyed on whether
 * the story title starts with "[$]".
 * <p>
 * The visible code builds a fixed "available to LWN subscribers only"
 * notice, stores it as the story's full content and gives the story an
 * empty comment list.
 * <p>
 * NOTE(review): the statements between the title test and the notice are
 * not visible in this excerpt. The leading comment suggests the notice is
 * meant for the paid-for ("[$]") stories, i.e. that an else-branch starts
 * in the missing lines -- confirm against the full file.
 * <p>
 * NOTE(review): "suscribers" in the notice text looks like a typo for
 * "subscribers"; it is a runtime string, so it is left untouched here.
 *
 * @param story
 *            the story to fill in
 *
 * @throws IOException
 *             declared by the signature; no throwing call is visible in
 *             this excerpt
 */
30 public void fetch(Story story
) throws IOException
{
31 // Do not try the paid-for stories...
32 if (!story
.getTitle().startsWith("[$]")) {
// Placeholder text used instead of the real article body
35 String fullContent
= "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
36 story
.setFullContent(fullContent
);
// No comments: an empty list (not null) is stored
37 story
.setComments(new ArrayList
<Comment
>());
/**
 * Builds the list of URL entries used by this support.
 * <p>
 * The visible code creates a list holding a single entry whose key is the
 * URL "https://lwn.net/" and whose value is the empty string.
 * <p>
 * NOTE(review): the meaning of the String value of each entry is not
 * visible here, and neither is the return statement (presumably the list
 * is returned) -- confirm.
 *
 * @return presumably the list built here
 *
 * @throws IOException
 *             declared by the signature
 */
42 protected List
<Entry
<URL
, String
>> getUrls() throws IOException
{
43 List
<Entry
<URL
, String
>> urls
= new ArrayList
<Entry
<URL
, String
>>();
// Single entry: the LWN front page, associated with an empty string
44 urls
.add(new AbstractMap
.SimpleEntry
<URL
, String
>(new URL(
45 "https://lwn.net/"), ""));
/**
 * Selects the elements of the document that represent articles.
 *
 * @param doc
 *            the document to search
 *
 * @return every element of the document carrying the class "pure-u-1"
 */
50 protected List
<Element
> getArticles(Document doc
) {
51 return doc
.getElementsByClass("pure-u-1");
/**
 * Derives an ID for the article from its internal URL.
 * <p>
 * The value returned by {@code getArticleIntUrl} is stripped of every
 * character that is not a digit 0-9; a loop then runs for as long as the
 * ID is shorter than 10 characters.
 * <p>
 * NOTE(review): the loop body and the return statement are not visible in
 * this excerpt -- presumably the ID is padded up to 10 characters and
 * returned; confirm.
 *
 * @param doc
 *            the document the article comes from
 * @param article
 *            the article element
 *
 * @return presumably the (padded) numeric ID
 */
55 protected String
getArticleId(Document doc
, Element article
) {
// Keep only the digits of the internal URL
56 String id
= getArticleIntUrl(doc
, article
).replaceAll("[^0-9]", "");
57 while (id
.length() < 10) {
/**
 * Extracts the title of an article.
 * <p>
 * The visible code looks up the first element with the class "Headline"
 * inside the article.
 * <p>
 * NOTE(review): how that element is turned into the returned String (and
 * what happens when no such element exists) is not visible in this
 * excerpt.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 *
 * @return not visible in this excerpt
 */
65 protected String
getArticleTitle(Document doc
, Element article
) {
66 Element title
= article
.getElementsByClass("Headline").first();
/**
 * Extracts the author of an article from its details line.
 * <p>
 * The details text comes from {@code getArticleDetailsReal}; the author is
 * the trimmed text that follows the first occurrence of " by ".
 * <p>
 * NOTE(review): the declaration of {@code author}, any guard on
 * {@code pos} and the return statement are not visible in this excerpt.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 *
 * @return presumably the author name
 */
75 protected String
getArticleAuthor(Document doc
, Element article
) {
77 String details
= getArticleDetailsReal(article
);
78 int pos
= details
.indexOf(" by ");
// Everything after the " by " marker
80 author
= details
.substring(pos
+ " by ".length()).trim();
/**
 * Extracts the date of an article from its details line.
 * <p>
 * The details text comes from {@code getArticleDetailsReal}. The visible
 * code keeps the trimmed text after the first " Posted " marker, then cuts
 * that text at its first " by " marker and trims again.
 * <p>
 * NOTE(review): the declaration of {@code date}, any guards on
 * {@code pos} and the return statement are not visible in this excerpt.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 *
 * @return presumably the date text
 */
87 protected String
getArticleDate(Document doc
, Element article
) {
89 String details
= getArticleDetailsReal(article
);
90 int pos
= details
.indexOf(" Posted ");
// Everything after " Posted "
92 date
= details
.substring(pos
+ " Posted ".length()).trim();
// pos is reused: it now indexes into date, not into details
93 pos
= date
.indexOf(" by ");
// Drop the " by ..." tail
95 date
= date
.substring(0, pos
).trim();
/**
 * Extracts the category of an article from its details line.
 * <p>
 * The details text comes from {@code getArticleDetailsReal}; the category
 * is the trimmed text between index 1 and the first "]" of that text.
 * <p>
 * NOTE(review): starting at index 1 presumably skips a leading "[" --
 * confirm. The declaration of {@code categ}, any guard on {@code pos},
 * any use of {@code currentCategory} and the return statement are not
 * visible in this excerpt.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 * @param currentCategory
 *            not used by the visible code
 *
 * @return presumably the category text
 */
103 protected String
getArticleCategory(Document doc
, Element article
,
104 String currentCategory
) {
106 String details
= getArticleDetailsReal(article
);
107 int pos
= details
.indexOf("]");
// Skip the first character and stop before the "]"
109 categ
= details
.substring(1, pos
).trim();
/**
 * Returns the details text of an article, which is always empty for this
 * support.
 *
 * @param doc
 *            the document the article comes from (unused)
 * @param article
 *            the article element (unused)
 *
 * @return always the empty string
 */
116 protected String
getArticleDetails(Document doc
, Element article
) {
117 return ""; // We actually extract all the values
/**
 * Computes the internal URL of an article.
 * <p>
 * The visible code walks over every "a" tag of the article and overwrites
 * {@code intUrl} with the absolute "href" of each one, so the value from
 * the last link is the one left after the loop. Inside the loop the URL is
 * searched for "#Comments" and cut one character BEFORE that marker
 * (note the {@code pos - 1}), so the character preceding "#Comments" is
 * dropped as well.
 * <p>
 * NOTE(review): the declaration of {@code intUrl}, the guard around the
 * substring call and the return statement are not visible in this excerpt.
 * Without a guard, {@code pos - 1} would be negative when the marker is
 * absent or at index 0 -- confirm that a check exists.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 *
 * @return presumably the internal URL
 */
121 protected String
getArticleIntUrl(Document doc
, Element article
) {
123 for (Element idElem
: article
.getElementsByTag("a")) {
124 // Last link is the story link
125 intUrl
= idElem
.absUrl("href");
126 int pos
= intUrl
.indexOf("#Comments");
// Cut the "#Comments" fragment and the one character before it
128 intUrl
= intUrl
.substring(0, pos
- 1);
/**
 * NOTE(review): only the signature of this method is visible in this
 * excerpt; judging by its name it presumably returns the external URL of
 * the article -- confirm against the body.
 *
 * @param doc
 *            the document the article comes from
 * @param article
 *            the article element
 *
 * @return not visible in this excerpt
 */
136 protected String
getArticleExtUrl(Document doc
, Element article
) {
/**
 * Builds the short content (blurb) of an article.
 * <p>
 * The first element with the class "BlurbListing" is looked up; when it
 * exists and has at least 2 children, the text of its children is
 * appended to {@code content}, separated by single spaces, skipping the
 * first child and the last two children.
 * <p>
 * NOTE(review): the declaration of {@code content}, the behaviour when the
 * listing is missing or too short, and the return statement are not
 * visible in this excerpt.
 *
 * @param doc
 *            the document the article comes from (not used by the visible
 *            code)
 * @param article
 *            the article element
 *
 * @return presumably the accumulated content
 */
141 protected String
getArticleContent(Document doc
, Element article
) {
142 Element listing
= article
.getElementsByClass("BlurbListing").first();
143 if (listing
!= null && listing
.children().size() >= 2) {
146 // All but the first and two last children
147 for (int i
= 1; i
< listing
.children().size() - 2; i
++) {
148 Element e
= listing
.children().get(i
);
// Re-trim the accumulator each time so no leading space builds up
149 content
= content
.trim() + " " + e
.text().trim();
/**
 * Locates the element holding the full text of an article.
 *
 * @param doc
 *            the article page
 *
 * @return the first element with the class "ArticleText", as returned by
 *         {@code first()}
 */
159 protected Element
getFullArticle(Document doc
) {
160 return doc
.getElementsByClass("ArticleText").first();
/**
 * Selects the elements of an article page that hold comment posts.
 *
 * @param doc
 *            the article page
 * @param intUrl
 *            the internal URL of the article (not used by the visible
 *            code)
 *
 * @return every element of the document carrying the class "lwn-u-1"
 */
164 protected List
<Element
> getFullArticleCommentPosts(Document doc
, URL intUrl
) {
165 return doc
.getElementsByClass("lwn-u-1");
/**
 * Creates the element processor used for full articles.
 * <p>
 * The processor is an anonymous {@code BasicElementProcessor} whose
 * {@code ignoreNode} singles out two kinds of nodes:
 * <ul>
 * <li>an {@link Element} whose trimmed text is exactly "Log in";</li>
 * <li>a {@link TextNode} whose trimmed text is exactly "(" or
 * "to post comments)".</li>
 * </ul>
 * NOTE(review): the values returned for those cases are not visible in
 * this excerpt -- presumably {@code true} (ignore the node), with a
 * fallback for everything else; confirm.
 *
 * @return the new processor
 */
169 protected ElementProcessor
getElementProcessorFullArticle() {
170 return new BasicElementProcessor() {
172 public boolean ignoreNode(Node node
) {
173 if (node
instanceof Element
) {
174 Element el
= (Element
) node
;
// The "Log in" element
175 if ("Log in".equals(el
.text().trim())) {
178 } else if (node
instanceof TextNode
) {
179 TextNode text
= (TextNode
) node
;
180 String t
= text
.text().trim();
// The loose text pieces "(" and "to post comments)"
181 if (t
.equals("(") || t
.equals("to post comments)")) {
/**
 * Collects the comment elements found directly under a container.
 * <p>
 * When {@code container} is not null, each of its direct children that
 * has the class "CommentBox" or the class "Comment" is added to the
 * result, in document order. When it is null the result is an empty list.
 * <p>
 * NOTE(review): part of the signature (the declaration of
 * {@code container}) is not visible in this excerpt.
 *
 * @param doc
 *            the document (not used by the visible code)
 *
 * @return the matching children, never null
 */
192 protected List
<Element
> getCommentCommentPosts(Document doc
,
194 List
<Element
> commentElements
= new ArrayList
<Element
>();
195 if (container
!= null) {
196 for (Element possibleCommentElement
: container
.children()) {
// Both branches add the element; only the matched class differs
197 if (possibleCommentElement
.hasClass("CommentBox")) {
198 commentElements
.add(possibleCommentElement
);
199 } else if (possibleCommentElement
.hasClass("Comment")) {
200 commentElements
.add(possibleCommentElement
);
205 return commentElements
;
/**
 * NOTE(review): only the signature of this method is visible in this
 * excerpt; judging by its name it presumably returns the ID of the given
 * comment post -- confirm against the body.
 *
 * @param post
 *            the comment post element
 *
 * @return not visible in this excerpt
 */
209 protected String
getCommentId(Element post
) {
/**
 * Extracts the author of a comment.
 * <p>
 * The text of the first "CommentPoster" element of the post is read. The
 * visible code replaces it by the trimmed text following its LAST " by "
 * marker and, when the resulting text starts with "Posted ", returns it
 * with that prefix removed and trimmed.
 * <p>
 * NOTE(review): the guard on {@code pos}, the other return statements and
 * the behaviour when no "CommentPoster" element exists are not visible in
 * this excerpt.
 *
 * @param post
 *            the comment post element
 *
 * @return the author text for the visible branch; other branches are not
 *         visible here
 */
214 protected String
getCommentAuthor(Element post
) {
215 Element detailsE
= post
.getElementsByClass("CommentPoster").first();
216 if (detailsE
!= null) {
217 String details
= detailsE
.text();
// lastIndexOf: the marker closest to the end of the text is used
219 int pos
= details
.lastIndexOf(" by ");
221 details
= details
.substring(pos
+ " by ".length()).trim();
223 if (details
.startsWith("Posted ")) {
224 return details
.substring("Posted ".length()).trim();
/**
 * Extracts the title of a comment.
 * <p>
 * The visible code looks up the first element with the class
 * "CommentTitle" inside the post.
 * <p>
 * NOTE(review): how that element is turned into the returned String (and
 * what happens when no such element exists) is not visible in this
 * excerpt.
 *
 * @param post
 *            the comment post element
 *
 * @return not visible in this excerpt
 */
233 protected String
getCommentTitle(Element post
) {
234 Element title
= post
.getElementsByClass("CommentTitle").first();
/**
 * Extracts the date of a comment.
 * <p>
 * The text of the first "CommentPoster" element of the post is read and
 * the visible code returns the trimmed part that precedes its LAST " by "
 * marker.
 * <p>
 * NOTE(review): the guard on {@code pos}, any other processing of the
 * text and the fallback return are not visible in this excerpt.
 *
 * @param post
 *            the comment post element
 *
 * @return the text before the last " by " for the visible branch
 */
243 protected String
getCommentDate(Element post
) {
244 Element detailsE
= post
.getElementsByClass("CommentPoster").first();
245 if (detailsE
!= null) {
246 String details
= detailsE
.text();
248 int pos
= details
.lastIndexOf(" by ");
// Keep what comes before the " by ..." tail
250 return details
.substring(0, pos
).trim();
/**
 * Locates the element holding the body of a comment.
 *
 * @param post
 *            the comment post element
 *
 * @return the first element with the class "CommentBody" inside the post,
 *         as returned by {@code first()}
 */
258 protected Element
getCommentContentElement(Element post
) {
259 return post
.getElementsByClass("CommentBody").first();
/**
 * Creates the element processor used for comments.
 * <p>
 * The processor is an anonymous {@code BasicElementProcessor} with three
 * visible overrides:
 * <ul>
 * <li>{@code processText} removes leading "&gt;" characters one at a
 * time, trimming the text after each removal;</li>
 * <li>{@code detectQuote} singles out elements whose tag name is
 * "blockquote" or that have the class "QuotedText";</li>
 * <li>{@code ignoreNode} singles out elements that have the class
 * "CommentPoster".</li>
 * </ul>
 * NOTE(review): the return statements of the three methods are not
 * visible in this excerpt -- presumably the cleaned text, and
 * {@code true} for the singled-out nodes; confirm.
 *
 * @return the new processor
 */
263 protected ElementProcessor
getElementProcessorComment() {
264 return new BasicElementProcessor() {
266 public String
processText(String text
) {
267 while (text
.startsWith(">")) { // comments
// Drop one ">" and the whitespace around what remains
268 text
= text
.substring(1).trim();
275 public boolean detectQuote(Node node
) {
276 if (node
instanceof Element
) {
277 Element elementNode
= (Element
) node
;
278 if (elementNode
.tagName().equals("blockquote")
279 || elementNode
.hasClass("QuotedText")) {
288 public boolean ignoreNode(Node node
) {
289 if (node
instanceof Element
) {
290 Element elementNode
= (Element
) node
;
291 if (elementNode
.hasClass("CommentPoster")) {
/**
 * Reads the details line of an article, from which the other
 * {@code getArticle*} methods of this class extract their values.
 * <p>
 * The first element with the class "BlurbListing" is looked up; when it
 * exists and has at least 2 children, the text of its first child is
 * returned.
 * <p>
 * NOTE(review): the value returned when the listing is missing or has
 * fewer than 2 children is not visible in this excerpt.
 *
 * @param article
 *            the article element
 *
 * @return the text of the first child of the listing for the visible
 *         branch
 */
301 private String
getArticleDetailsReal(Element article
) {
302 Element listing
= article
.getElementsByClass("BlurbListing").first();
303 // Valid articles have 2+ listings
304 if (listing
!= null && listing
.children().size() >= 2) {
305 return listing
.children().get(0).text();