1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
5 import java
.util
.AbstractMap
;
6 import java
.util
.ArrayList
;
8 import java
.util
.Map
.Entry
;
10 import org
.jsoup
.nodes
.Document
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.nodes
.TextNode
;
15 import be
.nikiroo
.gofetch
.data
.Comment
;
16 import be
.nikiroo
.gofetch
.data
.Story
;
19 * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
23 public class LWN
extends BasicSupport
{
// Returns the fixed, human-readable description of this support:
// "LWN: Linux Weekly Newsletter".
25 public String
getDescription() {
26 return "LWN: Linux Weekly Newsletter";
// Fetches the content of the given story, with special handling for paid-for
// articles, which are recognised by a title starting with "[$]".
//
// For the paid-for case a fixed notice pointing to https://lwn.net/subscribe/
// is stored as the story's full content and its comments are set to an empty
// list, so no attempt is made to download the article itself.
// NOTE(review): presumably titles without the "[$]" prefix are handed to the
// regular fetch path -- confirm against the branch body.
//
// story: the story to fill in; its title is read to detect paid-for articles.
// Declared to throw IOException.
30 public void fetch(Story story
) throws IOException
{
31 // Do not try the paid-for stories...
32 if (!story
.getTitle().startsWith("[$]")) {
// Notice shown in place of the article text for subscriber-only stories.
35 String fullContent
= "[$] Sorry, this article is currently available to LWN subscribers only [https://lwn.net/subscribe/].";
36 story
.setFullContent(fullContent
);
// Empty list rather than null: there are no comments to show for the notice.
37 story
.setComments(new ArrayList
<Comment
>());
// Builds the list of (URL, String) entries for this support. A single entry
// is added: https://lwn.net/ paired with an empty string.
// Declared to throw IOException.
42 protected List
<Entry
<URL
, String
>> getUrls() throws IOException
{
43 List
<Entry
<URL
, String
>> urls
= new ArrayList
<Entry
<URL
, String
>>();
44 urls
.add(new AbstractMap
.SimpleEntry
<URL
, String
>(new URL(
45 "https://lwn.net/"), ""));
// Selects the candidate article elements of a page: every element of the
// document that carries the CSS class "pure-u-1".
50 protected List
<Element
> getArticles(Document doc
) {
51 return doc
.getElementsByClass("pure-u-1");
// Derives the article ID from the article's internal URL by removing every
// character that is not an ASCII digit (regex "[^0-9]").
55 protected String
getArticleId(Document doc
, Element article
) {
56 return getArticleIntUrl(doc
, article
).replaceAll("[^0-9]", "");
// Locates the article title: looks up the first element with the CSS class
// "Headline" inside the article element.
// NOTE(review): presumably the text of that element is what gets returned,
// with a fallback when no "Headline" element exists -- confirm.
60 protected String
getArticleTitle(Document doc
, Element article
) {
61 Element title
= article
.getElementsByClass("Headline").first();
// Extracts the author from the article's details line (as returned by
// getArticleDetailsReal): the author is the trimmed text that follows the
// first occurrence of " by ".
70 protected String
getArticleAuthor(Document doc
, Element article
) {
72 String details
= getArticleDetailsReal(article
);
73 int pos
= details
.indexOf(" by ");
// Skip past the " by " marker itself, keep the remainder.
75 author
= details
.substring(pos
+ " by ".length()).trim();
// Extracts the publication date from the article's details line (as returned
// by getArticleDetailsReal) in two steps: keep the text after the first
// " Posted ", then cut that text at its first " by ".
82 protected String
getArticleDate(Document doc
, Element article
) {
84 String details
= getArticleDetailsReal(article
);
85 int pos
= details
.indexOf(" Posted ");
// Step 1: everything after the " Posted " marker.
87 date
= details
.substring(pos
+ " Posted ".length()).trim();
// Step 2: drop the " by <author>" tail.
88 pos
= date
.indexOf(" by ");
90 date
= date
.substring(0, pos
).trim();
// Extracts the category from the article's details line (as returned by
// getArticleDetailsReal): the trimmed text between index 1 and the first "]".
// Starting at index 1 skips the first character of the line -- presumably an
// opening "[" (TODO confirm against real LWN markup).
98 protected String
getArticleCategory(Document doc
, Element article
,
99 String currentCategory
) {
101 String details
= getArticleDetailsReal(article
);
102 int pos
= details
.indexOf("]");
104 categ
= details
.substring(1, pos
).trim();
// Always returns an empty string: this support pulls the individual values
// (author, date, category) out of the details line itself instead of
// exposing the raw details.
111 protected String
getArticleDetails(Document doc
, Element article
) {
112 return ""; // We actually extract all the values
// Computes the internal (LWN) URL of an article from the <a> elements found
// inside it. Each link's href is resolved to an absolute URL and assigned to
// intUrl in turn, so the value from the last link is the one that remains.
// A "#Comments" fragment is then looked up and the URL is cut before it.
116 protected String
getArticleIntUrl(Document doc
, Element article
) {
118 for (Element idElem
: article
.getElementsByTag("a")) {
119 // Last link is the story link
120 intUrl
= idElem
.absUrl("href");
121 int pos
= intUrl
.indexOf("#Comments");
// Cutting at pos - 1 (not pos) also removes the character right before
// "#Comments" -- presumably a trailing "/" (TODO confirm).
123 intUrl
= intUrl
.substring(0, pos
- 1);
131 protected String
getArticleExtUrl(Document doc
, Element article
) {
// Builds the short content (blurb) of an article from its first
// "BlurbListing" element. When that element exists and has at least two
// children, the text of the children from index 1 up to (but excluding)
// size - 2 is appended to the content, each piece trimmed and preceded by a
// single space.
136 protected String
getArticleContent(Document doc
, Element article
) {
137 Element listing
= article
.getElementsByClass("BlurbListing").first();
138 if (listing
!= null && listing
.children().size() >= 2) {
141 // All but the first and two last children
142 for (int i
= 1; i
< listing
.children().size() - 2; i
++) {
143 Element e
= listing
.children().get(i
);
// Re-trim the accumulated text each round so pieces stay separated by
// exactly one space.
144 content
= content
.trim() + " " + e
.text().trim();
// Returns the element holding the full article text: the first element of
// the document with the CSS class "ArticleText" (jsoup's first() yields null
// when no element matches).
154 protected Element
getFullArticle(Document doc
) {
155 return doc
.getElementsByClass("ArticleText").first();
// Returns the top-level comment containers of a full article page: every
// element of the document with the CSS class "lwn-u-1". The intUrl argument
// is not used by this lookup.
159 protected List
<Element
> getFullArticleCommentPosts(Document doc
, URL intUrl
) {
160 return doc
.getElementsByClass("lwn-u-1");
// Creates the element processor used on full article text. It is an anonymous
// BasicElementProcessor whose ignoreNode inspects two kinds of nodes:
//  - Element nodes whose trimmed text is exactly "Log in";
//  - TextNode nodes whose trimmed text is exactly "(" or "to post comments)".
// NOTE(review): these three pieces look like the parts of a
// "(Log in to post comments)" footer; presumably matching nodes are reported
// as ignored -- confirm against the return statements.
164 protected ElementProcessor
getElementProcessorFullArticle() {
165 return new BasicElementProcessor() {
167 public boolean ignoreNode(Node node
) {
168 if (node
instanceof Element
) {
169 Element el
= (Element
) node
;
// The link part of the footer.
170 if ("Log in".equals(el
.text().trim())) {
173 } else if (node
instanceof TextNode
) {
174 TextNode text
= (TextNode
) node
;
175 String t
= text
.text().trim();
// The plain-text parts surrounding the link.
176 if (t
.equals("(") || t
.equals("to post comments)")) {
// Collects the comment elements nested directly under a container: when
// container is non-null, each of its direct children is examined in order and
// those with the CSS class "CommentBox" or "Comment" are added to the result.
// The list is created before the null check and returned at the end.
// Both class checks perform the same add; they are kept as separate branches.
187 protected List
<Element
> getCommentCommentPosts(Document doc
,
189 List
<Element
> commentElements
= new ArrayList
<Element
>();
190 if (container
!= null) {
191 for (Element possibleCommentElement
: container
.children()) {
192 if (possibleCommentElement
.hasClass("CommentBox")) {
193 commentElements
.add(possibleCommentElement
);
194 } else if (possibleCommentElement
.hasClass("Comment")) {
195 commentElements
.add(possibleCommentElement
);
200 return commentElements
;
204 protected String
getCommentId(Element post
) {
// Extracts the author of a comment from the text of its first
// "CommentPoster" element (skipped when there is no such element).
// The text after the last " by " is kept; if that remainder starts with
// "Posted ", the part following "Posted " is returned, trimmed.
// NOTE(review): getCommentDate treats the text *before* the last " by " as
// the date part, so it is worth verifying when the remainder after " by "
// can actually start with "Posted ".
209 protected String
getCommentAuthor(Element post
) {
210 Element detailsE
= post
.getElementsByClass("CommentPoster").first();
211 if (detailsE
!= null) {
212 String details
= detailsE
.text();
// lastIndexOf: use the final " by " in case the text contains several.
214 int pos
= details
.lastIndexOf(" by ");
216 details
= details
.substring(pos
+ " by ".length()).trim();
218 if (details
.startsWith("Posted ")) {
219 return details
.substring("Posted ".length()).trim();
// Locates the title of a comment: looks up the first element with the CSS
// class "CommentTitle" inside the comment post.
// NOTE(review): presumably the text of that element is returned, with a
// fallback when it is missing -- confirm.
228 protected String
getCommentTitle(Element post
) {
229 Element title
= post
.getElementsByClass("CommentTitle").first();
// Extracts the date part of a comment from the text of its first
// "CommentPoster" element (skipped when there is no such element): returns
// the trimmed text that precedes the last " by ".
238 protected String
getCommentDate(Element post
) {
239 Element detailsE
= post
.getElementsByClass("CommentPoster").first();
240 if (detailsE
!= null) {
241 String details
= detailsE
.text();
// lastIndexOf: use the final " by " in case the text contains several.
243 int pos
= details
.lastIndexOf(" by ");
245 return details
.substring(0, pos
).trim();
// Returns the element holding the body of a comment: the first element with
// the CSS class "CommentBody" inside the post (jsoup's first() yields null
// when no element matches).
253 protected Element
getCommentContentElement(Element post
) {
254 return post
.getElementsByClass("CommentBody").first();
// Creates the element processor used on comment bodies. It is an anonymous
// BasicElementProcessor customising three hooks:
//  - processText: strips leading ">" characters one at a time, trimming the
//    text after each removal;
//  - detectQuote: checks Element nodes for a "blockquote" tag name or the
//    CSS class "QuotedText";
//  - ignoreNode: checks Element nodes for the CSS class "CommentPoster".
// NOTE(review): presumably matching nodes are reported as quotes / ignored
// respectively -- confirm against the return statements.
258 protected ElementProcessor
getElementProcessorComment() {
259 return new BasicElementProcessor() {
261 public String
processText(String text
) {
// Loop so that repeated markers (">>", "> >") are all removed.
262 while (text
.startsWith(">")) { // comments
263 text
= text
.substring(1).trim();
270 public boolean detectQuote(Node node
) {
271 if (node
instanceof Element
) {
272 Element elementNode
= (Element
) node
;
273 if (elementNode
.tagName().equals("blockquote")
274 || elementNode
.hasClass("QuotedText")) {
283 public boolean ignoreNode(Node node
) {
284 if (node
instanceof Element
) {
285 Element elementNode
= (Element
) node
;
// The poster line is extracted separately (author/date), not as content.
286 if (elementNode
.hasClass("CommentPoster")) {
// Returns the raw details line of an article: the text of the first child of
// the article's first "BlurbListing" element. This is only done when the
// listing exists and has at least two children.
296 private String
getArticleDetailsReal(Element article
) {
297 Element listing
= article
.getElementsByClass("BlurbListing").first();
298 // Valid articles have 2+ listings
299 if (listing
!= null && listing
.children().size() >= 2) {
300 return listing
.children().get(0).text();