LWN.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.net.URL;
   5 import java.util.AbstractMap;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8 import java.util.Map.Entry;
   9
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.nodes.TextNode;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17
  18 /**
  19  * Support <a href='https://lwn.net/'>https://lwn.net/</a>.
  20  *
  21  * @author niki
  22  */
  23 public class LWN extends BasicSupport {
  24         @Override
  25         public String getDescription() {
  26                 return "LWN: Linux Weekly Newsletter";
  27         }
  28
  29         @Override
  30         public void fetch(Story story) throws IOException {
  31                 // Do not try the paid-for stories...
  32                 if (!story.getTitle().startsWith("[$]")) {
  33                         super.fetch(story);
  34                 } else {
  35                         String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
  36                         story.setFullContent(fullContent);
  37                         story.setComments(new ArrayList<Comment>());
  38                 }
  39         }
  40
  41         @Override
  42         protected List<Entry<URL, String>> getUrls() throws IOException {
  43                 List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
  44                 urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
  45                                 "https://lwn.net/"), ""));
  46                 return urls;
  47         }
  48
  49         @Override
  50         protected List<Element> getArticles(Document doc) {
  51                 return doc.getElementsByClass("pure-u-1");
  52         }
  53
  54         @Override
  55         protected String getArticleId(Document doc, Element article) {
  56                 return getArticleIntUrl(doc, article).replaceAll("[^0-9]", "");
  57         }
  58
  59         @Override
  60         protected String getArticleTitle(Document doc, Element article) {
  61                 Element title = article.getElementsByClass("Headline").first();
  62                 if (title != null) {
  63                         return title.text();
  64                 }
  65
  66                 return "";
  67         }
  68
  69         @Override
  70         protected String getArticleAuthor(Document doc, Element article) {
  71                 String author = "";
  72                 String details = getArticleDetailsReal(article);
  73                 int pos = details.indexOf(" by ");
  74                 if (pos >= 0) {
  75                         author = details.substring(pos + " by ".length()).trim();
  76                 }
  77
  78                 return author;
  79         }
  80
  81         @Override
  82         protected String getArticleDate(Document doc, Element article) {
  83                 String date = "";
  84                 String details = getArticleDetailsReal(article);
  85                 int pos = details.indexOf(" Posted ");
  86                 if (pos >= 0) {
  87                         date = details.substring(pos + " Posted ".length()).trim();
  88                         pos = date.indexOf(" by ");
  89                         if (pos >= 0) {
  90                                 date = date.substring(0, pos).trim();
  91                         }
  92                 }
  93
  94                 return date;
  95         }
  96
  97         @Override
  98         protected String getArticleCategory(Document doc, Element article,
  99                         String currentCategory) {
 100                 String categ = "";
 101                 String details = getArticleDetailsReal(article);
 102                 int pos = details.indexOf("]");
 103                 if (pos >= 0) {
 104                         categ = details.substring(1, pos).trim();
 105                 }
 106
 107                 return categ;
 108         }
 109
 110         @Override
 111         protected String getArticleDetails(Document doc, Element article) {
 112                 return ""; // We actually extract all the values
 113         }
 114
 115         @Override
 116         protected String getArticleIntUrl(Document doc, Element article) {
 117                 String intUrl = "";
 118                 for (Element idElem : article.getElementsByTag("a")) {
 119                         // Last link is the story link
 120                         intUrl = idElem.absUrl("href");
 121                         int pos = intUrl.indexOf("#Comments");
 122                         if (pos >= 0) {
 123                                 intUrl = intUrl.substring(0, pos - 1);
 124                         }
 125                 }
 126
 127                 return intUrl;
 128         }
 129
 130         @Override
 131         protected String getArticleExtUrl(Document doc, Element article) {
 132                 return "";
 133         }
 134
 135         @Override
 136         protected String getArticleContent(Document doc, Element article) {
 137                 Element listing = article.getElementsByClass("BlurbListing").first();
 138                 if (listing != null && listing.children().size() >= 2) {
 139                         String content = "";
 140
 141                         // All but the first and two last children
 142                         for (int i = 1; i < listing.children().size() - 2; i++) {
 143                                 Element e = listing.children().get(i);
 144                                 content = content.trim() + " " + e.text().trim();
 145                         }
 146
 147                         return content;
 148                 }
 149
 150                 return "";
 151         }
 152
 153         @Override
 154         protected Element getFullArticle(Document doc) {
 155                 return doc.getElementsByClass("ArticleText").first();
 156         }
 157
 158         @Override
 159         protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
 160                 return doc.getElementsByClass("lwn-u-1");
 161         }
 162
 163         @Override
 164         protected ElementProcessor getElementProcessorFullArticle() {
 165                 return new BasicElementProcessor() {
 166                         @Override
 167                         public boolean ignoreNode(Node node) {
 168                                 if (node instanceof Element) {
 169                                         Element el = (Element) node;
 170                                         if ("Log in".equals(el.text().trim())) {
 171                                                 return true;
 172                                         }
 173                                 } else if (node instanceof TextNode) {
 174                                         TextNode text = (TextNode) node;
 175                                         String t = text.text().trim();
 176                                         if (t.equals("(") || t.equals("to post comments)")) {
 177                                                 return true;
 178                                         }
 179                                 }
 180
 181                                 return false;
 182                         }
 183                 };
 184         }
 185
 186         @Override
 187         protected List<Element> getCommentCommentPosts(Document doc,
 188                         Element container) {
 189                 List<Element> commentElements = new ArrayList<Element>();
 190                 if (container != null) {
 191                         for (Element possibleCommentElement : container.children()) {
 192                                 if (possibleCommentElement.hasClass("CommentBox")) {
 193                                         commentElements.add(possibleCommentElement);
 194                                 } else if (possibleCommentElement.hasClass("Comment")) {
 195                                         commentElements.add(possibleCommentElement);
 196                                 }
 197                         }
 198                 }
 199
 200                 return commentElements;
 201         }
 202
 203         @Override
 204         protected String getCommentId(Element post) {
 205                 return post.id();
 206         }
 207
 208         @Override
 209         protected String getCommentAuthor(Element post) {
 210                 Element detailsE = post.getElementsByClass("CommentPoster").first();
 211                 if (detailsE != null) {
 212                         String details = detailsE.text();
 213
 214                         int pos = details.lastIndexOf(" by ");
 215                         if (pos >= 0) {
 216                                 details = details.substring(pos + " by ".length()).trim();
 217
 218                                 if (details.startsWith("Posted ")) {
 219                                         return details.substring("Posted ".length()).trim();
 220                                 }
 221                         }
 222                 }
 223
 224                 return "";
 225         }
 226
 227         @Override
 228         protected String getCommentTitle(Element post) {
 229                 Element title = post.getElementsByClass("CommentTitle").first();
 230                 if (title != null) {
 231                         return title.text();
 232                 }
 233
 234                 return "";
 235         }
 236
 237         @Override
 238         protected String getCommentDate(Element post) {
 239                 Element detailsE = post.getElementsByClass("CommentPoster").first();
 240                 if (detailsE != null) {
 241                         String details = detailsE.text();
 242
 243                         int pos = details.lastIndexOf(" by ");
 244                         if (pos >= 0) {
 245                                 return details.substring(0, pos).trim();
 246                         }
 247                 }
 248
 249                 return "";
 250         }
 251
 252         @Override
 253         protected Element getCommentContentElement(Element post) {
 254                 return post.getElementsByClass("CommentBody").first();
 255         }
 256
 257         @Override
 258         protected ElementProcessor getElementProcessorComment() {
 259                 return new BasicElementProcessor() {
 260                         @Override
 261                         public String processText(String text) {
 262                                 while (text.startsWith(">")) { // comments
 263                                         text = text.substring(1).trim();
 264                                 }
 265
 266                                 return text;
 267                         }
 268
 269                         @Override
 270                         public boolean detectQuote(Node node) {
 271                                 if (node instanceof Element) {
 272                                         Element elementNode = (Element) node;
 273                                         if (elementNode.tagName().equals("blockquote")
 274                                                         || elementNode.hasClass("QuotedText")) {
 275                                                 return true;
 276                                         }
 277                                 }
 278
 279                                 return false;
 280                         }
 281
 282                         @Override
 283                         public boolean ignoreNode(Node node) {
 284                                 if (node instanceof Element) {
 285                                         Element elementNode = (Element) node;
 286                                         if (elementNode.hasClass("CommentPoster")) {
 287                                                 return true;
 288                                         }
 289                                 }
 290
 291                                 return false;
 292                         }
 293                 };
 294         }
 295
 296         private String getArticleDetailsReal(Element article) {
 297                 Element listing = article.getElementsByClass("BlurbListing").first();
 298                 // Valid articles have 2+ listings
 299                 if (listing != null && listing.children().size() >= 2) {
 300                         return listing.children().get(0).text();
 301                 }
 302
 303                 return "";
 304         }
 305 }