src/be/nikiroo/gofetch/support/LeMonde.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8
   9 import org.jsoup.helper.DataUtil;
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.select.Elements;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17
  18 public class LeMonde extends BasicSupport {
  19         @Override
  20         public String getDescription() {
  21                 return "Le Monde: Actualités et Infos en France et dans le monde";
  22         }
  23
  24         @Override
  25         public List<Story> list() throws IOException {
  26                 List<Story> list = new ArrayList<Story>();
  27
  28                 for (String topic : new String[] { "international", "politique",
  29                                 "societe", "sciences" }) {
  30                         URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
  31                         InputStream in = open(url);
  32                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  33                         Elements articles = doc.getElementsByTag("article");
  34                         for (Element article : articles) {
  35                                 Elements times = article.getElementsByTag("time");
  36                                 Elements titleElements = article.getElementsByTag("h3");
  37                                 Elements contentElements = article.getElementsByClass("txt3");
  38                                 if (times.size() > 0 && titleElements.size() > 0
  39                                                 && contentElements.size() > 0) {
  40                                         String id = times.get(0).attr("datetime").replace(":", "_")
  41                                                         .replace("+", "_");
  42                                         String title = "[" + topic + "] "
  43                                                         + titleElements.get(0).text();
  44                                         String content = contentElements.get(0).text();
  45                                         String intUrl = "";
  46                                         String extUrl = "";
  47                                         String details = "";
  48
  49                                         Elements detailsElements = article
  50                                                         .getElementsByClass("signature");
  51                                         if (detailsElements.size() > 0) {
  52                                                 details = detailsElements.get(0).text();
  53                                         }
  54
  55                                         Elements links = titleElements.get(0).getElementsByTag("a");
  56                                         if (links.size() > 0) {
  57                                                 intUrl = links.get(0).absUrl("href");
  58                                                 list.add(new Story(getType(), id, title, details,
  59                                                                 intUrl, extUrl, content));
  60                                         }
  61                                 }
  62                         }
  63                 }
  64
  65                 return list;
  66         }
  67
  68         @Override
  69         public void fetch(Story story) throws IOException {
  70                 String fullContent = story.getContent();
  71                 List<Comment> comments = new ArrayList<Comment>();
  72
  73                 // Note: no comments on this site as far as I can see (or maybe with
  74                 // some javascript, I need to check...)
  75
  76                 URL url = new URL(story.getUrlInternal());
  77                 InputStream in = open(url);
  78                 Document doc = DataUtil.load(in, "UTF-8", url.toString());
  79                 Element article = doc.getElementById("articleBody");
  80                 if (article != null) {
  81                         for (String line : toLines(article, new QuoteProcessor() {
  82                                 @Override
  83                                 public String processText(String text) {
  84                                         return text;
  85                                 }
  86
  87                                 @Override
  88                                 public boolean ignoreNode(Node node) {
  89                                         if (node instanceof Element) {
  90                                                 Element element = (Element) node;
  91                                                 if (element.hasClass("lire")) {
  92                                                         return true;
  93                                                 }
  94                                         }
  95
  96                                         return false;
  97                                 }
  98
  99                                 @Override
 100                                 public boolean detectQuote(Node node) {
 101                                         return false;
 102                                 }
 103
 104                                 @Override
 105                                 public String manualProcessing(Node node) {
 106                                         if (node instanceof Element) {
 107                                                 Element element = (Element) node;
 108                                                 if (element.hasClass("intertitre")) {
 109                                                         return "\n[ " + element.text() + " ]\n";
 110                                                 }
 111                                         }
 112                                         return null;
 113                                 }
 114                         })) {
 115                                 fullContent += line + "\n";
 116                         }
 117
 118                         // Content is too tight with a single break per line:
 119                         fullContent = fullContent.replace("\n", "\n\n") //
 120                                         .replace("\n\n\n\n", "\n\n") //
 121                                         .replace("\n\n\n\n", "\n\n") //
 122                                         .trim();
 123                 }
 124
 125                 story.setFullContent(fullContent);
 126                 story.setComments(comments);
 127         }
 128 }