src/be/nikiroo/gofetch/support/LeMonde.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8
   9 import org.jsoup.helper.DataUtil;
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.select.Elements;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17
  18 /**
  19  * Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
  20  *
  21  * @author niki
  22  */
  23 public class LeMonde extends BasicSupport {
  24         @Override
  25         public String getDescription() {
  26                 return "Le Monde: Actualités et Infos en France et dans le monde";
  27         }
  28
  29         @Override
  30         public List<Story> list() throws IOException {
  31                 List<Story> list = new ArrayList<Story>();
  32
  33                 for (String topic : new String[] { "international", "politique",
  34                                 "societe", "sciences" }) {
  35                         URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
  36                         InputStream in = downloader.open(url);
  37                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  38                         Elements articles = doc.getElementsByTag("article");
  39                         for (Element article : articles) {
  40                                 Elements times = article.getElementsByTag("time");
  41                                 Elements titleElements = article.getElementsByTag("h3");
  42                                 Elements contentElements = article.getElementsByClass("txt3");
  43                                 if (times.size() > 0 && titleElements.size() > 0
  44                                                 && contentElements.size() > 0) {
  45                                         String id = times.get(0).attr("datetime").replace(":", "_")
  46                                                         .replace("+", "_");
  47                                         String title = titleElements.get(0).text();
  48                                         String date = date(titleElements.get(0).text());
  49                                         String content = contentElements.get(0).text();
  50                                         String intUrl = "";
  51                                         String extUrl = "";
  52                                         String author = "";
  53                                         String details = "";
  54
  55                                         Elements detailsElements = article
  56                                                         .getElementsByClass("signature");
  57                                         if (detailsElements.size() > 0) {
  58                                                 author = detailsElements.get(0).text();
  59                                         }
  60
  61                                         Elements links = titleElements.get(0).getElementsByTag("a");
  62                                         if (links.size() > 0) {
  63                                                 intUrl = links.get(0).absUrl("href");
  64                                                 list.add(new Story(getType(), id, title, author, date,
  65                                                                 topic, details, intUrl, extUrl, content));
  66                                         }
  67                                 }
  68                         }
  69                 }
  70
  71                 return list;
  72         }
  73
  74         @Override
  75         public void fetch(Story story) throws IOException {
  76                 String fullContent = story.getContent();
  77                 List<Comment> comments = new ArrayList<Comment>();
  78
  79                 // Note: no comments on this site as far as I can see (or maybe with
  80                 // some javascript, I need to check...)
  81
  82                 URL url = new URL(story.getUrlInternal());
  83                 InputStream in = downloader.open(url);
  84                 Document doc = DataUtil.load(in, "UTF-8", url.toString());
  85                 Element article = doc.getElementById("articleBody");
  86                 if (article != null) {
  87                         for (String line : toLines(article, new BasicElementProcessor() {
  88                                 @Override
  89                                 public boolean ignoreNode(Node node) {
  90                                         if (node instanceof Element) {
  91                                                 Element element = (Element) node;
  92                                                 if (element.hasClass("lire")) {
  93                                                         return true;
  94                                                 }
  95                                         }
  96
  97                                         return false;
  98                                 }
  99
 100                                 @Override
 101                                 public String manualProcessing(Node node) {
 102                                         if (node instanceof Element) {
 103                                                 Element element = (Element) node;
 104                                                 if (element.hasClass("intertitre")) {
 105                                                         return "\n[ " + element.text() + " ]\n";
 106                                                 }
 107                                         }
 108                                         return null;
 109                                 }
 110                         })) {
 111                                 fullContent += line + "\n";
 112                         }
 113
 114                         // Content is too tight with a single break per line:
 115                         fullContent = fullContent.replace("\n", "\n\n") //
 116                                         .replace("\n\n\n\n", "\n\n") //
 117                                         .replace("\n\n\n\n", "\n\n") //
 118                                         .trim();
 119                 }
 120
 121                 story.setFullContent(fullContent);
 122                 story.setComments(comments);
 123         }
 124 }