4ec2c30f39a6fcb0b344ca4dd387d629f35c8dc0
[gofetch.git] / src / be / nikiroo / gofetch / support / LeMonde.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.util.ArrayList;
7 import java.util.List;
8
9 import org.jsoup.helper.DataUtil;
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.select.Elements;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17
18 /**
19 * Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
20 *
21 * @author niki
22 */
23 public class LeMonde extends BasicSupport {
24 @Override
25 public String getDescription() {
26 return "Le Monde: Actualités et Infos en France et dans le monde";
27 }
28
29 @Override
30 public List<Story> list() throws IOException {
31 List<Story> list = new ArrayList<Story>();
32
33 for (String topic : new String[] { "international", "politique",
34 "societe", "sciences" }) {
35 URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
36 InputStream in = downloader.open(url);
37 Document doc = DataUtil.load(in, "UTF-8", url.toString());
38 Elements articles = doc.getElementsByTag("article");
39 for (Element article : articles) {
40 Elements times = article.getElementsByTag("time");
41 Elements titleElements = article.getElementsByTag("h3");
42 Elements contentElements = article.getElementsByClass("txt3");
43 if (times.size() > 0 && titleElements.size() > 0
44 && contentElements.size() > 0) {
45 String id = times.get(0).attr("datetime").replace(":", "_")
46 .replace("+", "_");
47 String title = titleElements.get(0).text();
48 String date = date(titleElements.get(0).text());
49 String content = contentElements.get(0).text();
50 String intUrl = "";
51 String extUrl = "";
52 String author = "";
53 String details = "";
54
55 Elements detailsElements = article
56 .getElementsByClass("signature");
57 if (detailsElements.size() > 0) {
58 author = detailsElements.get(0).text();
59 }
60
61 Elements links = titleElements.get(0).getElementsByTag("a");
62 if (links.size() > 0) {
63 intUrl = links.get(0).absUrl("href");
64 list.add(new Story(getType(), id, title, author, date,
65 topic, details, intUrl, extUrl, content));
66 }
67 }
68 }
69 }
70
71 return list;
72 }
73
74 @Override
75 public void fetch(Story story) throws IOException {
76 String fullContent = story.getContent();
77 List<Comment> comments = new ArrayList<Comment>();
78
79 // Note: no comments on this site as far as I can see (or maybe with
80 // some javascript, I need to check...)
81
82 URL url = new URL(story.getUrlInternal());
83 InputStream in = downloader.open(url);
84 Document doc = DataUtil.load(in, "UTF-8", url.toString());
85 Element article = doc.getElementById("articleBody");
86 if (article != null) {
87 for (String line : toLines(article, new BasicElementProcessor() {
88 @Override
89 public boolean ignoreNode(Node node) {
90 if (node instanceof Element) {
91 Element element = (Element) node;
92 if (element.hasClass("lire")) {
93 return true;
94 }
95 }
96
97 return false;
98 }
99
100 @Override
101 public String manualProcessing(Node node) {
102 if (node instanceof Element) {
103 Element element = (Element) node;
104 if (element.hasClass("intertitre")) {
105 return "\n[ " + element.text() + " ]\n";
106 }
107 }
108 return null;
109 }
110 })) {
111 fullContent += line + "\n";
112 }
113
114 // Content is too tight with a single break per line:
115 fullContent = fullContent.replace("\n", "\n\n") //
116 .replace("\n\n\n\n", "\n\n") //
117 .replace("\n\n\n\n", "\n\n") //
118 .trim();
119 }
120
121 story.setFullContent(fullContent);
122 story.setComments(comments);
123 }
124 }