4e22b4c0a9c8f6fdc93db8a09984f45b3020594a
[gofetch.git] / src / be / nikiroo / gofetch / support / LeMonde.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.util.ArrayList;
7 import java.util.List;
8
9 import org.jsoup.helper.DataUtil;
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.select.Elements;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17
18 public class LeMonde extends BasicSupport {
19 @Override
20 public String getDescription() {
21 return "Le Monde: Actualités et Infos en France et dans le monde";
22 }
23
24 @Override
25 public List<Story> list() throws IOException {
26 List<Story> list = new ArrayList<Story>();
27
28 for (String topic : new String[] { "international", "politique",
29 "societe", "sciences" }) {
30 URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
31 InputStream in = open(url);
32 Document doc = DataUtil.load(in, "UTF-8", url.toString());
33 Elements articles = doc.getElementsByTag("article");
34 for (Element article : articles) {
35 Elements times = article.getElementsByTag("time");
36 Elements titleElements = article.getElementsByTag("h3");
37 Elements contentElements = article.getElementsByClass("txt3");
38 if (times.size() > 0 && titleElements.size() > 0
39 && contentElements.size() > 0) {
40 String id = times.get(0).attr("datetime").replace(":", "_");
41 String title = "[" + topic + "] "
42 + titleElements.get(0).text();
43 String content = contentElements.get(0).text();
44 String intUrl = "";
45 String extUrl = "";
46 String details = "";
47
48 Elements detailsElements = article
49 .getElementsByClass("signature");
50 if (detailsElements.size() > 0) {
51 details = detailsElements.get(0).text();
52 }
53
54 Elements links = titleElements.get(0).getElementsByTag("a");
55 if (links.size() > 0) {
56 intUrl = links.get(0).absUrl("href");
57 list.add(new Story(getType(), id, title, details,
58 intUrl, extUrl, content));
59 }
60 }
61 }
62 }
63
64 return list;
65 }
66
67 @Override
68 public void fetch(Story story) throws IOException {
69 String fullContent = story.getContent();
70 List<Comment> comments = new ArrayList<Comment>();
71
72 // Note: no comments on this site as far as I can see (or maybe with
73 // some javascript, I need to check...)
74
75 URL url = new URL(story.getUrlInternal());
76 InputStream in = open(url);
77 Document doc = DataUtil.load(in, "UTF-8", url.toString());
78 Element article = doc.getElementById("articleBody");
79 if (article != null) {
80 for (String line : toLines(article, new QuoteProcessor() {
81 @Override
82 public String processText(String text) {
83 return text;
84 }
85
86 @Override
87 public boolean ignoreNode(Node node) {
88 if (node instanceof Element) {
89 Element element = (Element) node;
90 if (element.hasClass("lire")) {
91 return true;
92 }
93 }
94
95 return false;
96 }
97
98 @Override
99 public boolean detectQuote(Node node) {
100 return false;
101 }
102
103 @Override
104 public String manualProcessing(Node node) {
105 if (node instanceof Element) {
106 Element element = (Element) node;
107 if (element.hasClass("intertitre")) {
108 return "\n[ " + element.text() + " ]\n";
109 }
110 }
111 return null;
112 }
113 })) {
114 fullContent += line + "\n";
115 }
116
117 // Content is too tight with a single break per line:
118 fullContent = fullContent.replace("\n", "\n\n") //
119 .replace("\n\n\n\n", "\n\n") //
120 .replace("\n\n\n\n", "\n\n") //
121 .trim();
122 }
123
124 story.setFullContent(fullContent);
125 story.setComments(comments);
126 }
127 }