Fix subtitles and too much content in EreNumerique
[gofetch.git] / src / be / nikiroo / gofetch / support / EreNumerique.java
... / ...
CommitLineData
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
6import java.util.ArrayList;
7import java.util.List;
8
9import org.jsoup.helper.DataUtil;
10import org.jsoup.nodes.Document;
11import org.jsoup.nodes.Element;
12import org.jsoup.nodes.Node;
13import org.jsoup.select.Elements;
14
15import be.nikiroo.gofetch.data.Comment;
16import be.nikiroo.gofetch.data.Story;
17import be.nikiroo.utils.StringUtils;
18
19/**
20 * Support <a
21 * href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
22 *
23 * @author niki
24 */
25public class EreNumerique extends BasicSupport {
26 @Override
27 public String getDescription() {
28 return "Ère Numérique.FR: faites le bon choix !";
29 }
30
31 @Override
32 public List<Story> list() throws IOException {
33 List<Story> list = new ArrayList<Story>();
34
35 for (String categ : new String[] { "informatique" }) {
36 URL url = new URL("https://www.erenumerique.fr/" + categ);
37 InputStream in = downloader.open(url);
38 Document doc = DataUtil.load(in, "UTF-8", url.toString());
39 Elements articles = doc.getElementsByClass("item-details");
40 for (Element article : articles) {
41 String id = "";
42 String intUrl = "";
43 String extUrl = ""; // nope
44 String title = "";
45 String date = "";
46 String author = "";
47 String details = "";
48 String body = "";
49
50 // MUST NOT fail:
51 Element dateElement = article //
52 .getElementsByTag("time").first();
53 if (dateElement == null) {
54 continue;
55 }
56
57 Element urlElement = article.getElementsByTag("a").first();
58 if (urlElement != null) {
59 intUrl = urlElement.absUrl("href");
60 }
61
62 id = dateElement.attr("datetime").replace(":", "_")
63 .replace("+", "_");
64 date = date(dateElement.attr("datetime"));
65
66 Element titleElement = article.getElementsByTag("h2").first();
67 if (titleElement != null) {
68 title = StringUtils.unhtml(titleElement.text()).trim();
69 }
70
71 Element authorElement = article.getElementsByClass(
72 "td-post-author-name").first();
73 if (authorElement != null) {
74 authorElement = authorElement.getElementsByTag("a").first();
75 }
76 if (authorElement != null) {
77 author = StringUtils.unhtml(authorElement.text()).trim();
78 }
79
80 Element contentElement = article.getElementsByClass(
81 "td-excerpt").first();
82 if (contentElement != null) {
83 body = StringUtils.unhtml(contentElement.text()).trim();
84 }
85
86 list.add(new Story(getType(), id, title, author, date, categ,
87 details, intUrl, extUrl, body));
88 }
89 }
90
91 return list;
92 }
93
94 @Override
95 public void fetch(Story story) throws IOException {
96 String fullContent = story.getContent();
97
98 URL url = new URL(story.getUrlInternal());
99 InputStream in = downloader.open(url);
100 try {
101 Document doc = DataUtil.load(in, "UTF-8", url.toString());
102 Element article = doc.getElementsByTag("article").first();
103 if (article != null) {
104 article = article.getElementsByAttributeValue("itemprop",
105 "articleBody").first();
106 }
107 if (article != null) {
108 for (String line : toLines(article,
109 new BasicElementProcessor() {
110 @Override
111 public boolean ignoreNode(Node node) {
112 return node.attr("class").contains("chapo");
113 }
114
115 @Override
116 public String isSubtitle(Node node) {
117 if (node instanceof Element) {
118 Element element = (Element) node;
119 if (element.tagName().startsWith("h")
120 && element.tagName().length() == 2) {
121 return element.text();
122 }
123 }
124 return null;
125 }
126 })) {
127 fullContent += line + "\n";
128 }
129
130 // Content is too tight with a single break per line:
131 fullContent = fullContent.replace("\n", "\n\n") //
132 .replace("\n\n\n\n", "\n\n") //
133 .replace("\n\n\n\n", "\n\n") //
134 .trim();
135 }
136
137 // Get comments URL then parse it, if possible
138 Element posts = doc.getElementsByClass("comment-list").first();
139
140 story.setFullContent(fullContent);
141 story.setComments(getComments(posts));
142 } finally {
143 if (in != null) {
144 in.close();
145 }
146 }
147 }
148
149 private List<Comment> getComments(Element posts) {
150 List<Comment> comments = new ArrayList<Comment>();
151 if (posts != null) {
152 for (Element post : posts.children()) {
153 if (!post.hasClass("comment")) {
154 continue;
155 }
156
157 String id = "";
158 String author = "";
159 String title = "";
160 String date = "";
161 List<String> content = new ArrayList<String>();
162
163 Element authorE = post.getElementsByTag("footer").first();
164 if (authorE != null) {
165 authorE = authorE.getElementsByTag("cite").first();
166 }
167 if (authorE != null) {
168 author = StringUtils.unhtml(authorE.text()).trim();
169 }
170
171 Element idE = post.getElementsByTag("a").first();
172 if (idE != null) {
173 id = idE.attr("id");
174 Element dateE = idE.getElementsByTag("span").first();
175 if (dateE != null) {
176 date = date(dateE.attr("data-epoch"));
177 }
178 }
179
180 Element contentE = post.getElementsByClass("comment-content")
181 .first();
182 if (contentE != null) {
183 for (String line : toLines(contentE,
184 new BasicElementProcessor() {
185 @Override
186 public boolean ignoreNode(Node node) {
187 // TODO: ignore headlines/pub
188 if (node instanceof Element) {
189 Element el = (Element) node;
190 if ("h4".equals(el.tagName())) {
191 return true;
192 }
193 }
194
195 return false;
196 }
197 })) {
198 content.add(line);
199 }
200 }
201
202 // Since we have no title but still an author, let's switch:
203 title = author;
204 author = "";
205 Comment comment = new Comment(id, author, title, date, content);
206 comments.add(comment);
207
208 Element children = post.getElementsByClass("children").first();
209 comment.addAll(getComments(children));
210 }
211 }
212
213 return comments;
214 }
215}