New website supported: Ère Numérique FR
[gofetch.git] / src / be / nikiroo / gofetch / support / EreNumerique.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.util.ArrayList;
7 import java.util.List;
8
9 import org.jsoup.helper.DataUtil;
10 import org.jsoup.nodes.Document;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.select.Elements;
14
15 import be.nikiroo.gofetch.data.Comment;
16 import be.nikiroo.gofetch.data.Story;
17 import be.nikiroo.utils.StringUtils;
18
19 /**
20 * Support <a
21 * href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
22 *
23 * @author niki
24 */
25 public class EreNumerique extends BasicSupport {
26 @Override
27 public String getDescription() {
28 return "Ère Numérique.FR: faites le bon choix !";
29 }
30
31 @Override
32 public List<Story> list() throws IOException {
33 List<Story> list = new ArrayList<Story>();
34
35 for (String categ : new String[] { "informatique" }) {
36 URL url = new URL("https://www.erenumerique.fr/" + categ);
37 InputStream in = downloader.open(url);
38 Document doc = DataUtil.load(in, "UTF-8", url.toString());
39 Elements articles = doc.getElementsByClass("item-details");
40 for (Element article : articles) {
41 String id = "";
42 String intUrl = "";
43 String extUrl = ""; // nope
44 String title = "";
45 String date = "";
46 String author = "";
47 String details = "";
48 String body = "";
49
50 // MUST NOT fail:
51 Element dateElement = article //
52 .getElementsByTag("time").first();
53 if (dateElement == null) {
54 continue;
55 }
56
57 Element urlElement = article.getElementsByTag("a").first();
58 if (urlElement != null) {
59 intUrl = urlElement.absUrl("href");
60 }
61
62 id = dateElement.attr("datetime").replace(":", "_")
63 .replace("+", "_");
64 date = date(dateElement.attr("datetime"));
65
66 Element titleElement = article.getElementsByTag("h2").first();
67 if (titleElement != null) {
68 title = StringUtils.unhtml(titleElement.text()).trim();
69 }
70
71 Element authorElement = article.getElementsByClass(
72 "td-post-author-name").first();
73 if (authorElement != null) {
74 authorElement = authorElement.getElementsByTag("a").first();
75 }
76 if (authorElement != null) {
77 author = StringUtils.unhtml(authorElement.text()).trim();
78 }
79
80 Element contentElement = article.getElementsByClass(
81 "td-excerpt").first();
82 if (contentElement != null) {
83 body = StringUtils.unhtml(contentElement.text()).trim();
84 }
85
86 list.add(new Story(getType(), id, title, author, date, categ,
87 details, intUrl, extUrl, body));
88 }
89 }
90
91 return list;
92 }
93
94 @Override
95 public void fetch(Story story) throws IOException {
96 String fullContent = story.getContent();
97
98 URL url = new URL(story.getUrlInternal());
99 InputStream in = downloader.open(url);
100 try {
101 Document doc = DataUtil.load(in, "UTF-8", url.toString());
102 Element article = doc.getElementsByTag("article").first();
103 if (article != null) {
104 for (String line : toLines(article,
105 new BasicElementProcessor() {
106 // TODO: ignore headlines/pub
107 })) {
108 fullContent += line + "\n";
109 }
110
111 // Content is too tight with a single break per line:
112 fullContent = fullContent.replace("\n", "\n\n") //
113 .replace("\n\n\n\n", "\n\n") //
114 .replace("\n\n\n\n", "\n\n") //
115 .trim();
116 }
117
118 // Get comments URL then parse it, if possible
119 Element posts = doc.getElementsByClass("comment-list").first();
120
121 story.setFullContent(fullContent);
122 story.setComments(getComments(posts));
123 } finally {
124 if (in != null) {
125 in.close();
126 }
127 }
128 }
129
130 private List<Comment> getComments(Element posts) {
131 List<Comment> comments = new ArrayList<Comment>();
132 if (posts != null) {
133 for (Element post : posts.children()) {
134 if (!post.hasClass("comment")) {
135 continue;
136 }
137
138 String id = "";
139 String author = "";
140 String title = "";
141 String date = "";
142 List<String> content = new ArrayList<String>();
143
144 Element authorE = post.getElementsByTag("footer").first();
145 if (authorE != null) {
146 authorE = authorE.getElementsByTag("cite").first();
147 }
148 if (authorE != null) {
149 author = StringUtils.unhtml(authorE.text()).trim();
150 }
151
152 Element idE = post.getElementsByTag("a").first();
153 if (idE != null) {
154 id = idE.attr("id");
155 Element dateE = idE.getElementsByTag("span").first();
156 if (dateE != null) {
157 date = date(dateE.attr("data-epoch"));
158 }
159 }
160
161 Element contentE = post.getElementsByClass("comment-content")
162 .first();
163 if (contentE != null) {
164 for (String line : toLines(contentE,
165 new BasicElementProcessor() {
166 @Override
167 public boolean ignoreNode(Node node) {
168 // TODO: ignore headlines/pub
169 if (node instanceof Element) {
170 Element el = (Element) node;
171 if ("h4".equals(el.tagName())) {
172 return true;
173 }
174 }
175
176 return false;
177 }
178 })) {
179 content.add(line);
180 }
181 }
182
183 // Since we have no title but still an author, let's switch:
184 title = author;
185 author = "";
186 Comment comment = new Comment(id, author, title, date, content);
187 comments.add(comment);
188
189 Element children = post.getElementsByClass("children").first();
190 comment.addAll(getComments(children));
191 }
192 }
193
194 return comments;
195 }
196 }