1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.util
.ArrayList
;
9 import org
.jsoup
.helper
.DataUtil
;
10 import org
.jsoup
.nodes
.Document
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.select
.Elements
;
15 import be
.nikiroo
.gofetch
.data
.Comment
;
16 import be
.nikiroo
.gofetch
.data
.Story
;
17 import be
.nikiroo
.utils
.StringUtils
;
/**
 * Support for the website <a
 * href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
 */
25 public class EreNumerique
extends BasicSupport
{
27 public String
getDescription() {
28 return "Ère Numérique.FR: faites le bon choix !";
/**
 * Builds the list of stories found on the category pages of
 * https://www.erenumerique.fr/.
 * <p>
 * For each category (only "informatique" here) the page
 * "https://www.erenumerique.fr/" + category is opened through
 * <code>downloader</code>, parsed as UTF-8 with jsoup, and every element of
 * class "item-details" is turned into one {@link Story}.
 * <p>
 * NOTE(review): the leading numbers look like original line numbers and
 * several are skipped (e.g. 43 to 51, 53 to 57), so the declarations of
 * id/intUrl/title/date/author/details/body and the bodies of some branches
 * are not visible in this excerpt.
 *
 * @return the stories collected from all the category pages
 *
 * @throws IOException
 *             declared by this method; presumably raised by the URL
 *             creation, download or parsing -- confirm against the helpers
 */
32 public List
<Story
> list() throws IOException
{
33 List
<Story
> list
= new ArrayList
<Story
>();
// The category name is both the last path segment of the page URL and the
// category passed to the Story constructor below.
35 for (String categ
: new String
[] { "informatique" }) {
36 URL url
= new URL("https://www.erenumerique.fr/" + categ
);
// NOTE(review): no close() of this stream is visible in this excerpt --
// confirm it is released in the lines not shown.
37 InputStream in
= downloader
.open(url
);
// The page URL is also given to the parser as base URI, which is what lets
// absUrl("href") below produce absolute links.
38 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// One "item-details" element per article teaser.
39 Elements articles
= doc
.getElementsByClass("item-details");
40 for (Element article
: articles
) {
43 String extUrl
= ""; // nope
// The first <time> element supplies both the id and the date of the story.
51 Element dateElement
= article
//
52 .getElementsByTag("time").first();
// first() yields null when there is no <time> tag; what happens in that
// case is in lines not shown here (presumably the article is skipped).
53 if (dateElement
== null) {
// Internal URL: absolute form of the href of the first <a> in the teaser.
57 Element urlElement
= article
.getElementsByTag("a").first();
58 if (urlElement
!= null) {
59 intUrl
= urlElement
.absUrl("href");
// The id is derived from the "datetime" attribute with ':' replaced by '_'
// (the end of this expression is not visible in this excerpt).
62 id
= dateElement
.attr("datetime").replace(":", "_")
// date(...) is a helper defined outside this excerpt.
64 date
= date(dateElement
.attr("datetime"));
// Title: text of the first <h2>, passed through unhtml() and trimmed.
66 Element titleElement
= article
.getElementsByTag("h2").first();
67 if (titleElement
!= null) {
68 title
= StringUtils
.unhtml(titleElement
.text()).trim();
// Author: first <a> inside the first "td-post-author-name" element.
71 Element authorElement
= article
.getElementsByClass(
72 "td-post-author-name").first();
73 if (authorElement
!= null) {
74 authorElement
= authorElement
.getElementsByTag("a").first();
// Checked again because the inner lookup above may have found no <a>.
76 if (authorElement
!= null) {
77 author
= StringUtils
.unhtml(authorElement
.text()).trim();
// Body: text of the first "td-excerpt" element.
80 Element contentElement
= article
.getElementsByClass(
81 "td-excerpt").first();
82 if (contentElement
!= null) {
83 body
= StringUtils
.unhtml(contentElement
.text()).trim();
// Register the story; extUrl is always the empty string for this site.
86 list
.add(new Story(getType(), id
, title
, author
, date
, categ
,
87 details
, intUrl
, extUrl
, body
));
/**
 * Completes the given story with its full content and its comments.
 * <p>
 * The page at the story's internal URL is downloaded and parsed as UTF-8;
 * the lines extracted from its first &lt;article&gt; element are appended
 * to the content the story already has, and the first "comment-list"
 * element is converted into comments.
 * <p>
 * NOTE(review): the leading numbers look like original line numbers and
 * some are skipped (e.g. 99 to 101, 114 to 118), so part of the method is
 * not visible in this excerpt.
 *
 * @param story
 *            the story to complete; its internal URL and current content
 *            are read, its full content and comments are set
 *
 * @throws IOException
 *             declared by this method; presumably raised by the URL
 *             creation, download or parsing -- confirm against the helpers
 */
95 public void fetch(Story story
) throws IOException
{
// Start from the content already stored in the story.
96 String fullContent
= story
.getContent();
98 URL url
= new URL(story
.getUrlInternal());
// NOTE(review): no close() of this stream is visible in this excerpt --
// confirm it is released in the lines not shown.
99 InputStream in
= downloader
.open(url
);
101 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// Only the first <article> element of the page is used.
102 Element article
= doc
.getElementsByTag("article").first();
103 if (article
!= null) {
// toLines(...) and BasicElementProcessor are defined outside this excerpt;
// each returned line is appended followed by a single line break.
104 for (String line
: toLines(article
,
105 new BasicElementProcessor() {
106 // TODO: ignore headlines/pub
108 fullContent
+= line
+ "\n";
111 // Content is too tight with a single break per line:
// Every line break is doubled, then runs of four breaks are collapsed back
// to two, in two passes (the end of this chain is not visible here).
112 fullContent
= fullContent
.replace("\n", "\n\n") //
113 .replace("\n\n\n\n", "\n\n") //
114 .replace("\n\n\n\n", "\n\n") //
118 // Get comments URL then parse it, if possible
// NOTE(review): first() is treated as nullable elsewhere in this file, so
// posts may be null when the page has no "comment-list" element -- confirm
// that getComments(...) copes with a null argument.
119 Element posts
= doc
.getElementsByClass("comment-list").first();
121 story
.setFullContent(fullContent
);
122 story
.setComments(getComments(posts
));
/**
 * Converts the children of a comment container element into
 * {@link Comment} objects, recursing into the first "children" element of
 * each post to attach its replies.
 * <p>
 * NOTE(review): the leading numbers look like original line numbers and
 * many are skipped (e.g. 134 to 142, 171 to 183), so the declarations of
 * id/author/title/date and the bodies of several branches are not visible
 * in this excerpt.
 *
 * @param posts
 *            the element whose direct children are the posts to convert
 *            (NOTE(review): both callers can pass the result of first(),
 *            which may be null, and no null check is visible before
 *            posts.children() -- confirm one exists)
 *
 * @return the comments created from the given element
 */
130 private List
<Comment
> getComments(Element posts
) {
131 List
<Comment
> comments
= new ArrayList
<Comment
>();
133 for (Element post
: posts
.children()) {
// Children without the "comment" class get special handling; the body of
// this branch is not visible here (presumably they are skipped).
134 if (!post
.hasClass("comment")) {
// Collects the text lines of this comment.
142 List
<String
> content
= new ArrayList
<String
>();
// Author: text of the first <cite> inside the first <footer> of the post.
144 Element authorE
= post
.getElementsByTag("footer").first();
145 if (authorE
!= null) {
146 authorE
= authorE
.getElementsByTag("cite").first();
// Checked again because the inner lookup above may have found no <cite>.
148 if (authorE
!= null) {
149 author
= StringUtils
.unhtml(authorE
.text()).trim();
// The first <a> of the post is the starting point for the date lookup
// (how the id itself is computed is not visible in this excerpt).
152 Element idE
= post
.getElementsByTag("a").first();
// NOTE(review): no null check on idE/dateE is visible here -- confirm.
155 Element dateE
= idE
.getElementsByTag("span").first();
// The date comes from the "data-epoch" attribute; date(...) is a helper
// defined outside this excerpt.
157 date
= date(dateE
.attr("data-epoch"));
// The rest of this expression is on a line not visible here.
161 Element contentE
= post
.getElementsByClass("comment-content")
163 if (contentE
!= null) {
// toLines(...) and BasicElementProcessor are defined outside this excerpt.
164 for (String line
: toLines(contentE
,
165 new BasicElementProcessor() {
167 public boolean ignoreNode(Node node
) {
168 // TODO: ignore headlines/pub
// Only Element nodes are inspected; <h4> elements are singled out (what is
// returned for them is not visible in this excerpt).
169 if (node
instanceof Element
) {
170 Element el
= (Element
) node
;
171 if ("h4".equals(el
.tagName())) {
183 // Since we have no title but still an author, let's switch:
186 Comment comment
= new Comment(id
, author
, title
, date
, content
);
187 comments
.add(comment
);
// Replies are in the first "children" element of the post; they are
// converted recursively and attached to the comment just created.
189 Element children
= post
.getElementsByClass("children").first();
190 comment
.addAll(getComments(children
));