package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.StringUtils; /** * Support https://www.erenumerique.fr/. * * @author niki */ public class EreNumerique extends BasicSupport { @Override public String getDescription() { return "Ère Numérique.FR: faites le bon choix !"; } @Override public List list() throws IOException { List list = new ArrayList(); for (String categ : new String[] { "informatique" }) { URL url = new URL("https://www.erenumerique.fr/" + categ); InputStream in = downloader.open(url); Document doc = DataUtil.load(in, "UTF-8", url.toString()); Elements articles = doc.getElementsByClass("item-details"); for (Element article : articles) { String id = ""; String intUrl = ""; String extUrl = ""; // nope String title = ""; String date = ""; String author = ""; String details = ""; String body = ""; // MUST NOT fail: Element dateElement = article // .getElementsByTag("time").first(); if (dateElement == null) { continue; } Element urlElement = article.getElementsByTag("a").first(); if (urlElement != null) { intUrl = urlElement.absUrl("href"); } id = dateElement.attr("datetime").replace(":", "_") .replace("+", "_"); date = date(dateElement.attr("datetime")); Element titleElement = article.getElementsByTag("h2").first(); if (titleElement != null) { title = StringUtils.unhtml(titleElement.text()).trim(); } Element authorElement = article.getElementsByClass( "td-post-author-name").first(); if (authorElement != null) { authorElement = authorElement.getElementsByTag("a").first(); } if (authorElement != null) { author = StringUtils.unhtml(authorElement.text()).trim(); } Element contentElement = article.getElementsByClass( "td-excerpt").first(); if (contentElement != null) { body = StringUtils.unhtml(contentElement.text()).trim(); } list.add(new Story(getType(), id, title, author, date, categ, details, intUrl, extUrl, body)); } } return list; } @Override public void fetch(Story story) throws IOException { String fullContent = story.getContent(); URL url = new URL(story.getUrlInternal()); InputStream in = downloader.open(url); try { Document doc = DataUtil.load(in, "UTF-8", url.toString()); Element article = doc.getElementsByTag("article").first(); if (article != null) { for (String line : toLines(article, new BasicElementProcessor() { // TODO: ignore headlines/pub })) { fullContent += line + "\n"; } // Content is too tight with a single break per line: fullContent = fullContent.replace("\n", "\n\n") // .replace("\n\n\n\n", "\n\n") // .replace("\n\n\n\n", "\n\n") // .trim(); } // Get comments URL then parse it, if possible Element posts = doc.getElementsByClass("comment-list").first(); story.setFullContent(fullContent); story.setComments(getComments(posts)); } finally { if (in != null) { in.close(); } } } private List getComments(Element posts) { List comments = new ArrayList(); if (posts != null) { for (Element post : posts.children()) { if (!post.hasClass("comment")) { continue; } String id = ""; String author = ""; String title = ""; String date = ""; List content = new ArrayList(); Element authorE = post.getElementsByTag("footer").first(); if (authorE != null) { authorE = authorE.getElementsByTag("cite").first(); } if (authorE != null) { author = StringUtils.unhtml(authorE.text()).trim(); } Element idE = post.getElementsByTag("a").first(); if (idE != null) { id = idE.attr("id"); Element dateE = idE.getElementsByTag("span").first(); if (dateE != null) { date = date(dateE.attr("data-epoch")); } } Element contentE = post.getElementsByClass("comment-content") .first(); if (contentE != null) { for (String line : toLines(contentE, new BasicElementProcessor() { @Override public boolean ignoreNode(Node node) { // TODO: ignore headlines/pub if (node instanceof Element) { Element el = (Element) node; if ("h4".equals(el.tagName())) { return true; } } return false; } })) { content.add(line); } } // Since we have no title but still an author, let's switch: title = author; author = ""; Comment comment = new Comment(id, author, title, date, content); comments.add(comment); Element children = post.getElementsByClass("children").first(); comment.addAll(getComments(children)); } } return comments; } }