From: Niki Roo
Date: Fri, 23 Mar 2018 22:22:56 +0000 (+0100)
Subject: New website supported: Ère Numérique FR
X-Git-Url: http://git.nikiroo.be/?p=gofetch.git;a=commitdiff_plain;h=31755801cf5c2e2502e1222928d56c11f64dbdfe

New website supported: Ère Numérique FR
---

diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java
index 6d930f6..b0325b3 100644
--- a/src/be/nikiroo/gofetch/support/BasicSupport.java
+++ b/src/be/nikiroo/gofetch/support/BasicSupport.java
@@ -45,6 +45,8 @@ public abstract class BasicSupport {
 		REGISTER,
 		/** FR: Linux */
 		TOO_LINUX,
+		/** FR: IT */
+		ERE_NUMERIQUE,
 	}
 
 	/**
@@ -238,6 +240,9 @@ public abstract class BasicSupport {
 			case TOO_LINUX:
 				support = new TooLinux();
 				break;
+			case ERE_NUMERIQUE:
+				support = new EreNumerique();
+				break;
 			}
 
 			if (support != null) {
diff --git a/src/be/nikiroo/gofetch/support/EreNumerique.java b/src/be/nikiroo/gofetch/support/EreNumerique.java
new file mode 100644
index 0000000..bef677d
--- /dev/null
+++ b/src/be/nikiroo/gofetch/support/EreNumerique.java
@@ -0,0 +1,245 @@
+package be.nikiroo.gofetch.support;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import be.nikiroo.gofetch.data.Comment;
+import be.nikiroo.gofetch.data.Story;
+import be.nikiroo.utils.StringUtils;
+
+/**
+ * Support https://www.erenumerique.fr/.
+ *
+ * @author niki
+ */
+public class EreNumerique extends BasicSupport {
+	@Override
+	public String getDescription() {
+		return "Ère Numérique.FR: faites le bon choix !";
+	}
+
+	/**
+	 * List the stories found on the category pages of the website.
+	 * <p>
+	 * Articles without a <tt>time</tt> element are skipped, since both the ID
+	 * and the date of a story come from it.
+	 *
+	 * @return the stories found (can be empty)
+	 *
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	@Override
+	public List<Story> list() throws IOException {
+		List<Story> list = new ArrayList<Story>();
+
+		for (String categ : new String[] { "informatique" }) {
+			URL url = new URL("https://www.erenumerique.fr/" + categ);
+
+			// The document is fully parsed by load(), so the stream can (and
+			// must) be released before we start working on the elements
+			Document doc;
+			InputStream in = downloader.open(url);
+			try {
+				doc = DataUtil.load(in, "UTF-8", url.toString());
+			} finally {
+				if (in != null) {
+					in.close();
+				}
+			}
+
+			Elements articles = doc.getElementsByClass("item-details");
+			for (Element article : articles) {
+				String id = "";
+				String intUrl = "";
+				String extUrl = ""; // nope
+				String title = "";
+				String date = "";
+				String author = "";
+				String details = "";
+				String body = "";
+
+				// MUST NOT fail:
+				Element dateElement = article //
+						.getElementsByTag("time").first();
+				if (dateElement == null) {
+					continue;
+				}
+
+				Element urlElement = article.getElementsByTag("a").first();
+				if (urlElement != null) {
+					intUrl = urlElement.absUrl("href");
+				}
+
+				// The publication timestamp doubles as ID once the characters
+				// ':' and '+' have been replaced by '_'
+				id = dateElement.attr("datetime").replace(":", "_")
+						.replace("+", "_");
+				date = date(dateElement.attr("datetime"));
+
+				Element titleElement = article.getElementsByTag("h2").first();
+				if (titleElement != null) {
+					title = StringUtils.unhtml(titleElement.text()).trim();
+				}
+
+				Element authorElement = article.getElementsByClass(
+						"td-post-author-name").first();
+				if (authorElement != null) {
+					authorElement = authorElement.getElementsByTag("a").first();
+				}
+				if (authorElement != null) {
+					author = StringUtils.unhtml(authorElement.text()).trim();
+				}
+
+				Element contentElement = article.getElementsByClass(
+						"td-excerpt").first();
+				if (contentElement != null) {
+					body = StringUtils.unhtml(contentElement.text()).trim();
+				}
+
+				list.add(new Story(getType(), id, title, author, date, categ,
+						details, intUrl, extUrl, body));
+			}
+		}
+
+		return list;
+	}
+
+	/**
+	 * Download the page of the given story, then set its full content and
+	 * its comments.
+	 *
+	 * @param story
+	 *            the story to complete (its internal URL is the page to
+	 *            download)
+	 *
+	 * @throws IOException
+	 *             in case of I/O error, or if the internal URL is not valid
+	 */
+	@Override
+	public void fetch(Story story) throws IOException {
+		String fullContent = story.getContent();
+
+		URL url = new URL(story.getUrlInternal());
+		InputStream in = downloader.open(url);
+		try {
+			Document doc = DataUtil.load(in, "UTF-8", url.toString());
+			Element article = doc.getElementsByTag("article").first();
+			if (article != null) {
+				// The article lines are appended to the content we already have
+				StringBuilder builder = new StringBuilder();
+				if (fullContent != null) {
+					builder.append(fullContent);
+				}
+				for (String line : toLines(article,
+						new BasicElementProcessor() {
+							// TODO: ignore headlines/pub
+						})) {
+					builder.append(line).append("\n");
+				}
+
+				// Content is too tight with a single break per line:
+				fullContent = builder.toString().replace("\n", "\n\n") //
+						.replace("\n\n\n\n", "\n\n") //
+						.replace("\n\n\n\n", "\n\n") //
+						.trim();
+			}
+
+			// The comments, if any, are on the same page as the article
+			Element posts = doc.getElementsByClass("comment-list").first();
+
+			story.setFullContent(fullContent);
+			story.setComments(getComments(posts));
+		} finally {
+			if (in != null) {
+				in.close();
+			}
+		}
+	}
+
+	/**
+	 * Convert the given comments container into {@link Comment}s, replies
+	 * included (recursively).
+	 *
+	 * @param posts
+	 *            the element whose children are the comments (can be NULL, in
+	 *            which case no comments are returned)
+	 *
+	 * @return the comments, never NULL
+	 */
+	private List<Comment> getComments(Element posts) {
+		List<Comment> comments = new ArrayList<Comment>();
+		if (posts != null) {
+			for (Element post : posts.children()) {
+				if (!post.hasClass("comment")) {
+					continue;
+				}
+
+				String id = "";
+				String author = "";
+				String title = "";
+				String date = "";
+				List<String> content = new ArrayList<String>();
+
+				Element authorE = post.getElementsByTag("footer").first();
+				if (authorE != null) {
+					authorE = authorE.getElementsByTag("cite").first();
+				}
+				if (authorE != null) {
+					author = StringUtils.unhtml(authorE.text()).trim();
+				}
+
+				Element idE = post.getElementsByTag("a").first();
+				if (idE != null) {
+					id = idE.attr("id");
+					Element dateE = idE.getElementsByTag("span").first();
+					if (dateE != null) {
+						date = date(dateE.attr("data-epoch"));
+					}
+				}
+
+				Element contentE = post.getElementsByClass("comment-content")
+						.first();
+				if (contentE != null) {
+					for (String line : toLines(contentE,
+							new BasicElementProcessor() {
+								@Override
+								public boolean ignoreNode(Node node) {
+									// TODO: ignore headlines/pub
+									if (node instanceof Element) {
+										Element el = (Element) node;
+										if ("h4".equals(el.tagName())) {
+											return true;
+										}
+									}
+
+									return false;
+								}
+							})) {
+						content.add(line);
+					}
+				}
+
+				// Since we have no title but still an author, let's switch:
+				title = author;
+				author = "";
+				Comment comment = new Comment(id, author, title, date, content);
+				comments.add(comment);
+
+				Element children = post.getElementsByClass("children").first();
+				comment.addAll(getComments(children));
+			}
+		}
+
+		return comments;
+	}
+}