X-Git-Url: http://git.nikiroo.be/?p=gofetch.git;a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FEreNumerique.java;h=0b3efcf4c07596b4f0a1e3076ca115a31a5021c7;hp=b6a7598027c9b632cb52fb50f22677e4a1a314b4;hb=3e62b034c1981ae6329f06b3f8c0ee25c3683789;hpb=a81f396bc4bf0f70e4b5f654045f533941d86dc9 diff --git a/src/be/nikiroo/gofetch/support/EreNumerique.java b/src/be/nikiroo/gofetch/support/EreNumerique.java index b6a7598..0b3efcf 100644 --- a/src/be/nikiroo/gofetch/support/EreNumerique.java +++ b/src/be/nikiroo/gofetch/support/EreNumerique.java @@ -1,20 +1,15 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; - -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; -import be.nikiroo.utils.StringUtils; /** * Support list() throws IOException { - List list = new ArrayList(); - - for (String categ : new String[] { "informatique" }) { - URL url = new URL("https://www.erenumerique.fr/" + categ); - InputStream in = downloader.open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByClass("item-details"); - for (Element article : articles) { - String id = ""; - String intUrl = ""; - String extUrl = ""; // nope - String title = ""; - String date = ""; - String author = ""; - String details = ""; - String body = ""; - - // MUST NOT fail: - Element dateElement = article // - .getElementsByTag("time").first(); - if (dateElement == null) { - continue; - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + for (String categ : new String[] { "Informatique" }) { + URL url = new URL("https://www.erenumerique.fr/" + + categ.toLowerCase()); + urls.add(new AbstractMap.SimpleEntry(url, categ)); + } - Element urlElement = article.getElementsByTag("a").first(); - if (urlElement != null) { - intUrl = urlElement.absUrl("href"); - } + return urls; + } - id = dateElement.attr("datetime").replace(":", "_") - .replace("+", "_"); - date = date(dateElement.attr("datetime")); + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("item-details"); + } - Element titleElement = article.getElementsByTag("h2").first(); - if (titleElement != null) { - title = StringUtils.unhtml(titleElement.text()).trim(); - } + @Override + protected String getArticleId(Document doc, Element article) { + return ""; // will use the date + } - Element authorElement = article.getElementsByClass( - "td-post-author-name").first(); - if (authorElement != null) { - authorElement = authorElement.getElementsByTag("a").first(); - } - if (authorElement != null) { - author = StringUtils.unhtml(authorElement.text()).trim(); - } + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h2").first(); + if (titleElement != null) { + return titleElement.text(); + } - Element contentElement = article.getElementsByClass( - "td-excerpt").first(); - if (contentElement != null) { - body = StringUtils.unhtml(contentElement.text()).trim(); - } + return ""; + } - list.add(new Story(getType(), id, title, author, date, categ, - details, intUrl, extUrl, body)); - } + @Override + protected String getArticleAuthor(Document doc, Element article) { + Element authorElement = article.getElementsByClass( + "td-post-author-name").first(); + if (authorElement != null) { + authorElement = authorElement.getElementsByTag("a").first(); + } + if (authorElement != null) { + return authorElement.text(); } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); + protected String getArticleDate(Document doc, Element article) { + Element dateElement = article // + .getElementsByTag("time").first(); + if (dateElement != null) { + return dateElement.attr("datetime"); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = downloader.open(url); - try { - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementsByTag("article").first(); - if (article != null) { - article = article.getElementsByAttributeValue("itemprop", - "articleBody").first(); - } - if (article != null) { - for (String line : toLines(article, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - return node.attr("class").contains("chapo"); - } - - @Override - public String isSubtitle(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.tagName().startsWith("h") - && element.tagName().length() == 2) { - return element.text(); - } - } - return null; - } - })) { - fullContent += line + "\n"; - } + return ""; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); - } + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + return currentCategory; + } - // Get comments URL then parse it, if possible - Element posts = doc.getElementsByClass("comment-list").first(); + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; + } - story.setFullContent(fullContent); - story.setComments(getComments(posts)); - } finally { - if (in != null) { - in.close(); - } + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element urlElement = article.getElementsByTag("a").first(); + if (urlElement != null) { + return urlElement.absUrl("href"); } + + return ""; } - private List getComments(Element posts) { - List comments = new ArrayList(); - if (posts != null) { - for (Element post : posts.children()) { - if (!post.hasClass("comment")) { - continue; - } + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + Element contentElement = article.getElementsByClass("td-excerpt") + .first(); + if (contentElement != null) { + return contentElement.text(); + } - String id = ""; - String author = ""; - String title = ""; - String date = ""; - List content = new ArrayList(); + return ""; + } - Element authorE = post.getElementsByTag("footer").first(); - if (authorE != null) { - authorE = authorE.getElementsByTag("cite").first(); - } - if (authorE != null) { - author = StringUtils.unhtml(authorE.text()).trim(); - } + @Override + protected Element getFullArticle(Document doc) { + Element article = doc.getElementsByTag("article").first(); + if (article != null) { + article = article.getElementsByAttributeValue("itemprop", + "articleBody").first(); + } + + return article; + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return getSubCommentElements(doc.getElementsByClass("comment-list") + .first()); + } - Element idE = post.getElementsByTag("a").first(); - if (idE != null) { - id = idE.attr("id"); - Element dateE = idE.getElementsByTag("span").first(); - if (dateE != null) { - date = date(dateE.attr("data-epoch")); + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + return node.attr("class").contains("chapo"); + } + + @Override + public String isSubtitle(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.tagName().startsWith("h") + && element.tagName().length() == 2) { + return element.text(); } } + return null; + } + }; + } + + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return getSubCommentElements(container.getElementsByClass("children") + .first()); + } + + @Override + protected String getCommentId(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + return idE.attr("id"); + } + + return ""; + } + + @Override + protected String getCommentAuthor(Element post) { + // Since we have no title, we switch with author + return ""; + } + + @Override + protected String getCommentTitle(Element post) { + // Since we have no title, we switch with author + Element authorE = post.getElementsByTag("footer").first(); + if (authorE != null) { + authorE = authorE.getElementsByTag("cite").first(); + } + if (authorE != null) { + return authorE.text(); + } + + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element idE = post.getElementsByTag("a").first(); + if (idE != null) { + Element dateE = idE.getElementsByTag("span").first(); + if (dateE != null) { + return dateE.attr("data-epoch"); + } + } - Element contentE = post.getElementsByClass("comment-content") - .first(); - if (contentE != null) { - for (String line : toLines(contentE, - new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - // TODO: ignore headlines/pub - if (node instanceof Element) { - Element el = (Element) node; - if ("h4".equals(el.tagName())) { - return true; - } - } - - return false; - } - })) { - content.add(line); + return ""; + } + + @Override + protected Element getCommentContentElement(Element post) { + Element contentE = post.getElementsByClass("comment-content").first(); + return contentE; + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element el = (Element) node; + if ("h4".equals(el.tagName())) { + return true; } } - // Since we have no title but still an author, let's switch: - title = author; - author = ""; - Comment comment = new Comment(id, author, title, date, content); - comments.add(comment); + return false; + } + }; + } - Element children = post.getElementsByClass("children").first(); - comment.addAll(getComments(children)); + private List getSubCommentElements(Element posts) { + List commentElements = new ArrayList(); + if (posts != null) { + for (Element possibleCommentElement : posts.children()) { + if (possibleCommentElement.hasClass("comment")) { + commentElements.add(possibleCommentElement); + } } } - return comments; + return commentElements; } }