X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FLeMonde.java;h=09990931b3d195bc2fa3eb54b1f74ce93dd2f942;hb=3367f6256b5143b7cba2a61de36e74f389a5f379;hp=c83dc14b8d0198c89ac46bf5ac7980c86eaf2da6;hpb=202173602397b0793542c7a90f9d86013e153067;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/LeMonde.java b/src/be/nikiroo/gofetch/support/LeMonde.java index c83dc14..0999093 100644 --- a/src/be/nikiroo/gofetch/support/LeMonde.java +++ b/src/be/nikiroo/gofetch/support/LeMonde.java @@ -1,20 +1,21 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; - -import be.nikiroo.gofetch.data.Comment; -import be.nikiroo.gofetch.data.Story; +/** + * Support http://www.lemonde.fr/. + * + * @author niki + */ public class LeMonde extends BasicSupport { @Override public String getDescription() { @@ -22,97 +23,171 @@ public class LeMonde extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); - - for (String topic : new String[] { "international", "politique", - "societe", "sciences" }) { - URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html"); - InputStream in = open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements articles = doc.getElementsByTag("article"); - for (Element article : articles) { - Elements times = article.getElementsByTag("time"); - Elements titleElements = article.getElementsByTag("h3"); - Elements contentElements = article.getElementsByClass("txt3"); - if (times.size() > 0 && titleElements.size() > 0 - && contentElements.size() > 0) { - String id = times.get(0).attr("datetime").replace(":", "_") - .replace("+", "_"); - String title = "[" + topic + "] " - + titleElements.get(0).text(); - String content = contentElements.get(0).text(); - String intUrl = ""; - String extUrl = ""; - String details = ""; - - Elements detailsElements = article - .getElementsByClass("signature"); - if (detailsElements.size() > 0) { - details = detailsElements.get(0).text(); - } + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + for (String topic : new String[] { "International", "Politique", + "Société", "Sciences" }) { + URL url = new URL("http://www.lemonde.fr/" + + topic.toLowerCase().replace("é", "e") + "/1.html"); + urls.add(new AbstractMap.SimpleEntry(url, topic)); + } - Elements links = titleElements.get(0).getElementsByTag("a"); - if (links.size() > 0) { - intUrl = links.get(0).absUrl("href"); - list.add(new Story(getType(), id, title, details, - intUrl, extUrl, content)); - } - } + return urls; + } + + @Override + protected List getArticles(Document doc) { + return doc.getElementsByTag("article"); + } + + @Override + protected String getArticleId(Document doc, Element article) { + return ""; // will use the date + } + + @Override + protected String getArticleTitle(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h3").first(); + if (titleElement != null) { + return titleElement.text(); + } + + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + Element detailsElement = article.getElementsByClass("signature") + .first(); + if (detailsElement != null) { + return detailsElement.text(); + } + + return ""; + } + + @Override + protected String getArticleDate(Document doc, Element article) { + Element timeElement = article.getElementsByTag("time").first(); + if (timeElement != null) { + return timeElement.attr("datetime"); + } + + return ""; + } + + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + return currentCategory; + } + + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + Element titleElement = article.getElementsByTag("h3").first(); + if (titleElement != null) { + Element link = titleElement.getElementsByTag("a").first(); + if (link != null) { + return link.absUrl("href"); } } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - String fullContent = story.getContent(); - List comments = new ArrayList(); + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } - // Note: no comments on this site as far as I can see (or maybe with - // some javascript, I need to check...) + @Override + protected String getArticleContent(Document doc, Element article) { + Element contentElement = article.getElementsByClass("txt3").first(); + if (contentElement != null) { + return getArticleText(contentElement); + } - URL url = new URL(story.getUrlInternal()); - InputStream in = open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Element article = doc.getElementById("articleBody"); - if (article != null) { - for (String line : toLines(article, new BasicElementProcessor() { - @Override - public boolean ignoreNode(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.hasClass("lire")) { - return true; - } - } + return ""; + } + + @Override + protected Element getFullArticle(Document doc) { + return doc.getElementById("articleBody"); + } + + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return null; + } - return false; + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.hasClass("lire")) { + return true; + } } - @Override - public String manualProcessing(Node node) { - if (node instanceof Element) { - Element element = (Element) node; - if (element.hasClass("intertitre")) { - return "\n[ " + element.text() + " ]\n"; - } + return false; + } + + @Override + public String isSubtitle(Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (element.hasClass("intertitre")) { + return element.text(); } - return null; } - })) { - fullContent += line + "\n"; + return null; } + }; + } - // Content is too tight with a single break per line: - fullContent = fullContent.replace("\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .replace("\n\n\n\n", "\n\n") // - .trim(); - } + // No comment on this site, horrible javascript system - story.setFullContent(fullContent); - story.setComments(comments); + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + return null; + } + + @Override + protected String getCommentId(Element post) { + return null; + } + + @Override + protected String getCommentAuthor(Element post) { + return null; + } + + @Override + protected String getCommentTitle(Element post) { + return null; + } + + @Override + protected String getCommentDate(Element post) { + return null; + } + + @Override + protected Element getCommentContentElement(Element post) { + return null; + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return null; } }