package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; /** * Support http://www.lemonde.fr/. * * @author niki */ public class LeMonde extends BasicSupport { @Override public String getDescription() { return "Le Monde: Actualités et Infos en France et dans le monde"; } @Override public List list() throws IOException { List list = new ArrayList(); for (String topic : new String[] { "international", "politique", "societe", "sciences" }) { URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html"); InputStream in = downloader.open(url); Document doc = DataUtil.load(in, "UTF-8", url.toString()); Elements articles = doc.getElementsByTag("article"); for (Element article : articles) { Elements times = article.getElementsByTag("time"); Elements titleElements = article.getElementsByTag("h3"); Elements contentElements = article.getElementsByClass("txt3"); if (times.size() > 0 && titleElements.size() > 0 && contentElements.size() > 0) { String id = times.get(0).attr("datetime").replace(":", "_") .replace("+", "_"); String title = titleElements.get(0).text(); String date = date(titleElements.get(0).text()); String content = contentElements.get(0).text(); String intUrl = ""; String extUrl = ""; String author = ""; String details = ""; Elements detailsElements = article .getElementsByClass("signature"); if (detailsElements.size() > 0) { author = detailsElements.get(0).text(); } Elements links = titleElements.get(0).getElementsByTag("a"); if (links.size() > 0) { intUrl = links.get(0).absUrl("href"); list.add(new Story(getType(), id, title, author, date, topic, details, intUrl, extUrl, content)); } } } } return list; } @Override public void fetch(Story story) throws IOException { String fullContent = story.getContent(); List comments = new ArrayList(); // Note: no comments on this site as far as I can see (or maybe with // some javascript, I need to check...) URL url = new URL(story.getUrlInternal()); InputStream in = downloader.open(url); Document doc = DataUtil.load(in, "UTF-8", url.toString()); Element article = doc.getElementById("articleBody"); if (article != null) { for (String line : toLines(article, new BasicElementProcessor() { @Override public boolean ignoreNode(Node node) { if (node instanceof Element) { Element element = (Element) node; if (element.hasClass("lire")) { return true; } } return false; } @Override public String isSubtitle(Node node) { if (node instanceof Element) { Element element = (Element) node; if (element.hasClass("intertitre")) { return element.text(); } } return null; } })) { fullContent += line + "\n"; } // Content is too tight with a single break per line: fullContent = fullContent.replace("\n", "\n\n") // .replace("\n\n\n\n", "\n\n") // .replace("\n\n\n\n", "\n\n") // .trim(); } story.setFullContent(fullContent); story.setComments(comments); } }