package be.nikiroo.gofetch.support;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
/**
* Support http://www.lemonde.fr/.
*
* @author niki
*/
public class LeMonde extends BasicSupport {
@Override
public String getDescription() {
return "Le Monde: Actualités et Infos en France et dans le monde";
}
@Override
public List list() throws IOException {
List list = new ArrayList();
for (String topic : new String[] { "international", "politique",
"societe", "sciences" }) {
URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
InputStream in = downloader.open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Elements articles = doc.getElementsByTag("article");
for (Element article : articles) {
Elements times = article.getElementsByTag("time");
Elements titleElements = article.getElementsByTag("h3");
Elements contentElements = article.getElementsByClass("txt3");
if (times.size() > 0 && titleElements.size() > 0
&& contentElements.size() > 0) {
String id = times.get(0).attr("datetime").replace(":", "_")
.replace("+", "_");
String title = titleElements.get(0).text();
String date = date(titleElements.get(0).text());
String content = contentElements.get(0).text();
String intUrl = "";
String extUrl = "";
String author = "";
String details = "";
Elements detailsElements = article
.getElementsByClass("signature");
if (detailsElements.size() > 0) {
author = detailsElements.get(0).text();
}
Elements links = titleElements.get(0).getElementsByTag("a");
if (links.size() > 0) {
intUrl = links.get(0).absUrl("href");
list.add(new Story(getType(), id, title, author, date,
topic, details, intUrl, extUrl, content));
}
}
}
}
return list;
}
@Override
public void fetch(Story story) throws IOException {
String fullContent = story.getContent();
List comments = new ArrayList();
// Note: no comments on this site as far as I can see (or maybe with
// some javascript, I need to check...)
URL url = new URL(story.getUrlInternal());
InputStream in = downloader.open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Element article = doc.getElementById("articleBody");
if (article != null) {
for (String line : toLines(article, new BasicElementProcessor() {
@Override
public boolean ignoreNode(Node node) {
if (node instanceof Element) {
Element element = (Element) node;
if (element.hasClass("lire")) {
return true;
}
}
return false;
}
@Override
public String manualProcessing(Node node) {
if (node instanceof Element) {
Element element = (Element) node;
if (element.hasClass("intertitre")) {
return "\n[ " + element.text() + " ]\n";
}
}
return null;
}
})) {
fullContent += line + "\n";
}
// Content is too tight with a single break per line:
fullContent = fullContent.replace("\n", "\n\n") //
.replace("\n\n\n\n", "\n\n") //
.replace("\n\n\n\n", "\n\n") //
.trim();
}
story.setFullContent(fullContent);
story.setComments(comments);
}
}