package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
/**
* Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- for (String topic : new String[] { "international", "politique",
- "societe", "sciences" }) {
- URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByTag("article");
- for (Element article : articles) {
- Elements times = article.getElementsByTag("time");
- Elements titleElements = article.getElementsByTag("h3");
- Elements contentElements = article.getElementsByClass("txt3");
- if (times.size() > 0 && titleElements.size() > 0
- && contentElements.size() > 0) {
- String id = times.get(0).attr("datetime").replace(":", "_")
- .replace("+", "_");
- String title = titleElements.get(0).text();
- String date = date(titleElements.get(0).text());
- String content = contentElements.get(0).text();
- String intUrl = "";
- String extUrl = "";
- String author = "";
- String details = "";
-
- Elements detailsElements = article
- .getElementsByClass("signature");
- if (detailsElements.size() > 0) {
- author = detailsElements.get(0).text();
- }
+ // Build the list of topic listing pages to scan: one (URL, topic name)
+ // entry per topic. The topic name is kept as written below (capitalised,
+ // accented); only the URL uses the lower-cased, unaccented form.
+ //
+ // Throws IOException if a URL cannot be built (a MalformedURLException is
+ // an IOException).
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+     List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+     for (String topic : new String[] { "International", "Politique",
+             "Société", "Sciences" }) {
+         // Locale.ROOT: the default-locale toLowerCase() would turn "I" into a
+         // dotless "ı" under a Turkish/Azeri locale and break the URL
+         String path = topic.toLowerCase(java.util.Locale.ROOT).replace(
+                 "é", "e");
+         URL url = new URL("http://www.lemonde.fr/" + path + "/1.html");
+         urls.add(new AbstractMap.SimpleEntry<URL, String>(url, topic));
+     }
- Elements links = titleElements.get(0).getElementsByTag("a");
- if (links.size() > 0) {
- intUrl = links.get(0).absUrl("href");
- list.add(new Story(getType(), id, title, author, date,
- topic, details, intUrl, extUrl, content));
- }
- }
+     return urls;
+ }
+
+ /**
+  * Find the article entries of a listing page: every <tt>article</tt>
+  * element of the document.
+  *
+  * @param doc
+  *            the listing page
+  *
+  * @return the <tt>article</tt> elements (possibly none)
+  */
+ @Override
+ protected List<Element> getArticles(Document doc) {
+     List<Element> articles = doc.getElementsByTag("article");
+     return articles;
+ }
+
+ /**
+  * Get the ID of a listed article.
+  * <p>
+  * No ID is extracted here: an empty string is always returned (per the
+  * inline note, the date is used instead -- presumably by the caller, which
+  * is not visible from this class).
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element (unused)
+  *
+  * @return an empty string
+  */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ return ""; // will use the date
+ }
+
+ /**
+  * Get the title of a listed article: the text of its first <tt>h3</tt>
+  * element.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element
+  *
+  * @return the title, or an empty string if the article has no <tt>h3</tt>
+  */
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+     Element heading = article.getElementsByTag("h3").first();
+     return heading == null ? "" : heading.text();
+ }
+
+ /**
+  * Get the author of a listed article: the text of its first element with
+  * the <tt>signature</tt> class.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element
+  *
+  * @return the author, or an empty string if there is no such element
+  */
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+     Element signature = article.getElementsByClass("signature").first();
+     if (signature == null) {
+         return "";
+     }
+
+     return signature.text();
+ }
+
+ /**
+  * Get the date of a listed article: the raw <tt>datetime</tt> attribute of
+  * its first <tt>time</tt> element.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element
+  *
+  * @return the attribute value, or an empty string if the article has no
+  *         <tt>time</tt> element
+  */
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+     Element time = article.getElementsByTag("time").first();
+     return time == null ? "" : time.attr("datetime");
+ }
+
+ /**
+  * Get the category of a listed article.
+  * <p>
+  * Nothing is read from the page: the category passed in is returned
+  * unchanged.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element (unused)
+  * @param currentCategory
+  *            the category supplied by the caller (presumably the topic name
+  *            attached to the listing URL -- to be confirmed against the
+  *            caller)
+  *
+  * @return <tt>currentCategory</tt>
+  */
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ return currentCategory;
+ }
+
+ /**
+  * Get the details line of a listed article.
+  * <p>
+  * No details are extracted here.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element (unused)
+  *
+  * @return an empty string
+  */
+ @Override
+ protected String getArticleDetails(Document doc, Element article) {
+ return "";
+ }
+
+ /**
+  * Get the internal URL of a listed article: the absolute <tt>href</tt> of
+  * the first link found inside its first <tt>h3</tt> element.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element
+  *
+  * @return the absolute URL, or an empty string if the article has no
+  *         <tt>h3</tt> or the <tt>h3</tt> contains no link
+  */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element titleElement = article.getElementsByTag("h3").first();
+ if (titleElement != null) {
+ Element link = titleElement.getElementsByTag("a").first();
+ if (link != null) {
+ return link.absUrl("href");
}
}
- return list;
+ return "";
}
+ /**
+  * Get the external URL of a listed article.
+  * <p>
+  * No external URL is extracted here.
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element (unused)
+  *
+  * @return an empty string
+  */
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
- List<Comment> comments = new ArrayList<Comment>();
+ protected String getArticleExtUrl(Document doc, Element article) {
+ return "";
+ }
- // Note: no comments on this site as far as I can see (or maybe with
- // some javascript, I need to check...)
+ /**
+  * Get the short content of a listed article: the first element with the
+  * <tt>txt3</tt> class, converted by <tt>getArticleText</tt> (defined
+  * outside of this class).
+  *
+  * @param doc
+  *            the listing page (unused)
+  * @param article
+  *            the article element
+  *
+  * @return the converted content, or an empty string if there is no
+  *         <tt>txt3</tt> element
+  */
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ Element contentElement = article.getElementsByClass("txt3").first();
+ if (contentElement != null) {
+ return getArticleText(contentElement);
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementById("articleBody");
- if (article != null) {
- for (String line : toLines(article, new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.hasClass("lire")) {
- return true;
- }
- }
+ return "";
+ }
+
+ /**
+  * Locate the body of a full article page: the element whose ID is
+  * <tt>articleBody</tt>.
+  *
+  * @param doc
+  *            the full article page
+  *
+  * @return the body element, or NULL if the page has no such element
+  */
+ @Override
+ protected Element getFullArticle(Document doc) {
+     Element body = doc.getElementById("articleBody");
+     return body;
+ }
+
+ /**
+  * Get the comment posts of a full article page.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param doc
+  *            the full article page (unused)
+  * @param intUrl
+  *            the internal URL of the article (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+ return null;
+ }
- return false;
+ /**
+  * Create the processor used on the full article content.
+  * <p>
+  * It ignores the elements with the <tt>lire</tt> class and reports the
+  * elements with the <tt>intertitre</tt> class as subtitles.
+  *
+  * @return a new processor
+  */
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ // skip the elements flagged with the "lire" class
+ if (element.hasClass("lire")) {
+ return true;
+ }
}
- @Override
- public String isSubtitle(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.hasClass("intertitre")) {
- return element.text();
- }
+ return false;
+ }
+
+ @Override
+ public String isSubtitle(Node node) {
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ // an "intertitre" element is a subtitle: return its text
+ if (element.hasClass("intertitre")) {
+ return element.text();
}
- return null;
}
- })) {
- fullContent += line + "\n";
+ // not a subtitle
+ return null;
}
+ };
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
- }
+ // No comment support for this site: it uses a horrible JavaScript system
- story.setFullContent(fullContent);
- story.setComments(comments);
+ /**
+  * Get the posts nested in a comment container.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param doc
+  *            the page (unused)
+  * @param container
+  *            the comment container (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
+ return null;
+ }
+
+ /**
+  * Get the ID of a comment post.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param post
+  *            the comment post (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected String getCommentId(Element post) {
+ return null;
+ }
+
+ /**
+  * Get the author of a comment post.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param post
+  *            the comment post (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected String getCommentAuthor(Element post) {
+ return null;
+ }
+
+ /**
+  * Get the title of a comment post.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param post
+  *            the comment post (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected String getCommentTitle(Element post) {
+ return null;
+ }
+
+ /**
+  * Get the date of a comment post.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param post
+  *            the comment post (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected String getCommentDate(Element post) {
+ return null;
+ }
+
+ /**
+  * Get the element holding the content of a comment post.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @param post
+  *            the comment post (unused)
+  *
+  * @return always NULL
+  */
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ return null;
+ }
+
+ /**
+  * Get the processor used on the content of a comment.
+  * <p>
+  * This class does not extract comments.
+  *
+  * @return always NULL
+  */
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return null;
}
}