package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
/**
* Support <a
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- for (String categ : new String[] { "informatique" }) {
- URL url = new URL("https://www.erenumerique.fr/" + categ);
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("item-details");
- for (Element article : articles) {
- String id = "";
- String intUrl = "";
- String extUrl = ""; // nope
- String title = "";
- String date = "";
- String author = "";
- String details = "";
- String body = "";
-
- // MUST NOT fail:
- Element dateElement = article //
- .getElementsByTag("time").first();
- if (dateElement == null) {
- continue;
- }
+ // Build the list of pages to scan: one (URL, category name) entry per
+ // category of the hard-coded array below. The entry value keeps the
+ // category as written; only the URL path uses the lower-cased form.
+ // Throws IOException when a URL cannot be built (MalformedURLException
+ // is an IOException).
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ for (String categ : new String[] { "Informatique" }) {
+ // Locale.ROOT: the default-locale variant would turn "I" into a
+ // dotless "ı" under a Turkish locale and yield a wrong URL path.
+ URL url = new URL("https://www.erenumerique.fr/"
+ + categ.toLowerCase(java.util.Locale.ROOT));
+ urls.add(new AbstractMap.SimpleEntry<URL, String>(url, categ));
+ }
- Element urlElement = article.getElementsByTag("a").first();
- if (urlElement != null) {
- intUrl = urlElement.absUrl("href");
- }
+ return urls;
+ }
- id = dateElement.attr("datetime").replace(":", "_")
- .replace("+", "_");
- date = date(dateElement.attr("datetime"));
+ @Override
+ protected List<Element> getArticles(Document doc) { // one article per element of class "item-details"
+ return doc.getElementsByClass("item-details");
+ }
- Element titleElement = article.getElementsByTag("h2").first();
- if (titleElement != null) {
- title = StringUtils.unhtml(titleElement.text()).trim();
- }
+ @Override
+ protected String getArticleId(Document doc, Element article) { // always returns the empty string
+ return ""; // will use the date (presumably a fallback done by the caller; not visible here)
+ }
- Element authorElement = article.getElementsByClass(
- "td-post-author-name").first();
- if (authorElement != null) {
- authorElement = authorElement.getElementsByTag("a").first();
- }
- if (authorElement != null) {
- author = StringUtils.unhtml(authorElement.text()).trim();
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) { // text of the first <h2> in the article, or "" if none
+ Element titleElement = article.getElementsByTag("h2").first();
+ if (titleElement != null) {
+ return titleElement.text();
+ }
- Element contentElement = article.getElementsByClass(
- "td-excerpt").first();
- if (contentElement != null) {
- body = StringUtils.unhtml(contentElement.text()).trim();
- }
+ return "";
+ }
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
- }
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) { // author link text, or "" if not found
+ Element authorElement = article.getElementsByClass(
+ "td-post-author-name").first();
+ if (authorElement != null) {
+ authorElement = authorElement.getElementsByTag("a").first(); // narrow to the first <a> inside the author block
+ }
+ if (authorElement != null) {
+ return authorElement.text();
}
- return list;
+ return "";
}
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
+ protected String getArticleDate(Document doc, Element article) { // raw "datetime" attribute of the first <time> element, or "" if none
+ Element dateElement = article //
+ .getElementsByTag("time").first();
+ if (dateElement != null) {
+ return dateElement.attr("datetime"); // returned as found, no formatting applied here
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- try {
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementsByTag("article").first();
- if (article != null) {
- article = article.getElementsByAttributeValue("itemprop",
- "articleBody").first();
- }
- if (article != null) {
- for (String line : toLines(article,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- return node.attr("class").contains("chapo");
- }
-
- @Override
- public String isSubtitle(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.tagName().startsWith("h")
- && element.tagName().length() == 2) {
- return element.text();
- }
- }
- return null;
- }
- })) {
- fullContent += line + "\n";
- }
+ return "";
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
- }
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) { // the category passed in is returned unchanged
+ return currentCategory;
+ }
- // Get comments URL then parse it, if possible
- Element posts = doc.getElementsByClass("comment-list").first();
+ @Override
+ protected String getArticleDetails(Document doc, Element article) { // no details are extracted: always ""
+ return "";
+ }
- story.setFullContent(fullContent);
- story.setComments(getComments(posts));
- } finally {
- if (in != null) {
- in.close();
- }
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) { // URL taken from the first <a> of the article, or "" if none
+ Element urlElement = article.getElementsByTag("a").first();
+ if (urlElement != null) {
+ return urlElement.absUrl("href"); // absUrl() form of the "href" attribute
}
+
+ return "";
}
- private List<Comment> getComments(Element posts) {
- List<Comment> comments = new ArrayList<Comment>();
- if (posts != null) {
- for (Element post : posts.children()) {
- if (!post.hasClass("comment")) {
- continue;
- }
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) { // no external URL is extracted: always ""
+ return "";
+ }
+
+ @Override
+ protected String getArticleContent(Document doc, Element article) { // content built from the first "td-excerpt" element, or "" if none
+ Element contentElement = article.getElementsByClass("td-excerpt")
+ .first();
+ if (contentElement != null) {
+ return getArticleText(contentElement); // getArticleText is defined outside this view
+ }
- String id = "";
- String author = "";
- String title = "";
- String date = "";
- List<String> content = new ArrayList<String>();
+ return "";
+ }
- Element authorE = post.getElementsByTag("footer").first();
- if (authorE != null) {
- authorE = authorE.getElementsByTag("cite").first();
- }
- if (authorE != null) {
- author = StringUtils.unhtml(authorE.text()).trim();
- }
+ @Override
+ protected Element getFullArticle(Document doc) { // article body element, or null when it cannot be found
+ Element article = doc.getElementsByTag("article").first();
+ if (article != null) {
+ article = article.getElementsByAttributeValue("itemprop", // keep only the itemprop="articleBody" part of the first <article>
+ "articleBody").first();
+ }
+
+ return article;
+ }
+
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) { // top-level comments; intUrl is not used
+ return getSubCommentElements(doc.getElementsByClass("comment-list") // "comment" children of the first "comment-list" element
+ .first());
+ }
- Element idE = post.getElementsByTag("a").first();
- if (idE != null) {
- id = idE.attr("id");
- Element dateE = idE.getElementsByTag("span").first();
- if (dateE != null) {
- date = date(dateE.attr("data-epoch"));
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() { // processor used for the full article text
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) { // skip nodes whose "class" attribute contains "chapo"
+ return node.attr("class").contains("chapo");
+ }
+
+ @Override
+ public String isSubtitle(Node node) { // subtitle text for heading-like elements, null otherwise
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ if (element.tagName().startsWith("h")
+ && element.tagName().length() == 2) { // NOTE(review): matches h1..h6 but also "hr" - confirm this is intended
+ return element.text();
}
}
+ return null;
+ }
+ };
+ }
+
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) { // replies to a comment; doc is not used
+ return getSubCommentElements(container.getElementsByClass("children") // "comment" children of the first "children" element
+ .first());
+ }
+
+ @Override
+ protected String getCommentId(Element post) { // "id" attribute of the first <a> in the post, or "" if there is no <a>
+ Element idE = post.getElementsByTag("a").first();
+ if (idE != null) {
+ return idE.attr("id");
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getCommentAuthor(Element post) { // always ""; the author name is returned by getCommentTitle instead
+ // Since we have no title, we switch with author
+ return "";
+ }
+
+ @Override
+ protected String getCommentTitle(Element post) { // author name used as title, or "" if not found
+ // Since we have no title, we switch with author
+ Element authorE = post.getElementsByTag("footer").first();
+ if (authorE != null) {
+ authorE = authorE.getElementsByTag("cite").first(); // the name is the first <cite> inside the first <footer>
+ }
+ if (authorE != null) {
+ return authorE.text();
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getCommentDate(Element post) { // raw "data-epoch" attribute of the date <span>, or "" if not found
+ Element idE = post.getElementsByTag("a").first();
+ if (idE != null) {
+ Element dateE = idE.getElementsByTag("span").first(); // first <span> inside the first <a> of the post
+ if (dateE != null) {
+ return dateE.attr("data-epoch");
+ }
+ }
- Element contentE = post.getElementsByClass("comment-content")
- .first();
- if (contentE != null) {
- for (String line : toLines(contentE,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- // TODO: ignore headlines/pub
- if (node instanceof Element) {
- Element el = (Element) node;
- if ("h4".equals(el.tagName())) {
- return true;
- }
- }
-
- return false;
- }
- })) {
- content.add(line);
+ return "";
+ }
+
+ @Override
+ protected Element getCommentContentElement(Element post) { // first "comment-content" element of the post; null if there is none
+ Element contentE = post.getElementsByClass("comment-content").first();
+ return contentE;
+ }
+
+ @Override
+ protected ElementProcessor getElementProcessorComment() { // processor used for comment content
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) { // true only for <h4> elements, which are left out of the comment text
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("h4".equals(el.tagName())) {
+ return true;
}
}
- // Since we have no title but still an author, let's switch:
- title = author;
- author = "";
- Comment comment = new Comment(id, author, title, date, content);
- comments.add(comment);
+ return false;
+ }
+ };
+ }
- Element children = post.getElementsByClass("children").first();
- comment.addAll(getComments(children));
+ private List<Element> getSubCommentElements(Element posts) { // direct children of posts that have class "comment"
+ List<Element> commentElements = new ArrayList<Element>();
+ if (posts != null) { // a null container yields an empty list
+ for (Element possibleCommentElement : posts.children()) {
+ if (possibleCommentElement.hasClass("comment")) {
+ commentElements.add(possibleCommentElement);
+ }
}
}
- return comments;
+ return commentElements;
}
}