X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;ds=sidebyside;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FPipedot.java;h=0d044e5042a73c54c02d5353b9c8ebb0987dab0f;hb=254d5bc320fa106f9d0a05c1e32a09adda6f82e0;hp=89932f7636c170abc3b34b7c71542880b255984c;hpb=100a839503d23e324d2db3f6d3e47892def3bf81;p=gofetch.git
diff --git a/src/be/nikiroo/gofetch/support/Pipedot.java b/src/be/nikiroo/gofetch/support/Pipedot.java
index 89932f7..0d044e5 100644
--- a/src/be/nikiroo/gofetch/support/Pipedot.java
+++ b/src/be/nikiroo/gofetch/support/Pipedot.java
@@ -1,20 +1,17 @@
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-
/**
* Support https://pipedot.org/.
*
@@ -27,127 +24,207 @@ public class Pipedot extends BasicSupport {
}
@Override
- public List list() throws IOException {
- List list = new ArrayList();
+ protected List> getUrls() throws IOException {
+ List> urls = new ArrayList>();
+ urls.add(new AbstractMap.SimpleEntry(new URL(
+ "https://pipedot.org/"), ""));
+ return urls;
+ }
- URL url = new URL("https://pipedot.org/");
- InputStream in = open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("story");
- for (Element article : articles) {
- Elements titles = article.getElementsByTag("h1");
- if (titles.size() == 0) {
- continue;
- }
+ @Override
+ protected List getArticles(Document doc) {
+ return doc.getElementsByClass("story");
+ }
- Element title = titles.get(0);
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ // Don't try on bad articles
+ if (getArticleTitle(doc, article).isEmpty()) {
+ return "";
+ }
- String id = "";
- for (Element idElem : article.getElementsByTag("a")) {
- if (idElem.attr("href").startsWith("/pipe/")) {
- id = idElem.attr("href").substring("/pipe/".length());
- break;
- }
+ for (Element idElem : article.getElementsByTag("a")) {
+ if (idElem.attr("href").startsWith("/pipe/")) {
+ return idElem.attr("href").substring("/pipe/".length());
}
+ }
- String intUrl = null;
- String extUrl = null;
+ return "";
+ }
- Elements links = article.getElementsByTag("a");
- if (links.size() > 0) {
- intUrl = links.get(0).absUrl("href");
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element title = article.getElementsByTag("h1").first();
+ if (title != null) {
+ return title.text();
+ }
- // Take first ext URL as original source
- for (Element link : links) {
- String uuu = link.absUrl("href");
- if (!uuu.isEmpty() && !uuu.contains("pipedot.org/")) {
- extUrl = uuu;
- break;
- }
- }
+ return "";
+ }
- String details = "";
- Elements detailsElements = article.getElementsByTag("div");
- if (detailsElements.size() > 0) {
- details = detailsElements.get(0).text();
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+ String value = getArticleDetailsReal(article);
+ int pos = value.indexOf("by ");
+ if (pos >= 0) {
+ value = value.substring(pos + "by ".length()).trim();
+ pos = value.indexOf(" in ");
+ if (pos >= 0) {
+ value = value.substring(0, pos).trim();
}
- String body = "";
- for (Element elem : article.children()) {
- String tag = elem.tag().toString();
- if (!tag.equals("header") && !tag.equals("footer")) {
- body = elem.text();
- break;
- }
+ return value;
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+ Element dateElement = article.getElementsByTag("time").first();
+ if (dateElement != null) {
+ return dateElement.attr("datetime");
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ String value = getArticleDetailsReal(article);
+ int pos = value.indexOf(" in ");
+ if (pos >= 0) {
+ value = value.substring(pos + " in ".length()).trim();
+ pos = value.indexOf(" on ");
+ if (pos >= 0) {
+ value = value.substring(0, pos).trim();
}
- list.add(new Story(getType(), id, title.text(), details, intUrl,
- extUrl, body));
+ return value;
}
- return list;
+ return "";
}
@Override
- public void fetch(Story story) throws IOException {
- List comments = new ArrayList();
+ protected String getArticleDetails(Document doc, Element article) {
+ return ""; // We already extracted all the info
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements listing = doc.getElementsByTag("main");
- if (listing.size() > 0) {
- comments.addAll(getComments(listing.get(0)));
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element link = article.getElementsByTag("a").first();
+ if (link != null) {
+ return link.absUrl("href");
}
- story.setComments(comments);
+ return "";
}
- private List getComments(Element listing) {
- List comments = new ArrayList();
- for (Element commentElement : listing.children()) {
- if (commentElement.hasClass("comment")) {
- Comment comment = getComment(commentElement);
- if (!comment.isEmpty()) {
- comments.add(comment);
- }
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+ Element link = article.getElementsByTag("a").first();
+ if (link != null) {
+ String possibleExtLink = link.absUrl("href").trim();
+ if (!possibleExtLink.isEmpty()
+ && !possibleExtLink.contains("pipedot.org/")) {
+ return possibleExtLink;
}
}
- return comments;
+
+ return "";
}
- private Comment getComment(Element commentElement) {
- String title = firstOrEmptyTag(commentElement, "h3").text();
- String author = firstOrEmpty(commentElement, "h4").text();
- Element content = firstOrEmpty(commentElement, "comment-body");
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ for (Element elem : article.children()) {
+ String tag = elem.tagName();
+ if (!tag.equals("header") && !tag.equals("footer")) {
+ return getArticleText(elem);
+ }
+ }
+
+ return "";
+ }
- String date = "";
- int pos = author.lastIndexOf(" on ");
- if (pos >= 0) {
- date = author.substring(pos + " on ".length()).trim();
- author = author.substring(0, pos).trim();
+ @Override
+ protected Element getFullArticle(Document doc) {
+ return null;
+ }
+
+ @Override
+ protected List getFullArticleCommentPosts(Document doc, URL intUrl) {
+ return getCommentElements(doc.getElementsByTag("main").first());
+ }
+
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor();
+ }
+
+ @Override
+ protected List getCommentCommentPosts(Document doc,
+ Element container) {
+
+ if (container != null) {
+ container = container.getElementsByClass("comment-outline").first();
}
- Comment comment = new Comment(commentElement.id(), author, title, date,
- toLines(content));
+ return getCommentElements(container);
+ }
- Elements commentOutline = commentElement
- .getElementsByClass("comment-outline");
- if (commentOutline.size() > 0) {
- comment.addAll(getComments(commentOutline.get(0)));
+ @Override
+ protected String getCommentId(Element post) {
+ return post.id();
+ }
+
+ @Override
+ protected String getCommentAuthor(Element post) {
+ Element authorDateE = post.getElementsByTag("h3").first();
+ if (authorDateE != null) {
+ String authorDate = authorDateE.text();
+ int pos = authorDate.lastIndexOf(" on ");
+ if (pos >= 0) {
+ return authorDate.substring(0, pos).trim();
+ }
}
- return comment;
+ return "";
}
- private List toLines(Element element) {
- return toLines(element, new QuoteProcessor() {
- @Override
- public String processText(String text) {
- return text;
+ @Override
+ protected String getCommentTitle(Element post) {
+ Element title = post.getElementsByTag("h3").first();
+ if (title != null) {
+ return title.text();
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getCommentDate(Element post) {
+ Element authorDateE = post.getElementsByTag("h3").first();
+ if (authorDateE != null) {
+ String authorDate = authorDateE.text();
+ int pos = authorDate.lastIndexOf(" on ");
+ if (pos >= 0) {
+ return authorDate.substring(pos + " on ".length()).trim();
}
+ }
+
+ return "";
+ }
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ return post.getElementsByClass("comment-body").first();
+ }
+
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
@Override
public boolean detectQuote(Node node) {
if (node instanceof Element) {
@@ -160,16 +237,27 @@ public class Pipedot extends BasicSupport {
return false;
}
+ };
+ }
- @Override
- public boolean ignoreNode(Node node) {
- return false;
- }
+ private String getArticleDetailsReal(Element article) {
+ Elements detailsElements = article.getElementsByTag("div");
+ if (detailsElements.size() > 0) {
+ return detailsElements.get(0).text().trim();
+ }
- @Override
- public String manualProcessing(Node node) {
- return null;
+ return "";
+ }
+
+ private List getCommentElements(Element container) {
+ List commentElements = new ArrayList();
+ if (container != null) {
+ for (Element commentElement : container.children()) {
+ if (commentElement.hasClass("comment")) {
+ commentElements.add(commentElement);
+ }
}
- });
+ }
+ return commentElements;
}
}