import be.nikiroo.gofetch.output.Html;
import be.nikiroo.gofetch.output.Output;
import be.nikiroo.gofetch.support.BasicSupport;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
import be.nikiroo.utils.IOUtils;
/**
import java.io.File;
import java.io.IOException;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
/**
* This class is tha main entry point of the program. It will parse the
public boolean isEmpty() {
return children.isEmpty() && lines.isEmpty()
- && ("" + author + title).trim().isEmpty();
+ && ("" + author + title).isEmpty();
}
@Override
import java.util.List;
import be.nikiroo.gofetch.support.BasicSupport;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
/**
* A news story.
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
import be.nikiroo.utils.StringUtils;
import be.nikiroo.utils.StringUtils.Alignment;
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
+import be.nikiroo.utils.StringUtils;
public class Html extends Output {
public Html(Type type, String hostname, String preselector, int port) {
.append("<div class='comment' style='display: block; margin-left: 80px'>\n");
builder.append(space).append(" <h2>").append(comment.getTitle())
.append("</h2>\n");
- builder.append(space).append(" <div class='by' style='font-style: italic;'>")
+ builder.append(space)
+ .append(" <div class='by' style='font-style: italic;'>")
.append(comment.getAuthor()).append("</div>\n");
builder.append(space).append(" <div class='comment_content'>");
for (String line : comment.getContentLines()) {
builder.append(" <div class='details'>");
if (story.getDetails() != null && !story.getDetails().isEmpty()) {
- builder.append("(").append(story.getDetails()).append(")");
+ builder.append("(")
+ .append(StringUtils.xmlEscape(story.getDetails()))
+ .append(")");
}
builder.append("</div>\n");
builder.append(" <br/>\n");
builder.append(" <div class='content' style='text-align: justify'>\n");
if (resume) {
- builder.append(" " + story.getContent() + "\n");
+ builder.append(" " + StringUtils.xmlEscape(story.getContent())
+ + "\n");
} else {
builder.append(" "
- + story.getFullContent().replace("\n", "<br/>")
- .replace("[ ", "<h2>").replace(" ]", "</h2>")
- + "\n");
+ + StringUtils.xmlEscape(story.getFullContent())
+ .replace("\n", "<br/>").replace("[ ", "<h2>")
+ .replace(" ]", "</h2>") + "\n");
}
builder.append(" </div>\n");
package be.nikiroo.gofetch.output;
import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.gofetch.support.BasicSupport.Type;
+import be.nikiroo.gofetch.support.Type;
/**
* Base class for output operations.
--- /dev/null
+package be.nikiroo.gofetch.support;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * A default {@link ElementProcessor} (will not detect or process anything
+ * manually).
+ *
+ * @author niki
+ */
+class BasicElementProcessor implements ElementProcessor {
+ // Never reports a node as a quote.
+ @Override
+ public boolean detectQuote(Node node) {
+ return false;
+ }
+
+ // Returns the text untouched.
+ @Override
+ public String processText(String text) {
+ return text;
+ }
+
+ // Never asks for a node to be ignored.
+ @Override
+ public boolean ignoreNode(Node node) {
+ return false;
+ }
+
+ // Always NULL: no node is manually processed.
+ @Override
+ public String manualProcessing(Node node) {
+ return null;
+ }
+
+ // Always NULL: no node is reported as a subtitle.
+ @Override
+ public String isSubtitle(Node node) {
+ return null;
+ }
+}
package be.nikiroo.gofetch.support;
import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Date;
import java.util.List;
+import java.util.Map.Entry;
+import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
-import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
+import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
+import be.nikiroo.utils.StringUtils;
/**
* Base class for website support.
/** The downloader to use for all websites. */
protected static Downloader downloader = new Downloader("gofetcher");
+ static private String preselector;
+
+ private Type type;
+
+ /**
+ * The website textual description, to add in the dispatcher page.
+ * <p>
+ * Should be short.
+ *
+ * @return the description
+ */
+ abstract public String getDescription();
+
/**
- * The support type (each website we support has a single type).
- *
- * @author niki
- */
- public enum Type {
- /** EN: Any, but mostly IT/Sci */
- SLASHDOT,
- /** EN: Clone of Slashdot, mostly abandoned */
- PIPEDOT,
- /** EN: Linux */
- LWN,
- /** FR: Any */
- LEMONDE,
- /** EN: IT */
- REGISTER,
- /** FR: Linux */
- TOO_LINUX,
- /** FR: IT */
- ERE_NUMERIQUE,
+ * The gopher "selector" to use for output.
+ * <p>
+ * A kind of "URL path", like "/news/" or "/misc/news/" or...
+ *
+ * @return the selector
+ */
+ public String getSelector() {
+ return getSelector(type);
}
/**
- * Used to process an element into lines.
- *
- * @author niki
- */
- public interface ElementProcessor {
- /**
- * Detect if this node is a quote and should be trated as such.
- *
- * @param node
- * the node to check
- * @return TRUE if it is
- */
- public boolean detectQuote(Node node);
-
- /**
- * Process text content (will be called on each text element, allowing
- * you to modify it if needed).
- *
- * @param text
- * the text to process
- *
- * @return the resulting text
- */
- public String processText(String text);
-
- /**
- * Ignore this node.
- *
- * @param node
- * the node to ignore
- * @return TRUE if it has to be ignored
- */
- public boolean ignoreNode(Node node);
-
- /**
- * Manually process this node (and return the manual processing value)
- * if so desired.
- * <p>
- * If the node is manually processed, it and its children will not be
- * automatically processed.
- *
- * @param node
- * the node to optionally process
- *
- * @return NULL if not processed (will thus be automatically processed
- * as usual), a {@link String} (may be empty) if we process it
- * manually -- the given {@link String} will be used instead of
- * the usual automatic processing if not NULL
- */
- public String manualProcessing(Node node);
-
- /**
- * This {@link Node} is a subtitle and should be treated as such
- * (highlighted).
- *
- * @param node
- * the node to check
- *
- * @return NULL if it is not a subtitle, the subtitle to use if it is
- */
- public String isSubtitle(Node node);
+ * The support type.
+ *
+ * @return the type
+ */
+ public Type getType() {
+ return type;
}
/**
- * A default {@link ElementProcessor} (will not detect or process anything
- * manually).
+ * List all the recent items, but only assure the ID and internal URL to
+ * fetch it later on (until it has been fetched, the rest of the
+ * {@link Story} is not confirmed).
*
- * @author niki
+ * @return the list of new stories
+ *
+ * @throws IOException
+ * in case of I/O
*/
- public class BasicElementProcessor implements ElementProcessor {
- @Override
- public boolean detectQuote(Node node) {
- return false;
- }
+ public List<Story> list() throws IOException {
+ List<Story> list = new ArrayList<Story>();
+
+ for (Entry<URL, String> entry : getUrls()) {
+ URL url = entry.getKey();
+ // A NULL default category is treated as "no category"
+ String defaultCateg = entry.getValue();
+ if (defaultCateg == null) {
+ defaultCateg = "";
+ }
- @Override
- public String processText(String text) {
- return text;
- }
+ InputStream in = downloader.open(url);
+ // Always release the stream, even if parsing or a website-specific
+ // accessor fails (same pattern as in fetch(Story))
+ try {
+ Document doc = DataUtil.load(in, "UTF-8", url.toString());
+ List<Element> articles = getArticles(doc);
+ for (Element article : articles) {
+ String id = getArticleId(doc, article).trim();
+ String title = getArticleTitle(doc, article).trim();
+ String author = getArticleAuthor(doc, article).trim();
+ String date = getArticleDate(doc, article).trim();
+ String categ = getArticleCategory(doc, article, defaultCateg)
+ .trim();
+ String details = getArticleDetails(doc, article).trim();
+ String intUrl = getArticleIntUrl(doc, article).trim();
+ String extUrl = getArticleExtUrl(doc, article).trim();
+ String content = getArticleContent(doc, article).trim();
+
+ // Without an ID nor a date, the article cannot be identified
+ if (id.isEmpty() && date.isEmpty()) {
+ continue;
+ }
- @Override
- public boolean ignoreNode(Node node) {
- return false;
- }
+ // No ID: derive one from the raw date (before reformatting it)
+ if (id.isEmpty()) {
+ id = date.replace(":", "_").replace("+", "_");
+ }
- @Override
- public String manualProcessing(Node node) {
- return null;
- }
+ date = date(date);
- @Override
- public String isSubtitle(Node node) {
- return null;
+ list.add(new Story(getType(), id, title, author, date, categ,
+ details, intUrl, extUrl, content));
+ }
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
}
+
+ return list;
}
- static private String preselector;
+ /**
+ * The {@link URL}s to process for this website.
+ *
+ * @return the list of {@link URL}s
+ *
+ * @throws IOException
+ * in case of I/O error
+ */
+ abstract protected List<Entry<URL, String>> getUrls() throws IOException;
- private Type type;
+ /**
+ * The article {@link Element}s of this document.
+ *
+ * @param doc
+ * the main document for the current category
+ *
+ * @return the articles
+ */
+ abstract protected List<Element> getArticles(Document doc);
/**
- * List all the recent items, but only assure the ID and internal URL to
- * fetch it later on (until it has been fetched, the rest of the
- * {@link Story} is not confirmed).
+ * The ID of the article (defaults to the date element if empty).
*
- * @return the list of new stories
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
*
- * @throws IOException
- * in case of I/O
+ * @return the ID
+ */
+ abstract protected String getArticleId(Document doc, Element article);
+
+ /**
+ * The article title to display.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the title
+ */
+ abstract protected String getArticleTitle(Document doc, Element article);
+
+ /**
+ * The optional article author.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the author
+ */
+ abstract protected String getArticleAuthor(Document doc, Element article);
+
+ /**
+ * The optional article date.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the date
+ */
+ abstract protected String getArticleDate(Document doc, Element article);
+
+ /**
+ * The optional article category.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ * @param currentCategory
+ * the currently listed category if any (can be NULL)
+ *
+ * @return the category
*/
- abstract public List<Story> list() throws IOException;
+ abstract protected String getArticleCategory(Document doc, Element article,
+ String currentCategory);
+
+ /**
+ * The optional details of the article (can replace the date, author and
+ * category, for instance).
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the details
+ */
+ abstract protected String getArticleDetails(Document doc, Element article);
+
+ /**
+ * The (required) {@link URL} that points to the news page on the supported
+ * website.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the internal {@link URL}
+ */
+ abstract protected String getArticleIntUrl(Document doc, Element article);
+
+ /**
+ * The optional {@link URL} that points to an external website for more
+ * information.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the external {@link URL}
+ */
+ abstract protected String getArticleExtUrl(Document doc, Element article);
+
+ /**
+ * The optional article short-content (not the full content, that will be
+ * fetched by {@link BasicSupport#fetch(Story)}).
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the short content
+ */
+ abstract protected String getArticleContent(Document doc, Element article);
/**
* Fetch the full article content as well as all the comments associated to
* @throws IOException
* in case of I/O error
*/
- abstract public void fetch(Story story) throws IOException;
+ public void fetch(Story story) throws IOException {
+ URL url = new URL(story.getUrlInternal());
+ InputStream in = downloader.open(url);
+ try {
+ Document doc = DataUtil.load(in, "UTF-8", url.toString());
+
+ String text = "";
+ Element article = getFullArticle(doc);
+ if (article != null) {
+ StringBuilder raw = new StringBuilder();
+ ElementProcessor processor = getElementProcessorFullArticle();
+ if (processor == null) {
+ // No processor: fall back on the plain text of the element
+ raw.append(article.text());
+ } else {
+ for (String line : toLines(article, processor)) {
+ raw.append(line).append("\n");
+ }
+ }
+
+ // A single break per line is too tight: double every break,
+ // then collapse the longer runs this creates (two passes)
+ String spaced = raw.toString().replace("\n", "\n\n");
+ spaced = spaced.replace("\n\n\n\n", "\n\n");
+ spaced = spaced.replace("\n\n\n\n", "\n\n");
+ text = spaced.trim();
+ }
+
+ // Nothing usable in the full page: keep the short content instead
+ story.setFullContent(text.isEmpty() ? story.getContent() : text);
+
+ List<Element> posts = getFullArticleCommentPosts(doc, url);
+ story.setComments(getComments(doc, posts));
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
+ }
/**
- * The website textual description, to add in the dispatcher page.
- * <p>
- * Should be short.
+ * Return the full article if available.
*
- * @return the description
+ * @param doc
+ * the (full article) document to work on
+ *
+ * @return the article or NULL
*/
- abstract public String getDescription();
+ abstract protected Element getFullArticle(Document doc);
/**
- * The gopher "selector" to use for output.
+ * Return the list of comment {@link Element}s from this optional container
+ * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+ *
+ * @param doc
+ * the (full article) document to work on
+ * @param intUrl
+ * the internal {@link URL} this article was taken from (the
+ * {@link URL} from the supported website)
+ *
+ * @return the list of comment posts
+ */
+ abstract protected List<Element> getFullArticleCommentPosts(Document doc,
+ URL intUrl);
+
+ /**
+ * The {@link ElementProcessor} to use to convert the main article element
+ * (see {@link BasicSupport#getFullArticle(Document)}) into text.
* <p>
- * A kind of "URL path", like "/news/" or "/misc/news/" or...
+ * See {@link BasicElementProcessor} for a working, basic implementation.
+ * <p>
+ * Can be NULL to simply use {@link Element#text()}.
*
- * @return the selector
+ * @return the processor, or NULL
*/
- public String getSelector() {
- return getSelector(type);
- }
+ abstract protected ElementProcessor getElementProcessorFullArticle();
/**
- * The support type.
+ * Convert the comment elements into {@link Comment}s
*
- * @return the type
+ * @param doc
+ * the document we work on
+ * @param posts
+ * the comment elements
+ *
+ * @return the converted {@link Comment}s
*/
- public Type getType() {
- return type;
+ private List<Comment> getComments(Document doc, List<Element> posts) {
+ List<Comment> result = new ArrayList<Comment>();
+ if (posts == null) {
+ return result;
+ }
+
+ for (Element post : posts) {
+ String id = getCommentId(post).trim();
+ String author = getCommentAuthor(post).trim();
+ String title = getCommentTitle(post).trim();
+ String rawDate = getCommentDate(post).trim();
+
+ // No explicit ID: the raw (not yet reformatted) date stands in
+ if (id.isEmpty()) {
+ id = rawDate;
+ }
+ String date = date(rawDate);
+
+ List<String> content = new ArrayList<String>();
+ Element contentE = getCommentContentElement(post);
+ if (contentE != null) {
+ ElementProcessor processor = getElementProcessorComment();
+ if (processor == null) {
+ // No processor: plain text, one entry per line
+ content = Arrays.asList(contentE.text().split("\n"));
+ } else {
+ content.addAll(toLines(contentE, processor));
+ }
+ }
+
+ Comment comment = new Comment(id, author, title, date, content);
+ // Sub-comments are converted recursively
+ List<Element> subPosts = getCommentCommentPosts(doc, post);
+ comment.addAll(getComments(doc, subPosts));
+
+ if (!comment.isEmpty()) {
+ result.add(comment);
+ }
+ }
+
+ return result;
}
+ /**
+ * Return the list of subcomment {@link Element}s from this comment element
+ * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+ *
+ * @param doc
+ * the (full article) document to work on
+ * @param container
+ * the container (a comment {@link Element})
+ *
+ * @return the list of comment posts
+ */
+ abstract protected List<Element> getCommentCommentPosts(Document doc,
+ Element container);
+
+ /**
+ * Compute the ID of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the ID
+ */
+ abstract protected String getCommentId(Element post);
+
+ /**
+ * Compute the author of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the author
+ */
+ abstract protected String getCommentAuthor(Element post);
+
+ /**
+ * Compute the title of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the title
+ */
+ abstract protected String getCommentTitle(Element post);
+
+ /**
+ * Compute the date of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the date
+ */
+ abstract protected String getCommentDate(Element post);
+
+ /**
+ * Get the main content element of the given comment element, which can be NULL.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the element
+ */
+ abstract protected Element getCommentContentElement(Element post);
+
+ /**
+ * The {@link ElementProcessor} to use to convert the main comment element
+ * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
+ * <p>
+ * See {@link BasicElementProcessor} for a working, basic implementation.
+ * <p>
+ * Can be NULL to simply use {@link Element#text()}.
+ *
+ * @return the processor
+ */
+ abstract protected ElementProcessor getElementProcessorComment();
+
/**
* The support type.
*
return preselector + "/" + type + "/";
}
- /**
- * Get the first {@link Element} of the given class, or an empty span
- * {@link Element} if none found.
- *
- * @param element
- * the element to look in
- * @param className
- * the class to look for
- *
- * @return the value or an empty span {@link Element}
- */
- static protected Element firstOrEmpty(Element element, String className) {
- Elements subElements = element.getElementsByClass(className);
- if (subElements.size() > 0) {
- return subElements.get(0);
- }
-
- return new Element("span");
- }
-
- /**
- * Get the first {@link Element} of the given tag, or an empty span
- * {@link Element} if none found.
- *
- * @param element
- * the element to look in
- * @param tagName
- * the tag to look for
- *
- * @return the value or an empty span {@link Element}
- */
- static protected Element firstOrEmptyTag(Element element, String tagName) {
- Elements subElements = element.getElementsByTag(tagName);
- if (subElements.size() > 0) {
- return subElements.get(0);
- }
-
- return new Element("span");
- }
-
/**
* Process the given element into text (each line is a text paragraph and
* can be prepended with ">" signs to indicate a quote or sub-quote or
final StringBuilder currentLine = new StringBuilder();
final List<Integer> quoted = new ArrayList<Integer>();
final List<Node> ignoredNodes = new ArrayList<Node>();
+ final List<String> footnotes = new ArrayList<String>();
if (element != null) {
new NodeTraversor(new NodeVisitor() {
}
}
+ // <pre> check
+ if (!ignore) {
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("pre".equals(el.tagName())) {
+ currentLine.append(StringUtils
+ .unhtml(el.text()).trim());
+ ignore = true;
+ }
+ }
+ }
+
if (ignore) {
ignoredNodes.add(node);
return;
if (block && currentLine.length() > 0) {
currentLine.append("\n");
}
+
+ if (!element.absUrl("href").trim().isEmpty()) {
+ footnotes.add(element.absUrl("href"));
+ currentLine.append("[" + footnotes.size() + "]");
+ }
} else if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String line = StringUtil.normaliseWhitespace(textNode
}
}
+ // Fix spaces and nbsp, remove multiple following blank lines
+ List<String> linesCopy = new ArrayList<String>(lines.size());
+ long blanks = 0;
for (int i = 0; i < lines.size(); i++) {
- lines.set(i, lines.get(i).replace(" ", " ").trim());
+ String line = lines.get(i).replace("Â ", " ") // nbsp -> space
+ .replace(" ", " ").trim();
+ if (line.isEmpty()) {
+ blanks++;
+ } else {
+ blanks = 0;
+ }
+
+ if (blanks < 2) {
+ linesCopy.add(line);
+ }
+ }
+
+ // Footnotes insertion
+ if (footnotes.size() > 0) {
+ linesCopy.add("");
+ linesCopy.add("");
+ linesCopy.add("");
+ linesCopy.add("");
+ for (int i = 0; i < footnotes.size(); i++) {
+ linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+ }
}
- return lines;
+ return linesCopy;
}
/**
*
* @return the reformated date, or the same value if it was not parsable
*/
- static protected String date(String date) {
+ static private String date(String date) {
SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
long epoch = 0;
--- /dev/null
+package be.nikiroo.gofetch.support;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * Used to process an element into lines.
+ *
+ * @author niki
+ */
+interface ElementProcessor {
+ /**
+ * Detect if this node is a quote and should be treated as such.
+ *
+ * @param node
+ * the node to check
+ * @return TRUE if it is
+ */
+ public boolean detectQuote(Node node);
+
+ /**
+ * Process text content (will be called on each text element, allowing you
+ * to modify it if needed).
+ *
+ * @param text
+ * the text to process
+ *
+ * @return the resulting text
+ */
+ public String processText(String text);
+
+ /**
+ * Ignore this node.
+ *
+ * @param node
+ * the node to ignore
+ * @return TRUE if it has to be ignored
+ */
+ public boolean ignoreNode(Node node);
+
+ /**
+ * Manually process this node (and return the manual processing value) if so
+ * desired.
+ * <p>
+ * If the node is manually processed, it and its children will not be
+ * automatically processed.
+ *
+ * @param node
+ * the node to optionally process
+ *
+ * @return NULL if not processed (will thus be automatically processed as
+ * usual), a {@link String} (may be empty) if we process it manually
+ * -- the given {@link String} will be used instead of the usual
+ * automatic processing if not NULL
+ */
+ public String manualProcessing(Node node);
+
+ /**
+ * This {@link Node} is a subtitle and should be treated as such
+ * (highlighted).
+ *
+ * @param node
+ * the node to check
+ *
+ * @return NULL if it is not a subtitle, the subtitle to use if it is
+ */
+ public String isSubtitle(Node node);
+}
\ No newline at end of file
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
/**
* Support <a
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- for (String categ : new String[] { "informatique" }) {
- URL url = new URL("https://www.erenumerique.fr/" + categ);
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("item-details");
- for (Element article : articles) {
- String id = "";
- String intUrl = "";
- String extUrl = ""; // nope
- String title = "";
- String date = "";
- String author = "";
- String details = "";
- String body = "";
-
- // MUST NOT fail:
- Element dateElement = article //
- .getElementsByTag("time").first();
- if (dateElement == null) {
- continue;
- }
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ for (String categ : new String[] { "Informatique" }) {
+ URL url = new URL("https://www.erenumerique.fr/"
+ + categ.toLowerCase());
+ urls.add(new AbstractMap.SimpleEntry<URL, String>(url, categ));
+ }
- Element urlElement = article.getElementsByTag("a").first();
- if (urlElement != null) {
- intUrl = urlElement.absUrl("href");
- }
+ return urls;
+ }
- id = dateElement.attr("datetime").replace(":", "_")
- .replace("+", "_");
- date = date(dateElement.attr("datetime"));
+ @Override
+ protected List<Element> getArticles(Document doc) {
+ return doc.getElementsByClass("item-details");
+ }
- Element titleElement = article.getElementsByTag("h2").first();
- if (titleElement != null) {
- title = StringUtils.unhtml(titleElement.text()).trim();
- }
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ return ""; // will use the date
+ }
- Element authorElement = article.getElementsByClass(
- "td-post-author-name").first();
- if (authorElement != null) {
- authorElement = authorElement.getElementsByTag("a").first();
- }
- if (authorElement != null) {
- author = StringUtils.unhtml(authorElement.text()).trim();
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element titleElement = article.getElementsByTag("h2").first();
+ if (titleElement != null) {
+ return titleElement.text();
+ }
- Element contentElement = article.getElementsByClass(
- "td-excerpt").first();
- if (contentElement != null) {
- body = StringUtils.unhtml(contentElement.text()).trim();
- }
+ return "";
+ }
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
- }
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+ Element authorElement = article.getElementsByClass(
+ "td-post-author-name").first();
+ if (authorElement != null) {
+ authorElement = authorElement.getElementsByTag("a").first();
+ }
+ if (authorElement != null) {
+ return authorElement.text();
}
- return list;
+ return "";
}
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
+ protected String getArticleDate(Document doc, Element article) {
+ Element dateElement = article //
+ .getElementsByTag("time").first();
+ if (dateElement != null) {
+ return dateElement.attr("datetime");
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- try {
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementsByTag("article").first();
- if (article != null) {
- article = article.getElementsByAttributeValue("itemprop",
- "articleBody").first();
- }
- if (article != null) {
- for (String line : toLines(article,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- return node.attr("class").contains("chapo");
- }
-
- @Override
- public String isSubtitle(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.tagName().startsWith("h")
- && element.tagName().length() == 2) {
- return element.text();
- }
- }
- return null;
- }
- })) {
- fullContent += line + "\n";
- }
+ return "";
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
- }
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ return currentCategory;
+ }
- // Get comments URL then parse it, if possible
- Element posts = doc.getElementsByClass("comment-list").first();
+ @Override
+ protected String getArticleDetails(Document doc, Element article) {
+ return "";
+ }
- story.setFullContent(fullContent);
- story.setComments(getComments(posts));
- } finally {
- if (in != null) {
- in.close();
- }
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element urlElement = article.getElementsByTag("a").first();
+ if (urlElement != null) {
+ return urlElement.absUrl("href");
}
+
+ return "";
}
- private List<Comment> getComments(Element posts) {
- List<Comment> comments = new ArrayList<Comment>();
- if (posts != null) {
- for (Element post : posts.children()) {
- if (!post.hasClass("comment")) {
- continue;
- }
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+ return "";
+ }
+
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ Element contentElement = article.getElementsByClass("td-excerpt")
+ .first();
+ if (contentElement != null) {
+ return contentElement.text();
+ }
- String id = "";
- String author = "";
- String title = "";
- String date = "";
- List<String> content = new ArrayList<String>();
+ return "";
+ }
- Element authorE = post.getElementsByTag("footer").first();
- if (authorE != null) {
- authorE = authorE.getElementsByTag("cite").first();
- }
- if (authorE != null) {
- author = StringUtils.unhtml(authorE.text()).trim();
- }
+ @Override
+ protected Element getFullArticle(Document doc) {
+ Element article = doc.getElementsByTag("article").first();
+ if (article != null) {
+ article = article.getElementsByAttributeValue("itemprop",
+ "articleBody").first();
+ }
+
+ return article;
+ }
+
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+ return getSubCommentElements(doc.getElementsByClass("comment-list")
+ .first());
+ }
- Element idE = post.getElementsByTag("a").first();
- if (idE != null) {
- id = idE.attr("id");
- Element dateE = idE.getElementsByTag("span").first();
- if (dateE != null) {
- date = date(dateE.attr("data-epoch"));
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ return node.attr("class").contains("chapo");
+ }
+
+ @Override
+ public String isSubtitle(Node node) {
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ if (element.tagName().startsWith("h")
+ && element.tagName().length() == 2) {
+ return element.text();
}
}
+ return null;
+ }
+ };
+ }
+
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
+ return getSubCommentElements(container.getElementsByClass("children")
+ .first());
+ }
+
+ @Override
+ protected String getCommentId(Element post) {
+ Element idE = post.getElementsByTag("a").first();
+ if (idE != null) {
+ return idE.attr("id");
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getCommentAuthor(Element post) {
+ // Since we have no title, we switch with author
+ return "";
+ }
+
+ @Override
+ protected String getCommentTitle(Element post) {
+ // Since we have no title, we switch with author
+ Element authorE = post.getElementsByTag("footer").first();
+ if (authorE != null) {
+ authorE = authorE.getElementsByTag("cite").first();
+ }
+ if (authorE != null) {
+ return authorE.text();
+ }
+
+ return "";
+ }
+
+ @Override
+ protected String getCommentDate(Element post) {
+ Element idE = post.getElementsByTag("a").first();
+ if (idE != null) {
+ Element dateE = idE.getElementsByTag("span").first();
+ if (dateE != null) {
+ return dateE.attr("data-epoch");
+ }
+ }
- Element contentE = post.getElementsByClass("comment-content")
- .first();
- if (contentE != null) {
- for (String line : toLines(contentE,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- // TODO: ignore headlines/pub
- if (node instanceof Element) {
- Element el = (Element) node;
- if ("h4".equals(el.tagName())) {
- return true;
- }
- }
-
- return false;
- }
- })) {
- content.add(line);
+ return "";
+ }
+
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ Element contentE = post.getElementsByClass("comment-content").first();
+ return contentE;
+ }
+
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("h4".equals(el.tagName())) {
+ return true;
}
}
- // Since we have no title but still an author, let's switch:
- title = author;
- author = "";
- Comment comment = new Comment(id, author, title, date, content);
- comments.add(comment);
+ return false;
+ }
+ };
+ }
- Element children = post.getElementsByClass("children").first();
- comment.addAll(getComments(children));
+ private List<Element> getSubCommentElements(Element posts) {
+ List<Element> commentElements = new ArrayList<Element>();
+ if (posts != null) {
+ for (Element possibleCommentElement : posts.children()) {
+ if (possibleCommentElement.hasClass("comment")) {
+ commentElements.add(possibleCommentElement);
+ }
}
}
- return comments;
+ return commentElements;
}
}
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
+import org.jsoup.nodes.TextNode;
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
+ // Fetch the full content and comments of a story, except for stories
+ // whose title starts with "[$]": those get a fixed notice as content
+ // and an empty comment list instead of being downloaded.
+ public void fetch(Story story) throws IOException {
+ // Do not try the paid-for stories...
+ if (!story.getTitle().startsWith("[$]")) {
+ super.fetch(story);
+ } else {
+ // NOTE(review): "suscribers" is misspelled in this user-visible text
+ String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
+ story.setFullContent(fullContent);
+ story.setComments(new ArrayList<Comment>());
+ }
+ }
- URL url = new URL("https://lwn.net/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("pure-u-1");
- for (Element article : articles) {
- Elements titles = article.getElementsByClass("Headline");
- Elements listings = article.getElementsByClass("BlurbListing");
- if (titles.size() == 0) {
- continue;
- }
- if (listings.size() == 0) {
- continue;
- }
+    /**
+     * List the pages to scan for stories: the LWN front page only, paired
+     * with an empty category.
+     *
+     * @return a list with a single (URL, category) entry
+     *
+     * @throws IOException
+     *             if the URL is malformed
+     */
+    @Override
+    protected List<Entry<URL, String>> getUrls() throws IOException {
+        URL frontPage = new URL("https://lwn.net/");
+        Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+                frontPage, "");
+
+        List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+        urls.add(entry);
+        return urls;
+    }
- Element listing = listings.get(0);
- if (listing.children().size() < 2) {
- continue;
- }
+    /**
+     * Select the candidate article elements of a listing page.
+     *
+     * @param doc
+     *            the listing page
+     *
+     * @return every element carrying the "pure-u-1" class
+     */
+    @Override
+    protected List<Element> getArticles(Document doc) {
+        List<Element> articles = doc.getElementsByClass("pure-u-1");
+        return articles;
+    }
- String title = titles.get(0).text();
- String details = listing.children().get(0).text();
- String body = "";
- // All but the first and two last children
- for (int i = 1; i < listing.children().size() - 2; i++) {
- Element e = listing.children().get(i);
- body = body.trim() + " " + e.text().trim();
- }
- body = body.trim();
+    /**
+     * Derive the article id from its internal URL by keeping only the
+     * digits of that URL.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the digits of the internal URL (empty if it has none)
+     */
+    @Override
+    protected String getArticleId(Document doc, Element article) {
+        String intUrl = getArticleIntUrl(doc, article);
+        return intUrl.replaceAll("[^0-9]", "");
+    }
- int pos;
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element title = article.getElementsByClass("Headline").first();
+ if (title != null) {
+ return title.text();
+ }
- String categ = "";
- pos = details.indexOf("]");
- if (pos >= 0) {
- categ = details.substring(1, pos).trim();
- }
+ return "";
+ }
- String author = "";
- pos = details.indexOf(" by ");
- if (pos >= 0) {
- author = details.substring(pos + " by ".length()).trim();
- }
+    /**
+     * Extract the author from the details line of an article: everything
+     * that follows the first " by ".
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the trimmed author, or an empty String if " by " is absent
+     */
+    @Override
+    protected String getArticleAuthor(Document doc, Element article) {
+        final String marker = " by ";
+
+        String details = getArticleDetailsReal(article);
+        int start = details.indexOf(marker);
+        if (start < 0) {
+            return "";
+        }
+
+        return details.substring(start + marker.length()).trim();
+    }
- String date = "";
- pos = details.indexOf(" Posted ");
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+ String date = "";
+ String details = getArticleDetailsReal(article);
+ int pos = details.indexOf(" Posted ");
+ if (pos >= 0) {
+ date = details.substring(pos + " Posted ".length()).trim();
+ pos = date.indexOf(" by ");
if (pos >= 0) {
- date = details.substring(pos + " Posted ".length()).trim();
- pos = date.indexOf(" by ");
- if (pos >= 0) {
- date = date.substring(0, pos).trim();
- }
+ date = date.substring(0, pos).trim();
}
+ }
- // We extracted everything from details so...
- details = "";
-
- String id = "";
- String intUrl = "";
- String extUrl = "";
- for (Element idElem : article.getElementsByTag("a")) {
- // Last link is the story link
- intUrl = idElem.absUrl("href");
- pos = intUrl.indexOf("#Comments");
- if (pos >= 0) {
- intUrl = intUrl.substring(0, pos - 1);
- }
- id = intUrl.replaceAll("[^0-9]", "");
- }
+ return date;
+ }
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
+    /**
+     * Extract the category from the details line of an article: the text
+     * between the first character and the first "]".
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     * @param currentCategory
+     *            the category of the listing page (not used here)
+     *
+     * @return the trimmed category, or an empty String if no "]" is found
+     *         after the first character
+     */
+    @Override
+    protected String getArticleCategory(Document doc, Element article,
+            String currentCategory) {
+        String categ = "";
+        String details = getArticleDetailsReal(article);
+        int pos = details.indexOf("]");
+        // A "]" at index 0 would make substring(1, 0) throw, so require
+        // at least one character before it
+        if (pos >= 1) {
+            categ = details.substring(1, pos).trim();
}
- return list;
+        return categ;
}
@Override
- public void fetch(Story story) throws IOException {
- List<Comment> comments = new ArrayList<Comment>();
- String fullContent = story.getContent();
+ protected String getArticleDetails(Document doc, Element article) {
+ return ""; // We actually extract all the values
+ }
- // Do not try the paid-for stories...
- if (!story.getTitle().startsWith("[$]")) {
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements fullContentElements = doc
- .getElementsByClass("ArticleText");
- if (fullContentElements.size() > 0) {
- // comments.addAll(getComments(listing.get(0)));
- fullContent = fullContentElements.get(0).text();
+ /**
+ * Get the internal URL of an article: the absolute target of the last
+ * link found in the article, cut before any "#Comments" fragment.
+ *
+ * @param doc
+ * the listing page
+ * @param article
+ * the article element
+ *
+ * @return the URL, or an empty String if the article has no link
+ */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ String intUrl = "";
+ for (Element idElem : article.getElementsByTag("a")) {
+ // Last link is the story link
+ intUrl = idElem.absUrl("href");
+ int pos = intUrl.indexOf("#Comments");
+ if (pos >= 0) {
+ // "pos - 1" also drops the character right before the fragment
+ // NOTE(review): presumably a trailing "/" -- confirm; would throw
+ // if the fragment were at index 0
+ intUrl = intUrl.substring(0, pos - 1);
}
+ }
- Elements listing = doc.getElementsByClass("lwn-u-1");
- if (listing.size() > 0) {
- comments.addAll(getComments(listing.get(0)));
+ return intUrl;
+ }
+
+    /**
+     * No external URL is extracted for this site.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return always an empty String
+     */
+    @Override
+    protected String getArticleExtUrl(Document doc, Element article) {
+        return "";
+    }
+
+    /**
+     * Build the short content of an article from its "BlurbListing"
+     * element: the texts of all its children but the first and the two
+     * last ones, joined with single spaces.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the trimmed content, or an empty String if the listing is
+     *         missing or has fewer than 2 children
+     */
+    @Override
+    protected String getArticleContent(Document doc, Element article) {
+        Element listing = article.getElementsByClass("BlurbListing").first();
+        if (listing != null && listing.children().size() >= 2) {
+            String content = "";
+
+            // All but the first and two last children
+            for (int i = 1; i < listing.children().size() - 2; i++) {
+                Element e = listing.children().get(i);
+                content = content.trim() + " " + e.text().trim();
}
- } else {
- fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/].";
+
+            // Each part was prefixed with a space: drop the leading one
+            // (the replaced implementation trimmed the result too)
+            return content.trim();
}
- story.setFullContent(fullContent);
- story.setComments(comments);
+        return "";
+    }
+
+    /**
+     * Locate the element holding the full text of a story.
+     *
+     * @param doc
+     *            the story page
+     *
+     * @return the first element carrying the "ArticleText" class, or NULL
+     *         when there is none
+     */
+    @Override
+    protected Element getFullArticle(Document doc) {
+        Element articleText = doc.getElementsByClass("ArticleText").first();
+        return articleText;
}
- private List<Comment> getComments(Element listing) {
- List<Comment> comments = new ArrayList<Comment>();
- for (Element commentElement : listing.children()) {
- if (commentElement.hasClass("CommentBox")) {
- Comment comment = getComment(commentElement);
- if (!comment.isEmpty()) {
- comments.add(comment);
+    /**
+     * Select the elements of a story page that may hold comments.
+     *
+     * @param doc
+     *            the story page
+     * @param intUrl
+     *            the internal URL of the story (not used here)
+     *
+     * @return every element carrying the "lwn-u-1" class
+     */
+    @Override
+    protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+        List<Element> posts = doc.getElementsByClass("lwn-u-1");
+        return posts;
+    }
+
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("Log in".equals(el.text().trim())) {
+ return true;
+ }
+ } else if (node instanceof TextNode) {
+ TextNode text = (TextNode) node;
+ String t = text.text().trim();
+ if (t.equals("(") || t.equals("to post comments)")) {
+ return true;
+ }
}
- } else if (commentElement.hasClass("Comment")) {
- if (comments.size() > 0) {
- comments.get(comments.size() - 1).addAll(
- getComments(commentElement));
+
+ return false;
+ }
+ };
+ }
+
+ /**
+ * List the comment elements found directly under the given container:
+ * its children carrying either the "CommentBox" or the "Comment" class.
+ *
+ * @param doc
+ * the story page (not used here)
+ * @param container
+ * the element to look into, can be NULL
+ *
+ * @return the matching children, in document order (empty if none or if
+ * the container is NULL)
+ */
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
+ List<Element> commentElements = new ArrayList<Element>();
+ if (container != null) {
+ for (Element possibleCommentElement : container.children()) {
+ // Both kinds are kept, without distinction
+ if (possibleCommentElement.hasClass("CommentBox")) {
+ commentElements.add(possibleCommentElement);
+ } else if (possibleCommentElement.hasClass("Comment")) {
+ commentElements.add(possibleCommentElement);
}
}
}
- return comments;
+
+ return commentElements;
}
- private Comment getComment(Element commentElement) {
- String title = firstOrEmpty(commentElement, "CommentTitle").text();
- String author = firstOrEmpty(commentElement, "CommentPoster").text();
+    /**
+     * Get the id of a comment.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the "id" of the element, as given by {@link Element#id()}
+     */
+    @Override
+    protected String getCommentId(Element post) {
+        String id = post.id();
+        return id;
+    }
- String date = "";
- int pos = author.lastIndexOf(" by ");
- if (pos >= 0) {
- date = author.substring(0, pos).trim();
- author = author.substring(pos + " by ".length()).trim();
+    /**
+     * Extract the author of a comment from its "CommentPoster" element:
+     * everything that follows the last " by ".
+     * <p>
+     * The author is returned whether or not it starts with "Posted "; that
+     * prefix is only stripped when present. (Returning the value solely
+     * when the prefix was found left the author empty in every other
+     * case.)
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the trimmed author, or an empty String if the element or the
+     *         " by " separator is missing
+     */
+    @Override
+    protected String getCommentAuthor(Element post) {
+        Element detailsE = post.getElementsByClass("CommentPoster").first();
+        if (detailsE != null) {
+            String details = detailsE.text();
+
+            int pos = details.lastIndexOf(" by ");
+            if (pos >= 0) {
+                details = details.substring(pos + " by ".length()).trim();
- if (author.startsWith("Posted ")) {
- author = author.substring("Posted ".length()).trim();
+                if (details.startsWith("Posted ")) {
+                    details = details.substring("Posted ".length()).trim();
+                }
+
+                return details;
}
}
- Element content = null;
- Elements commentBodyElements = commentElement
- .getElementsByClass("CommentBody");
- if (commentBodyElements.size() > 0) {
- content = commentBodyElements.get(0);
+        return "";
+    }
+
+    /**
+     * Get the title of a comment.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the text of the first element carrying the "CommentTitle"
+     *         class, or an empty String when there is none
+     */
+    @Override
+    protected String getCommentTitle(Element post) {
+        Element titleE = post.getElementsByClass("CommentTitle").first();
+        return titleE == null ? "" : titleE.text();
+    }
+
+ /**
+ * Extract the date of a comment from its "CommentPoster" element:
+ * everything that precedes the last " by ".
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the trimmed text before " by ", or an empty String if the
+ * element or the separator is missing
+ */
+ @Override
+ protected String getCommentDate(Element post) {
+ Element detailsE = post.getElementsByClass("CommentPoster").first();
+ if (detailsE != null) {
+ String details = detailsE.text();
+
+ int pos = details.lastIndexOf(" by ");
+ if (pos >= 0) {
+ // NOTE(review): any leading "Posted " is kept in the returned
+ // value -- confirm this is what the caller expects
+ return details.substring(0, pos).trim();
+ }
}
- Comment comment = new Comment(commentElement.id(), author, title, date,
- toLines(content));
+ return "";
+ }
- return comment;
+    /**
+     * Locate the element holding the text of a comment.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the first element carrying the "CommentBody" class, or NULL
+     *         when there is none
+     */
+    @Override
+    protected Element getCommentContentElement(Element post) {
+        Element body = post.getElementsByClass("CommentBody").first();
+        return body;
}
- private List<String> toLines(Element element) {
- return toLines(element, new BasicElementProcessor() {
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
@Override
public String processText(String text) {
while (text.startsWith(">")) { // comments
return false;
}
- });
+ };
+ }
+
+    // Return the raw details line of an article (text of the first child
+    // of its "BlurbListing" element), or an empty String if the listing is
+    // missing or has fewer than 2 children.
+    private String getArticleDetailsReal(Element article) {
+        Element listing = article.getElementsByClass("BlurbListing").first();
+        if (listing == null) {
+            return "";
+        }
+
+        // Valid articles have 2+ children in their listing
+        if (listing.children().size() < 2) {
+            return "";
+        }
+
+        return listing.children().get(0).text();
}
}
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
/**
* Support <a href="http://www.lemonde.fr/">http://www.lemonde.fr/</a>.
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- for (String topic : new String[] { "international", "politique",
- "societe", "sciences" }) {
- URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByTag("article");
- for (Element article : articles) {
- Elements times = article.getElementsByTag("time");
- Elements titleElements = article.getElementsByTag("h3");
- Elements contentElements = article.getElementsByClass("txt3");
- if (times.size() > 0 && titleElements.size() > 0
- && contentElements.size() > 0) {
- String id = times.get(0).attr("datetime").replace(":", "_")
- .replace("+", "_");
- String title = titleElements.get(0).text();
- String date = date(titleElements.get(0).text());
- String content = contentElements.get(0).text();
- String intUrl = "";
- String extUrl = "";
- String author = "";
- String details = "";
-
- Elements detailsElements = article
- .getElementsByClass("signature");
- if (detailsElements.size() > 0) {
- author = detailsElements.get(0).text();
- }
+    protected List<Entry<URL, String>> getUrls() throws IOException {
+        // One listing page per topic; the topic name doubles as the
+        // category paired with the URL.
+        List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+        for (String topic : new String[] { "International", "Politique",
+                "Société", "Sciences" }) {
+            // The path segment is the lower-cased, unaccented topic name.
+            // Use a fixed locale: with the default one, some locales
+            // (e.g. Turkish) would not map "I" to "i" and break the URL.
+            String path = topic.toLowerCase(java.util.Locale.ROOT) //
+                    .replace("é", "e");
+            URL url = new URL("http://www.lemonde.fr/" + path + "/1.html");
+            urls.add(new AbstractMap.SimpleEntry<URL, String>(url, topic));
+        }
- Elements links = titleElements.get(0).getElementsByTag("a");
- if (links.size() > 0) {
- intUrl = links.get(0).absUrl("href");
- list.add(new Story(getType(), id, title, author, date,
- topic, details, intUrl, extUrl, content));
- }
- }
+        return urls;
+    }
+
+    /**
+     * Select the article elements of a listing page.
+     *
+     * @param doc
+     *            the listing page
+     *
+     * @return every "article" tag of the page
+     */
+    @Override
+    protected List<Element> getArticles(Document doc) {
+        List<Element> articles = doc.getElementsByTag("article");
+        return articles;
+    }
+
+    /**
+     * No id is extracted from the page for this site.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return always an empty String
+     */
+    @Override
+    protected String getArticleId(Document doc, Element article) {
+        // Left empty on purpose: the date will be used instead
+        return "";
+    }
+
+    /**
+     * Get the title of an article.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the text of the first "h3" tag of the article, or an empty
+     *         String when there is none
+     */
+    @Override
+    protected String getArticleTitle(Document doc, Element article) {
+        Element heading = article.getElementsByTag("h3").first();
+        return heading == null ? "" : heading.text();
+    }
+
+    /**
+     * Get the author of an article.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the text of the first element carrying the "signature"
+     *         class, or an empty String when there is none
+     */
+    @Override
+    protected String getArticleAuthor(Document doc, Element article) {
+        Element signature = article.getElementsByClass("signature").first();
+        if (signature == null) {
+            return "";
+        }
+
+        return signature.text();
+    }
+
+    /**
+     * Get the date of an article.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the "datetime" attribute of the first "time" tag of the
+     *         article, or an empty String when there is no such tag
+     */
+    @Override
+    protected String getArticleDate(Document doc, Element article) {
+        Element time = article.getElementsByTag("time").first();
+        return time == null ? "" : time.attr("datetime");
+    }
+
+    /**
+     * Get the category of an article: the one of the listing page it was
+     * found on.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     * @param currentCategory
+     *            the category of the listing page
+     *
+     * @return the given current category, unchanged
+     */
+    @Override
+    protected String getArticleCategory(Document doc, Element article,
+            String currentCategory) {
+        return currentCategory;
+    }
+
+    /**
+     * No extra details are extracted for this site.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return always an empty String
+     */
+    @Override
+    protected String getArticleDetails(Document doc, Element article) {
+        return "";
+    }
+
+ /**
+ * Get the internal URL of an article: the absolute target of the first
+ * link found inside the first "h3" tag of the article.
+ *
+ * @param doc
+ * the listing page
+ * @param article
+ * the article element
+ *
+ * @return the URL, or an empty String if the "h3" tag or its link is
+ * missing
+ */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element titleElement = article.getElementsByTag("h3").first();
+ if (titleElement != null) {
+ Element link = titleElement.getElementsByTag("a").first();
+ if (link != null) {
+ return link.absUrl("href");
}
}
- return list;
+ return "";
}
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
- List<Comment> comments = new ArrayList<Comment>();
+ protected String getArticleExtUrl(Document doc, Element article) {
+ return "";
+ }
- // Note: no comments on this site as far as I can see (or maybe with
- // some javascript, I need to check...)
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ Element contentElement = article.getElementsByClass("txt3").first();
+ if (contentElement != null) {
+ return contentElement.text();
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementById("articleBody");
- if (article != null) {
- for (String line : toLines(article, new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.hasClass("lire")) {
- return true;
- }
- }
+ return "";
+ }
+
+    /**
+     * Locate the element holding the full text of a story.
+     *
+     * @param doc
+     *            the story page
+     *
+     * @return the element whose id is "articleBody", or NULL when there is
+     *         none
+     */
+    @Override
+    protected Element getFullArticle(Document doc) {
+        Element body = doc.getElementById("articleBody");
+        return body;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param doc
+     *            the story page
+     * @param intUrl
+     *            the internal URL of the story
+     *
+     * @return always NULL
+     */
+    @Override
+    protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+        return null;
+    }
- return false;
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ if (element.hasClass("lire")) {
+ return true;
+ }
}
- @Override
- public String isSubtitle(Node node) {
- if (node instanceof Element) {
- Element element = (Element) node;
- if (element.hasClass("intertitre")) {
- return element.text();
- }
+ return false;
+ }
+
+ @Override
+ public String isSubtitle(Node node) {
+ if (node instanceof Element) {
+ Element element = (Element) node;
+ if (element.hasClass("intertitre")) {
+ return element.text();
}
- return null;
}
- })) {
- fullContent += line + "\n";
+ return null;
}
+ };
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
- }
+ // No comment on this site, horrible javascript system
- story.setFullContent(fullContent);
- story.setComments(comments);
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param doc
+     *            the story page
+     * @param container
+     *            the parent comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected List<Element> getCommentCommentPosts(Document doc,
+            Element container) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected String getCommentId(Element post) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected String getCommentAuthor(Element post) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected String getCommentTitle(Element post) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected String getCommentDate(Element post) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return always NULL
+     */
+    @Override
+    protected Element getCommentContentElement(Element post) {
+        return null;
+    }
+
+    /**
+     * Comments are not extracted for this site, so no processor is
+     * supplied.
+     *
+     * @return always NULL
+     */
+    @Override
+    protected ElementProcessor getElementProcessorComment() {
+        return null;
}
}
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-
/**
* Support <a href='https://pipedot.org/'>https://pipedot.org/</a>.
*
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+ "https://pipedot.org/"), ""));
+ return urls;
+ }
- URL url = new URL("https://pipedot.org/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("story");
- for (Element article : articles) {
- Elements titles = article.getElementsByTag("h1");
- if (titles.size() == 0) {
- continue;
- }
+    /**
+     * Select the candidate article elements of the listing page.
+     *
+     * @param doc
+     *            the listing page
+     *
+     * @return every element carrying the "story" class
+     */
+    @Override
+    protected List<Element> getArticles(Document doc) {
+        List<Element> stories = doc.getElementsByClass("story");
+        return stories;
+    }
- Element title = titles.get(0);
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ // Don't try on bad articles
+ if (getArticleTitle(doc, article).isEmpty()) {
+ return "";
+ }
- String id = "";
- for (Element idElem : article.getElementsByTag("a")) {
- if (idElem.attr("href").startsWith("/pipe/")) {
- id = idElem.attr("href").substring("/pipe/".length());
- break;
- }
+ for (Element idElem : article.getElementsByTag("a")) {
+ if (idElem.attr("href").startsWith("/pipe/")) {
+ return idElem.attr("href").substring("/pipe/".length());
}
+ }
- String intUrl = null;
- String extUrl = null;
-
- Elements links = article.getElementsByTag("a");
- if (links.size() > 0) {
- intUrl = links.get(0).absUrl("href");
- }
+ return "";
+ }
- // Take first ext URL as original source
- for (Element link : links) {
- String uuu = link.absUrl("href");
- if (!uuu.isEmpty() && !uuu.contains("pipedot.org/")) {
- extUrl = uuu;
- break;
- }
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element title = article.getElementsByTag("h1").first();
+ if (title != null) {
+ return title.text();
+ }
- String details = "";
- Elements detailsElements = article.getElementsByTag("div");
- if (detailsElements.size() > 0) {
- details = detailsElements.get(0).text().trim();
- }
+ return "";
+ }
- String author = "";
- int pos = details.indexOf("by ");
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+ String value = getArticleDetailsReal(article);
+ int pos = value.indexOf("by ");
+ if (pos >= 0) {
+ value = value.substring(pos + "by ".length()).trim();
+ pos = value.indexOf(" in ");
if (pos >= 0) {
- author = details.substring(pos + "by ".length()).trim();
- pos = author.indexOf(" in ");
- if (pos >= 0) {
- author = author.substring(0, pos).trim();
- }
+ value = value.substring(0, pos).trim();
}
- String categ = "";
- pos = details.indexOf(" in ");
+ return value;
+ }
+
+ return "";
+ }
+
+    /**
+     * Get the date of an article.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the "datetime" attribute of the first "time" tag of the
+     *         article, or an empty String when there is no such tag
+     */
+    @Override
+    protected String getArticleDate(Document doc, Element article) {
+        Element time = article.getElementsByTag("time").first();
+        if (time == null) {
+            return "";
+        }
+
+        return time.attr("datetime");
+    }
+
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ String value = getArticleDetailsReal(article);
+ int pos = value.indexOf(" in ");
+ if (pos >= 0) {
+ value = value.substring(pos + " in ".length()).trim();
+ pos = value.indexOf(" on ");
if (pos >= 0) {
- categ = details.substring(pos + " in ".length()).trim();
- pos = categ.indexOf(" on ");
- if (pos >= 0) {
- categ = categ.substring(0, pos).trim();
- }
+ value = value.substring(0, pos).trim();
}
- String date = "";
- Element dateElement = article.getElementsByTag("time").first();
- if (dateElement != null) {
- date = date(dateElement.attr("datetime"));
- }
+ return value;
+ }
- // We already have all the details (date, author, id, categ)
- details = "";
+ return "";
+ }
- String body = "";
- for (Element elem : article.children()) {
- String tag = elem.tag().toString();
- if (!tag.equals("header") && !tag.equals("footer")) {
- body = elem.text();
- break;
- }
+    /**
+     * No extra details are reported for this site.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return always an empty String
+     */
+    @Override
+    protected String getArticleDetails(Document doc, Element article) {
+        // Nothing left to report: all the info was already extracted
+        return "";
+    }
+
+    /**
+     * Get the internal URL of an article: the absolute target of its first
+     * link.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the URL, or an empty String if the article has no link
+     */
+    @Override
+    protected String getArticleIntUrl(Document doc, Element article) {
+        Element firstLink = article.getElementsByTag("a").first();
+        return firstLink == null ? "" : firstLink.absUrl("href");
+    }
+
+    /**
+     * Get the external (original source) URL of an article: the first
+     * link of the article whose absolute target is not empty and does not
+     * point to "pipedot.org/".
+     * <p>
+     * All the links are inspected, not only the first one, as the first
+     * link is also the one used for the internal URL.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the URL, or an empty String if no external link is found
+     */
+    @Override
+    protected String getArticleExtUrl(Document doc, Element article) {
+        // Take the first external URL as the original source
+        for (Element link : article.getElementsByTag("a")) {
+            String possibleExtLink = link.absUrl("href").trim();
+            if (!possibleExtLink.isEmpty()
+                    && !possibleExtLink.contains("pipedot.org/")) {
+                return possibleExtLink;
}
+        }
- list.add(new Story(getType(), id, title.text(), author, date,
- categ, details, intUrl, extUrl, body));
+        return "";
+    }
+
+ /**
+ * Get the short content of an article: the text of its first direct
+ * child that is neither a "header" nor a "footer" tag.
+ *
+ * @param doc
+ * the listing page
+ * @param article
+ * the article element
+ *
+ * @return the text, or an empty String if there is no such child
+ */
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ for (Element elem : article.children()) {
+ String tag = elem.tagName();
+ // Stop at the first child that is not a header/footer
+ if (!tag.equals("header") && !tag.equals("footer")) {
+ return elem.text();
+ }
}
- return list;
+ return "";
+ }
+
+    /**
+     * No full-article element is extracted for this site.
+     *
+     * @param doc
+     *            the story page
+     *
+     * @return always NULL
+     */
+    @Override
+    protected Element getFullArticle(Document doc) {
+        return null;
+    }
+
+    /**
+     * List the top-level comment elements of a story page: the "comment"
+     * children of the first "main" tag.
+     *
+     * @param doc
+     *            the story page
+     * @param intUrl
+     *            the internal URL of the story (not used here)
+     *
+     * @return the comment elements (empty if there is no "main" tag)
+     */
+    @Override
+    protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+        Element main = doc.getElementsByTag("main").first();
+        return getCommentElements(main);
+    }
+
+    /**
+     * Build the processor used for the full article text.
+     *
+     * @return a plain {@link BasicElementProcessor}, without any override
+     */
+    @Override
+    protected ElementProcessor getElementProcessorFullArticle() {
+        ElementProcessor processor = new BasicElementProcessor();
+        return processor;
}
@Override
- public void fetch(Story story) throws IOException {
- List<Comment> comments = new ArrayList<Comment>();
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements listing = doc.getElementsByTag("main");
- if (listing.size() > 0) {
- comments.addAll(getComments(listing.get(0)));
+ if (container != null) {
+ container = container.getElementsByClass("comment-outline").first();
}
- story.setComments(comments);
+ return getCommentElements(container);
}
- private List<Comment> getComments(Element listing) {
- List<Comment> comments = new ArrayList<Comment>();
- for (Element commentElement : listing.children()) {
- if (commentElement.hasClass("comment")) {
- Comment comment = getComment(commentElement);
- if (!comment.isEmpty()) {
- comments.add(comment);
- }
+    /**
+     * Get the id of a comment.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the "id" of the element, as given by {@link Element#id()}
+     */
+    @Override
+    protected String getCommentId(Element post) {
+        String id = post.id();
+        return id;
+    }
+
+ /**
+ * Extract the author of a comment: the text of the first "h3" tag of
+ * the post, up to the last " on ".
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the trimmed author, or an empty String if the tag or the
+ * " on " separator is missing
+ */
+ @Override
+ protected String getCommentAuthor(Element post) {
+ // NOTE(review): this reads the same "h3" tag as getCommentTitle, while
+ // the replaced code looked up "h4" for the author -- confirm which
+ // tag really holds the "AUTHOR on DATE" line
+ Element authorDateE = post.getElementsByTag("h3").first();
+ if (authorDateE != null) {
+ String authorDate = authorDateE.text();
+ int pos = authorDate.lastIndexOf(" on ");
+ if (pos >= 0) {
+ return authorDate.substring(0, pos).trim();
}
}
- return comments;
- }
- private Comment getComment(Element commentElement) {
- String title = firstOrEmptyTag(commentElement, "h3").text();
- String author = firstOrEmpty(commentElement, "h4").text();
- Element content = firstOrEmpty(commentElement, "comment-body");
+ return "";
+ }
- String date = "";
- int pos = author.lastIndexOf(" on ");
- if (pos >= 0) {
- date = author.substring(pos + " on ".length()).trim();
- author = author.substring(0, pos).trim();
+ @Override
+ protected String getCommentTitle(Element post) {
+ Element title = post.getElementsByTag("h3").first();
+ if (title != null) {
+ return title.text();
}
- Comment comment = new Comment(commentElement.id(), author, title, date,
- toLines(content));
+ return "";
+ }
- Elements commentOutline = commentElement
- .getElementsByClass("comment-outline");
- if (commentOutline.size() > 0) {
- comment.addAll(getComments(commentOutline.get(0)));
+ @Override
+ protected String getCommentDate(Element post) {
+ Element authorDateE = post.getElementsByTag("h3").first();
+ if (authorDateE != null) {
+ String authorDate = authorDateE.text();
+ int pos = authorDate.lastIndexOf(" on ");
+ if (pos >= 0) {
+ return authorDate.substring(pos + " on ".length()).trim();
+ }
}
- return comment;
+ return "";
+ }
+
+    /**
+     * Locate the element holding the text of a comment.
+     *
+     * @param post
+     *            the comment element
+     *
+     * @return the first element carrying the "comment-body" class, or NULL
+     *         when there is none
+     */
+    @Override
+    protected Element getCommentContentElement(Element post) {
+        Element body = post.getElementsByClass("comment-body").first();
+        return body;
+    }
- private List<String> toLines(Element element) {
- return toLines(element, new BasicElementProcessor() {
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
@Override
public boolean detectQuote(Node node) {
if (node instanceof Element) {
return false;
}
- });
+ };
+ }
+
+    // Return the raw details line of an article: the trimmed text of its
+    // first "div" tag, or an empty String if it has none.
+    private String getArticleDetailsReal(Element article) {
+        Element firstDiv = article.getElementsByTag("div").first();
+        if (firstDiv == null) {
+            return "";
+        }
+
+        return firstDiv.text().trim();
+    }
+
+    // Collect the direct children of the given container that carry the
+    // "comment" class; a NULL container yields an empty list.
+    private List<Element> getCommentElements(Element container) {
+        List<Element> found = new ArrayList<Element>();
+        if (container == null) {
+            return found;
+        }
+
+        for (Element child : container.children()) {
+            if (child.hasClass("comment")) {
+                found.add(child);
+            }
+        }
+
+        return found;
}
}
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
-
/**
* Support <a href='https://slashdot.org/'>https://slashdot.org/</a>.
*
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- URL url = new URL("https://slashdot.org/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByTag("header");
- for (Element article : articles) {
- Elements titles = article.getElementsByClass("story-title");
- if (titles.size() == 0) {
- continue;
- }
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ urls.add(new AbstractMap.SimpleEntry<URL, String>(new URL(
+ "https://slashdot.org/"), ""));
+ return urls;
+ }
- Element title = titles.get(0);
+    /**
+     * Select the candidate article elements of the listing page.
+     *
+     * @param doc
+     *            the listing page
+     *
+     * @return every "header" tag of the page
+     */
+    @Override
+    protected List<Element> getArticles(Document doc) {
+        List<Element> headers = doc.getElementsByTag("header");
+        return headers;
+    }
- String id = "" + title.attr("id");
+ /**
+ * Get the id of an article: the "id" attribute of its first
+ * "story-title" element, without the "title-" prefix if present.
+ *
+ * @param doc
+ * the listing page
+ * @param article
+ * the article element
+ *
+ * @return the id, or an empty String if there is no "story-title"
+ * element
+ */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ Element title = article.getElementsByClass("story-title").first();
+ if (title != null) {
+ String id = title.attr("id");
if (id.startsWith("title-")) {
id = id.substring("title-".length());
}
+ return id;
+ }
+
+ return "";
+ }
+
+    /**
+     * Get the title of an article.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the text of the first element carrying the "story-title"
+     *         class, or an empty String when there is none
+     */
+    @Override
+    protected String getArticleTitle(Document doc, Element article) {
+        Element titleE = article.getElementsByClass("story-title").first();
+        return titleE == null ? "" : titleE.text();
+    }
+
+    /**
+     * Extract the author from the details line of an article, which looks
+     * like "Posted by AUTHOR on DATE from the further-crackdown dept.".
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the trimmed text between "Posted by " and the first " on ",
+     *         or an empty String if the line does not have that shape
+     */
+    @Override
+    protected String getArticleAuthor(Document doc, Element article) {
+        final String prefix = "Posted by ";
+
+        String details = getArticleDetailsReal(article);
+        int end = details.indexOf(" on ");
+        if (!details.startsWith(prefix) || end < 0) {
+            return "";
+        }
+
+        return details.substring(prefix.length(), end).trim();
+    }
+
+    /**
+     * Get the date of an article: the text of a "time" tag, without its
+     * leading "on " if present.
+     * <p>
+     * The tag is first searched inside the article itself so that each
+     * story gets its own date; the document-wide lookup (which always
+     * yields the first "time" tag of the page) is only kept as a fallback.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the date, or an empty String if the article has no id or if
+     *         no "time" tag is found
+     */
+    @Override
+    protected String getArticleDate(Document doc, Element article) {
+        // Do not try bad articles
+        if (getArticleId(doc, article).isEmpty()) {
+            return "";
+        }
+
+        Element dateElement = article.getElementsByTag("time").first();
+        if (dateElement == null) {
+            // Fallback: first "time" tag of the whole page
+            dateElement = doc.getElementsByTag("time").first();
+        }
+
+        if (dateElement != null) {
+            String date = dateElement.text().trim();
+            if (date.startsWith("on ")) {
+                date = date.substring("on ".length());
+            }
+
+            return date;
+        }
+
+        return "";
+    }
+
+    /**
+     * Get the category of an article: the text of a "topic" element.
+     * <p>
+     * The element is first searched inside the article itself so that each
+     * story gets its own category; the document-wide lookup (which always
+     * yields the first "topic" element of the page) is only kept as a
+     * fallback.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     * @param currentCategory
+     *            the category of the listing page (not used here)
+     *
+     * @return the category, or an empty String if no "topic" element is
+     *         found
+     */
+    @Override
+    protected String getArticleCategory(Document doc, Element article,
+            String currentCategory) {
+        Element categElement = article.getElementsByClass("topic").first();
+        if (categElement == null) {
+            // Fallback: first "topic" element of the whole page
+            categElement = doc.getElementsByClass("topic").first();
+        }
+
+        if (categElement != null) {
+            return categElement.text();
+        }
+
+        return "";
+    }
+
+    /**
+     * Extract the free-form part of the details line of an article, which
+     * looks like "Posted by AUTHOR on DATE from the further-crackdown
+     * dept.": everything from the first " from the " onwards.
+     *
+     * @param doc
+     *            the listing page
+     * @param article
+     *            the article element
+     *
+     * @return the trimmed tail of the line, or an empty String if
+     *         " from the " is absent
+     */
+    @Override
+    protected String getArticleDetails(Document doc, Element article) {
+        String details = getArticleDetailsReal(article);
+        int start = details.indexOf(" from the ");
+        if (start < 0) {
+            return "";
+        }
+
+        return details.substring(start).trim();
+    }
+
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element title = article.getElementsByClass("story-title").first();
+ if (title != null) {
Elements links = title.getElementsByTag("a");
- String intUrl = "";
- String extUrl = "";
if (links.size() > 0) {
- intUrl = links.get(0).absUrl("href");
+ return links.get(0).absUrl("href");
}
+ }
+ return "";
+ }
+
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+ Element title = article.getElementsByClass("story-title").first();
+ if (title != null) {
+ Elements links = title.getElementsByTag("a");
if (links.size() > 1) {
- extUrl = links.get(1).absUrl("href");
+ return links.get(1).absUrl("href");
}
+ }
+ return "";
+ }
- String details = "";
- Elements detailsElements = article.getElementsByClass("details");
- if (detailsElements.size() > 0) {
- details = detailsElements.get(0).text();
- }
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ Element contentElement = doc //
+ .getElementById("text-" + getArticleId(doc, article));
+ if (contentElement != null) {
+ return contentElement.text();
+ }
- // details:
- // "Posted by AUTHOR on DATE from the further-crackdown dept."
- String author = "";
- int pos = details.indexOf(" on ");
- if (details.startsWith("Posted by ") && pos >= 0) {
- author = details.substring("Posted by ".length(), pos).trim();
- }
- pos = details.indexOf(" from the ");
- if (pos >= 0) {
- details = details.substring(pos).trim();
- }
+ return "";
+ }
- String body = "";
- Element bodyElement = doc.getElementById("text-" + id);
- if (bodyElement != null) {
- body = bodyElement.text();
- }
+    /**
+     * No full-article element is extracted for this site.
+     *
+     * @param doc
+     *            the story page
+     *
+     * @return always NULL
+     */
+    @Override
+    protected Element getFullArticle(Document doc) {
+        return null;
+    }
- String categ = "";
- Element categElement = doc.getElementsByClass("topic").first();
- if (categElement != null) {
- categ = StringUtils.unhtml(categElement.text()).trim();
+ /**
+ * List the top-level comment elements of a story page: the direct
+ * children of the "commentlisting" element that carry the "comment"
+ * class.
+ *
+ * @param doc
+ * the story page
+ * @param intUrl
+ * the internal URL of the story (not used here)
+ *
+ * @return the comment elements (empty if the listing is missing)
+ */
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+ List<Element> commentElements = new ArrayList<Element>();
+ Element listing = doc.getElementById("commentlisting");
+ if (listing != null) {
+ for (Element commentElement : listing.children()) {
+ if (commentElement.hasClass("comment")) {
+ commentElements.add(commentElement);
+ }
}
+ }
+
+ return commentElements;
+ }
+
+    /**
+     * No processor is supplied for the full article text on this site.
+     *
+     * @return always NULL
+     */
+    @Override
+    protected ElementProcessor getElementProcessorFullArticle() {
+        return null;
+    }
- String date = "";
- Element dateElement = doc.getElementsByTag("time").first();
- if (dateElement != null) {
- date = StringUtils.unhtml(dateElement.text()).trim();
- if (date.startsWith("on ")) {
- date = date.substring("on ".length());
+    /**
+     * List the replies to a comment: for each direct child of the
+     * container whose id contains "commtree_", the children of that child
+     * carrying the "comment" class.
+     *
+     * @param doc
+     *            the story page (not used here)
+     * @param container
+     *            the parent comment element, can be NULL
+     *
+     * @return the reply elements (empty if none or if the container is
+     *         NULL)
+     */
+    @Override
+    protected List<Element> getCommentCommentPosts(Document doc,
+            Element container) {
+        List<Element> commentElements = new ArrayList<Element>();
+        // A missing container has no replies
+        if (container == null) {
+            return commentElements;
+        }
+
+        for (Element child : container.children()) {
+            if (child.id().contains("commtree_")) {
+                for (Element sub : child.children()) {
+                    if (sub.hasClass("comment")) {
+                        commentElements.add(sub);
+                    }
}
}
+        }
+
+        return commentElements;
+    }
- list.add(new Story(getType(), id, title.text(), author, date,
- categ, details, intUrl, extUrl, body));
+ /**
+ * Get the id of a comment.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the "id" of the element, or an empty String if the element
+ * carries the "hidden" class
+ */
+ @Override
+ protected String getCommentId(Element post) {
+ // Posts marked "hidden" report no id
+ if (post.hasClass("hidden")) {
+ return "";
}
- return list;
+ return post.id();
}
@Override
- public void fetch(Story story) throws IOException {
- List<Comment> comments = new ArrayList<Comment>();
+ protected String getCommentAuthor(Element post) {
+ if (post.hasClass("hidden")) {
+ return "";
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element listing = doc.getElementById("commentlisting");
- if (listing != null) {
- comments.addAll(getComments(listing));
+ Element author = post.getElementsByClass("by").first();
+ if (author != null) {
+ return author.text();
}
- story.setComments(comments);
+ return "";
}
- private List<Comment> getComments(Element listing) {
- List<Comment> comments = new ArrayList<Comment>();
- Comment lastComment = null;
- for (Element commentElement : listing.children()) {
- if (commentElement.hasClass("comment")) {
- if (!commentElement.hasClass("hidden")) {
- lastComment = getComment(commentElement);
- comments.add(lastComment);
- }
+ @Override
+ protected String getCommentTitle(Element post) {
+ if (post.hasClass("hidden")) {
+ return "";
+ }
- List<Comment> subComments = new ArrayList<Comment>();
- for (Element child : commentElement.children()) {
- if (child.id().contains("commtree_")) {
- subComments.addAll(getComments(child));
- }
- }
+ Element title = post.getElementsByClass("title").first();
+ if (title != null) {
+ return title.text();
+ }
- if (lastComment == null) {
- comments.addAll(subComments);
- } else {
- lastComment.addAll(subComments);
- }
- }
+ return "";
+ }
+
+ @Override
+ protected String getCommentDate(Element post) {
+ if (post.hasClass("hidden")) {
+ return "";
}
- return comments;
+ Element date = post.getElementsByClass("otherdetails").first();
+ if (date != null) {
+ return date.text();
+ }
+
+ return "";
}
- /**
- * Get a comment from the given element.
- *
- * @param commentElement
- * the element to get the comment of.
- *
- * @return the comment, <b>NOT</b> including sub-comments
- */
- private Comment getComment(Element commentElement) {
- String title = firstOrEmpty(commentElement, "title").text();
- String author = firstOrEmpty(commentElement, "by").text();
- String date = firstOrEmpty(commentElement, "otherdetails").text();
- Element content = firstOrEmpty(commentElement, "commentBody");
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ if (post.hasClass("hidden")) {
+ return null;
+ }
- return new Comment(commentElement.id(), author, title, date,
- toLines(content));
+ return post.getElementsByClass("commentBody").first();
}
- private List<String> toLines(Element element) {
- return toLines(element, new BasicElementProcessor() {
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
@Override
public String processText(String text) {
while (text.startsWith(">")) { // comment in one-liners
return false;
}
- });
+ };
+ }
+
+ /**
+ * Read the text of the first element of class <tt>details</tt> found in
+ * the given article element.
+ *
+ * @param article
+ *            the element to search in
+ *
+ * @return the text of that element, or an empty string if the article has
+ *         no such element
+ */
+ private String getArticleDetailsReal(Element article) {
+ Element details = article.getElementsByClass("details").first();
+ return details == null ? "" : details.text();
}
}
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
/**
* Support <a
* @author niki
*/
public class TheRegister extends BasicSupport {
+ private Map<String, String> commentReplies = new HashMap<String, String>();
+
@Override
public String getDescription() {
return "The Register: Biting the hand that feeds IT";
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
+ public void fetch(Story story) throws IOException {
+ // commentReplies (comment id -> id of the comment it answers) is filled
+ // by getFullArticleCommentPosts(...), presumably while super.fetch(...)
+ // runs -- TODO confirm against BasicSupport.
+ super.fetch(story);
- URL url = new URL("https://www.theregister.co.uk/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("story_link");
- for (Element article : articles) {
- if (article.getElementsByClass("time_stamp").isEmpty()) {
- // Some articles are doubled,
- // but the second copy without the time info
- continue;
+ // Update comment replies: a comment whose parent is known and can be
+ // found in the story is attached to that parent, every other comment
+ // stays at the top level
+ List<Comment> comments = new ArrayList<Comment>();
+ for (Comment comment : story.getComments()) {
+ Comment inReplyTo = null;
+ // Only non-null parent ids are ever stored, so a single lookup is
+ // enough to know whether this comment is a reply
+ String inReplyToId = commentReplies.get(comment.getId());
+ if (inReplyToId != null) {
+ inReplyTo = story.getCommentById(inReplyToId);
+ }
+
+ if (inReplyTo != null) {
+ inReplyTo.add(comment);
+ } else {
+ comments.add(comment);
}
+ }
+ story.setComments(comments);
+
+ // The links have been consumed: forget them so the map does not keep
+ // growing (and holding stale entries) from one story to the next
+ commentReplies.clear();
+ }
- String id = "";
- String intUrl = article.absUrl("href");
- String extUrl = ""; // nope
- String title = "";
- String date = "";
- String details = "";
- String body = "";
- String categ = "";
- String author = ""; // nope
-
- Element categElement = article.previousElementSibling();
- if (categElement != null) {
- categ = categElement.text().trim();
- }
+ /**
+ * The pages to list the stories from: a single entry, the front page of
+ * the site, associated with an empty string.
+ *
+ * @return a new list containing that single entry
+ *
+ * @throws IOException
+ *             if the URL cannot be created
+ */
+ @Override
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ URL frontPage = new URL("https://www.theregister.co.uk/");
+ Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+ frontPage, "");
+
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ urls.add(entry);
+ return urls;
+ }
- Element titleElement = article.getElementsByTag("h4").first();
- if (titleElement != null) {
- title = StringUtils.unhtml(titleElement.text()).trim();
- }
+ /**
+ * The article elements of the listing page.
+ *
+ * @param doc
+ *            the document to look into
+ *
+ * @return the elements of class <tt>story_link</tt> found in the document
+ */
+ @Override
+ protected List<Element> getArticles(Document doc) {
+ return doc.getElementsByClass("story_link");
+ }
- Element dateElement = article.getElementsByClass("time_stamp")
- .first();
- if (dateElement != null) {
- String epochS = dateElement.attr("data-epoch");
- if (epochS != null && !epochS.isEmpty()) {
- id = epochS;
- date = date(epochS);
- }
- }
+ /**
+ * No id is read from the article element.
+ *
+ * @return always an empty string (presumably the caller then derives an id
+ *         by itself -- confirm against BasicSupport)
+ */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ return "";
+ }
- if (id.isEmpty()) {
- // fallback
- id = article.attr("href").replace("/", "_");
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element titleElement = article.getElementsByTag("h4").first();
+ if (titleElement != null) {
+ return titleElement.text();
+ }
- Element detailsElement = article.getElementsByClass("standfirst")
- .first();
- details = "(" + date + ") ";
- if (detailsElement != null) {
- details += StringUtils.unhtml(detailsElement.text()).trim();
- }
+ return "";
+ }
+
+ /**
+ * No author is read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+ return "";
+ }
- // We have some "details" but no content, so we switch them:
- body = details;
- details = "";
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+ Element dateElement = article.getElementsByClass("time_stamp").first();
+ if (dateElement != null) {
+ return dateElement.attr("data-epoch");
}
- return list;
+ return "";
}
@Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
- List<Comment> comments = new ArrayList<Comment>();
- story.setComments(comments);
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ Element categElement = article.previousElementSibling();
+ if (categElement != null) {
+ return categElement.text();
+ }
+
+ return "";
+ }
+
+ /**
+ * No details are reported for an article: the text that could serve as
+ * details is returned by {@link #getArticleContent(Document, Element)}
+ * instead.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleDetails(Document doc, Element article) {
+ // We have some "details" but no content, so we switch them:
+ return "";
+ }
+
+ /**
+ * The internal URL of the article.
+ *
+ * @param doc
+ *            not used by this implementation
+ * @param article
+ *            the article element, whose <tt>href</tt> attribute is read
+ *
+ * @return the absolute form of the <tt>href</tt> attribute of the article
+ *         element
+ */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ return article.absUrl("href");
+ }
+
+ /**
+ * No external URL is read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+ return "";
+ }
+
+ /**
+ * The content of the article as shown on the listing page: the text of
+ * the first element of class <tt>standfirst</tt>.
+ *
+ * @param doc
+ *            not used by this implementation
+ * @param article
+ *            the article element to search in
+ *
+ * @return that text, or an empty string if there is no such element
+ */
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ // The site only offers "details" and no content: they are used as the
+ // content (and the details are left empty)
+ Element standfirst = article.getElementsByClass("standfirst").first();
+ return standfirst == null ? "" : standfirst.text();
+ }
+
+ /**
+ * The element holding the full article.
+ *
+ * @param doc
+ *            the document of the article page
+ *
+ * @return the result of looking up the id <tt>body</tt> in the document
+ */
+ @Override
+ protected Element getFullArticle(Document doc) {
+ return doc.getElementById("body");
+ }
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
+ /**
+ * Download the forum page of the article (the article path appended to
+ * <tt>https://forums.theregister.co.uk/forum/1</tt>) and collect every
+ * element of class <tt>post</tt> found under the element whose id is
+ * <tt>forum_posts</tt>.
+ * <p>
+ * As a side effect, each post containing an <tt>in-reply-to</tt> element
+ * gets an entry in <tt>commentReplies</tt>: its own comment id mapped to
+ * the last path segment of the link it replies to.
+ *
+ * @param doc
+ *            the article document; the variable is overwritten with the
+ *            forum document once it is loaded
+ * @param intUrl
+ *            the internal URL of the article, only its path is used
+ *
+ * @return the posts found (empty if there are none or if an
+ *         {@link IOException} occurred), never <tt>null</tt>
+ */
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+ List<Element> commentElements = new ArrayList<Element>();
+
+ // Get comments URL then parse it
try {
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementById("body");
- if (article != null) {
- for (String line : toLines(article,
- new BasicElementProcessor() {
- // TODO: ignore headlines/pub
- })) {
- fullContent += line + "\n";
+ URL url = new URL("https://forums.theregister.co.uk/forum/1"
+ + intUrl.getPath());
+ InputStream in = downloader.open(url);
+ try {
+ doc = DataUtil.load(in, "UTF-8", url.toString());
+ Element posts = doc.getElementById("forum_posts");
+ if (posts != null) {
+ for (Element post : posts.getElementsByClass("post")) {
+ commentElements.add(post);
+ Element inReplyTo = post.getElementsByClass(
+ "in-reply-to").first();
+ if (inReplyTo != null) {
+ String parentId = inReplyTo.absUrl("href");
+ if (parentId != null && parentId.contains("/")) {
+ // Keep only what follows the last '/' of the link
+ int i = parentId.lastIndexOf('/');
+ parentId = parentId.substring(i + 1);
+
+ commentReplies
+ .put(getCommentId(post), parentId);
+ }
+ }
+ }
}
+ } finally {
+ in.close();
+ }
+ } catch (IOException e) {
+ // NOTE(review): the exception is silently dropped, so a failed
+ // download cannot be told apart from an article without comments;
+ // consider reporting it
+ }
+
+ return commentElements;
+ }
+
+ /**
+ * The processor used for the full article: a plain
+ * {@link BasicElementProcessor} without any override.
+ *
+ * @return a new processor on each call
+ */
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor();
+ }
+
+ /**
+ * No nested posts are looked up inside a comment (presumably because the
+ * replies are linked through <tt>commentReplies</tt> instead -- confirm).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
+ return null;
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
+ @Override
+ protected String getCommentId(Element post) {
+ Element idE = post.getElementsByTag("a").first();
+ if (idE != null) {
+ String id = idE.attr("id");
+ if (id.startsWith("c_")) {
+ id = id.substring(2);
}
- story.setFullContent(fullContent);
-
- // Get comments URL then parse it
- in.close();
- in = null;
- in = downloader
- .open(new URL("https://forums.theregister.co.uk/forum/1"
- + url.getPath()));
- doc = DataUtil.load(in, "UTF-8", url.toString());
- Element posts = doc.getElementById("forum_posts");
- if (posts != null) {
- for (Element post : posts.getElementsByClass("post")) {
- String id = "";
- String author = "";
- String title = "";
- String date = "";
- List<String> content = new ArrayList<String>();
-
- Element idE = post.getElementsByTag("a").first();
- if (idE != null) {
- id = idE.attr("id");
- if (id.startsWith("c_")) {
- id = id.substring(2);
- }
+ return id;
+ }
- Element dateE = idE.getElementsByTag("span").first();
- if (dateE != null) {
- date = date(dateE.attr("data-epoch"));
- }
- }
+ return "";
+ }
- Element authorE = post.getElementsByClass("author").first();
- if (authorE != null) {
- author = StringUtils.unhtml(authorE.text()).trim();
- }
+ @Override
+ protected String getCommentAuthor(Element post) {
+ Element author = post.getElementsByClass("author").first();
+ if (author != null) {
+ return author.text();
+ }
- Element titleE = post.getElementsByTag("h4").first();
- if (titleE != null) {
- title = StringUtils.unhtml(titleE.text()).trim();
- }
+ return "";
+ }
- Element contentE = post.getElementsByClass("body").first();
- if (contentE != null) {
- for (String line : toLines(contentE,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- // TODO: ignore headlines/pub
-
- // Remove the comment title (which has
- // already been processed earlier)
- if (node instanceof Element) {
- Element el = (Element) node;
- if ("h4".equals(el.tagName())) {
- return true;
- }
- }
-
- return false;
- }
- })) {
- content.add(line);
- }
- }
+ @Override
+ protected String getCommentTitle(Element post) {
+ Element title = post.getElementsByTag("h4").first();
+ if (title != null) {
+ return title.text();
+ }
- Comment comment = new Comment(id, author, title, date,
- content);
- Comment parent = null;
-
- Element inReplyTo = post.getElementsByClass("in-reply-to")
- .first();
- if (inReplyTo != null) {
- String parentId = inReplyTo.absUrl("href");
- if (parentId != null && parentId.contains("/")) {
- int i = parentId.lastIndexOf('/');
- parentId = parentId.substring(i + 1);
- parent = story.getCommentById(parentId);
- }
- }
+ return "";
+ }
+
+ /**
+ * The date of a comment: the <tt>data-epoch</tt> attribute of the first
+ * <tt>span</tt> found in the first <tt>a</tt> element of the post.
+ *
+ * @param post
+ *            the post element to search in
+ *
+ * @return the attribute value, or an empty string if the post has no
+ *         <tt>a</tt> element or that element has no <tt>span</tt>
+ */
+ @Override
+ protected String getCommentDate(Element post) {
+ Element anchor = post.getElementsByTag("a").first();
+ if (anchor == null) {
+ return "";
+ }
+
+ Element stamp = anchor.getElementsByTag("span").first();
+ return stamp == null ? "" : stamp.attr("data-epoch");
+ }
+
+ /**
+ * The element holding the content of a comment.
+ *
+ * @param post
+ *            the post element to search in
+ *
+ * @return the first element of class <tt>body</tt> found in the post
+ */
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ return post.getElementsByClass("body").first();
+ }
- if (parent == null) {
- comments.add(comment);
- } else {
- parent.add(comment);
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ // Remove the comment title (which has
+ // already been processed earlier)
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("h4".equals(el.tagName())) {
+ return true;
}
}
+
+ return false;
}
- } finally {
- if (in != null) {
- in.close();
- }
- }
+ };
}
}
package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
import java.net.URL;
+import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map.Entry;
-import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
-import org.jsoup.select.Elements;
-
-import be.nikiroo.gofetch.data.Comment;
-import be.nikiroo.gofetch.data.Story;
-import be.nikiroo.utils.StringUtils;
/**
* Support <a href="https://www.toolinux.com/">https://www.toolinux.com/</a>.
}
@Override
- public List<Story> list() throws IOException {
- List<Story> list = new ArrayList<Story>();
-
- URL url = new URL("https://www.toolinux.com/");
- InputStream in = downloader.open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements articles = doc.getElementsByClass("hentry");
- for (Element article : articles) {
- String id = "";
- String intUrl = "";
- String extUrl = ""; // nope
- String title = "";
- String date = "";
- String details = "";
- String body = "";
- String author = ""; // nope
- String categ = ""; // nope
-
- Element urlElement = article.getElementsByTag("a").first();
- if (urlElement != null) {
- intUrl = urlElement.absUrl("href");
- }
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+ // A single page is listed: the front page of the site, associated with
+ // an empty string
+ URL frontPage = new URL("https://www.toolinux.com/");
+ Entry<URL, String> entry = new AbstractMap.SimpleEntry<URL, String>(
+ frontPage, "");
+
+ List<Entry<URL, String>> urls = new ArrayList<Entry<URL, String>>();
+ urls.add(entry);
+ return urls;
+ }
- Element titleElement = article.getElementsByClass("entry-title")
- .first();
- if (titleElement != null) {
- title = StringUtils.unhtml(titleElement.text()).trim();
- }
+ /**
+ * The article elements of the listing page.
+ *
+ * @param doc
+ *            the document to look into
+ *
+ * @return the elements of class <tt>hentry</tt> found in the document
+ */
+ @Override
+ protected List<Element> getArticles(Document doc) {
+ return doc.getElementsByClass("hentry");
+ }
- Element dateElement = article.getElementsByClass("published")
- .first();
- if (dateElement != null) {
- date = StringUtils.unhtml(dateElement.text()).trim();
- id = dateElement.attr("title").trim();
- }
+ /**
+ * No id is read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+ return ""; // We use the date
+ }
- if (id.isEmpty()) {
- // fallback
- id = intUrl.replace("/", "_");
- }
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+ Element titleElement = article.getElementsByClass("entry-title")
+ .first();
+ if (titleElement != null) {
+ return titleElement.text();
+ }
- Element bodyElement = article.getElementsByClass("introduction")
- .first();
- if (bodyElement != null) {
- body = StringUtils.unhtml(bodyElement.text()).trim();
- }
+ return "";
+ }
- list.add(new Story(getType(), id, title, author, date, categ,
- details, intUrl, extUrl, body));
+ /**
+ * No author is read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+ return "";
+ }
+
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+ Element dateElement = article.getElementsByClass("published").first();
+ if (dateElement != null) {
+ return dateElement.text();
}
- return list;
- }
-
- @Override
- public void fetch(Story story) throws IOException {
- String fullContent = story.getContent();
- List<Comment> comments = new ArrayList<Comment>();
- story.setComments(comments);
-
- URL url = new URL(story.getUrlInternal());
- InputStream in = downloader.open(url);
- try {
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Element article = doc.getElementById("content");
- if (article != null) {
- for (String line : toLines(article,
- new BasicElementProcessor() {
- @Override
- public boolean ignoreNode(Node node) {
- if ("notes".equals(node.attr("class"))) {
- return true;
- }
- return false;
- }
- })) {
- fullContent += line + "\n";
- }
+ return "";
+ }
- // Content is too tight with a single break per line:
- fullContent = fullContent.replace("\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .replace("\n\n\n\n", "\n\n") //
- .trim();
- }
+ /**
+ * No category is read from the article element.
+ *
+ * @param currentCategory
+ *            not used by this implementation
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleCategory(Document doc, Element article,
+ String currentCategory) {
+ return "";
+ }
- story.setFullContent(fullContent);
- } finally {
- if (in != null) {
- in.close();
- }
+ /**
+ * No details are read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleDetails(Document doc, Element article) {
+ return "";
+ }
+
+ /**
+ * The internal URL of the article: the absolute form of the <tt>href</tt>
+ * attribute of the first <tt>a</tt> element found in the article element.
+ *
+ * @param doc
+ *            not used by this implementation
+ * @param article
+ *            the article element to search in
+ *
+ * @return that URL, or an empty string if the article contains no
+ *         <tt>a</tt> element
+ */
+ @Override
+ protected String getArticleIntUrl(Document doc, Element article) {
+ Element urlElement = article.getElementsByTag("a").first();
+ if (urlElement != null) {
+ return urlElement.absUrl("href");
}
+
+ return "";
+ }
+
+ /**
+ * No external URL is read from the article element.
+ *
+ * @return always an empty string
+ */
+ @Override
+ protected String getArticleExtUrl(Document doc, Element article) {
+ return "";
+ }
+
+ /**
+ * The content of the article as shown on the listing page: the text of
+ * the first element of class <tt>introduction</tt>.
+ *
+ * @param doc
+ *            not used by this implementation
+ * @param article
+ *            the article element to search in
+ *
+ * @return that text, or an empty string if there is no such element
+ */
+ @Override
+ protected String getArticleContent(Document doc, Element article) {
+ Element introduction = article.getElementsByClass("introduction")
+ .first();
+ return introduction == null ? "" : introduction.text();
+ }
+
+ /**
+ * The element holding the full article.
+ *
+ * @param doc
+ *            the document of the article page
+ *
+ * @return the result of looking up the id <tt>content</tt> in the document
+ */
+ @Override
+ protected Element getFullArticle(Document doc) {
+ return doc.getElementById("content");
+ }
+
+ /**
+ * No comment posts are extracted by this support.
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected List<Element> getFullArticleCommentPosts(Document doc, URL intUrl) {
+ return null;
+ }
+
+ /**
+ * The processor used for the full article: a
+ * {@link BasicElementProcessor} that ignores the nodes whose
+ * <tt>class</tt> attribute is exactly <tt>notes</tt>.
+ *
+ * @return a new processor on each call
+ */
+ @Override
+ protected ElementProcessor getElementProcessorFullArticle() {
+ return new BasicElementProcessor() {
+ @Override
+ public boolean ignoreNode(Node node) {
+ // Skip the "notes" nodes, keep everything else
+ return "notes".equals(node.attr("class"));
+ }
+ };
+ }
+
+ /**
+ * No nested comment posts are extracted by this support.
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected List<Element> getCommentCommentPosts(Document doc,
+ Element container) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected String getCommentId(Element post) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected String getCommentAuthor(Element post) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected String getCommentTitle(Element post) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected String getCommentDate(Element post) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>).
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected Element getCommentContentElement(Element post) {
+ return null;
+ }
+
+ /**
+ * Comments are not extracted by this support (see
+ * {@link #getFullArticleCommentPosts(Document, URL)}, which returns
+ * <tt>null</tt>), so no comment processor is supplied.
+ *
+ * @return always <tt>null</tt>
+ */
+ @Override
+ protected ElementProcessor getElementProcessorComment() {
+ return null;
}
}
--- /dev/null
+package be.nikiroo.gofetch.support;
+
+/**
+ * The support type (each website we support has a single type).
+ * <p>
+ * The description of each constant starts with a two-letter prefix,
+ * presumably the language of the website (EN, FR), followed by the kind of
+ * news it covers.
+ *
+ * @author niki
+ */
+public enum Type {
+ /** EN: Any, but mostly IT/Sci */
+ SLASHDOT,
+ /** EN: Clone of Slashdot, mostly abandoned */
+ PIPEDOT,
+ /** EN: Linux */
+ LWN,
+ /** FR: Any */
+ LEMONDE,
+ /** EN: IT */
+ REGISTER,
+ /** FR: Linux */
+ TOO_LINUX,
+ /** FR: IT */
+ ERE_NUMERIQUE,
+}