package be.nikiroo.gofetch.support;
import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
-import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
+import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
+import be.nikiroo.utils.StringUtils;
+/**
+ * Base class for website support.
+ *
+ * @author niki
+ */
public abstract class BasicSupport {
- protected static Downloader downloader = new Downloader("gofetcher");
+ /**
+ * The downloader to use for all web sites via
+ * {@link BasicSupport#open(URL)}
+ */
+ static private Downloader downloader = new Downloader("gofetcher");
- public enum Type {
- SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER,
- }
+ static private String preselector;
/**
- * Used to process an element into lines.
- *
- * @author niki
- */
- public interface ElementProcessor {
- /**
- * Detect if this node is a quote and should be trated as such.
- *
- * @param node
- * the node to check
- * @return TRUE if it is
- */
- public boolean detectQuote(Node node);
-
- /**
- * Process text content (will be called on each text element, allowing
- * you to modify it if needed).
- *
- * @param text
- * the text to process
- * @return
- */
- public String processText(String text);
-
- /**
- * Ignore this node.
- *
- * @param node
- * the node to ignore
- * @return TRUE if it has to be ignored
- */
- public boolean ignoreNode(Node node);
-
- /**
- * Manually process this node (and return the manual processing value)
- * if so desired.
- * <p>
- * If the node is manually processed, it and its children will not be
- * automatically processed.
- *
- * @param node
- * the node to optionally process
- *
- * @return NULL if not processed (will thus be automatically processed
- * as usual), a {@link String} (may be empty) if we process it
- * manually -- the given {@link String} will be used instead of
- * the usual automatic processing if not NULL
- */
- public String manualProcessing(Node node);
- }
+ * The optional cookies to use to get the site data.
+ */
+ private Map<String, String> cookies = new HashMap<String, String>();
+
+ private Type type;
/**
- * A default {@link ElementProcessor} (will not detect or process anything
- * manually).
+ * Login on the web site (this method does nothing by default, but can be
+ * overridden if needed).
+ *
+ * @throws IOException
+ * in case of I/O error
*
- * @author niki
*/
- public class BasicElementProcessor implements ElementProcessor {
- @Override
- public boolean detectQuote(Node node) {
- return false;
- }
-
- @Override
- public String processText(String text) {
- return text;
- }
+ public void login() throws IOException {
+ }
- @Override
- public boolean ignoreNode(Node node) {
- return false;
- }
+ /**
+ * The website textual description, to add in the dispatcher page.
+ * <p>
+ * Should be short.
+ *
+ * @return the description
+ */
+ abstract public String getDescription();
- @Override
- public String manualProcessing(Node node) {
- return null;
- }
+ /**
+ * The gopher "selector" to use for output.
+ * <p>
+ * A kind of "URL path", like "/news/" or "/misc/news/" or...
+ *
+ * @return the selector
+ */
+ public String getSelector() {
+ return getSelector(getType());
}
- static private String preselector;
-
- private Type type;
+ /**
+ * The support type.
+ *
+ * @return the type
+ */
+ public Type getType() {
+ return type;
+ }
/**
* List all the recent items, but only assure the ID and internal URL to
* @throws IOException
* in case of I/O
*/
- abstract public List<Story> list() throws IOException;
+ public List<Story> list() throws IOException {
+     List<Story> list = new ArrayList<Story>();
+
+     login();
+     for (Entry<URL, String> entry : getUrls()) {
+         URL url = entry.getKey();
+         String defaultCateg = entry.getValue();
+         if (defaultCateg == null) {
+             defaultCateg = "";
+         }
+
+         // Parse the listing page, then release the stream right away: the
+         // caller of open(URL) is responsible for closing it (fetch(Story)
+         // does the same), otherwise one connection is leaked per URL.
+         Document doc;
+         InputStream in = open(url);
+         try {
+             doc = DataUtil.load(in, "UTF-8", url.toString());
+         } finally {
+             if (in != null) {
+                 in.close();
+             }
+         }
+
+         List<Element> articles = getArticles(doc);
+         for (Element article : articles) {
+             String id = getArticleId(doc, article).trim();
+             String title = getArticleTitle(doc, article).trim();
+             String author = getArticleAuthor(doc, article).trim();
+             String date = getArticleDate(doc, article).trim();
+             String categ = getArticleCategory(doc, article, defaultCateg)
+                     .trim();
+             String details = getArticleDetails(doc, article).trim();
+             String intUrl = getArticleIntUrl(doc, article).trim();
+             String extUrl = getArticleExtUrl(doc, article).trim();
+             String content = getArticleContent(doc, article).trim();
+
+             // Without an ID nor a date, the story cannot be identified
+             if (id.isEmpty() && date.isEmpty()) {
+                 continue;
+             }
+
+             if (!id.isEmpty()) {
+                 // Left-pad the ID with zeroes up to 10 characters
+                 while (id.length() < 10) {
+                     id = "0" + id;
+                 }
+             } else {
+                 // No ID: derive one from the raw (not yet reformatted) date
+                 id = date.replace(":", "_").replace("+", "_").replace("/", "-");
+             }
+
+             // Only reformat the date once it has been used as fallback ID
+             date = date(date);
+
+             list.add(new Story(getType(), id, title, author, date, categ,
+                     details, intUrl, extUrl, content));
+         }
+     }
+
+     return list;
+ }
+
+ /**
+ * The {@link URL}s to process for this website.
+ *
+ * @return the list of {@link URL}s
+ *
+ * @throws IOException
+ * in case of I/O error
+ */
+ abstract protected List<Entry<URL, String>> getUrls() throws IOException;
+
+ /**
+ * The article {@link Element}s of this document.
+ *
+ * @param doc
+ * the main document for the current category
+ *
+ * @return the articles
+ */
+ abstract protected List<Element> getArticles(Document doc);
+
+ /**
+ * The ID of the article (defaults to the date element if empty).
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the ID
+ */
+ abstract protected String getArticleId(Document doc, Element article);
+
+ /**
+ * The article title to display.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the title
+ */
+ abstract protected String getArticleTitle(Document doc, Element article);
+
+ /**
+ * The optional article author.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the author
+ */
+ abstract protected String getArticleAuthor(Document doc, Element article);
+
+ /**
+ * The optional article date.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the date
+ */
+ abstract protected String getArticleDate(Document doc, Element article);
+
+ /**
+ * The optional article category.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ * @param currentCategory
+ * the currently listed category if any (can be NULL)
+ *
+ * @return the category
+ */
+ abstract protected String getArticleCategory(Document doc, Element article,
+ String currentCategory);
+
+ /**
+ * The optional details of the article (can replace the date, author and
+ * category, for instance).
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the details
+ */
+ abstract protected String getArticleDetails(Document doc, Element article);
+
+ /**
+ * The (required) {@link URL} that points to the news page on the supported
+ * website.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the internal {@link URL}
+ */
+ abstract protected String getArticleIntUrl(Document doc, Element article);
+
+ /**
+ * The optional {@link URL} that points to an external website for more
+ * information.
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the external {@link URL}
+ */
+ abstract protected String getArticleExtUrl(Document doc, Element article);
+
+ /**
+ * The optional article short-content (not the full content, that will be
+ * fetched by {@link BasicSupport#fetch(Story)}).
+ *
+ * @param doc
+ * the main document for the current category
+ * @param article
+ * the article to look into
+ *
+ * @return the short content
+ */
+ abstract protected String getArticleContent(Document doc, Element article);
/**
* Fetch the full article content as well as all the comments associated to
* @throws IOException
* in case of I/O error
*/
- abstract public void fetch(Story story) throws IOException;
+ public void fetch(Story story) throws IOException {
+ String fullContent = "";
+
+ URL url = new URL(story.getUrlInternal());
+ InputStream in = open(url);
+ try {
+ Document doc = DataUtil.load(in, "UTF-8", url.toString());
+ Element article = getFullArticle(doc);
+ if (article != null) {
+ fullContent = getArticleText(article);
+ }
- abstract public String getDescription();
+ if (fullContent.isEmpty()) {
+ fullContent = story.getContent();
+ }
- public String getSelector() {
- return getSelector(type);
+ story.setFullContent(fullContent);
+ story.setComments(getComments(doc,
+ getFullArticleCommentPosts(doc, url)));
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
}
- public Type getType() {
- return type;
+ /**
+ * Return the text from this {@link Element}, using the
+ * {@link BasicSupport#getElementProcessorFullArticle()} processor logic.
+ *
+ * @param article
+ * the element to extract the text from
+ *
+ * @return the text
+ */
+ protected String getArticleText(Element article) {
+     StringBuilder text = new StringBuilder();
+     ElementProcessor processor = getElementProcessorFullArticle();
+     if (processor == null) {
+         // No processor: fall back to the raw text of the element
+         text.append(article.text());
+     } else {
+         // One processed line per output line
+         for (String line : toLines(article, processor)) {
+             text.append(line).append("\n");
+         }
+     }
+
+     // A single break per line reads too tight: double every break, then
+     // collapse (in two passes) the runs of four breaks back to two.
+     String spaced = text.toString().replace("\n", "\n\n");
+     spaced = spaced.replace("\n\n\n\n", "\n\n");
+     spaced = spaced.replace("\n\n\n\n", "\n\n");
+
+     return spaced.trim();
 }
+ /**
+ * Return the full article if available (this is the article to retrieve
+ * from the newly downloaded page at {@link Story#getUrlInternal()}).
+ *
+ * @param doc
+ * the (full article) document to work on
+ *
+ * @return the article or NULL
+ */
+ abstract protected Element getFullArticle(Document doc);
+
+ /**
+ * Return the list of comment {@link Element}s from this optional container
+ * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+ *
+ * @param doc
+ * the (full article) document to work on
+ * @param intUrl
+ * the internal {@link URL} this article was taken from (the
+ * {@link URL} from the supported website)
+ *
+ * @return the list of comment posts
+ */
+ abstract protected List<Element> getFullArticleCommentPosts(Document doc,
+ URL intUrl);
+
+ /**
+ * The {@link ElementProcessor} to use to convert the main article element
+ * (see {@link BasicSupport#getFullArticle(Document)}) into text.
+ * <p>
+ * See {@link BasicElementProcessor} for a working, basic implementation.
+ * <p>
+ * Can be NULL to simply use {@link Element#text()}.
+ *
+ * @return the processor, or NULL
+ */
+ abstract protected ElementProcessor getElementProcessorFullArticle();
+
+ /**
+ * Open a network resource.
+ * <p>
+ * You need to close the returned {@link InputStream} when done.
+ *
+ * @param url
+ * the source to open
+ *
+ * @return the content
+ *
+ * @throws IOException
+ * in case of I/O error
+ */
+ protected InputStream open(URL url) throws IOException {
+ return downloader.open(url, url, cookies, null, null, null);
+ }
+
+ /**
+ * Convert the comment elements into {@link Comment}s.
+ *
+ * @param doc
+ * the document we work on
+ * @param posts
+ * the comment elements
+ *
+ * @return the converted {@link Comment}s
+ */
+ private List<Comment> getComments(Document doc, List<Element> posts) {
+     List<Comment> result = new ArrayList<Comment>();
+     if (posts == null) {
+         // No container: no comments
+         return result;
+     }
+
+     for (Element post : posts) {
+         String id = getCommentId(post).trim();
+         String author = getCommentAuthor(post).trim();
+         String title = getCommentTitle(post).trim();
+         String date = getCommentDate(post).trim();
+
+         // The raw date doubles as the ID when the site gives none
+         if (id.isEmpty()) {
+             id = date;
+         }
+
+         date = date(date);
+
+         List<String> lines = new ArrayList<String>();
+         Element body = getCommentContentElement(post);
+         if (body != null) {
+             ElementProcessor processor = getElementProcessorComment();
+             if (processor == null) {
+                 // No processor: use the raw text, one entry per line
+                 lines = Arrays.asList(body.text().split("\n"));
+             } else {
+                 lines.addAll(toLines(body, processor));
+             }
+         }
+
+         Comment comment = new Comment(id, author, title, date, lines);
+
+         // Sub-comments are converted recursively and attached to this one
+         List<Element> subPosts = getCommentCommentPosts(doc, post);
+         comment.addAll(getComments(doc, subPosts));
+
+         // Keep only the comments for which isEmpty() is false
+         if (!comment.isEmpty()) {
+             result.add(comment);
+         }
+     }
+
+     return result;
+ }
+
+ /**
+ * Return the list of subcomment {@link Element}s from this comment element
+ * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
+ *
+ * @param doc
+ * the (full article) document to work on
+ * @param container
+ * the container (a comment {@link Element})
+ *
+ * @return the list of comment posts
+ */
+ abstract protected List<Element> getCommentCommentPosts(Document doc,
+ Element container);
+
+ /**
+ * Compute the ID of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the ID
+ */
+ abstract protected String getCommentId(Element post);
+
+ /**
+ * Compute the author of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the author
+ */
+ abstract protected String getCommentAuthor(Element post);
+
+ /**
+ * Compute the title of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the title
+ */
+ abstract protected String getCommentTitle(Element post);
+
+ /**
+ * Compute the date of the given comment element.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the date
+ */
+ abstract protected String getCommentDate(Element post);
+
+ /**
+ * Get the main content element of the given comment, which can be NULL.
+ *
+ * @param post
+ * the comment element
+ *
+ * @return the element
+ */
+ abstract protected Element getCommentContentElement(Element post);
+
+ /**
+ * The {@link ElementProcessor} to use to convert the main comment element
+ * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
+ * <p>
+ * See {@link BasicElementProcessor} for a working, basic implementation.
+ * <p>
+ * Can be NULL to simply use {@link Element#text()}.
+ *
+ * @return the processor
+ */
+ abstract protected ElementProcessor getElementProcessorComment();
+
+ /**
+ * The support type.
+ *
+ * @param type
+ * the new type
+ */
protected void setType(Type type) {
this.type = type;
}
/**
+ * Add a cookie for all site connections.
+ *
+ * @param name
+ * the cookie name
+ * @param value
+ * the value
+ */
+ protected void addCookie(String name, String value) {
+ cookies.put(name, value);
+ }
+
+ /**
+ * The {@link String} to append to the selector (the selector will be
+ * constructed as "this string" then "/type/").
+ *
* @param preselector
* the preselector to set
*/
case REGISTER:
support = new TheRegister();
break;
+ case TOO_LINUX:
+ support = new TooLinux();
+ break;
+ case ERE_NUMERIQUE:
+ support = new EreNumerique();
+ break;
+ case PHORONIX:
+ support = new Phoronix();
+ break;
+ case SEPT_SUR_SEPT:
+ support = new SeptSurSept();
+ break;
+ case REDDIT:
+ support = new Reddit();
+ break;
}
if (support != null) {
return support;
}
- static public String getSelector(Type type) {
- return preselector + "/" + type + "/";
- }
-
- /**
- * Get the first {@link Element} of the given class, or an empty span
- * {@link Element} if none found.
- *
- * @param element
- * the element to look in
- * @param className
- * the class to look for
- *
- * @return the value or an empty span {@link Element}
- */
- static protected Element firstOrEmpty(Element element, String className) {
- Elements subElements = element.getElementsByClass(className);
- if (subElements.size() > 0) {
- return subElements.get(0);
- }
-
- return new Element("span");
- }
-
/**
- * Get the first {@link Element} of the given tag, or an empty span
- * {@link Element} if none found.
+ * The gopher "selector" to use for output for this type, using the
+ * preselector.
+ * <p>
+ * A kind of "URL path", like "/news/" or "/misc/news/" or...
*
- * @param element
- * the element to look in
- * @param tagName
- * the tag to look for
+ * @param type
+ * the type to get the selector of
*
- * @return the value or an empty span {@link Element}
+ * @return the selector
*/
- static protected Element firstOrEmptyTag(Element element, String tagName) {
- Elements subElements = element.getElementsByTag(tagName);
- if (subElements.size() > 0) {
- return subElements.get(0);
- }
-
- return new Element("span");
+ static public String getSelector(Type type) {
+ return preselector + "/" + type + "/";
}
/**
final StringBuilder currentLine = new StringBuilder();
final List<Integer> quoted = new ArrayList<Integer>();
final List<Node> ignoredNodes = new ArrayList<Node>();
+ final List<String> footnotes = new ArrayList<String>();
if (element != null) {
new NodeTraversor(new NodeVisitor() {
String manual = null;
boolean ignore = elementProcessor.ignoreNode(node)
|| ignoredNodes.contains(node.parentNode());
+ // Manual processing
if (!ignore) {
manual = elementProcessor.manualProcessing(node);
if (manual != null) {
}
}
+ // Subtitle check
+ if (!ignore) {
+ String subtitle = elementProcessor.isSubtitle(node);
+ if (subtitle != null) {
+ subtitle = subtitle.trim();
+ currentLine.append("\n[ " + subtitle + " ]\n");
+ ignore = true;
+ }
+ }
+
+ // <pre> check
+ if (!ignore) {
+ if (node instanceof Element) {
+ Element el = (Element) node;
+ if ("pre".equals(el.tagName())) {
+ currentLine.append(StringUtils
+ .unhtml(el.text()).trim());
+ ignore = true;
+ }
+ }
+ }
+
if (ignore) {
ignoredNodes.add(node);
return;
if (block && currentLine.length() > 0) {
currentLine.append("\n");
}
+
+ if (!element.absUrl("href").trim().isEmpty()) {
+ footnotes.add(element.absUrl("href"));
+ currentLine.append("[" + footnotes.size() + "]");
+ }
} else if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String line = StringUtil.normaliseWhitespace(textNode
}
}
+ // Fix spaces and nbsp, remove multiple following blank lines
+ List<String> linesCopy = new ArrayList<String>(lines.size());
+ long blanks = 0;
for (int i = 0; i < lines.size(); i++) {
- lines.set(i, lines.get(i).replace(" ", " ").trim());
+ String line = lines.get(i).replace(" ", " ") // nbsp -> space
+ .replace(" ", " ").trim();
+ if (line.isEmpty()) {
+ blanks++;
+ } else {
+ blanks = 0;
+ }
+
+ if (blanks < 2) {
+ linesCopy.add(line);
+ }
+ }
+
+ // Footnotes insertion
+ if (footnotes.size() > 0) {
+ linesCopy.add("");
+ linesCopy.add("");
+ linesCopy.add("");
+ linesCopy.add("");
+ for (int i = 0; i < footnotes.size(); i++) {
+ linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+ }
}
- return lines;
+ return linesCopy;
+ }
+
+ /**
+ * Reformat the date if possible.
+ *
+ * @param date
+ * the input date
+ *
+ * @return the reformatted date, or the same value if it was not parsable
+ */
+ static private String date(String date) {
+     SimpleDateFormat target = new SimpleDateFormat("yyyy/MM/dd");
+
+     // First guess: a strictly positive number of seconds since the epoch
+     long seconds;
+     try {
+         seconds = Long.parseLong(date.trim());
+     } catch (Exception e) {
+         // Not a number (or no date at all): try the next format
+         seconds = 0;
+     }
+
+     if (seconds > 0) {
+         Date fromEpoch = new Date(1000 * seconds);
+         return target.format(fromEpoch);
+     }
+
+     // Second guess: a timestamp such as 2018-03-20T12:00:00+01:00
+     try {
+         SimpleDateFormat source = new SimpleDateFormat(
+                 "yyyy-MM-dd'T'HH:mm:ssXXX");
+         Date parsed = source.parse(date.trim());
+         return target.format(parsed);
+     } catch (Exception e) {
+         // Unknown format: hand the input back untouched
+         return date;
+     }
 }
}