X-Git-Url: http://git.nikiroo.be/?p=gofetch.git;a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FBasicSupport.java;h=a59ae313fb1f1fda8979020d7e6315d81ba6592e;hp=b15fac7e5e2598d0d67c3bcf493c6dae03a0a8a1;hb=3e62b034c1981ae6329f06b3f8c0ee25c3683789;hpb=a81f396bc4bf0f70e4b5f654045f533941d86dc9 diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java index b15fac7..a59ae31 100644 --- a/src/be/nikiroo/gofetch/support/BasicSupport.java +++ b/src/be/nikiroo/gofetch/support/BasicSupport.java @@ -1,22 +1,29 @@ package be.nikiroo.gofetch.support; import java.io.IOException; +import java.io.InputStream; +import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Date; import java.util.List; +import java.util.Map.Entry; +import org.jsoup.helper.DataUtil; import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; -import org.jsoup.select.Elements; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; +import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.Downloader; +import be.nikiroo.utils.StringUtils; /** * Base class for website support. @@ -27,140 +34,226 @@ public abstract class BasicSupport { /** The downloader to use for all websites. */ protected static Downloader downloader = new Downloader("gofetcher"); + static private String preselector; + + private Type type; + + /** + * The website textual description, to add in the dispatcher page. + *

+ * Should be short. + * + * @return the description + */ + abstract public String getDescription(); + /** - * The support type (each website we support has a single type). - * - * @author niki - */ - public enum Type { - /** EN: Any, but mostly IT/Sci */ - SLASHDOT, - /** EN: Clone of Slashdot, mostly abandoned */ - PIPEDOT, - /** EN: Linux */ - LWN, - /** FR: Any */ - LEMONDE, - /** EN: IT */ - REGISTER, - /** FR: Linux */ - TOO_LINUX, - /** FR: IT */ - ERE_NUMERIQUE, + * The gopher "selector" to use for output. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @return the selector + */ + public String getSelector() { + return getSelector(type); } /** - * Used to process an element into lines. - * - * @author niki - */ - public interface ElementProcessor { - /** - * Detect if this node is a quote and should be trated as such. - * - * @param node - * the node to check - * @return TRUE if it is - */ - public boolean detectQuote(Node node); - - /** - * Process text content (will be called on each text element, allowing - * you to modify it if needed). - * - * @param text - * the text to process - * - * @return the resulting text - */ - public String processText(String text); - - /** - * Ignore this node. - * - * @param node - * the node to ignore - * @return TRUE if it has to be ignored - */ - public boolean ignoreNode(Node node); - - /** - * Manually process this node (and return the manual processing value) - * if so desired. - *

- * If the node is manually processed, it and its children will not be - * automatically processed. - * - * @param node - * the node to optionally process - * - * @return NULL if not processed (will thus be automatically processed - * as usual), a {@link String} (may be empty) if we process it - * manually -- the given {@link String} will be used instead of - * the usual automatic processing if not NULL - */ - public String manualProcessing(Node node); - - /** - * This {@link Node} is a subtitle and should be treated as such - * (highlighted). - * - * @param node - * the node to check - * - * @return NULL if it is not a subtitle, the subtitle to use if it is - */ - public String isSubtitle(Node node); + * The support type. + * + * @return the type + */ + public Type getType() { + return type; } /** - * A default {@link ElementProcessor} (will not detect or process anything - * manually). + * List all the recent items, but only assure the ID and internal URL to + * fetch it later on (until it has been fetched, the rest of the + * {@link Story} is not confirmed). 
* - * @author niki + * @return the list of new stories + * + * @throws IOException + * in case of I/O */ - public class BasicElementProcessor implements ElementProcessor { - @Override - public boolean detectQuote(Node node) { - return false; - } + public List list() throws IOException { + List list = new ArrayList(); + + for (Entry entry : getUrls()) { + URL url = entry.getKey(); + String defaultCateg = entry.getValue(); + if (defaultCateg == null) { + defaultCateg = ""; + } - @Override - public String processText(String text) { - return text; - } + InputStream in = downloader.open(url); + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + List articles = getArticles(doc); + for (Element article : articles) { + String id = getArticleId(doc, article).trim(); + String title = getArticleTitle(doc, article).trim(); + String author = getArticleAuthor(doc, article).trim(); + String date = getArticleDate(doc, article).trim(); + String categ = getArticleCategory(doc, article, defaultCateg) + .trim(); + String details = getArticleDetails(doc, article).trim(); + String intUrl = getArticleIntUrl(doc, article).trim(); + String extUrl = getArticleExtUrl(doc, article).trim(); + String content = getArticleContent(doc, article).trim(); + + if (id.isEmpty() && date.isEmpty()) { + continue; + } - @Override - public boolean ignoreNode(Node node) { - return false; - } + if (id.isEmpty()) { + id = date.replace(":", "_").replace("+", "_"); + } - @Override - public String manualProcessing(Node node) { - return null; - } + date = date(date); - @Override - public String isSubtitle(Node node) { - return null; + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, content)); + } } + + return list; } - static private String preselector; + /** + * The {@link URL}s to process for this website. 
+ * + * @return the list of {@link URL}s + * + * @throws IOException + * in case of I/O error + */ + abstract protected List> getUrls() throws IOException; - private Type type; + /** + * The article {@link Element}s of this document. + * + * @param doc + * the main document for the current category + * + * @return the articles + */ + abstract protected List getArticles(Document doc); /** - * List all the recent items, but only assure the ID and internal URL to - * fetch it later on (until it has been fetched, the rest of the - * {@link Story} is not confirmed). + * The ID of the article (defaults to the date element if empty). * - * @return the list of new stories + * @param doc + * the main document for the current category + * @param article + * the article to look into * - * @throws IOException - * in case of I/O + * @return the ID + */ + abstract protected String getArticleId(Document doc, Element article); + + /** + * The article title to display. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the title + */ + abstract protected String getArticleTitle(Document doc, Element article); + + /** + * The optional article author. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the author + */ + abstract protected String getArticleAuthor(Document doc, Element article); + + /** + * The optional article date. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the date + */ + abstract protected String getArticleDate(Document doc, Element article); + + /** + * the optional article category. 
+ * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * @param currentCategory + * the currently listed category if any (can be NULL) + * + * @return the category */ - abstract public List list() throws IOException; + abstract protected String getArticleCategory(Document doc, Element article, + String currentCategory); + + /** + * the optional details of the article (can replace the date, author and + * category, for instance). + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the details + */ + abstract protected String getArticleDetails(Document doc, Element article); + + /** + * The (required) {@link URL} that points to the news page on the supported + * website. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the internal {@link URL} + */ + abstract protected String getArticleIntUrl(Document doc, Element article); + + /** + * the optional {@link URL} that points to an external website for more + * information. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the external {@link URL} + */ + abstract protected String getArticleExtUrl(Document doc, Element article); + + /** + * The optional article short-content (not the full content, that will be + * fetched by {@link BasicSupport#fetch(Story)}). 
+ * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the short content + */ + abstract protected String getArticleContent(Document doc, Element article); /** * Fetch the full article content as well as all the comments associated to @@ -172,37 +265,211 @@ public abstract class BasicSupport { * @throws IOException * in case of I/O error */ - abstract public void fetch(Story story) throws IOException; + public void fetch(Story story) throws IOException { + String fullContent = ""; + + URL url = new URL(story.getUrlInternal()); + InputStream in = downloader.open(url); + try { + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + Element article = getFullArticle(doc); + if (article != null) { + StringBuilder builder = new StringBuilder(); + ElementProcessor eProc = getElementProcessorFullArticle(); + if (eProc != null) { + for (String line : toLines(article, eProc)) { + builder.append(line + "\n"); + } + } else { + builder.append(article.text()); + } + + // Content is too tight with a single break per line: + fullContent = builder.toString().replace("\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .trim(); + } + + if (fullContent.isEmpty()) { + fullContent = story.getContent(); + } + + story.setFullContent(fullContent); + story.setComments(getComments(doc, + getFullArticleCommentPosts(doc, url))); + } finally { + if (in != null) { + in.close(); + } + } + } /** - * The website textual description, to add in the dispatcher page. - *

- * Should be short. + * Return the full article if available. * - * @return the description + * @param doc + * the (full article) document to work on + * + * @return the article or NULL */ - abstract public String getDescription(); + abstract protected Element getFullArticle(Document doc); /** - * The gopher "selector" to use for output. + * Return the list of comment {@link Element}s from this optional container + * -- must NOT return the "container" as a comment {@link Element}. + * + * @param doc + * the (full article) document to work on + * @param intUrl + * the internal {@link URL} this article wa taken from (the + * {@link URL} from the supported website) + * + * @return the list of comment posts + */ + abstract protected List getFullArticleCommentPosts(Document doc, + URL intUrl); + + /** + * The {@link ElementProcessor} to use to convert the main article element + * (see {@link BasicSupport#getFullArticle(Document)}) into text. *

- * A kind of "URL path", like "/news/" or "/misc/news/" or... + * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. * - * @return the selector + * @return the processor, or NULL */ - public String getSelector() { - return getSelector(type); - } + abstract protected ElementProcessor getElementProcessorFullArticle(); /** - * The support type. + * Convert the comment elements into {@link Comment}s * - * @return the type + * @param doc + * the document we work on + * @param posts + * the comment elements + * + * @return the converted {@link Comment}s */ - public Type getType() { - return type; + private List getComments(Document doc, List posts) { + List comments = new ArrayList(); + if (posts != null) { + for (Element post : posts) { + String id = getCommentId(post).trim(); + String author = getCommentAuthor(post).trim(); + String title = getCommentTitle(post).trim(); + String date = getCommentDate(post).trim(); + + List content = new ArrayList(); + + if (id.isEmpty()) { + id = date; + } + + date = date(date); + + Element contentE = getCommentContentElement(post); + if (contentE != null) { + ElementProcessor eProc = getElementProcessorComment(); + if (eProc != null) { + for (String line : toLines(contentE, eProc)) { + content.add(line); + } + } else { + content = Arrays.asList(contentE.text().split("\n")); + } + } + + Comment comment = new Comment(id, author, title, date, content); + comment.addAll(getComments(doc, + getCommentCommentPosts(doc, post))); + + if (!comment.isEmpty()) { + comments.add(comment); + } + } + } + + return comments; } + /** + * Return the list of subcomment {@link Element}s from this comment element + * -- must NOT return the "container" as a comment {@link Element}. + * + * @param doc + * the (full article) document to work on + * @param container + * the container (a comment {@link Element}) + * + * @return the list of comment posts + */ + abstract protected List getCommentCommentPosts(Document doc, + Element container); + + /** + * Compute the ID of the given comment element. 
+ * + * @param post + * the comment element + * + * @return the ID + */ + abstract protected String getCommentId(Element post); + + /** + * Compute the author of the given comment element. + * + * @param post + * the comment element + * + * @return the author + */ + abstract protected String getCommentAuthor(Element post); + + /** + * Compute the title of the given comment element. + * + * @param post + * the comment element + * + * @return the title + */ + abstract protected String getCommentTitle(Element post); + + /** + * Compute the date of the given comment element. + * + * @param post + * the comment element + * + * @return the date + */ + abstract protected String getCommentDate(Element post); + + /** + * Get the main of the given comment element, which can be NULL. + * + * @param post + * the comment element + * + * @return the element + */ + abstract protected Element getCommentContentElement(Element post); + + /** + * The {@link ElementProcessor} to use to convert the main comment element + * (see {@link BasicSupport#getCommentContentElement(Element)}) into text. + *

+ * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. + * + * @return the processor + */ + abstract protected ElementProcessor getElementProcessorComment(); + /** * The support type. * @@ -284,46 +551,6 @@ public abstract class BasicSupport { return preselector + "/" + type + "/"; } - /** - * Get the first {@link Element} of the given class, or an empty span - * {@link Element} if none found. - * - * @param element - * the element to look in - * @param className - * the class to look for - * - * @return the value or an empty span {@link Element} - */ - static protected Element firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0); - } - - return new Element("span"); - } - - /** - * Get the first {@link Element} of the given tag, or an empty span - * {@link Element} if none found. - * - * @param element - * the element to look in - * @param tagName - * the tag to look for - * - * @return the value or an empty span {@link Element} - */ - static protected Element firstOrEmptyTag(Element element, String tagName) { - Elements subElements = element.getElementsByTag(tagName); - if (subElements.size() > 0) { - return subElements.get(0); - } - - return new Element("span"); - } - /** * Process the given element into text (each line is a text paragraph and * can be prepended with ">" signs to indicate a quote or sub-quote or @@ -342,6 +569,7 @@ public abstract class BasicSupport { final StringBuilder currentLine = new StringBuilder(); final List quoted = new ArrayList(); final List ignoredNodes = new ArrayList(); + final List footnotes = new ArrayList(); if (element != null) { new NodeTraversor(new NodeVisitor() { @@ -369,6 +597,18 @@ public abstract class BasicSupport { } } + //

 check
+					if (!ignore) {
+						if (node instanceof Element) {
+							Element el = (Element) node;
+							if ("pre".equals(el.tagName())) {
+								currentLine.append(StringUtils
+										.unhtml(el.text()).trim());
+								ignore = true;
+							}
+						}
+					}
+
 					if (ignore) {
 						ignoredNodes.add(node);
 						return;
@@ -410,6 +650,11 @@ public abstract class BasicSupport {
 						if (block && currentLine.length() > 0) {
 							currentLine.append("\n");
 						}
+
+						if (!element.absUrl("href").trim().isEmpty()) {
+							footnotes.add(element.absUrl("href"));
+							currentLine.append("[" + footnotes.size() + "]");
+						}
 					} else if (node instanceof TextNode) {
 						TextNode textNode = (TextNode) node;
 						String line = StringUtil.normaliseWhitespace(textNode
@@ -442,11 +687,35 @@ public abstract class BasicSupport {
 			}
 		}
 
+		// Fix spaces and nbsp, collapse consecutive blank lines into one
+		List linesCopy = new ArrayList(lines.size());
+		long blanks = 0;
 		for (int i = 0; i < lines.size(); i++) {
-			lines.set(i, lines.get(i).replace("  ", " ").trim());
+			String line = lines.get(i).replace(" ", " ") // nbsp -> space
+					.replace("  ", " ").trim();
+			if (line.isEmpty()) {
+				blanks++;
+			} else {
+				blanks = 0;
+			}
+
+			if (blanks < 2) {
+				linesCopy.add(line);
+			}
+		}
+
+		// Footnotes insertion
+		if (footnotes.size() > 0) {
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			for (int i = 0; i < footnotes.size(); i++) {
+				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+			}
 		}
 
-		return lines;
+		return linesCopy;
 	}
 
 	/**
@@ -457,7 +726,7 @@ public abstract class BasicSupport {
 	 * 
 	 * @return the reformated date, or the same value if it was not parsable
 	 */
-	static protected String date(String date) {
+	static private String date(String date) {
 		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 
 		long epoch = 0;