X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FBasicSupport.java;h=ec19c3207fbbba7447da4ce5e419da9060f742b8;hb=b19b36322d5453c1b01761dd76190a67465e0d53;hp=daa0ed16b4cfe52889665fb9a5caaddd7aa2a6f8;hpb=2d95a873f4dc858d9ed55fa1889b1cf29a78f5eb;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java index daa0ed1..ec19c32 100644 --- a/src/be/nikiroo/gofetch/support/BasicSupport.java +++ b/src/be/nikiroo/gofetch/support/BasicSupport.java @@ -3,41 +3,556 @@ package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; -import java.net.URLConnection; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; import java.util.List; -import java.util.zip.GZIPInputStream; +import java.util.Map; +import java.util.Map.Entry; + +import org.jsoup.helper.DataUtil; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; +import be.nikiroo.utils.Downloader; +import be.nikiroo.utils.StringUtils; +/** + * Base class for website support. + * + * @author niki + */ public abstract class BasicSupport { - public enum Type { - SLASHDOT, PIPEDOT, - } + /** + * The downloader to use for all web sites via + * {@link BasicSupport#open(URL)} + */ + static private Downloader downloader = new Downloader("gofetcher"); static private String preselector; - private Type type; + /** + * The optional cookies to use to get the site data. 
+ */ + private Map cookies = new HashMap(); - abstract public List list() throws IOException; + private Type type; - abstract public List getComments(Story story) throws IOException; + /** + * Login on the web site (this method does nothing by default, but can be + * overridden if needed). + * + * @throws IOException + * in case of I/O error + * + */ + public void login() throws IOException { + } + /** + * The website textual description, to add in the dispatcher page. + *

+ * Should be short. + * + * @return the description + */ abstract public String getDescription(); + /** + * The gopher "selector" to use for output. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @return the selector + */ public String getSelector() { - return getSelector(type); + return getSelector(getType()); } + /** + * The support type. + * + * @return the type + */ public Type getType() { return type; } + /** + * List all the recent items, but only assure the ID and internal URL to + * fetch it later on (until it has been fetched, the rest of the + * {@link Story} is not confirmed). + * + * @return the list of new stories + * + * @throws IOException + * in case of I/O + */ + public List list() throws IOException { + List list = new ArrayList(); + + login(); + for (Entry entry : getUrls()) { + URL url = entry.getKey(); + String defaultCateg = entry.getValue(); + if (defaultCateg == null) { + defaultCateg = ""; + } + + InputStream in = open(url); + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + List articles = getArticles(doc); + for (Element article : articles) { + String id = getArticleId(doc, article).trim(); + String title = getArticleTitle(doc, article).trim(); + String author = getArticleAuthor(doc, article).trim(); + String date = getArticleDate(doc, article).trim(); + String categ = getArticleCategory(doc, article, defaultCateg) + .trim(); + String details = getArticleDetails(doc, article).trim(); + String intUrl = getArticleIntUrl(doc, article).trim(); + String extUrl = getArticleExtUrl(doc, article).trim(); + String content = getArticleContent(doc, article).trim(); + + if (id.isEmpty() && date.isEmpty()) { + continue; + } + + if (!id.isEmpty()) { + while (id.length() < 10) { + id = "0" + id; + } + } else { + id = date.replace(":", "_").replace("+", "_"); + } + + date = date(date); + + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, content)); + } + } + + return list; + } + + /** + * The {@link URL}s to process for this website. 
+ * + * @return the list of {@link URL}s + * + * @throws IOException + * in case of I/O error + */ + abstract protected List> getUrls() throws IOException; + + /** + * The article {@link Element}s of this document. + * + * @param doc + * the main document for the current category + * + * @return the articles + */ + abstract protected List getArticles(Document doc); + + /** + * The ID of the article (defaults to the date element if empty). + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the ID + */ + abstract protected String getArticleId(Document doc, Element article); + + /** + * The article title to display. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the title + */ + abstract protected String getArticleTitle(Document doc, Element article); + + /** + * The optional article author. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the author + */ + abstract protected String getArticleAuthor(Document doc, Element article); + + /** + * The optional article date. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the date + */ + abstract protected String getArticleDate(Document doc, Element article); + + /** + * the optional article category. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * @param currentCategory + * the currently listed category if any (can be NULL) + * + * @return the category + */ + abstract protected String getArticleCategory(Document doc, Element article, + String currentCategory); + + /** + * the optional details of the article (can replace the date, author and + * category, for instance). 
+ * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the details + */ + abstract protected String getArticleDetails(Document doc, Element article); + + /** + * The (required) {@link URL} that points to the news page on the supported + * website. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the internal {@link URL} + */ + abstract protected String getArticleIntUrl(Document doc, Element article); + + /** + * the optional {@link URL} that points to an external website for more + * information. + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the external {@link URL} + */ + abstract protected String getArticleExtUrl(Document doc, Element article); + + /** + * The optional article short-content (not the full content, that will be + * fetched by {@link BasicSupport#fetch(Story)}). + * + * @param doc + * the main document for the current category + * @param article + * the article to look into + * + * @return the short content + */ + abstract protected String getArticleContent(Document doc, Element article); + + /** + * Fetch the full article content as well as all the comments associated to + * this {@link Story}, if any (can be empty, but not NULL). 
+ * + * @param story + * the story to fetch the comments of + * + * @throws IOException + * in case of I/O error + */ + public void fetch(Story story) throws IOException { + String fullContent = ""; + + URL url = new URL(story.getUrlInternal()); + InputStream in = open(url); + try { + Document doc = DataUtil.load(in, "UTF-8", url.toString()); + Element article = getFullArticle(doc); + if (article != null) { + fullContent = getArticleText(article); + } + + if (fullContent.isEmpty()) { + fullContent = story.getContent(); + } + + story.setFullContent(fullContent); + story.setComments(getComments(doc, + getFullArticleCommentPosts(doc, url))); + } finally { + if (in != null) { + in.close(); + } + } + } + + /** + * Return the text from this {@link Element}, using the + * {@link BasicSupport#getElementProcessorFullArticle()} processor logic. + * + * @param article + * the element to extract the text from + * + * @return the text + */ + protected String getArticleText(Element article) { + StringBuilder builder = new StringBuilder(); + ElementProcessor eProc = getElementProcessorFullArticle(); + if (eProc != null) { + for (String line : toLines(article, eProc)) { + builder.append(line + "\n"); + } + } else { + builder.append(article.text()); + } + + // Content is too tight with a single break per line: + return builder.toString().replace("\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .replace("\n\n\n\n", "\n\n") // + .trim(); + } + + /** + * Return the full article if available (this is the article to retrieve + * from the newly downloaded page at {@link Story#getUrlInternal()}). + * + * @param doc + * the (full article) document to work on + * + * @return the article or NULL + */ + abstract protected Element getFullArticle(Document doc); + + /** + * Return the list of comment {@link Element}s from this optional container + * -- must NOT return the "container" as a comment {@link Element}. 
+ * + * @param doc + * the (full article) document to work on + * @param intUrl + * the internal {@link URL} this article wa taken from (the + * {@link URL} from the supported website) + * + * @return the list of comment posts + */ + abstract protected List getFullArticleCommentPosts(Document doc, + URL intUrl); + + /** + * The {@link ElementProcessor} to use to convert the main article element + * (see {@link BasicSupport#getFullArticle(Document)}) into text. + *

+ * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. + * + * @return the processor, or NULL + */ + abstract protected ElementProcessor getElementProcessorFullArticle(); + + /** + * Open a network resource. + *

+ * You need to close the returned {@link InputStream} when done. + * + * @param url + * the source to open + * + * @return the content + * + * @throws IOException + * in case of I/O error + */ + protected InputStream open(URL url) throws IOException { + return downloader.open(url, url, cookies, null, null, null); + } + + /** + * Convert the comment elements into {@link Comment}s + * + * @param doc + * the document we work on + * @param posts + * the comment elements + * + * @return the converted {@link Comment}s + */ + private List getComments(Document doc, List posts) { + List comments = new ArrayList(); + if (posts != null) { + for (Element post : posts) { + String id = getCommentId(post).trim(); + String author = getCommentAuthor(post).trim(); + String title = getCommentTitle(post).trim(); + String date = getCommentDate(post).trim(); + + List content = new ArrayList(); + + if (id.isEmpty()) { + id = date; + } + + date = date(date); + + Element contentE = getCommentContentElement(post); + if (contentE != null) { + ElementProcessor eProc = getElementProcessorComment(); + if (eProc != null) { + for (String line : toLines(contentE, eProc)) { + content.add(line); + } + } else { + content = Arrays.asList(contentE.text().split("\n")); + } + } + + Comment comment = new Comment(id, author, title, date, content); + comment.addAll(getComments(doc, + getCommentCommentPosts(doc, post))); + + if (!comment.isEmpty()) { + comments.add(comment); + } + } + } + + return comments; + } + + /** + * Return the list of subcomment {@link Element}s from this comment element + * -- must NOT return the "container" as a comment {@link Element}. + * + * @param doc + * the (full article) document to work on + * @param container + * the container (a comment {@link Element}) + * + * @return the list of comment posts + */ + abstract protected List getCommentCommentPosts(Document doc, + Element container); + + /** + * Compute the ID of the given comment element. 
+ * + * @param post + * the comment element + * + * @return the ID + */ + abstract protected String getCommentId(Element post); + + /** + * Compute the author of the given comment element. + * + * @param post + * the comment element + * + * @return the author + */ + abstract protected String getCommentAuthor(Element post); + + /** + * Compute the title of the given comment element. + * + * @param post + * the comment element + * + * @return the title + */ + abstract protected String getCommentTitle(Element post); + + /** + * Compute the date of the given comment element. + * + * @param post + * the comment element + * + * @return the date + */ + abstract protected String getCommentDate(Element post); + + /** + * Get the main of the given comment element, which can be NULL. + * + * @param post + * the comment element + * + * @return the element + */ + abstract protected Element getCommentContentElement(Element post); + + /** + * The {@link ElementProcessor} to use to convert the main comment element + * (see {@link BasicSupport#getCommentContentElement(Element)}) into text. + *

+ * See {@link BasicElementProcessor} for a working, basic implementation. + *

+ * Can be NULL to simply use {@link Element#text()}. + * + * @return the processor + */ + abstract protected ElementProcessor getElementProcessorComment(); + + /** + * The support type. + * + * @param type + * the new type + */ protected void setType(Type type) { this.type = type; } /** + * Add a cookie for all site connections. + * + * @param name + * the cookie name + * @param value + * the value + */ + protected void addCookie(String name, String value) { + cookies.put(name, value); + } + + /** + * The {@link String} to append to the selector (the selector will be + * constructed as "this string" then "/type/". + * * @param preselector * the preselector to set */ @@ -45,6 +560,15 @@ public abstract class BasicSupport { BasicSupport.preselector = preselector; } + /** + * Return a {@link BasicSupport} that is compatible with the given + * {@link Type} if it exists (or NULL if not). + * + * @param type + * the type + * + * @return a compatible {@link BasicSupport} if it exists (or NULL if not) + */ static public BasicSupport getSupport(Type type) { BasicSupport support = null; @@ -56,6 +580,30 @@ public abstract class BasicSupport { case PIPEDOT: support = new Pipedot(); break; + case LWN: + support = new LWN(); + break; + case LEMONDE: + support = new LeMonde(); + break; + case REGISTER: + support = new TheRegister(); + break; + case TOO_LINUX: + support = new TooLinux(); + break; + case ERE_NUMERIQUE: + support = new EreNumerique(); + break; + case PHORONIX: + support = new Phoronix(); + break; + case SEPT_SUR_SEPT: + support = new SeptSurSept(); + break; + case REDDIT: + support = new Reddit(); + break; } if (support != null) { @@ -66,19 +614,216 @@ public abstract class BasicSupport { return support; } + /** + * The gopher "selector" to use for output for this type, using the + * preselector. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @param type + * the type to get the selector of + * + * @return the selector + */ static public String getSelector(Type type) { return preselector + "/" + type + "/"; } - // TODO: check Downloader.java? - static protected InputStream open(URL url) throws IOException { - URLConnection conn = url.openConnection(); - conn.connect(); - InputStream in = conn.getInputStream(); - if ("gzip".equals(conn.getContentEncoding())) { - in = new GZIPInputStream(in); + /** + * Process the given element into text (each line is a text paragraph and + * can be prepended with ">" signs to indicate a quote or sub-quote or + * sub-sub-quote...). + * + * @param element + * the element to process + * @param elementProcessor + * the element processor, must not be NULL + * + * @return text lines, each line is a paragraph + */ + static protected List toLines(Element element, + final ElementProcessor elementProcessor) { + final List lines = new ArrayList(); + final StringBuilder currentLine = new StringBuilder(); + final List quoted = new ArrayList(); + final List ignoredNodes = new ArrayList(); + final List footnotes = new ArrayList(); + + if (element != null) { + new NodeTraversor(new NodeVisitor() { + @Override + public void head(Node node, int depth) { + String manual = null; + boolean ignore = elementProcessor.ignoreNode(node) + || ignoredNodes.contains(node.parentNode()); + // Manual processing + if (!ignore) { + manual = elementProcessor.manualProcessing(node); + if (manual != null) { + currentLine.append(manual); + ignore = true; + } + } + + // Subtitle check + if (!ignore) { + String subtitle = elementProcessor.isSubtitle(node); + if (subtitle != null) { + subtitle = subtitle.trim(); + currentLine.append("\n[ " + subtitle + " ]\n"); + ignore = true; + } + } + + //

 check
+					if (!ignore) {
+						if (node instanceof Element) {
+							Element el = (Element) node;
+							if ("pre".equals(el.tagName())) {
+								currentLine.append(StringUtils
+										.unhtml(el.text()).trim());
+								ignore = true;
+							}
+						}
+					}
+
+					if (ignore) {
+						ignoredNodes.add(node);
+						return;
+					}
+
+					String prep = "";
+					for (int i = 0; i < quoted.size(); i++) {
+						prep += ">";
+					}
+					prep += " ";
+
+					boolean enterQuote = elementProcessor.detectQuote(node);
+					boolean leaveQuote = quoted.contains(depth);
+
+					if (enterQuote) {
+						quoted.add(depth);
+					}
+
+					if (leaveQuote) {
+						quoted.remove(Integer.valueOf(depth));
+					}
+
+					if (enterQuote || leaveQuote) {
+						if (currentLine.length() > 0) {
+							if (currentLine.charAt(currentLine.length() - 1) == '\n') {
+								currentLine.setLength(currentLine.length() - 1);
+							}
+							for (String l : currentLine.toString().split("\n")) {
+								lines.add(prep + l);
+							}
+						}
+						currentLine.setLength(0);
+					}
+
+					if (node instanceof Element) {
+						Element element = (Element) node;
+						boolean block = element.isBlock()
+								|| element.tagName().equalsIgnoreCase("br");
+						if (block && currentLine.length() > 0) {
+							currentLine.append("\n");
+						}
+
+						if (!element.absUrl("href").trim().isEmpty()) {
+							footnotes.add(element.absUrl("href"));
+							currentLine.append("[" + footnotes.size() + "]");
+						}
+					} else if (node instanceof TextNode) {
+						TextNode textNode = (TextNode) node;
+						String line = StringUtil.normaliseWhitespace(textNode
+								.getWholeText());
+
+						currentLine.append(elementProcessor.processText(line));
+						currentLine.append(" ");
+					}
+				}
+
+				@Override
+				public void tail(Node node, int depth) { // intentionally empty: all the processing is done in head()
+				}
+			}).traverse(element);
+		}
+
+		if (currentLine.length() > 0) {
+			String prep = "";
+			for (int i = 0; i < quoted.size(); i++) {
+				prep += ">";
+			}
+			prep += " ";
+			if (currentLine.length() > 0) {
+				if (currentLine.charAt(currentLine.length() - 1) == '\n') {
+					currentLine.setLength(currentLine.length() - 1);
+				}
+				for (String l : currentLine.toString().split("\n")) {
+					lines.add(prep + l);
+				}
+			}
+		}
+
+		// Fix spaces and nbsp, remove multiple following blank lines
+		List linesCopy = new ArrayList(lines.size());
+		long blanks = 0;
+		for (int i = 0; i < lines.size(); i++) {
+			String line = lines.get(i).replace(" ", " ") // nbsp -> space
+					.replace("  ", " ").trim();
+			if (line.isEmpty()) {
+				blanks++;
+			} else {
+				blanks = 0;
+			}
+
+			if (blanks < 2) {
+				linesCopy.add(line);
+			}
+		}
+
+		// Footnotes insertion
+		if (footnotes.size() > 0) {
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			linesCopy.add("");
+			for (int i = 0; i < footnotes.size(); i++) {
+				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
+			}
+		}
+
+		return linesCopy;
+	}
+
+	/**
+	 * Reformat the date to the "yyyy/MM/dd" format if possible.
+	 * 
+	 * @param date
+	 *            the input date (epoch seconds or "yyyy-MM-dd'T'HH:mm:ssXXX")
+	 * 
+	 * @return the reformatted date, or the same value if it was not parsable
+	 */
+	static private String date(String date) {
+		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
+
+		long epoch = 0;
+		try {
+			epoch = Long.parseLong(date.trim());
+		} catch (Exception e) { // not a plain number: try the timestamp format below
+			epoch = 0;
 		}
 
-		return in;
+		if (epoch > 0) {
+			return out.format(new Date(1000 * epoch)); // Date expects milliseconds, so the value is treated as seconds
+		}
+
+		try {
+			Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
+					.parse(date.trim());
+			return out.format(dat);
+		} catch (ParseException e) { // unknown format: hand the input back untouched
+			return date;
+		}
 	}
 }