[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
import be.nikiroo.utils.StringUtils;

/**
 * Base class for website support.
 * 
 * @author niki
 */
public abstract class BasicSupport {
	/** The downloader to use for all websites. */
	static protected Downloader downloader = new Downloader("gofetcher");

	/**
	 * The prefix shared by all the selectors (see
	 * {@link BasicSupport#getSelector(Type)}), NULL until
	 * {@link BasicSupport#setPreselector(String)} has been called.
	 */
	static private String preselector;

	/** The type of this support, see {@link BasicSupport#setType(Type)}. */
	private Type type;

	/**
	 * The website textual description, to add in the dispatcher page.
	 * <p>
	 * Should be short.
	 * <p>
	 * This class does not use the description itself, it only exposes it.
	 * 
	 * @return the description
	 */
	abstract public String getDescription();

	/**
	 * The gopher "selector" under which the output of this support is
	 * published.
	 * <p>
	 * It behaves like a "URL path" ("/news/", "/misc/news/"...) and is
	 * computed from the type of this support, see
	 * {@link BasicSupport#getSelector(Type)}.
	 * 
	 * @return the selector
	 */
	public String getSelector() {
		return BasicSupport.getSelector(this.type);
	}

	/**
	 * The type of website handled by this support.
	 * 
	 * @return the type (NULL as long as {@link BasicSupport#setType(Type)}
	 *         has not been called)
	 */
	public Type getType() {
		return this.type;
	}

	/**
	 * List all the recent items, but only assure the ID and internal URL to
	 * fetch it later on (until it has been fetched, the rest of the
	 * {@link Story} is not confirmed).
	 * <p>
	 * Each {@link URL} returned by {@link BasicSupport#getUrls()} is
	 * downloaded and parsed, then every article found in it is converted into
	 * a {@link Story}. Articles that have neither an ID nor a date are
	 * skipped.
	 * <p>
	 * The stream opened for each {@link URL} is always closed, even when the
	 * parsing fails.
	 * 
	 * @return the list of new stories
	 * 
	 * @throws IOException
	 *             in case of I/O
	 */
	public List<Story> list() throws IOException {
		List<Story> list = new ArrayList<Story>();

		for (Entry<URL, String> entry : getUrls()) {
			URL url = entry.getKey();
			String defaultCateg = entry.getValue();
			if (defaultCateg == null) {
				defaultCateg = "";
			}

			InputStream in = downloader.open(url);
			try {
				Document doc = DataUtil.load(in, "UTF-8", url.toString());
				List<Element> articles = getArticles(doc);
				for (Element article : articles) {
					String id = getArticleId(doc, article).trim();
					String title = getArticleTitle(doc, article).trim();
					String author = getArticleAuthor(doc, article).trim();
					String date = getArticleDate(doc, article).trim();
					String categ = getArticleCategory(doc, article,
							defaultCateg).trim();
					String details = getArticleDetails(doc, article).trim();
					String intUrl = getArticleIntUrl(doc, article).trim();
					String extUrl = getArticleExtUrl(doc, article).trim();
					String content = getArticleContent(doc, article).trim();

					// Nothing usable to identify this article
					if (id.isEmpty() && date.isEmpty()) {
						continue;
					}

					if (!id.isEmpty()) {
						// Left-pad the ID with zeros up to 10 characters
						while (id.length() < 10) {
							id = "0" + id;
						}
					} else {
						// No ID: derive one from the raw (not yet
						// reformatted) date
						id = date.replace(":", "_").replace("+", "_");
					}

					date = date(date);

					list.add(new Story(getType(), id, title, author, date,
							categ, details, intUrl, extUrl, content));
				}
			} finally {
				// Same clean-up as in fetch(Story): do not leak the stream
				if (in != null) {
					in.close();
				}
			}
		}

		return list;
	}

	/**
	 * The {@link URL}s to process for this website.
	 * <p>
	 * Each entry associates a {@link URL} to list with the default category
	 * of the articles found there; this value can be NULL (
	 * {@link BasicSupport#list()} then uses an empty category).
	 * <p>
	 * The returned list itself must not be NULL, as
	 * {@link BasicSupport#list()} iterates over it directly.
	 * 
	 * @return the list of {@link URL}s
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract protected List<Entry<URL, String>> getUrls() throws IOException;

	/**
	 * The article {@link Element}s of this document.
	 * <p>
	 * Must not return NULL ({@link BasicSupport#list()} iterates over the
	 * result directly), return an empty list if there are no articles.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * 
	 * @return the articles
	 */
	abstract protected List<Element> getArticles(Document doc);

	/**
	 * The ID of the article (defaults to the date element if empty).
	 * <p>
	 * {@link BasicSupport#list()} left-pads a non-empty ID with zeros up to 10
	 * characters; if the ID is empty, it builds one from the article date, and
	 * if both are empty the article is skipped.
	 * <p>
	 * Must not return NULL ({@link BasicSupport#list()} trims the result),
	 * return an empty {@link String} instead.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the ID
	 */
	abstract protected String getArticleId(Document doc, Element article);

	/**
	 * The article title to display.
	 * <p>
	 * Must not return NULL ({@link BasicSupport#list()} trims the result),
	 * return an empty {@link String} instead.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the title
	 */
	abstract protected String getArticleTitle(Document doc, Element article);

	/**
	 * The optional article author.
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the author
	 */
	abstract protected String getArticleAuthor(Document doc, Element article);

	/**
	 * The optional article date.
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * <p>
	 * {@link BasicSupport#list()} reformats the value to "yyyy/MM/dd" when it
	 * is a number of seconds since the epoch or a date of the form
	 * "yyyy-MM-dd'T'HH:mm:ssXXX"; any other value is kept as is. The raw value
	 * is also used to build the article ID when the latter is empty.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the date
	 */
	abstract protected String getArticleDate(Document doc, Element article);

	/**
	 * The optional article category.
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * @param currentCategory
	 *            the currently listed category if any (can be NULL; note that
	 *            {@link BasicSupport#list()} itself passes an empty
	 *            {@link String} rather than NULL when there is none)
	 * 
	 * @return the category
	 */
	abstract protected String getArticleCategory(Document doc, Element article,
			String currentCategory);

	/**
	 * The optional details of the article (can replace the date, author and
	 * category, for instance).
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the details
	 */
	abstract protected String getArticleDetails(Document doc, Element article);

	/**
	 * The (required) {@link URL} that points to the news page on the supported
	 * website.
	 * <p>
	 * Must not return NULL ({@link BasicSupport#list()} trims the result).
	 * <p>
	 * NOTE(review): this is presumably the value that
	 * {@link BasicSupport#fetch(Story)} later reads back from the
	 * {@link Story} to download the full article -- to be confirmed against
	 * {@link Story}.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the internal {@link URL}
	 */
	abstract protected String getArticleIntUrl(Document doc, Element article);

	/**
	 * The optional {@link URL} that points to an external website for more
	 * information.
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the external {@link URL}
	 */
	abstract protected String getArticleExtUrl(Document doc, Element article);

	/**
	 * The optional article short-content (not the full content, that will be
	 * fetched by {@link BasicSupport#fetch(Story)}).
	 * <p>
	 * "Optional" means it can be empty, but it must not be NULL (
	 * {@link BasicSupport#list()} trims the result).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the short content
	 */
	abstract protected String getArticleContent(Document doc, Element article);

	/**
	 * Download the page of the given {@link Story} and complete the latter
	 * with its full content and its comments (the list of comments can be
	 * empty, but not NULL).
	 * <p>
	 * If no full content can be extracted from the page, the short content of
	 * the {@link Story} is used instead.
	 * 
	 * @param story
	 *            the story to complete
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public void fetch(Story story) throws IOException {
		URL url = new URL(story.getUrlInternal());
		InputStream in = downloader.open(url);
		try {
			Document doc = DataUtil.load(in, "UTF-8", url.toString());

			String fullContent = "";
			Element article = getFullArticle(doc);
			if (article != null) {
				StringBuilder builder = new StringBuilder();
				ElementProcessor eProc = getElementProcessorFullArticle();
				if (eProc == null) {
					// No processor: fall back to the raw text of the element
					builder.append(article.text());
				} else {
					for (String line : toLines(article, eProc)) {
						builder.append(line).append("\n");
					}
				}

				// A single break per line is too tight: double every break,
				// then reduce (twice) the runs of breaks this creates
				String spaced = builder.toString().replace("\n", "\n\n");
				spaced = spaced.replace("\n\n\n\n", "\n\n");
				spaced = spaced.replace("\n\n\n\n", "\n\n");
				fullContent = spaced.trim();
			}

			if (fullContent.isEmpty()) {
				fullContent = story.getContent();
			}
			story.setFullContent(fullContent);

			List<Element> posts = getFullArticleCommentPosts(doc, url);
			story.setComments(getComments(doc, posts));
		} finally {
			if (in != null) {
				in.close();
			}
		}
	}

	/**
	 * Return the full article if available.
	 * <p>
	 * When NULL is returned, {@link BasicSupport#fetch(Story)} keeps the short
	 * content of the {@link Story} as its full content.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * 
	 * @return the article or NULL
	 */
	abstract protected Element getFullArticle(Document doc);

	/**
	 * Return the list of comment {@link Element}s from this optional container
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * <p>
	 * Can return NULL, which is handled like an empty list (no comments).
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param intUrl
	 *            the internal {@link URL} this article was taken from (the
	 *            {@link URL} from the supported website)
	 * 
	 * @return the list of comment posts
	 */
	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
			URL intUrl);

	/**
	 * The {@link ElementProcessor} to use to convert the main article element
	 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()}; otherwise, the
	 * processor is handed over to
	 * {@link BasicSupport#toLines(Element, ElementProcessor)} by
	 * {@link BasicSupport#fetch(Story)}.
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorFullArticle();

	/**
	 * Convert the comment elements into {@link Comment}s, recursively (the
	 * sub-comments of each post are converted too and attached to their
	 * parent).
	 * <p>
	 * A comment without ID uses its (raw) date as ID. Comments that are still
	 * empty once converted are not kept.
	 * <p>
	 * The content lines given to each {@link Comment} are always stored in a
	 * growable list, whether an {@link ElementProcessor} is available or not.
	 * 
	 * @param doc
	 *            the document we work on
	 * @param posts
	 *            the comment elements (can be NULL, handled as an empty list)
	 * 
	 * @return the converted {@link Comment}s (can be empty, never NULL)
	 */
	private List<Comment> getComments(Document doc, List<Element> posts) {
		List<Comment> comments = new ArrayList<Comment>();
		if (posts != null) {
			for (Element post : posts) {
				String id = getCommentId(post).trim();
				String author = getCommentAuthor(post).trim();
				String title = getCommentTitle(post).trim();
				String date = getCommentDate(post).trim();

				List<String> content = new ArrayList<String>();

				// The ID falls back on the raw date, before reformatting
				if (id.isEmpty()) {
					id = date;
				}

				date = date(date);

				Element contentE = getCommentContentElement(post);
				if (contentE != null) {
					ElementProcessor eProc = getElementProcessorComment();
					if (eProc != null) {
						content.addAll(toLines(contentE, eProc));
					} else {
						// Copy into the ArrayList: Arrays.asList alone is a
						// fixed-size view that rejects add/remove
						content.addAll(Arrays.asList(contentE.text().split(
								"\n")));
					}
				}

				Comment comment = new Comment(id, author, title, date, content);
				comment.addAll(getComments(doc,
						getCommentCommentPosts(doc, post)));

				if (!comment.isEmpty()) {
					comments.add(comment);
				}
			}
		}

		return comments;
	}

	/**
	 * Return the list of subcomment {@link Element}s from this comment element
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * <p>
	 * Can return NULL, which is handled like an empty list (no subcomments).
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param container
	 *            the container (a comment {@link Element})
	 * 
	 * @return the list of comment posts
	 */
	abstract protected List<Element> getCommentCommentPosts(Document doc,
			Element container);

	/**
	 * Compute the ID of the given comment element.
	 * <p>
	 * Can be empty (the comment date is then used as ID), but must not be
	 * NULL as the result is trimmed.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the ID
	 */
	abstract protected String getCommentId(Element post);

	/**
	 * Compute the author of the given comment element.
	 * <p>
	 * Must not return NULL as the result is trimmed, return an empty
	 * {@link String} instead.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the author
	 */
	abstract protected String getCommentAuthor(Element post);

	/**
	 * Compute the title of the given comment element.
	 * <p>
	 * Must not return NULL as the result is trimmed, return an empty
	 * {@link String} instead.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the title
	 */
	abstract protected String getCommentTitle(Element post);

	/**
	 * Compute the date of the given comment element.
	 * <p>
	 * Must not return NULL as the result is trimmed, return an empty
	 * {@link String} instead.
	 * <p>
	 * The value is reformatted to "yyyy/MM/dd" when it is a number of seconds
	 * since the epoch or a date of the form "yyyy-MM-dd'T'HH:mm:ssXXX"; any
	 * other value is kept as is. The raw value is also used as the comment ID
	 * when the latter is empty.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the date
	 */
	abstract protected String getCommentDate(Element post);

	/**
	 * Get the main content element of the given comment element, which can be
	 * NULL (the comment then has no content lines).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the element
	 */
	abstract protected Element getCommentContentElement(Element post);

	/**
	 * The {@link ElementProcessor} to use to convert the main comment element
	 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()} (split on line
	 * breaks).
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorComment();

	/**
	 * Set the support type.
	 * <p>
	 * {@link BasicSupport#getSupport(Type)} calls this method on each support
	 * it creates.
	 * 
	 * @param type
	 *            the new type
	 */
	protected void setType(Type type) {
		this.type = type;
	}

	/**
	 * The {@link String} to prepend to the selector (the selector will be
	 * constructed as "this string" then "/type/").
	 * <p>
	 * This is a static setting, shared by all the supports.
	 * 
	 * @param preselector
	 *            the preselector to set
	 */
	static public void setPreselector(String preselector) {
		BasicSupport.preselector = preselector;
	}

	/**
	 * Create the {@link BasicSupport} that handles the given {@link Type}, if
	 * there is one.
	 * <p>
	 * The returned support already has its type set.
	 * 
	 * @param type
	 *            the type (can be NULL, in which case NULL is returned)
	 * 
	 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	 */
	static public BasicSupport getSupport(Type type) {
		if (type == null) {
			return null;
		}

		final BasicSupport instance;
		switch (type) {
		case SLASHDOT:
			instance = new Slashdot();
			break;
		case PIPEDOT:
			instance = new Pipedot();
			break;
		case LWN:
			instance = new LWN();
			break;
		case LEMONDE:
			instance = new LeMonde();
			break;
		case REGISTER:
			instance = new TheRegister();
			break;
		case TOO_LINUX:
			instance = new TooLinux();
			break;
		case ERE_NUMERIQUE:
			instance = new EreNumerique();
			break;
		case PHORONIX:
			instance = new Phoronix();
			break;
		default:
			// No support known for this type
			instance = null;
			break;
		}

		if (instance != null) {
			instance.setType(type);
		}

		return instance;
	}

	/**
	 * The gopher "selector" to use for output for this type, using the
	 * preselector.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * The selector is the preselector followed by "/type/"; a preselector that
	 * has not been set (NULL) is handled as an empty {@link String} instead
	 * of producing the text "null".
	 * 
	 * @param type
	 *            the type to get the selector of
	 * 
	 * @return the selector
	 */
	static public String getSelector(Type type) {
		String prefix = (preselector == null) ? "" : preselector;
		return prefix + "/" + type + "/";
	}

	/**
	 * Process the given element into text (each line is a text paragraph and
	 * can be prepended with ">" signs to indicate a quote or sub-quote or
	 * sub-sub-quote...).
	 * <p>
	 * The element is traversed node by node:
	 * <ul>
	 * <li>nodes rejected by the processor, manually processed nodes, subtitles
	 * and "pre" elements are handled as a whole (their children are skipped)</li>
	 * <li>entering or leaving a quote flushes the pending text, prefixed
	 * according to the quote level it was written in</li>
	 * <li>block elements and "br" start a new line, elements with an "href"
	 * are replaced by a numbered footnote reference</li>
	 * </ul>
	 * The lines are then cleaned up (non-breaking spaces replaced by spaces,
	 * runs of spaces reduced to a single one, successive blank lines reduced
	 * to a single one) and the footnotes are appended after 4 blank lines.
	 * 
	 * @param element
	 *            the element to process (can be NULL, no lines are then
	 *            produced)
	 * @param elementProcessor
	 *            the element processor, must not be NULL
	 * 
	 * @return text lines, each line is a paragraph
	 */
	static protected List<String> toLines(Element element,
			final ElementProcessor elementProcessor) {
		final List<String> lines = new ArrayList<String>();
		final StringBuilder currentLine = new StringBuilder();
		// Depths at which a quote was entered: its size is the quote level
		final List<Integer> quoted = new ArrayList<Integer>();
		// Nodes handled as a whole: their descendants must be skipped
		final List<Node> ignoredNodes = new ArrayList<Node>();
		final List<String> footnotes = new ArrayList<String>();

		if (element != null) {
			new NodeTraversor(new NodeVisitor() {
				@Override
				public void head(Node node, int depth) {
					boolean ignore = elementProcessor.ignoreNode(node)
							|| ignoredNodes.contains(node.parentNode());

					// Manual processing
					if (!ignore) {
						String manual = elementProcessor
								.manualProcessing(node);
						if (manual != null) {
							currentLine.append(manual);
							ignore = true;
						}
					}

					// Subtitle check
					if (!ignore) {
						String subtitle = elementProcessor.isSubtitle(node);
						if (subtitle != null) {
							subtitle = subtitle.trim();
							currentLine.append("\n[ " + subtitle + " ]\n");
							ignore = true;
						}
					}

					// <pre> check
					if (!ignore && node instanceof Element) {
						Element el = (Element) node;
						if ("pre".equals(el.tagName())) {
							currentLine.append(StringUtils.unhtml(el.text())
									.trim());
							ignore = true;
						}
					}

					if (ignore) {
						// Remember it so the children are skipped, too
						ignoredNodes.add(node);
						return;
					}

					// The pending text was written at the quote level in
					// force BEFORE this node: remember it before the update
					int previousLevel = quoted.size();

					boolean enterQuote = elementProcessor.detectQuote(node);
					boolean leaveQuote = quoted.contains(depth);

					if (enterQuote) {
						quoted.add(depth);
					}

					if (leaveQuote) {
						quoted.remove(Integer.valueOf(depth));
					}

					if (enterQuote || leaveQuote) {
						flushCurrentLine(lines, currentLine,
								quotePrefix(previousLevel));
					}

					if (node instanceof Element) {
						Element elem = (Element) node;
						boolean block = elem.isBlock()
								|| elem.tagName().equalsIgnoreCase("br");
						if (block && currentLine.length() > 0) {
							currentLine.append("\n");
						}

						String href = elem.absUrl("href");
						if (!href.trim().isEmpty()) {
							footnotes.add(href);
							currentLine.append("[" + footnotes.size() + "]");
						}
					} else if (node instanceof TextNode) {
						TextNode textNode = (TextNode) node;
						String line = StringUtil.normaliseWhitespace(textNode
								.getWholeText());

						currentLine.append(elementProcessor.processText(line));
						currentLine.append(" ");
					}
				}

				@Override
				public void tail(Node node, int depth) {
				}
			}).traverse(element);
		}

		// Whatever is still pending belongs to the current quote level
		flushCurrentLine(lines, currentLine, quotePrefix(quoted.size()));

		// Fix spaces and nbsp, remove multiple following blank lines
		List<String> linesCopy = new ArrayList<String>(lines.size());
		int blanks = 0;
		for (String raw : lines) {
			// The escape does not depend on the encoding of this file
			String line = raw.replace("\u00A0", " "); // nbsp -> space
			// A single pass would leave doubled spaces in longer runs
			while (line.contains("  ")) {
				line = line.replace("  ", " ");
			}
			line = line.trim();

			if (line.isEmpty()) {
				blanks++;
			} else {
				blanks = 0;
			}

			if (blanks < 2) {
				linesCopy.add(line);
			}
		}

		// Footnotes insertion
		if (footnotes.size() > 0) {
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			for (int i = 0; i < footnotes.size(); i++) {
				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
			}
		}

		return linesCopy;
	}

	/**
	 * Build the prefix of a line for the given quote level: one ">" sign per
	 * level, then a space.
	 * 
	 * @param level
	 *            the quote level (0 means not quoted)
	 * 
	 * @return the prefix
	 */
	static private String quotePrefix(int level) {
		StringBuilder prefix = new StringBuilder();
		for (int i = 0; i < level; i++) {
			prefix.append('>');
		}

		return prefix.append(' ').toString();
	}

	/**
	 * Move the pending text (if any) to the list of lines, one entry per line
	 * of text, each of them prepended with the given prefix, then clear the
	 * pending text.
	 * <p>
	 * A single trailing line break is dropped first.
	 * 
	 * @param lines
	 *            the list to add the lines to
	 * @param currentLine
	 *            the pending text, emptied by this method
	 * @param prefix
	 *            the prefix to prepend to each line
	 */
	static private void flushCurrentLine(List<String> lines,
			StringBuilder currentLine, String prefix) {
		int length = currentLine.length();
		if (length > 0) {
			if (currentLine.charAt(length - 1) == '\n') {
				currentLine.setLength(length - 1);
			}

			for (String line : currentLine.toString().split("\n")) {
				lines.add(prefix + line);
			}
		}

		currentLine.setLength(0);
	}

	/**
	 * Reformat the date if possible.
	 * <p>
	 * Two input formats are recognised (surrounding spaces are ignored):
	 * <ul>
	 * <li>a strictly positive number of seconds since the epoch</li>
	 * <li>a date of the form "yyyy-MM-dd'T'HH:mm:ssXXX"</li>
	 * </ul>
	 * Both are converted to "yyyy/MM/dd", using the default time zone. Any
	 * other value, including NULL and numbers too large to be converted into
	 * milliseconds, is returned unchanged.
	 * 
	 * @param date
	 *            the input date (can be NULL)
	 * 
	 * @return the reformatted date, or the same value if it was not parsable
	 */
	static private String date(String date) {
		if (date == null) {
			return null;
		}

		String trimmed = date.trim();
		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");

		long epoch = 0;
		try {
			epoch = Long.parseLong(trimmed);
		} catch (NumberFormatException e) {
			// Not a number of seconds: try the textual format below
			epoch = 0;
		}

		// The upper bound makes sure that 1000 * epoch cannot overflow
		if (epoch > 0 && epoch <= Long.MAX_VALUE / 1000) {
			return out.format(new Date(1000 * epoch));
		}

		try {
			Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
					.parse(trimmed);
			return out.format(dat);
		} catch (ParseException e) {
			return date;
		}
	}
}
Commit	Line	Data
73785268 NR	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
3e62b034 NR	4	import java.io.InputStream;
3e62b034 NR	5	import java.net.URL;
b34d1f35 NR	6	import java.text.ParseException;
b34d1f35 NR	7	import java.text.SimpleDateFormat;
27008a87	8	import java.util.ArrayList;
3e62b034	9	import java.util.Arrays;
b34d1f35	10	import java.util.Date;
73785268	11	import java.util.List;
3e62b034	12	import java.util.Map.Entry;
73785268	13
3e62b034	14	import org.jsoup.helper.DataUtil;
27008a87	15	import org.jsoup.helper.StringUtil;
3e62b034	16	import org.jsoup.nodes.Document;
27008a87 NR	17	import org.jsoup.nodes.Element;
	18	import org.jsoup.nodes.Node;
	19	import org.jsoup.nodes.TextNode;
27008a87 NR	20	import org.jsoup.select.NodeTraversor;
	21	import org.jsoup.select.NodeVisitor;
	22
3e62b034	23	import be.nikiroo.gofetch.data.Comment;
73785268	24	import be.nikiroo.gofetch.data.Story;
136ab801	25	import be.nikiroo.utils.Downloader;
3e62b034	26	import be.nikiroo.utils.StringUtils;
73785268	27
b34d1f35 NR	28	/**
	29	* Base class for website support.
	30	*
	31	* @author niki
	32	*/
73785268	33	public abstract class BasicSupport {
b34d1f35	34	/** The downloader to use for all websites. */
1ab7ff0a	35	static protected Downloader downloader = new Downloader("gofetcher");
136ab801	36
3e62b034 NR	37	static private String preselector;
	38
	39	private Type type;
	40
	41	/**
	42	* The website textual description, to add in the dispatcher page.
	43	* <p>
	44	* Should be short.
	45	*
	46	* @return the description
	47	*/
	48	abstract public String getDescription();
	49
b34d1f35	50	/**
3e62b034 NR	51	* The gopher "selector" to use for output.
	52	* <p>
	53	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	54	*
	55	* @return the selector
	56	*/
	57	public String getSelector() {
	58	return getSelector(type);
73785268 NR	59	}
73785268 NR	60
20217360	61	/**
3e62b034 NR	62	* The support type.
	63	*
	64	* @return the type
	65	*/
	66	public Type getType() {
	67	return type;
27008a87 NR	68	}
27008a87 NR	69
20217360	70	/**
3e62b034 NR	71	* List all the recent items, but only assure the ID and internal URL to
	72	* fetch it later on (until it has been fetched, the rest of the
	73	* {@link Story} is not confirmed).
20217360	74	*
3e62b034 NR	75	* @return the list of new stories
	76	*
	77	* @throws IOException
	78	* in case of I/O
20217360	79	*/
3e62b034 NR	80	public List<Story> list() throws IOException {
	81	List<Story> list = new ArrayList<Story>();
	82
	83	for (Entry<URL, String> entry : getUrls()) {
	84	URL url = entry.getKey();
	85	String defaultCateg = entry.getValue();
	86	if (defaultCateg == null) {
	87	defaultCateg = "";
	88	}
20217360	89
3e62b034 NR	90	InputStream in = downloader.open(url);
	91	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	92	List<Element> articles = getArticles(doc);
	93	for (Element article : articles) {
	94	String id = getArticleId(doc, article).trim();
	95	String title = getArticleTitle(doc, article).trim();
	96	String author = getArticleAuthor(doc, article).trim();
	97	String date = getArticleDate(doc, article).trim();
	98	String categ = getArticleCategory(doc, article, defaultCateg)
	99	.trim();
	100	String details = getArticleDetails(doc, article).trim();
	101	String intUrl = getArticleIntUrl(doc, article).trim();
	102	String extUrl = getArticleExtUrl(doc, article).trim();
	103	String content = getArticleContent(doc, article).trim();
	104
	105	if (id.isEmpty() && date.isEmpty()) {
	106	continue;
	107	}
20217360	108
1ab7ff0a NR	109	if (!id.isEmpty()) {
	110	while (id.length() < 10) {
	111	id = "0" + id;
	112	}
	113	} else {
3e62b034 NR	114	id = date.replace(":", "_").replace("+", "_");
3e62b034 NR	115	}
20217360	116
3e62b034	117	date = date(date);
b9afb12e	118
3e62b034 NR	119	list.add(new Story(getType(), id, title, author, date, categ,
	120	details, intUrl, extUrl, content));
	121	}
b9afb12e	122	}
3e62b034 NR	123
3e62b034 NR	124	return list;
20217360 NR	125	}
20217360 NR	126
3e62b034 NR	127	/**
	128	* The {@link URL}s to process for this website.
	129	*
	130	* @return the list of {@link URL}s
	131	*
	132	* @throws IOException
	133	* in case of I/O error
	134	*/
	135	abstract protected List<Entry<URL, String>> getUrls() throws IOException;
73785268	136
3e62b034 NR	137	/**
	138	* The article {@link Element}s of this document.
	139	*
	140	* @param doc
	141	* the main document for the current category
	142	*
	143	* @return the articles
	144	*/
	145	abstract protected List<Element> getArticles(Document doc);
73785268	146
100a8395	147	/**
3e62b034	148	* The ID of the article (defaults to the date element if empty).
100a8395	149	*
3e62b034 NR	150	* @param doc
	151	* the main document for the current category
	152	* @param article
	153	* the article to look into
100a8395	154	*
3e62b034 NR	155	* @return the ID
	156	*/
	157	abstract protected String getArticleId(Document doc, Element article);
	158
	159	/**
	160	* The article title to display.
	161	*
	162	* @param doc
	163	* the main document for the current category
	164	* @param article
	165	* the article to look into
	166	*
	167	* @return the title
	168	*/
	169	abstract protected String getArticleTitle(Document doc, Element article);
	170
	171	/**
	172	* The optional article author.
	173	*
	174	* @param doc
	175	* the main document for the current category
	176	* @param article
	177	* the article to look into
	178	*
	179	* @return the author
	180	*/
	181	abstract protected String getArticleAuthor(Document doc, Element article);
	182
	183	/**
	184	* The optional article date.
	185	*
	186	* @param doc
	187	* the main document for the current category
	188	* @param article
	189	* the article to look into
	190	*
	191	* @return the date
	192	*/
	193	abstract protected String getArticleDate(Document doc, Element article);
	194
	195	/**
	196	* the optional article category.
	197	*
	198	* @param doc
	199	* the main document for the current category
	200	* @param article
	201	* the article to look into
	202	* @param currentCategory
	203	* the currently listed category if any (can be NULL)
	204	*
	205	* @return the category
100a8395	206	*/
3e62b034 NR	207	abstract protected String getArticleCategory(Document doc, Element article,
	208	String currentCategory);
	209
	210	/**
	211	* the optional details of the article (can replace the date, author and
	212	* category, for instance).
	213	*
	214	* @param doc
	215	* the main document for the current category
	216	* @param article
	217	* the article to look into
	218	*
	219	* @return the details
	220	*/
	221	abstract protected String getArticleDetails(Document doc, Element article);
	222
	223	/**
	224	* The (required) {@link URL} that points to the news page on the supported
	225	* website.
	226	*
	227	* @param doc
	228	* the main document for the current category
	229	* @param article
	230	* the article to look into
	231	*
	232	* @return the internal {@link URL}
	233	*/
	234	abstract protected String getArticleIntUrl(Document doc, Element article);
	235
	236	/**
	237	* the optional {@link URL} that points to an external website for more
	238	* information.
	239	*
	240	* @param doc
	241	* the main document for the current category
	242	* @param article
	243	* the article to look into
	244	*
	245	* @return the external {@link URL}
	246	*/
	247	abstract protected String getArticleExtUrl(Document doc, Element article);
	248
	249	/**
	250	* The optional article short-content (not the full content, that will be
	251	* fetched by {@link BasicSupport#fetch(Story)}).
	252	*
	253	* @param doc
	254	* the main document for the current category
	255	* @param article
	256	* the article to look into
	257	*
	258	* @return the short content
	259	*/
	260	abstract protected String getArticleContent(Document doc, Element article);
73785268	261
5c056aad NR	262	/**
	263	* Fetch the full article content as well as all the comments associated to
	264	* this {@link Story}, if any (can be empty, but not NULL).
	265	*
	266	* @param story
	267	* the story to fetch the comments of
	268	*
	269	* @throws IOException
	270	* in case of I/O error
	271	*/
3e62b034 NR	272	public void fetch(Story story) throws IOException {
	273	String fullContent = "";
	274
	275	URL url = new URL(story.getUrlInternal());
	276	InputStream in = downloader.open(url);
	277	try {
	278	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	279	Element article = getFullArticle(doc);
	280	if (article != null) {
	281	StringBuilder builder = new StringBuilder();
	282	ElementProcessor eProc = getElementProcessorFullArticle();
	283	if (eProc != null) {
	284	for (String line : toLines(article, eProc)) {
	285	builder.append(line + "\n");
	286	}
	287	} else {
	288	builder.append(article.text());
	289	}
	290
	291	// Content is too tight with a single break per line:
	292	fullContent = builder.toString().replace("\n", "\n\n") //
	293	.replace("\n\n\n\n", "\n\n") //
	294	.replace("\n\n\n\n", "\n\n") //
	295	.trim();
	296	}
	297
	298	if (fullContent.isEmpty()) {
	299	fullContent = story.getContent();
	300	}
	301
	302	story.setFullContent(fullContent);
	303	story.setComments(getComments(doc,
	304	getFullArticleCommentPosts(doc, url)));
	305	} finally {
	306	if (in != null) {
	307	in.close();
	308	}
	309	}
	310	}
73785268	311
b34d1f35	312	/**
3e62b034	313	* Return the full article if available.
b34d1f35	314	*
3e62b034 NR	315	* @param doc
	316	* the (full article) document to work on
	317	*
	318	* @return the article or NULL
b34d1f35	319	*/
3e62b034	320	abstract protected Element getFullArticle(Document doc);
2d95a873	321
b34d1f35	322	/**
3e62b034 NR	323	* Return the list of comment {@link Element}s from this optional container
	324	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	325	*
	326	* @param doc
	327	* the (full article) document to work on
	328	* @param intUrl
	329	* the internal {@link URL} this article wa taken from (the
	330	* {@link URL} from the supported website)
	331	*
	332	* @return the list of comment posts
	333	*/
	334	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
	335	URL intUrl);
	336
	337	/**
	338	* The {@link ElementProcessor} to use to convert the main article element
	339	* (see {@link BasicSupport#getFullArticle(Document)}) into text.
b34d1f35	340	* <p>
3e62b034 NR	341	* See {@link BasicElementProcessor} for a working, basic implementation.
	342	* <p>
	343	* Can be NULL to simply use {@link Element#text()}.
b34d1f35	344	*
3e62b034	345	* @return the processor, or NULL
b34d1f35	346	*/
3e62b034	347	abstract protected ElementProcessor getElementProcessorFullArticle();
73785268	348
b34d1f35	349	/**
3e62b034	350	* Convert the comment elements into {@link Comment}s
b34d1f35	351	*
3e62b034 NR	352	* @param doc
	353	* the document we work on
	354	* @param posts
	355	* the comment elements
	356	*
	357	* @return the converted {@link Comment}s
b34d1f35	358	*/
3e62b034 NR	359	private List<Comment> getComments(Document doc, List<Element> posts) {
	360	List<Comment> comments = new ArrayList<Comment>();
	361	if (posts != null) {
	362	for (Element post : posts) {
	363	String id = getCommentId(post).trim();
	364	String author = getCommentAuthor(post).trim();
	365	String title = getCommentTitle(post).trim();
	366	String date = getCommentDate(post).trim();
	367
	368	List<String> content = new ArrayList<String>();
	369
	370	if (id.isEmpty()) {
	371	id = date;
	372	}
	373
	374	date = date(date);
	375
	376	Element contentE = getCommentContentElement(post);
	377	if (contentE != null) {
	378	ElementProcessor eProc = getElementProcessorComment();
	379	if (eProc != null) {
	380	for (String line : toLines(contentE, eProc)) {
	381	content.add(line);
	382	}
	383	} else {
	384	content = Arrays.asList(contentE.text().split("\n"));
	385	}
	386	}
	387
	388	Comment comment = new Comment(id, author, title, date, content);
	389	comment.addAll(getComments(doc,
	390	getCommentCommentPosts(doc, post)));
	391
	392	if (!comment.isEmpty()) {
	393	comments.add(comment);
	394	}
	395	}
	396	}
	397
	398	return comments;
73785268 NR	399	}
73785268 NR	400
3e62b034 NR	401	/**
	402	* Return the list of subcomment {@link Element}s from this comment element
	403	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	404	*
	405	* @param doc
	406	* the (full article) document to work on
	407	* @param container
	408	* the container (a comment {@link Element})
	409	*
	410	* @return the list of comment posts
	411	*/
	412	abstract protected List<Element> getCommentCommentPosts(Document doc,
	413	Element container);
	414
	415	/**
	416	* Compute the ID of the given comment element.
	417	*
	418	* @param post
	419	* the comment element
	420	*
	421	* @return the ID
	422	*/
	423	abstract protected String getCommentId(Element post);
	424
	425	/**
	426	* Compute the author of the given comment element.
	427	*
	428	* @param post
	429	* the comment element
	430	*
	431	* @return the author
	432	*/
	433	abstract protected String getCommentAuthor(Element post);
	434
	435	/**
	436	* Compute the title of the given comment element.
	437	*
	438	* @param post
	439	* the comment element
	440	*
	441	* @return the title
	442	*/
	443	abstract protected String getCommentTitle(Element post);
	444
	445	/**
	446	* Compute the date of the given comment element.
	447	*
	448	* @param post
	449	* the comment element
	450	*
	451	* @return the date
	452	*/
	453	abstract protected String getCommentDate(Element post);
	454
	455	/**
	456	* Get the main of the given comment element, which can be NULL.
	457	*
	458	* @param post
	459	* the comment element
	460	*
	461	* @return the element
	462	*/
	463	abstract protected Element getCommentContentElement(Element post);
	464
465	/**
466	* The {@link ElementProcessor} to use to convert the main comment element
467	* (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
468	* <p>
469	* See {@link BasicElementProcessor} for a working, basic implementation.
470	* <p>
471	* Can be NULL to simply use {@link Element#text()}.
472	*
473	* @return the processor
474	*/
475	abstract protected ElementProcessor getElementProcessorComment();
476
b34d1f35 NR	477	/**
	478	* The support type.
	479	*
	480	* @param type
	481	* the new type
	482	*/
73785268 NR	483	protected void setType(Type type) {
	484	this.type = type;
	485	}
	486
	487	/**
b34d1f35 NR	488	* The {@link String} to append to the selector (the selector will be
	489	* constructed as "this string" then "/type/".
	490	*
73785268 NR	491	* @param preselector
	492	* the preselector to set
	493	*/
	494	static public void setPreselector(String preselector) {
	495	BasicSupport.preselector = preselector;
	496	}
	497
20217360 NR	498	/**
	499	* Return a {@link BasicSupport} that is compatible with the given
	500	* {@link Type} if it exists (or NULL if not).
	501	*
	502	* @param type
	503	* the type
	504	*
	505	* @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	506	*/
73785268 NR	507	static public BasicSupport getSupport(Type type) {
	508	BasicSupport support = null;
	509
	510	if (type != null) {
	511	switch (type) {
	512	case SLASHDOT:
	513	support = new Slashdot();
	514	break;
2d95a873 NR	515	case PIPEDOT:
	516	support = new Pipedot();
	517	break;
eaaeae39 NR	518	case LWN:
	519	support = new LWN();
	520	break;
100a8395 NR	521	case LEMONDE:
	522	support = new LeMonde();
	523	break;
d28c4aac NR	524	case REGISTER:
	525	support = new TheRegister();
	526	break;
b34d1f35	527	case TOO_LINUX:
cd555a1e NR	528	support = new TooLinux();
cd555a1e NR	529	break;
31755801 NR	530	case ERE_NUMERIQUE:
	531	support = new EreNumerique();
	532	break;
127e065f NR	533	case PHORONIX:
	534	support = new Phoronix();
	535	break;
73785268 NR	536	}
	537
	538	if (support != null) {
	539	support.setType(type);
	540	}
	541	}
	542
	543	return support;
	544	}
	545
b34d1f35 NR	546	/**
	547	* The gopher "selector" to use for output for this type, using the
	548	* preselector.
	549	* <p>
	550	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	551	*
	552	* @param type
	553	* the type to get the selector of
	554	*
	555	* @return the selector
	556	*/
73785268 NR	557	static public String getSelector(Type type) {
	558	return preselector + "/" + type + "/";
	559	}
	560
20217360 NR	561	/**
	562	* Process the given element into text (each line is a text paragraph and
	563	* can be prepended with ">" signs to indicate a quote or sub-quote or
	564	* sub-sub-quote...).
	565	*
	* <p>
	* The element is traversed node by node. Nodes refused by the processor,
	* nodes it converts manually, subtitles and "pre" elements are handled as
	* a whole: their children are skipped.
	* <p>
	* Every element with a non-empty absolute "href" is replaced by a "[n]"
	* marker; the matching URLs are listed at the end of the result, after
	* four empty lines.
	* <p>
	* The resulting lines are trimmed, and at most one blank line is kept
	* between two non-blank lines.
	* <p>
	* A NULL element results in an empty list.
	*
	566	* @param element
	567	* the element to process
	568	* @param elementProcessor
	569	* the element processor, must not be NULL
	570	*
	571	* @return text lines, each line is a paragraph
	572	*/
27008a87	573	static protected List<String> toLines(Element element,
20217360	574	final ElementProcessor elementProcessor) {
	// The finished lines, already prefixed with their quote markers
27008a87 NR	575	final List<String> lines = new ArrayList<String>();
	// The text gathered since the last flush (can contain "\n")
	576	final StringBuilder currentLine = new StringBuilder();
	// The depths at which a quote was entered and not left yet
	577	final List<Integer> quoted = new ArrayList<Integer>();
	// The nodes that were skipped; their children are skipped too
	578	final List<Node> ignoredNodes = new ArrayList<Node>();
	// The link targets, in order of appearance ("[1]" is the first one)
3e62b034	579	final List<String> footnotes = new ArrayList<String>();
27008a87 NR	580
	581	if (element != null) {
	582	new NodeTraversor(new NodeVisitor() {
	583	@Override
	584	public void head(Node node, int depth) {
100a8395	585	String manual = null;
	// Skipped if the processor says so, or if the parent was skipped
20217360	586	boolean ignore = elementProcessor.ignoreNode(node)
100a8395	587	\|\| ignoredNodes.contains(node.parentNode());
b9afb12e	588	// Manual processing
100a8395	589	if (!ignore) {
20217360	590	manual = elementProcessor.manualProcessing(node);
100a8395 NR	591	if (manual != null) {
	592	currentLine.append(manual);
	593	ignore = true;
	594	}
	595	}
	596
b9afb12e NR	597	// Subtitle check
	598	if (!ignore) {
	599	String subtitle = elementProcessor.isSubtitle(node);
	600	if (subtitle != null) {
	601	subtitle = subtitle.trim();
	602	currentLine.append("\n[ " + subtitle + " ]\n");
	603	ignore = true;
	604	}
	605	}
	606
3e62b034 NR	607	// <pre> check
	608	if (!ignore) {
	609	if (node instanceof Element) {
	610	Element el = (Element) node;
	611	if ("pre".equals(el.tagName())) {
	612	currentLine.append(StringUtils
	613	.unhtml(el.text()).trim());
	614	ignore = true;
	615	}
	616	}
	617	}
	618
	// Remember the node so its children are skipped, then stop here
100a8395	619	if (ignore) {
27008a87 NR	620	ignoredNodes.add(node);
	621	return;
	622	}
	623
	// The prefix is computed BEFORE the quote state is updated below:
	// the pending text belongs to the previous quote level
	624	String prep = "";
	625	for (int i = 0; i < quoted.size(); i++) {
	626	prep += ">";
	627	}
	628	prep += " ";
	629
20217360	630	boolean enterQuote = elementProcessor.detectQuote(node);
	// A quote entered at this depth ends when another node is reached
	// at that same depth
27008a87 NR	631	boolean leaveQuote = quoted.contains(depth);
	632
	633	if (enterQuote) {
	634	quoted.add(depth);
	635	}
	636
	637	if (leaveQuote) {
	// Integer.valueOf: remove the value, not the item at this index
	638	quoted.remove(Integer.valueOf(depth));
	639	}
	640
	// The quote level changes: flush the pending text as prefixed lines
	641	if (enterQuote \|\| leaveQuote) {
	642	if (currentLine.length() > 0) {
	643	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	644	currentLine.setLength(currentLine.length() - 1);
	645	}
	646	for (String l : currentLine.toString().split("\n")) {
	647	lines.add(prep + l);
	648	}
	649	}
	650	currentLine.setLength(0);
	651	}
	652
	653	if (node instanceof Element) {
	654	Element element = (Element) node;
	// Block elements and <br> start a new line, unless nothing is
	// pending yet
	655	boolean block = element.isBlock()
	656	\|\| element.tagName().equalsIgnoreCase("br");
	657	if (block && currentLine.length() > 0) {
	658	currentLine.append("\n");
	659	}
3e62b034 NR	660
	// Any element with an absolute "href" becomes a numbered footnote
	661	if (!element.absUrl("href").trim().isEmpty()) {
	662	footnotes.add(element.absUrl("href"));
	663	currentLine.append("[" + footnotes.size() + "]");
	664	}
27008a87 NR	665	} else if (node instanceof TextNode) {
	666	TextNode textNode = (TextNode) node;
	667	String line = StringUtil.normaliseWhitespace(textNode
	668	.getWholeText());
	669
20217360	670	currentLine.append(elementProcessor.processText(line));
	671	currentLine.append(" ");
	672	}
	673	}
	674
	675	@Override
	676	public void tail(Node node, int depth) {
	// Nothing to do: everything is handled in head()
	677	}
	678	}).traverse(element);
	679	}
	680
	// Flush what is still pending after the traversal, with the quote
	// level that is still active at that point
	681	if (currentLine.length() > 0) {
	682	String prep = "";
	683	for (int i = 0; i < quoted.size(); i++) {
	684	prep += ">";
	685	}
	686	prep += " ";
	// (this check is redundant with the enclosing one)
	687	if (currentLine.length() > 0) {
	688	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	689	currentLine.setLength(currentLine.length() - 1);
	690	}
	691	for (String l : currentLine.toString().split("\n")) {
	692	lines.add(prep + l);
	693	}
	694	}
	695	}
	696
3e62b034 NR	697	// Fix spaces and nbsp, remove multiple following blank lines
	698	List<String> linesCopy = new ArrayList<String>(lines.size());
	// The number of blank lines seen in a row
	699	long blanks = 0;
27008a87	700	for (int i = 0; i < lines.size(); i++) {
	// NOTE(review): both replace() calls look identical in this view;
	// presumably the first one targets a non-breaking space and the
	// second one a different sequence -- confirm against the raw file
3e62b034 NR	701	String line = lines.get(i).replace(" ", " ") // nbsp -> space
	702	.replace(" ", " ").trim();
	703	if (line.isEmpty()) {
	704	blanks++;
	705	} else {
	706	blanks = 0;
	707	}
	708
	// Only the first blank line of a series is kept
	709	if (blanks < 2) {
	710	linesCopy.add(line);
	711	}
	712	}
	713
	714	// Footnotes insertion
	715	if (footnotes.size() > 0) {
	// Four empty lines separate the text from the list of links; they
	// are added after the blank line clean-up, so they are all kept
	716	linesCopy.add("");
	717	linesCopy.add("");
	718	linesCopy.add("");
	719	linesCopy.add("");
	720	for (int i = 0; i < footnotes.size(); i++) {
	721	linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
	722	}
27008a87 NR	723	}
27008a87 NR	724
3e62b034	725	return linesCopy;
b34d1f35 NR	726	}
	727
	728	/**
	729	* Reformat the date if possible.
	730	*
	731	* @param date
	732	* the input date
	733	*
	734	* @return the reformated date, or the same value if it was not parsable
	735	*/
3e62b034	736	static private String date(String date) {
b34d1f35 NR	737	SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
	738
	739	long epoch = 0;
	740	try {
c9cffa91	741	epoch = Long.parseLong(date.trim());
b34d1f35 NR	742	} catch (Exception e) {
b34d1f35 NR	743	epoch = 0;
880740c4 NR	744	}
880740c4 NR	745
b34d1f35 NR	746	if (epoch > 0) {
	747	return out.format(new Date(1000 * epoch));
	748	}
	749
	750	try {
	751	Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
	752	.parse(date.trim());
	753	return out.format(dat);
	754	} catch (ParseException e) {
	755	return date;
	756	}
27008a87	757	}
73785268	758	}