[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
import be.nikiroo.utils.StringUtils;

/**
 * Base class for website support.
 * 
 * @author niki
 */
public abstract class BasicSupport {
	/**
	 * The downloader to use for all websites via {@link BasicSupport#open(URL)}
	 */
	static private Downloader downloader = new Downloader("gofetcher");

	/**
	 * The prefix put in front of every selector built by
	 * {@link BasicSupport#getSelector(Type)}; it stays NULL until
	 * {@link BasicSupport#setPreselector(String)} is called.
	 */
	static private String preselector;

	/**
	 * The type of this support, as set by {@link BasicSupport#setType(Type)}.
	 */
	private Type type;

	/**
	 * The textual description of the supported website, to add in the
	 * dispatcher page.
	 * <p>
	 * Should be short.
	 * 
	 * @return the description
	 */
	abstract public String getDescription();

	/**
	 * The gopher "selector" to use for output for this support.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * This is the value computed by {@link BasicSupport#getSelector(Type)} for
	 * the {@link Type} returned by {@link BasicSupport#getType()}.
	 * 
	 * @return the selector
	 */
	public String getSelector() {
		return getSelector(getType());
	}

	/**
	 * The support type, as previously given to
	 * {@link BasicSupport#setType(Type)}.
	 * 
	 * @return the type (NULL if it was never set)
	 */
	public Type getType() {
		return type;
	}

	/**
	 * List all the recent items, but only assure the ID and internal URL to
	 * fetch it later on (until it has been fetched, the rest of the
	 * {@link Story} is not confirmed).
	 * <p>
	 * Each {@link URL} from {@link BasicSupport#getUrls()} is downloaded and
	 * parsed, then every article element found in it is converted into a
	 * {@link Story}. Articles that have neither an ID nor a date are skipped.
	 * 
	 * @return the list of new stories
	 * 
	 * @throws IOException
	 *             in case of I/O
	 */
	public List<Story> list() throws IOException {
		List<Story> list = new ArrayList<Story>();

		for (Entry<URL, String> entry : getUrls()) {
			URL url = entry.getKey();
			String defaultCateg = entry.getValue();
			if (defaultCateg == null) {
				defaultCateg = "";
			}

			InputStream in = open(url);
			try {
				Document doc = DataUtil.load(in, "UTF-8", url.toString());
				List<Element> articles = getArticles(doc);
				for (Element article : articles) {
					String id = getArticleId(doc, article).trim();
					String title = getArticleTitle(doc, article).trim();
					String author = getArticleAuthor(doc, article).trim();
					String date = getArticleDate(doc, article).trim();
					String categ = getArticleCategory(doc, article,
							defaultCateg).trim();
					String details = getArticleDetails(doc, article).trim();
					String intUrl = getArticleIntUrl(doc, article).trim();
					String extUrl = getArticleExtUrl(doc, article).trim();
					String content = getArticleContent(doc, article).trim();

					// Nothing usable to identify this article
					if (id.isEmpty() && date.isEmpty()) {
						continue;
					}

					if (!id.isEmpty()) {
						// Left-pad with zeros up to 10 characters
						while (id.length() < 10) {
							id = "0" + id;
						}
					} else {
						// No ID: derive one from the raw (not yet
						// reformatted) date
						id = date.replace(":", "_").replace("+", "_");
					}

					date = date(date);

					list.add(new Story(getType(), id, title, author, date,
							categ, details, intUrl, extUrl, content));
				}
			} finally {
				// Same policy as fetch(Story): always release the network
				// stream, even if the parsing or an accessor fails
				if (in != null) {
					in.close();
				}
			}
		}

		return list;
	}

	/**
	 * The {@link URL}s to process for this website.
	 * <p>
	 * Each entry associates a {@link URL} to list with the default category
	 * that {@link BasicSupport#list()} hands over to
	 * {@link BasicSupport#getArticleCategory(Document, Element, String)}; a
	 * NULL value is replaced by an empty {@link String} there.
	 * 
	 * @return the list of {@link URL}s (must not be NULL, it is iterated over
	 *         directly)
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract protected List<Entry<URL, String>> getUrls() throws IOException;

	/**
	 * The article {@link Element}s of this document.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * 
	 * @return the articles (must not be NULL: {@link BasicSupport#list()}
	 *         iterates over the result directly)
	 */
	abstract protected List<Element> getArticles(Document doc);

	/**
	 * The ID of the article (defaults to the date element if empty).
	 * <p>
	 * In {@link BasicSupport#list()}, a non-empty ID is left-padded with "0" up
	 * to 10 characters; an empty one is replaced by the article date in which
	 * ":" and "+" are turned into "_". An article with both an empty ID and an
	 * empty date is skipped.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the ID (must not be NULL as the result is trimmed, use an empty
	 *         {@link String} instead)
	 */
	abstract protected String getArticleId(Document doc, Element article);

	/**
	 * The article title to display.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the title (must not be NULL: {@link BasicSupport#list()} trims
	 *         the result)
	 */
	abstract protected String getArticleTitle(Document doc, Element article);

	/**
	 * The optional article author.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the author, or an empty {@link String} if there is none (must
	 *         not be NULL: {@link BasicSupport#list()} trims the result)
	 */
	abstract protected String getArticleAuthor(Document doc, Element article);

	/**
	 * The optional article date.
	 * <p>
	 * {@link BasicSupport#list()} reformats it to "yyyy/MM/dd" when it is
	 * either a positive number of seconds since the epoch or a date in the
	 * "yyyy-MM-dd'T'HH:mm:ssXXX" format; any other value is kept as is. The
	 * raw value is also used to build the ID when the article ID is empty.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the date, or an empty {@link String} if there is none (must not
	 *         be NULL: the result is trimmed)
	 */
	abstract protected String getArticleDate(Document doc, Element article);

	/**
	 * The optional article category.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * @param currentCategory
	 *            the currently listed category if any
	 *            ({@link BasicSupport#list()} passes an empty {@link String},
	 *            not NULL, when the listed {@link URL} has no default
	 *            category)
	 * 
	 * @return the category, or an empty {@link String} if there is none (must
	 *         not be NULL: the result is trimmed)
	 */
	abstract protected String getArticleCategory(Document doc, Element article,
			String currentCategory);

	/**
	 * The optional details of the article (can replace the date, author and
	 * category, for instance).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the details, or an empty {@link String} if there are none (must
	 *         not be NULL: {@link BasicSupport#list()} trims the result)
	 */
	abstract protected String getArticleDetails(Document doc, Element article);

	/**
	 * The (required) {@link URL} that points to the news page on the supported
	 * website.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the internal {@link URL}, as text (must not be NULL:
	 *         {@link BasicSupport#list()} trims the result)
	 */
	abstract protected String getArticleIntUrl(Document doc, Element article);

	/**
	 * The optional {@link URL} that points to an external website for more
	 * information.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the external {@link URL} as text, or an empty {@link String} if
	 *         there is none (must not be NULL: {@link BasicSupport#list()}
	 *         trims the result)
	 */
	abstract protected String getArticleExtUrl(Document doc, Element article);

	/**
	 * The optional article short-content (not the full content, that will be
	 * fetched by {@link BasicSupport#fetch(Story)}).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the short content, or an empty {@link String} if there is none
	 *         (must not be NULL: {@link BasicSupport#list()} trims the result)
	 */
	abstract protected String getArticleContent(Document doc, Element article);

	/**
	 * Download the page of the given {@link Story} and fill in its full
	 * content and its comments (the comments can be empty, but not NULL).
	 * <p>
	 * The full content is built from the element returned by
	 * {@link BasicSupport#getFullArticle(Document)}; if this yields no text,
	 * the short content of the {@link Story} is used instead.
	 * 
	 * @param story
	 *            the story to complete
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public void fetch(Story story) throws IOException {
		URL storyUrl = new URL(story.getUrlInternal());
		InputStream stream = open(storyUrl);
		try {
			Document page = DataUtil.load(stream, "UTF-8", storyUrl.toString());

			String text = "";
			Element fullArticle = getFullArticle(page);
			if (fullArticle != null) {
				StringBuilder raw = new StringBuilder();
				ElementProcessor processor = getElementProcessorFullArticle();
				if (processor == null) {
					// No processor: plain text of the element
					raw.append(fullArticle.text());
				} else {
					for (String paragraph : toLines(fullArticle, processor)) {
						raw.append(paragraph).append("\n");
					}
				}

				// A single break per line is too tight: double every break,
				// then shrink the longest runs again (two passes)
				String spaced = raw.toString().replace("\n", "\n\n");
				for (int pass = 0; pass < 2; pass++) {
					spaced = spaced.replace("\n\n\n\n", "\n\n");
				}

				text = spaced.trim();
			}

			// Fall back to the short content when nothing was extracted
			if (text.isEmpty()) {
				text = story.getContent();
			}

			story.setFullContent(text);

			List<Element> posts = getFullArticleCommentPosts(page, storyUrl);
			story.setComments(getComments(page, posts));
		} finally {
			if (stream != null) {
				stream.close();
			}
		}
	}

	/**
	 * Return the full article if available.
	 * <p>
	 * When NULL is returned, {@link BasicSupport#fetch(Story)} falls back to
	 * the short content of the {@link Story}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * 
	 * @return the article or NULL
	 */
	abstract protected Element getFullArticle(Document doc);

	/**
	 * Return the list of comment {@link Element}s from this optional container
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param intUrl
	 *            the internal {@link URL} this article was taken from (the
	 *            {@link URL} from the supported website)
	 * 
	 * @return the list of comment posts (can be NULL, which is handled as "no
	 *         comments")
	 */
	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
			URL intUrl);

	/**
	 * The {@link ElementProcessor} to use to convert the main article element
	 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()}.
	 * <p>
	 * It is only asked for by {@link BasicSupport#fetch(Story)} when a full
	 * article element was found.
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorFullArticle();

	/**
	 * Open a network resource through the shared {@link Downloader} of this
	 * class.
	 * <p>
	 * You need to close the returned {@link InputStream} when done.
	 * 
	 * @param url
	 *            the source to open
	 * 
	 * @return the content
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	protected InputStream open(URL url) throws IOException {
		return downloader.open(url);
	}

	/**
	 * Build the {@link Comment}s (and, recursively, their sub-comments) that
	 * correspond to the given comment elements.
	 * <p>
	 * A comment without ID uses its raw date as ID, and comments that report
	 * themselves as empty are left out of the result.
	 * 
	 * @param doc
	 *            the document we work on
	 * @param posts
	 *            the comment elements (can be NULL)
	 * 
	 * @return the converted {@link Comment}s (never NULL)
	 */
	private List<Comment> getComments(Document doc, List<Element> posts) {
		List<Comment> result = new ArrayList<Comment>();
		if (posts == null) {
			return result;
		}

		for (Element post : posts) {
			String rawId = getCommentId(post).trim();
			String author = getCommentAuthor(post).trim();
			String title = getCommentTitle(post).trim();
			String rawDate = getCommentDate(post).trim();

			// The ID falls back to the date as it was found in the page
			String commentId = rawId.isEmpty() ? rawDate : rawId;
			String displayDate = date(rawDate);

			List<String> body = new ArrayList<String>();
			Element bodyElement = getCommentContentElement(post);
			if (bodyElement != null) {
				ElementProcessor processor = getElementProcessorComment();
				if (processor == null) {
					// No processor: plain text of the element, line by line
					body = Arrays.asList(bodyElement.text().split("\n"));
				} else {
					body.addAll(toLines(bodyElement, processor));
				}
			}

			Comment comment = new Comment(commentId, author, title,
					displayDate, body);

			// Answers to this comment
			List<Element> replies = getCommentCommentPosts(doc, post);
			comment.addAll(getComments(doc, replies));

			if (!comment.isEmpty()) {
				result.add(comment);
			}
		}

		return result;
	}

	/**
	 * Return the list of subcomment {@link Element}s from this comment element
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param container
	 *            the container (a comment {@link Element})
	 * 
	 * @return the list of comment posts (can be NULL, which is handled as "no
	 *         subcomments")
	 */
	abstract protected List<Element> getCommentCommentPosts(Document doc,
			Element container);

	/**
	 * Compute the ID of the given comment element.
	 * <p>
	 * When the ID is empty, the comment date is used as ID instead.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the ID (must not be NULL as the result is trimmed, use an empty
	 *         {@link String} instead)
	 */
	abstract protected String getCommentId(Element post);

	/**
	 * Compute the author of the given comment element.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the author (must not be NULL as the result is trimmed, use an
	 *         empty {@link String} instead)
	 */
	abstract protected String getCommentAuthor(Element post);

	/**
	 * Compute the title of the given comment element.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the title (must not be NULL as the result is trimmed, use an
	 *         empty {@link String} instead)
	 */
	abstract protected String getCommentTitle(Element post);

	/**
	 * Compute the date of the given comment element.
	 * <p>
	 * The value is reformatted to "yyyy/MM/dd" when it is either a positive
	 * number of seconds since the epoch or a date in the
	 * "yyyy-MM-dd'T'HH:mm:ssXXX" format; any other value is kept as is.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the date (must not be NULL as the result is trimmed, use an
	 *         empty {@link String} instead)
	 */
	abstract protected String getCommentDate(Element post);

	/**
	 * Get the main content element of the given comment element, which can be
	 * NULL (the comment then has no content lines).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the element, or NULL
	 */
	abstract protected Element getCommentContentElement(Element post);

	/**
	 * The {@link ElementProcessor} to use to convert the main comment element
	 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()} (split on line breaks).
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorComment();

	/**
	 * Set the support type, i.e., the value that
	 * {@link BasicSupport#getType()} will return from now on.
	 * 
	 * @param type
	 *            the new type
	 */
	protected void setType(Type type) {
		this.type = type;
	}

	/**
	 * The {@link String} to prepend to the selector (the selector will be
	 * constructed as "this string" then "/type/").
	 * <p>
	 * This is a static setting: it applies to all the supports at once.
	 * 
	 * @param preselector
	 *            the preselector to set
	 */
	static public void setPreselector(String preselector) {
		BasicSupport.preselector = preselector;
	}

	/**
	 * Create the {@link BasicSupport} that handles the given {@link Type}.
	 * <p>
	 * The returned instance already has its type set.
	 * 
	 * @param type
	 *            the type (can be NULL)
	 * 
	 * @return a new, compatible {@link BasicSupport}, or NULL if the type is
	 *         NULL or has no associated support
	 */
	static public BasicSupport getSupport(Type type) {
		if (type == null) {
			return null;
		}

		BasicSupport instance;
		switch (type) {
		case SLASHDOT:
			instance = new Slashdot();
			break;
		case PIPEDOT:
			instance = new Pipedot();
			break;
		case LWN:
			instance = new LWN();
			break;
		case LEMONDE:
			instance = new LeMonde();
			break;
		case REGISTER:
			instance = new TheRegister();
			break;
		case TOO_LINUX:
			instance = new TooLinux();
			break;
		case ERE_NUMERIQUE:
			instance = new EreNumerique();
			break;
		case PHORONIX:
			instance = new Phoronix();
			break;
		default:
			// No support is known for this type
			instance = null;
			break;
		}

		if (instance == null) {
			return null;
		}

		instance.setType(type);
		return instance;
	}

	/**
	 * The gopher "selector" to use for output for this type, using the
	 * preselector.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * The selector is the preselector followed by "/", the type and "/". A
	 * preselector that was never set (or was set to NULL) counts as an empty
	 * prefix, so the text "null" never ends up in the path.
	 * 
	 * @param type
	 *            the type to get the selector of
	 * 
	 * @return the selector
	 */
	static public String getSelector(Type type) {
		String prefix = (preselector == null) ? "" : preselector;
		return prefix + "/" + type + "/";
	}

	/**
	 * Process the given element into text (each line is a text paragraph and
	 * can be prepended with ">" signs to indicate a quote or sub-quote or
	 * sub-sub-quote...).
	 * <p>
	 * Links (elements with a non-empty absolute "href") are replaced by a
	 * "[n]" marker, and the corresponding targets are listed at the end of the
	 * result as footnotes. Non-breaking spaces are converted to normal spaces,
	 * runs of spaces are collapsed, and no more than one blank line in a row
	 * is kept inside the text.
	 * 
	 * @param element
	 *            the element to process (NULL results in no lines)
	 * @param elementProcessor
	 *            the element processor, must not be NULL
	 * 
	 * @return text lines, each line is a paragraph
	 */
	static protected List<String> toLines(Element element,
			final ElementProcessor elementProcessor) {
		final List<String> lines = new ArrayList<String>();
		final StringBuilder currentLine = new StringBuilder();
		// Depths at which a quote was entered (one entry per open quote)
		final List<Integer> quoted = new ArrayList<Integer>();
		final List<Node> ignoredNodes = new ArrayList<Node>();
		final List<String> footnotes = new ArrayList<String>();

		if (element != null) {
			new NodeTraversor(new NodeVisitor() {
				@Override
				public void head(Node node, int depth) {
					// A node is ignored on request of the processor, or
					// because its parent was itself ignored
					boolean ignore = elementProcessor.ignoreNode(node)
							|| ignoredNodes.contains(node.parentNode());

					// Manual processing
					if (!ignore) {
						String manual = elementProcessor.manualProcessing(node);
						if (manual != null) {
							currentLine.append(manual);
							ignore = true;
						}
					}

					// Subtitle check
					if (!ignore) {
						String subtitle = elementProcessor.isSubtitle(node);
						if (subtitle != null) {
							subtitle = subtitle.trim();
							currentLine.append("\n[ " + subtitle + " ]\n");
							ignore = true;
						}
					}

					// <pre> check
					if (!ignore && node instanceof Element) {
						Element pre = (Element) node;
						if ("pre".equals(pre.tagName())) {
							currentLine.append(StringUtils.unhtml(pre.text())
									.trim());
							ignore = true;
						}
					}

					if (ignore) {
						// Remembered so the children are skipped, too
						ignoredNodes.add(node);
						return;
					}

					// The pending text belongs to the quote level that was in
					// effect BEFORE this node changes it
					int quoteLevel = quoted.size();

					boolean enterQuote = elementProcessor.detectQuote(node);
					boolean leaveQuote = quoted.contains(depth);

					if (enterQuote) {
						quoted.add(depth);
					}

					if (leaveQuote) {
						quoted.remove(Integer.valueOf(depth));
					}

					if (enterQuote || leaveQuote) {
						flushLines(currentLine, quoteLevel, lines);
					}

					if (node instanceof Element) {
						Element current = (Element) node;
						boolean block = current.isBlock()
								|| current.tagName().equalsIgnoreCase("br");
						if (block && currentLine.length() > 0) {
							currentLine.append("\n");
						}

						// Links become numbered footnotes
						if (!current.absUrl("href").trim().isEmpty()) {
							footnotes.add(current.absUrl("href"));
							currentLine.append("[" + footnotes.size() + "]");
						}
					} else if (node instanceof TextNode) {
						TextNode textNode = (TextNode) node;
						String line = StringUtil.normaliseWhitespace(textNode
								.getWholeText());

						currentLine.append(elementProcessor.processText(line));
						currentLine.append(" ");
					}
				}

				@Override
				public void tail(Node node, int depth) {
				}
			}).traverse(element);
		}

		// Whatever is still pending after the traversal
		flushLines(currentLine, quoted.size(), lines);

		// Fix spaces and nbsp, remove multiple following blank lines
		List<String> linesCopy = new ArrayList<String>(lines.size());
		long blanks = 0;
		for (String rawLine : lines) {
			// The non-breaking space (U+00A0) is written as an escape: as a
			// literal, it cannot be told apart from a normal space and is
			// easily lost when the file is edited or copied
			String line = rawLine.replace('\u00A0', ' ')
					// collapse whole runs, not only pairs, of spaces
					.replaceAll(" {2,}", " ").trim();
			if (line.isEmpty()) {
				blanks++;
			} else {
				blanks = 0;
			}

			if (blanks < 2) {
				linesCopy.add(line);
			}
		}

		// Footnotes insertion
		if (footnotes.size() > 0) {
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			for (int i = 0; i < footnotes.size(); i++) {
				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
			}
		}

		return linesCopy;
	}

	/**
	 * Move the pending text into the list of lines (one entry per line of
	 * text, each one prefixed by the quote marker), then reset the pending
	 * text.
	 * 
	 * @param currentLine
	 *            the pending text, emptied by this call
	 * @param quoteLevel
	 *            the number of ">" signs to put in front of each line
	 * @param lines
	 *            the list to add the lines to
	 */
	static private void flushLines(StringBuilder currentLine, int quoteLevel,
			List<String> lines) {
		if (currentLine.length() > 0) {
			StringBuilder marker = new StringBuilder();
			for (int i = 0; i < quoteLevel; i++) {
				marker.append('>');
			}
			marker.append(' ');
			String prep = marker.toString();

			// A single trailing line break does not start a new line
			if (currentLine.charAt(currentLine.length() - 1) == '\n') {
				currentLine.setLength(currentLine.length() - 1);
			}

			for (String l : currentLine.toString().split("\n")) {
				lines.add(prep + l);
			}
		}

		currentLine.setLength(0);
	}

	/**
	 * Reformat the date if possible.
	 * <p>
	 * Two inputs are recognised (surrounding spaces are ignored): a strictly
	 * positive number of seconds since the epoch, and a date in the
	 * "yyyy-MM-dd'T'HH:mm:ssXXX" format. Both are converted to "yyyy/MM/dd",
	 * using the default time zone of the system.
	 * 
	 * @param date
	 *            the input date (can be NULL)
	 * 
	 * @return the reformated date, or the same value if it was not parsable
	 *         (NULL for a NULL input)
	 */
	static private String date(String date) {
		if (date == null) {
			return null;
		}

		String trimmed = date.trim();
		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");

		long epoch;
		try {
			epoch = Long.parseLong(trimmed);
		} catch (NumberFormatException e) {
			// Not a plain number: the textual format is tried below
			epoch = 0;
		}

		// The upper bound keeps the conversion to milliseconds from
		// overflowing (such a value is handled as "not parsable")
		if (epoch > 0 && epoch <= Long.MAX_VALUE / 1000) {
			return out.format(new Date(1000 * epoch));
		}

		try {
			Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
					.parse(trimmed);
			return out.format(dat);
		} catch (ParseException e) {
			return date;
		}
	}
}
Commit	Line	Data
	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
	4	import java.io.InputStream;
	5	import java.net.URL;
	6	import java.text.ParseException;
	7	import java.text.SimpleDateFormat;
	8	import java.util.ArrayList;
	9	import java.util.Arrays;
	10	import java.util.Date;
	11	import java.util.List;
	12	import java.util.Map.Entry;
	13
	14	import org.jsoup.helper.DataUtil;
	15	import org.jsoup.helper.StringUtil;
	16	import org.jsoup.nodes.Document;
	17	import org.jsoup.nodes.Element;
	18	import org.jsoup.nodes.Node;
	19	import org.jsoup.nodes.TextNode;
	20	import org.jsoup.select.NodeTraversor;
	21	import org.jsoup.select.NodeVisitor;
	22
	23	import be.nikiroo.gofetch.data.Comment;
	24	import be.nikiroo.gofetch.data.Story;
	25	import be.nikiroo.utils.Downloader;
	26	import be.nikiroo.utils.StringUtils;
	27
	28	/**
	29	* Base class for website support.
	30	*
	31	* @author niki
	32	*/
	33	public abstract class BasicSupport {
	34	/**
	35	* The downloader to use for all websites via {@link BasicSupport#open(URL)}
	36	*/
	37	static private Downloader downloader = new Downloader("gofetcher");
	38
	39	static private String preselector;
	40
	41	private Type type;
	42
	43	/**
	44	* The website textual description, to add in the dispatcher page.
	45	* <p>
	46	* Should be short.
	47	*
	48	* @return the description
	49	*/
	50	abstract public String getDescription();
	51
	52	/**
	53	* The gopher "selector" to use for output.
	54	* <p>
	55	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	56	*
	57	* @return the selector
	58	*/
	59	public String getSelector() {
	60	return getSelector(getType());
	61	}
	62
	63	/**
	64	* The support type.
	65	*
	66	* @return the type
	67	*/
	68	public Type getType() {
	69	return type;
	70	}
	71
	72	/**
	73	* List all the recent items, but only assure the ID and internal URL to
	74	* fetch it later on (until it has been fetched, the rest of the
	75	* {@link Story} is not confirmed).
	76	*
	77	* @return the list of new stories
	78	*
	79	* @throws IOException
	80	* in case of I/O
	81	*/
	82	public List<Story> list() throws IOException {
	83	List<Story> list = new ArrayList<Story>();
	84
	85	for (Entry<URL, String> entry : getUrls()) {
	86	URL url = entry.getKey();
	87	String defaultCateg = entry.getValue();
	88	if (defaultCateg == null) {
	89	defaultCateg = "";
	90	}
	91
	92	InputStream in = open(url);
	93	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	94	List<Element> articles = getArticles(doc);
	95	for (Element article : articles) {
	96	String id = getArticleId(doc, article).trim();
	97	String title = getArticleTitle(doc, article).trim();
	98	String author = getArticleAuthor(doc, article).trim();
	99	String date = getArticleDate(doc, article).trim();
	100	String categ = getArticleCategory(doc, article, defaultCateg)
	101	.trim();
	102	String details = getArticleDetails(doc, article).trim();
	103	String intUrl = getArticleIntUrl(doc, article).trim();
	104	String extUrl = getArticleExtUrl(doc, article).trim();
	105	String content = getArticleContent(doc, article).trim();
	106
	107	if (id.isEmpty() && date.isEmpty()) {
	108	continue;
	109	}
	110
	111	if (!id.isEmpty()) {
	112	while (id.length() < 10) {
	113	id = "0" + id;
	114	}
	115	} else {
	116	id = date.replace(":", "_").replace("+", "_");
	117	}
	118
	119	date = date(date);
	120
	121	list.add(new Story(getType(), id, title, author, date, categ,
	122	details, intUrl, extUrl, content));
	123	}
	124	}
	125
	126	return list;
	127	}
	128
	129	/**
	130	* The {@link URL}s to process for this website.
	131	*
	132	* @return the list of {@link URL}s
	133	*
	134	* @throws IOException
	135	* in case of I/O error
	136	*/
	137	abstract protected List<Entry<URL, String>> getUrls() throws IOException;
	138
	139	/**
	140	* The article {@link Element}s of this document.
	141	*
	142	* @param doc
	143	* the main document for the current category
	144	*
	145	* @return the articles
	146	*/
	147	abstract protected List<Element> getArticles(Document doc);
	148
	149	/**
	150	* The ID of the article (defaults to the date element if empty).
	151	*
	152	* @param doc
	153	* the main document for the current category
	154	* @param article
	155	* the article to look into
	156	*
	157	* @return the ID
	158	*/
	159	abstract protected String getArticleId(Document doc, Element article);
	160
	161	/**
	162	* The article title to display.
	163	*
	164	* @param doc
	165	* the main document for the current category
	166	* @param article
	167	* the article to look into
	168	*
	169	* @return the title
	170	*/
	171	abstract protected String getArticleTitle(Document doc, Element article);
	172
	173	/**
	174	* The optional article author.
	175	*
	176	* @param doc
	177	* the main document for the current category
	178	* @param article
	179	* the article to look into
	180	*
	181	* @return the author
	182	*/
	183	abstract protected String getArticleAuthor(Document doc, Element article);
	184
	185	/**
	186	* The optional article date.
	187	*
	188	* @param doc
	189	* the main document for the current category
	190	* @param article
	191	* the article to look into
	192	*
	193	* @return the date
	194	*/
	195	abstract protected String getArticleDate(Document doc, Element article);
	196
	197	/**
	198	* the optional article category.
	199	*
	200	* @param doc
	201	* the main document for the current category
	202	* @param article
	203	* the article to look into
	204	* @param currentCategory
	205	* the currently listed category if any (can be NULL)
	206	*
	207	* @return the category
	208	*/
	209	abstract protected String getArticleCategory(Document doc, Element article,
	210	String currentCategory);
	211
	212	/**
	213	* the optional details of the article (can replace the date, author and
	214	* category, for instance).
	215	*
	216	* @param doc
	217	* the main document for the current category
	218	* @param article
	219	* the article to look into
	220	*
	221	* @return the details
	222	*/
	223	abstract protected String getArticleDetails(Document doc, Element article);
	224
	225	/**
	226	* The (required) {@link URL} that points to the news page on the supported
	227	* website.
	228	*
	229	* @param doc
	230	* the main document for the current category
	231	* @param article
	232	* the article to look into
	233	*
	234	* @return the internal {@link URL}
	235	*/
	236	abstract protected String getArticleIntUrl(Document doc, Element article);
	237
	238	/**
	239	* the optional {@link URL} that points to an external website for more
	240	* information.
	241	*
	242	* @param doc
	243	* the main document for the current category
	244	* @param article
	245	* the article to look into
	246	*
	247	* @return the external {@link URL}
	248	*/
	249	abstract protected String getArticleExtUrl(Document doc, Element article);
	250
	251	/**
	252	* The optional article short-content (not the full content, that will be
	253	* fetched by {@link BasicSupport#fetch(Story)}).
	254	*
	255	* @param doc
	256	* the main document for the current category
	257	* @param article
	258	* the article to look into
	259	*
	260	* @return the short content
	261	*/
	262	abstract protected String getArticleContent(Document doc, Element article);
	263
	264	/**
	265	* Fetch the full article content as well as all the comments associated to
	266	* this {@link Story}, if any (can be empty, but not NULL).
	267	*
	268	* @param story
	269	* the story to fetch the comments of
	270	*
	271	* @throws IOException
	272	* in case of I/O error
	273	*/
	274	public void fetch(Story story) throws IOException {
	275	String fullContent = "";
	276
	277	URL url = new URL(story.getUrlInternal());
	278	InputStream in = open(url);
	279	try {
	280	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	281	Element article = getFullArticle(doc);
	282	if (article != null) {
	283	StringBuilder builder = new StringBuilder();
	284	ElementProcessor eProc = getElementProcessorFullArticle();
	285	if (eProc != null) {
	286	for (String line : toLines(article, eProc)) {
	287	builder.append(line + "\n");
	288	}
	289	} else {
	290	builder.append(article.text());
	291	}
	292
	293	// Content is too tight with a single break per line:
	294	fullContent = builder.toString().replace("\n", "\n\n") //
	295	.replace("\n\n\n\n", "\n\n") //
	296	.replace("\n\n\n\n", "\n\n") //
	297	.trim();
	298	}
	299
	300	if (fullContent.isEmpty()) {
	301	fullContent = story.getContent();
	302	}
	303
	304	story.setFullContent(fullContent);
	305	story.setComments(getComments(doc,
	306	getFullArticleCommentPosts(doc, url)));
	307	} finally {
	308	if (in != null) {
	309	in.close();
	310	}
	311	}
	312	}
	313
	314	/**
	315	* Return the full article if available.
	316	*
	317	* @param doc
	318	* the (full article) document to work on
	319	*
	320	* @return the article or NULL
	321	*/
	322	abstract protected Element getFullArticle(Document doc);
	323
	324	/**
	325	* Return the list of comment {@link Element}s from this optional container
	326	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	327	*
	328	* @param doc
	329	* the (full article) document to work on
	330	* @param intUrl
	331	* the internal {@link URL} this article wa taken from (the
	332	* {@link URL} from the supported website)
	333	*
	334	* @return the list of comment posts
	335	*/
	336	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
	337	URL intUrl);
	338
	339	/**
	340	* The {@link ElementProcessor} to use to convert the main article element
	341	* (see {@link BasicSupport#getFullArticle(Document)}) into text.
	342	* <p>
	343	* See {@link BasicElementProcessor} for a working, basic implementation.
	344	* <p>
	345	* Can be NULL to simply use {@link Element#text()}.
	346	*
	347	* @return the processor, or NULL
	348	*/
	349	abstract protected ElementProcessor getElementProcessorFullArticle();
	350
	351	/**
	352	* Open a network resource.
	353	* <p>
	354	* You need to close the returned {@link InputStream} when done.
	355	*
	356	* @param url
	357	* the source to open
	358	*
	359	* @return the content
	360	*
	361	* @throws IOException
	362	* in case of I/O error
	363	*/
	364	protected InputStream open(URL url) throws IOException {
	365	return downloader.open(url);
	366	}
	367
	368	/**
	369	* Convert the comment elements into {@link Comment}s
	370	*
	371	* @param doc
	372	* the document we work on
	373	* @param posts
	374	* the comment elements
	375	*
	376	* @return the converted {@link Comment}s
	377	*/
	378	private List<Comment> getComments(Document doc, List<Element> posts) {
	379	List<Comment> comments = new ArrayList<Comment>();
	380	if (posts != null) {
	381	for (Element post : posts) {
	382	String id = getCommentId(post).trim();
	383	String author = getCommentAuthor(post).trim();
	384	String title = getCommentTitle(post).trim();
	385	String date = getCommentDate(post).trim();
	386
	387	List<String> content = new ArrayList<String>();
	388
	389	if (id.isEmpty()) {
	390	id = date;
	391	}
	392
	393	date = date(date);
	394
	395	Element contentE = getCommentContentElement(post);
	396	if (contentE != null) {
	397	ElementProcessor eProc = getElementProcessorComment();
	398	if (eProc != null) {
	399	for (String line : toLines(contentE, eProc)) {
	400	content.add(line);
	401	}
	402	} else {
	403	content = Arrays.asList(contentE.text().split("\n"));
	404	}
	405	}
	406
	407	Comment comment = new Comment(id, author, title, date, content);
	408	comment.addAll(getComments(doc,
	409	getCommentCommentPosts(doc, post)));
	410
	411	if (!comment.isEmpty()) {
	412	comments.add(comment);
	413	}
	414	}
	415	}
	416
	417	return comments;
	418	}
	419
	420	/**
	421	* Return the list of subcomment {@link Element}s from this comment element
	422	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	423	*
	424	* @param doc
	425	* the (full article) document to work on
	426	* @param container
	427	* the container (a comment {@link Element})
	428	*
	429	* @return the list of comment posts
	430	*/
	431	abstract protected List<Element> getCommentCommentPosts(Document doc,
	432	Element container);
	433
	434	/**
	435	* Compute the ID of the given comment element.
	436	*
	437	* @param post
	438	* the comment element
	439	*
	440	* @return the ID
	441	*/
	442	abstract protected String getCommentId(Element post);
	443
	444	/**
	445	* Compute the author of the given comment element.
	446	*
	447	* @param post
	448	* the comment element
	449	*
	450	* @return the author
	451	*/
	452	abstract protected String getCommentAuthor(Element post);
	453
	454	/**
	455	* Compute the title of the given comment element.
	456	*
	457	* @param post
	458	* the comment element
	459	*
	460	* @return the title
	461	*/
	462	abstract protected String getCommentTitle(Element post);
	463
	464	/**
	465	* Compute the date of the given comment element.
	466	*
	467	* @param post
	468	* the comment element
	469	*
	470	* @return the date
	471	*/
	472	abstract protected String getCommentDate(Element post);
	473
	474	/**
	475	* Get the main of the given comment element, which can be NULL.
	476	*
	477	* @param post
	478	* the comment element
	479	*
	480	* @return the element
	481	*/
	482	abstract protected Element getCommentContentElement(Element post);
	483
	484	/**
	485	* The {@link ElementProcessor} to use to convert the main comment element
	486	* (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	487	* <p>
	488	* See {@link BasicElementProcessor} for a working, basic implementation.
	489	* <p>
	490	* Can be NULL to simply use {@link Element#text()}.
	491	*
	492	* @return the processor
	493	*/
	494	abstract protected ElementProcessor getElementProcessorComment();
	495
	496	/**
	497	* The support type.
	498	*
	499	* @param type
	500	* the new type
	501	*/
	502	protected void setType(Type type) {
	503	this.type = type;
	504	}
	505
	506	/**
	507	* The {@link String} to append to the selector (the selector will be
	508	* constructed as "this string" then "/type/".
	509	*
	510	* @param preselector
	511	* the preselector to set
	512	*/
	513	static public void setPreselector(String preselector) {
	514	BasicSupport.preselector = preselector;
	515	}
	516
	517	/**
	518	* Return a {@link BasicSupport} that is compatible with the given
	519	* {@link Type} if it exists (or NULL if not).
	520	*
	521	* @param type
	522	* the type
	523	*
	524	* @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	525	*/
	526	static public BasicSupport getSupport(Type type) {
	527	BasicSupport support = null;
	528
	529	if (type != null) {
	530	switch (type) {
	531	case SLASHDOT:
	532	support = new Slashdot();
	533	break;
	534	case PIPEDOT:
	535	support = new Pipedot();
	536	break;
	537	case LWN:
	538	support = new LWN();
	539	break;
	540	case LEMONDE:
	541	support = new LeMonde();
	542	break;
	543	case REGISTER:
	544	support = new TheRegister();
	545	break;
	546	case TOO_LINUX:
	547	support = new TooLinux();
	548	break;
	549	case ERE_NUMERIQUE:
	550	support = new EreNumerique();
	551	break;
	552	case PHORONIX:
	553	support = new Phoronix();
	554	break;
	555	}
	556
	557	if (support != null) {
	558	support.setType(type);
	559	}
	560	}
	561
	562	return support;
	563	}
	564
	565	/**
	566	* The gopher "selector" to use for output for this type, using the
	567	* preselector.
	568	* <p>
	569	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	570	*
	571	* @param type
	572	* the type to get the selector of
	573	*
	574	* @return the selector
	575	*/
	576	static public String getSelector(Type type) {
	577	return preselector + "/" + type + "/";
	578	}
	579
	580	/**
	581	* Process the given element into text (each line is a text paragraph and
	582	* can be prepended with ">" signs to indicate a quote or sub-quote or
	583	* sub-sub-quote...).
	584	*
	585	* @param element
	586	* the element to process
	587	* @param elementProcessor
	588	* the element processor, must not be NULL
	589	*
	590	* @return text lines, each line is a paragraph
	591	*/
	592	static protected List<String> toLines(Element element,
	593	final ElementProcessor elementProcessor) {
	594	final List<String> lines = new ArrayList<String>();
	595	final StringBuilder currentLine = new StringBuilder();
	596	final List<Integer> quoted = new ArrayList<Integer>();
	597	final List<Node> ignoredNodes = new ArrayList<Node>();
	598	final List<String> footnotes = new ArrayList<String>();
	599
	600	if (element != null) {
	601	new NodeTraversor(new NodeVisitor() {
	602	@Override
	603	public void head(Node node, int depth) {
	604	String manual = null;
	605	boolean ignore = elementProcessor.ignoreNode(node)
	606	\|\| ignoredNodes.contains(node.parentNode());
	607	// Manual processing
	608	if (!ignore) {
	609	manual = elementProcessor.manualProcessing(node);
	610	if (manual != null) {
	611	currentLine.append(manual);
	612	ignore = true;
	613	}
	614	}
	615
	616	// Subtitle check
	617	if (!ignore) {
	618	String subtitle = elementProcessor.isSubtitle(node);
	619	if (subtitle != null) {
	620	subtitle = subtitle.trim();
	621	currentLine.append("\n[ " + subtitle + " ]\n");
	622	ignore = true;
	623	}
	624	}
	625
	626	// <pre> check
	627	if (!ignore) {
	628	if (node instanceof Element) {
	629	Element el = (Element) node;
	630	if ("pre".equals(el.tagName())) {
	631	currentLine.append(StringUtils
	632	.unhtml(el.text()).trim());
	633	ignore = true;
	634	}
	635	}
	636	}
	637
	638	if (ignore) {
	639	ignoredNodes.add(node);
	640	return;
	641	}
	642
	643	String prep = "";
	644	for (int i = 0; i < quoted.size(); i++) {
	645	prep += ">";
	646	}
	647	prep += " ";
	648
	649	boolean enterQuote = elementProcessor.detectQuote(node);
	650	boolean leaveQuote = quoted.contains(depth);
	651
	652	if (enterQuote) {
	653	quoted.add(depth);
	654	}
	655
	656	if (leaveQuote) {
	657	quoted.remove(Integer.valueOf(depth));
	658	}
	659
	660	if (enterQuote \|\| leaveQuote) {
	661	if (currentLine.length() > 0) {
	662	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	663	currentLine.setLength(currentLine.length() - 1);
	664	}
	665	for (String l : currentLine.toString().split("\n")) {
	666	lines.add(prep + l);
	667	}
	668	}
	669	currentLine.setLength(0);
	670	}
	671
	672	if (node instanceof Element) {
	673	Element element = (Element) node;
	674	boolean block = element.isBlock()
	675	\|\| element.tagName().equalsIgnoreCase("br");
	676	if (block && currentLine.length() > 0) {
	677	currentLine.append("\n");
	678	}
	679
	680	if (!element.absUrl("href").trim().isEmpty()) {
	681	footnotes.add(element.absUrl("href"));
	682	currentLine.append("[" + footnotes.size() + "]");
	683	}
	684	} else if (node instanceof TextNode) {
	685	TextNode textNode = (TextNode) node;
	686	String line = StringUtil.normaliseWhitespace(textNode
	687	.getWholeText());
	688
	689	currentLine.append(elementProcessor.processText(line));
	690	currentLine.append(" ");
	691	}
	692	}
	693
	694	@Override
	695	public void tail(Node node, int depth) {
	696	}
	697	}).traverse(element);
	698	}
	699
	700	if (currentLine.length() > 0) {
	701	String prep = "";
	702	for (int i = 0; i < quoted.size(); i++) {
	703	prep += ">";
	704	}
	705	prep += " ";
	706	if (currentLine.length() > 0) {
	707	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	708	currentLine.setLength(currentLine.length() - 1);
	709	}
	710	for (String l : currentLine.toString().split("\n")) {
	711	lines.add(prep + l);
	712	}
	713	}
	714	}
	715
	716	// Fix spaces and nbsp, remove multiple following blank lines
	717	List<String> linesCopy = new ArrayList<String>(lines.size());
	718	long blanks = 0;
	719	for (int i = 0; i < lines.size(); i++) {
	720	String line = lines.get(i).replace(" ", " ") // nbsp -> space
	721	.replace(" ", " ").trim();
	722	if (line.isEmpty()) {
	723	blanks++;
	724	} else {
	725	blanks = 0;
	726	}
	727
	728	if (blanks < 2) {
	729	linesCopy.add(line);
	730	}
	731	}
	732
	733	// Footnotes insertion
	734	if (footnotes.size() > 0) {
	735	linesCopy.add("");
	736	linesCopy.add("");
	737	linesCopy.add("");
	738	linesCopy.add("");
	739	for (int i = 0; i < footnotes.size(); i++) {
	740	linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
	741	}
	742	}
	743
	744	return linesCopy;
	745	}
	746
	747	/**
	748	* Reformat the date if possible.
	749	*
	750	* @param date
	751	* the input date
	752	*
	753	* @return the reformated date, or the same value if it was not parsable
	754	*/
	755	static private String date(String date) {
	756	SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
	757
	758	long epoch = 0;
	759	try {
	760	epoch = Long.parseLong(date.trim());
	761	} catch (Exception e) {
	762	epoch = 0;
	763	}
	764
	765	if (epoch > 0) {
	766	return out.format(new Date(1000 * epoch));
	767	}
	768
	769	try {
	770	Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
	771	.parse(date.trim());
	772	return out.format(dat);
	773	} catch (ParseException e) {
	774	return date;
	775	}
	776	}
	777	}