[gofetch.git] / BasicSupport.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
import be.nikiroo.utils.StringUtils;

/**
 * Base class for website support.
 * 
 * @author niki
 */
public abstract class BasicSupport {
	/**
	 * The downloader to use for all web sites via
	 * {@link BasicSupport#open(URL)}
	 */
	static private Downloader downloader = new Downloader("gofetcher");

	/**
	 * The prefix used to build the gopher selector (see
	 * {@link BasicSupport#getSelector(Type)}); it stays NULL until
	 * {@link BasicSupport#setPreselector(String)} is called.
	 */
	static private String preselector;

	/**
	 * The optional cookies to use to get the site data.
	 */
	private Map<String, String> cookies = new HashMap<String, String>();

	/**
	 * The support type, as set by {@link BasicSupport#setType(Type)}.
	 */
	private Type type;

	/**
	 * Login on the web site (this method does nothing by default, but can be
	 * overridden if needed).
	 * <p>
	 * It is called by {@link BasicSupport#list()} before any {@link URL} is
	 * opened.
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public void login() throws IOException {
	}

	/**
	 * The textual description of the website, to add in the dispatcher page.
	 * <p>
	 * It should be kept short.
	 * 
	 * @return the description
	 */
	abstract public String getDescription();

	/**
	 * The gopher "selector" to use for output.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * This is the selector computed by
	 * {@link BasicSupport#getSelector(Type)} for the type of this support.
	 * 
	 * @return the selector
	 */
	public String getSelector() {
		return getSelector(getType());
	}

	/**
	 * The support type.
	 * 
	 * @return the type (NULL as long as {@link BasicSupport#setType(Type)}
	 *         has not been called)
	 */
	public Type getType() {
		return type;
	}

	/**
	 * List all the recent items, but only assure the ID and internal URL to
	 * fetch it later on (until it has been fetched, the rest of the
	 * {@link Story} is not confirmed).
	 * <p>
	 * {@link BasicSupport#login()} is called first, then each {@link URL}
	 * from {@link BasicSupport#getUrls()} is downloaded and parsed as UTF-8.
	 * Articles that have neither an ID nor a date are skipped.
	 * 
	 * @return the list of new stories
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public List<Story> list() throws IOException {
		List<Story> list = new ArrayList<Story>();

		login();
		for (Entry<URL, String> entry : getUrls()) {
			URL url = entry.getKey();
			String defaultCateg = entry.getValue();
			if (defaultCateg == null) {
				defaultCateg = "";
			}

			InputStream in = open(url);
			try {
				Document doc = DataUtil.load(in, "UTF-8", url.toString());
				List<Element> articles = getArticles(doc);
				for (Element article : articles) {
					String id = getArticleId(doc, article).trim();
					String title = getArticleTitle(doc, article).trim();
					String author = getArticleAuthor(doc, article).trim();
					String date = getArticleDate(doc, article).trim();
					String categ = getArticleCategory(doc, article,
							defaultCateg).trim();
					String details = getArticleDetails(doc, article).trim();
					String intUrl = getArticleIntUrl(doc, article).trim();
					String extUrl = getArticleExtUrl(doc, article).trim();
					String content = getArticleContent(doc, article).trim();

					// Nothing usable to identify this article
					if (id.isEmpty() && date.isEmpty()) {
						continue;
					}

					if (!id.isEmpty()) {
						// Left-pad short IDs with zeros up to 10 characters
						while (id.length() < 10) {
							id = "0" + id;
						}
					} else {
						// No ID: derive one from the raw (not yet
						// reformatted) date
						id = date.replace(":", "_").replace("+", "_")
								.replace("/", "-");
					}

					date = date(date);

					list.add(new Story(getType(), id, title, author, date,
							categ, details, intUrl, extUrl, content));
				}
			} finally {
				// Always release the network stream, as fetch(Story) does
				if (in != null) {
					in.close();
				}
			}
		}

		return list;
	}

	/**
	 * The {@link URL}s to process for this website.
	 * <p>
	 * Each entry maps a {@link URL} to download to the default category of
	 * the articles found there; a NULL category is handled as an empty one by
	 * {@link BasicSupport#list()}.
	 * 
	 * @return the list of {@link URL}s (must not be NULL, as
	 *         {@link BasicSupport#list()} iterates over it directly)
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract protected List<Entry<URL, String>> getUrls() throws IOException;

	/**
	 * The article {@link Element}s of this document.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * 
	 * @return the articles (must not be NULL, as {@link BasicSupport#list()}
	 *         iterates over them directly)
	 */
	abstract protected List<Element> getArticles(Document doc);

	/**
	 * The ID of the article (defaults to the date element if empty).
	 * <p>
	 * In {@link BasicSupport#list()}, a non-empty ID shorter than 10
	 * characters is left-padded with zeros, and an article with both an empty
	 * ID and an empty date is skipped.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the ID (can be empty but must not be NULL, it is trimmed by
	 *         {@link BasicSupport#list()})
	 */
	abstract protected String getArticleId(Document doc, Element article);

	/**
	 * The article title to display.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the title (must not be NULL, it is trimmed by
	 *         {@link BasicSupport#list()})
	 */
	abstract protected String getArticleTitle(Document doc, Element article);

	/**
	 * The optional article author.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the author (use an empty {@link String} if there is none, not
	 *         NULL: it is trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleAuthor(Document doc, Element article);

	/**
	 * The optional article date.
	 * <p>
	 * {@link BasicSupport#list()} uses the raw value as ID when the article
	 * has no ID, then tries to reformat it (a value that cannot be parsed is
	 * kept as is).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the date (use an empty {@link String} if there is none, not
	 *         NULL: it is trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleDate(Document doc, Element article);

	/**
	 * The optional article category.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * @param currentCategory
	 *            the currently listed category if any (when called from
	 *            {@link BasicSupport#list()}, a missing category is passed as
	 *            an empty {@link String}, never as NULL)
	 * 
	 * @return the category (use an empty {@link String} if there is none, not
	 *         NULL: it is trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleCategory(Document doc, Element article,
			String currentCategory);

	/**
	 * The optional details of the article (can replace the date, author and
	 * category, for instance).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the details (use an empty {@link String} if there are none, not
	 *         NULL: they are trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleDetails(Document doc, Element article);

	/**
	 * The (required) {@link URL} that points to the news page on the supported
	 * website.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the internal {@link URL}, in textual form (must not be NULL, it
	 *         is trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleIntUrl(Document doc, Element article);

	/**
	 * The optional {@link URL} that points to an external website for more
	 * information.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the external {@link URL}, in textual form (use an empty
	 *         {@link String} if there is none, not NULL: it is trimmed by
	 *         {@link BasicSupport#list()})
	 */
	abstract protected String getArticleExtUrl(Document doc, Element article);

	/**
	 * The optional article short-content (not the full content, that will be
	 * fetched by {@link BasicSupport#fetch(Story)}).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the short content (use an empty {@link String} if there is
	 *         none, not NULL: it is trimmed by {@link BasicSupport#list()})
	 */
	abstract protected String getArticleContent(Document doc, Element article);

	/**
	 * Download the page of this {@link Story} and complete the story with its
	 * full content and its comments (the comments can be empty, but not
	 * NULL).
	 * <p>
	 * If no full text can be extracted from the page, the short content of
	 * the {@link Story} is used as full content instead.
	 * 
	 * @param story
	 *            the story to fetch the content and comments of
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public void fetch(Story story) throws IOException {
		URL url = new URL(story.getUrlInternal());
		InputStream in = open(url);
		try {
			Document doc = DataUtil.load(in, "UTF-8", url.toString());

			// Full text first, with the short content as fallback
			Element article = getFullArticle(doc);
			String text = (article == null) ? "" : getArticleText(article);
			if (text.isEmpty()) {
				text = story.getContent();
			}
			story.setFullContent(text);

			// Then the (possibly nested) comments
			List<Element> posts = getFullArticleCommentPosts(doc, url);
			story.setComments(getComments(doc, posts));
		} finally {
			if (in != null) {
				in.close();
			}
		}
	}

	/**
	 * Extract the text of this {@link Element}, going through the
	 * {@link BasicSupport#getElementProcessorFullArticle()} processor if there
	 * is one, or through {@link Element#text()} if not.
	 * 
	 * @param article
	 *            the element to extract the text from
	 * 
	 * @return the text, trimmed
	 */
	protected String getArticleText(Element article) {
		ElementProcessor processor = getElementProcessorFullArticle();

		StringBuilder buffer = new StringBuilder();
		if (processor == null) {
			buffer.append(article.text());
		} else {
			for (String paragraph : toLines(article, processor)) {
				buffer.append(paragraph).append("\n");
			}
		}

		// A single break per line is too tight: double every break, then
		// shrink the runs of four breaks back to two (in two passes)
		String text = buffer.toString().replace("\n", "\n\n");
		for (int pass = 0; pass < 2; pass++) {
			text = text.replace("\n\n\n\n", "\n\n");
		}

		return text.trim();
	}

	/**
	 * Return the full article if available (this is the article to retrieve
	 * from the newly downloaded page at {@link Story#getUrlInternal()}).
	 * <p>
	 * When NULL is returned, {@link BasicSupport#fetch(Story)} falls back to
	 * the short content of the {@link Story}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * 
	 * @return the article or NULL
	 */
	abstract protected Element getFullArticle(Document doc);

	/**
	 * Return the list of comment {@link Element}s from this optional container
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param intUrl
	 *            the internal {@link URL} this article was taken from (the
	 *            {@link URL} from the supported website)
	 * 
	 * @return the list of comment posts (NULL is accepted and means that
	 *         there are no comments)
	 */
	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
			URL intUrl);

	/**
	 * The {@link ElementProcessor} to use to convert the main article element
	 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()} (see
	 * {@link BasicSupport#getArticleText(Element)}).
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorFullArticle();

	/**
	 * Open a network resource, using the shared {@link Downloader} and the
	 * cookies registered via {@link BasicSupport#addCookie(String, String)}.
	 * <p>
	 * You need to close the returned {@link InputStream} when done.
	 * 
	 * @param url
	 *            the source to open
	 * 
	 * @return the content
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	protected InputStream open(URL url) throws IOException {
		return downloader.open(url, url, cookies, null, null, null);
	}

	/**
	 * Convert the comment elements into {@link Comment}s, including their
	 * sub-comments (recursively).
	 * <p>
	 * Comments that are reported as empty by {@link Comment#isEmpty()} are
	 * left out of the result.
	 * 
	 * @param doc
	 *            the document we work on
	 * @param posts
	 *            the comment elements (can be NULL)
	 * 
	 * @return the converted {@link Comment}s (can be empty, but not NULL)
	 */
	private List<Comment> getComments(Document doc, List<Element> posts) {
		List<Comment> result = new ArrayList<Comment>();
		if (posts == null) {
			return result;
		}

		for (Element post : posts) {
			String id = getCommentId(post).trim();
			String author = getCommentAuthor(post).trim();
			String title = getCommentTitle(post).trim();
			String rawDate = getCommentDate(post).trim();

			// The raw date is the fallback ID
			if (id.isEmpty()) {
				id = rawDate;
			}
			String date = date(rawDate);

			List<String> content = new ArrayList<String>();
			Element body = getCommentContentElement(post);
			if (body != null) {
				ElementProcessor processor = getElementProcessorComment();
				if (processor == null) {
					content = Arrays.asList(body.text().split("\n"));
				} else {
					content.addAll(toLines(body, processor));
				}
			}

			Comment comment = new Comment(id, author, title, date, content);
			List<Element> replies = getCommentCommentPosts(doc, post);
			comment.addAll(getComments(doc, replies));

			if (!comment.isEmpty()) {
				result.add(comment);
			}
		}

		return result;
	}

	/**
	 * Return the list of subcomment {@link Element}s from this comment element
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param container
	 *            the container (a comment {@link Element})
	 * 
	 * @return the list of comment posts (NULL is accepted and means that
	 *         there are no subcomments)
	 */
	abstract protected List<Element> getCommentCommentPosts(Document doc,
			Element container);

	/**
	 * Compute the ID of the given comment element.
	 * <p>
	 * If the ID is empty, the (raw) date of the comment is used instead.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the ID (can be empty but must not be NULL, it is trimmed right
	 *         after the call)
	 */
	abstract protected String getCommentId(Element post);

	/**
	 * Compute the author of the given comment element.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the author (can be empty but must not be NULL, it is trimmed
	 *         right after the call)
	 */
	abstract protected String getCommentAuthor(Element post);

	/**
	 * Compute the title of the given comment element.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the title (can be empty but must not be NULL, it is trimmed
	 *         right after the call)
	 */
	abstract protected String getCommentTitle(Element post);

	/**
	 * Compute the date of the given comment element.
	 * <p>
	 * The value is reformatted if possible, and kept as is if it cannot be
	 * parsed.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the date (can be empty but must not be NULL, it is trimmed
	 *         right after the call)
	 */
	abstract protected String getCommentDate(Element post);

	/**
	 * Get the main content element of the given comment element, which can be
	 * NULL (the comment then has no content lines).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the element, or NULL
	 */
	abstract protected Element getCommentContentElement(Element post);

	/**
	 * The {@link ElementProcessor} to use to convert the main comment element
	 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()}, split on line breaks.
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorComment();

	/**
	 * Set the support type (this is done by
	 * {@link BasicSupport#getSupport(Type)} on the instances it creates).
	 * 
	 * @param type
	 *            the new type
	 */
	protected void setType(Type type) {
		this.type = type;
	}

	/**
	 * Add a cookie for all site connections made by this support through
	 * {@link BasicSupport#open(URL)} (a cookie with the same name is
	 * replaced).
	 * 
	 * @param name
	 *            the cookie name
	 * @param value
	 *            the value
	 */
	protected void addCookie(String name, String value) {
		cookies.put(name, value);
	}

	/**
	 * The {@link String} to prepend to the selector (the selector will be
	 * constructed as "this string" then "/type/").
	 * <p>
	 * This is a static setting, shared by all the supports.
	 * 
	 * @param preselector
	 *            the preselector to set
	 */
	static public void setPreselector(String preselector) {
		BasicSupport.preselector = preselector;
	}

	/**
	 * Create the {@link BasicSupport} that handles the given {@link Type}.
	 * <p>
	 * The type of the new instance is already set when it is returned.
	 * 
	 * @param type
	 *            the type (can be NULL)
	 * 
	 * @return a compatible {@link BasicSupport}, or NULL if the type is NULL or
	 *         has no known support
	 */
	static public BasicSupport getSupport(Type type) {
		if (type == null) {
			return null;
		}

		final BasicSupport instance;
		switch (type) {
		case SLASHDOT:
			instance = new Slashdot();
			break;
		case PIPEDOT:
			instance = new Pipedot();
			break;
		case LWN:
			instance = new LWN();
			break;
		case LEMONDE:
			instance = new LeMonde();
			break;
		case REGISTER:
			instance = new TheRegister();
			break;
		case TOO_LINUX:
			instance = new TooLinux();
			break;
		case ERE_NUMERIQUE:
			instance = new EreNumerique();
			break;
		case PHORONIX:
			instance = new Phoronix();
			break;
		case SEPT_SUR_SEPT:
			instance = new SeptSurSept();
			break;
		case REDDIT:
			instance = new Reddit();
			break;
		default:
			instance = null;
			break;
		}

		if (instance == null) {
			return null;
		}

		instance.setType(type);
		return instance;
	}

	/**
	 * The gopher "selector" to use for output for this type, using the
	 * preselector.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * If no preselector has been set (see
	 * {@link BasicSupport#setPreselector(String)}), the selector is simply
	 * "/type/".
	 * 
	 * @param type
	 *            the type to get the selector of
	 * 
	 * @return the selector
	 */
	static public String getSelector(Type type) {
		// String concatenation would turn a NULL preselector into the literal
		// text "null", so an unset preselector is handled as an empty one
		String prefix = (preselector == null) ? "" : preselector;
		return prefix + "/" + type + "/";
	}

	/**
	 * Process the given element into text (each line is a text paragraph and
	 * can be prepended with ">" signs to indicate a quote or sub-quote or
	 * sub-sub-quote...).
	 * <p>
	 * The element is traversed node by node; the {@link ElementProcessor}
	 * decides which nodes are ignored, manually converted, handled as
	 * subtitles or handled as quotes. Elements that have a non-empty absolute
	 * "href" are replaced by a "[n]" marker, and the matching targets are
	 * listed as footnotes at the end of the returned lines.
	 * <p>
	 * The lines are then cleaned: non-breaking spaces (U+00A0) become normal
	 * spaces, the lines are trimmed and no more than one blank line in a row
	 * is kept.
	 * 
	 * @param element
	 *            the element to process (can be NULL: no node is then
	 *            processed)
	 * @param elementProcessor
	 *            the element processor, must not be NULL
	 * 
	 * @return text lines, each line is a paragraph
	 */
	static protected List<String> toLines(Element element,
			final ElementProcessor elementProcessor) {
		final List<String> lines = new ArrayList<String>();
		// Text gathered since the last flush (can contain line breaks)
		final StringBuilder currentLine = new StringBuilder();
		// Depths at which a quote currently open was entered
		final List<Integer> quoted = new ArrayList<Integer>();
		// Nodes already skipped, so their children can be skipped too
		final List<Node> ignoredNodes = new ArrayList<Node>();
		final List<String> footnotes = new ArrayList<String>();

		if (element != null) {
			new NodeTraversor(new NodeVisitor() {
				@Override
				public void head(Node node, int depth) {
					String manual = null;
					// A node is skipped if the processor says so, or if its
					// parent was skipped
					boolean ignore = elementProcessor.ignoreNode(node)
							|| ignoredNodes.contains(node.parentNode());
					// Manual processing
					if (!ignore) {
						manual = elementProcessor.manualProcessing(node);
						if (manual != null) {
							currentLine.append(manual);
							ignore = true;
						}
					}

					// Subtitle check
					if (!ignore) {
						String subtitle = elementProcessor.isSubtitle(node);
						if (subtitle != null) {
							subtitle = subtitle.trim();
							currentLine.append("\n[ " + subtitle + " ]\n");
							ignore = true;
						}
					}

					// <pre> check
					if (!ignore) {
						if (node instanceof Element) {
							Element el = (Element) node;
							if ("pre".equals(el.tagName())) {
								currentLine.append(StringUtils
										.unhtml(el.text()).trim());
								ignore = true;
							}
						}
					}

					// Nodes handled above are recorded, so that their
					// children are not processed a second time
					if (ignore) {
						ignoredNodes.add(node);
						return;
					}

					// Quote prefix for the text gathered so far: one ">" per
					// quote open BEFORE this node changes the quote level
					String prep = "";
					for (int i = 0; i < quoted.size(); i++) {
						prep += ">";
					}
					prep += " ";

					// A quote is left when a node is met at a depth at which
					// a quote was entered
					boolean enterQuote = elementProcessor.detectQuote(node);
					boolean leaveQuote = quoted.contains(depth);

					if (enterQuote) {
						quoted.add(depth);
					}

					if (leaveQuote) {
						quoted.remove(Integer.valueOf(depth));
					}

					// The quote level changes: flush the pending text with
					// the previous quote prefix
					if (enterQuote || leaveQuote) {
						if (currentLine.length() > 0) {
							if (currentLine.charAt(currentLine.length() - 1) == '\n') {
								currentLine.setLength(currentLine.length() - 1);
							}
							for (String l : currentLine.toString().split("\n")) {
								lines.add(prep + l);
							}
						}
						currentLine.setLength(0);
					}

					if (node instanceof Element) {
						Element element = (Element) node;
						// Block elements and <br> start a new line
						boolean block = element.isBlock()
								|| element.tagName().equalsIgnoreCase("br");
						if (block && currentLine.length() > 0) {
							currentLine.append("\n");
						}

						// Links are replaced by a numbered footnote marker
						if (!element.absUrl("href").trim().isEmpty()) {
							footnotes.add(element.absUrl("href"));
							currentLine.append("[" + footnotes.size() + "]");
						}
					} else if (node instanceof TextNode) {
						TextNode textNode = (TextNode) node;
						String line = StringUtil.normaliseWhitespace(textNode
								.getWholeText());

						currentLine.append(elementProcessor.processText(line));
						currentLine.append(" ");
					}
				}

				@Override
				public void tail(Node node, int depth) {
				}
			}).traverse(element);
		}

		// Flush what is left, with the prefix of the quotes still open
		if (currentLine.length() > 0) {
			String prep = "";
			for (int i = 0; i < quoted.size(); i++) {
				prep += ">";
			}
			prep += " ";
			if (currentLine.length() > 0) {
				if (currentLine.charAt(currentLine.length() - 1) == '\n') {
					currentLine.setLength(currentLine.length() - 1);
				}
				for (String l : currentLine.toString().split("\n")) {
					lines.add(prep + l);
				}
			}
		}

		// Fix spaces and nbsp, remove multiple following blank lines
		List<String> linesCopy = new ArrayList<String>(lines.size());
		long blanks = 0;
		for (int i = 0; i < lines.size(); i++) {
			// The nbsp is written as a unicode escape so that it cannot be
			// silently turned into a normal space by an editor or a re-encoding
			String line = lines.get(i).replace("\u00A0", " ") // nbsp -> space
					.replace("  ", " ").trim();
			if (line.isEmpty()) {
				blanks++;
			} else {
				blanks = 0;
			}

			// Only the first blank line of a run is kept
			if (blanks < 2) {
				linesCopy.add(line);
			}
		}

		// Footnotes insertion (after four blank lines)
		if (footnotes.size() > 0) {
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			for (int i = 0; i < footnotes.size(); i++) {
				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
			}
		}

		return linesCopy;
	}

	/**
	 * Reformat the date if possible.
	 * <p>
	 * Two input formats are recognised: a strictly positive number of seconds
	 * since the epoch, and a "yyyy-MM-dd'T'HH:mm:ssXXX" timestamp. Both are
	 * converted to "yyyy/MM/dd", using the default time zone.
	 * 
	 * @param date
	 *            the input date (can be NULL)
	 * 
	 * @return the reformatted date, or the same value if it was not parsable
	 *         (so NULL for a NULL input)
	 */
	static private String date(String date) {
		if (date == null) {
			return null;
		}

		String trimmed = date.trim();
		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");

		long epoch = 0;
		try {
			epoch = Long.parseLong(trimmed);
		} catch (NumberFormatException e) {
			// Not a plain number: the textual format is tried below
		}

		// Values too large to be converted into milliseconds would silently
		// overflow, so they are not handled as epoch seconds
		if (epoch > 0 && epoch <= Long.MAX_VALUE / 1000) {
			return out.format(new Date(1000 * epoch));
		}

		try {
			Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
					.parse(trimmed);
			return out.format(dat);
		} catch (ParseException e) {
			// Unknown format: keep the value as it was given
			return date;
		} catch (IllegalArgumentException e) {
			// The pattern itself is refused by this runtime: best effort,
			// keep the value as it was given
			return date;
		}
	}
}
Commit	Line	Data
	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
	4	import java.io.InputStream;
	5	import java.net.URL;
	6	import java.text.ParseException;
	7	import java.text.SimpleDateFormat;
	8	import java.util.ArrayList;
	9	import java.util.Arrays;
	10	import java.util.Date;
	11	import java.util.HashMap;
	12	import java.util.List;
	13	import java.util.Map;
	14	import java.util.Map.Entry;
	15
	16	import org.jsoup.helper.DataUtil;
	17	import org.jsoup.helper.StringUtil;
	18	import org.jsoup.nodes.Document;
	19	import org.jsoup.nodes.Element;
	20	import org.jsoup.nodes.Node;
	21	import org.jsoup.nodes.TextNode;
	22	import org.jsoup.select.NodeTraversor;
	23	import org.jsoup.select.NodeVisitor;
	24
	25	import be.nikiroo.gofetch.data.Comment;
	26	import be.nikiroo.gofetch.data.Story;
	27	import be.nikiroo.utils.Downloader;
	28	import be.nikiroo.utils.StringUtils;
	29
	30	/**
	31	* Base class for website support.
	32	*
	33	* @author niki
	34	*/
	35	public abstract class BasicSupport {
	36	/**
	37	* The downloader to use for all web sites via
	38	* {@link BasicSupport#open(URL)}
	39	*/
	40	static private Downloader downloader = new Downloader("gofetcher");
	41
	42	static private String preselector;
	43
	44	/**
	45	* The optional cookies to use to get the site data.
	46	*/
	47	private Map<String, String> cookies = new HashMap<String, String>();
	48
	49	private Type type;
	50
	51	/**
	52	* Login on the web site (this method does nothing by default, but can be
	53	* overridden if needed).
	54	*
	55	* @throws IOException
	56	* in case of I/O error
	57	*
	58	*/
	59	public void login() throws IOException {
	60	}
	61
	62	/**
	63	* The website textual description, to add in the dispatcher page.
	64	* <p>
	65	* Should be short.
	66	*
	67	* @return the description
	68	*/
	69	abstract public String getDescription();
	70
	71	/**
	72	* The gopher "selector" to use for output.
	73	* <p>
	74	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	75	*
	76	* @return the selector
	77	*/
	78	public String getSelector() {
	79	return getSelector(getType());
	80	}
	81
	82	/**
	83	* The support type.
	84	*
	85	* @return the type
	86	*/
	87	public Type getType() {
	88	return type;
	89	}
	90
	91	/**
	92	* List all the recent items, but only assure the ID and internal URL to
	93	* fetch it later on (until it has been fetched, the rest of the
	94	* {@link Story} is not confirmed).
	95	*
	96	* @return the list of new stories
	97	*
	98	* @throws IOException
	99	* in case of I/O
	100	*/
	101	public List<Story> list() throws IOException {
	102	List<Story> list = new ArrayList<Story>();
	103
	104	login();
	105	for (Entry<URL, String> entry : getUrls()) {
	106	URL url = entry.getKey();
	107	String defaultCateg = entry.getValue();
	108	if (defaultCateg == null) {
	109	defaultCateg = "";
	110	}
	111
	112	InputStream in = open(url);
	113	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	114	List<Element> articles = getArticles(doc);
	115	for (Element article : articles) {
	116	String id = getArticleId(doc, article).trim();
	117	String title = getArticleTitle(doc, article).trim();
	118	String author = getArticleAuthor(doc, article).trim();
	119	String date = getArticleDate(doc, article).trim();
	120	String categ = getArticleCategory(doc, article, defaultCateg)
	121	.trim();
	122	String details = getArticleDetails(doc, article).trim();
	123	String intUrl = getArticleIntUrl(doc, article).trim();
	124	String extUrl = getArticleExtUrl(doc, article).trim();
	125	String content = getArticleContent(doc, article).trim();
	126
	127	if (id.isEmpty() && date.isEmpty()) {
	128	continue;
	129	}
	130
	131	if (!id.isEmpty()) {
	132	while (id.length() < 10) {
	133	id = "0" + id;
	134	}
	135	} else {
	136	id = date.replace(":", "_").replace("+", "_").replace("/", "-");
	137	}
	138
	139	date = date(date);
	140
	141	list.add(new Story(getType(), id, title, author, date, categ,
	142	details, intUrl, extUrl, content));
	143	}
	144	}
	145
	146	return list;
	147	}
	148
	149	/**
	150	* The {@link URL}s to process for this website.
	151	*
	152	* @return the list of {@link URL}s
	153	*
	154	* @throws IOException
	155	* in case of I/O error
	156	*/
	157	abstract protected List<Entry<URL, String>> getUrls() throws IOException;
	158
	159	/**
	160	* The article {@link Element}s of this document.
	161	*
	162	* @param doc
	163	* the main document for the current category
	164	*
	165	* @return the articles
	166	*/
	167	abstract protected List<Element> getArticles(Document doc);
	168
	169	/**
	170	* The ID of the article (defaults to the date element if empty).
	171	*
	172	* @param doc
	173	* the main document for the current category
	174	* @param article
	175	* the article to look into
	176	*
	177	* @return the ID
	178	*/
	179	abstract protected String getArticleId(Document doc, Element article);
	180
	181	/**
	182	* The article title to display.
	183	*
	184	* @param doc
	185	* the main document for the current category
	186	* @param article
	187	* the article to look into
	188	*
	189	* @return the title
	190	*/
	191	abstract protected String getArticleTitle(Document doc, Element article);
	192
	193	/**
	194	* The optional article author.
	195	*
	196	* @param doc
	197	* the main document for the current category
	198	* @param article
	199	* the article to look into
	200	*
	201	* @return the author
	202	*/
	203	abstract protected String getArticleAuthor(Document doc, Element article);
	204
	205	/**
	206	* The optional article date.
	207	*
	208	* @param doc
	209	* the main document for the current category
	210	* @param article
	211	* the article to look into
	212	*
	213	* @return the date
	214	*/
	215	abstract protected String getArticleDate(Document doc, Element article);
	216
	217	/**
	218	* the optional article category.
	219	*
	220	* @param doc
	221	* the main document for the current category
	222	* @param article
	223	* the article to look into
	224	* @param currentCategory
	225	* the currently listed category if any (can be NULL)
	226	*
	227	* @return the category
	228	*/
	229	abstract protected String getArticleCategory(Document doc, Element article,
	230	String currentCategory);
	231
	232	/**
	233	* the optional details of the article (can replace the date, author and
	234	* category, for instance).
	235	*
	236	* @param doc
	237	* the main document for the current category
	238	* @param article
	239	* the article to look into
	240	*
	241	* @return the details
	242	*/
	243	abstract protected String getArticleDetails(Document doc, Element article);
	244
	245	/**
	246	* The (required) {@link URL} that points to the news page on the supported
	247	* website.
	248	*
	249	* @param doc
	250	* the main document for the current category
	251	* @param article
	252	* the article to look into
	253	*
	254	* @return the internal {@link URL}
	255	*/
	256	abstract protected String getArticleIntUrl(Document doc, Element article);
	257
	258	/**
	259	* the optional {@link URL} that points to an external website for more
	260	* information.
	261	*
	262	* @param doc
	263	* the main document for the current category
	264	* @param article
	265	* the article to look into
	266	*
	267	* @return the external {@link URL}
	268	*/
	269	abstract protected String getArticleExtUrl(Document doc, Element article);
	270
	271	/**
	272	* The optional article short-content (not the full content, that will be
	273	* fetched by {@link BasicSupport#fetch(Story)}).
	274	*
	275	* @param doc
	276	* the main document for the current category
	277	* @param article
	278	* the article to look into
	279	*
	280	* @return the short content
	281	*/
	282	abstract protected String getArticleContent(Document doc, Element article);
	283
	284	/**
	285	* Fetch the full article content as well as all the comments associated to
	286	* this {@link Story}, if any (can be empty, but not NULL).
	287	*
	288	* @param story
	289	* the story to fetch the comments of
	290	*
	291	* @throws IOException
	292	* in case of I/O error
	293	*/
	294	public void fetch(Story story) throws IOException {
	295	String fullContent = "";
	296
	297	URL url = new URL(story.getUrlInternal());
	298	InputStream in = open(url);
	299	try {
	300	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	301	Element article = getFullArticle(doc);
	302	if (article != null) {
	303	fullContent = getArticleText(article);
	304	}
	305
	306	if (fullContent.isEmpty()) {
	307	fullContent = story.getContent();
	308	}
	309
	310	story.setFullContent(fullContent);
	311	story.setComments(getComments(doc,
	312	getFullArticleCommentPosts(doc, url)));
	313	} finally {
	314	if (in != null) {
	315	in.close();
	316	}
	317	}
	318	}
	319
	320	/**
	321	* Return the text from this {@link Element}, using the
	322	* {@link BasicSupport#getElementProcessorFullArticle()} processor logic.
	323	*
	324	* @param article
	325	* the element to extract the text from
	326	*
	327	* @return the text
	328	*/
	329	protected String getArticleText(Element article) {
	330	StringBuilder builder = new StringBuilder();
	331	ElementProcessor eProc = getElementProcessorFullArticle();
	332	if (eProc != null) {
	333	for (String line : toLines(article, eProc)) {
	334	builder.append(line + "\n");
	335	}
	336	} else {
	337	builder.append(article.text());
	338	}
	339
	340	// Content is too tight with a single break per line:
	341	return builder.toString().replace("\n", "\n\n") //
	342	.replace("\n\n\n\n", "\n\n") //
	343	.replace("\n\n\n\n", "\n\n") //
	344	.trim();
	345	}
	346
	347	/**
	348	* Return the full article if available (this is the article to retrieve
	349	* from the newly downloaded page at {@link Story#getUrlInternal()}).
	350	*
	351	* @param doc
	352	* the (full article) document to work on
	353	*
	354	* @return the article or NULL
	355	*/
	356	abstract protected Element getFullArticle(Document doc);
	357
	358	/**
	359	* Return the list of comment {@link Element}s from this optional container
	360	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	361	*
	362	* @param doc
	363	* the (full article) document to work on
	364	* @param intUrl
	365	* the internal {@link URL} this article wa taken from (the
	366	* {@link URL} from the supported website)
	367	*
	368	* @return the list of comment posts
	369	*/
	370	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
	371	URL intUrl);
	372
	373	/**
	374	* The {@link ElementProcessor} to use to convert the main article element
	375	* (see {@link BasicSupport#getFullArticle(Document)}) into text.
	376	* <p>
	377	* See {@link BasicElementProcessor} for a working, basic implementation.
	378	* <p>
	379	* Can be NULL to simply use {@link Element#text()}.
	380	*
	381	* @return the processor, or NULL
	382	*/
	383	abstract protected ElementProcessor getElementProcessorFullArticle();
	384
	385	/**
	386	* Open a network resource.
	387	* <p>
	388	* You need to close the returned {@link InputStream} when done.
	389	*
	390	* @param url
	391	* the source to open
	392	*
	393	* @return the content
	394	*
	395	* @throws IOException
	396	* in case of I/O error
	397	*/
	398	protected InputStream open(URL url) throws IOException {
	399	return downloader.open(url, url, cookies, null, null, null);
	400	}
	401
	402	/**
	403	* Convert the comment elements into {@link Comment}s
	404	*
	405	* @param doc
	406	* the document we work on
	407	* @param posts
	408	* the comment elements
	409	*
	410	* @return the converted {@link Comment}s
	411	*/
	412	private List<Comment> getComments(Document doc, List<Element> posts) {
	413	List<Comment> comments = new ArrayList<Comment>();
	414	if (posts != null) {
	415	for (Element post : posts) {
	416	String id = getCommentId(post).trim();
	417	String author = getCommentAuthor(post).trim();
	418	String title = getCommentTitle(post).trim();
	419	String date = getCommentDate(post).trim();
	420
	421	List<String> content = new ArrayList<String>();
	422
	423	if (id.isEmpty()) {
	424	id = date;
	425	}
	426
	427	date = date(date);
	428
	429	Element contentE = getCommentContentElement(post);
	430	if (contentE != null) {
	431	ElementProcessor eProc = getElementProcessorComment();
	432	if (eProc != null) {
	433	for (String line : toLines(contentE, eProc)) {
	434	content.add(line);
	435	}
	436	} else {
	437	content = Arrays.asList(contentE.text().split("\n"));
	438	}
	439	}
	440
	441	Comment comment = new Comment(id, author, title, date, content);
	442	comment.addAll(getComments(doc,
	443	getCommentCommentPosts(doc, post)));
	444
	445	if (!comment.isEmpty()) {
	446	comments.add(comment);
	447	}
	448	}
	449	}
	450
	451	return comments;
	452	}
	453
	454	/**
	455	* Return the list of subcomment {@link Element}s from this comment element
	456	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	457	*
	458	* @param doc
	459	* the (full article) document to work on
	460	* @param container
	461	* the container (a comment {@link Element})
	462	*
	463	* @return the list of comment posts
	464	*/
	465	abstract protected List<Element> getCommentCommentPosts(Document doc,
	466	Element container);
	467
	468	/**
	469	* Compute the ID of the given comment element.
	470	*
	471	* @param post
	472	* the comment element
	473	*
	474	* @return the ID
	475	*/
	476	abstract protected String getCommentId(Element post);
	477
	478	/**
	479	* Compute the author of the given comment element.
	480	*
	481	* @param post
	482	* the comment element
	483	*
	484	* @return the author
	485	*/
	486	abstract protected String getCommentAuthor(Element post);
	487
	488	/**
	489	* Compute the title of the given comment element.
	490	*
	491	* @param post
	492	* the comment element
	493	*
	494	* @return the title
	495	*/
	496	abstract protected String getCommentTitle(Element post);
	497
	498	/**
	499	* Compute the date of the given comment element.
	500	*
	501	* @param post
	502	* the comment element
	503	*
	504	* @return the date
	505	*/
	506	abstract protected String getCommentDate(Element post);
	507
	508	/**
	509	* Get the main of the given comment element, which can be NULL.
	510	*
	511	* @param post
	512	* the comment element
	513	*
	514	* @return the element
	515	*/
	516	abstract protected Element getCommentContentElement(Element post);
	517
	518	/**
	519	* The {@link ElementProcessor} to use to convert the main comment element
	520	* (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	521	* <p>
	522	* See {@link BasicElementProcessor} for a working, basic implementation.
	523	* <p>
	524	* Can be NULL to simply use {@link Element#text()}.
	525	*
	526	* @return the processor
	527	*/
	528	abstract protected ElementProcessor getElementProcessorComment();
	529
	530	/**
	531	* The support type.
	532	*
	533	* @param type
	534	* the new type
	535	*/
	536	protected void setType(Type type) {
	537	this.type = type;
	538	}
	539
	540	/**
	541	* Add a cookie for all site connections.
	542	*
	543	* @param name
	544	* the cookie name
	545	* @param value
	546	* the value
	547	*/
	548	protected void addCookie(String name, String value) {
	549	cookies.put(name, value);
	550	}
	551
	552	/**
	553	* The {@link String} to append to the selector (the selector will be
	554	* constructed as "this string" then "/type/".
	555	*
	556	* @param preselector
	557	* the preselector to set
	558	*/
	559	static public void setPreselector(String preselector) {
	560	BasicSupport.preselector = preselector;
	561	}
	562
	563	/**
	564	* Return a {@link BasicSupport} that is compatible with the given
	565	* {@link Type} if it exists (or NULL if not).
	566	*
	567	* @param type
	568	* the type
	569	*
	570	* @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	571	*/
	572	static public BasicSupport getSupport(Type type) {
	573	BasicSupport support = null;
	574
	575	if (type != null) {
	576	switch (type) {
	577	case SLASHDOT:
	578	support = new Slashdot();
	579	break;
	580	case PIPEDOT:
	581	support = new Pipedot();
	582	break;
	583	case LWN:
	584	support = new LWN();
	585	break;
	586	case LEMONDE:
	587	support = new LeMonde();
	588	break;
	589	case REGISTER:
	590	support = new TheRegister();
	591	break;
	592	case TOO_LINUX:
	593	support = new TooLinux();
	594	break;
	595	case ERE_NUMERIQUE:
	596	support = new EreNumerique();
	597	break;
	598	case PHORONIX:
	599	support = new Phoronix();
	600	break;
	601	case SEPT_SUR_SEPT:
	602	support = new SeptSurSept();
	603	break;
	604	case REDDIT:
	605	support = new Reddit();
	606	break;
	607	}
	608
	609	if (support != null) {
	610	support.setType(type);
	611	}
	612	}
	613
	614	return support;
	615	}
	616
	617	/**
	618	* The gopher "selector" to use for output for this type, using the
	619	* preselector.
	620	* <p>
	621	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	622	*
	623	* @param type
	624	* the type to get the selector of
	625	*
	626	* @return the selector
	627	*/
	628	static public String getSelector(Type type) {
	629	return preselector + "/" + type + "/";
	630	}
	631
	632	/**
	633	* Process the given element into text (each line is a text paragraph and
	634	* can be prepended with ">" signs to indicate a quote or sub-quote or
	635	* sub-sub-quote...).
	636	*
	637	* @param element
	638	* the element to process
	639	* @param elementProcessor
	640	* the element processor, must not be NULL
	641	*
	642	* @return text lines, each line is a paragraph
	643	*/
	644	static protected List<String> toLines(Element element,
	645	final ElementProcessor elementProcessor) {
	646	final List<String> lines = new ArrayList<String>();
	647	final StringBuilder currentLine = new StringBuilder();
	648	final List<Integer> quoted = new ArrayList<Integer>();
	649	final List<Node> ignoredNodes = new ArrayList<Node>();
	650	final List<String> footnotes = new ArrayList<String>();
	651
	652	if (element != null) {
	653	new NodeTraversor(new NodeVisitor() {
	654	@Override
	655	public void head(Node node, int depth) {
	656	String manual = null;
	657	boolean ignore = elementProcessor.ignoreNode(node)
	658	\|\| ignoredNodes.contains(node.parentNode());
	659	// Manual processing
	660	if (!ignore) {
	661	manual = elementProcessor.manualProcessing(node);
	662	if (manual != null) {
	663	currentLine.append(manual);
	664	ignore = true;
	665	}
	666	}
	667
	668	// Subtitle check
	669	if (!ignore) {
	670	String subtitle = elementProcessor.isSubtitle(node);
	671	if (subtitle != null) {
	672	subtitle = subtitle.trim();
	673	currentLine.append("\n[ " + subtitle + " ]\n");
	674	ignore = true;
	675	}
	676	}
	677
	678	// <pre> check
	679	if (!ignore) {
	680	if (node instanceof Element) {
	681	Element el = (Element) node;
	682	if ("pre".equals(el.tagName())) {
	683	currentLine.append(StringUtils
	684	.unhtml(el.text()).trim());
	685	ignore = true;
	686	}
	687	}
	688	}
	689
	690	if (ignore) {
	691	ignoredNodes.add(node);
	692	return;
	693	}
	694
	695	String prep = "";
	696	for (int i = 0; i < quoted.size(); i++) {
	697	prep += ">";
	698	}
	699	prep += " ";
	700
	701	boolean enterQuote = elementProcessor.detectQuote(node);
	702	boolean leaveQuote = quoted.contains(depth);
	703
	704	if (enterQuote) {
	705	quoted.add(depth);
	706	}
	707
	708	if (leaveQuote) {
	709	quoted.remove(Integer.valueOf(depth));
	710	}
	711
	712	if (enterQuote \|\| leaveQuote) {
	713	if (currentLine.length() > 0) {
	714	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	715	currentLine.setLength(currentLine.length() - 1);
	716	}
	717	for (String l : currentLine.toString().split("\n")) {
	718	lines.add(prep + l);
	719	}
	720	}
	721	currentLine.setLength(0);
	722	}
	723
	724	if (node instanceof Element) {
	725	Element element = (Element) node;
	726	boolean block = element.isBlock()
	727	\|\| element.tagName().equalsIgnoreCase("br");
	728	if (block && currentLine.length() > 0) {
	729	currentLine.append("\n");
	730	}
	731
	732	if (!element.absUrl("href").trim().isEmpty()) {
	733	footnotes.add(element.absUrl("href"));
	734	currentLine.append("[" + footnotes.size() + "]");
	735	}
	736	} else if (node instanceof TextNode) {
	737	TextNode textNode = (TextNode) node;
	738	String line = StringUtil.normaliseWhitespace(textNode
	739	.getWholeText());
	740
	741	currentLine.append(elementProcessor.processText(line));
	742	currentLine.append(" ");
	743	}
	744	}
	745
	746	@Override
	747	public void tail(Node node, int depth) {
	748	}
	749	}).traverse(element);
	750	}
	751
	752	if (currentLine.length() > 0) {
	753	String prep = "";
	754	for (int i = 0; i < quoted.size(); i++) {
	755	prep += ">";
	756	}
	757	prep += " ";
	758	if (currentLine.length() > 0) {
	759	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	760	currentLine.setLength(currentLine.length() - 1);
	761	}
	762	for (String l : currentLine.toString().split("\n")) {
	763	lines.add(prep + l);
	764	}
	765	}
	766	}
	767
	768	// Fix spaces and nbsp, remove multiple following blank lines
	769	List<String> linesCopy = new ArrayList<String>(lines.size());
	770	long blanks = 0;
	771	for (int i = 0; i < lines.size(); i++) {
	772	String line = lines.get(i).replace(" ", " ") // nbsp -> space
	773	.replace(" ", " ").trim();
	774	if (line.isEmpty()) {
	775	blanks++;
	776	} else {
	777	blanks = 0;
	778	}
	779
	780	if (blanks < 2) {
	781	linesCopy.add(line);
	782	}
	783	}
	784
	785	// Footnotes insertion
	786	if (footnotes.size() > 0) {
	787	linesCopy.add("");
	788	linesCopy.add("");
	789	linesCopy.add("");
	790	linesCopy.add("");
	791	for (int i = 0; i < footnotes.size(); i++) {
	792	linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
	793	}
	794	}
	795
	796	return linesCopy;
	797	}
	798
	799	/**
	800	* Reformat the date if possible.
	801	*
	802	* @param date
	803	* the input date
	804	*
	805	* @return the reformated date, or the same value if it was not parsable
	806	*/
	807	static private String date(String date) {
	808	SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
	809
	810	long epoch = 0;
	811	try {
	812	epoch = Long.parseLong(date.trim());
	813	} catch (Exception e) {
	814	epoch = 0;
	815	}
	816
	817	if (epoch > 0) {
	818	return out.format(new Date(1000 * epoch));
	819	}
	820
	821	try {
	822	Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
	823	.parse(date.trim());
	824	return out.format(dat);
	825	} catch (Exception e) {
	826	return date;
	827	}
	828	}
	829	}