[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;
import be.nikiroo.utils.StringUtils;

/**
 * Base class for website support.
 * 
 * @author niki
 */
public abstract class BasicSupport {
	/** The downloader to use for all websites. */
	protected static Downloader downloader = new Downloader("gofetcher");

	static private String preselector;

	private Type type;

	/**
	 * A short, human-readable description of the supported website, meant to
	 * be shown on the dispatcher page.
	 * <p>
	 * Should be short.
	 * 
	 * @return the description
	 */
	abstract public String getDescription();

	/**
	 * The gopher "selector" to use for output for this support.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * This simply delegates to {@link BasicSupport#getSelector(Type)} with the
	 * {@link Type} of this support.
	 * 
	 * @return the selector
	 */
	public String getSelector() {
		return getSelector(type);
	}

	/**
	 * The support type.
	 * 
	 * @return the type (NULL as long as {@link BasicSupport#setType(Type)} has
	 *         not been called)
	 */
	public Type getType() {
		return type;
	}

	/**
	 * List all the recent items, but only assure the ID and internal URL to
	 * fetch it later on (until it has been fetched, the rest of the
	 * {@link Story} is not confirmed).
	 * <p>
	 * Every {@link URL} returned by {@link BasicSupport#getUrls()} is
	 * downloaded and parsed, then each article {@link Element} found in it is
	 * converted into a {@link Story}:
	 * <ul>
	 * <li>articles with neither an ID nor a date are skipped</li>
	 * <li>articles without an ID use their (raw) date as ID, with ':' and '+'
	 * replaced by '_'</li>
	 * <li>the date is reformatted if it can be parsed</li>
	 * </ul>
	 * 
	 * @return the list of new stories
	 * 
	 * @throws IOException
	 *             in case of I/O
	 */
	public List<Story> list() throws IOException {
		List<Story> list = new ArrayList<Story>();

		for (Entry<URL, String> entry : getUrls()) {
			URL url = entry.getKey();
			String defaultCateg = entry.getValue();
			if (defaultCateg == null) {
				defaultCateg = "";
			}

			InputStream in = downloader.open(url);
			try {
				Document doc = DataUtil.load(in, "UTF-8", url.toString());
				List<Element> articles = getArticles(doc);
				for (Element article : articles) {
					String id = getArticleId(doc, article).trim();
					String title = getArticleTitle(doc, article).trim();
					String author = getArticleAuthor(doc, article).trim();
					String date = getArticleDate(doc, article).trim();
					String categ = getArticleCategory(doc, article,
							defaultCateg).trim();
					String details = getArticleDetails(doc, article).trim();
					String intUrl = getArticleIntUrl(doc, article).trim();
					String extUrl = getArticleExtUrl(doc, article).trim();
					String content = getArticleContent(doc, article).trim();

					// Nothing to identify this article with: skip it
					if (id.isEmpty() && date.isEmpty()) {
						continue;
					}

					// Fall back to the raw date, without the characters
					// ':' and '+'
					if (id.isEmpty()) {
						id = date.replace(":", "_").replace("+", "_");
					}

					date = date(date);

					list.add(new Story(getType(), id, title, author, date,
							categ, details, intUrl, extUrl, content));
				}
			} finally {
				// Same handling as in fetch(Story): never leak the
				// download stream, even if the parsing fails
				if (in != null) {
					in.close();
				}
			}
		}

		return list;
	}

	/**
	 * The {@link URL}s to process for this website.
	 * <p>
	 * In {@link BasicSupport#list()}, the key of each entry is the {@link URL}
	 * to download and the value is the default category passed to
	 * {@link BasicSupport#getArticleCategory(Document, Element, String)} (a
	 * NULL value is replaced by an empty {@link String}).
	 * <p>
	 * Must not return NULL (the result is iterated over as is).
	 * 
	 * @return the list of {@link URL}s
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract protected List<Entry<URL, String>> getUrls() throws IOException;

	/**
	 * The article {@link Element}s of this document.
	 * <p>
	 * Must not return NULL (the result is iterated over as is by
	 * {@link BasicSupport#list()}); return an empty list if there are no
	 * articles.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * 
	 * @return the articles
	 */
	abstract protected List<Element> getArticles(Document doc);

	/**
	 * The ID of the article (defaults to the date element if empty).
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); if it is empty, the article date is used
	 * as ID instead (with ':' and '+' replaced by '_'), and the article is
	 * skipped if the date is empty too.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the ID
	 */
	abstract protected String getArticleId(Document doc, Element article);

	/**
	 * The article title to display.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the title
	 */
	abstract protected String getArticleTitle(Document doc, Element article);

	/**
	 * The optional article author.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); return an empty {@link String} if there is
	 * no author.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the author
	 */
	abstract protected String getArticleAuthor(Document doc, Element article);

	/**
	 * The optional article date.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); return an empty {@link String} if there is
	 * no date.
	 * <p>
	 * A positive number is interpreted as a number of seconds since the epoch
	 * and a value in the "yyyy-MM-dd'T'HH:mm:ssXXX" format is parsed as such;
	 * both are then stored as "yyyy/MM/dd". Any other value is kept as is.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the date
	 */
	abstract protected String getArticleDate(Document doc, Element article);

	/**
	 * The optional article category.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * @param currentCategory
	 *            the currently listed category if any (when called from
	 *            {@link BasicSupport#list()}, it is never NULL: a missing
	 *            category is passed as an empty {@link String})
	 * 
	 * @return the category
	 */
	abstract protected String getArticleCategory(Document doc, Element article,
			String currentCategory);

	/**
	 * The optional details of the article (can replace the date, author and
	 * category, for instance).
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); return an empty {@link String} if there
	 * are no details.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the details
	 */
	abstract protected String getArticleDetails(Document doc, Element article);

	/**
	 * The (required) {@link URL} that points to the news page on the supported
	 * website.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}).
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the internal {@link URL}
	 */
	abstract protected String getArticleIntUrl(Document doc, Element article);

	/**
	 * The optional {@link URL} that points to an external website for more
	 * information.
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); return an empty {@link String} if there is
	 * no external {@link URL}.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the external {@link URL}
	 */
	abstract protected String getArticleExtUrl(Document doc, Element article);

	/**
	 * The optional article short-content (not the full content, that will be
	 * fetched by {@link BasicSupport#fetch(Story)}).
	 * <p>
	 * Must not return NULL (the result is trimmed by
	 * {@link BasicSupport#list()}); return an empty {@link String} if there is
	 * no short content.
	 * 
	 * @param doc
	 *            the main document for the current category
	 * @param article
	 *            the article to look into
	 * 
	 * @return the short content
	 */
	abstract protected String getArticleContent(Document doc, Element article);

	/**
	 * Download the internal page of the given {@link Story} and complete the
	 * story with its full content and its comments (the comments can be empty,
	 * but not NULL).
	 * <p>
	 * If no full article can be extracted from the page, the short content of
	 * the {@link Story} is used as full content.
	 * 
	 * @param story
	 *            the story to fetch the full content and comments of
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	public void fetch(Story story) throws IOException {
		URL url = new URL(story.getUrlInternal());
		InputStream in = downloader.open(url);
		try {
			Document doc = DataUtil.load(in, "UTF-8", url.toString());

			String fullContent = "";
			Element article = getFullArticle(doc);
			if (article != null) {
				StringBuilder text = new StringBuilder();
				ElementProcessor processor = getElementProcessorFullArticle();
				if (processor == null) {
					// No processor: plain text of the whole element
					text.append(article.text());
				} else {
					for (String line : toLines(article, processor)) {
						text.append(line).append("\n");
					}
				}

				// A single break per line is too tight: double every
				// break, then reduce the longest runs (two passes)
				String spaced = text.toString().replace("\n", "\n\n");
				spaced = spaced.replace("\n\n\n\n", "\n\n");
				spaced = spaced.replace("\n\n\n\n", "\n\n");
				fullContent = spaced.trim();
			}

			// Nothing extracted: fall back to the short content
			if (fullContent.isEmpty()) {
				fullContent = story.getContent();
			}

			story.setFullContent(fullContent);

			List<Element> posts = getFullArticleCommentPosts(doc, url);
			story.setComments(getComments(doc, posts));
		} finally {
			if (in != null) {
				in.close();
			}
		}
	}

	/**
	 * Return the full article if available.
	 * <p>
	 * If NULL is returned, {@link BasicSupport#fetch(Story)} uses the short
	 * content of the {@link Story} as its full content.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * 
	 * @return the article or NULL
	 */
	abstract protected Element getFullArticle(Document doc);

	/**
	 * Return the list of comment {@link Element}s from this optional container
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * <p>
	 * Can return NULL, which is handled like an empty list.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param intUrl
	 *            the internal {@link URL} this article was taken from (the
	 *            {@link URL} from the supported website)
	 * 
	 * @return the list of comment posts
	 */
	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
			URL intUrl);

	/**
	 * The {@link ElementProcessor} to use to convert the main article element
	 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()}.
	 * <p>
	 * It is only queried by {@link BasicSupport#fetch(Story)} when a full
	 * article element was found.
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorFullArticle();

	/**
	 * Build the {@link Comment}s (and, recursively, their sub-comments) that
	 * correspond to the given comment elements.
	 * <p>
	 * A comment without ID uses its (raw) date as ID, and comments that end up
	 * empty are not kept.
	 * 
	 * @param doc
	 *            the document we work on
	 * @param posts
	 *            the comment elements (can be NULL)
	 * 
	 * @return the converted {@link Comment}s (can be empty, never NULL)
	 */
	private List<Comment> getComments(Document doc, List<Element> posts) {
		List<Comment> result = new ArrayList<Comment>();
		if (posts == null) {
			return result;
		}

		for (Element post : posts) {
			String id = getCommentId(post).trim();
			String author = getCommentAuthor(post).trim();
			String title = getCommentTitle(post).trim();
			String rawDate = getCommentDate(post).trim();

			// The raw (not reformatted) date is the fallback ID
			if (id.isEmpty()) {
				id = rawDate;
			}

			String date = date(rawDate);

			List<String> content = new ArrayList<String>();
			Element body = getCommentContentElement(post);
			if (body != null) {
				ElementProcessor processor = getElementProcessorComment();
				if (processor == null) {
					content = Arrays.asList(body.text().split("\n"));
				} else {
					content.addAll(toLines(body, processor));
				}
			}

			Comment comment = new Comment(id, author, title, date, content);

			// Sub-comments are looked for inside the current post
			List<Element> subPosts = getCommentCommentPosts(doc, post);
			comment.addAll(getComments(doc, subPosts));

			if (!comment.isEmpty()) {
				result.add(comment);
			}
		}

		return result;
	}

	/**
	 * Return the list of subcomment {@link Element}s from this comment element
	 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	 * <p>
	 * Can return NULL, which is handled like an empty list.
	 * 
	 * @param doc
	 *            the (full article) document to work on
	 * @param container
	 *            the container (a comment {@link Element})
	 * 
	 * @return the list of comment posts
	 */
	abstract protected List<Element> getCommentCommentPosts(Document doc,
			Element container);

	/**
	 * Compute the ID of the given comment element.
	 * <p>
	 * Must not return NULL (the result is trimmed); if it is empty, the date
	 * of the comment is used as ID instead.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the ID
	 */
	abstract protected String getCommentId(Element post);

	/**
	 * Compute the author of the given comment element.
	 * <p>
	 * Must not return NULL (the result is trimmed).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the author
	 */
	abstract protected String getCommentAuthor(Element post);

	/**
	 * Compute the title of the given comment element.
	 * <p>
	 * Must not return NULL (the result is trimmed).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the title
	 */
	abstract protected String getCommentTitle(Element post);

	/**
	 * Compute the date of the given comment element.
	 * <p>
	 * Must not return NULL (the result is trimmed). The value is reformatted
	 * to "yyyy/MM/dd" if it is a positive number of seconds since the epoch
	 * or if it follows the "yyyy-MM-dd'T'HH:mm:ssXXX" format, and kept as is
	 * otherwise.
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the date
	 */
	abstract protected String getCommentDate(Element post);

	/**
	 * Get the main content element of the given comment element, which can be
	 * NULL (the comment then has no content lines).
	 * 
	 * @param post
	 *            the comment element
	 * 
	 * @return the element, or NULL
	 */
	abstract protected Element getCommentContentElement(Element post);

	/**
	 * The {@link ElementProcessor} to use to convert the main comment element
	 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
	 * <p>
	 * See {@link BasicElementProcessor} for a working, basic implementation.
	 * <p>
	 * Can be NULL to simply use {@link Element#text()} (split on line breaks).
	 * 
	 * @return the processor, or NULL
	 */
	abstract protected ElementProcessor getElementProcessorComment();

	/**
	 * Set the support type (the value returned by
	 * {@link BasicSupport#getType()} and used to compute the selector).
	 * 
	 * @param type
	 *            the new type
	 */
	protected void setType(Type type) {
		this.type = type;
	}

	/**
	 * The {@link String} to prepend to the selector (the selector will be
	 * constructed as "this string" then "/type/").
	 * <p>
	 * This value is static: it is shared by all the supports.
	 * 
	 * @param preselector
	 *            the preselector to set
	 */
	static public void setPreselector(String preselector) {
		BasicSupport.preselector = preselector;
	}

	/**
	 * Create a new {@link BasicSupport} for the given {@link Type}, already
	 * configured with this {@link Type}.
	 * 
	 * @param type
	 *            the type (can be NULL)
	 * 
	 * @return a compatible {@link BasicSupport} if it exists (or NULL if not,
	 *         or if the given type is NULL)
	 */
	static public BasicSupport getSupport(Type type) {
		if (type == null) {
			return null;
		}

		BasicSupport created;
		switch (type) {
		case SLASHDOT:
			created = new Slashdot();
			break;
		case PIPEDOT:
			created = new Pipedot();
			break;
		case LWN:
			created = new LWN();
			break;
		case LEMONDE:
			created = new LeMonde();
			break;
		case REGISTER:
			created = new TheRegister();
			break;
		case TOO_LINUX:
			created = new TooLinux();
			break;
		case ERE_NUMERIQUE:
			created = new EreNumerique();
			break;
		case PHORONIX:
			created = new Phoronix();
			break;
		default:
			// No support available for this type
			created = null;
			break;
		}

		if (created == null) {
			return null;
		}

		created.setType(type);
		return created;
	}

	/**
	 * The gopher "selector" to use for output for this type, using the
	 * preselector.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * The selector is the preselector followed by "/", the type and "/"; a
	 * preselector that was never set (NULL) is handled like an empty one
	 * instead of being written out as the text "null".
	 * 
	 * @param type
	 *            the type to get the selector of
	 * 
	 * @return the selector
	 */
	static public String getSelector(Type type) {
		String prefix = (preselector == null) ? "" : preselector;
		return prefix + "/" + type + "/";
	}

	/**
	 * Process the given element into text (each line is a text paragraph and
	 * can be prepended with ">" signs to indicate a quote or sub-quote or
	 * sub-sub-quote...).
	 * <p>
	 * The element is traversed node by node:
	 * <ul>
	 * <li>nodes ignored by the processor (and the nodes whose parent was
	 * ignored) are skipped</li>
	 * <li>nodes handled by the processor (manual processing, subtitles) and
	 * &lt;pre&gt; elements are converted in one go, and their children are
	 * then skipped</li>
	 * <li>block elements and &lt;br&gt; start a new line</li>
	 * <li>elements with a non-empty absolute "href" are replaced by a "[n]"
	 * marker, and the link itself is listed at the end of the result</li>
	 * </ul>
	 * <p>
	 * Finally, the lines are trimmed, non-breaking spaces are converted to
	 * normal spaces and consecutive blank lines are reduced to a single one.
	 * 
	 * @param element
	 *            the element to process (if NULL, the result is empty)
	 * @param elementProcessor
	 *            the element processor, must not be NULL
	 * 
	 * @return text lines, each line is a paragraph
	 */
	static protected List<String> toLines(Element element,
			final ElementProcessor elementProcessor) {
		final List<String> lines = new ArrayList<String>();
		final StringBuilder currentLine = new StringBuilder();
		// Depths at which a quote was entered (its size is the quote level)
		final List<Integer> quoted = new ArrayList<Integer>();
		final List<Node> ignoredNodes = new ArrayList<Node>();
		final List<String> footnotes = new ArrayList<String>();

		if (element != null) {
			new NodeTraversor(new NodeVisitor() {
				@Override
				public void head(Node node, int depth) {
					// Children of an ignored node are ignored, too
					boolean ignore = elementProcessor.ignoreNode(node)
							|| ignoredNodes.contains(node.parentNode());

					// Manual processing
					if (!ignore) {
						String manual = elementProcessor
								.manualProcessing(node);
						if (manual != null) {
							currentLine.append(manual);
							ignore = true;
						}
					}

					// Subtitle check
					if (!ignore) {
						String subtitle = elementProcessor.isSubtitle(node);
						if (subtitle != null) {
							subtitle = subtitle.trim();
							currentLine.append("\n[ " + subtitle + " ]\n");
							ignore = true;
						}
					}

					// <pre> check
					if (!ignore && node instanceof Element) {
						Element pre = (Element) node;
						if ("pre".equals(pre.tagName())) {
							currentLine.append(StringUtils.unhtml(pre.text())
									.trim());
							ignore = true;
						}
					}

					if (ignore) {
						ignoredNodes.add(node);
						return;
					}

					// The pending text belongs to the quote level we had
					// BEFORE this node, so compute the prefix first
					String prefix = quotePrefix(quoted.size());

					boolean enterQuote = elementProcessor.detectQuote(node);
					boolean leaveQuote = quoted.contains(depth);

					if (enterQuote) {
						quoted.add(depth);
					}

					if (leaveQuote) {
						// Integer.valueOf: remove the value, not the index
						quoted.remove(Integer.valueOf(depth));
					}

					if (enterQuote || leaveQuote) {
						flushParagraphs(currentLine, prefix, lines);
						currentLine.setLength(0);
					}

					if (node instanceof Element) {
						Element el = (Element) node;
						boolean block = el.isBlock()
								|| el.tagName().equalsIgnoreCase("br");
						if (block && currentLine.length() > 0) {
							currentLine.append("\n");
						}

						// Links are replaced by a footnote marker
						if (!el.absUrl("href").trim().isEmpty()) {
							footnotes.add(el.absUrl("href"));
							currentLine.append("[" + footnotes.size() + "]");
						}
					} else if (node instanceof TextNode) {
						TextNode textNode = (TextNode) node;
						String line = StringUtil.normaliseWhitespace(textNode
								.getWholeText());

						currentLine.append(elementProcessor.processText(line));
						currentLine.append(" ");
					}
				}

				@Override
				public void tail(Node node, int depth) {
				}
			}).traverse(element);
		}

		// Remaining text, at the quote level reached at the end
		flushParagraphs(currentLine, quotePrefix(quoted.size()), lines);

		// Fix spaces and nbsp, remove multiple following blank lines
		List<String> linesCopy = new ArrayList<String>(lines.size());
		long blanks = 0;
		for (String rawLine : lines) {
			// The nbsp is written as an escape so it cannot be mistaken
			// for (or silently converted into) a normal space
			String line = rawLine.replace("\u00A0", " ") // nbsp -> space
					.replace("  ", " ").trim();
			if (line.isEmpty()) {
				blanks++;
			} else {
				blanks = 0;
			}

			if (blanks < 2) {
				linesCopy.add(line);
			}
		}

		// Footnotes insertion
		if (footnotes.size() > 0) {
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			linesCopy.add("");
			for (int i = 0; i < footnotes.size(); i++) {
				linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
			}
		}

		return linesCopy;
	}

	/**
	 * Compute the prefix of a line for the given quote level: one ">" per
	 * level, then a space.
	 * 
	 * @param level
	 *            the quote level (0 for no quote)
	 * 
	 * @return the prefix
	 */
	static private String quotePrefix(int level) {
		StringBuilder prefix = new StringBuilder();
		for (int i = 0; i < level; i++) {
			prefix.append('>');
		}
		prefix.append(' ');

		return prefix.toString();
	}

	/**
	 * Add the pending text (if any) to the given lines, one line per
	 * paragraph, each of them prepended with the given prefix.
	 * <p>
	 * A single trailing line break is removed from the pending text, which is
	 * otherwise not reset.
	 * 
	 * @param currentLine
	 *            the pending text
	 * @param prefix
	 *            the prefix to prepend to each line
	 * @param lines
	 *            the lines to add to
	 */
	static private void flushParagraphs(StringBuilder currentLine,
			String prefix, List<String> lines) {
		if (currentLine.length() == 0) {
			return;
		}

		int last = currentLine.length() - 1;
		if (currentLine.charAt(last) == '\n') {
			currentLine.setLength(last);
		}

		for (String paragraph : currentLine.toString().split("\n")) {
			lines.add(prefix + paragraph);
		}
	}

	/**
	 * Reformat the date if possible.
	 * <p>
	 * Two inputs are understood: a positive number (a number of seconds since
	 * the epoch) and a date in the "yyyy-MM-dd'T'HH:mm:ssXXX" format. Both
	 * are converted to "yyyy/MM/dd".
	 * 
	 * @param date
	 *            the input date
	 * 
	 * @return the reformated date, or the same value if it was not parsable
	 */
	static private String date(String date) {
		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
		String trimmed = date.trim();

		// First try: a number of seconds since the epoch
		long epoch;
		try {
			epoch = Long.parseLong(trimmed);
		} catch (NumberFormatException e) {
			epoch = 0;
		}

		if (epoch > 0) {
			return out.format(new Date(1000 * epoch));
		}

		// Second try: a date/time with a time zone offset
		SimpleDateFormat in = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX");
		try {
			return out.format(in.parse(trimmed));
		} catch (ParseException e) {
			// Not parsable: keep the value as it was given
			return date;
		}
	}
}
Commit	Line	Data
73785268 NR	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
3e62b034 NR	4	import java.io.InputStream;
3e62b034 NR	5	import java.net.URL;
b34d1f35 NR	6	import java.text.ParseException;
b34d1f35 NR	7	import java.text.SimpleDateFormat;
27008a87	8	import java.util.ArrayList;
3e62b034	9	import java.util.Arrays;
b34d1f35	10	import java.util.Date;
73785268	11	import java.util.List;
3e62b034	12	import java.util.Map.Entry;
73785268	13
3e62b034	14	import org.jsoup.helper.DataUtil;
27008a87	15	import org.jsoup.helper.StringUtil;
3e62b034	16	import org.jsoup.nodes.Document;
27008a87 NR	17	import org.jsoup.nodes.Element;
	18	import org.jsoup.nodes.Node;
	19	import org.jsoup.nodes.TextNode;
27008a87 NR	20	import org.jsoup.select.NodeTraversor;
	21	import org.jsoup.select.NodeVisitor;
	22
3e62b034	23	import be.nikiroo.gofetch.data.Comment;
73785268	24	import be.nikiroo.gofetch.data.Story;
136ab801	25	import be.nikiroo.utils.Downloader;
3e62b034	26	import be.nikiroo.utils.StringUtils;
73785268	27
b34d1f35 NR	28	/**
	29	* Base class for website support.
	30	*
	31	* @author niki
	32	*/
73785268	33	public abstract class BasicSupport {
b34d1f35	34	/** The downloader to use for all websites. */
136ab801 NR	35	protected static Downloader downloader = new Downloader("gofetcher");
136ab801 NR	36
3e62b034 NR	37	static private String preselector;
	38
	39	private Type type;
	40
	41	/**
	42	* The website textual description, to add in the dispatcher page.
	43	* <p>
	44	* Should be short.
	45	*
	46	* @return the description
	47	*/
	48	abstract public String getDescription();
	49
b34d1f35	50	/**
3e62b034 NR	51	* The gopher "selector" to use for output.
	52	* <p>
	53	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	54	*
	55	* @return the selector
	56	*/
	57	public String getSelector() {
	58	return getSelector(type);
73785268 NR	59	}
73785268 NR	60
20217360	61	/**
3e62b034 NR	62	* The support type.
	63	*
	64	* @return the type
	65	*/
	66	public Type getType() {
	67	return type;
27008a87 NR	68	}
27008a87 NR	69
20217360	70	/**
3e62b034 NR	71	* List all the recent items, but only assure the ID and internal URL to
	72	* fetch it later on (until it has been fetched, the rest of the
	73	* {@link Story} is not confirmed).
20217360	74	*
3e62b034 NR	75	* @return the list of new stories
	76	*
	77	* @throws IOException
	78	* in case of I/O
20217360	79	*/
3e62b034 NR	80	public List<Story> list() throws IOException {
	81	List<Story> list = new ArrayList<Story>();
	82
	83	for (Entry<URL, String> entry : getUrls()) {
	84	URL url = entry.getKey();
	85	String defaultCateg = entry.getValue();
	86	if (defaultCateg == null) {
	87	defaultCateg = "";
	88	}
20217360	89
3e62b034 NR	90	InputStream in = downloader.open(url);
	91	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	92	List<Element> articles = getArticles(doc);
	93	for (Element article : articles) {
	94	String id = getArticleId(doc, article).trim();
	95	String title = getArticleTitle(doc, article).trim();
	96	String author = getArticleAuthor(doc, article).trim();
	97	String date = getArticleDate(doc, article).trim();
	98	String categ = getArticleCategory(doc, article, defaultCateg)
	99	.trim();
	100	String details = getArticleDetails(doc, article).trim();
	101	String intUrl = getArticleIntUrl(doc, article).trim();
	102	String extUrl = getArticleExtUrl(doc, article).trim();
	103	String content = getArticleContent(doc, article).trim();
	104
	105	if (id.isEmpty() && date.isEmpty()) {
	106	continue;
	107	}
20217360	108
3e62b034 NR	109	if (id.isEmpty()) {
	110	id = date.replace(":", "_").replace("+", "_");
	111	}
20217360	112
3e62b034	113	date = date(date);
b9afb12e	114
3e62b034 NR	115	list.add(new Story(getType(), id, title, author, date, categ,
	116	details, intUrl, extUrl, content));
	117	}
b9afb12e	118	}
3e62b034 NR	119
3e62b034 NR	120	return list;
20217360 NR	121	}
20217360 NR	122
3e62b034 NR	123	/**
	124	* The {@link URL}s to process for this website.
	125	*
	126	* @return the list of {@link URL}s
	127	*
	128	* @throws IOException
	129	* in case of I/O error
	130	*/
	131	abstract protected List<Entry<URL, String>> getUrls() throws IOException;
73785268	132
3e62b034 NR	133	/**
	134	* The article {@link Element}s of this document.
	135	*
	136	* @param doc
	137	* the main document for the current category
	138	*
	139	* @return the articles
	140	*/
	141	abstract protected List<Element> getArticles(Document doc);
73785268	142
100a8395	143	/**
3e62b034	144	* The ID of the article (defaults to the date element if empty).
100a8395	145	*
3e62b034 NR	146	* @param doc
	147	* the main document for the current category
	148	* @param article
	149	* the article to look into
100a8395	150	*
3e62b034 NR	151	* @return the ID
	152	*/
	153	abstract protected String getArticleId(Document doc, Element article);
	154
	155	/**
	156	* The article title to display.
	157	*
	158	* @param doc
	159	* the main document for the current category
	160	* @param article
	161	* the article to look into
	162	*
	163	* @return the title
	164	*/
	165	abstract protected String getArticleTitle(Document doc, Element article);
	166
	167	/**
	168	* The optional article author.
	169	*
	170	* @param doc
	171	* the main document for the current category
	172	* @param article
	173	* the article to look into
	174	*
	175	* @return the author
	176	*/
	177	abstract protected String getArticleAuthor(Document doc, Element article);
	178
	179	/**
	180	* The optional article date.
	181	*
	182	* @param doc
	183	* the main document for the current category
	184	* @param article
	185	* the article to look into
	186	*
	187	* @return the date
	188	*/
	189	abstract protected String getArticleDate(Document doc, Element article);
	190
	191	/**
	192	* the optional article category.
	193	*
	194	* @param doc
	195	* the main document for the current category
	196	* @param article
	197	* the article to look into
	198	* @param currentCategory
	199	* the currently listed category if any (can be NULL)
	200	*
	201	* @return the category
100a8395	202	*/
3e62b034 NR	203	abstract protected String getArticleCategory(Document doc, Element article,
	204	String currentCategory);
	205
	206	/**
	207	* the optional details of the article (can replace the date, author and
	208	* category, for instance).
	209	*
	210	* @param doc
	211	* the main document for the current category
	212	* @param article
	213	* the article to look into
	214	*
	215	* @return the details
	216	*/
	217	abstract protected String getArticleDetails(Document doc, Element article);
	218
	219	/**
	220	* The (required) {@link URL} that points to the news page on the supported
	221	* website.
	222	*
	223	* @param doc
	224	* the main document for the current category
	225	* @param article
	226	* the article to look into
	227	*
	228	* @return the internal {@link URL}
	229	*/
	230	abstract protected String getArticleIntUrl(Document doc, Element article);
	231
	232	/**
	233	* the optional {@link URL} that points to an external website for more
	234	* information.
	235	*
	236	* @param doc
	237	* the main document for the current category
	238	* @param article
	239	* the article to look into
	240	*
	241	* @return the external {@link URL}
	242	*/
	243	abstract protected String getArticleExtUrl(Document doc, Element article);
	244
	245	/**
	246	* The optional article short-content (not the full content, that will be
	247	* fetched by {@link BasicSupport#fetch(Story)}).
	248	*
	249	* @param doc
	250	* the main document for the current category
	251	* @param article
	252	* the article to look into
	253	*
	254	* @return the short content
	255	*/
	256	abstract protected String getArticleContent(Document doc, Element article);
73785268	257
5c056aad NR	258	/**
	259	* Fetch the full article content as well as all the comments associated to
	260	* this {@link Story}, if any (can be empty, but not NULL).
	261	*
	262	* @param story
	263	* the story to fetch the comments of
	264	*
	265	* @throws IOException
	266	* in case of I/O error
	267	*/
3e62b034 NR	268	public void fetch(Story story) throws IOException {
	269	String fullContent = "";
	270
	271	URL url = new URL(story.getUrlInternal());
	272	InputStream in = downloader.open(url);
	273	try {
	274	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	275	Element article = getFullArticle(doc);
	276	if (article != null) {
	277	StringBuilder builder = new StringBuilder();
	278	ElementProcessor eProc = getElementProcessorFullArticle();
	279	if (eProc != null) {
	280	for (String line : toLines(article, eProc)) {
	281	builder.append(line + "\n");
	282	}
	283	} else {
	284	builder.append(article.text());
	285	}
	286
	287	// Content is too tight with a single break per line:
	288	fullContent = builder.toString().replace("\n", "\n\n") //
	289	.replace("\n\n\n\n", "\n\n") //
	290	.replace("\n\n\n\n", "\n\n") //
	291	.trim();
	292	}
	293
	294	if (fullContent.isEmpty()) {
	295	fullContent = story.getContent();
	296	}
	297
	298	story.setFullContent(fullContent);
	299	story.setComments(getComments(doc,
	300	getFullArticleCommentPosts(doc, url)));
	301	} finally {
	302	if (in != null) {
	303	in.close();
	304	}
	305	}
	306	}
73785268	307
b34d1f35	308	/**
3e62b034	309	* Return the full article if available.
b34d1f35	310	*
3e62b034 NR	311	* @param doc
	312	* the (full article) document to work on
	313	*
	314	* @return the article or NULL
b34d1f35	315	*/
3e62b034	316	abstract protected Element getFullArticle(Document doc);
2d95a873	317
b34d1f35	318	/**
3e62b034 NR	319	* Return the list of comment {@link Element}s from this optional container
	320	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	321	*
	322	* @param doc
	323	* the (full article) document to work on
	324	* @param intUrl
	325	* the internal {@link URL} this article wa taken from (the
	326	* {@link URL} from the supported website)
	327	*
	328	* @return the list of comment posts
	329	*/
	330	abstract protected List<Element> getFullArticleCommentPosts(Document doc,
	331	URL intUrl);
	332
	333	/**
	334	* The {@link ElementProcessor} to use to convert the main article element
	335	* (see {@link BasicSupport#getFullArticle(Document)}) into text.
b34d1f35	336	* <p>
3e62b034 NR	337	* See {@link BasicElementProcessor} for a working, basic implementation.
	338	* <p>
	339	* Can be NULL to simply use {@link Element#text()}.
b34d1f35	340	*
3e62b034	341	* @return the processor, or NULL
b34d1f35	342	*/
3e62b034	343	abstract protected ElementProcessor getElementProcessorFullArticle();
73785268	344
b34d1f35	345	/**
3e62b034	346	* Convert the comment elements into {@link Comment}s
b34d1f35	347	*
3e62b034 NR	348	* @param doc
	349	* the document we work on
	350	* @param posts
	351	* the comment elements
	352	*
	353	* @return the converted {@link Comment}s
b34d1f35	354	*/
3e62b034 NR	355	private List<Comment> getComments(Document doc, List<Element> posts) {
	356	List<Comment> comments = new ArrayList<Comment>();
	357	if (posts != null) {
	358	for (Element post : posts) {
	359	String id = getCommentId(post).trim();
	360	String author = getCommentAuthor(post).trim();
	361	String title = getCommentTitle(post).trim();
	362	String date = getCommentDate(post).trim();
	363
	364	List<String> content = new ArrayList<String>();
	365
	366	if (id.isEmpty()) {
	367	id = date;
	368	}
	369
	370	date = date(date);
	371
	372	Element contentE = getCommentContentElement(post);
	373	if (contentE != null) {
	374	ElementProcessor eProc = getElementProcessorComment();
	375	if (eProc != null) {
	376	for (String line : toLines(contentE, eProc)) {
	377	content.add(line);
	378	}
	379	} else {
	380	content = Arrays.asList(contentE.text().split("\n"));
	381	}
	382	}
	383
	384	Comment comment = new Comment(id, author, title, date, content);
	385	comment.addAll(getComments(doc,
	386	getCommentCommentPosts(doc, post)));
	387
	388	if (!comment.isEmpty()) {
	389	comments.add(comment);
	390	}
	391	}
	392	}
	393
	394	return comments;
73785268 NR	395	}
73785268 NR	396
3e62b034 NR	397	/**
	398	* Return the list of subcomment {@link Element}s from this comment element
	399	* -- must <b>NOT</b> return the "container" as a comment {@link Element}.
	400	*
	401	* @param doc
	402	* the (full article) document to work on
	403	* @param container
	404	* the container (a comment {@link Element})
	405	*
	406	* @return the list of comment posts
	407	*/
	408	abstract protected List<Element> getCommentCommentPosts(Document doc,
	409	Element container);
	410
	411	/**
	412	* Compute the ID of the given comment element.
	413	*
	414	* @param post
	415	* the comment element
	416	*
	417	* @return the ID
	418	*/
	419	abstract protected String getCommentId(Element post);
	420
	421	/**
	422	* Compute the author of the given comment element.
		* <p>
		* The value is trimmed before use, so it must not be NULL.
	423	*
	424	* @param post
	425	* the comment element
	426	*
	427	* @return the author (must not be NULL)
	428	*/
	429	abstract protected String getCommentAuthor(Element post);
	430
	431	/**
	432	* Compute the title of the given comment element.
		* <p>
		* The value is trimmed before use, so it must not be NULL.
	433	*
	434	* @param post
	435	* the comment element
	436	*
	437	* @return the title (must not be NULL)
	438	*/
	439	abstract protected String getCommentTitle(Element post);
	440
	441	/**
	442	* Compute the date of the given comment element.
		* <p>
		* The value is trimmed before use, so it must not be NULL. It is then
		* reformatted as "yyyy/MM/dd" when it is either a positive number of
		* seconds since the epoch or a "yyyy-MM-dd'T'HH:mm:ssXXX" timestamp; any
		* other value is kept as is.
	443	*
	444	* @param post
	445	* the comment element
	446	*
	447	* @return the date (must not be NULL)
	448	*/
	449	abstract protected String getCommentDate(Element post);
	450
	451	/**
	452	* Get the main content element of the given comment element, which can be
		* NULL (the comment is then created without any content line).
	453	*
	454	* @param post
	455	* the comment element
	456	*
	457	* @return the element, or NULL if there is none
	458	*/
	459	abstract protected Element getCommentContentElement(Element post);
	460
461	/**
462	* The {@link ElementProcessor} to use to convert the main comment element
463	* (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
464	* <p>
465	* See {@link BasicElementProcessor} for a working, basic implementation.
466	* <p>
467	* Can be NULL to simply use {@link Element#text()}, which is then split
	* into lines on each line break.
468	*
469	* @return the processor, or NULL to use the plain text of the element
470	*/
471	abstract protected ElementProcessor getElementProcessorComment();
472
b34d1f35 NR	473	/**
	474	* The support type.
	475	*
	476	* @param type
	477	* the new type
	478	*/
73785268 NR	479	protected void setType(Type type) {
	480	this.type = type;
	481	}
	482
	483	/**
b34d1f35 NR	484	* The {@link String} to append to the selector (the selector will be
	485	* constructed as "this string" then "/type/".
	486	*
73785268 NR	487	* @param preselector
	488	* the preselector to set
	489	*/
	490	static public void setPreselector(String preselector) {
	491	BasicSupport.preselector = preselector;
	492	}
	493
20217360 NR	494	/**
	495	* Return a {@link BasicSupport} that is compatible with the given
	496	* {@link Type} if it exists (or NULL if not).
	497	*
	498	* @param type
	499	* the type
	500	*
	501	* @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	502	*/
73785268 NR	503	static public BasicSupport getSupport(Type type) {
	504	BasicSupport support = null;
	505
	506	if (type != null) {
	507	switch (type) {
	508	case SLASHDOT:
	509	support = new Slashdot();
	510	break;
2d95a873 NR	511	case PIPEDOT:
	512	support = new Pipedot();
	513	break;
eaaeae39 NR	514	case LWN:
	515	support = new LWN();
	516	break;
100a8395 NR	517	case LEMONDE:
	518	support = new LeMonde();
	519	break;
d28c4aac NR	520	case REGISTER:
	521	support = new TheRegister();
	522	break;
b34d1f35	523	case TOO_LINUX:
cd555a1e NR	524	support = new TooLinux();
cd555a1e NR	525	break;
31755801 NR	526	case ERE_NUMERIQUE:
	527	support = new EreNumerique();
	528	break;
127e065f NR	529	case PHORONIX:
	530	support = new Phoronix();
	531	break;
73785268 NR	532	}
	533
	534	if (support != null) {
	535	support.setType(type);
	536	}
	537	}
	538
	539	return support;
	540	}
	541
b34d1f35 NR	542	/**
	543	* The gopher "selector" to use for output for this type, using the
	544	* preselector.
	545	* <p>
	546	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	547	*
	548	* @param type
	549	* the type to get the selector of
	550	*
	551	* @return the selector
	552	*/
73785268 NR	553	static public String getSelector(Type type) {
	554	return preselector + "/" + type + "/";
	555	}
	556
20217360 NR	557	/**
	558	* Process the given element into text (each line is a text paragraph and
	559	* can be prepended with ">" signs to indicate a quote or sub-quote or
	560	* sub-sub-quote...).
	561	*
	562	* @param element
	563	* the element to process
	564	* @param elementProcessor
	565	* the element processor, must not be NULL
	566	*
	567	* @return text lines, each line is a paragraph
	568	*/
27008a87	569	static protected List<String> toLines(Element element,
20217360	570	final ElementProcessor elementProcessor) {
27008a87 NR	571	final List<String> lines = new ArrayList<String>();
	572	final StringBuilder currentLine = new StringBuilder();
	573	final List<Integer> quoted = new ArrayList<Integer>();
	574	final List<Node> ignoredNodes = new ArrayList<Node>();
3e62b034	575	final List<String> footnotes = new ArrayList<String>();
27008a87 NR	576
	577	if (element != null) {
	578	new NodeTraversor(new NodeVisitor() {
	579	@Override
	580	public void head(Node node, int depth) {
100a8395	581	String manual = null;
20217360	582	boolean ignore = elementProcessor.ignoreNode(node)
100a8395	583	\|\| ignoredNodes.contains(node.parentNode());
b9afb12e	584	// Manual processing
100a8395	585	if (!ignore) {
20217360	586	manual = elementProcessor.manualProcessing(node);
100a8395 NR	587	if (manual != null) {
	588	currentLine.append(manual);
	589	ignore = true;
	590	}
	591	}
	592
b9afb12e NR	593	// Subtitle check
	594	if (!ignore) {
	595	String subtitle = elementProcessor.isSubtitle(node);
	596	if (subtitle != null) {
	597	subtitle = subtitle.trim();
	598	currentLine.append("\n[ " + subtitle + " ]\n");
	599	ignore = true;
	600	}
	601	}
	602
3e62b034 NR	603	// <pre> check
	604	if (!ignore) {
	605	if (node instanceof Element) {
	606	Element el = (Element) node;
	607	if ("pre".equals(el.tagName())) {
	608	currentLine.append(StringUtils
	609	.unhtml(el.text()).trim());
	610	ignore = true;
	611	}
	612	}
	613	}
	614
100a8395	615	if (ignore) {
27008a87 NR	616	ignoredNodes.add(node);
	617	return;
	618	}
	619
	620	String prep = "";
	621	for (int i = 0; i < quoted.size(); i++) {
	622	prep += ">";
	623	}
	624	prep += " ";
	625
20217360	626	boolean enterQuote = elementProcessor.detectQuote(node);
27008a87 NR	627	boolean leaveQuote = quoted.contains(depth);
	628
	629	if (enterQuote) {
	630	quoted.add(depth);
	631	}
	632
	633	if (leaveQuote) {
	634	quoted.remove(Integer.valueOf(depth));
	635	}
	636
	637	if (enterQuote \|\| leaveQuote) {
	638	if (currentLine.length() > 0) {
	639	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	640	currentLine.setLength(currentLine.length() - 1);
	641	}
	642	for (String l : currentLine.toString().split("\n")) {
	643	lines.add(prep + l);
	644	}
	645	}
	646	currentLine.setLength(0);
	647	}
	648
	649	if (node instanceof Element) {
	650	Element element = (Element) node;
	651	boolean block = element.isBlock()
	652	\|\| element.tagName().equalsIgnoreCase("br");
	653	if (block && currentLine.length() > 0) {
	654	currentLine.append("\n");
	655	}
3e62b034 NR	656
	657	if (!element.absUrl("href").trim().isEmpty()) {
	658	footnotes.add(element.absUrl("href"));
	659	currentLine.append("[" + footnotes.size() + "]");
	660	}
27008a87 NR	661	} else if (node instanceof TextNode) {
	662	TextNode textNode = (TextNode) node;
	663	String line = StringUtil.normaliseWhitespace(textNode
	664	.getWholeText());
	665
20217360	666	currentLine.append(elementProcessor.processText(line));
27008a87 NR	667	currentLine.append(" ");
	668	}
	669	}
	670
	671	@Override
	672	public void tail(Node node, int depth) {
	673	}
	674	}).traverse(element);
	675	}
	676
	677	if (currentLine.length() > 0) {
	678	String prep = "";
	679	for (int i = 0; i < quoted.size(); i++) {
	680	prep += ">";
	681	}
	682	prep += " ";
	683	if (currentLine.length() > 0) {
	684	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	685	currentLine.setLength(currentLine.length() - 1);
	686	}
	687	for (String l : currentLine.toString().split("\n")) {
	688	lines.add(prep + l);
	689	}
	690	}
	691	}
	692
3e62b034 NR	693	// Fix spaces and nbsp, remove multiple following blank lines
	694	List<String> linesCopy = new ArrayList<String>(lines.size());
	695	long blanks = 0;
27008a87	696	for (int i = 0; i < lines.size(); i++) {
3e62b034 NR	697	String line = lines.get(i).replace(" ", " ") // nbsp -> space
	698	.replace(" ", " ").trim();
	699	if (line.isEmpty()) {
	700	blanks++;
	701	} else {
	702	blanks = 0;
	703	}
	704
	705	if (blanks < 2) {
	706	linesCopy.add(line);
	707	}
	708	}
	709
	710	// Footnotes insertion
	711	if (footnotes.size() > 0) {
	712	linesCopy.add("");
	713	linesCopy.add("");
	714	linesCopy.add("");
	715	linesCopy.add("");
	716	for (int i = 0; i < footnotes.size(); i++) {
	717	linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
	718	}
27008a87 NR	719	}
27008a87 NR	720
3e62b034	721	return linesCopy;
b34d1f35 NR	722	}
	723
	724	/**
	725	* Reformat the date if possible.
	726	*
	727	* @param date
	728	* the input date
	729	*
	730	* @return the reformated date, or the same value if it was not parsable
	731	*/
3e62b034	732	static private String date(String date) {
b34d1f35 NR	733	SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
	734
	735	long epoch = 0;
	736	try {
c9cffa91	737	epoch = Long.parseLong(date.trim());
b34d1f35 NR	738	} catch (Exception e) {
b34d1f35 NR	739	epoch = 0;
880740c4 NR	740	}
880740c4 NR	741
b34d1f35 NR	742	if (epoch > 0) {
	743	return out.format(new Date(1000 * epoch));
	744	}
	745
	746	try {
	747	Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
	748	.parse(date.trim());
	749	return out.format(dat);
	750	} catch (ParseException e) {
	751	return date;
	752	}
27008a87	753	}
73785268	754	}