[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.Downloader;

/**
 * Base class for website support.
 * <p>
 * Each supported website extends this class and implements {@link #list()},
 * {@link #fetch(Story)} and {@link #getDescription()}; use
 * {@link #getSupport(Type)} to get the instance matching a {@link Type}.
 * <p>
 * It also offers a few HTML helpers to the implementations (see
 * {@link #toLines(Element, ElementProcessor)}).
 * 
 * @author niki
 */
public abstract class BasicSupport {
	/** The downloader to use for all websites. */
	protected static Downloader downloader = new Downloader("gofetcher");

	/**
	 * The support type (each website we support has a single type).
	 * 
	 * @author niki
	 */
	public enum Type {
		/** EN: Any, but mostly IT/Sci */
		SLASHDOT,
		/** EN: Clone of Slashdot, mostly abandoned */
		PIPEDOT,
		/** EN: Linux */
		LWN,
		/** FR: Any */
		LEMONDE,
		/** EN: IT */
		REGISTER,
		/** FR: Linux */
		TOO_LINUX,
		/** FR: IT */
		ERE_NUMERIQUE,
	}

	/**
	 * Used to process an element into lines.
	 * 
	 * @author niki
	 */
	public interface ElementProcessor {
		/**
		 * Detect if this node is a quote and should be treated as such.
		 * 
		 * @param node
		 *            the node to check
		 * @return TRUE if it is
		 */
		public boolean detectQuote(Node node);

		/**
		 * Process text content (will be called on each text element, allowing
		 * you to modify it if needed).
		 * 
		 * @param text
		 *            the text to process
		 * 
		 * @return the resulting text
		 */
		public String processText(String text);

		/**
		 * Ignore this node.
		 * <p>
		 * The children of an ignored node are ignored, too.
		 * 
		 * @param node
		 *            the node to ignore
		 * @return TRUE if it has to be ignored
		 */
		public boolean ignoreNode(Node node);

		/**
		 * Manually process this node (and return the manual processing value)
		 * if so desired.
		 * <p>
		 * If the node is manually processed, it and its children will not be
		 * automatically processed.
		 * 
		 * @param node
		 *            the node to optionally process
		 * 
		 * @return NULL if not processed (will thus be automatically processed
		 *         as usual), a {@link String} (may be empty) if we process it
		 *         manually -- the given {@link String} will be used instead of
		 *         the usual automatic processing if not NULL
		 */
		public String manualProcessing(Node node);
	}

	/**
	 * A default {@link ElementProcessor} (will not detect or process anything
	 * manually): no quote is detected, no node is ignored and the text is kept
	 * as is.
	 * 
	 * @author niki
	 */
	public class BasicElementProcessor implements ElementProcessor {
		@Override
		public boolean detectQuote(Node node) {
			return false;
		}

		@Override
		public String processText(String text) {
			return text;
		}

		@Override
		public boolean ignoreNode(Node node) {
			return false;
		}

		@Override
		public String manualProcessing(Node node) {
			return null;
		}
	}

	/** The prefix of all the selectors (shared by all the supports). */
	static private String preselector;

	/** The type of this support, set by {@link #getSupport(Type)}. */
	private Type type;

	/**
	 * List all the recent items, but only assure the ID and internal URL to
	 * fetch it later on (until it has been fetched, the rest of the
	 * {@link Story} is not confirmed).
	 * 
	 * @return the list of new stories
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract public List<Story> list() throws IOException;

	/**
	 * Fetch the full article content as well as all the comments associated to
	 * this {@link Story}, if any (can be empty, but not NULL).
	 * 
	 * @param story
	 *            the story to fetch the comments of
	 * 
	 * @throws IOException
	 *             in case of I/O error
	 */
	abstract public void fetch(Story story) throws IOException;

	/**
	 * The website textual description, to add in the dispatcher page.
	 * <p>
	 * Should be short.
	 * 
	 * @return the description
	 */
	abstract public String getDescription();

	/**
	 * The gopher "selector" to use for output.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * 
	 * @return the selector
	 */
	public String getSelector() {
		return getSelector(type);
	}

	/**
	 * The support type.
	 * 
	 * @return the type
	 */
	public Type getType() {
		return type;
	}

	/**
	 * The support type.
	 * 
	 * @param type
	 *            the new type
	 */
	protected void setType(Type type) {
		this.type = type;
	}

	/**
	 * The {@link String} to prepend to the selector (the selector will be
	 * constructed as "this string" then "/type/").
	 * 
	 * @param preselector
	 *            the preselector to set
	 */
	static public void setPreselector(String preselector) {
		BasicSupport.preselector = preselector;
	}

	/**
	 * Return a {@link BasicSupport} that is compatible with the given
	 * {@link Type} if it exists (or NULL if not).
	 * <p>
	 * The returned support already has its type set.
	 * 
	 * @param type
	 *            the type (can be NULL, in which case NULL is returned)
	 * 
	 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	 */
	static public BasicSupport getSupport(Type type) {
		if (type == null) {
			return null;
		}

		BasicSupport support = null;
		switch (type) {
		case SLASHDOT:
			support = new Slashdot();
			break;
		case PIPEDOT:
			support = new Pipedot();
			break;
		case LWN:
			support = new LWN();
			break;
		case LEMONDE:
			support = new LeMonde();
			break;
		case REGISTER:
			support = new TheRegister();
			break;
		case TOO_LINUX:
			support = new TooLinux();
			break;
		case ERE_NUMERIQUE:
			support = new EreNumerique();
			break;
		}

		if (support != null) {
			support.setType(type);
		}

		return support;
	}

	/**
	 * The gopher "selector" to use for output for this type, using the
	 * preselector.
	 * <p>
	 * A kind of "URL path", like "/news/" or "/misc/news/" or...
	 * <p>
	 * If no preselector was set, it is considered empty (the selector is then
	 * "/type/").
	 * 
	 * @param type
	 *            the type to get the selector of
	 * 
	 * @return the selector
	 */
	static public String getSelector(Type type) {
		// An unset preselector must not leak the literal "null" in the path
		String prefix = (preselector == null) ? "" : preselector;
		return prefix + "/" + type + "/";
	}

	/**
	 * Get the first {@link Element} of the given class, or an empty span
	 * {@link Element} if none found.
	 * 
	 * @param element
	 *            the element to look in (if NULL, nothing can be found)
	 * @param className
	 *            the class to look for
	 * 
	 * @return the value or an empty span {@link Element}
	 */
	static protected Element firstOrEmpty(Element element, String className) {
		if (element != null) {
			Elements subElements = element.getElementsByClass(className);
			if (!subElements.isEmpty()) {
				return subElements.get(0);
			}
		}

		return new Element("span");
	}

	/**
	 * Get the first {@link Element} of the given tag, or an empty span
	 * {@link Element} if none found.
	 * 
	 * @param element
	 *            the element to look in (if NULL, nothing can be found)
	 * @param tagName
	 *            the tag to look for
	 * 
	 * @return the value or an empty span {@link Element}
	 */
	static protected Element firstOrEmptyTag(Element element, String tagName) {
		if (element != null) {
			Elements subElements = element.getElementsByTag(tagName);
			if (!subElements.isEmpty()) {
				return subElements.get(0);
			}
		}

		return new Element("span");
	}

	/**
	 * Process the given element into text (each line is a text paragraph and
	 * can be prepended with ">" signs to indicate a quote or sub-quote or
	 * sub-sub-quote...).
	 * <p>
	 * The content of each text node is whitespace-normalised and passed to
	 * {@link ElementProcessor#processText(String)}; block elements and "br"
	 * tags start a new line. Runs of spaces are collapsed and each resulting
	 * line is trimmed.
	 * <p>
	 * A quote starts on a node for which
	 * {@link ElementProcessor#detectQuote(Node)} returns TRUE and ends as soon
	 * as a (non ignored) node that is not deeper than the quote node is
	 * visited.
	 * 
	 * @param element
	 *            the element to process (if NULL, no lines are returned)
	 * @param elementProcessor
	 *            the element processor, must not be NULL
	 * 
	 * @return text lines, each line is a paragraph
	 */
	static protected List<String> toLines(Element element,
			final ElementProcessor elementProcessor) {
		final List<String> lines = new ArrayList<String>();
		final StringBuilder currentLine = new StringBuilder();
		// Depths of the nodes that opened the currently active quotes
		final List<Integer> quoted = new ArrayList<Integer>();
		final List<Node> ignoredNodes = new ArrayList<Node>();

		if (element != null) {
			new NodeTraversor(new NodeVisitor() {
				@Override
				public void head(Node node, int depth) {
					// A node is skipped if it is explicitly ignored, if its
					// parent was skipped, or if it was manually processed
					boolean ignore = elementProcessor.ignoreNode(node)
							|| ignoredNodes.contains(node.parentNode());
					if (!ignore) {
						String manual = elementProcessor
								.manualProcessing(node);
						if (manual != null) {
							currentLine.append(manual);
							ignore = true;
						}
					}

					if (ignore) {
						// Remembered so the children are skipped, too
						ignoredNodes.add(node);
						return;
					}

					// The pending text belongs to the quote level that was
					// active BEFORE this node, so compute the prefix first
					String prefix = quotePrefix(quoted.size());

					boolean enterQuote = elementProcessor.detectQuote(node);
					boolean leaveQuote = leaveQuotes(quoted, depth);

					if (enterQuote) {
						quoted.add(depth);
					}

					if (enterQuote || leaveQuote) {
						flushLine(currentLine, prefix, lines);
					}

					if (node instanceof Element) {
						Element elem = (Element) node;
						boolean block = elem.isBlock()
								|| elem.tagName().equalsIgnoreCase("br");
						if (block && currentLine.length() > 0) {
							currentLine.append("\n");
						}
					} else if (node instanceof TextNode) {
						TextNode textNode = (TextNode) node;
						String text = StringUtil.normaliseWhitespace(textNode
								.getWholeText());

						currentLine.append(elementProcessor.processText(text));
						currentLine.append(" ");
					}
				}

				@Override
				public void tail(Node node, int depth) {
					// Nothing to do: everything is handled in head()
				}
			}).traverse(element);
		}

		// Whatever is still pending uses the quote level reached at the end
		flushLine(currentLine, quotePrefix(quoted.size()), lines);

		for (int i = 0; i < lines.size(); i++) {
			// Collapse whole runs of spaces (a single replace("  ", " ") pass
			// would leave double spaces behind for runs of 3 spaces or more)
			lines.set(i, lines.get(i).replaceAll(" {2,}", " ").trim());
		}

		return lines;
	}

	/**
	 * Build the line prefix for the given quote level: one ">" per level, then
	 * a space.
	 * 
	 * @param level
	 *            the number of currently active quotes
	 * 
	 * @return the prefix
	 */
	static private String quotePrefix(int level) {
		StringBuilder prefix = new StringBuilder(level + 1);
		for (int i = 0; i < level; i++) {
			prefix.append('>');
		}
		prefix.append(' ');

		return prefix.toString();
	}

	/**
	 * Close all the quotes that were opened at the given depth or deeper (a
	 * node visited at this depth cannot be a child of the nodes that opened
	 * them).
	 * 
	 * @param quoted
	 *            the depths of the currently active quotes (will be updated)
	 * @param depth
	 *            the depth of the node being visited
	 * 
	 * @return TRUE if at least one quote was closed
	 */
	static private boolean leaveQuotes(List<Integer> quoted, int depth) {
		boolean left = false;
		// Backwards, so removing by index does not shift what is left to check
		for (int i = quoted.size() - 1; i >= 0; i--) {
			if (quoted.get(i) >= depth) {
				quoted.remove(i);
				left = true;
			}
		}

		return left;
	}

	/**
	 * Move the pending text (if any) to the list of lines, one entry per "\n"
	 * separated line, each of them prefixed by the given prefix, then clear
	 * the pending text.
	 * 
	 * @param currentLine
	 *            the pending text (will be emptied)
	 * @param prefix
	 *            the prefix to prepend to each line
	 * @param lines
	 *            the list to add the lines to
	 */
	static private void flushLine(StringBuilder currentLine, String prefix,
			List<String> lines) {
		int length = currentLine.length();
		if (length > 0) {
			// A trailing line break would not start any new paragraph
			if (currentLine.charAt(length - 1) == '\n') {
				currentLine.setLength(length - 1);
			}

			for (String line : currentLine.toString().split("\n")) {
				lines.add(prefix + line);
			}

			currentLine.setLength(0);
		}
	}

	/**
	 * Reformat the date if possible.
	 * <p>
	 * The input can be a (strictly positive) number of seconds since the
	 * epoch, or a date in the "yyyy-MM-dd'T'HH:mm:ssXXX" format; the output
	 * uses the "yyyy/MM/dd" format.
	 * 
	 * @param date
	 *            the input date (can be NULL)
	 * 
	 * @return the reformatted date, or the same value if it was not parsable
	 *         (including NULL)
	 */
	static protected String date(String date) {
		if (date == null) {
			return null;
		}

		String trimmed = date.trim();
		SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");

		long epoch = 0;
		try {
			epoch = Long.parseLong(trimmed);
		} catch (NumberFormatException e) {
			// Not a number of seconds, the textual format is tried below
			epoch = 0;
		}

		// The upper bound prevents a silent overflow when converting to ms
		if (epoch > 0 && epoch <= Long.MAX_VALUE / 1000) {
			return out.format(new Date(1000 * epoch));
		}

		try {
			Date parsed = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
					.parse(trimmed);
			return out.format(parsed);
		} catch (ParseException e) {
			return date;
		}
	}
}
Commit	Line	Data
	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
	4	import java.text.ParseException;
	5	import java.text.SimpleDateFormat;
	6	import java.util.ArrayList;
	7	import java.util.Date;
	8	import java.util.List;
	9
	10	import org.jsoup.helper.StringUtil;
	11	import org.jsoup.nodes.Element;
	12	import org.jsoup.nodes.Node;
	13	import org.jsoup.nodes.TextNode;
	14	import org.jsoup.select.Elements;
	15	import org.jsoup.select.NodeTraversor;
	16	import org.jsoup.select.NodeVisitor;
	17
	18	import be.nikiroo.gofetch.data.Story;
	19	import be.nikiroo.utils.Downloader;
	20
	21	/**
	22	* Base class for website support.
	23	*
	24	* @author niki
	25	*/
	26	public abstract class BasicSupport {
	27	/** The downloader to use for all websites. */
	28	protected static Downloader downloader = new Downloader("gofetcher");
	29
	30	/**
	31	* The support type (each website we support has a single type).
	32	*
	33	* @author niki
	34	*/
	35	public enum Type {
	36	/** EN: Any, but mostly IT/Sci */
	37	SLASHDOT,
	38	/** EN: Clone of Slashdot, mostly abandoned */
	39	PIPEDOT,
	40	/** EN: Linux */
	41	LWN,
	42	/** FR: Any */
	43	LEMONDE,
	44	/** EN: IT */
	45	REGISTER,
	46	/** FR: Linux */
	47	TOO_LINUX,
	48	/** FR: IT */
	49	ERE_NUMERIQUE,
	50	}
	51
	52	/**
	53	* Used to process an element into lines.
	54	*
	55	* @author niki
	56	*/
	57	public interface ElementProcessor {
	58	/**
	59	* Detect if this node is a quote and should be trated as such.
	60	*
	61	* @param node
	62	* the node to check
	63	* @return TRUE if it is
	64	*/
	65	public boolean detectQuote(Node node);
	66
	67	/**
	68	* Process text content (will be called on each text element, allowing
	69	* you to modify it if needed).
	70	*
	71	* @param text
	72	* the text to process
	73	*
	74	* @return the resulting text
	75	*/
	76	public String processText(String text);
	77
	78	/**
	79	* Ignore this node.
	80	*
	81	* @param node
	82	* the node to ignore
	83	* @return TRUE if it has to be ignored
	84	*/
	85	public boolean ignoreNode(Node node);
	86
	87	/**
	88	* Manually process this node (and return the manual processing value)
	89	* if so desired.
	90	* <p>
	91	* If the node is manually processed, it and its children will not be
	92	* automatically processed.
	93	*
	94	* @param node
	95	* the node to optionally process
	96	*
	97	* @return NULL if not processed (will thus be automatically processed
	98	* as usual), a {@link String} (may be empty) if we process it
	99	* manually -- the given {@link String} will be used instead of
	100	* the usual automatic processing if not NULL
	101	*/
	102	public String manualProcessing(Node node);
	103	}
	104
	105	/**
	106	* A default {@link ElementProcessor} (will not detect or process anything
	107	* manually).
	108	*
	109	* @author niki
	110	*/
	111	public class BasicElementProcessor implements ElementProcessor {
	112	@Override
	113	public boolean detectQuote(Node node) {
	114	return false;
	115	}
	116
	117	@Override
	118	public String processText(String text) {
	119	return text;
	120	}
	121
	122	@Override
	123	public boolean ignoreNode(Node node) {
	124	return false;
	125	}
	126
	127	@Override
	128	public String manualProcessing(Node node) {
	129	return null;
	130	}
	131	}
	132
	133	static private String preselector;
	134
	135	private Type type;
	136
	137	/**
	138	* List all the recent items, but only assure the ID and internal URL to
	139	* fetch it later on (until it has been fetched, the rest of the
	140	* {@link Story} is not confirmed).
	141	*
	142	* @return the list of new stories
	143	*
	144	* @throws IOException
	145	* in case of I/O
	146	*/
	147	abstract public List<Story> list() throws IOException;
	148
	149	/**
	150	* Fetch the full article content as well as all the comments associated to
	151	* this {@link Story}, if any (can be empty, but not NULL).
	152	*
	153	* @param story
	154	* the story to fetch the comments of
	155	*
	156	* @throws IOException
	157	* in case of I/O error
	158	*/
	159	abstract public void fetch(Story story) throws IOException;
	160
	161	/**
	162	* The website textual description, to add in the dispatcher page.
	163	* <p>
	164	* Should be short.
	165	*
	166	* @return the description
	167	*/
	168	abstract public String getDescription();
	169
	170	/**
	171	* The gopher "selector" to use for output.
	172	* <p>
	173	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	174	*
	175	* @return the selector
	176	*/
	177	public String getSelector() {
	178	return getSelector(type);
	179	}
	180
	181	/**
	182	* The support type.
	183	*
	184	* @return the type
	185	*/
	186	public Type getType() {
	187	return type;
	188	}
	189
	190	/**
	191	* The support type.
	192	*
	193	* @param type
	194	* the new type
	195	*/
	196	protected void setType(Type type) {
	197	this.type = type;
	198	}
	199
	200	/**
	201	* The {@link String} to append to the selector (the selector will be
	202	* constructed as "this string" then "/type/".
	203	*
	204	* @param preselector
	205	* the preselector to set
	206	*/
	207	static public void setPreselector(String preselector) {
	208	BasicSupport.preselector = preselector;
	209	}
	210
	211	/**
	212	* Return a {@link BasicSupport} that is compatible with the given
	213	* {@link Type} if it exists (or NULL if not).
	214	*
	215	* @param type
	216	* the type
	217	*
	218	* @return a compatible {@link BasicSupport} if it exists (or NULL if not)
	219	*/
	220	static public BasicSupport getSupport(Type type) {
	221	BasicSupport support = null;
	222
	223	if (type != null) {
	224	switch (type) {
	225	case SLASHDOT:
	226	support = new Slashdot();
	227	break;
	228	case PIPEDOT:
	229	support = new Pipedot();
	230	break;
	231	case LWN:
	232	support = new LWN();
	233	break;
	234	case LEMONDE:
	235	support = new LeMonde();
	236	break;
	237	case REGISTER:
	238	support = new TheRegister();
	239	break;
	240	case TOO_LINUX:
	241	support = new TooLinux();
	242	break;
	243	case ERE_NUMERIQUE:
	244	support = new EreNumerique();
	245	break;
	246	}
	247
	248	if (support != null) {
	249	support.setType(type);
	250	}
	251	}
	252
	253	return support;
	254	}
	255
	256	/**
	257	* The gopher "selector" to use for output for this type, using the
	258	* preselector.
	259	* <p>
	260	* A kind of "URL path", like "/news/" or "/misc/news/" or...
	261	*
	262	* @param type
	263	* the type to get the selector of
	264	*
	265	* @return the selector
	266	*/
	267	static public String getSelector(Type type) {
	268	return preselector + "/" + type + "/";
	269	}
	270
	271	/**
	272	* Get the first {@link Element} of the given class, or an empty span
	273	* {@link Element} if none found.
	274	*
	275	* @param element
	276	* the element to look in
	277	* @param className
	278	* the class to look for
	279	*
	280	* @return the value or an empty span {@link Element}
	281	*/
	282	static protected Element firstOrEmpty(Element element, String className) {
	283	Elements subElements = element.getElementsByClass(className);
	284	if (subElements.size() > 0) {
	285	return subElements.get(0);
	286	}
	287
	288	return new Element("span");
	289	}
	290
	291	/**
	292	* Get the first {@link Element} of the given tag, or an empty span
	293	* {@link Element} if none found.
	294	*
	295	* @param element
	296	* the element to look in
	297	* @param tagName
	298	* the tag to look for
	299	*
	300	* @return the value or an empty span {@link Element}
	301	*/
	302	static protected Element firstOrEmptyTag(Element element, String tagName) {
	303	Elements subElements = element.getElementsByTag(tagName);
	304	if (subElements.size() > 0) {
	305	return subElements.get(0);
	306	}
	307
	308	return new Element("span");
	309	}
	310
	311	/**
	312	* Process the given element into text (each line is a text paragraph and
	313	* can be prepended with ">" signs to indicate a quote or sub-quote or
	314	* sub-sub-quote...).
	315	*
	316	* @param element
	317	* the element to process
	318	* @param elementProcessor
	319	* the element processor, must not be NULL
	320	*
	321	* @return text lines, each line is a paragraph
	322	*/
	323	static protected List<String> toLines(Element element,
	324	final ElementProcessor elementProcessor) {
	325	final List<String> lines = new ArrayList<String>();
	326	final StringBuilder currentLine = new StringBuilder();
	327	final List<Integer> quoted = new ArrayList<Integer>();
	328	final List<Node> ignoredNodes = new ArrayList<Node>();
	329
	330	if (element != null) {
	331	new NodeTraversor(new NodeVisitor() {
	332	@Override
	333	public void head(Node node, int depth) {
	334	String manual = null;
	335	boolean ignore = elementProcessor.ignoreNode(node)
	336	\|\| ignoredNodes.contains(node.parentNode());
	337	if (!ignore) {
	338	manual = elementProcessor.manualProcessing(node);
	339	if (manual != null) {
	340	currentLine.append(manual);
	341	ignore = true;
	342	}
	343	}
	344
	345	if (ignore) {
	346	ignoredNodes.add(node);
	347	return;
	348	}
	349
	350	String prep = "";
	351	for (int i = 0; i < quoted.size(); i++) {
	352	prep += ">";
	353	}
	354	prep += " ";
	355
	356	boolean enterQuote = elementProcessor.detectQuote(node);
	357	boolean leaveQuote = quoted.contains(depth);
	358
	359	if (enterQuote) {
	360	quoted.add(depth);
	361	}
	362
	363	if (leaveQuote) {
	364	quoted.remove(Integer.valueOf(depth));
	365	}
	366
	367	if (enterQuote \|\| leaveQuote) {
	368	if (currentLine.length() > 0) {
	369	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	370	currentLine.setLength(currentLine.length() - 1);
	371	}
	372	for (String l : currentLine.toString().split("\n")) {
	373	lines.add(prep + l);
	374	}
	375	}
	376	currentLine.setLength(0);
	377	}
	378
	379	if (node instanceof Element) {
	380	Element element = (Element) node;
	381	boolean block = element.isBlock()
	382	\|\| element.tagName().equalsIgnoreCase("br");
	383	if (block && currentLine.length() > 0) {
	384	currentLine.append("\n");
	385	}
	386	} else if (node instanceof TextNode) {
	387	TextNode textNode = (TextNode) node;
	388	String line = StringUtil.normaliseWhitespace(textNode
	389	.getWholeText());
	390
	391	currentLine.append(elementProcessor.processText(line));
	392	currentLine.append(" ");
	393	}
	394	}
	395
	396	@Override
	397	public void tail(Node node, int depth) {
	398	}
	399	}).traverse(element);
	400	}
	401
	402	if (currentLine.length() > 0) {
	403	String prep = "";
	404	for (int i = 0; i < quoted.size(); i++) {
	405	prep += ">";
	406	}
	407	prep += " ";
	408	if (currentLine.length() > 0) {
	409	if (currentLine.charAt(currentLine.length() - 1) == '\n') {
	410	currentLine.setLength(currentLine.length() - 1);
	411	}
	412	for (String l : currentLine.toString().split("\n")) {
	413	lines.add(prep + l);
	414	}
	415	}
	416	}
	417
	418	for (int i = 0; i < lines.size(); i++) {
	419	lines.set(i, lines.get(i).replace(" ", " ").trim());
	420	}
	421
	422	return lines;
	423	}
	424
	425	/**
	426	* Reformat the date if possible.
	427	*
	428	* @param date
	429	* the input date
	430	*
	431	* @return the reformated date, or the same value if it was not parsable
	432	*/
	433	static protected String date(String date) {
	434	SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
	435
	436	long epoch = 0;
	437	try {
	438	epoch = Long.parseLong(date.trim());
	439	} catch (Exception e) {
	440	epoch = 0;
	441	}
	442
	443	if (epoch > 0) {
	444	return out.format(new Date(1000 * epoch));
	445	}
	446
	447	try {
	448	Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
	449	.parse(date.trim());
	450	return out.format(dat);
	451	} catch (ParseException e) {
	452	return date;
	453	}
	454	}
	455	}