package be.nikiroo.gofetch.support;
import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.net.URLConnection;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
+import java.util.Date;
import java.util.List;
-import java.util.zip.GZIPInputStream;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.select.NodeVisitor;
import be.nikiroo.gofetch.data.Story;
+import be.nikiroo.utils.Downloader;
+/**
+ * Base class for website support.
+ *
+ * @author niki
+ */
public abstract class BasicSupport {
+ /** The downloader to use for all websites. */
+ protected static Downloader downloader = new Downloader("gofetcher");
+
+ /**
+ * The support type (each website we support has a single type).
+ *
+ * @author niki
+ */
public enum Type {
- SLASHDOT, PIPEDOT, LWN,
+ /** EN: Any, but mostly IT/Sci */
+ SLASHDOT,
+ /** EN: Clone of Slashdot, mostly abandoned */
+ PIPEDOT,
+ /** EN: Linux */
+ LWN,
+ /** FR: Any */
+ LEMONDE,
+ /** EN: IT */
+ REGISTER,
+ /** FR: Linux */
+ TOO_LINUX,
+ /** FR: IT */
+ ERE_NUMERIQUE,
}
- public interface QuoteProcessor {
+ /**
+ * Used to process an element into lines.
+ *
+ * @author niki
+ */
+ public interface ElementProcessor {
+ /**
+ * Detect if this node is a quote and should be trated as such.
+ *
+ * @param node
+ * the node to check
+ * @return TRUE if it is
+ */
public boolean detectQuote(Node node);
+ /**
+ * Process text content (will be called on each text element, allowing
+ * you to modify it if needed).
+ *
+ * @param text
+ * the text to process
+ *
+ * @return the resulting text
+ */
public String processText(String text);
+ /**
+ * Ignore this node.
+ *
+ * @param node
+ * the node to ignore
+ * @return TRUE if it has to be ignored
+ */
public boolean ignoreNode(Node node);
+
+ /**
+ * Manually process this node (and return the manual processing value)
+ * if so desired.
+ * <p>
+ * If the node is manually processed, it and its children will not be
+ * automatically processed.
+ *
+ * @param node
+ * the node to optionally process
+ *
+ * @return NULL if not processed (will thus be automatically processed
+ * as usual), a {@link String} (may be empty) if we process it
+ * manually -- the given {@link String} will be used instead of
+ * the usual automatic processing if not NULL
+ */
+ public String manualProcessing(Node node);
+ }
+
+ /**
+ * A default {@link ElementProcessor} (will not detect or process anything
+ * manually).
+ *
+ * @author niki
+ */
+ public class BasicElementProcessor implements ElementProcessor {
+ @Override
+ public boolean detectQuote(Node node) {
+ return false;
+ }
+
+ @Override
+ public String processText(String text) {
+ return text;
+ }
+
+ @Override
+ public boolean ignoreNode(Node node) {
+ return false;
+ }
+
+ @Override
+ public String manualProcessing(Node node) {
+ return null;
+ }
}
static private String preselector;
private Type type;
+ /**
+ * List all the recent items, but only assure the ID and internal URL to
+ * fetch it later on (until it has been fetched, the rest of the
+ * {@link Story} is not confirmed).
+ *
+ * @return the list of new stories
+ *
+ * @throws IOException
+ * in case of I/O
+ */
abstract public List<Story> list() throws IOException;
/**
*/
abstract public void fetch(Story story) throws IOException;
+ /**
+ * The website textual description, to add in the dispatcher page.
+ * <p>
+ * Should be short.
+ *
+ * @return the description
+ */
abstract public String getDescription();
+ /**
+ * The gopher "selector" to use for output.
+ * <p>
+ * A kind of "URL path", like "/news/" or "/misc/news/" or...
+ *
+ * @return the selector
+ */
public String getSelector() {
return getSelector(type);
}
+ /**
+ * The support type.
+ *
+ * @return the type
+ */
public Type getType() {
return type;
}
+ /**
+ * The support type.
+ *
+ * @param type
+ * the new type
+ */
protected void setType(Type type) {
this.type = type;
}
/**
+ * The {@link String} to append to the selector (the selector will be
+ * constructed as "this string" then "/type/".
+ *
* @param preselector
* the preselector to set
*/
BasicSupport.preselector = preselector;
}
+ /**
+ * Return a {@link BasicSupport} that is compatible with the given
+ * {@link Type} if it exists (or NULL if not).
+ *
+ * @param type
+ * the type
+ *
+ * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
+ */
static public BasicSupport getSupport(Type type) {
BasicSupport support = null;
case LWN:
support = new LWN();
break;
+ case LEMONDE:
+ support = new LeMonde();
+ break;
+ case REGISTER:
+ support = new TheRegister();
+ break;
+ case TOO_LINUX:
+ support = new TooLinux();
+ break;
+ case ERE_NUMERIQUE:
+ support = new EreNumerique();
+ break;
}
if (support != null) {
return support;
}
+ /**
+ * The gopher "selector" to use for output for this type, using the
+ * preselector.
+ * <p>
+ * A kind of "URL path", like "/news/" or "/misc/news/" or...
+ *
+ * @param type
+ * the type to get the selector of
+ *
+ * @return the selector
+ */
static public String getSelector(Type type) {
return preselector + "/" + type + "/";
}
- // TODO: check Downloader.java?
- static protected InputStream open(URL url) throws IOException {
- URLConnection conn = url.openConnection();
- conn.connect();
- InputStream in = conn.getInputStream();
- if ("gzip".equals(conn.getContentEncoding())) {
- in = new GZIPInputStream(in);
- }
-
- return in;
- }
-
/**
* Get the first {@link Element} of the given class, or an empty span
* {@link Element} if none found.
return new Element("span");
}
+ /**
+ * Process the given element into text (each line is a text paragraph and
+ * can be prepended with ">" signs to indicate a quote or sub-quote or
+ * sub-sub-quote...).
+ *
+ * @param element
+ * the element to process
+ * @param elementProcessor
+ * the element processor, must not be NULL
+ *
+ * @return text lines, each line is a paragraph
+ */
static protected List<String> toLines(Element element,
- final QuoteProcessor quoteProcessor) {
+ final ElementProcessor elementProcessor) {
final List<String> lines = new ArrayList<String>();
final StringBuilder currentLine = new StringBuilder();
final List<Integer> quoted = new ArrayList<Integer>();
new NodeTraversor(new NodeVisitor() {
@Override
public void head(Node node, int depth) {
- if (quoteProcessor.ignoreNode(node)
- || ignoredNodes.contains(node.parentNode())) {
+ String manual = null;
+ boolean ignore = elementProcessor.ignoreNode(node)
+ || ignoredNodes.contains(node.parentNode());
+ if (!ignore) {
+ manual = elementProcessor.manualProcessing(node);
+ if (manual != null) {
+ currentLine.append(manual);
+ ignore = true;
+ }
+ }
+
+ if (ignore) {
ignoredNodes.add(node);
return;
}
}
prep += " ";
- boolean enterQuote = quoteProcessor.detectQuote(node);
+ boolean enterQuote = elementProcessor.detectQuote(node);
boolean leaveQuote = quoted.contains(depth);
if (enterQuote) {
String line = StringUtil.normaliseWhitespace(textNode
.getWholeText());
- currentLine.append(quoteProcessor.processText(line));
+ currentLine.append(elementProcessor.processText(line));
currentLine.append(" ");
}
}
return lines;
}
+
+ /**
+ * Reformat the date if possible.
+ *
+ * @param date
+ * the input date
+ *
+ * @return the reformated date, or the same value if it was not parsable
+ */
+ static protected String date(String date) {
+ SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
+
+ long epoch = 0;
+ try {
+ epoch = Long.parseLong(date);
+ } catch (Exception e) {
+ epoch = 0;
+ }
+
+ if (epoch > 0) {
+ return out.format(new Date(1000 * epoch));
+ }
+
+ try {
+ Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
+ .parse(date.trim());
+ return out.format(dat);
+ } catch (ParseException e) {
+ return date;
+ }
+ }
}