src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.net.URLConnection;
   7 import java.util.ArrayList;
   8 import java.util.List;
   9 import java.util.zip.GZIPInputStream;
  10
  11 import org.jsoup.helper.StringUtil;
  12 import org.jsoup.nodes.Element;
  13 import org.jsoup.nodes.Node;
  14 import org.jsoup.nodes.TextNode;
  15 import org.jsoup.select.Elements;
  16 import org.jsoup.select.NodeTraversor;
  17 import org.jsoup.select.NodeVisitor;
  18
  19 import be.nikiroo.gofetch.data.Story;
  20
  21 public abstract class BasicSupport {
  22         public enum Type {
  23                 SLASHDOT, PIPEDOT, LWN, LEMONDE,
  24         }
  25
  26         /**
  27          * Used to process an element into lines.
  28          *
  29          * @author niki
  30          */
  31         public interface ElementProcessor {
  32                 /**
  33                  * Detect if this node is a quote and should be trated as such.
  34                  *
  35                  * @param node
  36                  *            the node to check
  37                  * @return TRUE if it is
  38                  */
  39                 public boolean detectQuote(Node node);
  40
  41                 /**
  42                  * Process text content (will be called on each text element, allowing
  43                  * you to modify it if needed).
  44                  *
  45                  * @param text
  46                  *            the text to process
  47                  * @return
  48                  */
  49                 public String processText(String text);
  50
  51                 /**
  52                  * Ignore this node.
  53                  *
  54                  * @param node
  55                  *            the node to ignore
  56                  * @return TRUE if it has to be ignored
  57                  */
  58                 public boolean ignoreNode(Node node);
  59
  60                 /**
  61                  * Manually process this node (and return the manual processing value)
  62                  * if so desired.
  63                  * <p>
  64                  * If the node is manually processed, it and its children will not be
  65                  * automatically processed.
  66                  *
  67                  * @param node
  68                  *            the node to optionally process
  69                  *
  70                  * @return NULL if not processed (will thus be automatically processed
  71                  *         as usual), a {@link String} (may be empty) if we process it
  72                  *         manually -- the given {@link String} will be used instead of
  73                  *         the usual automatic processing if not NULL
  74                  */
  75                 public String manualProcessing(Node node);
  76         }
  77
  78         /**
  79          * A default {@link ElementProcessor} (will not detect or process anything
  80          * manually).
  81          *
  82          * @author niki
  83          */
  84         public class BasicElementProcessor implements ElementProcessor {
  85                 @Override
  86                 public boolean detectQuote(Node node) {
  87                         return false;
  88                 }
  89
  90                 @Override
  91                 public String processText(String text) {
  92                         return text;
  93                 }
  94
  95                 @Override
  96                 public boolean ignoreNode(Node node) {
  97                         return false;
  98                 }
  99
 100                 @Override
 101                 public String manualProcessing(Node node) {
 102                         return null;
 103                 }
 104         }
 105
 106         static private String preselector;
 107
 108         private Type type;
 109
 110         /**
 111          * List all the recent items, but only assure the ID and internal URL to
 112          * fetch it later on (until it has been fetched, the rest of the
 113          * {@link Story} is not confirmed).
 114          *
 115          * @return the list of new stories
 116          *
 117          * @throws IOException
 118          *             in case of I/O
 119          */
 120         abstract public List<Story> list() throws IOException;
 121
 122         /**
 123          * Fetch the full article content as well as all the comments associated to
 124          * this {@link Story}, if any (can be empty, but not NULL).
 125          *
 126          * @param story
 127          *            the story to fetch the comments of
 128          *
 129          * @throws IOException
 130          *             in case of I/O error
 131          */
 132         abstract public void fetch(Story story) throws IOException;
 133
 134         abstract public String getDescription();
 135
 136         public String getSelector() {
 137                 return getSelector(type);
 138         }
 139
 140         public Type getType() {
 141                 return type;
 142         }
 143
 144         protected void setType(Type type) {
 145                 this.type = type;
 146         }
 147
 148         /**
 149          * @param preselector
 150          *            the preselector to set
 151          */
 152         static public void setPreselector(String preselector) {
 153                 BasicSupport.preselector = preselector;
 154         }
 155
 156         /**
 157          * Return a {@link BasicSupport} that is compatible with the given
 158          * {@link Type} if it exists (or NULL if not).
 159          *
 160          * @param type
 161          *            the type
 162          *
 163          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 164          */
 165         static public BasicSupport getSupport(Type type) {
 166                 BasicSupport support = null;
 167
 168                 if (type != null) {
 169                         switch (type) {
 170                         case SLASHDOT:
 171                                 support = new Slashdot();
 172                                 break;
 173                         case PIPEDOT:
 174                                 support = new Pipedot();
 175                                 break;
 176                         case LWN:
 177                                 support = new LWN();
 178                                 break;
 179                         case LEMONDE:
 180                                 support = new LeMonde();
 181                                 break;
 182                         }
 183
 184                         if (support != null) {
 185                                 support.setType(type);
 186                         }
 187                 }
 188
 189                 return support;
 190         }
 191
 192         static public String getSelector(Type type) {
 193                 return preselector + "/" + type + "/";
 194         }
 195
 196         // TODO: check Downloader.java?
 197         static protected InputStream open(URL url) throws IOException {
 198                 URLConnection conn = url.openConnection();
 199                 conn.connect();
 200                 InputStream in = conn.getInputStream();
 201                 if ("gzip".equals(conn.getContentEncoding())) {
 202                         in = new GZIPInputStream(in);
 203                 }
 204
 205                 return in;
 206         }
 207
 208         /**
 209          * Get the first {@link Element} of the given class, or an empty span
 210          * {@link Element} if none found.
 211          *
 212          * @param element
 213          *            the element to look in
 214          * @param className
 215          *            the class to look for
 216          *
 217          * @return the value or an empty span {@link Element}
 218          */
 219         static protected Element firstOrEmpty(Element element, String className) {
 220                 Elements subElements = element.getElementsByClass(className);
 221                 if (subElements.size() > 0) {
 222                         return subElements.get(0);
 223                 }
 224
 225                 return new Element("span");
 226         }
 227
 228         /**
 229          * Get the first {@link Element} of the given tag, or an empty span
 230          * {@link Element} if none found.
 231          *
 232          * @param element
 233          *            the element to look in
 234          * @param tagName
 235          *            the tag to look for
 236          *
 237          * @return the value or an empty span {@link Element}
 238          */
 239         static protected Element firstOrEmptyTag(Element element, String tagName) {
 240                 Elements subElements = element.getElementsByTag(tagName);
 241                 if (subElements.size() > 0) {
 242                         return subElements.get(0);
 243                 }
 244
 245                 return new Element("span");
 246         }
 247
 248         /**
 249          * Process the given element into text (each line is a text paragraph and
 250          * can be prepended with ">" signs to indicate a quote or sub-quote or
 251          * sub-sub-quote...).
 252          *
 253          * @param element
 254          *            the element to process
 255          * @param elementProcessor
 256          *            the element processor, must not be NULL
 257          *
 258          * @return text lines, each line is a paragraph
 259          */
 260         static protected List<String> toLines(Element element,
 261                         final ElementProcessor elementProcessor) {
 262                 final List<String> lines = new ArrayList<String>();
 263                 final StringBuilder currentLine = new StringBuilder();
 264                 final List<Integer> quoted = new ArrayList<Integer>();
 265                 final List<Node> ignoredNodes = new ArrayList<Node>();
 266
 267                 if (element != null) {
 268                         new NodeTraversor(new NodeVisitor() {
 269                                 @Override
 270                                 public void head(Node node, int depth) {
 271                                         String manual = null;
 272                                         boolean ignore = elementProcessor.ignoreNode(node)
 273                                                         || ignoredNodes.contains(node.parentNode());
 274                                         if (!ignore) {
 275                                                 manual = elementProcessor.manualProcessing(node);
 276                                                 if (manual != null) {
 277                                                         currentLine.append(manual);
 278                                                         ignore = true;
 279                                                 }
 280                                         }
 281
 282                                         if (ignore) {
 283                                                 ignoredNodes.add(node);
 284                                                 return;
 285                                         }
 286
 287                                         String prep = "";
 288                                         for (int i = 0; i < quoted.size(); i++) {
 289                                                 prep += ">";
 290                                         }
 291                                         prep += " ";
 292
 293                                         boolean enterQuote = elementProcessor.detectQuote(node);
 294                                         boolean leaveQuote = quoted.contains(depth);
 295
 296                                         if (enterQuote) {
 297                                                 quoted.add(depth);
 298                                         }
 299
 300                                         if (leaveQuote) {
 301                                                 quoted.remove(Integer.valueOf(depth));
 302                                         }
 303
 304                                         if (enterQuote || leaveQuote) {
 305                                                 if (currentLine.length() > 0) {
 306                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 307                                                                 currentLine.setLength(currentLine.length() - 1);
 308                                                         }
 309                                                         for (String l : currentLine.toString().split("\n")) {
 310                                                                 lines.add(prep + l);
 311                                                         }
 312                                                 }
 313                                                 currentLine.setLength(0);
 314                                         }
 315
 316                                         if (node instanceof Element) {
 317                                                 Element element = (Element) node;
 318                                                 boolean block = element.isBlock()
 319                                                                 || element.tagName().equalsIgnoreCase("br");
 320                                                 if (block && currentLine.length() > 0) {
 321                                                         currentLine.append("\n");
 322                                                 }
 323                                         } else if (node instanceof TextNode) {
 324                                                 TextNode textNode = (TextNode) node;
 325                                                 String line = StringUtil.normaliseWhitespace(textNode
 326                                                                 .getWholeText());
 327
 328                                                 currentLine.append(elementProcessor.processText(line));
 329                                                 currentLine.append(" ");
 330                                         }
 331                                 }
 332
 333                                 @Override
 334                                 public void tail(Node node, int depth) {
 335                                 }
 336                         }).traverse(element);
 337                 }
 338
 339                 if (currentLine.length() > 0) {
 340                         String prep = "";
 341                         for (int i = 0; i < quoted.size(); i++) {
 342                                 prep += ">";
 343                         }
 344                         prep += " ";
 345                         if (currentLine.length() > 0) {
 346                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 347                                         currentLine.setLength(currentLine.length() - 1);
 348                                 }
 349                                 for (String l : currentLine.toString().split("\n")) {
 350                                         lines.add(prep + l);
 351                                 }
 352                         }
 353                 }
 354
 355                 for (int i = 0; i < lines.size(); i++) {
 356                         lines.set(i, lines.get(i).replace("  ", " ").trim());
 357                 }
 358
 359                 return lines;
 360         }
 361 }