nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.util.ArrayList;
   5 import java.util.List;
   6
   7 import org.jsoup.helper.StringUtil;
   8 import org.jsoup.nodes.Element;
   9 import org.jsoup.nodes.Node;
  10 import org.jsoup.nodes.TextNode;
  11 import org.jsoup.select.Elements;
  12 import org.jsoup.select.NodeTraversor;
  13 import org.jsoup.select.NodeVisitor;
  14
  15 import be.nikiroo.gofetch.data.Story;
  16 import be.nikiroo.utils.Downloader;
  17
  18 public abstract class BasicSupport {
        /**
         * The {@link Downloader} shared by this class and its subclasses; it is
         * created once, when the class is initialised.
         * <p>
         * NOTE(review): "gofetcher" is presumably the name the program uses to
         * identify itself when downloading -- confirm against
         * {@link Downloader}.
         */
        protected static Downloader downloader = new Downloader("gofetcher");
  20
  21         public enum Type {
  22                 SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER, TOOLINUX,
  23         }
  24
  25         /**
  26          * Used to process an element into lines.
  27          *
  28          * @author niki
  29          */
  30         public interface ElementProcessor {
  31                 /**
  32                  * Detect if this node is a quote and should be trated as such.
  33                  *
  34                  * @param node
  35                  *            the node to check
  36                  * @return TRUE if it is
  37                  */
  38                 public boolean detectQuote(Node node);
  39
  40                 /**
  41                  * Process text content (will be called on each text element, allowing
  42                  * you to modify it if needed).
  43                  *
  44                  * @param text
  45                  *            the text to process
  46                  * @return
  47                  */
  48                 public String processText(String text);
  49
  50                 /**
  51                  * Ignore this node.
  52                  *
  53                  * @param node
  54                  *            the node to ignore
  55                  * @return TRUE if it has to be ignored
  56                  */
  57                 public boolean ignoreNode(Node node);
  58
  59                 /**
  60                  * Manually process this node (and return the manual processing value)
  61                  * if so desired.
  62                  * <p>
  63                  * If the node is manually processed, it and its children will not be
  64                  * automatically processed.
  65                  *
  66                  * @param node
  67                  *            the node to optionally process
  68                  *
  69                  * @return NULL if not processed (will thus be automatically processed
  70                  *         as usual), a {@link String} (may be empty) if we process it
  71                  *         manually -- the given {@link String} will be used instead of
  72                  *         the usual automatic processing if not NULL
  73                  */
  74                 public String manualProcessing(Node node);
  75         }
  76
  77         /**
  78          * A default {@link ElementProcessor} (will not detect or process anything
  79          * manually).
  80          *
  81          * @author niki
  82          */
  83         public class BasicElementProcessor implements ElementProcessor {
  84                 @Override
  85                 public boolean detectQuote(Node node) {
  86                         return false;
  87                 }
  88
  89                 @Override
  90                 public String processText(String text) {
  91                         return text;
  92                 }
  93
  94                 @Override
  95                 public boolean ignoreNode(Node node) {
  96                         return false;
  97                 }
  98
  99                 @Override
 100                 public String manualProcessing(Node node) {
 101                         return null;
 102                 }
 103         }
 104
 105         static private String preselector;
 106
 107         private Type type;
 108
 109         /**
 110          * List all the recent items, but only assure the ID and internal URL to
 111          * fetch it later on (until it has been fetched, the rest of the
 112          * {@link Story} is not confirmed).
 113          *
 114          * @return the list of new stories
 115          *
 116          * @throws IOException
 117          *             in case of I/O
 118          */
 119         abstract public List<Story> list() throws IOException;
 120
 121         /**
 122          * Fetch the full article content as well as all the comments associated to
 123          * this {@link Story}, if any (can be empty, but not NULL).
 124          *
 125          * @param story
 126          *            the story to fetch the comments of
 127          *
 128          * @throws IOException
 129          *             in case of I/O error
 130          */
 131         abstract public void fetch(Story story) throws IOException;
 132
 133         abstract public String getDescription();
 134
 135         public String getSelector() {
 136                 return getSelector(type);
 137         }
 138
 139         public Type getType() {
 140                 return type;
 141         }
 142
 143         protected void setType(Type type) {
 144                 this.type = type;
 145         }
 146
 147         /**
 148          * @param preselector
 149          *            the preselector to set
 150          */
 151         static public void setPreselector(String preselector) {
 152                 BasicSupport.preselector = preselector;
 153         }
 154
 155         /**
 156          * Return a {@link BasicSupport} that is compatible with the given
 157          * {@link Type} if it exists (or NULL if not).
 158          *
 159          * @param type
 160          *            the type
 161          *
 162          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 163          */
 164         static public BasicSupport getSupport(Type type) {
 165                 BasicSupport support = null;
 166
 167                 if (type != null) {
 168                         switch (type) {
 169                         case SLASHDOT:
 170                                 support = new Slashdot();
 171                                 break;
 172                         case PIPEDOT:
 173                                 support = new Pipedot();
 174                                 break;
 175                         case LWN:
 176                                 support = new LWN();
 177                                 break;
 178                         case LEMONDE:
 179                                 support = new LeMonde();
 180                                 break;
 181                         case REGISTER:
 182                                 support = new TheRegister();
 183                                 break;
 184                         case TOOLINUX:
 185                                 support = new TooLinux();
 186                                 break;
 187                         }
 188
 189                         if (support != null) {
 190                                 support.setType(type);
 191                         }
 192                 }
 193
 194                 return support;
 195         }
 196
 197         static public String getSelector(Type type) {
 198                 return preselector + "/" + type + "/";
 199         }
 200
 201         /**
 202          * Get the first {@link Element} of the given class, or an empty span
 203          * {@link Element} if none found.
 204          *
 205          * @param element
 206          *            the element to look in
 207          * @param className
 208          *            the class to look for
 209          *
 210          * @return the value or an empty span {@link Element}
 211          */
 212         static protected Element firstOrEmpty(Element element, String className) {
 213                 Elements subElements = element.getElementsByClass(className);
 214                 if (subElements.size() > 0) {
 215                         return subElements.get(0);
 216                 }
 217
 218                 return new Element("span");
 219         }
 220
 221         /**
 222          * Get the first {@link Element} of the given tag, or an empty span
 223          * {@link Element} if none found.
 224          *
 225          * @param element
 226          *            the element to look in
 227          * @param tagName
 228          *            the tag to look for
 229          *
 230          * @return the value or an empty span {@link Element}
 231          */
 232         static protected Element firstOrEmptyTag(Element element, String tagName) {
 233                 Elements subElements = element.getElementsByTag(tagName);
 234                 if (subElements.size() > 0) {
 235                         return subElements.get(0);
 236                 }
 237
 238                 return new Element("span");
 239         }
 240
 241         /**
 242          * Process the given element into text (each line is a text paragraph and
 243          * can be prepended with ">" signs to indicate a quote or sub-quote or
 244          * sub-sub-quote...).
 245          *
 246          * @param element
 247          *            the element to process
 248          * @param elementProcessor
 249          *            the element processor, must not be NULL
 250          *
 251          * @return text lines, each line is a paragraph
 252          */
 253         static protected List<String> toLines(Element element,
 254                         final ElementProcessor elementProcessor) {
 255                 final List<String> lines = new ArrayList<String>();
 256                 final StringBuilder currentLine = new StringBuilder();
 257                 final List<Integer> quoted = new ArrayList<Integer>();
 258                 final List<Node> ignoredNodes = new ArrayList<Node>();
 259                 final List<String> footnotes = new ArrayList<String>();
 260
 261                 if (element != null) {
 262                         new NodeTraversor(new NodeVisitor() {
 263                                 @Override
 264                                 public void head(Node node, int depth) {
 265                                         String manual = null;
 266                                         boolean ignore = elementProcessor.ignoreNode(node)
 267                                                         || ignoredNodes.contains(node.parentNode());
 268                                         if (!ignore) {
 269                                                 manual = elementProcessor.manualProcessing(node);
 270                                                 if (manual != null) {
 271                                                         currentLine.append(manual);
 272                                                         ignore = true;
 273                                                 }
 274                                         }
 275
 276                                         if (ignore) {
 277                                                 ignoredNodes.add(node);
 278                                                 return;
 279                                         }
 280
 281                                         String prep = "";
 282                                         for (int i = 0; i < quoted.size(); i++) {
 283                                                 prep += ">";
 284                                         }
 285                                         prep += " ";
 286
 287                                         boolean enterQuote = elementProcessor.detectQuote(node);
 288                                         boolean leaveQuote = quoted.contains(depth);
 289
 290                                         if (enterQuote) {
 291                                                 quoted.add(depth);
 292                                         }
 293
 294                                         if (leaveQuote) {
 295                                                 quoted.remove(Integer.valueOf(depth));
 296                                         }
 297
 298                                         if (enterQuote || leaveQuote) {
 299                                                 if (currentLine.length() > 0) {
 300                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 301                                                                 currentLine.setLength(currentLine.length() - 1);
 302                                                         }
 303                                                         for (String l : currentLine.toString().split("\n")) {
 304                                                                 lines.add(prep + l);
 305                                                         }
 306                                                 }
 307                                                 currentLine.setLength(0);
 308                                         }
 309
 310                                         if (node instanceof Element) {
 311                                                 Element element = (Element) node;
 312                                                 boolean block = element.isBlock()
 313                                                                 || element.tagName().equalsIgnoreCase("br");
 314                                                 if (block && currentLine.length() > 0) {
 315                                                         currentLine.append("\n");
 316                                                 }
 317
 318                                                 if (!element.absUrl("href").trim().isEmpty()) {
 319                                                         footnotes.add(element.absUrl("href"));
 320                                                         currentLine.append("[" + footnotes.size() + "]");
 321                                                 }
 322                                         } else if (node instanceof TextNode) {
 323                                                 TextNode textNode = (TextNode) node;
 324                                                 String line = StringUtil.normaliseWhitespace(textNode
 325                                                                 .getWholeText());
 326
 327                                                 currentLine.append(elementProcessor.processText(line));
 328                                                 currentLine.append(" ");
 329                                         }
 330                                 }
 331
 332                                 @Override
 333                                 public void tail(Node node, int depth) {
 334                                 }
 335                         }).traverse(element);
 336                 }
 337
 338                 if (currentLine.length() > 0) {
 339                         String prep = "";
 340                         for (int i = 0; i < quoted.size(); i++) {
 341                                 prep += ">";
 342                         }
 343                         prep += " ";
 344                         if (currentLine.length() > 0) {
 345                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 346                                         currentLine.setLength(currentLine.length() - 1);
 347                                 }
 348                                 for (String l : currentLine.toString().split("\n")) {
 349                                         lines.add(prep + l);
 350                                 }
 351                         }
 352                 }
 353
 354                 for (int i = 0; i < lines.size(); i++) {
 355                         lines.set(i, lines.get(i).replace("  ", " ").trim());
 356                 }
 357
 358                 if (footnotes.size() > 0) {
 359                         lines.add("");
 360                         lines.add("");
 361                         lines.add("");
 362                         lines.add("");
 363                         for (int i = 0; i < footnotes.size(); i++) {
 364                                 lines.add("[" + (i + 1) + "] " + footnotes.get(i));
 365                         }
 366                 }
 367
 368                 return lines;
 369         }
 370 }