src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.text.SimpleDateFormat;
   6 import java.util.ArrayList;
   7 import java.util.Date;
   8 import java.util.List;
   9
  10 import org.jsoup.helper.StringUtil;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.nodes.TextNode;
  14 import org.jsoup.select.Elements;
  15 import org.jsoup.select.NodeTraversor;
  16 import org.jsoup.select.NodeVisitor;
  17
  18 import be.nikiroo.gofetch.data.Story;
  19 import be.nikiroo.utils.Downloader;
  20
  21 /**
  22  * Base class for website support.
  23  *
  24  * @author niki
  25  */
  26 public abstract class BasicSupport {
        /**
         * The downloader to use for all websites.
         * <p>
         * This is a single static instance, so it is shared by every support.
         * NOTE(review): the "gofetcher" argument is presumably the name the
         * downloader identifies itself with -- confirm against
         * {@link Downloader}.
         */
        protected static Downloader downloader = new Downloader("gofetcher");
  29
  30         /**
  31          * The support type (each website we support has a single type).
  32          *
  33          * @author niki
  34          */
  35         public enum Type {
  36                 /** EN: Any, but mostly IT/Sci */
  37                 SLASHDOT,
  38                 /** EN: Clone of Slashdot, mostly abandoned */
  39                 PIPEDOT,
  40                 /** EN: Linux */
  41                 LWN,
  42                 /** FR: Any */
  43                 LEMONDE,
  44                 /** EN: IT */
  45                 REGISTER,
  46                 /** FR: Linux */
  47                 TOO_LINUX,
  48         }
  49
  50         /**
  51          * Used to process an element into lines.
  52          *
  53          * @author niki
  54          */
  55         public interface ElementProcessor {
  56                 /**
  57                  * Detect if this node is a quote and should be trated as such.
  58                  *
  59                  * @param node
  60                  *            the node to check
  61                  * @return TRUE if it is
  62                  */
  63                 public boolean detectQuote(Node node);
  64
  65                 /**
  66                  * Process text content (will be called on each text element, allowing
  67                  * you to modify it if needed).
  68                  *
  69                  * @param text
  70                  *            the text to process
  71                  *
  72                  * @return the resulting text
  73                  */
  74                 public String processText(String text);
  75
  76                 /**
  77                  * Ignore this node.
  78                  *
  79                  * @param node
  80                  *            the node to ignore
  81                  * @return TRUE if it has to be ignored
  82                  */
  83                 public boolean ignoreNode(Node node);
  84
  85                 /**
  86                  * Manually process this node (and return the manual processing value)
  87                  * if so desired.
  88                  * <p>
  89                  * If the node is manually processed, it and its children will not be
  90                  * automatically processed.
  91                  *
  92                  * @param node
  93                  *            the node to optionally process
  94                  *
  95                  * @return NULL if not processed (will thus be automatically processed
  96                  *         as usual), a {@link String} (may be empty) if we process it
  97                  *         manually -- the given {@link String} will be used instead of
  98                  *         the usual automatic processing if not NULL
  99                  */
 100                 public String manualProcessing(Node node);
 101         }
 102
 103         /**
 104          * A default {@link ElementProcessor} (will not detect or process anything
 105          * manually).
 106          *
 107          * @author niki
 108          */
 109         public class BasicElementProcessor implements ElementProcessor {
 110                 @Override
 111                 public boolean detectQuote(Node node) {
 112                         return false;
 113                 }
 114
 115                 @Override
 116                 public String processText(String text) {
 117                         return text;
 118                 }
 119
 120                 @Override
 121                 public boolean ignoreNode(Node node) {
 122                         return false;
 123                 }
 124
 125                 @Override
 126                 public String manualProcessing(Node node) {
 127                         return null;
 128                 }
 129         }
 130
        /**
         * The prefix common to all selectors (shared by all supports); it has no
         * initial value, so it is NULL until {@link #setPreselector(String)} is
         * called.
         */
        static private String preselector;

        /** The type of this support, NULL until {@link #setType(Type)} is called. */
        private Type type;
 134
 135         /**
 136          * List all the recent items, but only assure the ID and internal URL to
 137          * fetch it later on (until it has been fetched, the rest of the
 138          * {@link Story} is not confirmed).
 139          *
 140          * @return the list of new stories
 141          *
 142          * @throws IOException
 143          *             in case of I/O
 144          */
 145         abstract public List<Story> list() throws IOException;
 146
 147         /**
 148          * Fetch the full article content as well as all the comments associated to
 149          * this {@link Story}, if any (can be empty, but not NULL).
 150          *
 151          * @param story
 152          *            the story to fetch the comments of
 153          *
 154          * @throws IOException
 155          *             in case of I/O error
 156          */
 157         abstract public void fetch(Story story) throws IOException;
 158
 159         /**
 160          * The website textual description, to add in the dispatcher page.
 161          * <p>
 162          * Should be short.
 163          *
 164          * @return the description
 165          */
 166         abstract public String getDescription();
 167
 168         /**
 169          * The gopher "selector" to use for output.
 170          * <p>
 171          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 172          *
 173          * @return the selector
 174          */
 175         public String getSelector() {
 176                 return getSelector(type);
 177         }
 178
 179         /**
 180          * The support type.
 181          *
 182          * @return the type
 183          */
 184         public Type getType() {
 185                 return type;
 186         }
 187
        /**
         * Change the type of this support.
         *
         * @param type
         *            the new type
         */
        protected void setType(Type type) {
                this.type = type;
        }
 197
 198         /**
 199          * The {@link String} to append to the selector (the selector will be
 200          * constructed as "this string" then "/type/".
 201          *
 202          * @param preselector
 203          *            the preselector to set
 204          */
 205         static public void setPreselector(String preselector) {
 206                 BasicSupport.preselector = preselector;
 207         }
 208
 209         /**
 210          * Return a {@link BasicSupport} that is compatible with the given
 211          * {@link Type} if it exists (or NULL if not).
 212          *
 213          * @param type
 214          *            the type
 215          *
 216          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 217          */
 218         static public BasicSupport getSupport(Type type) {
 219                 BasicSupport support = null;
 220
 221                 if (type != null) {
 222                         switch (type) {
 223                         case SLASHDOT:
 224                                 support = new Slashdot();
 225                                 break;
 226                         case PIPEDOT:
 227                                 support = new Pipedot();
 228                                 break;
 229                         case LWN:
 230                                 support = new LWN();
 231                                 break;
 232                         case LEMONDE:
 233                                 support = new LeMonde();
 234                                 break;
 235                         case REGISTER:
 236                                 support = new TheRegister();
 237                                 break;
 238                         case TOO_LINUX:
 239                                 support = new TooLinux();
 240                                 break;
 241                         }
 242
 243                         if (support != null) {
 244                                 support.setType(type);
 245                         }
 246                 }
 247
 248                 return support;
 249         }
 250
 251         /**
 252          * The gopher "selector" to use for output for this type, using the
 253          * preselector.
 254          * <p>
 255          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 256          *
 257          * @param type
 258          *            the type to get the selector of
 259          *
 260          * @return the selector
 261          */
 262         static public String getSelector(Type type) {
 263                 return preselector + "/" + type + "/";
 264         }
 265
 266         /**
 267          * Get the first {@link Element} of the given class, or an empty span
 268          * {@link Element} if none found.
 269          *
 270          * @param element
 271          *            the element to look in
 272          * @param className
 273          *            the class to look for
 274          *
 275          * @return the value or an empty span {@link Element}
 276          */
 277         static protected Element firstOrEmpty(Element element, String className) {
 278                 Elements subElements = element.getElementsByClass(className);
 279                 if (subElements.size() > 0) {
 280                         return subElements.get(0);
 281                 }
 282
 283                 return new Element("span");
 284         }
 285
 286         /**
 287          * Get the first {@link Element} of the given tag, or an empty span
 288          * {@link Element} if none found.
 289          *
 290          * @param element
 291          *            the element to look in
 292          * @param tagName
 293          *            the tag to look for
 294          *
 295          * @return the value or an empty span {@link Element}
 296          */
 297         static protected Element firstOrEmptyTag(Element element, String tagName) {
 298                 Elements subElements = element.getElementsByTag(tagName);
 299                 if (subElements.size() > 0) {
 300                         return subElements.get(0);
 301                 }
 302
 303                 return new Element("span");
 304         }
 305
 306         /**
 307          * Process the given element into text (each line is a text paragraph and
 308          * can be prepended with ">" signs to indicate a quote or sub-quote or
 309          * sub-sub-quote...).
 310          *
 311          * @param element
 312          *            the element to process
 313          * @param elementProcessor
 314          *            the element processor, must not be NULL
 315          *
 316          * @return text lines, each line is a paragraph
 317          */
 318         static protected List<String> toLines(Element element,
 319                         final ElementProcessor elementProcessor) {
 320                 final List<String> lines = new ArrayList<String>();
 321                 final StringBuilder currentLine = new StringBuilder();
 322                 final List<Integer> quoted = new ArrayList<Integer>();
 323                 final List<Node> ignoredNodes = new ArrayList<Node>();
 324
 325                 if (element != null) {
 326                         new NodeTraversor(new NodeVisitor() {
 327                                 @Override
 328                                 public void head(Node node, int depth) {
 329                                         String manual = null;
 330                                         boolean ignore = elementProcessor.ignoreNode(node)
 331                                                         || ignoredNodes.contains(node.parentNode());
 332                                         if (!ignore) {
 333                                                 manual = elementProcessor.manualProcessing(node);
 334                                                 if (manual != null) {
 335                                                         currentLine.append(manual);
 336                                                         ignore = true;
 337                                                 }
 338                                         }
 339
 340                                         if (ignore) {
 341                                                 ignoredNodes.add(node);
 342                                                 return;
 343                                         }
 344
 345                                         String prep = "";
 346                                         for (int i = 0; i < quoted.size(); i++) {
 347                                                 prep += ">";
 348                                         }
 349                                         prep += " ";
 350
 351                                         boolean enterQuote = elementProcessor.detectQuote(node);
 352                                         boolean leaveQuote = quoted.contains(depth);
 353
 354                                         if (enterQuote) {
 355                                                 quoted.add(depth);
 356                                         }
 357
 358                                         if (leaveQuote) {
 359                                                 quoted.remove(Integer.valueOf(depth));
 360                                         }
 361
 362                                         if (enterQuote || leaveQuote) {
 363                                                 if (currentLine.length() > 0) {
 364                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 365                                                                 currentLine.setLength(currentLine.length() - 1);
 366                                                         }
 367                                                         for (String l : currentLine.toString().split("\n")) {
 368                                                                 lines.add(prep + l);
 369                                                         }
 370                                                 }
 371                                                 currentLine.setLength(0);
 372                                         }
 373
 374                                         if (node instanceof Element) {
 375                                                 Element element = (Element) node;
 376                                                 boolean block = element.isBlock()
 377                                                                 || element.tagName().equalsIgnoreCase("br");
 378                                                 if (block && currentLine.length() > 0) {
 379                                                         currentLine.append("\n");
 380                                                 }
 381                                         } else if (node instanceof TextNode) {
 382                                                 TextNode textNode = (TextNode) node;
 383                                                 String line = StringUtil.normaliseWhitespace(textNode
 384                                                                 .getWholeText());
 385
 386                                                 currentLine.append(elementProcessor.processText(line));
 387                                                 currentLine.append(" ");
 388                                         }
 389                                 }
 390
 391                                 @Override
 392                                 public void tail(Node node, int depth) {
 393                                 }
 394                         }).traverse(element);
 395                 }
 396
 397                 if (currentLine.length() > 0) {
 398                         String prep = "";
 399                         for (int i = 0; i < quoted.size(); i++) {
 400                                 prep += ">";
 401                         }
 402                         prep += " ";
 403                         if (currentLine.length() > 0) {
 404                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 405                                         currentLine.setLength(currentLine.length() - 1);
 406                                 }
 407                                 for (String l : currentLine.toString().split("\n")) {
 408                                         lines.add(prep + l);
 409                                 }
 410                         }
 411                 }
 412
 413                 for (int i = 0; i < lines.size(); i++) {
 414                         lines.set(i, lines.get(i).replace("  ", " ").trim());
 415                 }
 416
 417                 return lines;
 418         }
 419
 420         /**
 421          * Reformat the date if possible.
 422          *
 423          * @param date
 424          *            the input date
 425          *
 426          * @return the reformated date, or the same value if it was not parsable
 427          */
 428         static protected String date(String date) {
 429                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 430
 431                 long epoch = 0;
 432                 try {
 433                         epoch = Long.parseLong(date);
 434                 } catch (Exception e) {
 435                         epoch = 0;
 436                 }
 437
 438                 if (epoch > 0) {
 439                         return out.format(new Date(1000 * epoch));
 440                 }
 441
 442                 try {
 443                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 444                                         .parse(date.trim());
 445                         return out.format(dat);
 446                 } catch (ParseException e) {
 447                         return date;
 448                 }
 449         }
 450 }