src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.text.ParseException;
   5 import java.text.SimpleDateFormat;
   6 import java.util.ArrayList;
   7 import java.util.Date;
   8 import java.util.List;
   9
  10 import org.jsoup.helper.StringUtil;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.nodes.TextNode;
  14 import org.jsoup.select.Elements;
  15 import org.jsoup.select.NodeTraversor;
  16 import org.jsoup.select.NodeVisitor;
  17
  18 import be.nikiroo.gofetch.data.Story;
  19 import be.nikiroo.utils.Downloader;
  20
  21 /**
  22  * Base class for website support.
  23  *
  24  * @author niki
  25  */
  26 public abstract class BasicSupport {
  27         /** The downloader to use for all websites. */
  28         protected static Downloader downloader = new Downloader("gofetcher");
  29
  30         /**
  31          * The support type (each website we support has a single type).
  32          *
  33          * @author niki
  34          */
  35         public enum Type {
  36                 /** EN: Any, but mostly IT/Sci */
  37                 SLASHDOT,
  38                 /** EN: Clone of Slashdot, mostly abandoned */
  39                 PIPEDOT,
  40                 /** EN: Linux */
  41                 LWN,
  42                 /** FR: Any */
  43                 LEMONDE,
  44                 /** EN: IT */
  45                 REGISTER,
  46                 /** FR: Linux */
  47                 TOO_LINUX,
  48                 /** FR: IT */
  49                 ERE_NUMERIQUE,
  50         }
  51
  52         /**
  53          * Used to process an element into lines.
  54          *
  55          * @author niki
  56          */
  57         public interface ElementProcessor {
  58                 /**
  59                  * Detect if this node is a quote and should be trated as such.
  60                  *
  61                  * @param node
  62                  *            the node to check
  63                  * @return TRUE if it is
  64                  */
  65                 public boolean detectQuote(Node node);
  66
  67                 /**
  68                  * Process text content (will be called on each text element, allowing
  69                  * you to modify it if needed).
  70                  *
  71                  * @param text
  72                  *            the text to process
  73                  *
  74                  * @return the resulting text
  75                  */
  76                 public String processText(String text);
  77
  78                 /**
  79                  * Ignore this node.
  80                  *
  81                  * @param node
  82                  *            the node to ignore
  83                  * @return TRUE if it has to be ignored
  84                  */
  85                 public boolean ignoreNode(Node node);
  86
  87                 /**
  88                  * Manually process this node (and return the manual processing value)
  89                  * if so desired.
  90                  * <p>
  91                  * If the node is manually processed, it and its children will not be
  92                  * automatically processed.
  93                  *
  94                  * @param node
  95                  *            the node to optionally process
  96                  *
  97                  * @return NULL if not processed (will thus be automatically processed
  98                  *         as usual), a {@link String} (may be empty) if we process it
  99                  *         manually -- the given {@link String} will be used instead of
 100                  *         the usual automatic processing if not NULL
 101                  */
 102                 public String manualProcessing(Node node);
 103
 104                 /**
 105                  * This {@link Node} is a subtitle and should be treated as such
 106                  * (highlighted).
 107                  *
 108                  * @param node
 109                  *            the node to check
 110                  *
 111                  * @return NULL if it is not a subtitle, the subtitle to use if it is
 112                  */
 113                 public String isSubtitle(Node node);
 114         }
 115
 116         /**
 117          * A default {@link ElementProcessor} (will not detect or process anything
 118          * manually).
 119          *
 120          * @author niki
 121          */
 122         public class BasicElementProcessor implements ElementProcessor {
 123                 @Override
 124                 public boolean detectQuote(Node node) {
 125                         return false;
 126                 }
 127
 128                 @Override
 129                 public String processText(String text) {
 130                         return text;
 131                 }
 132
 133                 @Override
 134                 public boolean ignoreNode(Node node) {
 135                         return false;
 136                 }
 137
 138                 @Override
 139                 public String manualProcessing(Node node) {
 140                         return null;
 141                 }
 142
 143                 @Override
 144                 public String isSubtitle(Node node) {
 145                         return null;
 146                 }
 147         }
 148
 149         static private String preselector;
 150
 151         private Type type;
 152
 153         /**
 154          * List all the recent items, but only assure the ID and internal URL to
 155          * fetch it later on (until it has been fetched, the rest of the
 156          * {@link Story} is not confirmed).
 157          *
 158          * @return the list of new stories
 159          *
 160          * @throws IOException
 161          *             in case of I/O
 162          */
 163         abstract public List<Story> list() throws IOException;
 164
 165         /**
 166          * Fetch the full article content as well as all the comments associated to
 167          * this {@link Story}, if any (can be empty, but not NULL).
 168          *
 169          * @param story
 170          *            the story to fetch the comments of
 171          *
 172          * @throws IOException
 173          *             in case of I/O error
 174          */
 175         abstract public void fetch(Story story) throws IOException;
 176
 177         /**
 178          * The website textual description, to add in the dispatcher page.
 179          * <p>
 180          * Should be short.
 181          *
 182          * @return the description
 183          */
 184         abstract public String getDescription();
 185
 186         /**
 187          * The gopher "selector" to use for output.
 188          * <p>
 189          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 190          *
 191          * @return the selector
 192          */
 193         public String getSelector() {
 194                 return getSelector(type);
 195         }
 196
 197         /**
 198          * The support type.
 199          *
 200          * @return the type
 201          */
 202         public Type getType() {
 203                 return type;
 204         }
 205
 206         /**
 207          * The support type.
 208          *
 209          * @param type
 210          *            the new type
 211          */
 212         protected void setType(Type type) {
 213                 this.type = type;
 214         }
 215
 216         /**
 217          * The {@link String} to append to the selector (the selector will be
 218          * constructed as "this string" then "/type/".
 219          *
 220          * @param preselector
 221          *            the preselector to set
 222          */
 223         static public void setPreselector(String preselector) {
 224                 BasicSupport.preselector = preselector;
 225         }
 226
 227         /**
 228          * Return a {@link BasicSupport} that is compatible with the given
 229          * {@link Type} if it exists (or NULL if not).
 230          *
 231          * @param type
 232          *            the type
 233          *
 234          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 235          */
 236         static public BasicSupport getSupport(Type type) {
 237                 BasicSupport support = null;
 238
 239                 if (type != null) {
 240                         switch (type) {
 241                         case SLASHDOT:
 242                                 support = new Slashdot();
 243                                 break;
 244                         case PIPEDOT:
 245                                 support = new Pipedot();
 246                                 break;
 247                         case LWN:
 248                                 support = new LWN();
 249                                 break;
 250                         case LEMONDE:
 251                                 support = new LeMonde();
 252                                 break;
 253                         case REGISTER:
 254                                 support = new TheRegister();
 255                                 break;
 256                         case TOO_LINUX:
 257                                 support = new TooLinux();
 258                                 break;
 259                         case ERE_NUMERIQUE:
 260                                 support = new EreNumerique();
 261                                 break;
 262                         }
 263
 264                         if (support != null) {
 265                                 support.setType(type);
 266                         }
 267                 }
 268
 269                 return support;
 270         }
 271
 272         /**
 273          * The gopher "selector" to use for output for this type, using the
 274          * preselector.
 275          * <p>
 276          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 277          *
 278          * @param type
 279          *            the type to get the selector of
 280          *
 281          * @return the selector
 282          */
 283         static public String getSelector(Type type) {
 284                 return preselector + "/" + type + "/";
 285         }
 286
 287         /**
 288          * Get the first {@link Element} of the given class, or an empty span
 289          * {@link Element} if none found.
 290          *
 291          * @param element
 292          *            the element to look in
 293          * @param className
 294          *            the class to look for
 295          *
 296          * @return the value or an empty span {@link Element}
 297          */
 298         static protected Element firstOrEmpty(Element element, String className) {
 299                 Elements subElements = element.getElementsByClass(className);
 300                 if (subElements.size() > 0) {
 301                         return subElements.get(0);
 302                 }
 303
 304                 return new Element("span");
 305         }
 306
 307         /**
 308          * Get the first {@link Element} of the given tag, or an empty span
 309          * {@link Element} if none found.
 310          *
 311          * @param element
 312          *            the element to look in
 313          * @param tagName
 314          *            the tag to look for
 315          *
 316          * @return the value or an empty span {@link Element}
 317          */
 318         static protected Element firstOrEmptyTag(Element element, String tagName) {
 319                 Elements subElements = element.getElementsByTag(tagName);
 320                 if (subElements.size() > 0) {
 321                         return subElements.get(0);
 322                 }
 323
 324                 return new Element("span");
 325         }
 326
 327         /**
 328          * Process the given element into text (each line is a text paragraph and
 329          * can be prepended with ">" signs to indicate a quote or sub-quote or
 330          * sub-sub-quote...).
 331          *
 332          * @param element
 333          *            the element to process
 334          * @param elementProcessor
 335          *            the element processor, must not be NULL
 336          *
 337          * @return text lines, each line is a paragraph
 338          */
 339         static protected List<String> toLines(Element element,
 340                         final ElementProcessor elementProcessor) {
 341                 final List<String> lines = new ArrayList<String>();
 342                 final StringBuilder currentLine = new StringBuilder();
 343                 final List<Integer> quoted = new ArrayList<Integer>();
 344                 final List<Node> ignoredNodes = new ArrayList<Node>();
 345
 346                 if (element != null) {
 347                         new NodeTraversor(new NodeVisitor() {
 348                                 @Override
 349                                 public void head(Node node, int depth) {
 350                                         String manual = null;
 351                                         boolean ignore = elementProcessor.ignoreNode(node)
 352                                                         || ignoredNodes.contains(node.parentNode());
 353                                         // Manual processing
 354                                         if (!ignore) {
 355                                                 manual = elementProcessor.manualProcessing(node);
 356                                                 if (manual != null) {
 357                                                         currentLine.append(manual);
 358                                                         ignore = true;
 359                                                 }
 360                                         }
 361
 362                                         // Subtitle check
 363                                         if (!ignore) {
 364                                                 String subtitle = elementProcessor.isSubtitle(node);
 365                                                 if (subtitle != null) {
 366                                                         subtitle = subtitle.trim();
 367                                                         currentLine.append("\n[ " + subtitle + " ]\n");
 368                                                         ignore = true;
 369                                                 }
 370                                         }
 371
 372                                         if (ignore) {
 373                                                 ignoredNodes.add(node);
 374                                                 return;
 375                                         }
 376
 377                                         String prep = "";
 378                                         for (int i = 0; i < quoted.size(); i++) {
 379                                                 prep += ">";
 380                                         }
 381                                         prep += " ";
 382
 383                                         boolean enterQuote = elementProcessor.detectQuote(node);
 384                                         boolean leaveQuote = quoted.contains(depth);
 385
 386                                         if (enterQuote) {
 387                                                 quoted.add(depth);
 388                                         }
 389
 390                                         if (leaveQuote) {
 391                                                 quoted.remove(Integer.valueOf(depth));
 392                                         }
 393
 394                                         if (enterQuote || leaveQuote) {
 395                                                 if (currentLine.length() > 0) {
 396                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 397                                                                 currentLine.setLength(currentLine.length() - 1);
 398                                                         }
 399                                                         for (String l : currentLine.toString().split("\n")) {
 400                                                                 lines.add(prep + l);
 401                                                         }
 402                                                 }
 403                                                 currentLine.setLength(0);
 404                                         }
 405
 406                                         if (node instanceof Element) {
 407                                                 Element element = (Element) node;
 408                                                 boolean block = element.isBlock()
 409                                                                 || element.tagName().equalsIgnoreCase("br");
 410                                                 if (block && currentLine.length() > 0) {
 411                                                         currentLine.append("\n");
 412                                                 }
 413                                         } else if (node instanceof TextNode) {
 414                                                 TextNode textNode = (TextNode) node;
 415                                                 String line = StringUtil.normaliseWhitespace(textNode
 416                                                                 .getWholeText());
 417
 418                                                 currentLine.append(elementProcessor.processText(line));
 419                                                 currentLine.append(" ");
 420                                         }
 421                                 }
 422
 423                                 @Override
 424                                 public void tail(Node node, int depth) {
 425                                 }
 426                         }).traverse(element);
 427                 }
 428
 429                 if (currentLine.length() > 0) {
 430                         String prep = "";
 431                         for (int i = 0; i < quoted.size(); i++) {
 432                                 prep += ">";
 433                         }
 434                         prep += " ";
 435                         if (currentLine.length() > 0) {
 436                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 437                                         currentLine.setLength(currentLine.length() - 1);
 438                                 }
 439                                 for (String l : currentLine.toString().split("\n")) {
 440                                         lines.add(prep + l);
 441                                 }
 442                         }
 443                 }
 444
 445                 for (int i = 0; i < lines.size(); i++) {
 446                         lines.set(i, lines.get(i).replace("  ", " ").trim());
 447                 }
 448
 449                 return lines;
 450         }
 451
 452         /**
 453          * Reformat the date if possible.
 454          *
 455          * @param date
 456          *            the input date
 457          *
 458          * @return the reformated date, or the same value if it was not parsable
 459          */
 460         static protected String date(String date) {
 461                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 462
 463                 long epoch = 0;
 464                 try {
 465                         epoch = Long.parseLong(date.trim());
 466                 } catch (Exception e) {
 467                         epoch = 0;
 468                 }
 469
 470                 if (epoch > 0) {
 471                         return out.format(new Date(1000 * epoch));
 472                 }
 473
 474                 try {
 475                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 476                                         .parse(date.trim());
 477                         return out.format(dat);
 478                 } catch (ParseException e) {
 479                         return date;
 480                 }
 481         }
 482 }