src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.text.ParseException;
   7 import java.text.SimpleDateFormat;
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.Date;
  11 import java.util.List;
  12 import java.util.Map.Entry;
  13
  14 import org.jsoup.helper.DataUtil;
  15 import org.jsoup.helper.StringUtil;
  16 import org.jsoup.nodes.Document;
  17 import org.jsoup.nodes.Element;
  18 import org.jsoup.nodes.Node;
  19 import org.jsoup.nodes.TextNode;
  20 import org.jsoup.select.NodeTraversor;
  21 import org.jsoup.select.NodeVisitor;
  22
  23 import be.nikiroo.gofetch.data.Comment;
  24 import be.nikiroo.gofetch.data.Story;
  25 import be.nikiroo.utils.Downloader;
  26 import be.nikiroo.utils.StringUtils;
  27
  28 /**
  29  * Base class for website support.
  30  *
  31  * @author niki
  32  */
  33 public abstract class BasicSupport {
  34         /** The downloader to use for all websites. */
  35         static protected Downloader downloader = new Downloader("gofetcher");
  36
  37         static private String preselector;
  38
  39         private Type type;
  40
  41         /**
  42          * The website textual description, to add in the dispatcher page.
  43          * <p>
  44          * Should be short.
  45          *
  46          * @return the description
  47          */
  48         abstract public String getDescription();
  49
  50         /**
  51          * The gopher "selector" to use for output.
  52          * <p>
  53          * A kind of "URL path", like "/news/" or "/misc/news/" or...
  54          *
  55          * @return the selector
  56          */
  57         public String getSelector() {
  58                 return getSelector(type);
  59         }
  60
  61         /**
  62          * The support type.
  63          *
  64          * @return the type
  65          */
  66         public Type getType() {
  67                 return type;
  68         }
  69
  70         /**
  71          * List all the recent items, but only assure the ID and internal URL to
  72          * fetch it later on (until it has been fetched, the rest of the
  73          * {@link Story} is not confirmed).
  74          *
  75          * @return the list of new stories
  76          *
  77          * @throws IOException
  78          *             in case of I/O
  79          */
  80         public List<Story> list() throws IOException {
  81                 List<Story> list = new ArrayList<Story>();
  82
  83                 for (Entry<URL, String> entry : getUrls()) {
  84                         URL url = entry.getKey();
  85                         String defaultCateg = entry.getValue();
  86                         if (defaultCateg == null) {
  87                                 defaultCateg = "";
  88                         }
  89
  90                         InputStream in = downloader.open(url);
  91                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  92                         List<Element> articles = getArticles(doc);
  93                         for (Element article : articles) {
  94                                 String id = getArticleId(doc, article).trim();
  95                                 String title = getArticleTitle(doc, article).trim();
  96                                 String author = getArticleAuthor(doc, article).trim();
  97                                 String date = getArticleDate(doc, article).trim();
  98                                 String categ = getArticleCategory(doc, article, defaultCateg)
  99                                                 .trim();
 100                                 String details = getArticleDetails(doc, article).trim();
 101                                 String intUrl = getArticleIntUrl(doc, article).trim();
 102                                 String extUrl = getArticleExtUrl(doc, article).trim();
 103                                 String content = getArticleContent(doc, article).trim();
 104
 105                                 if (id.isEmpty() && date.isEmpty()) {
 106                                         continue;
 107                                 }
 108
 109                                 if (!id.isEmpty()) {
 110                                         while (id.length() < 10) {
 111                                                 id = "0" + id;
 112                                         }
 113                                 } else {
 114                                         id = date.replace(":", "_").replace("+", "_");
 115                                 }
 116
 117                                 date = date(date);
 118
 119                                 list.add(new Story(getType(), id, title, author, date, categ,
 120                                                 details, intUrl, extUrl, content));
 121                         }
 122                 }
 123
 124                 return list;
 125         }
 126
 127         /**
 128          * The {@link URL}s to process for this website.
 129          *
 130          * @return the list of {@link URL}s
 131          *
 132          * @throws IOException
 133          *             in case of I/O error
 134          */
 135         abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 136
 137         /**
 138          * The article {@link Element}s of this document.
 139          *
 140          * @param doc
 141          *            the main document for the current category
 142          *
 143          * @return the articles
 144          */
 145         abstract protected List<Element> getArticles(Document doc);
 146
 147         /**
 148          * The ID of the article (defaults to the date element if empty).
 149          *
 150          * @param doc
 151          *            the main document for the current category
 152          * @param article
 153          *            the article to look into
 154          *
 155          * @return the ID
 156          */
 157         abstract protected String getArticleId(Document doc, Element article);
 158
 159         /**
 160          * The article title to display.
 161          *
 162          * @param doc
 163          *            the main document for the current category
 164          * @param article
 165          *            the article to look into
 166          *
 167          * @return the title
 168          */
 169         abstract protected String getArticleTitle(Document doc, Element article);
 170
 171         /**
 172          * The optional article author.
 173          *
 174          * @param doc
 175          *            the main document for the current category
 176          * @param article
 177          *            the article to look into
 178          *
 179          * @return the author
 180          */
 181         abstract protected String getArticleAuthor(Document doc, Element article);
 182
 183         /**
 184          * The optional article date.
 185          *
 186          * @param doc
 187          *            the main document for the current category
 188          * @param article
 189          *            the article to look into
 190          *
 191          * @return the date
 192          */
 193         abstract protected String getArticleDate(Document doc, Element article);
 194
 195         /**
 196          * the optional article category.
 197          *
 198          * @param doc
 199          *            the main document for the current category
 200          * @param article
 201          *            the article to look into
 202          * @param currentCategory
 203          *            the currently listed category if any (can be NULL)
 204          *
 205          * @return the category
 206          */
 207         abstract protected String getArticleCategory(Document doc, Element article,
 208                         String currentCategory);
 209
 210         /**
 211          * the optional details of the article (can replace the date, author and
 212          * category, for instance).
 213          *
 214          * @param doc
 215          *            the main document for the current category
 216          * @param article
 217          *            the article to look into
 218          *
 219          * @return the details
 220          */
 221         abstract protected String getArticleDetails(Document doc, Element article);
 222
 223         /**
 224          * The (required) {@link URL} that points to the news page on the supported
 225          * website.
 226          *
 227          * @param doc
 228          *            the main document for the current category
 229          * @param article
 230          *            the article to look into
 231          *
 232          * @return the internal {@link URL}
 233          */
 234         abstract protected String getArticleIntUrl(Document doc, Element article);
 235
 236         /**
 237          * the optional {@link URL} that points to an external website for more
 238          * information.
 239          *
 240          * @param doc
 241          *            the main document for the current category
 242          * @param article
 243          *            the article to look into
 244          *
 245          * @return the external {@link URL}
 246          */
 247         abstract protected String getArticleExtUrl(Document doc, Element article);
 248
 249         /**
 250          * The optional article short-content (not the full content, that will be
 251          * fetched by {@link BasicSupport#fetch(Story)}).
 252          *
 253          * @param doc
 254          *            the main document for the current category
 255          * @param article
 256          *            the article to look into
 257          *
 258          * @return the short content
 259          */
 260         abstract protected String getArticleContent(Document doc, Element article);
 261
 262         /**
 263          * Fetch the full article content as well as all the comments associated to
 264          * this {@link Story}, if any (can be empty, but not NULL).
 265          *
 266          * @param story
 267          *            the story to fetch the comments of
 268          *
 269          * @throws IOException
 270          *             in case of I/O error
 271          */
 272         public void fetch(Story story) throws IOException {
 273                 String fullContent = "";
 274
 275                 URL url = new URL(story.getUrlInternal());
 276                 InputStream in = downloader.open(url);
 277                 try {
 278                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 279                         Element article = getFullArticle(doc);
 280                         if (article != null) {
 281                                 StringBuilder builder = new StringBuilder();
 282                                 ElementProcessor eProc = getElementProcessorFullArticle();
 283                                 if (eProc != null) {
 284                                         for (String line : toLines(article, eProc)) {
 285                                                 builder.append(line + "\n");
 286                                         }
 287                                 } else {
 288                                         builder.append(article.text());
 289                                 }
 290
 291                                 // Content is too tight with a single break per line:
 292                                 fullContent = builder.toString().replace("\n", "\n\n") //
 293                                                 .replace("\n\n\n\n", "\n\n") //
 294                                                 .replace("\n\n\n\n", "\n\n") //
 295                                                 .trim();
 296                         }
 297
 298                         if (fullContent.isEmpty()) {
 299                                 fullContent = story.getContent();
 300                         }
 301
 302                         story.setFullContent(fullContent);
 303                         story.setComments(getComments(doc,
 304                                         getFullArticleCommentPosts(doc, url)));
 305                 } finally {
 306                         if (in != null) {
 307                                 in.close();
 308                         }
 309                 }
 310         }
 311
 312         /**
 313          * Return the full article if available.
 314          *
 315          * @param doc
 316          *            the (full article) document to work on
 317          *
 318          * @return the article or NULL
 319          */
 320         abstract protected Element getFullArticle(Document doc);
 321
 322         /**
 323          * Return the list of comment {@link Element}s from this optional container
 324          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 325          *
 326          * @param doc
 327          *            the (full article) document to work on
 328          * @param intUrl
 329          *            the internal {@link URL} this article wa taken from (the
 330          *            {@link URL} from the supported website)
 331          *
 332          * @return the list of comment posts
 333          */
 334         abstract protected List<Element> getFullArticleCommentPosts(Document doc,
 335                         URL intUrl);
 336
 337         /**
 338          * The {@link ElementProcessor} to use to convert the main article element
 339          * (see {@link BasicSupport#getFullArticle(Document)}) into text.
 340          * <p>
 341          * See {@link BasicElementProcessor} for a working, basic implementation.
 342          * <p>
 343          * Can be NULL to simply use {@link Element#text()}.
 344          *
 345          * @return the processor, or NULL
 346          */
 347         abstract protected ElementProcessor getElementProcessorFullArticle();
 348
 349         /**
 350          * Convert the comment elements into {@link Comment}s
 351          *
 352          * @param doc
 353          *            the document we work on
 354          * @param posts
 355          *            the comment elements
 356          *
 357          * @return the converted {@link Comment}s
 358          */
 359         private List<Comment> getComments(Document doc, List<Element> posts) {
 360                 List<Comment> comments = new ArrayList<Comment>();
 361                 if (posts != null) {
 362                         for (Element post : posts) {
 363                                 String id = getCommentId(post).trim();
 364                                 String author = getCommentAuthor(post).trim();
 365                                 String title = getCommentTitle(post).trim();
 366                                 String date = getCommentDate(post).trim();
 367
 368                                 List<String> content = new ArrayList<String>();
 369
 370                                 if (id.isEmpty()) {
 371                                         id = date;
 372                                 }
 373
 374                                 date = date(date);
 375
 376                                 Element contentE = getCommentContentElement(post);
 377                                 if (contentE != null) {
 378                                         ElementProcessor eProc = getElementProcessorComment();
 379                                         if (eProc != null) {
 380                                                 for (String line : toLines(contentE, eProc)) {
 381                                                         content.add(line);
 382                                                 }
 383                                         } else {
 384                                                 content = Arrays.asList(contentE.text().split("\n"));
 385                                         }
 386                                 }
 387
 388                                 Comment comment = new Comment(id, author, title, date, content);
 389                                 comment.addAll(getComments(doc,
 390                                                 getCommentCommentPosts(doc, post)));
 391
 392                                 if (!comment.isEmpty()) {
 393                                         comments.add(comment);
 394                                 }
 395                         }
 396                 }
 397
 398                 return comments;
 399         }
 400
 401         /**
 402          * Return the list of subcomment {@link Element}s from this comment element
 403          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 404          *
 405          * @param doc
 406          *            the (full article) document to work on
 407          * @param container
 408          *            the container (a comment {@link Element})
 409          *
 410          * @return the list of comment posts
 411          */
 412         abstract protected List<Element> getCommentCommentPosts(Document doc,
 413                         Element container);
 414
 415         /**
 416          * Compute the ID of the given comment element.
 417          *
 418          * @param post
 419          *            the comment element
 420          *
 421          * @return the ID
 422          */
 423         abstract protected String getCommentId(Element post);
 424
 425         /**
 426          * Compute the author of the given comment element.
 427          *
 428          * @param post
 429          *            the comment element
 430          *
 431          * @return the author
 432          */
 433         abstract protected String getCommentAuthor(Element post);
 434
 435         /**
 436          * Compute the title of the given comment element.
 437          *
 438          * @param post
 439          *            the comment element
 440          *
 441          * @return the title
 442          */
 443         abstract protected String getCommentTitle(Element post);
 444
 445         /**
 446          * Compute the date of the given comment element.
 447          *
 448          * @param post
 449          *            the comment element
 450          *
 451          * @return the date
 452          */
 453         abstract protected String getCommentDate(Element post);
 454
 455         /**
 456          * Get the main of the given comment element, which can be NULL.
 457          *
 458          * @param post
 459          *            the comment element
 460          *
 461          * @return the element
 462          */
 463         abstract protected Element getCommentContentElement(Element post);
 464
 465         /**
 466          * The {@link ElementProcessor} to use to convert the main comment element
 467          * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
 468          * <p>
 469          * See {@link BasicElementProcessor} for a working, basic implementation.
 470          * <p>
 471          * Can be NULL to simply use {@link Element#text()}.
 472          *
 473          * @return the processor
 474          */
 475         abstract protected ElementProcessor getElementProcessorComment();
 476
 477         /**
 478          * The support type.
 479          *
 480          * @param type
 481          *            the new type
 482          */
 483         protected void setType(Type type) {
 484                 this.type = type;
 485         }
 486
 487         /**
 488          * The {@link String} to append to the selector (the selector will be
 489          * constructed as "this string" then "/type/".
 490          *
 491          * @param preselector
 492          *            the preselector to set
 493          */
 494         static public void setPreselector(String preselector) {
 495                 BasicSupport.preselector = preselector;
 496         }
 497
 498         /**
 499          * Return a {@link BasicSupport} that is compatible with the given
 500          * {@link Type} if it exists (or NULL if not).
 501          *
 502          * @param type
 503          *            the type
 504          *
 505          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 506          */
 507         static public BasicSupport getSupport(Type type) {
 508                 BasicSupport support = null;
 509
 510                 if (type != null) {
 511                         switch (type) {
 512                         case SLASHDOT:
 513                                 support = new Slashdot();
 514                                 break;
 515                         case PIPEDOT:
 516                                 support = new Pipedot();
 517                                 break;
 518                         case LWN:
 519                                 support = new LWN();
 520                                 break;
 521                         case LEMONDE:
 522                                 support = new LeMonde();
 523                                 break;
 524                         case REGISTER:
 525                                 support = new TheRegister();
 526                                 break;
 527                         case TOO_LINUX:
 528                                 support = new TooLinux();
 529                                 break;
 530                         case ERE_NUMERIQUE:
 531                                 support = new EreNumerique();
 532                                 break;
 533                         case PHORONIX:
 534                                 support = new Phoronix();
 535                                 break;
 536                         }
 537
 538                         if (support != null) {
 539                                 support.setType(type);
 540                         }
 541                 }
 542
 543                 return support;
 544         }
 545
 546         /**
 547          * The gopher "selector" to use for output for this type, using the
 548          * preselector.
 549          * <p>
 550          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 551          *
 552          * @param type
 553          *            the type to get the selector of
 554          *
 555          * @return the selector
 556          */
 557         static public String getSelector(Type type) {
 558                 return preselector + "/" + type + "/";
 559         }
 560
 561         /**
 562          * Process the given element into text (each line is a text paragraph and
 563          * can be prepended with ">" signs to indicate a quote or sub-quote or
 564          * sub-sub-quote...).
 565          *
 566          * @param element
 567          *            the element to process
 568          * @param elementProcessor
 569          *            the element processor, must not be NULL
 570          *
 571          * @return text lines, each line is a paragraph
 572          */
 573         static protected List<String> toLines(Element element,
 574                         final ElementProcessor elementProcessor) {
 575                 final List<String> lines = new ArrayList<String>();
 576                 final StringBuilder currentLine = new StringBuilder();
 577                 final List<Integer> quoted = new ArrayList<Integer>();
 578                 final List<Node> ignoredNodes = new ArrayList<Node>();
 579                 final List<String> footnotes = new ArrayList<String>();
 580
 581                 if (element != null) {
 582                         new NodeTraversor(new NodeVisitor() {
 583                                 @Override
 584                                 public void head(Node node, int depth) {
 585                                         String manual = null;
 586                                         boolean ignore = elementProcessor.ignoreNode(node)
 587                                                         || ignoredNodes.contains(node.parentNode());
 588                                         // Manual processing
 589                                         if (!ignore) {
 590                                                 manual = elementProcessor.manualProcessing(node);
 591                                                 if (manual != null) {
 592                                                         currentLine.append(manual);
 593                                                         ignore = true;
 594                                                 }
 595                                         }
 596
 597                                         // Subtitle check
 598                                         if (!ignore) {
 599                                                 String subtitle = elementProcessor.isSubtitle(node);
 600                                                 if (subtitle != null) {
 601                                                         subtitle = subtitle.trim();
 602                                                         currentLine.append("\n[ " + subtitle + " ]\n");
 603                                                         ignore = true;
 604                                                 }
 605                                         }
 606
 607                                         // <pre> check
 608                                         if (!ignore) {
 609                                                 if (node instanceof Element) {
 610                                                         Element el = (Element) node;
 611                                                         if ("pre".equals(el.tagName())) {
 612                                                                 currentLine.append(StringUtils
 613                                                                                 .unhtml(el.text()).trim());
 614                                                                 ignore = true;
 615                                                         }
 616                                                 }
 617                                         }
 618
 619                                         if (ignore) {
 620                                                 ignoredNodes.add(node);
 621                                                 return;
 622                                         }
 623
 624                                         String prep = "";
 625                                         for (int i = 0; i < quoted.size(); i++) {
 626                                                 prep += ">";
 627                                         }
 628                                         prep += " ";
 629
 630                                         boolean enterQuote = elementProcessor.detectQuote(node);
 631                                         boolean leaveQuote = quoted.contains(depth);
 632
 633                                         if (enterQuote) {
 634                                                 quoted.add(depth);
 635                                         }
 636
 637                                         if (leaveQuote) {
 638                                                 quoted.remove(Integer.valueOf(depth));
 639                                         }
 640
 641                                         if (enterQuote || leaveQuote) {
 642                                                 if (currentLine.length() > 0) {
 643                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 644                                                                 currentLine.setLength(currentLine.length() - 1);
 645                                                         }
 646                                                         for (String l : currentLine.toString().split("\n")) {
 647                                                                 lines.add(prep + l);
 648                                                         }
 649                                                 }
 650                                                 currentLine.setLength(0);
 651                                         }
 652
 653                                         if (node instanceof Element) {
 654                                                 Element element = (Element) node;
 655                                                 boolean block = element.isBlock()
 656                                                                 || element.tagName().equalsIgnoreCase("br");
 657                                                 if (block && currentLine.length() > 0) {
 658                                                         currentLine.append("\n");
 659                                                 }
 660
 661                                                 if (!element.absUrl("href").trim().isEmpty()) {
 662                                                         footnotes.add(element.absUrl("href"));
 663                                                         currentLine.append("[" + footnotes.size() + "]");
 664                                                 }
 665                                         } else if (node instanceof TextNode) {
 666                                                 TextNode textNode = (TextNode) node;
 667                                                 String line = StringUtil.normaliseWhitespace(textNode
 668                                                                 .getWholeText());
 669
 670                                                 currentLine.append(elementProcessor.processText(line));
 671                                                 currentLine.append(" ");
 672                                         }
 673                                 }
 674
 675                                 @Override
 676                                 public void tail(Node node, int depth) {
 677                                 }
 678                         }).traverse(element);
 679                 }
 680
 681                 if (currentLine.length() > 0) {
 682                         String prep = "";
 683                         for (int i = 0; i < quoted.size(); i++) {
 684                                 prep += ">";
 685                         }
 686                         prep += " ";
 687                         if (currentLine.length() > 0) {
 688                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 689                                         currentLine.setLength(currentLine.length() - 1);
 690                                 }
 691                                 for (String l : currentLine.toString().split("\n")) {
 692                                         lines.add(prep + l);
 693                                 }
 694                         }
 695                 }
 696
 697                 // Fix spaces and nbsp, remove multiple following blank lines
 698                 List<String> linesCopy = new ArrayList<String>(lines.size());
 699                 long blanks = 0;
 700                 for (int i = 0; i < lines.size(); i++) {
 701                         String line = lines.get(i).replace(" ", " ") // nbsp -> space
 702                                         .replace("  ", " ").trim();
 703                         if (line.isEmpty()) {
 704                                 blanks++;
 705                         } else {
 706                                 blanks = 0;
 707                         }
 708
 709                         if (blanks < 2) {
 710                                 linesCopy.add(line);
 711                         }
 712                 }
 713
 714                 // Footnotes insertion
 715                 if (footnotes.size() > 0) {
 716                         linesCopy.add("");
 717                         linesCopy.add("");
 718                         linesCopy.add("");
 719                         linesCopy.add("");
 720                         for (int i = 0; i < footnotes.size(); i++) {
 721                                 linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
 722                         }
 723                 }
 724
 725                 return linesCopy;
 726         }
 727
 728         /**
 729          * Reformat the date if possible.
 730          *
 731          * @param date
 732          *            the input date
 733          *
 734          * @return the reformated date, or the same value if it was not parsable
 735          */
 736         static private String date(String date) {
 737                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 738
 739                 long epoch = 0;
 740                 try {
 741                         epoch = Long.parseLong(date.trim());
 742                 } catch (Exception e) {
 743                         epoch = 0;
 744                 }
 745
 746                 if (epoch > 0) {
 747                         return out.format(new Date(1000 * epoch));
 748                 }
 749
 750                 try {
 751                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 752                                         .parse(date.trim());
 753                         return out.format(dat);
 754                 } catch (ParseException e) {
 755                         return date;
 756                 }
 757         }
 758 }