BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.text.ParseException;
   7 import java.text.SimpleDateFormat;
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.Date;
  11 import java.util.List;
  12 import java.util.Map.Entry;
  13
  14 import org.jsoup.helper.DataUtil;
  15 import org.jsoup.helper.StringUtil;
  16 import org.jsoup.nodes.Document;
  17 import org.jsoup.nodes.Element;
  18 import org.jsoup.nodes.Node;
  19 import org.jsoup.nodes.TextNode;
  20 import org.jsoup.select.NodeTraversor;
  21 import org.jsoup.select.NodeVisitor;
  22
  23 import be.nikiroo.gofetch.data.Comment;
  24 import be.nikiroo.gofetch.data.Story;
  25 import be.nikiroo.utils.Downloader;
  26 import be.nikiroo.utils.StringUtils;
  27
  28 /**
  29  * Base class for website support.
  30  *
  31  * @author niki
  32  */
  33 public abstract class BasicSupport {
  34         /** The downloader to use for all websites. */
  35         protected static Downloader downloader = new Downloader("gofetcher");
  36
  37         static private String preselector;
  38
  39         private Type type;
  40
  41         /**
  42          * The website textual description, to add in the dispatcher page.
  43          * <p>
  44          * Should be short.
  45          *
  46          * @return the description
  47          */
  48         abstract public String getDescription();
  49
  50         /**
  51          * The gopher "selector" to use for output.
  52          * <p>
  53          * A kind of "URL path", like "/news/" or "/misc/news/" or...
  54          *
  55          * @return the selector
  56          */
  57         public String getSelector() {
  58                 return getSelector(type);
  59         }
  60
  61         /**
  62          * The support type.
  63          *
  64          * @return the type
  65          */
  66         public Type getType() {
  67                 return type;
  68         }
  69
  70         /**
  71          * List all the recent items, but only assure the ID and internal URL to
  72          * fetch it later on (until it has been fetched, the rest of the
  73          * {@link Story} is not confirmed).
  74          *
  75          * @return the list of new stories
  76          *
  77          * @throws IOException
  78          *             in case of I/O
  79          */
  80         public List<Story> list() throws IOException {
  81                 List<Story> list = new ArrayList<Story>();
  82
  83                 for (Entry<URL, String> entry : getUrls()) {
  84                         URL url = entry.getKey();
  85                         String defaultCateg = entry.getValue();
  86                         if (defaultCateg == null) {
  87                                 defaultCateg = "";
  88                         }
  89
  90                         InputStream in = downloader.open(url);
  91                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  92                         List<Element> articles = getArticles(doc);
  93                         for (Element article : articles) {
  94                                 String id = getArticleId(doc, article).trim();
  95                                 String title = getArticleTitle(doc, article).trim();
  96                                 String author = getArticleAuthor(doc, article).trim();
  97                                 String date = getArticleDate(doc, article).trim();
  98                                 String categ = getArticleCategory(doc, article, defaultCateg)
  99                                                 .trim();
 100                                 String details = getArticleDetails(doc, article).trim();
 101                                 String intUrl = getArticleIntUrl(doc, article).trim();
 102                                 String extUrl = getArticleExtUrl(doc, article).trim();
 103                                 String content = getArticleContent(doc, article).trim();
 104
 105                                 if (id.isEmpty() && date.isEmpty()) {
 106                                         continue;
 107                                 }
 108
 109                                 if (id.isEmpty()) {
 110                                         id = date.replace(":", "_").replace("+", "_");
 111                                 }
 112
 113                                 date = date(date);
 114
 115                                 list.add(new Story(getType(), id, title, author, date, categ,
 116                                                 details, intUrl, extUrl, content));
 117                         }
 118                 }
 119
 120                 return list;
 121         }
 122
 123         /**
 124          * The {@link URL}s to process for this website.
 125          *
 126          * @return the list of {@link URL}s
 127          *
 128          * @throws IOException
 129          *             in case of I/O error
 130          */
 131         abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 132
 133         /**
 134          * The article {@link Element}s of this document.
 135          *
 136          * @param doc
 137          *            the main document for the current category
 138          *
 139          * @return the articles
 140          */
 141         abstract protected List<Element> getArticles(Document doc);
 142
 143         /**
 144          * The ID of the article (defaults to the date element if empty).
 145          *
 146          * @param doc
 147          *            the main document for the current category
 148          * @param article
 149          *            the article to look into
 150          *
 151          * @return the ID
 152          */
 153         abstract protected String getArticleId(Document doc, Element article);
 154
 155         /**
 156          * The article title to display.
 157          *
 158          * @param doc
 159          *            the main document for the current category
 160          * @param article
 161          *            the article to look into
 162          *
 163          * @return the title
 164          */
 165         abstract protected String getArticleTitle(Document doc, Element article);
 166
 167         /**
 168          * The optional article author.
 169          *
 170          * @param doc
 171          *            the main document for the current category
 172          * @param article
 173          *            the article to look into
 174          *
 175          * @return the author
 176          */
 177         abstract protected String getArticleAuthor(Document doc, Element article);
 178
 179         /**
 180          * The optional article date.
 181          *
 182          * @param doc
 183          *            the main document for the current category
 184          * @param article
 185          *            the article to look into
 186          *
 187          * @return the date
 188          */
 189         abstract protected String getArticleDate(Document doc, Element article);
 190
 191         /**
 192          * the optional article category.
 193          *
 194          * @param doc
 195          *            the main document for the current category
 196          * @param article
 197          *            the article to look into
 198          * @param currentCategory
 199          *            the currently listed category if any (can be NULL)
 200          *
 201          * @return the category
 202          */
 203         abstract protected String getArticleCategory(Document doc, Element article,
 204                         String currentCategory);
 205
 206         /**
 207          * the optional details of the article (can replace the date, author and
 208          * category, for instance).
 209          *
 210          * @param doc
 211          *            the main document for the current category
 212          * @param article
 213          *            the article to look into
 214          *
 215          * @return the details
 216          */
 217         abstract protected String getArticleDetails(Document doc, Element article);
 218
 219         /**
 220          * The (required) {@link URL} that points to the news page on the supported
 221          * website.
 222          *
 223          * @param doc
 224          *            the main document for the current category
 225          * @param article
 226          *            the article to look into
 227          *
 228          * @return the internal {@link URL}
 229          */
 230         abstract protected String getArticleIntUrl(Document doc, Element article);
 231
 232         /**
 233          * the optional {@link URL} that points to an external website for more
 234          * information.
 235          *
 236          * @param doc
 237          *            the main document for the current category
 238          * @param article
 239          *            the article to look into
 240          *
 241          * @return the external {@link URL}
 242          */
 243         abstract protected String getArticleExtUrl(Document doc, Element article);
 244
 245         /**
 246          * The optional article short-content (not the full content, that will be
 247          * fetched by {@link BasicSupport#fetch(Story)}).
 248          *
 249          * @param doc
 250          *            the main document for the current category
 251          * @param article
 252          *            the article to look into
 253          *
 254          * @return the short content
 255          */
 256         abstract protected String getArticleContent(Document doc, Element article);
 257
 258         /**
 259          * Fetch the full article content as well as all the comments associated to
 260          * this {@link Story}, if any (can be empty, but not NULL).
 261          *
 262          * @param story
 263          *            the story to fetch the comments of
 264          *
 265          * @throws IOException
 266          *             in case of I/O error
 267          */
 268         public void fetch(Story story) throws IOException {
 269                 String fullContent = "";
 270
 271                 URL url = new URL(story.getUrlInternal());
 272                 InputStream in = downloader.open(url);
 273                 try {
 274                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 275                         Element article = getFullArticle(doc);
 276                         if (article != null) {
 277                                 StringBuilder builder = new StringBuilder();
 278                                 ElementProcessor eProc = getElementProcessorFullArticle();
 279                                 if (eProc != null) {
 280                                         for (String line : toLines(article, eProc)) {
 281                                                 builder.append(line + "\n");
 282                                         }
 283                                 } else {
 284                                         builder.append(article.text());
 285                                 }
 286
 287                                 // Content is too tight with a single break per line:
 288                                 fullContent = builder.toString().replace("\n", "\n\n") //
 289                                                 .replace("\n\n\n\n", "\n\n") //
 290                                                 .replace("\n\n\n\n", "\n\n") //
 291                                                 .trim();
 292                         }
 293
 294                         if (fullContent.isEmpty()) {
 295                                 fullContent = story.getContent();
 296                         }
 297
 298                         story.setFullContent(fullContent);
 299                         story.setComments(getComments(doc,
 300                                         getFullArticleCommentPosts(doc, url)));
 301                 } finally {
 302                         if (in != null) {
 303                                 in.close();
 304                         }
 305                 }
 306         }
 307
 308         /**
 309          * Return the full article if available.
 310          *
 311          * @param doc
 312          *            the (full article) document to work on
 313          *
 314          * @return the article or NULL
 315          */
 316         abstract protected Element getFullArticle(Document doc);
 317
 318         /**
 319          * Return the list of comment {@link Element}s from this optional container
 320          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 321          *
 322          * @param doc
 323          *            the (full article) document to work on
 324          * @param intUrl
 325          *            the internal {@link URL} this article wa taken from (the
 326          *            {@link URL} from the supported website)
 327          *
 328          * @return the list of comment posts
 329          */
 330         abstract protected List<Element> getFullArticleCommentPosts(Document doc,
 331                         URL intUrl);
 332
 333         /**
 334          * The {@link ElementProcessor} to use to convert the main article element
 335          * (see {@link BasicSupport#getFullArticle(Document)}) into text.
 336          * <p>
 337          * See {@link BasicElementProcessor} for a working, basic implementation.
 338          * <p>
 339          * Can be NULL to simply use {@link Element#text()}.
 340          *
 341          * @return the processor, or NULL
 342          */
 343         abstract protected ElementProcessor getElementProcessorFullArticle();
 344
 345         /**
 346          * Convert the comment elements into {@link Comment}s
 347          *
 348          * @param doc
 349          *            the document we work on
 350          * @param posts
 351          *            the comment elements
 352          *
 353          * @return the converted {@link Comment}s
 354          */
 355         private List<Comment> getComments(Document doc, List<Element> posts) {
 356                 List<Comment> comments = new ArrayList<Comment>();
 357                 if (posts != null) {
 358                         for (Element post : posts) {
 359                                 String id = getCommentId(post).trim();
 360                                 String author = getCommentAuthor(post).trim();
 361                                 String title = getCommentTitle(post).trim();
 362                                 String date = getCommentDate(post).trim();
 363
 364                                 List<String> content = new ArrayList<String>();
 365
 366                                 if (id.isEmpty()) {
 367                                         id = date;
 368                                 }
 369
 370                                 date = date(date);
 371
 372                                 Element contentE = getCommentContentElement(post);
 373                                 if (contentE != null) {
 374                                         ElementProcessor eProc = getElementProcessorComment();
 375                                         if (eProc != null) {
 376                                                 for (String line : toLines(contentE, eProc)) {
 377                                                         content.add(line);
 378                                                 }
 379                                         } else {
 380                                                 content = Arrays.asList(contentE.text().split("\n"));
 381                                         }
 382                                 }
 383
 384                                 Comment comment = new Comment(id, author, title, date, content);
 385                                 comment.addAll(getComments(doc,
 386                                                 getCommentCommentPosts(doc, post)));
 387
 388                                 if (!comment.isEmpty()) {
 389                                         comments.add(comment);
 390                                 }
 391                         }
 392                 }
 393
 394                 return comments;
 395         }
 396
 397         /**
 398          * Return the list of subcomment {@link Element}s from this comment element
 399          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 400          *
 401          * @param doc
 402          *            the (full article) document to work on
 403          * @param container
 404          *            the container (a comment {@link Element})
 405          *
 406          * @return the list of comment posts
 407          */
 408         abstract protected List<Element> getCommentCommentPosts(Document doc,
 409                         Element container);
 410
 411         /**
 412          * Compute the ID of the given comment element.
 413          *
 414          * @param post
 415          *            the comment element
 416          *
 417          * @return the ID
 418          */
 419         abstract protected String getCommentId(Element post);
 420
 421         /**
 422          * Compute the author of the given comment element.
 423          *
 424          * @param post
 425          *            the comment element
 426          *
 427          * @return the author
 428          */
 429         abstract protected String getCommentAuthor(Element post);
 430
 431         /**
 432          * Compute the title of the given comment element.
 433          *
 434          * @param post
 435          *            the comment element
 436          *
 437          * @return the title
 438          */
 439         abstract protected String getCommentTitle(Element post);
 440
 441         /**
 442          * Compute the date of the given comment element.
 443          *
 444          * @param post
 445          *            the comment element
 446          *
 447          * @return the date
 448          */
 449         abstract protected String getCommentDate(Element post);
 450
 451         /**
 452          * Get the main of the given comment element, which can be NULL.
 453          *
 454          * @param post
 455          *            the comment element
 456          *
 457          * @return the element
 458          */
 459         abstract protected Element getCommentContentElement(Element post);
 460
 461         /**
 462          * The {@link ElementProcessor} to use to convert the main comment element
 463          * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
 464          * <p>
 465          * See {@link BasicElementProcessor} for a working, basic implementation.
 466          * <p>
 467          * Can be NULL to simply use {@link Element#text()}.
 468          *
 469          * @return the processor
 470          */
 471         abstract protected ElementProcessor getElementProcessorComment();
 472
 473         /**
 474          * The support type.
 475          *
 476          * @param type
 477          *            the new type
 478          */
 479         protected void setType(Type type) {
 480                 this.type = type;
 481         }
 482
 483         /**
 484          * The {@link String} to append to the selector (the selector will be
 485          * constructed as "this string" then "/type/".
 486          *
 487          * @param preselector
 488          *            the preselector to set
 489          */
 490         static public void setPreselector(String preselector) {
 491                 BasicSupport.preselector = preselector;
 492         }
 493
 494         /**
 495          * Return a {@link BasicSupport} that is compatible with the given
 496          * {@link Type} if it exists (or NULL if not).
 497          *
 498          * @param type
 499          *            the type
 500          *
 501          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 502          */
 503         static public BasicSupport getSupport(Type type) {
 504                 BasicSupport support = null;
 505
 506                 if (type != null) {
 507                         switch (type) {
 508                         case SLASHDOT:
 509                                 support = new Slashdot();
 510                                 break;
 511                         case PIPEDOT:
 512                                 support = new Pipedot();
 513                                 break;
 514                         case LWN:
 515                                 support = new LWN();
 516                                 break;
 517                         case LEMONDE:
 518                                 support = new LeMonde();
 519                                 break;
 520                         case REGISTER:
 521                                 support = new TheRegister();
 522                                 break;
 523                         case TOO_LINUX:
 524                                 support = new TooLinux();
 525                                 break;
 526                         case ERE_NUMERIQUE:
 527                                 support = new EreNumerique();
 528                                 break;
 529                         case PHORONIX:
 530                                 support = new Phoronix();
 531                                 break;
 532                         }
 533
 534                         if (support != null) {
 535                                 support.setType(type);
 536                         }
 537                 }
 538
 539                 return support;
 540         }
 541
 542         /**
 543          * The gopher "selector" to use for output for this type, using the
 544          * preselector.
 545          * <p>
 546          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 547          *
 548          * @param type
 549          *            the type to get the selector of
 550          *
 551          * @return the selector
 552          */
 553         static public String getSelector(Type type) {
 554                 return preselector + "/" + type + "/";
 555         }
 556
 557         /**
 558          * Process the given element into text (each line is a text paragraph and
 559          * can be prepended with ">" signs to indicate a quote or sub-quote or
 560          * sub-sub-quote...).
 561          *
 562          * @param element
 563          *            the element to process
 564          * @param elementProcessor
 565          *            the element processor, must not be NULL
 566          *
 567          * @return text lines, each line is a paragraph
 568          */
 569         static protected List<String> toLines(Element element,
 570                         final ElementProcessor elementProcessor) {
 571                 final List<String> lines = new ArrayList<String>();
 572                 final StringBuilder currentLine = new StringBuilder();
 573                 final List<Integer> quoted = new ArrayList<Integer>();
 574                 final List<Node> ignoredNodes = new ArrayList<Node>();
 575                 final List<String> footnotes = new ArrayList<String>();
 576
 577                 if (element != null) {
 578                         new NodeTraversor(new NodeVisitor() {
 579                                 @Override
 580                                 public void head(Node node, int depth) {
 581                                         String manual = null;
 582                                         boolean ignore = elementProcessor.ignoreNode(node)
 583                                                         || ignoredNodes.contains(node.parentNode());
 584                                         // Manual processing
 585                                         if (!ignore) {
 586                                                 manual = elementProcessor.manualProcessing(node);
 587                                                 if (manual != null) {
 588                                                         currentLine.append(manual);
 589                                                         ignore = true;
 590                                                 }
 591                                         }
 592
 593                                         // Subtitle check
 594                                         if (!ignore) {
 595                                                 String subtitle = elementProcessor.isSubtitle(node);
 596                                                 if (subtitle != null) {
 597                                                         subtitle = subtitle.trim();
 598                                                         currentLine.append("\n[ " + subtitle + " ]\n");
 599                                                         ignore = true;
 600                                                 }
 601                                         }
 602
 603                                         // <pre> check
 604                                         if (!ignore) {
 605                                                 if (node instanceof Element) {
 606                                                         Element el = (Element) node;
 607                                                         if ("pre".equals(el.tagName())) {
 608                                                                 currentLine.append(StringUtils
 609                                                                                 .unhtml(el.text()).trim());
 610                                                                 ignore = true;
 611                                                         }
 612                                                 }
 613                                         }
 614
 615                                         if (ignore) {
 616                                                 ignoredNodes.add(node);
 617                                                 return;
 618                                         }
 619
 620                                         String prep = "";
 621                                         for (int i = 0; i < quoted.size(); i++) {
 622                                                 prep += ">";
 623                                         }
 624                                         prep += " ";
 625
 626                                         boolean enterQuote = elementProcessor.detectQuote(node);
 627                                         boolean leaveQuote = quoted.contains(depth);
 628
 629                                         if (enterQuote) {
 630                                                 quoted.add(depth);
 631                                         }
 632
 633                                         if (leaveQuote) {
 634                                                 quoted.remove(Integer.valueOf(depth));
 635                                         }
 636
 637                                         if (enterQuote || leaveQuote) {
 638                                                 if (currentLine.length() > 0) {
 639                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 640                                                                 currentLine.setLength(currentLine.length() - 1);
 641                                                         }
 642                                                         for (String l : currentLine.toString().split("\n")) {
 643                                                                 lines.add(prep + l);
 644                                                         }
 645                                                 }
 646                                                 currentLine.setLength(0);
 647                                         }
 648
 649                                         if (node instanceof Element) {
 650                                                 Element element = (Element) node;
 651                                                 boolean block = element.isBlock()
 652                                                                 || element.tagName().equalsIgnoreCase("br");
 653                                                 if (block && currentLine.length() > 0) {
 654                                                         currentLine.append("\n");
 655                                                 }
 656
 657                                                 if (!element.absUrl("href").trim().isEmpty()) {
 658                                                         footnotes.add(element.absUrl("href"));
 659                                                         currentLine.append("[" + footnotes.size() + "]");
 660                                                 }
 661                                         } else if (node instanceof TextNode) {
 662                                                 TextNode textNode = (TextNode) node;
 663                                                 String line = StringUtil.normaliseWhitespace(textNode
 664                                                                 .getWholeText());
 665
 666                                                 currentLine.append(elementProcessor.processText(line));
 667                                                 currentLine.append(" ");
 668                                         }
 669                                 }
 670
 671                                 @Override
 672                                 public void tail(Node node, int depth) {
 673                                 }
 674                         }).traverse(element);
 675                 }
 676
 677                 if (currentLine.length() > 0) {
 678                         String prep = "";
 679                         for (int i = 0; i < quoted.size(); i++) {
 680                                 prep += ">";
 681                         }
 682                         prep += " ";
 683                         if (currentLine.length() > 0) {
 684                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 685                                         currentLine.setLength(currentLine.length() - 1);
 686                                 }
 687                                 for (String l : currentLine.toString().split("\n")) {
 688                                         lines.add(prep + l);
 689                                 }
 690                         }
 691                 }
 692
 693                 // Fix spaces and nbsp, remove multiple following blank lines
 694                 List<String> linesCopy = new ArrayList<String>(lines.size());
 695                 long blanks = 0;
 696                 for (int i = 0; i < lines.size(); i++) {
 697                         String line = lines.get(i).replace(" ", " ") // nbsp -> space
 698                                         .replace("  ", " ").trim();
 699                         if (line.isEmpty()) {
 700                                 blanks++;
 701                         } else {
 702                                 blanks = 0;
 703                         }
 704
 705                         if (blanks < 2) {
 706                                 linesCopy.add(line);
 707                         }
 708                 }
 709
 710                 // Footnotes insertion
 711                 if (footnotes.size() > 0) {
 712                         linesCopy.add("");
 713                         linesCopy.add("");
 714                         linesCopy.add("");
 715                         linesCopy.add("");
 716                         for (int i = 0; i < footnotes.size(); i++) {
 717                                 linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
 718                         }
 719                 }
 720
 721                 return linesCopy;
 722         }
 723
 724         /**
 725          * Reformat the date if possible.
 726          *
 727          * @param date
 728          *            the input date
 729          *
 730          * @return the reformated date, or the same value if it was not parsable
 731          */
 732         static private String date(String date) {
 733                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 734
 735                 long epoch = 0;
 736                 try {
 737                         epoch = Long.parseLong(date.trim());
 738                 } catch (Exception e) {
 739                         epoch = 0;
 740                 }
 741
 742                 if (epoch > 0) {
 743                         return out.format(new Date(1000 * epoch));
 744                 }
 745
 746                 try {
 747                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 748                                         .parse(date.trim());
 749                         return out.format(dat);
 750                 } catch (ParseException e) {
 751                         return date;
 752                 }
 753         }
 754 }