src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.text.ParseException;
   7 import java.text.SimpleDateFormat;
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.Date;
  11 import java.util.HashMap;
  12 import java.util.List;
  13 import java.util.Map;
  14 import java.util.Map.Entry;
  15
  16 import org.jsoup.helper.DataUtil;
  17 import org.jsoup.helper.StringUtil;
  18 import org.jsoup.nodes.Document;
  19 import org.jsoup.nodes.Element;
  20 import org.jsoup.nodes.Node;
  21 import org.jsoup.nodes.TextNode;
  22 import org.jsoup.select.NodeTraversor;
  23 import org.jsoup.select.NodeVisitor;
  24
  25 import be.nikiroo.gofetch.data.Comment;
  26 import be.nikiroo.gofetch.data.Story;
  27 import be.nikiroo.utils.Downloader;
  28 import be.nikiroo.utils.StringUtils;
  29
  30 /**
  31  * Base class for website support.
  32  *
  33  * @author niki
  34  */
  35 public abstract class BasicSupport {
  36         /**
  37          * The downloader to use for all web sites via
  38          * {@link BasicSupport#open(URL)}
  39          */
  40         static private Downloader downloader = new Downloader("gofetcher");
  41
  42         static private String preselector;
  43
  44         /**
  45          * The optional cookies to use to get the site data.
  46          */
  47         private Map<String, String> cookies = new HashMap<String, String>();
  48
  49         private Type type;
  50
  51         /**
  52          * Login on the web site (this method does nothing by default, but can be
  53          * overridden if needed).
  54          *
  55          * @throws IOException
  56          *             in case of I/O error
  57          *
  58          */
  59         public void login() throws IOException {
  60         }
  61
  62         /**
  63          * The website textual description, to add in the dispatcher page.
  64          * <p>
  65          * Should be short.
  66          *
  67          * @return the description
  68          */
  69         abstract public String getDescription();
  70
  71         /**
  72          * The gopher "selector" to use for output.
  73          * <p>
  74          * A kind of "URL path", like "/news/" or "/misc/news/" or...
  75          *
  76          * @return the selector
  77          */
  78         public String getSelector() {
  79                 return getSelector(getType());
  80         }
  81
  82         /**
  83          * The support type.
  84          *
  85          * @return the type
  86          */
  87         public Type getType() {
  88                 return type;
  89         }
  90
  91         /**
  92          * List all the recent items, but only assure the ID and internal URL to
  93          * fetch it later on (until it has been fetched, the rest of the
  94          * {@link Story} is not confirmed).
  95          *
  96          * @return the list of new stories
  97          *
  98          * @throws IOException
  99          *             in case of I/O
 100          */
 101         public List<Story> list() throws IOException {
 102                 List<Story> list = new ArrayList<Story>();
 103
 104                 login();
 105                 for (Entry<URL, String> entry : getUrls()) {
 106                         URL url = entry.getKey();
 107                         String defaultCateg = entry.getValue();
 108                         if (defaultCateg == null) {
 109                                 defaultCateg = "";
 110                         }
 111
 112                         InputStream in = open(url);
 113                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 114                         List<Element> articles = getArticles(doc);
 115                         for (Element article : articles) {
 116                                 String id = getArticleId(doc, article).trim();
 117                                 String title = getArticleTitle(doc, article).trim();
 118                                 String author = getArticleAuthor(doc, article).trim();
 119                                 String date = getArticleDate(doc, article).trim();
 120                                 String categ = getArticleCategory(doc, article, defaultCateg)
 121                                                 .trim();
 122                                 String details = getArticleDetails(doc, article).trim();
 123                                 String intUrl = getArticleIntUrl(doc, article).trim();
 124                                 String extUrl = getArticleExtUrl(doc, article).trim();
 125                                 String content = getArticleContent(doc, article).trim();
 126
 127                                 if (id.isEmpty() && date.isEmpty()) {
 128                                         continue;
 129                                 }
 130
 131                                 if (!id.isEmpty()) {
 132                                         while (id.length() < 10) {
 133                                                 id = "0" + id;
 134                                         }
 135                                 } else {
 136                                         id = date.replace(":", "_").replace("+", "_");
 137                                 }
 138
 139                                 date = date(date);
 140
 141                                 list.add(new Story(getType(), id, title, author, date, categ,
 142                                                 details, intUrl, extUrl, content));
 143                         }
 144                 }
 145
 146                 return list;
 147         }
 148
 149         /**
 150          * The {@link URL}s to process for this website.
 151          *
 152          * @return the list of {@link URL}s
 153          *
 154          * @throws IOException
 155          *             in case of I/O error
 156          */
 157         abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 158
 159         /**
 160          * The article {@link Element}s of this document.
 161          *
 162          * @param doc
 163          *            the main document for the current category
 164          *
 165          * @return the articles
 166          */
 167         abstract protected List<Element> getArticles(Document doc);
 168
 169         /**
 170          * The ID of the article (defaults to the date element if empty).
 171          *
 172          * @param doc
 173          *            the main document for the current category
 174          * @param article
 175          *            the article to look into
 176          *
 177          * @return the ID
 178          */
 179         abstract protected String getArticleId(Document doc, Element article);
 180
 181         /**
 182          * The article title to display.
 183          *
 184          * @param doc
 185          *            the main document for the current category
 186          * @param article
 187          *            the article to look into
 188          *
 189          * @return the title
 190          */
 191         abstract protected String getArticleTitle(Document doc, Element article);
 192
 193         /**
 194          * The optional article author.
 195          *
 196          * @param doc
 197          *            the main document for the current category
 198          * @param article
 199          *            the article to look into
 200          *
 201          * @return the author
 202          */
 203         abstract protected String getArticleAuthor(Document doc, Element article);
 204
 205         /**
 206          * The optional article date.
 207          *
 208          * @param doc
 209          *            the main document for the current category
 210          * @param article
 211          *            the article to look into
 212          *
 213          * @return the date
 214          */
 215         abstract protected String getArticleDate(Document doc, Element article);
 216
 217         /**
 218          * the optional article category.
 219          *
 220          * @param doc
 221          *            the main document for the current category
 222          * @param article
 223          *            the article to look into
 224          * @param currentCategory
 225          *            the currently listed category if any (can be NULL)
 226          *
 227          * @return the category
 228          */
 229         abstract protected String getArticleCategory(Document doc, Element article,
 230                         String currentCategory);
 231
 232         /**
 233          * the optional details of the article (can replace the date, author and
 234          * category, for instance).
 235          *
 236          * @param doc
 237          *            the main document for the current category
 238          * @param article
 239          *            the article to look into
 240          *
 241          * @return the details
 242          */
 243         abstract protected String getArticleDetails(Document doc, Element article);
 244
 245         /**
 246          * The (required) {@link URL} that points to the news page on the supported
 247          * website.
 248          *
 249          * @param doc
 250          *            the main document for the current category
 251          * @param article
 252          *            the article to look into
 253          *
 254          * @return the internal {@link URL}
 255          */
 256         abstract protected String getArticleIntUrl(Document doc, Element article);
 257
 258         /**
 259          * the optional {@link URL} that points to an external website for more
 260          * information.
 261          *
 262          * @param doc
 263          *            the main document for the current category
 264          * @param article
 265          *            the article to look into
 266          *
 267          * @return the external {@link URL}
 268          */
 269         abstract protected String getArticleExtUrl(Document doc, Element article);
 270
 271         /**
 272          * The optional article short-content (not the full content, that will be
 273          * fetched by {@link BasicSupport#fetch(Story)}).
 274          *
 275          * @param doc
 276          *            the main document for the current category
 277          * @param article
 278          *            the article to look into
 279          *
 280          * @return the short content
 281          */
 282         abstract protected String getArticleContent(Document doc, Element article);
 283
 284         /**
 285          * Fetch the full article content as well as all the comments associated to
 286          * this {@link Story}, if any (can be empty, but not NULL).
 287          *
 288          * @param story
 289          *            the story to fetch the comments of
 290          *
 291          * @throws IOException
 292          *             in case of I/O error
 293          */
 294         public void fetch(Story story) throws IOException {
 295                 String fullContent = "";
 296
 297                 URL url = new URL(story.getUrlInternal());
 298                 InputStream in = open(url);
 299                 try {
 300                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 301                         Element article = getFullArticle(doc);
 302                         if (article != null) {
 303                                 StringBuilder builder = new StringBuilder();
 304                                 ElementProcessor eProc = getElementProcessorFullArticle();
 305                                 if (eProc != null) {
 306                                         for (String line : toLines(article, eProc)) {
 307                                                 builder.append(line + "\n");
 308                                         }
 309                                 } else {
 310                                         builder.append(article.text());
 311                                 }
 312
 313                                 // Content is too tight with a single break per line:
 314                                 fullContent = builder.toString().replace("\n", "\n\n") //
 315                                                 .replace("\n\n\n\n", "\n\n") //
 316                                                 .replace("\n\n\n\n", "\n\n") //
 317                                                 .trim();
 318                         }
 319
 320                         if (fullContent.isEmpty()) {
 321                                 fullContent = story.getContent();
 322                         }
 323
 324                         story.setFullContent(fullContent);
 325                         story.setComments(getComments(doc,
 326                                         getFullArticleCommentPosts(doc, url)));
 327                 } finally {
 328                         if (in != null) {
 329                                 in.close();
 330                         }
 331                 }
 332         }
 333
 334         /**
 335          * Return the full article if available (this is the article to retrieve
 336          * from the newly downloaded page at {@link Story#getUrlInternal()}).
 337          *
 338          * @param doc
 339          *            the (full article) document to work on
 340          *
 341          * @return the article or NULL
 342          */
 343         abstract protected Element getFullArticle(Document doc);
 344
 345         /**
 346          * Return the list of comment {@link Element}s from this optional container
 347          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 348          *
 349          * @param doc
 350          *            the (full article) document to work on
 351          * @param intUrl
 352          *            the internal {@link URL} this article wa taken from (the
 353          *            {@link URL} from the supported website)
 354          *
 355          * @return the list of comment posts
 356          */
 357         abstract protected List<Element> getFullArticleCommentPosts(Document doc,
 358                         URL intUrl);
 359
 360         /**
 361          * The {@link ElementProcessor} to use to convert the main article element
 362          * (see {@link BasicSupport#getFullArticle(Document)}) into text.
 363          * <p>
 364          * See {@link BasicElementProcessor} for a working, basic implementation.
 365          * <p>
 366          * Can be NULL to simply use {@link Element#text()}.
 367          *
 368          * @return the processor, or NULL
 369          */
 370         abstract protected ElementProcessor getElementProcessorFullArticle();
 371
 372         /**
 373          * Open a network resource.
 374          * <p>
 375          * You need to close the returned {@link InputStream} when done.
 376          *
 377          * @param url
 378          *            the source to open
 379          *
 380          * @return the content
 381          *
 382          * @throws IOException
 383          *             in case of I/O error
 384          */
 385         protected InputStream open(URL url) throws IOException {
 386                 return downloader.open(url, url, cookies, null, null, null);
 387         }
 388
 389         /**
 390          * Convert the comment elements into {@link Comment}s
 391          *
 392          * @param doc
 393          *            the document we work on
 394          * @param posts
 395          *            the comment elements
 396          *
 397          * @return the converted {@link Comment}s
 398          */
 399         private List<Comment> getComments(Document doc, List<Element> posts) {
 400                 List<Comment> comments = new ArrayList<Comment>();
 401                 if (posts != null) {
 402                         for (Element post : posts) {
 403                                 String id = getCommentId(post).trim();
 404                                 String author = getCommentAuthor(post).trim();
 405                                 String title = getCommentTitle(post).trim();
 406                                 String date = getCommentDate(post).trim();
 407
 408                                 List<String> content = new ArrayList<String>();
 409
 410                                 if (id.isEmpty()) {
 411                                         id = date;
 412                                 }
 413
 414                                 date = date(date);
 415
 416                                 Element contentE = getCommentContentElement(post);
 417                                 if (contentE != null) {
 418                                         ElementProcessor eProc = getElementProcessorComment();
 419                                         if (eProc != null) {
 420                                                 for (String line : toLines(contentE, eProc)) {
 421                                                         content.add(line);
 422                                                 }
 423                                         } else {
 424                                                 content = Arrays.asList(contentE.text().split("\n"));
 425                                         }
 426                                 }
 427
 428                                 Comment comment = new Comment(id, author, title, date, content);
 429                                 comment.addAll(getComments(doc,
 430                                                 getCommentCommentPosts(doc, post)));
 431
 432                                 if (!comment.isEmpty()) {
 433                                         comments.add(comment);
 434                                 }
 435                         }
 436                 }
 437
 438                 return comments;
 439         }
 440
 441         /**
 442          * Return the list of subcomment {@link Element}s from this comment element
 443          * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
 444          *
 445          * @param doc
 446          *            the (full article) document to work on
 447          * @param container
 448          *            the container (a comment {@link Element})
 449          *
 450          * @return the list of comment posts
 451          */
 452         abstract protected List<Element> getCommentCommentPosts(Document doc,
 453                         Element container);
 454
 455         /**
 456          * Compute the ID of the given comment element.
 457          *
 458          * @param post
 459          *            the comment element
 460          *
 461          * @return the ID
 462          */
 463         abstract protected String getCommentId(Element post);
 464
 465         /**
 466          * Compute the author of the given comment element.
 467          *
 468          * @param post
 469          *            the comment element
 470          *
 471          * @return the author
 472          */
 473         abstract protected String getCommentAuthor(Element post);
 474
 475         /**
 476          * Compute the title of the given comment element.
 477          *
 478          * @param post
 479          *            the comment element
 480          *
 481          * @return the title
 482          */
 483         abstract protected String getCommentTitle(Element post);
 484
 485         /**
 486          * Compute the date of the given comment element.
 487          *
 488          * @param post
 489          *            the comment element
 490          *
 491          * @return the date
 492          */
 493         abstract protected String getCommentDate(Element post);
 494
 495         /**
 496          * Get the main of the given comment element, which can be NULL.
 497          *
 498          * @param post
 499          *            the comment element
 500          *
 501          * @return the element
 502          */
 503         abstract protected Element getCommentContentElement(Element post);
 504
 505         /**
 506          * The {@link ElementProcessor} to use to convert the main comment element
 507          * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
 508          * <p>
 509          * See {@link BasicElementProcessor} for a working, basic implementation.
 510          * <p>
 511          * Can be NULL to simply use {@link Element#text()}.
 512          *
 513          * @return the processor
 514          */
 515         abstract protected ElementProcessor getElementProcessorComment();
 516
 517         /**
 518          * The support type.
 519          *
 520          * @param type
 521          *            the new type
 522          */
 523         protected void setType(Type type) {
 524                 this.type = type;
 525         }
 526
 527         /**
 528          * Add a cookie for all site connections.
 529          *
 530          * @param name
 531          *            the cookie name
 532          * @param value
 533          *            the value
 534          */
 535         protected void addCookie(String name, String value) {
 536                 cookies.put(name, value);
 537         }
 538
 539         /**
 540          * The {@link String} to append to the selector (the selector will be
 541          * constructed as "this string" then "/type/".
 542          *
 543          * @param preselector
 544          *            the preselector to set
 545          */
 546         static public void setPreselector(String preselector) {
 547                 BasicSupport.preselector = preselector;
 548         }
 549
 550         /**
 551          * Return a {@link BasicSupport} that is compatible with the given
 552          * {@link Type} if it exists (or NULL if not).
 553          *
 554          * @param type
 555          *            the type
 556          *
 557          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 558          */
 559         static public BasicSupport getSupport(Type type) {
 560                 BasicSupport support = null;
 561
 562                 if (type != null) {
 563                         switch (type) {
 564                         case SLASHDOT:
 565                                 support = new Slashdot();
 566                                 break;
 567                         case PIPEDOT:
 568                                 support = new Pipedot();
 569                                 break;
 570                         case LWN:
 571                                 support = new LWN();
 572                                 break;
 573                         case LEMONDE:
 574                                 support = new LeMonde();
 575                                 break;
 576                         case REGISTER:
 577                                 support = new TheRegister();
 578                                 break;
 579                         case TOO_LINUX:
 580                                 support = new TooLinux();
 581                                 break;
 582                         case ERE_NUMERIQUE:
 583                                 support = new EreNumerique();
 584                                 break;
 585                         case PHORONIX:
 586                                 support = new Phoronix();
 587                                 break;
 588                         case SEPT_SUR_SEPT:
 589                                 support = new SeptSurSept();
 590                                 break;
 591                         }
 592
 593                         if (support != null) {
 594                                 support.setType(type);
 595                         }
 596                 }
 597
 598                 return support;
 599         }
 600
 601         /**
 602          * The gopher "selector" to use for output for this type, using the
 603          * preselector.
 604          * <p>
 605          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 606          *
 607          * @param type
 608          *            the type to get the selector of
 609          *
 610          * @return the selector
 611          */
 612         static public String getSelector(Type type) {
 613                 return preselector + "/" + type + "/";
 614         }
 615
 616         /**
 617          * Process the given element into text (each line is a text paragraph and
 618          * can be prepended with ">" signs to indicate a quote or sub-quote or
 619          * sub-sub-quote...).
 620          *
 621          * @param element
 622          *            the element to process
 623          * @param elementProcessor
 624          *            the element processor, must not be NULL
 625          *
 626          * @return text lines, each line is a paragraph
 627          */
 628         static protected List<String> toLines(Element element,
 629                         final ElementProcessor elementProcessor) {
 630                 final List<String> lines = new ArrayList<String>();
 631                 final StringBuilder currentLine = new StringBuilder();
 632                 final List<Integer> quoted = new ArrayList<Integer>();
 633                 final List<Node> ignoredNodes = new ArrayList<Node>();
 634                 final List<String> footnotes = new ArrayList<String>();
 635
 636                 if (element != null) {
 637                         new NodeTraversor(new NodeVisitor() {
 638                                 @Override
 639                                 public void head(Node node, int depth) {
 640                                         String manual = null;
 641                                         boolean ignore = elementProcessor.ignoreNode(node)
 642                                                         || ignoredNodes.contains(node.parentNode());
 643                                         // Manual processing
 644                                         if (!ignore) {
 645                                                 manual = elementProcessor.manualProcessing(node);
 646                                                 if (manual != null) {
 647                                                         currentLine.append(manual);
 648                                                         ignore = true;
 649                                                 }
 650                                         }
 651
 652                                         // Subtitle check
 653                                         if (!ignore) {
 654                                                 String subtitle = elementProcessor.isSubtitle(node);
 655                                                 if (subtitle != null) {
 656                                                         subtitle = subtitle.trim();
 657                                                         currentLine.append("\n[ " + subtitle + " ]\n");
 658                                                         ignore = true;
 659                                                 }
 660                                         }
 661
 662                                         // <pre> check
 663                                         if (!ignore) {
 664                                                 if (node instanceof Element) {
 665                                                         Element el = (Element) node;
 666                                                         if ("pre".equals(el.tagName())) {
 667                                                                 currentLine.append(StringUtils
 668                                                                                 .unhtml(el.text()).trim());
 669                                                                 ignore = true;
 670                                                         }
 671                                                 }
 672                                         }
 673
 674                                         if (ignore) {
 675                                                 ignoredNodes.add(node);
 676                                                 return;
 677                                         }
 678
 679                                         String prep = "";
 680                                         for (int i = 0; i < quoted.size(); i++) {
 681                                                 prep += ">";
 682                                         }
 683                                         prep += " ";
 684
 685                                         boolean enterQuote = elementProcessor.detectQuote(node);
 686                                         boolean leaveQuote = quoted.contains(depth);
 687
 688                                         if (enterQuote) {
 689                                                 quoted.add(depth);
 690                                         }
 691
 692                                         if (leaveQuote) {
 693                                                 quoted.remove(Integer.valueOf(depth));
 694                                         }
 695
 696                                         if (enterQuote || leaveQuote) {
 697                                                 if (currentLine.length() > 0) {
 698                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 699                                                                 currentLine.setLength(currentLine.length() - 1);
 700                                                         }
 701                                                         for (String l : currentLine.toString().split("\n")) {
 702                                                                 lines.add(prep + l);
 703                                                         }
 704                                                 }
 705                                                 currentLine.setLength(0);
 706                                         }
 707
 708                                         if (node instanceof Element) {
 709                                                 Element element = (Element) node;
 710                                                 boolean block = element.isBlock()
 711                                                                 || element.tagName().equalsIgnoreCase("br");
 712                                                 if (block && currentLine.length() > 0) {
 713                                                         currentLine.append("\n");
 714                                                 }
 715
 716                                                 if (!element.absUrl("href").trim().isEmpty()) {
 717                                                         footnotes.add(element.absUrl("href"));
 718                                                         currentLine.append("[" + footnotes.size() + "]");
 719                                                 }
 720                                         } else if (node instanceof TextNode) {
 721                                                 TextNode textNode = (TextNode) node;
 722                                                 String line = StringUtil.normaliseWhitespace(textNode
 723                                                                 .getWholeText());
 724
 725                                                 currentLine.append(elementProcessor.processText(line));
 726                                                 currentLine.append(" ");
 727                                         }
 728                                 }
 729
 730                                 @Override
 731                                 public void tail(Node node, int depth) {
 732                                 }
 733                         }).traverse(element);
 734                 }
 735
 736                 if (currentLine.length() > 0) {
 737                         String prep = "";
 738                         for (int i = 0; i < quoted.size(); i++) {
 739                                 prep += ">";
 740                         }
 741                         prep += " ";
 742                         if (currentLine.length() > 0) {
 743                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 744                                         currentLine.setLength(currentLine.length() - 1);
 745                                 }
 746                                 for (String l : currentLine.toString().split("\n")) {
 747                                         lines.add(prep + l);
 748                                 }
 749                         }
 750                 }
 751
 752                 // Fix spaces and nbsp, remove multiple following blank lines
 753                 List<String> linesCopy = new ArrayList<String>(lines.size());
 754                 long blanks = 0;
 755                 for (int i = 0; i < lines.size(); i++) {
 756                         String line = lines.get(i).replace(" ", " ") // nbsp -> space
 757                                         .replace("  ", " ").trim();
 758                         if (line.isEmpty()) {
 759                                 blanks++;
 760                         } else {
 761                                 blanks = 0;
 762                         }
 763
 764                         if (blanks < 2) {
 765                                 linesCopy.add(line);
 766                         }
 767                 }
 768
 769                 // Footnotes insertion
 770                 if (footnotes.size() > 0) {
 771                         linesCopy.add("");
 772                         linesCopy.add("");
 773                         linesCopy.add("");
 774                         linesCopy.add("");
 775                         for (int i = 0; i < footnotes.size(); i++) {
 776                                 linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
 777                         }
 778                 }
 779
 780                 return linesCopy;
 781         }
 782
 783         /**
 784          * Reformat the date if possible.
 785          *
 786          * @param date
 787          *            the input date
 788          *
 789          * @return the reformated date, or the same value if it was not parsable
 790          */
 791         static private String date(String date) {
 792                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 793
 794                 long epoch = 0;
 795                 try {
 796                         epoch = Long.parseLong(date.trim());
 797                 } catch (Exception e) {
 798                         epoch = 0;
 799                 }
 800
 801                 if (epoch > 0) {
 802                         return out.format(new Date(1000 * epoch));
 803                 }
 804
 805                 try {
 806                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 807                                         .parse(date.trim());
 808                         return out.format(dat);
 809                 } catch (ParseException e) {
 810                         return date;
 811                 }
 812         }
 813 }