src/be/nikiroo/gofetch/support/BasicSupport.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.text.ParseException;
   7 import java.text.SimpleDateFormat;
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.Date;
  11 import java.util.List;
  12 import java.util.Map.Entry;
  13
  14 import org.jsoup.helper.DataUtil;
  15 import org.jsoup.helper.StringUtil;
  16 import org.jsoup.nodes.Document;
  17 import org.jsoup.nodes.Element;
  18 import org.jsoup.nodes.Node;
  19 import org.jsoup.nodes.TextNode;
  20 import org.jsoup.select.NodeTraversor;
  21 import org.jsoup.select.NodeVisitor;
  22
  23 import be.nikiroo.gofetch.data.Comment;
  24 import be.nikiroo.gofetch.data.Story;
  25 import be.nikiroo.utils.Downloader;
  26 import be.nikiroo.utils.StringUtils;
  27
  28 /**
  29  * Base class for website support.
  30  *
  31  * @author niki
  32  */
  33 public abstract class BasicSupport {
        /**
         * The downloader to use for all websites via {@link BasicSupport#open(URL)}
         */
        static private Downloader downloader = new Downloader("gofetcher");

        /**
         * The prefix put in front of every selector (see
         * {@link BasicSupport#getSelector(Type)}).
         * <p>
         * Shared by all the supports; it has no initial value and stays NULL
         * until {@link BasicSupport#setPreselector(String)} is called.
         */
        static private String preselector;

        /**
         * The type of this support, as set by {@link BasicSupport#setType(Type)}.
         */
        private Type type;
  42
        /**
         * The textual description of the website, to add in the dispatcher page.
         * <p>
         * Should be short.
         *
         * @return the description
         */
        abstract public String getDescription();
  51
  52         /**
  53          * The gopher "selector" to use for output.
  54          * <p>
  55          * A kind of "URL path", like "/news/" or "/misc/news/" or...
  56          *
  57          * @return the selector
  58          */
  59         public String getSelector() {
  60                 return getSelector(getType());
  61         }
  62
  63         /**
  64          * The support type.
  65          *
  66          * @return the type
  67          */
  68         public Type getType() {
  69                 return type;
  70         }
  71
  72         /**
  73          * List all the recent items, but only assure the ID and internal URL to
  74          * fetch it later on (until it has been fetched, the rest of the
  75          * {@link Story} is not confirmed).
  76          *
  77          * @return the list of new stories
  78          *
  79          * @throws IOException
  80          *             in case of I/O
  81          */
  82         public List<Story> list() throws IOException {
  83                 List<Story> list = new ArrayList<Story>();
  84
  85                 for (Entry<URL, String> entry : getUrls()) {
  86                         URL url = entry.getKey();
  87                         String defaultCateg = entry.getValue();
  88                         if (defaultCateg == null) {
  89                                 defaultCateg = "";
  90                         }
  91
  92                         InputStream in = open(url);
  93                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  94                         List<Element> articles = getArticles(doc);
  95                         for (Element article : articles) {
  96                                 String id = getArticleId(doc, article).trim();
  97                                 String title = getArticleTitle(doc, article).trim();
  98                                 String author = getArticleAuthor(doc, article).trim();
  99                                 String date = getArticleDate(doc, article).trim();
 100                                 String categ = getArticleCategory(doc, article, defaultCateg)
 101                                                 .trim();
 102                                 String details = getArticleDetails(doc, article).trim();
 103                                 String intUrl = getArticleIntUrl(doc, article).trim();
 104                                 String extUrl = getArticleExtUrl(doc, article).trim();
 105                                 String content = getArticleContent(doc, article).trim();
 106
 107                                 if (id.isEmpty() && date.isEmpty()) {
 108                                         continue;
 109                                 }
 110
 111                                 if (!id.isEmpty()) {
 112                                         while (id.length() < 10) {
 113                                                 id = "0" + id;
 114                                         }
 115                                 } else {
 116                                         id = date.replace(":", "_").replace("+", "_");
 117                                 }
 118
 119                                 date = date(date);
 120
 121                                 list.add(new Story(getType(), id, title, author, date, categ,
 122                                                 details, intUrl, extUrl, content));
 123                         }
 124                 }
 125
 126                 return list;
 127         }
 128
        /**
         * The {@link URL}s to process for this website.
         * <p>
         * {@link BasicSupport#list()} downloads each key and uses the associated
         * value as the default category of the articles found there (a NULL
         * value is handled as an empty category).
         *
         * @return the list of {@link URL}s, each with its default category
         *
         * @throws IOException
         *             in case of I/O error
         */
        abstract protected List<Entry<URL, String>> getUrls() throws IOException;
 138
        /**
         * The article {@link Element}s of this document.
         * <p>
         * Each returned {@link Element} is then passed, with the document, to the
         * <tt>getArticle*</tt> methods by {@link BasicSupport#list()}.
         *
         * @param doc
         *            the main document for the current category
         *
         * @return the articles (must not be NULL, it is iterated as is)
         */
        abstract protected List<Element> getArticles(Document doc);
 148
        /**
         * The ID of the article (defaults to the date element if empty).
         * <p>
         * In {@link BasicSupport#list()}, a non-empty ID is left-padded with
         * zeros up to 10 characters, and an article that has neither an ID nor a
         * date is skipped.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the ID (can be empty, but must not be NULL: the result is
         *         trimmed)
         */
        abstract protected String getArticleId(Document doc, Element article);
 160
        /**
         * The article title to display.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the title (must not be NULL: the result is trimmed by
         *         {@link BasicSupport#list()})
         */
        abstract protected String getArticleTitle(Document doc, Element article);
 172
        /**
         * The optional article author.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the author (return an empty {@link String} if there is none,
         *         not NULL: the result is trimmed by {@link BasicSupport#list()})
         */
        abstract protected String getArticleAuthor(Document doc, Element article);
 184
        /**
         * The optional article date.
         * <p>
         * {@link BasicSupport#list()} uses the raw value as a fallback ID when
         * the article has no ID, then tries to reformat it (a number is read as
         * seconds since the epoch, otherwise the
         * <tt>yyyy-MM-dd'T'HH:mm:ssXXX</tt> format is tried); a value that cannot
         * be parsed is kept as is.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the date (return an empty {@link String} if there is none,
         *         not NULL: the result is trimmed)
         */
        abstract protected String getArticleDate(Document doc, Element article);
 196
        /**
         * The optional article category.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         * @param currentCategory
         *            the currently listed category if any (note that
         *            {@link BasicSupport#list()} passes an empty {@link String}
         *            instead of NULL when there is none)
         *
         * @return the category (return an empty {@link String} if there is none,
         *         not NULL: the result is trimmed by {@link BasicSupport#list()})
         */
        abstract protected String getArticleCategory(Document doc, Element article,
                        String currentCategory);
 211
        /**
         * The optional details of the article (can replace the date, author and
         * category, for instance).
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the details (return an empty {@link String} if there are none,
         *         not NULL: the result is trimmed by {@link BasicSupport#list()})
         */
        abstract protected String getArticleDetails(Document doc, Element article);
 224
        /**
         * The (required) {@link URL} that points to the news page on the supported
         * website.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the internal {@link URL}, in textual form (must not be NULL:
         *         the result is trimmed by {@link BasicSupport#list()})
         */
        abstract protected String getArticleIntUrl(Document doc, Element article);
 237
        /**
         * The optional {@link URL} that points to an external website for more
         * information.
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the external {@link URL}, in textual form (return an empty
         *         {@link String} if there is none, not NULL: the result is
         *         trimmed by {@link BasicSupport#list()})
         */
        abstract protected String getArticleExtUrl(Document doc, Element article);
 250
        /**
         * The optional article short-content (not the full content, that will be
         * fetched by {@link BasicSupport#fetch(Story)}).
         *
         * @param doc
         *            the main document for the current category
         * @param article
         *            the article to look into
         *
         * @return the short content (return an empty {@link String} if there is
         *         none, not NULL: the result is trimmed by
         *         {@link BasicSupport#list()})
         */
        abstract protected String getArticleContent(Document doc, Element article);
 263
 264         /**
 265          * Fetch the full article content as well as all the comments associated to
 266          * this {@link Story}, if any (can be empty, but not NULL).
 267          *
 268          * @param story
 269          *            the story to fetch the comments of
 270          *
 271          * @throws IOException
 272          *             in case of I/O error
 273          */
 274         public void fetch(Story story) throws IOException {
 275                 String fullContent = "";
 276
 277                 URL url = new URL(story.getUrlInternal());
 278                 InputStream in = open(url);
 279                 try {
 280                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 281                         Element article = getFullArticle(doc);
 282                         if (article != null) {
 283                                 StringBuilder builder = new StringBuilder();
 284                                 ElementProcessor eProc = getElementProcessorFullArticle();
 285                                 if (eProc != null) {
 286                                         for (String line : toLines(article, eProc)) {
 287                                                 builder.append(line + "\n");
 288                                         }
 289                                 } else {
 290                                         builder.append(article.text());
 291                                 }
 292
 293                                 // Content is too tight with a single break per line:
 294                                 fullContent = builder.toString().replace("\n", "\n\n") //
 295                                                 .replace("\n\n\n\n", "\n\n") //
 296                                                 .replace("\n\n\n\n", "\n\n") //
 297                                                 .trim();
 298                         }
 299
 300                         if (fullContent.isEmpty()) {
 301                                 fullContent = story.getContent();
 302                         }
 303
 304                         story.setFullContent(fullContent);
 305                         story.setComments(getComments(doc,
 306                                         getFullArticleCommentPosts(doc, url)));
 307                 } finally {
 308                         if (in != null) {
 309                                 in.close();
 310                         }
 311                 }
 312         }
 313
        /**
         * Return the full article if available.
         * <p>
         * When NULL is returned, {@link BasicSupport#fetch(Story)} uses the short
         * content of the {@link Story} as full content.
         *
         * @param doc
         *            the (full article) document to work on
         *
         * @return the article or NULL
         */
        abstract protected Element getFullArticle(Document doc);
 323
        /**
         * Return the list of comment {@link Element}s from this optional container
         * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
         *
         * @param doc
         *            the (full article) document to work on
         * @param intUrl
         *            the internal {@link URL} this article was taken from (the
         *            {@link URL} from the supported website)
         *
         * @return the list of comment posts (a NULL list is handled as "no
         *         comments")
         */
        abstract protected List<Element> getFullArticleCommentPosts(Document doc,
                        URL intUrl);
 338
        /**
         * The {@link ElementProcessor} to use to convert the main article element
         * (see {@link BasicSupport#getFullArticle(Document)}) into text.
         * <p>
         * See {@link BasicElementProcessor} for a working, basic implementation.
         * <p>
         * Can be NULL to simply use {@link Element#text()}.
         * <p>
         * It is only requested by {@link BasicSupport#fetch(Story)} when a full
         * article was found.
         *
         * @return the processor, or NULL
         */
        abstract protected ElementProcessor getElementProcessorFullArticle();
 350
 351         /**
 352          * Open a network resource.
 353          * <p>
 354          * You need to close the returned {@link InputStream} when done.
 355          *
 356          * @param url
 357          *            the source to open
 358          *
 359          * @return the content
 360          *
 361          * @throws IOException
 362          *             in case of I/O error
 363          */
 364         protected InputStream open(URL url) throws IOException {
 365                 return downloader.open(url);
 366         }
 367
 368         /**
 369          * Convert the comment elements into {@link Comment}s
 370          *
 371          * @param doc
 372          *            the document we work on
 373          * @param posts
 374          *            the comment elements
 375          *
 376          * @return the converted {@link Comment}s
 377          */
 378         private List<Comment> getComments(Document doc, List<Element> posts) {
 379                 List<Comment> comments = new ArrayList<Comment>();
 380                 if (posts != null) {
 381                         for (Element post : posts) {
 382                                 String id = getCommentId(post).trim();
 383                                 String author = getCommentAuthor(post).trim();
 384                                 String title = getCommentTitle(post).trim();
 385                                 String date = getCommentDate(post).trim();
 386
 387                                 List<String> content = new ArrayList<String>();
 388
 389                                 if (id.isEmpty()) {
 390                                         id = date;
 391                                 }
 392
 393                                 date = date(date);
 394
 395                                 Element contentE = getCommentContentElement(post);
 396                                 if (contentE != null) {
 397                                         ElementProcessor eProc = getElementProcessorComment();
 398                                         if (eProc != null) {
 399                                                 for (String line : toLines(contentE, eProc)) {
 400                                                         content.add(line);
 401                                                 }
 402                                         } else {
 403                                                 content = Arrays.asList(contentE.text().split("\n"));
 404                                         }
 405                                 }
 406
 407                                 Comment comment = new Comment(id, author, title, date, content);
 408                                 comment.addAll(getComments(doc,
 409                                                 getCommentCommentPosts(doc, post)));
 410
 411                                 if (!comment.isEmpty()) {
 412                                         comments.add(comment);
 413                                 }
 414                         }
 415                 }
 416
 417                 return comments;
 418         }
 419
        /**
         * Return the list of subcomment {@link Element}s from this comment element
         * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
         * <p>
         * It is called for every converted comment, so the whole comment tree is
         * processed recursively.
         *
         * @param doc
         *            the (full article) document to work on
         * @param container
         *            the container (a comment {@link Element})
         *
         * @return the list of comment posts (a NULL list is handled as "no
         *         sub-comments")
         */
        abstract protected List<Element> getCommentCommentPosts(Document doc,
                        Element container);
 433
        /**
         * Compute the ID of the given comment element.
         * <p>
         * If the ID is empty, the (raw) date of the comment is used instead.
         *
         * @param post
         *            the comment element
         *
         * @return the ID (can be empty, but must not be NULL: the result is
         *         trimmed)
         */
        abstract protected String getCommentId(Element post);
 443
        /**
         * Compute the author of the given comment element.
         *
         * @param post
         *            the comment element
         *
         * @return the author (can be empty, but must not be NULL: the result is
         *         trimmed)
         */
        abstract protected String getCommentAuthor(Element post);
 453
        /**
         * Compute the title of the given comment element.
         *
         * @param post
         *            the comment element
         *
         * @return the title (can be empty, but must not be NULL: the result is
         *         trimmed)
         */
        abstract protected String getCommentTitle(Element post);
 463
        /**
         * Compute the date of the given comment element.
         * <p>
         * The raw value is used as ID when the comment has no ID, then it is
         * reformatted when it can be parsed (it is kept as is otherwise).
         *
         * @param post
         *            the comment element
         *
         * @return the date (can be empty, but must not be NULL: the result is
         *         trimmed)
         */
        abstract protected String getCommentDate(Element post);
 473
        /**
         * Get the main content element of the given comment element, which can be
         * NULL (the comment then has no content lines).
         *
         * @param post
         *            the comment element
         *
         * @return the element, or NULL
         */
        abstract protected Element getCommentContentElement(Element post);
 483
        /**
         * The {@link ElementProcessor} to use to convert the main comment element
         * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
         * <p>
         * See {@link BasicElementProcessor} for a working, basic implementation.
         * <p>
         * Can be NULL to simply use {@link Element#text()} (split on line
         * breaks).
         *
         * @return the processor, or NULL
         */
        abstract protected ElementProcessor getElementProcessorComment();
 495
        /**
         * Set the support type (done by {@link BasicSupport#getSupport(Type)} on
         * the instances it creates).
         *
         * @param type
         *            the new type
         */
        protected void setType(Type type) {
                this.type = type;
        }
 505
        /**
         * The {@link String} to prepend to the selector (the selector will be
         * constructed as "this string" then "/type/").
         * <p>
         * This value is static: it is shared by all the supports.
         *
         * @param preselector
         *            the preselector to set
         */
        static public void setPreselector(String preselector) {
                BasicSupport.preselector = preselector;
        }
 516
 517         /**
 518          * Return a {@link BasicSupport} that is compatible with the given
 519          * {@link Type} if it exists (or NULL if not).
 520          *
 521          * @param type
 522          *            the type
 523          *
 524          * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
 525          */
 526         static public BasicSupport getSupport(Type type) {
 527                 BasicSupport support = null;
 528
 529                 if (type != null) {
 530                         switch (type) {
 531                         case SLASHDOT:
 532                                 support = new Slashdot();
 533                                 break;
 534                         case PIPEDOT:
 535                                 support = new Pipedot();
 536                                 break;
 537                         case LWN:
 538                                 support = new LWN();
 539                                 break;
 540                         case LEMONDE:
 541                                 support = new LeMonde();
 542                                 break;
 543                         case REGISTER:
 544                                 support = new TheRegister();
 545                                 break;
 546                         case TOO_LINUX:
 547                                 support = new TooLinux();
 548                                 break;
 549                         case ERE_NUMERIQUE:
 550                                 support = new EreNumerique();
 551                                 break;
 552                         case PHORONIX:
 553                                 support = new Phoronix();
 554                                 break;
 555                         }
 556
 557                         if (support != null) {
 558                                 support.setType(type);
 559                         }
 560                 }
 561
 562                 return support;
 563         }
 564
 565         /**
 566          * The gopher "selector" to use for output for this type, using the
 567          * preselector.
 568          * <p>
 569          * A kind of "URL path", like "/news/" or "/misc/news/" or...
 570          *
 571          * @param type
 572          *            the type to get the selector of
 573          *
 574          * @return the selector
 575          */
 576         static public String getSelector(Type type) {
 577                 return preselector + "/" + type + "/";
 578         }
 579
 580         /**
 581          * Process the given element into text (each line is a text paragraph and
 582          * can be prepended with ">" signs to indicate a quote or sub-quote or
 583          * sub-sub-quote...).
 584          *
 585          * @param element
 586          *            the element to process
 587          * @param elementProcessor
 588          *            the element processor, must not be NULL
 589          *
 590          * @return text lines, each line is a paragraph
 591          */
 592         static protected List<String> toLines(Element element,
 593                         final ElementProcessor elementProcessor) {
 594                 final List<String> lines = new ArrayList<String>();
 595                 final StringBuilder currentLine = new StringBuilder();
 596                 final List<Integer> quoted = new ArrayList<Integer>();
 597                 final List<Node> ignoredNodes = new ArrayList<Node>();
 598                 final List<String> footnotes = new ArrayList<String>();
 599
 600                 if (element != null) {
 601                         new NodeTraversor(new NodeVisitor() {
 602                                 @Override
 603                                 public void head(Node node, int depth) {
 604                                         String manual = null;
 605                                         boolean ignore = elementProcessor.ignoreNode(node)
 606                                                         || ignoredNodes.contains(node.parentNode());
 607                                         // Manual processing
 608                                         if (!ignore) {
 609                                                 manual = elementProcessor.manualProcessing(node);
 610                                                 if (manual != null) {
 611                                                         currentLine.append(manual);
 612                                                         ignore = true;
 613                                                 }
 614                                         }
 615
 616                                         // Subtitle check
 617                                         if (!ignore) {
 618                                                 String subtitle = elementProcessor.isSubtitle(node);
 619                                                 if (subtitle != null) {
 620                                                         subtitle = subtitle.trim();
 621                                                         currentLine.append("\n[ " + subtitle + " ]\n");
 622                                                         ignore = true;
 623                                                 }
 624                                         }
 625
 626                                         // <pre> check
 627                                         if (!ignore) {
 628                                                 if (node instanceof Element) {
 629                                                         Element el = (Element) node;
 630                                                         if ("pre".equals(el.tagName())) {
 631                                                                 currentLine.append(StringUtils
 632                                                                                 .unhtml(el.text()).trim());
 633                                                                 ignore = true;
 634                                                         }
 635                                                 }
 636                                         }
 637
 638                                         if (ignore) {
 639                                                 ignoredNodes.add(node);
 640                                                 return;
 641                                         }
 642
 643                                         String prep = "";
 644                                         for (int i = 0; i < quoted.size(); i++) {
 645                                                 prep += ">";
 646                                         }
 647                                         prep += " ";
 648
 649                                         boolean enterQuote = elementProcessor.detectQuote(node);
 650                                         boolean leaveQuote = quoted.contains(depth);
 651
 652                                         if (enterQuote) {
 653                                                 quoted.add(depth);
 654                                         }
 655
 656                                         if (leaveQuote) {
 657                                                 quoted.remove(Integer.valueOf(depth));
 658                                         }
 659
 660                                         if (enterQuote || leaveQuote) {
 661                                                 if (currentLine.length() > 0) {
 662                                                         if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 663                                                                 currentLine.setLength(currentLine.length() - 1);
 664                                                         }
 665                                                         for (String l : currentLine.toString().split("\n")) {
 666                                                                 lines.add(prep + l);
 667                                                         }
 668                                                 }
 669                                                 currentLine.setLength(0);
 670                                         }
 671
 672                                         if (node instanceof Element) {
 673                                                 Element element = (Element) node;
 674                                                 boolean block = element.isBlock()
 675                                                                 || element.tagName().equalsIgnoreCase("br");
 676                                                 if (block && currentLine.length() > 0) {
 677                                                         currentLine.append("\n");
 678                                                 }
 679
 680                                                 if (!element.absUrl("href").trim().isEmpty()) {
 681                                                         footnotes.add(element.absUrl("href"));
 682                                                         currentLine.append("[" + footnotes.size() + "]");
 683                                                 }
 684                                         } else if (node instanceof TextNode) {
 685                                                 TextNode textNode = (TextNode) node;
 686                                                 String line = StringUtil.normaliseWhitespace(textNode
 687                                                                 .getWholeText());
 688
 689                                                 currentLine.append(elementProcessor.processText(line));
 690                                                 currentLine.append(" ");
 691                                         }
 692                                 }
 693
 694                                 @Override
 695                                 public void tail(Node node, int depth) {
 696                                 }
 697                         }).traverse(element);
 698                 }
 699
 700                 if (currentLine.length() > 0) {
 701                         String prep = "";
 702                         for (int i = 0; i < quoted.size(); i++) {
 703                                 prep += ">";
 704                         }
 705                         prep += " ";
 706                         if (currentLine.length() > 0) {
 707                                 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
 708                                         currentLine.setLength(currentLine.length() - 1);
 709                                 }
 710                                 for (String l : currentLine.toString().split("\n")) {
 711                                         lines.add(prep + l);
 712                                 }
 713                         }
 714                 }
 715
 716                 // Fix spaces and nbsp, remove multiple following blank lines
 717                 List<String> linesCopy = new ArrayList<String>(lines.size());
 718                 long blanks = 0;
 719                 for (int i = 0; i < lines.size(); i++) {
 720                         String line = lines.get(i).replace(" ", " ") // nbsp -> space
 721                                         .replace("  ", " ").trim();
 722                         if (line.isEmpty()) {
 723                                 blanks++;
 724                         } else {
 725                                 blanks = 0;
 726                         }
 727
 728                         if (blanks < 2) {
 729                                 linesCopy.add(line);
 730                         }
 731                 }
 732
 733                 // Footnotes insertion
 734                 if (footnotes.size() > 0) {
 735                         linesCopy.add("");
 736                         linesCopy.add("");
 737                         linesCopy.add("");
 738                         linesCopy.add("");
 739                         for (int i = 0; i < footnotes.size(); i++) {
 740                                 linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
 741                         }
 742                 }
 743
 744                 return linesCopy;
 745         }
 746
 747         /**
 748          * Reformat the date if possible.
 749          *
 750          * @param date
 751          *            the input date
 752          *
 753          * @return the reformated date, or the same value if it was not parsable
 754          */
 755         static private String date(String date) {
 756                 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
 757
 758                 long epoch = 0;
 759                 try {
 760                         epoch = Long.parseLong(date.trim());
 761                 } catch (Exception e) {
 762                         epoch = 0;
 763                 }
 764
 765                 if (epoch > 0) {
 766                         return out.format(new Date(1000 * epoch));
 767                 }
 768
 769                 try {
 770                         Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
 771                                         .parse(date.trim());
 772                         return out.format(dat);
 773                 } catch (ParseException e) {
 774                         return date;
 775                 }
 776         }
 777 }