src/be/nikiroo/fanfix/supported/BasicSupport_Deprecated.java

   1 package be.nikiroo.fanfix.supported;
   2
   3 import java.io.BufferedReader;
   4 import java.io.ByteArrayInputStream;
   5 import java.io.File;
   6 import java.io.IOException;
   7 import java.io.InputStream;
   8 import java.io.InputStreamReader;
   9 import java.net.MalformedURLException;
  10 import java.net.URL;
  11 import java.util.ArrayList;
  12 import java.util.Date;
  13 import java.util.List;
  14 import java.util.Map.Entry;
  15 import java.util.Scanner;
  16
  17 import be.nikiroo.fanfix.Instance;
  18 import be.nikiroo.fanfix.bundles.Config;
  19 import be.nikiroo.fanfix.bundles.StringId;
  20 import be.nikiroo.fanfix.data.Chapter;
  21 import be.nikiroo.fanfix.data.MetaData;
  22 import be.nikiroo.fanfix.data.Paragraph;
  23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
  24 import be.nikiroo.fanfix.data.Story;
  25 import be.nikiroo.utils.Image;
  26 import be.nikiroo.utils.Progress;
  27 import be.nikiroo.utils.StringUtils;
  28
/**
 * DEPRECATED: use the new Jsoup 'Node' system.
 * <p>
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
 * <p>
 * It will be used with 'resources' (usually web pages or files).
 *
 * @author niki
 */
  40 @Deprecated
  41 public abstract class BasicSupport_Deprecated extends BasicSupport {
  42         private InputStream in;
  43         private URL currentReferer; // with only one 'r', as in 'HTTP'...
  44
  45         // quote chars
  46         private char openQuote = Instance.getTrans().getCharacter(
  47                         StringId.OPEN_SINGLE_QUOTE);
  48         private char closeQuote = Instance.getTrans().getCharacter(
  49                         StringId.CLOSE_SINGLE_QUOTE);
  50         private char openDoubleQuote = Instance.getTrans().getCharacter(
  51                         StringId.OPEN_DOUBLE_QUOTE);
  52         private char closeDoubleQuote = Instance.getTrans().getCharacter(
  53                         StringId.CLOSE_DOUBLE_QUOTE);
  54
  55         // New methods not used in Deprecated mode
  56         @Override
  57         protected String getDesc() throws IOException {
  58                 throw new RuntimeException("should not be used by legacy code");
  59         }
  60
  61         @Override
  62         protected MetaData getMeta() throws IOException {
  63                 throw new RuntimeException("should not be used by legacy code");
  64         }
  65
  66         @Override
  67         protected List<Entry<String, URL>> getChapters(Progress pg)
  68                         throws IOException {
  69                 throw new RuntimeException("should not be used by legacy code");
  70         }
  71
  72         @Override
  73         protected String getChapterContent(URL chapUrl, int number, Progress pg)
  74                         throws IOException {
  75                 throw new RuntimeException("should not be used by legacy code");
  76         }
  77
  78         public Story process(Progress pg) throws IOException {
  79                 return process(getSource(), pg);
  80         }
  81
  82         //
  83
  84         /**
  85          * Return the {@link MetaData} of this story.
  86          *
  87          * @param source
  88          *            the source of the story
  89          * @param in
  90          *            the input (the main resource)
  91          *
  92          * @return the associated {@link MetaData}, never NULL
  93          *
  94          * @throws IOException
  95          *             in case of I/O error
  96          */
  97         protected abstract MetaData getMeta(URL source, InputStream in)
  98                         throws IOException;
  99
 100         /**
 101          * Return the story description.
 102          *
 103          * @param source
 104          *            the source of the story
 105          * @param in
 106          *            the input (the main resource)
 107          *
 108          * @return the description
 109          *
 110          * @throws IOException
 111          *             in case of I/O error
 112          */
 113         protected abstract String getDesc(URL source, InputStream in)
 114                         throws IOException;
 115
 116         /**
 117          * Return the list of chapters (name and resource).
 118          *
 119          * @param source
 120          *            the source of the story
 121          * @param in
 122          *            the input (the main resource)
 123          * @param pg
 124          *            the optional progress reporter
 125          *
 126          * @return the chapters
 127          *
 128          * @throws IOException
 129          *             in case of I/O error
 130          */
 131         protected abstract List<Entry<String, URL>> getChapters(URL source,
 132                         InputStream in, Progress pg) throws IOException;
 133
 134         /**
 135          * Return the content of the chapter (possibly HTML encoded, if
 136          * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
 137          *
 138          * @param source
 139          *            the source of the story
 140          * @param in
 141          *            the input (the main resource)
 142          * @param number
 143          *            the chapter number
 144          * @param pg
 145          *            the optional progress reporter
 146          *
 147          * @return the content
 148          *
 149          * @throws IOException
 150          *             in case of I/O error
 151          */
 152         protected abstract String getChapterContent(URL source, InputStream in,
 153                         int number, Progress pg) throws IOException;
 154
 155         /**
 156          * Process the given story resource into a partially filled {@link Story}
 157          * object containing the name and metadata, except for the description.
 158          *
 159          * @param url
 160          *            the story resource
 161          *
 162          * @return the {@link Story}
 163          *
 164          * @throws IOException
 165          *             in case of I/O error
 166          */
 167         public Story processMeta(URL url) throws IOException {
 168                 return processMeta(url, true, false, null);
 169         }
 170
 171         /**
 172          * Process the given story resource into a partially filled {@link Story}
 173          * object containing the name and metadata.
 174          *
 175          * @param url
 176          *            the story resource
 177          * @param close
 178          *            close "this" and "in" when done
 179          * @param getDesc
 180          *            retrieve the description of the story, or not
 181          * @param pg
 182          *            the optional progress reporter
 183          *
 184          * @return the {@link Story}, never NULL
 185          *
 186          * @throws IOException
 187          *             in case of I/O error
 188          */
 189         protected Story processMeta(URL url, boolean close, boolean getDesc,
 190                         Progress pg) throws IOException {
 191                 if (pg == null) {
 192                         pg = new Progress();
 193                 } else {
 194                         pg.setMinMax(0, 100);
 195                 }
 196
 197                 login();
 198                 pg.setProgress(10);
 199
 200                 url = getCanonicalUrl(url);
 201
 202                 setCurrentReferer(url);
 203
 204                 in = openInput(url); // NULL allowed here
 205                 try {
 206                         preprocess(url, getInput());
 207                         pg.setProgress(30);
 208
 209                         Story story = new Story();
 210                         MetaData meta = getMeta(url, getInput());
 211                         if (meta.getCreationDate() == null
 212                                         || meta.getCreationDate().isEmpty()) {
 213                                 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
 214                         }
 215                         story.setMeta(meta);
 216
 217                         pg.setProgress(50);
 218
 219                         if (meta.getCover() == null) {
 220                                 meta.setCover(getDefaultCover(meta.getSubject()));
 221                         }
 222
 223                         pg.setProgress(60);
 224
 225                         if (getDesc) {
 226                                 String descChapterName = Instance.getTrans().getString(
 227                                                 StringId.DESCRIPTION);
 228                                 story.getMeta().setResume(
 229                                                 makeChapter(url, 0, descChapterName,
 230                                                                 getDesc(url, getInput()), null));
 231                         }
 232
 233                         pg.setProgress(100);
 234                         return story;
 235                 } finally {
 236                         if (close) {
 237                                 close();
 238
 239                                 if (in != null) {
 240                                         in.close();
 241                                 }
 242                         }
 243                 }
 244         }
 245
 246         /**
 247          * Process the given story resource into a fully filled {@link Story}
 248          * object.
 249          *
 250          * @param url
 251          *            the story resource
 252          * @param pg
 253          *            the optional progress reporter
 254          *
 255          * @return the {@link Story}, never NULL
 256          *
 257          * @throws IOException
 258          *             in case of I/O error
 259          */
 260         protected Story process(URL url, Progress pg) throws IOException {
 261                 if (pg == null) {
 262                         pg = new Progress();
 263                 } else {
 264                         pg.setMinMax(0, 100);
 265                 }
 266
 267                 url = getCanonicalUrl(url);
 268                 pg.setProgress(1);
 269                 try {
 270                         Progress pgMeta = new Progress();
 271                         pg.addProgress(pgMeta, 10);
 272                         Story story = processMeta(url, false, true, pgMeta);
 273                         if (!pgMeta.isDone()) {
 274                                 pgMeta.setProgress(pgMeta.getMax()); // 10%
 275                         }
 276
 277                         pg.setName("Retrieving " + story.getMeta().getTitle());
 278
 279                         setCurrentReferer(url);
 280
 281                         Progress pgGetChapters = new Progress();
 282                         pg.addProgress(pgGetChapters, 10);
 283                         story.setChapters(new ArrayList<Chapter>());
 284                         List<Entry<String, URL>> chapters = getChapters(url, getInput(),
 285                                         pgGetChapters);
 286                         if (!pgGetChapters.isDone()) {
 287                                 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
 288                         }
 289
 290                         if (chapters != null) {
 291                                 Progress pgChaps = new Progress("Extracting chapters", 0,
 292                                                 chapters.size() * 300);
 293                                 pg.addProgress(pgChaps, 80);
 294
 295                                 long words = 0;
 296                                 int i = 1;
 297                                 for (Entry<String, URL> chap : chapters) {
 298                                         pgChaps.setName("Extracting chapter " + i);
 299                                         InputStream chapIn = null;
 300                                         if (chap.getValue() != null) {
 301                                                 setCurrentReferer(chap.getValue());
 302                                                 chapIn = Instance.getCache().open(chap.getValue(),
 303                                                                 this, false);
 304                                         }
 305                                         pgChaps.setProgress(i * 100);
 306                                         try {
 307                                                 Progress pgGetChapterContent = new Progress();
 308                                                 Progress pgMakeChapter = new Progress();
 309                                                 pgChaps.addProgress(pgGetChapterContent, 100);
 310                                                 pgChaps.addProgress(pgMakeChapter, 100);
 311
 312                                                 String content = getChapterContent(url, chapIn, i,
 313                                                                 pgGetChapterContent);
 314                                                 if (!pgGetChapterContent.isDone()) {
 315                                                         pgGetChapterContent.setProgress(pgGetChapterContent
 316                                                                         .getMax());
 317                                                 }
 318
 319                                                 Chapter cc = makeChapter(url, i, chap.getKey(),
 320                                                                 content, pgMakeChapter);
 321                                                 if (!pgMakeChapter.isDone()) {
 322                                                         pgMakeChapter.setProgress(pgMakeChapter.getMax());
 323                                                 }
 324
 325                                                 words += cc.getWords();
 326                                                 story.getChapters().add(cc);
 327                                                 story.getMeta().setWords(words);
 328                                         } finally {
 329                                                 if (chapIn != null) {
 330                                                         chapIn.close();
 331                                                 }
 332                                         }
 333
 334                                         i++;
 335                                 }
 336
 337                                 pgChaps.setName("Extracting chapters");
 338                         } else {
 339                                 pg.setProgress(80);
 340                         }
 341
 342                         return story;
 343
 344                 } finally {
 345                         close();
 346
 347                         if (in != null) {
 348                                 in.close();
 349                         }
 350                 }
 351         }
 352
 353         /**
 354          * Prepare the support if needed before processing.
 355          *
 356          * @param source
 357          *            the source of the story
 358          * @param in
 359          *            the input (the main resource)
 360          *
 361          * @throws IOException
 362          *             on I/O error
 363          */
 364         @SuppressWarnings("unused")
 365         protected void preprocess(URL source, InputStream in) throws IOException {
 366         }
 367
 368         /**
 369          * Create a {@link Chapter} object from the given information, formatting
 370          * the content as it should be.
 371          *
 372          * @param source
 373          *            the source of the story
 374          * @param number
 375          *            the chapter number
 376          * @param name
 377          *            the chapter name
 378          * @param content
 379          *            the chapter content
 380          * @param pg
 381          *            the optional progress reporter
 382          *
 383          * @return the {@link Chapter}
 384          *
 385          * @throws IOException
 386          *             in case of I/O error
 387          */
 388         protected Chapter makeChapter(URL source, int number, String name,
 389                         String content, Progress pg) throws IOException {
 390                 // Chapter name: process it correctly, then remove the possible
 391                 // redundant "Chapter x: " in front of it, or "-" (as in
 392                 // "Chapter 5: - Fun!" after the ": " was automatically added)
 393                 String chapterName = processPara(name).getContent().trim();
 394                 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
 395                                 .split(",")) {
 396                         String chapterWord = Instance.getConfig().getStringX(
 397                                         Config.CHAPTER, lang);
 398                         if (chapterName.startsWith(chapterWord)) {
 399                                 chapterName = chapterName.substring(chapterWord.length())
 400                                                 .trim();
 401                                 break;
 402                         }
 403                 }
 404
 405                 if (chapterName.startsWith(Integer.toString(number))) {
 406                         chapterName = chapterName.substring(
 407                                         Integer.toString(number).length()).trim();
 408                 }
 409
 410                 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
 411                         chapterName = chapterName.substring(1).trim();
 412                 }
 413                 //
 414
 415                 Chapter chap = new Chapter(number, chapterName);
 416
 417                 if (content != null) {
 418                         List<Paragraph> paras = makeParagraphs(source, content, pg);
 419                         long words = 0;
 420                         for (Paragraph para : paras) {
 421                                 words += para.getWords();
 422                         }
 423                         chap.setParagraphs(paras);
 424                         chap.setWords(words);
 425                 }
 426
 427                 return chap;
 428
 429         }
 430
 431         /**
 432          * Convert the given content into {@link Paragraph}s.
 433          *
 434          * @param source
 435          *            the source URL of the story
 436          * @param content
 437          *            the textual content
 438          * @param pg
 439          *            the optional progress reporter
 440          *
 441          * @return the {@link Paragraph}s
 442          *
 443          * @throws IOException
 444          *             in case of I/O error
 445          */
 446         protected List<Paragraph> makeParagraphs(URL source, String content,
 447                         Progress pg) throws IOException {
 448                 if (pg == null) {
 449                         pg = new Progress();
 450                 }
 451
 452                 if (isHtml()) {
 453                         // Special <HR> processing:
 454                         content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
 455                                         "<br/>* * *<br/>");
 456                 }
 457
 458                 List<Paragraph> paras = new ArrayList<Paragraph>();
 459
 460                 if (content != null && !content.trim().isEmpty()) {
 461                         if (isHtml()) {
 462                                 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
 463                                 pg.setMinMax(0, tab.length);
 464                                 int i = 1;
 465                                 for (String line : tab) {
 466                                         if (line.startsWith("[") && line.endsWith("]")) {
 467                                                 pg.setName("Extracting image " + i);
 468                                         }
 469                                         paras.add(makeParagraph(source, line.trim()));
 470                                         pg.setProgress(i++);
 471                                 }
 472                                 pg.setName(null);
 473                         } else {
 474                                 List<String> lines = new ArrayList<String>();
 475                                 BufferedReader buff = null;
 476                                 try {
 477                                         buff = new BufferedReader(
 478                                                         new InputStreamReader(new ByteArrayInputStream(
 479                                                                         content.getBytes("UTF-8")), "UTF-8"));
 480                                         for (String line = buff.readLine(); line != null; line = buff
 481                                                         .readLine()) {
 482                                                 lines.add(line.trim());
 483                                         }
 484                                 } finally {
 485                                         if (buff != null) {
 486                                                 buff.close();
 487                                         }
 488                                 }
 489
 490                                 pg.setMinMax(0, lines.size());
 491                                 int i = 0;
 492                                 for (String line : lines) {
 493                                         if (line.startsWith("[") && line.endsWith("]")) {
 494                                                 pg.setName("Extracting image " + i);
 495                                         }
 496                                         paras.add(makeParagraph(source, line));
 497                                         pg.setProgress(i++);
 498                                 }
 499                                 pg.setName(null);
 500                         }
 501
 502                         // Check quotes for "bad" format
 503                         List<Paragraph> newParas = new ArrayList<Paragraph>();
 504                         for (Paragraph para : paras) {
 505                                 newParas.addAll(requotify(para));
 506                         }
 507                         paras = newParas;
 508
 509                         // Remove double blanks/brks
 510                         fixBlanksBreaks(paras);
 511                 }
 512
 513                 return paras;
 514         }
 515
 516         /**
 517          * Convert the given line into a single {@link Paragraph}.
 518          *
 519          * @param source
 520          *            the source URL of the story
 521          * @param line
 522          *            the textual content of the paragraph
 523          *
 524          * @return the {@link Paragraph}
 525          */
 526         private Paragraph makeParagraph(URL source, String line) {
 527                 Image image = null;
 528                 if (line.startsWith("[") && line.endsWith("]")) {
 529                         image = getImage(this, source, line.substring(1, line.length() - 1)
 530                                         .trim());
 531                 }
 532
 533                 if (image != null) {
 534                         return new Paragraph(image);
 535                 }
 536
 537                 return processPara(line);
 538         }
 539
 540         /**
 541          * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
 542          * those {@link Paragraph}s.
 543          * <p>
 544          * The resulting list will not contain a starting or trailing blank/break
 545          * nor 2 blanks or breaks following each other.
 546          *
 547          * @param paras
 548          *            the list of {@link Paragraph}s to fix
 549          */
 550         protected void fixBlanksBreaks(List<Paragraph> paras) {
 551                 boolean space = false;
 552                 boolean brk = true;
 553                 for (int i = 0; i < paras.size(); i++) {
 554                         Paragraph para = paras.get(i);
 555                         boolean thisSpace = para.getType() == ParagraphType.BLANK;
 556                         boolean thisBrk = para.getType() == ParagraphType.BREAK;
 557
 558                         if (i > 0 && space && thisBrk) {
 559                                 paras.remove(i - 1);
 560                                 i--;
 561                         } else if ((space || brk) && (thisSpace || thisBrk)) {
 562                                 paras.remove(i);
 563                                 i--;
 564                         }
 565
 566                         space = thisSpace;
 567                         brk = thisBrk;
 568                 }
 569
 570                 // Remove blank/brk at start
 571                 if (paras.size() > 0
 572                                 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
 573                                                 0).getType() == ParagraphType.BREAK)) {
 574                         paras.remove(0);
 575                 }
 576
 577                 // Remove blank/brk at end
 578                 int last = paras.size() - 1;
 579                 if (paras.size() > 0
 580                                 && (paras.get(last).getType() == ParagraphType.BLANK || paras
 581                                                 .get(last).getType() == ParagraphType.BREAK)) {
 582                         paras.remove(last);
 583                 }
 584         }
 585
 586         /**
 587          * Get the default cover related to this subject (see <tt>.info</tt> files).
 588          *
 589          * @param subject
 590          *            the subject
 591          *
 592          * @return the cover if any, or NULL
 593          */
 594         static Image getDefaultCover(String subject) {
 595                 if (subject != null && !subject.isEmpty()
 596                                 && Instance.getCoverDir() != null) {
 597                         try {
 598                                 File fileCover = new File(Instance.getCoverDir(), subject);
 599                                 return getImage(null, fileCover.toURI().toURL(), subject);
 600                         } catch (MalformedURLException e) {
 601                         }
 602                 }
 603
 604                 return null;
 605         }
 606
 607         /**
 608          * Return the list of supported image extensions.
 609          *
 610          * @param emptyAllowed
 611          *            TRUE to allow an empty extension on first place, which can be
 612          *            used when you may already have an extension in your input but
 613          *            are not sure about it
 614          *
 615          * @return the extensions
 616          */
 617         static String[] getImageExt(boolean emptyAllowed) {
 618                 if (emptyAllowed) {
 619                         return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
 620                 }
 621
 622                 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
 623         }
 624
 625         /**
 626          * Check if the given resource can be a local image or a remote image, then
 627          * refresh the cache with it if it is.
 628          *
 629          * @param source
 630          *            the story source
 631          * @param line
 632          *            the resource to check
 633          *
 634          * @return the image if found, or NULL
 635          *
 636          */
 637         static Image getImage(BasicSupport_Deprecated support, URL source,
 638                         String line) {
 639                 URL url = getImageUrl(support, source, line);
 640                 if (url != null) {
 641                         if ("file".equals(url.getProtocol())) {
 642                                 if (new File(url.getPath()).isDirectory()) {
 643                                         return null;
 644                                 }
 645                         }
 646                         InputStream in = null;
 647                         try {
 648                                 in = Instance.getCache().open(url, getSupport(url), true);
 649                                 return new Image(in);
 650                         } catch (IOException e) {
 651                         } finally {
 652                                 if (in != null) {
 653                                         try {
 654                                                 in.close();
 655                                         } catch (IOException e) {
 656                                         }
 657                                 }
 658                         }
 659                 }
 660
 661                 return null;
 662         }
 663
 664         /**
 665          * Check if the given resource can be a local image or a remote image, then
 666          * refresh the cache with it if it is.
 667          *
 668          * @param source
 669          *            the story source
 670          * @param line
 671          *            the resource to check
 672          *
 673          * @return the image URL if found, or NULL
 674          *
 675          */
 676         static URL getImageUrl(BasicSupport_Deprecated support, URL source,
 677                         String line) {
 678                 URL url = null;
 679
 680                 if (line != null) {
 681                         // try for files
 682                         if (source != null) {
 683                                 try {
 684
 685                                         String relPath = null;
 686                                         String absPath = null;
 687                                         try {
 688                                                 String path = new File(source.getFile()).getParent();
 689                                                 relPath = new File(new File(path), line.trim())
 690                                                                 .getAbsolutePath();
 691                                         } catch (Exception e) {
 692                                                 // Cannot be converted to path (one possibility to take
 693                                                 // into account: absolute path on Windows)
 694                                         }
 695                                         try {
 696                                                 absPath = new File(line.trim()).getAbsolutePath();
 697                                         } catch (Exception e) {
 698                                                 // Cannot be converted to path (at all)
 699                                         }
 700
 701                                         for (String ext : getImageExt(true)) {
 702                                                 File absFile = new File(absPath + ext);
 703                                                 File relFile = new File(relPath + ext);
 704                                                 if (absPath != null && absFile.exists()
 705                                                                 && absFile.isFile()) {
 706                                                         url = absFile.toURI().toURL();
 707                                                 } else if (relPath != null && relFile.exists()
 708                                                                 && relFile.isFile()) {
 709                                                         url = relFile.toURI().toURL();
 710                                                 }
 711                                         }
 712                                 } catch (Exception e) {
 713                                         // Should not happen since we control the correct arguments
 714                                 }
 715                         }
 716
 717                         if (url == null) {
 718                                 // try for URLs
 719                                 try {
 720                                         for (String ext : getImageExt(true)) {
 721                                                 if (Instance.getCache()
 722                                                                 .check(new URL(line + ext), true)) {
 723                                                         url = new URL(line + ext);
 724                                                         break;
 725                                                 }
 726                                         }
 727
 728                                         // try out of cache
 729                                         if (url == null) {
 730                                                 for (String ext : getImageExt(true)) {
 731                                                         try {
 732                                                                 url = new URL(line + ext);
 733                                                                 Instance.getCache().refresh(url, support, true);
 734                                                                 break;
 735                                                         } catch (IOException e) {
 736                                                                 // no image with this ext
 737                                                                 url = null;
 738                                                         }
 739                                                 }
 740                                         }
 741                                 } catch (MalformedURLException e) {
 742                                         // Not an url
 743                                 }
 744                         }
 745
 746                         // refresh the cached file
 747                         if (url != null) {
 748                                 try {
 749                                         Instance.getCache().refresh(url, support, true);
 750                                 } catch (IOException e) {
 751                                         // woops, broken image
 752                                         url = null;
 753                                 }
 754                         }
 755                 }
 756
 757                 return url;
 758         }
 759
 760         /**
 761          * Open the input file that will be used through the support.
 762          * <p>
 763          * Can return NULL, in which case you are supposed to work without an
 764          * {@link InputStream}.
 765          *
 766          * @param source
 767          *            the source {@link URL}
 768          *
 769          * @return the {@link InputStream}
 770          *
 771          * @throws IOException
 772          *             in case of I/O error
 773          */
 774         protected InputStream openInput(URL source) throws IOException {
 775                 return Instance.getCache().open(source, this, false);
 776         }
 777
 778         /**
 779          * Reset then return {@link BasicSupport_Deprecated#in}.
 780          *
 781          * @return {@link BasicSupport_Deprecated#in}
 782          */
 783         protected InputStream getInput() {
 784                 return reset(in);
 785         }
 786
 787         /**
 788          * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
 789          * and requotify them (i.e., separate them into QUOTE paragraphs and other
 790          * paragraphs (quotes or not)).
 791          *
 792          * @param para
 793          *            the paragraph to requotify (not necessarily a quote)
 794          *
 795          * @return the correctly (or so we hope) quotified paragraphs
 796          */
 797         protected List<Paragraph> requotify(Paragraph para) {
 798                 List<Paragraph> newParas = new ArrayList<Paragraph>();
 799
 800                 if (para.getType() == ParagraphType.QUOTE
 801                                 && para.getContent().length() > 2) {
 802                         String line = para.getContent();
 803                         boolean singleQ = line.startsWith("" + openQuote);
 804                         boolean doubleQ = line.startsWith("" + openDoubleQuote);
 805
 806                         // Do not try when more than one quote at a time
 807                         // (some stories are not easily readable if we do)
 808                         if (singleQ
 809                                         && line.indexOf(closeQuote, 1) < line
 810                                                         .lastIndexOf(closeQuote)) {
 811                                 newParas.add(para);
 812                                 return newParas;
 813                         }
 814                         if (doubleQ
 815                                         && line.indexOf(closeDoubleQuote, 1) < line
 816                                                         .lastIndexOf(closeDoubleQuote)) {
 817                                 newParas.add(para);
 818                                 return newParas;
 819                         }
 820                         //
 821
 822                         if (!singleQ && !doubleQ) {
 823                                 line = openDoubleQuote + line + closeDoubleQuote;
 824                                 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
 825                                                 .getWords()));
 826                         } else {
 827                                 char open = singleQ ? openQuote : openDoubleQuote;
 828                                 char close = singleQ ? closeQuote : closeDoubleQuote;
 829
 830                                 int posDot = -1;
 831                                 boolean inQuote = false;
 832                                 int i = 0;
 833                                 for (char car : line.toCharArray()) {
 834                                         if (car == open) {
 835                                                 inQuote = true;
 836                                         } else if (car == close) {
 837                                                 inQuote = false;
 838                                         } else if (car == '.' && !inQuote) {
 839                                                 posDot = i;
 840                                                 break;
 841                                         }
 842                                         i++;
 843                                 }
 844
 845                                 if (posDot >= 0) {
 846                                         String rest = line.substring(posDot + 1).trim();
 847                                         line = line.substring(0, posDot + 1).trim();
 848                                         long words = 1;
 849                                         for (char car : line.toCharArray()) {
 850                                                 if (car == ' ') {
 851                                                         words++;
 852                                                 }
 853                                         }
 854                                         newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
 855                                         if (!rest.isEmpty()) {
 856                                                 newParas.addAll(requotify(processPara(rest)));
 857                                         }
 858                                 } else {
 859                                         newParas.add(para);
 860                                 }
 861                         }
 862                 } else {
 863                         newParas.add(para);
 864                 }
 865
 866                 return newParas;
 867         }
 868
 869         /**
 870          * Process a {@link Paragraph} from a raw line of text.
 871          * <p>
 872          * Will also fix quotes and HTML encoding if needed.
 873          *
 874          * @param line
 875          *            the raw line
 876          *
 877          * @return the processed {@link Paragraph}
 878          */
 879         protected Paragraph processPara(String line) {
 880                 line = ifUnhtml(line).trim();
 881
 882                 boolean space = true;
 883                 boolean brk = true;
 884                 boolean quote = false;
 885                 boolean tentativeCloseQuote = false;
 886                 char prev = '\0';
 887                 int dashCount = 0;
 888                 long words = 1;
 889
 890                 StringBuilder builder = new StringBuilder();
 891                 for (char car : line.toCharArray()) {
 892                         if (car != '-') {
 893                                 if (dashCount > 0) {
 894                                         // dash, ndash and mdash: - – —
 895                                         // currently: always use mdash
 896                                         builder.append(dashCount == 1 ? '-' : '—');
 897                                 }
 898                                 dashCount = 0;
 899                         }
 900
 901                         if (tentativeCloseQuote) {
 902                                 tentativeCloseQuote = false;
 903                                 if (Character.isLetterOrDigit(car)) {
 904                                         builder.append("'");
 905                                 } else {
 906                                         // handle double-single quotes as double quotes
 907                                         if (prev == car) {
 908                                                 builder.append(closeDoubleQuote);
 909                                                 continue;
 910                                         }
 911
 912                                         builder.append(closeQuote);
 913                                 }
 914                         }
 915
 916                         switch (car) {
 917                         case ' ': // note: unbreakable space
 918                         case ' ':
 919                         case '\t':
 920                         case '\n': // just in case
 921                         case '\r': // just in case
 922                                 if (builder.length() > 0
 923                                                 && builder.charAt(builder.length() - 1) != ' ') {
 924                                         words++;
 925                                 }
 926                                 builder.append(' ');
 927                                 break;
 928
 929                         case '\'':
 930                                 if (space || (brk && quote)) {
 931                                         quote = true;
 932                                         // handle double-single quotes as double quotes
 933                                         if (prev == car) {
 934                                                 builder.deleteCharAt(builder.length() - 1);
 935                                                 builder.append(openDoubleQuote);
 936                                         } else {
 937                                                 builder.append(openQuote);
 938                                         }
 939                                 } else if (prev == ' ' || prev == car) {
 940                                         // handle double-single quotes as double quotes
 941                                         if (prev == car) {
 942                                                 builder.deleteCharAt(builder.length() - 1);
 943                                                 builder.append(openDoubleQuote);
 944                                         } else {
 945                                                 builder.append(openQuote);
 946                                         }
 947                                 } else {
 948                                         // it is a quote ("I'm off") or a 'quote' ("This
 949                                         // 'good' restaurant"...)
 950                                         tentativeCloseQuote = true;
 951                                 }
 952                                 break;
 953
 954                         case '"':
 955                                 if (space || (brk && quote)) {
 956                                         quote = true;
 957                                         builder.append(openDoubleQuote);
 958                                 } else if (prev == ' ') {
 959                                         builder.append(openDoubleQuote);
 960                                 } else {
 961                                         builder.append(closeDoubleQuote);
 962                                 }
 963                                 break;
 964
 965                         case '-':
 966                                 if (space) {
 967                                         quote = true;
 968                                 } else {
 969                                         dashCount++;
 970                                 }
 971                                 space = false;
 972                                 break;
 973
 974                         case '*':
 975                         case '~':
 976                         case '/':
 977                         case '\\':
 978                         case '<':
 979                         case '>':
 980                         case '=':
 981                         case '+':
 982                         case '_':
 983                         case '–':
 984                         case '—':
 985                                 space = false;
 986                                 builder.append(car);
 987                                 break;
 988
 989                         case '‘':
 990                         case '`':
 991                         case '‹':
 992                         case '﹁':
 993                         case '〈':
 994                         case '「':
 995                                 if (space || (brk && quote)) {
 996                                         quote = true;
 997                                         builder.append(openQuote);
 998                                 } else {
 999                                         // handle double-single quotes as double quotes
1000                                         if (prev == car) {
1001                                                 builder.deleteCharAt(builder.length() - 1);
1002                                                 builder.append(openDoubleQuote);
1003                                         } else {
1004                                                 builder.append(openQuote);
1005                                         }
1006                                 }
1007                                 space = false;
1008                                 brk = false;
1009                                 break;
1010
1011                         case '’':
1012                         case '›':
1013                         case '﹂':
1014                         case '〉':
1015                         case '」':
1016                                 space = false;
1017                                 brk = false;
1018                                 // handle double-single quotes as double quotes
1019                                 if (prev == car) {
1020                                         builder.deleteCharAt(builder.length() - 1);
1021                                         builder.append(closeDoubleQuote);
1022                                 } else {
1023                                         builder.append(closeQuote);
1024                                 }
1025                                 break;
1026
1027                         case '«':
1028                         case '“':
1029                         case '﹃':
1030                         case '《':
1031                         case '『':
1032                                 if (space || (brk && quote)) {
1033                                         quote = true;
1034                                         builder.append(openDoubleQuote);
1035                                 } else {
1036                                         builder.append(openDoubleQuote);
1037                                 }
1038                                 space = false;
1039                                 brk = false;
1040                                 break;
1041
1042                         case '»':
1043                         case '”':
1044                         case '﹄':
1045                         case '》':
1046                         case '』':
1047                                 space = false;
1048                                 brk = false;
1049                                 builder.append(closeDoubleQuote);
1050                                 break;
1051
1052                         default:
1053                                 space = false;
1054                                 brk = false;
1055                                 builder.append(car);
1056                                 break;
1057                         }
1058
1059                         prev = car;
1060                 }
1061
1062                 if (tentativeCloseQuote) {
1063                         tentativeCloseQuote = false;
1064                         builder.append(closeQuote);
1065                 }
1066
1067                 line = builder.toString().trim();
1068
1069                 ParagraphType type = ParagraphType.NORMAL;
1070                 if (space) {
1071                         type = ParagraphType.BLANK;
1072                 } else if (brk) {
1073                         type = ParagraphType.BREAK;
1074                 } else if (quote) {
1075                         type = ParagraphType.QUOTE;
1076                 }
1077
1078                 return new Paragraph(type, line, words);
1079         }
1080
1081         /**
1082          * Remove the HTML from the input <b>if</b>
1083          * {@link BasicSupport_Deprecated#isHtml()} is true.
1084          *
1085          * @param input
1086          *            the input
1087          *
1088          * @return the no html version if needed
1089          */
1090         private String ifUnhtml(String input) {
1091                 if (isHtml() && input != null) {
1092                         return StringUtils.unhtml(input);
1093                 }
1094
1095                 return input;
1096         }
1097
1098         /**
1099          * Reset the given {@link InputStream} and return it.
1100          *
1101          * @param in
1102          *            the {@link InputStream} to reset
1103          *
1104          * @return the same {@link InputStream} after reset
1105          */
1106         static protected InputStream reset(InputStream in) {
1107                 try {
1108                         if (in != null) {
1109                                 in.reset();
1110                         }
1111                 } catch (IOException e) {
1112                 }
1113
1114                 return in;
1115         }
1116
1117         /**
1118          * Return the first line from the given input which correspond to the given
1119          * selectors.
1120          *
1121          * @param in
1122          *            the input
1123          * @param needle
1124          *            a string that must be found inside the target line (also
1125          *            supports "^" at start to say "only if it starts with" the
1126          *            needle)
1127          * @param relativeLine
1128          *            the line to return based upon the target line position (-1 =
1129          *            the line before, 0 = the target line...)
1130          *
1131          * @return the line, or NULL if not found
1132          */
1133         static protected String getLine(InputStream in, String needle,
1134                         int relativeLine) {
1135                 return getLine(in, needle, relativeLine, true);
1136         }
1137
1138         /**
1139          * Return a line from the given input which correspond to the given
1140          * selectors.
1141          *
1142          * @param in
1143          *            the input
1144          * @param needle
1145          *            a string that must be found inside the target line (also
1146          *            supports "^" at start to say "only if it starts with" the
1147          *            needle)
1148          * @param relativeLine
1149          *            the line to return based upon the target line position (-1 =
1150          *            the line before, 0 = the target line...)
1151          * @param first
1152          *            takes the first result (as opposed to the last one, which will
1153          *            also always spend the input)
1154          *
1155          * @return the line, or NULL if not found
1156          */
1157         static protected String getLine(InputStream in, String needle,
1158                         int relativeLine, boolean first) {
1159                 String rep = null;
1160
1161                 reset(in);
1162
1163                 List<String> lines = new ArrayList<String>();
1164                 @SuppressWarnings("resource")
1165                 Scanner scan = new Scanner(in, "UTF-8");
1166                 int index = -1;
1167                 scan.useDelimiter("\\n");
1168                 while (scan.hasNext()) {
1169                         lines.add(scan.next());
1170
1171                         if (index == -1) {
1172                                 if (needle.startsWith("^")) {
1173                                         if (lines.get(lines.size() - 1).startsWith(
1174                                                         needle.substring(1))) {
1175                                                 index = lines.size() - 1;
1176                                         }
1177
1178                                 } else {
1179                                         if (lines.get(lines.size() - 1).contains(needle)) {
1180                                                 index = lines.size() - 1;
1181                                         }
1182                                 }
1183                         }
1184
1185                         if (index >= 0 && index + relativeLine < lines.size()) {
1186                                 rep = lines.get(index + relativeLine);
1187                                 if (first) {
1188                                         break;
1189                                 }
1190                         }
1191                 }
1192
1193                 return rep;
1194         }
1195
1196         /**
1197          * Return the text between the key and the endKey (and optional subKey can
1198          * be passed, in this case we will look for the key first, then take the
1199          * text between the subKey and the endKey).
1200          * <p>
1201          * Will only match the first line with the given key if more than one are
1202          * possible. Which also means that if the subKey or endKey is not found on
1203          * that line, NULL will be returned.
1204          *
1205          * @param in
1206          *            the input
1207          * @param key
1208          *            the key to match (also supports "^" at start to say
1209          *            "only if it starts with" the key)
1210          * @param subKey
1211          *            the sub key or NULL if none
1212          * @param endKey
1213          *            the end key or NULL for "up to the end"
1214          * @return the text or NULL if not found
1215          */
1216         static protected String getKeyLine(InputStream in, String key,
1217                         String subKey, String endKey) {
1218                 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1219         }
1220
1221         /**
1222          * Return the text between the key and the endKey (and optional subKey can
1223          * be passed, in this case we will look for the key first, then take the
1224          * text between the subKey and the endKey).
1225          *
1226          * @param in
1227          *            the input
1228          * @param key
1229          *            the key to match (also supports "^" at start to say
1230          *            "only if it starts with" the key)
1231          * @param subKey
1232          *            the sub key or NULL if none
1233          * @param endKey
1234          *            the end key or NULL for "up to the end"
1235          * @return the text or NULL if not found
1236          */
1237         static protected String getKeyText(String in, String key, String subKey,
1238                         String endKey) {
1239                 String result = null;
1240
1241                 String line = in;
1242                 if (line != null && line.contains(key)) {
1243                         line = line.substring(line.indexOf(key) + key.length());
1244                         if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1245                                 if (subKey != null) {
1246                                         line = line.substring(line.indexOf(subKey)
1247                                                         + subKey.length());
1248                                 }
1249                                 if (endKey == null || line.contains(endKey)) {
1250                                         if (endKey != null) {
1251                                                 line = line.substring(0, line.indexOf(endKey));
1252                                                 result = line;
1253                                         }
1254                                 }
1255                         }
1256                 }
1257
1258                 return result;
1259         }
1260
1261         /**
1262          * Return the text between the key and the endKey (optional subKeys can be
1263          * passed, in this case we will look for the subKeys first, then take the
1264          * text between the key and the endKey).
1265          *
1266          * @param in
1267          *            the input
1268          * @param key
1269          *            the key to match
1270          * @param endKey
1271          *            the end key or NULL for "up to the end"
1272          * @param afters
1273          *            the sub-keys to find before checking for key/endKey
1274          *
1275          * @return the text or NULL if not found
1276          */
1277         static protected String getKeyTextAfter(String in, String key,
1278                         String endKey, String... afters) {
1279
1280                 if (in != null && !in.isEmpty()) {
1281                         int pos = indexOfAfter(in, 0, afters);
1282                         if (pos < 0) {
1283                                 return null;
1284                         }
1285
1286                         in = in.substring(pos);
1287                 }
1288
1289                 return getKeyText(in, key, null, endKey);
1290         }
1291
1292         /**
1293          * Return the first index after all the given "afters" have been found in
1294          * the {@link String}, or -1 if it was not possible.
1295          *
1296          * @param in
1297          *            the input
1298          * @param startAt
1299          *            start at this position in the string
1300          * @param afters
1301          *            the sub-keys to find before checking for key/endKey
1302          *
1303          * @return the text or NULL if not found
1304          */
1305         static protected int indexOfAfter(String in, int startAt, String... afters) {
1306                 int pos = -1;
1307                 if (in != null && !in.isEmpty()) {
1308                         pos = startAt;
1309                         if (afters != null) {
1310                                 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1311                                         String subKey = afters[i];
1312                                         if (!subKey.isEmpty()) {
1313                                                 pos = in.indexOf(subKey, pos);
1314                                                 if (pos >= 0) {
1315                                                         pos += subKey.length();
1316                                                 }
1317                                         }
1318                                 }
1319                         }
1320                 }
1321
1322                 return pos;
1323         }
1324 }