supported/BasicSupport_Deprecated.java

   1 package be.nikiroo.fanfix.supported;
   2
   3 import java.io.BufferedReader;
   4 import java.io.ByteArrayInputStream;
   5 import java.io.File;
   6 import java.io.IOException;
   7 import java.io.InputStream;
   8 import java.io.InputStreamReader;
   9 import java.net.MalformedURLException;
  10 import java.net.URL;
  11 import java.util.ArrayList;
  12 import java.util.Date;
  13 import java.util.List;
  14 import java.util.Map.Entry;
  15 import java.util.Scanner;
  16
  17 import be.nikiroo.fanfix.Instance;
  18 import be.nikiroo.fanfix.bundles.Config;
  19 import be.nikiroo.fanfix.bundles.StringId;
  20 import be.nikiroo.fanfix.data.Chapter;
  21 import be.nikiroo.fanfix.data.MetaData;
  22 import be.nikiroo.fanfix.data.Paragraph;
  23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
  24 import be.nikiroo.fanfix.data.Story;
  25 import be.nikiroo.utils.Image;
  26 import be.nikiroo.utils.Progress;
  27 import be.nikiroo.utils.StringUtils;
  28
  29 /**
  30  * DEPRECATED: use the new Jsoup 'Node' system.
  31  * <p>
  32  * This class is the base class used by the other support classes. It can be
  33  * used outside of this package, and has static methods that you can use to get
  34  * access to the correct support class.
  35  * <p>
  36  * It will be used with 'resources' (usually web pages or files).
  37  *
  38  * @author niki
  39  */
  40 @Deprecated
  41 public abstract class BasicSupport_Deprecated extends BasicSupport {
  42         private InputStream in;
  43
  44         // quote chars
  45         private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
  46         private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
  47         private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
  48         private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
  49
  50         // New methods not used in Deprecated mode
  51         @Override
  52         protected String getDesc() throws IOException {
  53                 throw new RuntimeException("should not be used by legacy code");
  54         }
  55
  56         @Override
  57         protected MetaData getMeta() throws IOException {
  58                 throw new RuntimeException("should not be used by legacy code");
  59         }
  60
  61         @Override
  62         protected List<Entry<String, URL>> getChapters(Progress pg)
  63                         throws IOException {
  64                 throw new RuntimeException("should not be used by legacy code");
  65         }
  66
  67         @Override
  68         protected String getChapterContent(URL chapUrl, int number, Progress pg)
  69                         throws IOException {
  70                 throw new RuntimeException("should not be used by legacy code");
  71         }
  72
  73         @Override
  74         public Story process(Progress pg) throws IOException {
  75                 return process(getSource(), pg);
  76         }
  77
  78         //
  79
  80         /**
  81          * Return the {@link MetaData} of this story.
  82          *
  83          * @param source
  84          *            the source of the story
  85          * @param in
  86          *            the input (the main resource)
  87          *
  88          * @return the associated {@link MetaData}, never NULL
  89          *
  90          * @throws IOException
  91          *             in case of I/O error
  92          */
  93         protected abstract MetaData getMeta(URL source, InputStream in)
  94                         throws IOException;
  95
  96         /**
  97          * Return the story description.
  98          *
  99          * @param source
 100          *            the source of the story
 101          * @param in
 102          *            the input (the main resource)
 103          *
 104          * @return the description
 105          *
 106          * @throws IOException
 107          *             in case of I/O error
 108          */
 109         protected abstract String getDesc(URL source, InputStream in)
 110                         throws IOException;
 111
 112         /**
 113          * Return the list of chapters (name and resource).
 114          *
 115          * @param source
 116          *            the source of the story
 117          * @param in
 118          *            the input (the main resource)
 119          * @param pg
 120          *            the optional progress reporter
 121          *
 122          * @return the chapters
 123          *
 124          * @throws IOException
 125          *             in case of I/O error
 126          */
 127         protected abstract List<Entry<String, URL>> getChapters(URL source,
 128                         InputStream in, Progress pg) throws IOException;
 129
 130         /**
 131          * Return the content of the chapter (possibly HTML encoded, if
 132          * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
 133          *
 134          * @param source
 135          *            the source of the story
 136          * @param in
 137          *            the input (the main resource)
 138          * @param number
 139          *            the chapter number
 140          * @param pg
 141          *            the optional progress reporter
 142          *
 143          * @return the content
 144          *
 145          * @throws IOException
 146          *             in case of I/O error
 147          */
 148         protected abstract String getChapterContent(URL source, InputStream in,
 149                         int number, Progress pg) throws IOException;
 150
 151         /**
 152          * Process the given story resource into a partially filled {@link Story}
 153          * object containing the name and metadata, except for the description.
 154          *
 155          * @param url
 156          *            the story resource
 157          *
 158          * @return the {@link Story}
 159          *
 160          * @throws IOException
 161          *             in case of I/O error
 162          */
 163         public Story processMeta(URL url) throws IOException {
 164                 return processMeta(url, true, false, null);
 165         }
 166
 167         /**
 168          * Process the given story resource into a partially filled {@link Story}
 169          * object containing the name and metadata.
 170          *
 171          * @param url
 172          *            the story resource
 173          * @param close
 174          *            close "this" and "in" when done
 175          * @param getDesc
 176          *            retrieve the description of the story, or not
 177          * @param pg
 178          *            the optional progress reporter
 179          *
 180          * @return the {@link Story}, never NULL
 181          *
 182          * @throws IOException
 183          *             in case of I/O error
 184          */
 185         protected Story processMeta(URL url, boolean close, boolean getDesc,
 186                         Progress pg) throws IOException {
 187                 if (pg == null) {
 188                         pg = new Progress();
 189                 } else {
 190                         pg.setMinMax(0, 100);
 191                 }
 192
 193                 login();
 194                 pg.setProgress(10);
 195
 196                 url = getCanonicalUrl(url);
 197
 198                 setCurrentReferer(url);
 199
 200                 in = openInput(url); // NULL allowed here
 201                 try {
 202                         preprocess(url, getInput());
 203                         pg.setProgress(30);
 204
 205                         Story story = new Story();
 206                         MetaData meta = getMeta(url, getInput());
 207                         if (meta.getCreationDate() == null
 208                                         || meta.getCreationDate().isEmpty()) {
 209                                 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
 210                         }
 211                         story.setMeta(meta);
 212                         pg.put("meta", meta);
 213
 214                         pg.setProgress(50);
 215
 216                         if (meta.getCover() == null) {
 217                                 meta.setCover(getDefaultCover(meta.getSubject()));
 218                         }
 219
 220                         pg.setProgress(60);
 221
 222                         if (getDesc) {
 223                                 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
 224                                 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
 225                         }
 226
 227                         pg.setProgress(100);
 228                         return story;
 229                 } finally {
 230                         if (close) {
 231                                 close();
 232
 233                                 if (in != null) {
 234                                         in.close();
 235                                 }
 236                         }
 237                 }
 238         }
 239
 240         /**
 241          * Process the given story resource into a fully filled {@link Story}
 242          * object.
 243          *
 244          * @param url
 245          *            the story resource
 246          * @param pg
 247          *            the optional progress reporter
 248          *
 249          * @return the {@link Story}, never NULL
 250          *
 251          * @throws IOException
 252          *             in case of I/O error
 253          */
 254         protected Story process(URL url, Progress pg) throws IOException {
 255                 if (pg == null) {
 256                         pg = new Progress();
 257                 } else {
 258                         pg.setMinMax(0, 100);
 259                 }
 260
 261                 url = getCanonicalUrl(url);
 262                 pg.setProgress(1);
 263                 try {
 264                         Progress pgMeta = new Progress();
 265                         pg.addProgress(pgMeta, 10);
 266                         Story story = processMeta(url, false, true, pgMeta);
 267                         pg.put("meta", story.getMeta());
 268                         if (!pgMeta.isDone()) {
 269                                 pgMeta.setProgress(pgMeta.getMax()); // 10%
 270                         }
 271
 272                         pg.setName("Retrieving " + story.getMeta().getTitle());
 273
 274                         setCurrentReferer(url);
 275
 276                         Progress pgGetChapters = new Progress();
 277                         pg.addProgress(pgGetChapters, 10);
 278                         story.setChapters(new ArrayList<Chapter>());
 279                         List<Entry<String, URL>> chapters = getChapters(url, getInput(),
 280                                         pgGetChapters);
 281                         if (!pgGetChapters.isDone()) {
 282                                 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
 283                         }
 284
 285                         if (chapters != null) {
 286                                 Progress pgChaps = new Progress("Extracting chapters", 0,
 287                                                 chapters.size() * 300);
 288                                 pg.addProgress(pgChaps, 80);
 289
 290                                 long words = 0;
 291                                 int i = 1;
 292                                 for (Entry<String, URL> chap : chapters) {
 293                                         pgChaps.setName("Extracting chapter " + i);
 294                                         InputStream chapIn = null;
 295                                         if (chap.getValue() != null) {
 296                                                 setCurrentReferer(chap.getValue());
 297                                                 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
 298                                         }
 299                                         pgChaps.setProgress(i * 100);
 300                                         try {
 301                                                 Progress pgGetChapterContent = new Progress();
 302                                                 Progress pgMakeChapter = new Progress();
 303                                                 pgChaps.addProgress(pgGetChapterContent, 100);
 304                                                 pgChaps.addProgress(pgMakeChapter, 100);
 305
 306                                                 String content = getChapterContent(url, chapIn, i,
 307                                                                 pgGetChapterContent);
 308                                                 if (!pgGetChapterContent.isDone()) {
 309                                                         pgGetChapterContent.setProgress(pgGetChapterContent
 310                                                                         .getMax());
 311                                                 }
 312
 313                                                 Chapter cc = makeChapter(url, i, chap.getKey(),
 314                                                                 content, pgMakeChapter);
 315                                                 if (!pgMakeChapter.isDone()) {
 316                                                         pgMakeChapter.setProgress(pgMakeChapter.getMax());
 317                                                 }
 318
 319                                                 words += cc.getWords();
 320                                                 story.getChapters().add(cc);
 321                                                 story.getMeta().setWords(words);
 322                                         } finally {
 323                                                 if (chapIn != null) {
 324                                                         chapIn.close();
 325                                                 }
 326                                         }
 327
 328                                         i++;
 329                                 }
 330
 331                                 pgChaps.setName("Extracting chapters");
 332                         } else {
 333                                 pg.setProgress(80);
 334                         }
 335
 336                         return story;
 337
 338                 } finally {
 339                         close();
 340
 341                         if (in != null) {
 342                                 in.close();
 343                         }
 344                 }
 345         }
 346
 347         /**
 348          * Prepare the support if needed before processing.
 349          *
 350          * @param source
 351          *            the source of the story
 352          * @param in
 353          *            the input (the main resource)
 354          *
 355          * @throws IOException
 356          *             on I/O error
 357          */
 358         @SuppressWarnings("unused")
 359         protected void preprocess(URL source, InputStream in) throws IOException {
 360         }
 361
 362         /**
 363          * Create a {@link Chapter} object from the given information, formatting
 364          * the content as it should be.
 365          *
 366          * @param source
 367          *            the source of the story
 368          * @param number
 369          *            the chapter number
 370          * @param name
 371          *            the chapter name
 372          * @param content
 373          *            the chapter content
 374          * @param pg
 375          *            the optional progress reporter
 376          *
 377          * @return the {@link Chapter}
 378          *
 379          * @throws IOException
 380          *             in case of I/O error
 381          */
 382         protected Chapter makeChapter(URL source, int number, String name,
 383                         String content, Progress pg) throws IOException {
 384                 // Chapter name: process it correctly, then remove the possible
 385                 // redundant "Chapter x: " in front of it, or "-" (as in
 386                 // "Chapter 5: - Fun!" after the ": " was automatically added)
 387                 String chapterName = processPara(name).getContent().trim();
 388                 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
 389                         String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
 390                         if (chapterName.startsWith(chapterWord)) {
 391                                 chapterName = chapterName.substring(chapterWord.length())
 392                                                 .trim();
 393                                 break;
 394                         }
 395                 }
 396
 397                 if (chapterName.startsWith(Integer.toString(number))) {
 398                         chapterName = chapterName.substring(
 399                                         Integer.toString(number).length()).trim();
 400                 }
 401
 402                 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
 403                         chapterName = chapterName.substring(1).trim();
 404                 }
 405                 //
 406
 407                 Chapter chap = new Chapter(number, chapterName);
 408
 409                 if (content != null) {
 410                         List<Paragraph> paras = makeParagraphs(source, content, pg);
 411                         long words = 0;
 412                         for (Paragraph para : paras) {
 413                                 words += para.getWords();
 414                         }
 415                         chap.setParagraphs(paras);
 416                         chap.setWords(words);
 417                 }
 418
 419                 return chap;
 420
 421         }
 422
 423         /**
 424          * Convert the given content into {@link Paragraph}s.
 425          *
 426          * @param source
 427          *            the source URL of the story
 428          * @param content
 429          *            the textual content
 430          * @param pg
 431          *            the optional progress reporter
 432          *
 433          * @return the {@link Paragraph}s
 434          *
 435          * @throws IOException
 436          *             in case of I/O error
 437          */
 438         protected List<Paragraph> makeParagraphs(URL source, String content,
 439                         Progress pg) throws IOException {
 440                 if (pg == null) {
 441                         pg = new Progress();
 442                 }
 443
 444                 if (isHtml()) {
 445                         // Special <HR> processing:
 446                         content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
 447                                         "<br/>* * *<br/>");
 448                 }
 449
 450                 List<Paragraph> paras = new ArrayList<Paragraph>();
 451
 452                 if (content != null && !content.trim().isEmpty()) {
 453                         if (isHtml()) {
 454                                 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
 455                                 pg.setMinMax(0, tab.length);
 456                                 int i = 1;
 457                                 for (String line : tab) {
 458                                         if (line.startsWith("[") && line.endsWith("]")) {
 459                                                 pg.setName("Extracting image " + i);
 460                                         }
 461                                         paras.add(makeParagraph(source, line.trim()));
 462                                         pg.setProgress(i++);
 463                                 }
 464                                 pg.setName(null);
 465                         } else {
 466                                 List<String> lines = new ArrayList<String>();
 467                                 BufferedReader buff = null;
 468                                 try {
 469                                         buff = new BufferedReader(
 470                                                         new InputStreamReader(new ByteArrayInputStream(
 471                                                                         content.getBytes("UTF-8")), "UTF-8"));
 472                                         for (String line = buff.readLine(); line != null; line = buff
 473                                                         .readLine()) {
 474                                                 lines.add(line.trim());
 475                                         }
 476                                 } finally {
 477                                         if (buff != null) {
 478                                                 buff.close();
 479                                         }
 480                                 }
 481
 482                                 pg.setMinMax(0, lines.size());
 483                                 int i = 0;
 484                                 for (String line : lines) {
 485                                         if (line.startsWith("[") && line.endsWith("]")) {
 486                                                 pg.setName("Extracting image " + i);
 487                                         }
 488                                         paras.add(makeParagraph(source, line));
 489                                         pg.setProgress(i++);
 490                                 }
 491                                 pg.setName(null);
 492                         }
 493
 494                         // Check quotes for "bad" format
 495                         List<Paragraph> newParas = new ArrayList<Paragraph>();
 496                         for (Paragraph para : paras) {
 497                                 newParas.addAll(requotify(para));
 498                         }
 499                         paras = newParas;
 500
 501                         // Remove double blanks/brks
 502                         fixBlanksBreaks(paras);
 503                 }
 504
 505                 return paras;
 506         }
 507
 508         /**
 509          * Convert the given line into a single {@link Paragraph}.
 510          *
 511          * @param source
 512          *            the source URL of the story
 513          * @param line
 514          *            the textual content of the paragraph
 515          *
 516          * @return the {@link Paragraph}
 517          */
 518         private Paragraph makeParagraph(URL source, String line) {
 519                 Image image = null;
 520                 if (line.startsWith("[") && line.endsWith("]")) {
 521                         image = getImage(this, source, line.substring(1, line.length() - 1)
 522                                         .trim());
 523                 }
 524
 525                 if (image != null) {
 526                         return new Paragraph(image);
 527                 }
 528
 529                 return processPara(line);
 530         }
 531
 532         /**
 533          * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
 534          * those {@link Paragraph}s.
 535          * <p>
 536          * The resulting list will not contain a starting or trailing blank/break
 537          * nor 2 blanks or breaks following each other.
 538          *
 539          * @param paras
 540          *            the list of {@link Paragraph}s to fix
 541          */
 542         protected void fixBlanksBreaks(List<Paragraph> paras) {
 543                 boolean space = false;
 544                 boolean brk = true;
 545                 for (int i = 0; i < paras.size(); i++) {
 546                         Paragraph para = paras.get(i);
 547                         boolean thisSpace = para.getType() == ParagraphType.BLANK;
 548                         boolean thisBrk = para.getType() == ParagraphType.BREAK;
 549
 550                         if (i > 0 && space && thisBrk) {
 551                                 paras.remove(i - 1);
 552                                 i--;
 553                         } else if ((space || brk) && (thisSpace || thisBrk)) {
 554                                 paras.remove(i);
 555                                 i--;
 556                         }
 557
 558                         space = thisSpace;
 559                         brk = thisBrk;
 560                 }
 561
 562                 // Remove blank/brk at start
 563                 if (paras.size() > 0
 564                                 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
 565                                                 0).getType() == ParagraphType.BREAK)) {
 566                         paras.remove(0);
 567                 }
 568
 569                 // Remove blank/brk at end
 570                 int last = paras.size() - 1;
 571                 if (paras.size() > 0
 572                                 && (paras.get(last).getType() == ParagraphType.BLANK || paras
 573                                                 .get(last).getType() == ParagraphType.BREAK)) {
 574                         paras.remove(last);
 575                 }
 576         }
 577
 578         /**
 579          * Get the default cover related to this subject (see <tt>.info</tt> files).
 580          *
 581          * @param subject
 582          *            the subject
 583          *
 584          * @return the cover if any, or NULL
 585          */
 586         static Image getDefaultCover(String subject) {
 587                 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
 588                         try {
 589                                 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
 590                                 return getImage(null, fileCover.toURI().toURL(), subject);
 591                         } catch (MalformedURLException e) {
 592                         }
 593                 }
 594
 595                 return null;
 596         }
 597
 598         /**
 599          * Return the list of supported image extensions.
 600          *
 601          * @param emptyAllowed
 602          *            TRUE to allow an empty extension on first place, which can be
 603          *            used when you may already have an extension in your input but
 604          *            are not sure about it
 605          *
 606          * @return the extensions
 607          */
 608         static String[] getImageExt(boolean emptyAllowed) {
 609                 if (emptyAllowed) {
 610                         return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
 611                 }
 612
 613                 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
 614         }
 615
 616         /**
 617          * Check if the given resource can be a local image or a remote image, then
 618          * refresh the cache with it if it is.
 619          *
 620          * @param source
 621          *            the story source
 622          * @param line
 623          *            the resource to check
 624          *
 625          * @return the image if found, or NULL
 626          *
 627          */
 628         static Image getImage(BasicSupport_Deprecated support, URL source,
 629                         String line) {
 630                 URL url = getImageUrl(support, source, line);
 631                 if (url != null) {
 632                         if ("file".equals(url.getProtocol())) {
 633                                 if (new File(url.getPath()).isDirectory()) {
 634                                         return null;
 635                                 }
 636                         }
 637                         InputStream in = null;
 638                         try {
 639                                 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
 640                                 return new Image(in);
 641                         } catch (IOException e) {
 642                         } finally {
 643                                 if (in != null) {
 644                                         try {
 645                                                 in.close();
 646                                         } catch (IOException e) {
 647                                         }
 648                                 }
 649                         }
 650                 }
 651
 652                 return null;
 653         }
 654
 655         /**
 656          * Check if the given resource can be a local image or a remote image, then
 657          * refresh the cache with it if it is.
 658          *
 659          * @param source
 660          *            the story source
 661          * @param line
 662          *            the resource to check
 663          *
 664          * @return the image URL if found, or NULL
 665          *
 666          */
 667         static URL getImageUrl(BasicSupport_Deprecated support, URL source,
 668                         String line) {
 669                 URL url = null;
 670
 671                 if (line != null) {
 672                         // try for files
 673                         if (source != null) {
 674                                 try {
 675                                         String relPath = null;
 676                                         String absPath = null;
 677                                         try {
 678                                                 String path = new File(source.getFile()).getParent();
 679                                                 relPath = new File(new File(path), line.trim())
 680                                                                 .getAbsolutePath();
 681                                         } catch (Exception e) {
 682                                                 // Cannot be converted to path (one possibility to take
 683                                                 // into account: absolute path on Windows)
 684                                         }
 685                                         try {
 686                                                 absPath = new File(line.trim()).getAbsolutePath();
 687                                         } catch (Exception e) {
 688                                                 // Cannot be converted to path (at all)
 689                                         }
 690
 691                                         for (String ext : getImageExt(true)) {
 692                                                 File absFile = new File(absPath + ext);
 693                                                 File relFile = new File(relPath + ext);
 694                                                 if (absPath != null && absFile.exists()
 695                                                                 && absFile.isFile()) {
 696                                                         url = absFile.toURI().toURL();
 697                                                 } else if (relPath != null && relFile.exists()
 698                                                                 && relFile.isFile()) {
 699                                                         url = relFile.toURI().toURL();
 700                                                 }
 701                                         }
 702                                 } catch (Exception e) {
 703                                         // Should not happen since we control the correct arguments
 704                                 }
 705                         }
 706
 707                         if (url == null) {
 708                                 // try for URLs
 709                                 try {
 710                                         for (String ext : getImageExt(true)) {
 711                                                 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
 712                                                         url = new URL(line + ext);
 713                                                         break;
 714                                                 }
 715                                         }
 716
 717                                         // try out of cache
 718                                         if (url == null) {
 719                                                 for (String ext : getImageExt(true)) {
 720                                                         try {
 721                                                                 url = new URL(line + ext);
 722                                                                 Instance.getInstance().getCache().refresh(url, support, true);
 723                                                                 break;
 724                                                         } catch (IOException e) {
 725                                                                 // no image with this ext
 726                                                                 url = null;
 727                                                         }
 728                                                 }
 729                                         }
 730                                 } catch (MalformedURLException e) {
 731                                         // Not an url
 732                                 }
 733                         }
 734
 735                         // refresh the cached file
 736                         if (url != null) {
 737                                 try {
 738                                         Instance.getInstance().getCache().refresh(url, support, true);
 739                                 } catch (IOException e) {
 740                                         // woops, broken image
 741                                         url = null;
 742                                 }
 743                         }
 744                 }
 745
 746                 return url;
 747         }
 748
 749         /**
 750          * Open the input file that will be used through the support.
 751          * <p>
 752          * Can return NULL, in which case you are supposed to work without an
 753          * {@link InputStream}.
 754          *
 755          * @param source
 756          *            the source {@link URL}
 757          *
 758          * @return the {@link InputStream}
 759          *
 760          * @throws IOException
 761          *             in case of I/O error
 762          */
 763         protected InputStream openInput(URL source) throws IOException {
 764                 return Instance.getInstance().getCache().open(source, this, false);
 765         }
 766
 767         /**
 768          * Reset then return {@link BasicSupport_Deprecated#in}.
 769          *
 770          * @return {@link BasicSupport_Deprecated#in}
 771          */
 772         protected InputStream getInput() {
 773                 return reset(in);
 774         }
 775
 776         /**
 777          * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
 778          * and requotify them (i.e., separate them into QUOTE paragraphs and other
 779          * paragraphs (quotes or not)).
 780          *
 781          * @param para
 782          *            the paragraph to requotify (not necessarily a quote)
 783          *
 784          * @return the correctly (or so we hope) quotified paragraphs
 785          */
 786         protected List<Paragraph> requotify(Paragraph para) {
 787                 List<Paragraph> newParas = new ArrayList<Paragraph>();
 788
 789                 if (para.getType() == ParagraphType.QUOTE
 790                                 && para.getContent().length() > 2) {
 791                         String line = para.getContent();
 792                         boolean singleQ = line.startsWith("" + openQuote);
 793                         boolean doubleQ = line.startsWith("" + openDoubleQuote);
 794
 795                         // Do not try when more than one quote at a time
 796                         // (some stories are not easily readable if we do)
 797                         if (singleQ
 798                                         && line.indexOf(closeQuote, 1) < line
 799                                                         .lastIndexOf(closeQuote)) {
 800                                 newParas.add(para);
 801                                 return newParas;
 802                         }
 803                         if (doubleQ
 804                                         && line.indexOf(closeDoubleQuote, 1) < line
 805                                                         .lastIndexOf(closeDoubleQuote)) {
 806                                 newParas.add(para);
 807                                 return newParas;
 808                         }
 809                         //
 810
 811                         if (!singleQ && !doubleQ) {
 812                                 line = openDoubleQuote + line + closeDoubleQuote;
 813                                 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
 814                                                 .getWords()));
 815                         } else {
 816                                 char open = singleQ ? openQuote : openDoubleQuote;
 817                                 char close = singleQ ? closeQuote : closeDoubleQuote;
 818
 819                                 int posDot = -1;
 820                                 boolean inQuote = false;
 821                                 int i = 0;
 822                                 for (char car : line.toCharArray()) {
 823                                         if (car == open) {
 824                                                 inQuote = true;
 825                                         } else if (car == close) {
 826                                                 inQuote = false;
 827                                         } else if (car == '.' && !inQuote) {
 828                                                 posDot = i;
 829                                                 break;
 830                                         }
 831                                         i++;
 832                                 }
 833
 834                                 if (posDot >= 0) {
 835                                         String rest = line.substring(posDot + 1).trim();
 836                                         line = line.substring(0, posDot + 1).trim();
 837                                         long words = 1;
 838                                         for (char car : line.toCharArray()) {
 839                                                 if (car == ' ') {
 840                                                         words++;
 841                                                 }
 842                                         }
 843                                         newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
 844                                         if (!rest.isEmpty()) {
 845                                                 newParas.addAll(requotify(processPara(rest)));
 846                                         }
 847                                 } else {
 848                                         newParas.add(para);
 849                                 }
 850                         }
 851                 } else {
 852                         newParas.add(para);
 853                 }
 854
 855                 return newParas;
 856         }
 857
 858         /**
 859          * Process a {@link Paragraph} from a raw line of text.
 860          * <p>
 861          * Will also fix quotes and HTML encoding if needed.
 862          *
 863          * @param line
 864          *            the raw line
 865          *
 866          * @return the processed {@link Paragraph}
 867          */
 868         protected Paragraph processPara(String line) {
 869                 line = ifUnhtml(line).trim();
 870
 871                 boolean space = true;
 872                 boolean brk = true;
 873                 boolean quote = false;
 874                 boolean tentativeCloseQuote = false;
 875                 char prev = '\0';
 876                 int dashCount = 0;
 877                 long words = 1;
 878
 879                 StringBuilder builder = new StringBuilder();
 880                 for (char car : line.toCharArray()) {
 881                         if (car != '-') {
 882                                 if (dashCount > 0) {
 883                                         // dash, ndash and mdash: - – —
 884                                         // currently: always use mdash
 885                                         builder.append(dashCount == 1 ? '-' : '—');
 886                                 }
 887                                 dashCount = 0;
 888                         }
 889
 890                         if (tentativeCloseQuote) {
 891                                 tentativeCloseQuote = false;
 892                                 if (Character.isLetterOrDigit(car)) {
 893                                         builder.append("'");
 894                                 } else {
 895                                         // handle double-single quotes as double quotes
 896                                         if (prev == car) {
 897                                                 builder.append(closeDoubleQuote);
 898                                                 continue;
 899                                         }
 900
 901                                         builder.append(closeQuote);
 902                                 }
 903                         }
 904
 905                         switch (car) {
 906                         case ' ': // note: unbreakable space
 907                         case ' ':
 908                         case '\t':
 909                         case '\n': // just in case
 910                         case '\r': // just in case
 911                                 if (builder.length() > 0
 912                                                 && builder.charAt(builder.length() - 1) != ' ') {
 913                                         words++;
 914                                 }
 915                                 builder.append(' ');
 916                                 break;
 917
 918                         case '\'':
 919                                 if (space || (brk && quote)) {
 920                                         quote = true;
 921                                         // handle double-single quotes as double quotes
 922                                         if (prev == car) {
 923                                                 builder.deleteCharAt(builder.length() - 1);
 924                                                 builder.append(openDoubleQuote);
 925                                         } else {
 926                                                 builder.append(openQuote);
 927                                         }
 928                                 } else if (prev == ' ' || prev == car) {
 929                                         // handle double-single quotes as double quotes
 930                                         if (prev == car) {
 931                                                 builder.deleteCharAt(builder.length() - 1);
 932                                                 builder.append(openDoubleQuote);
 933                                         } else {
 934                                                 builder.append(openQuote);
 935                                         }
 936                                 } else {
 937                                         // it is a quote ("I'm off") or a 'quote' ("This
 938                                         // 'good' restaurant"...)
 939                                         tentativeCloseQuote = true;
 940                                 }
 941                                 break;
 942
 943                         case '"':
 944                                 if (space || (brk && quote)) {
 945                                         quote = true;
 946                                         builder.append(openDoubleQuote);
 947                                 } else if (prev == ' ') {
 948                                         builder.append(openDoubleQuote);
 949                                 } else {
 950                                         builder.append(closeDoubleQuote);
 951                                 }
 952                                 break;
 953
 954                         case '-':
 955                                 if (space) {
 956                                         quote = true;
 957                                 } else {
 958                                         dashCount++;
 959                                 }
 960                                 space = false;
 961                                 break;
 962
 963                         case '*':
 964                         case '~':
 965                         case '/':
 966                         case '\\':
 967                         case '<':
 968                         case '>':
 969                         case '=':
 970                         case '+':
 971                         case '_':
 972                         case '–':
 973                         case '—':
 974                                 space = false;
 975                                 builder.append(car);
 976                                 break;
 977
 978                         case '‘':
 979                         case '`':
 980                         case '‹':
 981                         case '﹁':
 982                         case '〈':
 983                         case '「':
 984                                 if (space || (brk && quote)) {
 985                                         quote = true;
 986                                         builder.append(openQuote);
 987                                 } else {
 988                                         // handle double-single quotes as double quotes
 989                                         if (prev == car) {
 990                                                 builder.deleteCharAt(builder.length() - 1);
 991                                                 builder.append(openDoubleQuote);
 992                                         } else {
 993                                                 builder.append(openQuote);
 994                                         }
 995                                 }
 996                                 space = false;
 997                                 brk = false;
 998                                 break;
 999
1000                         case '’':
1001                         case '›':
1002                         case '﹂':
1003                         case '〉':
1004                         case '」':
1005                                 space = false;
1006                                 brk = false;
1007                                 // handle double-single quotes as double quotes
1008                                 if (prev == car) {
1009                                         builder.deleteCharAt(builder.length() - 1);
1010                                         builder.append(closeDoubleQuote);
1011                                 } else {
1012                                         builder.append(closeQuote);
1013                                 }
1014                                 break;
1015
1016                         case '«':
1017                         case '“':
1018                         case '﹃':
1019                         case '《':
1020                         case '『':
1021                                 if (space || (brk && quote)) {
1022                                         quote = true;
1023                                         builder.append(openDoubleQuote);
1024                                 } else {
1025                                         builder.append(openDoubleQuote);
1026                                 }
1027                                 space = false;
1028                                 brk = false;
1029                                 break;
1030
1031                         case '»':
1032                         case '”':
1033                         case '﹄':
1034                         case '》':
1035                         case '』':
1036                                 space = false;
1037                                 brk = false;
1038                                 builder.append(closeDoubleQuote);
1039                                 break;
1040
1041                         default:
1042                                 space = false;
1043                                 brk = false;
1044                                 builder.append(car);
1045                                 break;
1046                         }
1047
1048                         prev = car;
1049                 }
1050
1051                 if (tentativeCloseQuote) {
1052                         tentativeCloseQuote = false;
1053                         builder.append(closeQuote);
1054                 }
1055
1056                 line = builder.toString().trim();
1057
1058                 ParagraphType type = ParagraphType.NORMAL;
1059                 if (space) {
1060                         type = ParagraphType.BLANK;
1061                 } else if (brk) {
1062                         type = ParagraphType.BREAK;
1063                 } else if (quote) {
1064                         type = ParagraphType.QUOTE;
1065                 }
1066
1067                 return new Paragraph(type, line, words);
1068         }
1069
1070         /**
1071          * Remove the HTML from the input <b>if</b>
1072          * {@link BasicSupport_Deprecated#isHtml()} is true.
1073          *
1074          * @param input
1075          *            the input
1076          *
1077          * @return the no html version if needed
1078          */
1079         private String ifUnhtml(String input) {
1080                 if (isHtml() && input != null) {
1081                         return StringUtils.unhtml(input);
1082                 }
1083
1084                 return input;
1085         }
1086
1087         /**
1088          * Reset the given {@link InputStream} and return it.
1089          *
1090          * @param in
1091          *            the {@link InputStream} to reset
1092          *
1093          * @return the same {@link InputStream} after reset
1094          */
1095         static protected InputStream reset(InputStream in) {
1096                 try {
1097                         if (in != null) {
1098                                 in.reset();
1099                         }
1100                 } catch (IOException e) {
1101                 }
1102
1103                 return in;
1104         }
1105
1106         /**
1107          * Return the first line from the given input which correspond to the given
1108          * selectors.
1109          *
1110          * @param in
1111          *            the input
1112          * @param needle
1113          *            a string that must be found inside the target line (also
1114          *            supports "^" at start to say "only if it starts with" the
1115          *            needle)
1116          * @param relativeLine
1117          *            the line to return based upon the target line position (-1 =
1118          *            the line before, 0 = the target line...)
1119          *
1120          * @return the line, or NULL if not found
1121          */
1122         static protected String getLine(InputStream in, String needle,
1123                         int relativeLine) {
1124                 return getLine(in, needle, relativeLine, true);
1125         }
1126
1127         /**
1128          * Return a line from the given input which correspond to the given
1129          * selectors.
1130          *
1131          * @param in
1132          *            the input
1133          * @param needle
1134          *            a string that must be found inside the target line (also
1135          *            supports "^" at start to say "only if it starts with" the
1136          *            needle)
1137          * @param relativeLine
1138          *            the line to return based upon the target line position (-1 =
1139          *            the line before, 0 = the target line...)
1140          * @param first
1141          *            takes the first result (as opposed to the last one, which will
1142          *            also always spend the input)
1143          *
1144          * @return the line, or NULL if not found
1145          */
1146         static protected String getLine(InputStream in, String needle,
1147                         int relativeLine, boolean first) {
1148                 String rep = null;
1149
1150                 reset(in);
1151
1152                 List<String> lines = new ArrayList<String>();
1153                 @SuppressWarnings("resource")
1154                 Scanner scan = new Scanner(in, "UTF-8");
1155                 int index = -1;
1156                 scan.useDelimiter("\\n");
1157                 while (scan.hasNext()) {
1158                         lines.add(scan.next());
1159
1160                         if (index == -1) {
1161                                 if (needle.startsWith("^")) {
1162                                         if (lines.get(lines.size() - 1).startsWith(
1163                                                         needle.substring(1))) {
1164                                                 index = lines.size() - 1;
1165                                         }
1166
1167                                 } else {
1168                                         if (lines.get(lines.size() - 1).contains(needle)) {
1169                                                 index = lines.size() - 1;
1170                                         }
1171                                 }
1172                         }
1173
1174                         if (index >= 0 && index + relativeLine < lines.size()) {
1175                                 rep = lines.get(index + relativeLine);
1176                                 if (first) {
1177                                         break;
1178                                 }
1179                         }
1180                 }
1181
1182                 return rep;
1183         }
1184
1185         /**
1186          * Return the text between the key and the endKey (and optional subKey can
1187          * be passed, in this case we will look for the key first, then take the
1188          * text between the subKey and the endKey).
1189          * <p>
1190          * Will only match the first line with the given key if more than one are
1191          * possible. Which also means that if the subKey or endKey is not found on
1192          * that line, NULL will be returned.
1193          *
1194          * @param in
1195          *            the input
1196          * @param key
1197          *            the key to match (also supports "^" at start to say
1198          *            "only if it starts with" the key)
1199          * @param subKey
1200          *            the sub key or NULL if none
1201          * @param endKey
1202          *            the end key or NULL for "up to the end"
1203          * @return the text or NULL if not found
1204          */
1205         static protected String getKeyLine(InputStream in, String key,
1206                         String subKey, String endKey) {
1207                 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1208         }
1209
1210         /**
1211          * Return the text between the key and the endKey (and optional subKey can
1212          * be passed, in this case we will look for the key first, then take the
1213          * text between the subKey and the endKey).
1214          *
1215          * @param in
1216          *            the input
1217          * @param key
1218          *            the key to match (also supports "^" at start to say
1219          *            "only if it starts with" the key)
1220          * @param subKey
1221          *            the sub key or NULL if none
1222          * @param endKey
1223          *            the end key or NULL for "up to the end"
1224          * @return the text or NULL if not found
1225          */
1226         static protected String getKeyText(String in, String key, String subKey,
1227                         String endKey) {
1228                 String result = null;
1229
1230                 String line = in;
1231                 if (line != null && line.contains(key)) {
1232                         line = line.substring(line.indexOf(key) + key.length());
1233                         if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1234                                 if (subKey != null) {
1235                                         line = line.substring(line.indexOf(subKey)
1236                                                         + subKey.length());
1237                                 }
1238                                 if (endKey == null || line.contains(endKey)) {
1239                                         if (endKey != null) {
1240                                                 line = line.substring(0, line.indexOf(endKey));
1241                                                 result = line;
1242                                         }
1243                                 }
1244                         }
1245                 }
1246
1247                 return result;
1248         }
1249
1250         /**
1251          * Return the text between the key and the endKey (optional subKeys can be
1252          * passed, in this case we will look for the subKeys first, then take the
1253          * text between the key and the endKey).
1254          *
1255          * @param in
1256          *            the input
1257          * @param key
1258          *            the key to match
1259          * @param endKey
1260          *            the end key or NULL for "up to the end"
1261          * @param afters
1262          *            the sub-keys to find before checking for key/endKey
1263          *
1264          * @return the text or NULL if not found
1265          */
1266         static protected String getKeyTextAfter(String in, String key,
1267                         String endKey, String... afters) {
1268
1269                 if (in != null && !in.isEmpty()) {
1270                         int pos = indexOfAfter(in, 0, afters);
1271                         if (pos < 0) {
1272                                 return null;
1273                         }
1274
1275                         in = in.substring(pos);
1276                 }
1277
1278                 return getKeyText(in, key, null, endKey);
1279         }
1280
1281         /**
1282          * Return the first index after all the given "afters" have been found in
1283          * the {@link String}, or -1 if it was not possible.
1284          *
1285          * @param in
1286          *            the input
1287          * @param startAt
1288          *            start at this position in the string
1289          * @param afters
1290          *            the sub-keys to find before checking for key/endKey
1291          *
1292          * @return the text or NULL if not found
1293          */
1294         static protected int indexOfAfter(String in, int startAt, String... afters) {
1295                 int pos = -1;
1296                 if (in != null && !in.isEmpty()) {
1297                         pos = startAt;
1298                         if (afters != null) {
1299                                 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1300                                         String subKey = afters[i];
1301                                         if (!subKey.isEmpty()) {
1302                                                 pos = in.indexOf(subKey, pos);
1303                                                 if (pos >= 0) {
1304                                                         pos += subKey.length();
1305                                                 }
1306                                         }
1307                                 }
1308                         }
1309                 }
1310
1311                 return pos;
1312         }
1313 }