Fix some bad line breaks on HTML supports
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.awt.image.BufferedImage;
4 import java.io.BufferedReader;
5 import java.io.ByteArrayInputStream;
6 import java.io.File;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.net.MalformedURLException;
11 import java.net.URL;
12 import java.util.ArrayList;
13 import java.util.Date;
14 import java.util.HashMap;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Scanner;
19
20 import be.nikiroo.fanfix.Instance;
21 import be.nikiroo.fanfix.bundles.Config;
22 import be.nikiroo.fanfix.bundles.StringId;
23 import be.nikiroo.fanfix.data.Chapter;
24 import be.nikiroo.fanfix.data.MetaData;
25 import be.nikiroo.fanfix.data.Paragraph;
26 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
27 import be.nikiroo.fanfix.data.Story;
28 import be.nikiroo.utils.IOUtils;
29 import be.nikiroo.utils.Progress;
30 import be.nikiroo.utils.StringUtils;
31
32 /**
33 * This class is the base class used by the other support classes. It can be
34 * used outside of this package, and has static methods that you can use to get
35 * access to the correct support class.
36 * <p>
37 * It will be used with 'resources' (usually web pages or files).
38 *
39 * @author niki
40 */
41 public abstract class BasicSupport {
42 /**
43 * The supported input types for which we can get a {@link BasicSupport}
44 * object.
45 *
46 * @author niki
47 */
48 public enum SupportType {
49 /** EPUB files created with this program */
50 EPUB,
51 /** Pure text file with some rules */
52 TEXT,
53 /** TEXT but with associated .info file */
54 INFO_TEXT,
55 /** My Little Pony fanfictions */
56 FIMFICTION,
57 /** Fanfictions from a lot of different universes */
58 FANFICTION,
59 /** Website with lots of Mangas */
60 MANGAFOX,
61 /** Furry website with comics support */
62 E621,
63 /** Furry website with stories */
64 YIFFSTAR,
65 /** CBZ files */
66 CBZ,
67 /** HTML files */
68 HTML;
69
70 /**
71 * A description of this support type (more information than the
72 * {@link BasicSupport#getSourceName()}).
73 *
74 * @return the description
75 */
76 public String getDesc() {
77 String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
78 this.name());
79
80 if (desc == null) {
81 desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
82 }
83
84 return desc;
85 }
86
87 /**
88 * The name of this support type (a short version).
89 *
90 * @return the name
91 */
92 public String getSourceName() {
93 BasicSupport support = BasicSupport.getSupport(this);
94 if (support != null) {
95 return support.getSourceName();
96 }
97
98 return null;
99 }
100
101 @Override
102 public String toString() {
103 return super.toString().toLowerCase();
104 }
105
106 /**
107 * Call {@link SupportType#valueOf(String.toUpperCase())}.
108 *
109 * @param typeName
110 * the possible type name
111 *
112 * @return NULL or the type
113 */
114 public static SupportType valueOfUC(String typeName) {
115 return SupportType.valueOf(typeName == null ? null : typeName
116 .toUpperCase());
117 }
118
119 /**
120 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
121 * NULL for NULL instead of raising exception.
122 *
123 * @param typeName
124 * the possible type name
125 *
126 * @return NULL or the type
127 */
128 public static SupportType valueOfNullOkUC(String typeName) {
129 if (typeName == null) {
130 return null;
131 }
132
133 return SupportType.valueOfUC(typeName);
134 }
135
136 /**
137 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
138 * NULL in case of error instead of raising an exception.
139 *
140 * @param typeName
141 * the possible type name
142 *
143 * @return NULL or the type
144 */
145 public static SupportType valueOfAllOkUC(String typeName) {
146 try {
147 return SupportType.valueOfUC(typeName);
148 } catch (Exception e) {
149 return null;
150 }
151 }
152 }
153
154 private InputStream in;
155 private SupportType type;
156 private URL currentReferer; // with only one 'r', as in 'HTTP'...
157
158 // quote chars
159 private char openQuote = Instance.getTrans().getChar(
160 StringId.OPEN_SINGLE_QUOTE);
161 private char closeQuote = Instance.getTrans().getChar(
162 StringId.CLOSE_SINGLE_QUOTE);
163 private char openDoubleQuote = Instance.getTrans().getChar(
164 StringId.OPEN_DOUBLE_QUOTE);
165 private char closeDoubleQuote = Instance.getTrans().getChar(
166 StringId.CLOSE_DOUBLE_QUOTE);
167
/**
 * The name of this support class.
 * <p>
 * This is the value exposed by {@link SupportType#getSourceName()} for
 * the corresponding type.
 *
 * @return the name
 */
protected abstract String getSourceName();
174
/**
 * Check if the given resource is supported by this {@link BasicSupport}.
 * <p>
 * Used by {@link BasicSupport#getSupport(URL)} to select the first
 * implementation that accepts the resource.
 *
 * @param url
 *            the resource to check for
 *
 * @return TRUE if it is
 */
protected abstract boolean supports(URL url);
184
/**
 * Return TRUE if the support will return HTML encoded content values for
 * the chapters content.
 * <p>
 * When TRUE, the content is split on HTML line-break tags and the HTML
 * is removed from each line before it is turned into paragraphs.
 *
 * @return TRUE for HTML
 */
protected abstract boolean isHtml();
192
/**
 * Return the metadata of the story.
 * <p>
 * The result is stored on the {@link Story} by
 * {@link BasicSupport#processMeta(URL, boolean, boolean)}, which also
 * fills in the creation date when it is missing or empty and a default
 * cover when there is none.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the metadata
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
        throws IOException;
195
/**
 * Return the story description.
 * <p>
 * The returned text is converted into chapter number 0 and stored as
 * the resume of the story metadata.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
        throws IOException;
211
/**
 * Return the list of chapters (name and resource).
 * <p>
 * Each entry maps the chapter name to the {@link URL} from which the
 * chapter content is opened. A NULL list is accepted by
 * {@link BasicSupport#process(URL, Progress)} and results in a story
 * without chapters.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the chapters
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
        InputStream in) throws IOException;
227
/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport#isHtml()} is TRUE).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input of the chapter: in
 *            {@link BasicSupport#process(URL, Progress)} this is the
 *            stream opened on the chapter {@link URL}, not the main
 *            resource
 * @param number
 *            the chapter number (numbering starts at 1 in
 *            {@link BasicSupport#process(URL, Progress)})
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
        int number) throws IOException;
246
/**
 * Log into the support (can be a no-op depending upon the support).
 * <p>
 * This default implementation does nothing; it is called first thing
 * when processing a story.
 *
 * @throws IOException
 *             in case of I/O error
 */
public void login() throws IOException {

}
256
/**
 * Return the list of cookies (values included) that must be used to
 * correctly fetch the resources.
 * <p>
 * You are expected to call the super method implementation if you override
 * it.
 * <p>
 * This default implementation returns a new, empty, mutable map on each
 * call.
 *
 * @return the cookies
 *
 * @throws IOException
 *             in case of I/O error
 */
public Map<String, String> getCookies() throws IOException {
    return new HashMap<String, String>();
}
272
/**
 * Return the canonical form of the main {@link URL}.
 * <p>
 * This default implementation returns the given {@link URL} unchanged.
 *
 * @param source
 *            the source {@link URL}
 *
 * @return the canonical form of this {@link URL}
 *
 * @throws IOException
 *             in case of I/O error
 */
public URL getCanonicalUrl(URL source) throws IOException {
    return source;
}
287
/**
 * Process the given story resource into a partially filled {@link Story}
 * object containing the name and metadata, except for the description.
 * <p>
 * The resources are closed once done (this delegates with
 * <tt>close = true</tt> and <tt>getDesc = false</tt>).
 *
 * @param url
 *            the story resource
 *
 * @return the {@link Story}, or NULL if the resource could not be opened
 *
 * @throws IOException
 *             in case of I/O error
 */
public Story processMeta(URL url) throws IOException {
    return processMeta(url, true, false);
}
303
304 /**
305 * Process the given story resource into a partially filled {@link Story}
306 * object containing the name and metadata.
307 *
308 * @param url
309 * the story resource
310 *
311 * @param close
312 * close "this" and "in" when done
313 *
314 * @return the {@link Story}
315 *
316 * @throws IOException
317 * in case of I/O error
318 */
319 protected Story processMeta(URL url, boolean close, boolean getDesc)
320 throws IOException {
321 login();
322
323 url = getCanonicalUrl(url);
324
325 setCurrentReferer(url);
326
327 in = openInput(url);
328 if (in == null) {
329 return null;
330 }
331
332 try {
333 preprocess(url, getInput());
334
335 Story story = new Story();
336 MetaData meta = getMeta(url, getInput());
337 if (meta.getCreationDate() == null
338 || meta.getCreationDate().isEmpty()) {
339 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
340 }
341 story.setMeta(meta);
342
343 if (meta != null && meta.getCover() == null) {
344 meta.setCover(getDefaultCover(meta.getSubject()));
345 }
346
347 if (getDesc) {
348 String descChapterName = Instance.getTrans().getString(
349 StringId.DESCRIPTION);
350 story.getMeta().setResume(
351 makeChapter(url, 0, descChapterName,
352 getDesc(url, getInput())));
353 }
354
355 return story;
356 } finally {
357 if (close) {
358 try {
359 close();
360 } catch (IOException e) {
361 Instance.syserr(e);
362 }
363
364 if (in != null) {
365 in.close();
366 }
367 }
368
369 setCurrentReferer(null);
370 }
371 }
372
373 /**
374 * Process the given story resource into a fully filled {@link Story}
375 * object.
376 *
377 * @param url
378 * the story resource
379 * @param pg
380 * the optional progress reporter
381 *
382 * @return the {@link Story}
383 *
384 * @throws IOException
385 * in case of I/O error
386 */
387 public Story process(URL url, Progress pg) throws IOException {
388 if (pg == null) {
389 pg = new Progress();
390 } else {
391 pg.setMinMax(0, 100);
392 }
393
394 url = getCanonicalUrl(url);
395 pg.setProgress(1);
396 try {
397 Story story = processMeta(url, false, true);
398 pg.setProgress(10);
399 if (story == null) {
400 pg.setProgress(100);
401 return null;
402 }
403
404 pg.setName("Retrieving " + story.getMeta().getTitle());
405
406 setCurrentReferer(url);
407
408 story.setChapters(new ArrayList<Chapter>());
409
410 List<Entry<String, URL>> chapters = getChapters(url, getInput());
411 pg.setProgress(20);
412
413 int i = 1;
414 if (chapters != null) {
415 Progress pgChaps = new Progress(0, chapters.size());
416 pg.addProgress(pgChaps, 80);
417
418 long words = 0;
419 for (Entry<String, URL> chap : chapters) {
420 setCurrentReferer(chap.getValue());
421 InputStream chapIn = Instance.getCache().open(
422 chap.getValue(), this, true);
423 try {
424 Chapter cc = makeChapter(url, i, chap.getKey(),
425 getChapterContent(url, chapIn, i));
426 words += cc.getWords();
427 story.getChapters().add(cc);
428 if (story.getMeta() != null) {
429 story.getMeta().setWords(words);
430 }
431 } finally {
432 chapIn.close();
433 }
434
435 pgChaps.setProgress(i++);
436 }
437 } else {
438 pg.setProgress(100);
439 }
440
441 return story;
442
443 } finally {
444 try {
445 close();
446 } catch (IOException e) {
447 Instance.syserr(e);
448 }
449
450 if (in != null) {
451 in.close();
452 }
453
454 setCurrentReferer(null);
455 }
456 }
457
/**
 * The support type.
 *
 * @return the type (NULL as long as
 *         {@link BasicSupport#setType(SupportType)} was not called)
 */
public SupportType getType() {
    return type;
}
466
/**
 * The current referer {@link URL} (only one 'r', as in 'HTTP'...), i.e.,
 * the current {@link URL} we work on.
 *
 * @return the referer, or NULL when no resource is being processed
 */
public URL getCurrentReferer() {
    return currentReferer;
}
476
/**
 * The current referer {@link URL} (only one 'r', as in 'HTTP'...), i.e.,
 * the current {@link URL} we work on.
 *
 * @param currentReferer
 *            the new referer (NULL to clear it)
 */
protected void setCurrentReferer(URL currentReferer) {
    this.currentReferer = currentReferer;
}
487
/**
 * The support type.
 *
 * @param type
 *            the new type
 *
 * @return this (so the call can be chained right after construction)
 */
protected BasicSupport setType(SupportType type) {
    this.type = type;
    return this;
}
500
/**
 * Prepare the support if needed before processing.
 * <p>
 * This default implementation does nothing; it is called once the main
 * input is opened and before the metadata is read.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @throws IOException
 *             on I/O error
 */
protected void preprocess(URL source, InputStream in) throws IOException {
}
514
/**
 * Now that we have processed the {@link Story}, close the resources if any.
 * <p>
 * This default implementation does nothing; the main input itself is
 * closed separately by the processing methods.
 *
 * @throws IOException
 *             on I/O error
 */
protected void close() throws IOException {
}
523
524 /**
525 * Create a {@link Chapter} object from the given information, formatting
526 * the content as it should be.
527 *
528 * @param number
529 * the chapter number
530 * @param name
531 * the chapter name
532 * @param content
533 * the chapter content
534 *
535 * @return the {@link Chapter}
536 *
537 * @throws IOException
538 * in case of I/O error
539 */
540 protected Chapter makeChapter(URL source, int number, String name,
541 String content) throws IOException {
542 // Chapter name: process it correctly, then remove the possible
543 // redundant "Chapter x: " in front of it
544 String chapterName = processPara(name).getContent().trim();
545 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
546 .split(",")) {
547 String chapterWord = Instance.getConfig().getStringX(
548 Config.CHAPTER, lang);
549 if (chapterName.startsWith(chapterWord)) {
550 chapterName = chapterName.substring(chapterWord.length())
551 .trim();
552 break;
553 }
554 }
555
556 if (chapterName.startsWith(Integer.toString(number))) {
557 chapterName = chapterName.substring(
558 Integer.toString(number).length()).trim();
559 }
560
561 if (chapterName.startsWith(":")) {
562 chapterName = chapterName.substring(1).trim();
563 }
564 //
565
566 Chapter chap = new Chapter(number, chapterName);
567
568 if (content != null) {
569 List<Paragraph> paras = makeParagraphs(source, content);
570 long words = 0;
571 for (Paragraph para : paras) {
572 words += para.getWords();
573 }
574 chap.setParagraphs(paras);
575 chap.setWords(words);
576 }
577
578 return chap;
579
580 }
581
582 /**
583 * Convert the given content into {@link Paragraph}s.
584 *
585 * @param source
586 * the source URL of the story
587 * @param content
588 * the textual content
589 *
590 * @return the {@link Paragraph}s
591 *
592 * @throws IOException
593 * in case of I/O error
594 */
595 protected List<Paragraph> makeParagraphs(URL source, String content)
596 throws IOException {
597 if (isHtml()) {
598 // Special <HR> processing:
599 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
600 "<br/>* * *<br/>");
601 }
602
603 List<Paragraph> paras = new ArrayList<Paragraph>();
604
605 if (content != null && !content.trim().isEmpty()) {
606 if (isHtml()) {
607 for (String line : content.split("(<p>|</p>|<br>|<br/>)")) {
608 paras.add(makeParagraph(source, line.trim()));
609 }
610 } else {
611 BufferedReader buff = null;
612 try {
613 buff = new BufferedReader(
614 new InputStreamReader(new ByteArrayInputStream(
615 content.getBytes("UTF-8")), "UTF-8"));
616 for (String line = buff.readLine(); line != null; line = buff
617 .readLine()) {
618 paras.add(makeParagraph(source, line.trim()));
619 }
620 } finally {
621 if (buff != null) {
622 buff.close();
623 }
624 }
625 }
626
627 // Check quotes for "bad" format
628 List<Paragraph> newParas = new ArrayList<Paragraph>();
629 for (Paragraph para : paras) {
630 newParas.addAll(requotify(para));
631 }
632 paras = newParas;
633
634 // Remove double blanks/brks
635 fixBlanksBreaks(paras);
636 }
637
638 return paras;
639 }
640
641 /**
642 * Convert the given line into a single {@link Paragraph}.
643 *
644 * @param source
645 * the source URL of the story
646 * @param line
647 * the textual content of the paragraph
648 *
649 * @return the {@link Paragraph}
650 */
651 private Paragraph makeParagraph(URL source, String line) {
652 URL image = null;
653 if (line.startsWith("[") && line.endsWith("]")) {
654 image = getImageUrl(this, source,
655 line.substring(1, line.length() - 1).trim());
656 }
657
658 if (image != null) {
659 return new Paragraph(image);
660 } else {
661 return processPara(line);
662 }
663 }
664
665 /**
666 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
667 * those {@link Paragraph}s.
668 * <p>
669 * The resulting list will not contain a starting or trailing blank/break
670 * nor 2 blanks or breaks following each other.
671 *
672 * @param paras
673 * the list of {@link Paragraph}s to fix
674 */
675 protected void fixBlanksBreaks(List<Paragraph> paras) {
676 boolean space = false;
677 boolean brk = true;
678 for (int i = 0; i < paras.size(); i++) {
679 Paragraph para = paras.get(i);
680 boolean thisSpace = para.getType() == ParagraphType.BLANK;
681 boolean thisBrk = para.getType() == ParagraphType.BREAK;
682
683 if (i > 0 && space && thisBrk) {
684 paras.remove(i - 1);
685 i--;
686 } else if ((space || brk) && (thisSpace || thisBrk)) {
687 paras.remove(i);
688 i--;
689 }
690
691 space = thisSpace;
692 brk = thisBrk;
693 }
694
695 // Remove blank/brk at start
696 if (paras.size() > 0
697 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
698 0).getType() == ParagraphType.BREAK)) {
699 paras.remove(0);
700 }
701
702 // Remove blank/brk at end
703 int last = paras.size() - 1;
704 if (paras.size() > 0
705 && (paras.get(last).getType() == ParagraphType.BLANK || paras
706 .get(last).getType() == ParagraphType.BREAK)) {
707 paras.remove(last);
708 }
709 }
710
711 /**
712 * Get the default cover related to this subject (see <tt>.info</tt> files).
713 *
714 * @param subject
715 * the subject
716 *
717 * @return the cover if any, or NULL
718 */
719 static BufferedImage getDefaultCover(String subject) {
720 if (subject != null && !subject.isEmpty()
721 && Instance.getCoverDir() != null) {
722 try {
723 File fileCover = new File(Instance.getCoverDir(), subject);
724 return getImage(null, fileCover.toURI().toURL(), subject);
725 } catch (MalformedURLException e) {
726 }
727 }
728
729 return null;
730 }
731
732 /**
733 * Return the list of supported image extensions.
734 *
735 * @param emptyAllowed
736 * TRUE to allow an empty extension on first place, which can be
737 * used when you may already have an extension in your input but
738 * are not sure about it
739 *
740 * @return the extensions
741 */
742 static String[] getImageExt(boolean emptyAllowed) {
743 if (emptyAllowed) {
744 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
745 } else {
746 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
747 }
748 }
749
750 /**
751 * Check if the given resource can be a local image or a remote image, then
752 * refresh the cache with it if it is.
753 *
754 * @param source
755 * the story source
756 * @param line
757 * the resource to check
758 *
759 * @return the image if found, or NULL
760 *
761 */
762 static BufferedImage getImage(BasicSupport support, URL source, String line) {
763 URL url = getImageUrl(support, source, line);
764 if (url != null) {
765 InputStream in = null;
766 try {
767 in = Instance.getCache().open(url, getSupport(url), true);
768 return IOUtils.toImage(in);
769 } catch (IOException e) {
770 } finally {
771 if (in != null) {
772 try {
773 in.close();
774 } catch (IOException e) {
775 }
776 }
777 }
778 }
779
780 return null;
781 }
782
783 /**
784 * Check if the given resource can be a local image or a remote image, then
785 * refresh the cache with it if it is.
786 *
787 * @param source
788 * the story source
789 * @param line
790 * the resource to check
791 *
792 * @return the image URL if found, or NULL
793 *
794 */
795 static URL getImageUrl(BasicSupport support, URL source, String line) {
796 URL url = null;
797
798 if (line != null) {
799 // try for files
800 String path = null;
801 if (source != null) {
802 path = new File(source.getFile()).getParent();
803 try {
804 String basePath = new File(new File(path), line.trim())
805 .getAbsolutePath();
806 for (String ext : getImageExt(true)) {
807 if (new File(basePath + ext).exists()) {
808 url = new File(basePath + ext).toURI().toURL();
809 }
810 }
811 } catch (Exception e) {
812 // Nothing to do here
813 }
814 }
815
816 if (url == null) {
817 // try for URLs
818 try {
819 for (String ext : getImageExt(true)) {
820 if (Instance.getCache().check(new URL(line + ext))) {
821 url = new URL(line + ext);
822 break;
823 }
824 }
825
826 // try out of cache
827 if (url == null) {
828 for (String ext : getImageExt(true)) {
829 try {
830 url = new URL(line + ext);
831 Instance.getCache().refresh(url, support, true);
832 break;
833 } catch (IOException e) {
834 // no image with this ext
835 url = null;
836 }
837 }
838 }
839 } catch (MalformedURLException e) {
840 // Not an url
841 }
842 }
843
844 // refresh the cached file
845 if (url != null) {
846 try {
847 Instance.getCache().refresh(url, support, true);
848 } catch (IOException e) {
849 // woops, broken image
850 url = null;
851 }
852 }
853 }
854
855 return url;
856 }
857
/**
 * Open the input file that will be used through the support.
 * <p>
 * This default implementation opens the source through the cache.
 *
 * @param source
 *            the source {@link URL}
 *
 * @return the {@link InputStream} (a NULL result makes the processing
 *         methods return NULL)
 *
 * @throws IOException
 *             in case of I/O error
 */
protected InputStream openInput(URL source) throws IOException {
    return Instance.getCache().open(source, this, false);
}
872
/**
 * Reset the given {@link InputStream} and return it.
 * <p>
 * A failure to reset is silently ignored: the stream is returned as it
 * is.
 *
 * @param in
 *            the {@link InputStream} to reset (must not be NULL)
 *
 * @return the same {@link InputStream} after reset
 */
protected InputStream reset(InputStream in) {
    try {
        in.reset();
    } catch (IOException e) {
        // deliberately ignored, see above
    }
    return in;
}
888
/**
 * Reset then return {@link BasicSupport#in}.
 * <p>
 * Must only be called once the main input was opened.
 *
 * @return {@link BasicSupport#in}
 */
protected InputStream getInput() {
    return reset(in);
}
897
898 /**
899 * Fix the author name if it is prefixed with some "by" {@link String}.
900 *
901 * @param author
902 * the author with a possible prefix
903 *
904 * @return the author without prefixes
905 */
906 protected String fixAuthor(String author) {
907 if (author != null) {
908 for (String suffix : new String[] { " ", ":" }) {
909 for (String byString : Instance.getConfig()
910 .getString(Config.BYS).split(",")) {
911 byString += suffix;
912 if (author.toUpperCase().startsWith(byString.toUpperCase())) {
913 author = author.substring(byString.length()).trim();
914 }
915 }
916 }
917
918 // Special case (without suffix):
919 if (author.startsWith("©")) {
920 author = author.substring(1);
921 }
922 }
923
924 return author;
925 }
926
927 /**
928 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
929 * and requotify them (i.e., separate them into QUOTE paragraphs and other
930 * paragraphs (quotes or not)).
931 *
932 * @param para
933 * the paragraph to requotify (not necessarily a quote)
934 *
935 * @return the correctly (or so we hope) quotified paragraphs
936 */
937 protected List<Paragraph> requotify(Paragraph para) {
938 List<Paragraph> newParas = new ArrayList<Paragraph>();
939
940 if (para.getType() == ParagraphType.QUOTE
941 && para.getContent().length() > 2) {
942 String line = para.getContent();
943 boolean singleQ = line.startsWith("" + openQuote);
944 boolean doubleQ = line.startsWith("" + openDoubleQuote);
945
946 // Do not try when more than one quote at a time
947 // (some stories are not easily readable if we do)
948 if (singleQ
949 && line.indexOf(closeQuote, 1) < line
950 .lastIndexOf(closeQuote)) {
951 newParas.add(para);
952 return newParas;
953 }
954 if (doubleQ
955 && line.indexOf(closeDoubleQuote, 1) < line
956 .lastIndexOf(closeDoubleQuote)) {
957 newParas.add(para);
958 return newParas;
959 }
960 //
961
962 if (!singleQ && !doubleQ) {
963 line = openDoubleQuote + line + closeDoubleQuote;
964 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
965 .getWords()));
966 } else {
967 char open = singleQ ? openQuote : openDoubleQuote;
968 char close = singleQ ? closeQuote : closeDoubleQuote;
969
970 int posDot = -1;
971 boolean inQuote = false;
972 int i = 0;
973 for (char car : line.toCharArray()) {
974 if (car == open) {
975 inQuote = true;
976 } else if (car == close) {
977 inQuote = false;
978 } else if (car == '.' && !inQuote) {
979 posDot = i;
980 break;
981 }
982 i++;
983 }
984
985 if (posDot >= 0) {
986 String rest = line.substring(posDot + 1).trim();
987 line = line.substring(0, posDot + 1).trim();
988 long words = 1;
989 for (char car : line.toCharArray()) {
990 if (car == ' ') {
991 words++;
992 }
993 }
994 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
995 if (!rest.isEmpty()) {
996 newParas.addAll(requotify(processPara(rest)));
997 }
998 } else {
999 newParas.add(para);
1000 }
1001 }
1002 } else {
1003 newParas.add(para);
1004 }
1005
1006 return newParas;
1007 }
1008
1009 /**
1010 * Process a {@link Paragraph} from a raw line of text.
1011 * <p>
1012 * Will also fix quotes and HTML encoding if needed.
1013 *
1014 * @param line
1015 * the raw line
1016 *
1017 * @return the processed {@link Paragraph}
1018 */
1019 protected Paragraph processPara(String line) {
1020 line = ifUnhtml(line).trim();
1021
1022 boolean space = true;
1023 boolean brk = true;
1024 boolean quote = false;
1025 boolean tentativeCloseQuote = false;
1026 char prev = '\0';
1027 int dashCount = 0;
1028 long words = 1;
1029
1030 StringBuilder builder = new StringBuilder();
1031 for (char car : line.toCharArray()) {
1032 if (car != '-') {
1033 if (dashCount > 0) {
1034 // dash, ndash and mdash: - – —
1035 // currently: always use mdash
1036 builder.append(dashCount == 1 ? '-' : '—');
1037 }
1038 dashCount = 0;
1039 }
1040
1041 if (tentativeCloseQuote) {
1042 tentativeCloseQuote = false;
1043 if (Character.isLetterOrDigit(car)) {
1044 builder.append("'");
1045 } else {
1046 // handle double-single quotes as double quotes
1047 if (prev == car) {
1048 builder.append(closeDoubleQuote);
1049 continue;
1050 } else {
1051 builder.append(closeQuote);
1052 }
1053 }
1054 }
1055
1056 switch (car) {
1057 case ' ': // note: unbreakable space
1058 case ' ':
1059 case '\t':
1060 case '\n': // just in case
1061 case '\r': // just in case
1062 if (builder.length() > 0
1063 && builder.charAt(builder.length() - 1) != ' ') {
1064 words++;
1065 }
1066 builder.append(' ');
1067 break;
1068
1069 case '\'':
1070 if (space || (brk && quote)) {
1071 quote = true;
1072 // handle double-single quotes as double quotes
1073 if (prev == car) {
1074 builder.deleteCharAt(builder.length() - 1);
1075 builder.append(openDoubleQuote);
1076 } else {
1077 builder.append(openQuote);
1078 }
1079 } else if (prev == ' ' || prev == car) {
1080 // handle double-single quotes as double quotes
1081 if (prev == car) {
1082 builder.deleteCharAt(builder.length() - 1);
1083 builder.append(openDoubleQuote);
1084 } else {
1085 builder.append(openQuote);
1086 }
1087 } else {
1088 // it is a quote ("I'm off") or a 'quote' ("This
1089 // 'good' restaurant"...)
1090 tentativeCloseQuote = true;
1091 }
1092 break;
1093
1094 case '"':
1095 if (space || (brk && quote)) {
1096 quote = true;
1097 builder.append(openDoubleQuote);
1098 } else if (prev == ' ') {
1099 builder.append(openDoubleQuote);
1100 } else {
1101 builder.append(closeDoubleQuote);
1102 }
1103 break;
1104
1105 case '-':
1106 if (space) {
1107 quote = true;
1108 } else {
1109 dashCount++;
1110 }
1111 space = false;
1112 break;
1113
1114 case '*':
1115 case '~':
1116 case '/':
1117 case '\\':
1118 case '<':
1119 case '>':
1120 case '=':
1121 case '+':
1122 case '_':
1123 case '–':
1124 case '—':
1125 space = false;
1126 builder.append(car);
1127 break;
1128
1129 case '‘':
1130 case '`':
1131 case '‹':
1132 case '﹁':
1133 case '〈':
1134 case '「':
1135 if (space || (brk && quote)) {
1136 quote = true;
1137 builder.append(openQuote);
1138 } else {
1139 // handle double-single quotes as double quotes
1140 if (prev == car) {
1141 builder.deleteCharAt(builder.length() - 1);
1142 builder.append(openDoubleQuote);
1143 } else {
1144 builder.append(openQuote);
1145 }
1146 }
1147 space = false;
1148 brk = false;
1149 break;
1150
1151 case '’':
1152 case '›':
1153 case '﹂':
1154 case '〉':
1155 case '」':
1156 space = false;
1157 brk = false;
1158 // handle double-single quotes as double quotes
1159 if (prev == car) {
1160 builder.deleteCharAt(builder.length() - 1);
1161 builder.append(closeDoubleQuote);
1162 } else {
1163 builder.append(closeQuote);
1164 }
1165 break;
1166
1167 case '«':
1168 case '“':
1169 case '﹃':
1170 case '《':
1171 case '『':
1172 if (space || (brk && quote)) {
1173 quote = true;
1174 builder.append(openDoubleQuote);
1175 } else {
1176 builder.append(openDoubleQuote);
1177 }
1178 space = false;
1179 brk = false;
1180 break;
1181
1182 case '»':
1183 case '”':
1184 case '﹄':
1185 case '》':
1186 case '』':
1187 space = false;
1188 brk = false;
1189 builder.append(closeDoubleQuote);
1190 break;
1191
1192 default:
1193 space = false;
1194 brk = false;
1195 builder.append(car);
1196 break;
1197 }
1198
1199 prev = car;
1200 }
1201
1202 if (tentativeCloseQuote) {
1203 tentativeCloseQuote = false;
1204 builder.append(closeQuote);
1205 }
1206
1207 line = builder.toString().trim();
1208
1209 ParagraphType type = ParagraphType.NORMAL;
1210 if (space) {
1211 type = ParagraphType.BLANK;
1212 } else if (brk) {
1213 type = ParagraphType.BREAK;
1214 } else if (quote) {
1215 type = ParagraphType.QUOTE;
1216 }
1217
1218 return new Paragraph(type, line, words);
1219 }
1220
1221 /**
1222 * Remove the HTML from the input <b>if</b> {@link BasicSupport#isHtml()} is
1223 * true.
1224 *
1225 * @param input
1226 * the input
1227 *
1228 * @return the no html version if needed
1229 */
1230 private String ifUnhtml(String input) {
1231 if (isHtml() && input != null) {
1232 return StringUtils.unhtml(input);
1233 }
1234
1235 return input;
1236 }
1237
1238 /**
1239 * Return a {@link BasicSupport} implementation supporting the given
1240 * resource if possible.
1241 *
1242 * @param url
1243 * the story resource
1244 *
1245 * @return an implementation that supports it, or NULL
1246 */
1247 public static BasicSupport getSupport(URL url) {
1248 if (url == null) {
1249 return null;
1250 }
1251
1252 // TEXT and INFO_TEXT always support files (not URLs though)
1253 for (SupportType type : SupportType.values()) {
1254 if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
1255 BasicSupport support = getSupport(type);
1256 if (support != null && support.supports(url)) {
1257 return support;
1258 }
1259 }
1260 }
1261
1262 for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
1263 SupportType.TEXT }) {
1264 BasicSupport support = getSupport(type);
1265 if (support != null && support.supports(url)) {
1266 return support;
1267 }
1268 }
1269
1270 return null;
1271 }
1272
1273 /**
1274 * Return a {@link BasicSupport} implementation supporting the given type.
1275 *
1276 * @param type
1277 * the type
1278 *
1279 * @return an implementation that supports it, or NULL
1280 */
1281 public static BasicSupport getSupport(SupportType type) {
1282 switch (type) {
1283 case EPUB:
1284 return new Epub().setType(type);
1285 case INFO_TEXT:
1286 return new InfoText().setType(type);
1287 case FIMFICTION:
1288 return new Fimfiction().setType(type);
1289 case FANFICTION:
1290 return new Fanfiction().setType(type);
1291 case TEXT:
1292 return new Text().setType(type);
1293 case MANGAFOX:
1294 return new MangaFox().setType(type);
1295 case E621:
1296 return new E621().setType(type);
1297 case YIFFSTAR:
1298 return new YiffStar().setType(type);
1299 case CBZ:
1300 return new Cbz().setType(type);
1301 case HTML:
1302 return new Html().setType(type);
1303 }
1304
1305 return null;
1306 }
1307
/**
 * Return the first line from the given input which correspond to the given
 * selectors.
 * <p>
 * The input is reset before being read.
 *
 * @param in
 *            the input
 * @param needle
 *            a string that must be found inside the target line (also
 *            supports "^" at start to say "only if it starts with" the
 *            needle)
 * @param relativeLine
 *            the line to return based upon the target line position (-1 =
 *            the line before, 0 = the target line...)
 *
 * @return the line, or NULL if not found
 */
static String getLine(InputStream in, String needle, int relativeLine) {
    return getLine(in, needle, relativeLine, true);
}
1327
1328 /**
1329 * Return a line from the given input which correspond to the given
1330 * selectors.
1331 *
1332 * @param in
1333 * the input
1334 * @param needle
1335 * a string that must be found inside the target line (also
1336 * supports "^" at start to say "only if it starts with" the
1337 * needle)
1338 * @param relativeLine
1339 * the line to return based upon the target line position (-1 =
1340 * the line before, 0 = the target line...)
1341 * @param first
1342 * takes the first result (as opposed to the last one, which will
1343 * also always spend the input)
1344 *
1345 * @return the line
1346 */
1347 static String getLine(InputStream in, String needle, int relativeLine,
1348 boolean first) {
1349 String rep = null;
1350
1351 try {
1352 in.reset();
1353 } catch (IOException e) {
1354 Instance.syserr(e);
1355 }
1356
1357 List<String> lines = new ArrayList<String>();
1358 @SuppressWarnings("resource")
1359 Scanner scan = new Scanner(in, "UTF-8");
1360 int index = -1;
1361 scan.useDelimiter("\\n");
1362 while (scan.hasNext()) {
1363 lines.add(scan.next());
1364
1365 if (index == -1) {
1366 if (needle.startsWith("^")) {
1367 if (lines.get(lines.size() - 1).startsWith(
1368 needle.substring(1))) {
1369 index = lines.size() - 1;
1370 }
1371
1372 } else {
1373 if (lines.get(lines.size() - 1).contains(needle)) {
1374 index = lines.size() - 1;
1375 }
1376 }
1377 }
1378
1379 if (index >= 0 && index + relativeLine < lines.size()) {
1380 rep = lines.get(index + relativeLine);
1381 if (first) {
1382 break;
1383 }
1384 }
1385 }
1386
1387 return rep;
1388 }
1389 }