Improve importing progress reporting
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport.java
1 package be.nikiroo.fanfix.supported;
2
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import be.nikiroo.fanfix.Instance;
import be.nikiroo.fanfix.bundles.Config;
import be.nikiroo.fanfix.bundles.StringId;
import be.nikiroo.fanfix.data.Chapter;
import be.nikiroo.fanfix.data.MetaData;
import be.nikiroo.fanfix.data.Paragraph;
import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
import be.nikiroo.fanfix.data.Story;
import be.nikiroo.utils.IOUtils;
import be.nikiroo.utils.Progress;
import be.nikiroo.utils.StringUtils;
31
/**
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
 * <p>
 * It will be used with 'resources' (usually web pages or files).
 *
 * @author niki
 */
41 public abstract class BasicSupport {
42 /**
43 * The supported input types for which we can get a {@link BasicSupport}
44 * object.
45 *
46 * @author niki
47 */
48 public enum SupportType {
49 /** EPUB files created with this program */
50 EPUB,
51 /** Pure text file with some rules */
52 TEXT,
53 /** TEXT but with associated .info file */
54 INFO_TEXT,
55 /** My Little Pony fanfictions */
56 FIMFICTION,
57 /** Fanfictions from a lot of different universes */
58 FANFICTION,
59 /** Website with lots of Mangas */
60 MANGAFOX,
61 /** Furry website with comics support */
62 E621,
63 /** Furry website with stories */
64 YIFFSTAR,
65 /** CBZ files */
66 CBZ,
67 /** HTML files */
68 HTML;
69
70 /**
71 * A description of this support type (more information than the
72 * {@link BasicSupport#getSourceName()}).
73 *
74 * @return the description
75 */
76 public String getDesc() {
77 String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
78 this.name());
79
80 if (desc == null) {
81 desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
82 }
83
84 return desc;
85 }
86
87 /**
88 * The name of this support type (a short version).
89 *
90 * @return the name
91 */
92 public String getSourceName() {
93 BasicSupport support = BasicSupport.getSupport(this);
94 if (support != null) {
95 return support.getSourceName();
96 }
97
98 return null;
99 }
100
101 @Override
102 public String toString() {
103 return super.toString().toLowerCase();
104 }
105
106 /**
107 * Call {@link SupportType#valueOf(String.toUpperCase())}.
108 *
109 * @param typeName
110 * the possible type name
111 *
112 * @return NULL or the type
113 */
114 public static SupportType valueOfUC(String typeName) {
115 return SupportType.valueOf(typeName == null ? null : typeName
116 .toUpperCase());
117 }
118
119 /**
120 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
121 * NULL for NULL instead of raising exception.
122 *
123 * @param typeName
124 * the possible type name
125 *
126 * @return NULL or the type
127 */
128 public static SupportType valueOfNullOkUC(String typeName) {
129 if (typeName == null) {
130 return null;
131 }
132
133 return SupportType.valueOfUC(typeName);
134 }
135
136 /**
137 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
138 * NULL in case of error instead of raising an exception.
139 *
140 * @param typeName
141 * the possible type name
142 *
143 * @return NULL or the type
144 */
145 public static SupportType valueOfAllOkUC(String typeName) {
146 try {
147 return SupportType.valueOfUC(typeName);
148 } catch (Exception e) {
149 return null;
150 }
151 }
152 }
153
// The main input of the resource being processed: assigned by
// processMeta(...) from openInput(...), handed out (after a reset) by
// getInput() and closed at the end of the processing.
private InputStream in;
// The type of this support, as given to setType(...).
private SupportType type;
private URL currentReferer; // with only one 'r', as in 'HTTP'...

// quote chars (taken from the translation bundle), used when normalising
// the quotes of the paragraphs
private char openQuote = Instance.getTrans().getChar(
        StringId.OPEN_SINGLE_QUOTE);
private char closeQuote = Instance.getTrans().getChar(
        StringId.CLOSE_SINGLE_QUOTE);
private char openDoubleQuote = Instance.getTrans().getChar(
        StringId.OPEN_DOUBLE_QUOTE);
private char closeDoubleQuote = Instance.getTrans().getChar(
        StringId.CLOSE_DOUBLE_QUOTE);
167
/**
 * The name of this support class (this is the short name reported by
 * {@link SupportType#getSourceName()}).
 *
 * @return the name
 */
protected abstract String getSourceName();
174
/**
 * Check if the given resource is supported by this {@link BasicSupport}.
 * <p>
 * {@link BasicSupport#getSupport(URL)} only calls it with a non-NULL
 * {@link URL}.
 *
 * @param url
 *            the resource to check for
 *
 * @return TRUE if it is
 */
protected abstract boolean supports(URL url);
184
/**
 * Return TRUE if the support will return HTML encoded content values for
 * the chapters content.
 * <p>
 * When TRUE, the content is split on HTML paragraph/line-break tags and
 * un-HTML-ised before being converted into {@link Paragraph}s.
 *
 * @return TRUE for HTML
 */
protected abstract boolean isHtml();
192
/**
 * Return the metadata of the story.
 * <p>
 * The creation date and the cover are filled in with default values by the
 * caller when they are missing.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the metadata
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
        throws IOException;
195
/**
 * Return the story description.
 * <p>
 * The description is converted into a {@link Chapter} (number 0) and
 * stored as the resume of the story; a NULL description results in a
 * {@link Chapter} without paragraphs.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
        throws IOException;
211
/**
 * Return the list of chapters (name and resource).
 * <p>
 * For each entry, the key is the chapter name and the value is the
 * {@link URL} of the chapter resource (it will be opened through the
 * cache).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 * @param pg
 *            the optional progress reporter
 *
 * @return the chapters, or NULL if there are none (NULL is accepted by
 *         {@link BasicSupport#process(URL, Progress)})
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
        InputStream in, Progress pg) throws IOException;
229
/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport#isHtml()} is TRUE).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input opened on the chapter resource (i.e., on the
 *            {@link URL} returned for this chapter by
 *            {@link BasicSupport#getChapters(URL, InputStream, Progress)})
 * @param number
 *            the chapter number (the first chapter is number 1)
 * @param pg
 *            the optional progress reporter
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
        int number, Progress pg) throws IOException;
250
251 /**
252 * Log into the support (can be a no-op depending upon the support).
253 *
254 * @throws IOException
255 * in case of I/O error
256 */
257 public void login() throws IOException {
258
259 }
260
261 /**
262 * Return the list of cookies (values included) that must be used to
263 * correctly fetch the resources.
264 * <p>
265 * You are expected to call the super method implementation if you override
266 * it.
267 *
268 * @return the cookies
269 *
270 * @throws IOException
271 * in case of I/O error
272 */
273 public Map<String, String> getCookies() throws IOException {
274 return new HashMap<String, String>();
275 }
276
277 /**
278 * Return the canonical form of the main {@link URL}.
279 *
280 * @param source
281 * the source {@link URL}
282 *
283 * @return the canonical form of this {@link URL}
284 *
285 * @throws IOException
286 * in case of I/O error
287 */
288 public URL getCanonicalUrl(URL source) throws IOException {
289 return source;
290 }
291
292 /**
293 * Process the given story resource into a partially filled {@link Story}
294 * object containing the name and metadata, except for the description.
295 *
296 * @param url
297 * the story resource
298 *
299 * @return the {@link Story}
300 *
301 * @throws IOException
302 * in case of I/O error
303 */
304 public Story processMeta(URL url) throws IOException {
305 return processMeta(url, true, false, null);
306 }
307
308 /**
309 * Process the given story resource into a partially filled {@link Story}
310 * object containing the name and metadata.
311 *
312 * @param url
313 * the story resource
314 *
315 * @param close
316 * close "this" and "in" when done
317 * @param pg
318 * the optional progress reporter
319 *
320 * @return the {@link Story}
321 *
322 * @throws IOException
323 * in case of I/O error
324 */
325 protected Story processMeta(URL url, boolean close, boolean getDesc,
326 Progress pg) throws IOException {
327 if (pg == null) {
328 pg = new Progress();
329 } else {
330 pg.setMinMax(0, 100);
331 }
332
333 login();
334 pg.setProgress(10);
335
336 url = getCanonicalUrl(url);
337
338 setCurrentReferer(url);
339
340 in = openInput(url);
341 if (in == null) {
342 return null;
343 }
344
345 try {
346 preprocess(url, getInput());
347 pg.setProgress(30);
348
349 Story story = new Story();
350 MetaData meta = getMeta(url, getInput());
351 if (meta.getCreationDate() == null
352 || meta.getCreationDate().isEmpty()) {
353 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
354 }
355 story.setMeta(meta);
356
357 pg.setProgress(50);
358
359 if (meta != null && meta.getCover() == null) {
360 meta.setCover(getDefaultCover(meta.getSubject()));
361 }
362
363 pg.setProgress(60);
364
365 if (getDesc) {
366 String descChapterName = Instance.getTrans().getString(
367 StringId.DESCRIPTION);
368 story.getMeta().setResume(
369 makeChapter(url, 0, descChapterName,
370 getDesc(url, getInput()), null));
371 }
372
373 pg.setProgress(100);
374 return story;
375 } finally {
376 if (close) {
377 try {
378 close();
379 } catch (IOException e) {
380 Instance.syserr(e);
381 }
382
383 if (in != null) {
384 in.close();
385 }
386 }
387
388 setCurrentReferer(null);
389 }
390 }
391
392 /**
393 * Process the given story resource into a fully filled {@link Story}
394 * object.
395 *
396 * @param url
397 * the story resource
398 * @param pg
399 * the optional progress reporter
400 *
401 * @return the {@link Story}
402 *
403 * @throws IOException
404 * in case of I/O error
405 */
406 public Story process(URL url, Progress pg) throws IOException {
407 if (pg == null) {
408 pg = new Progress();
409 } else {
410 pg.setMinMax(0, 100);
411 }
412
413 url = getCanonicalUrl(url);
414 pg.setProgress(1);
415 try {
416 Progress pgMeta = new Progress();
417 pg.addProgress(pgMeta, 10);
418 Story story = processMeta(url, false, true, pgMeta);
419 if (!pgMeta.isDone()) {
420 pgMeta.setProgress(pgMeta.getMax()); // 10%
421 }
422
423 if (story == null) {
424 pg.setProgress(90);
425 return null;
426 }
427
428 pg.setName("Retrieving " + story.getMeta().getTitle());
429
430 setCurrentReferer(url);
431
432 Progress pgGetChapters = new Progress();
433 pg.addProgress(pgGetChapters, 10);
434 story.setChapters(new ArrayList<Chapter>());
435 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
436 pgGetChapters);
437 if (!pgGetChapters.isDone()) {
438 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
439 }
440
441 if (chapters != null) {
442 Progress pgChaps = new Progress("Extracting chapters", 0,
443 chapters.size() * 300);
444 pg.addProgress(pgChaps, 80);
445
446 long words = 0;
447 int i = 1;
448 for (Entry<String, URL> chap : chapters) {
449 pgChaps.setName("Extracting chapter " + i);
450 setCurrentReferer(chap.getValue());
451 InputStream chapIn = Instance.getCache().open(
452 chap.getValue(), this, true);
453 pgChaps.setProgress(i * 100);
454 try {
455 Progress pgGetChapterContent = new Progress();
456 Progress pgMakeChapter = new Progress();
457 pgChaps.addProgress(pgGetChapterContent, 100);
458 pgChaps.addProgress(pgMakeChapter, 100);
459
460 String content = getChapterContent(url, chapIn, i,
461 pgGetChapterContent);
462 if (!pgGetChapterContent.isDone()) {
463 pgGetChapterContent.setProgress(pgGetChapterContent
464 .getMax());
465 }
466
467 Chapter cc = makeChapter(url, i, chap.getKey(),
468 content, pgMakeChapter);
469 if (!pgMakeChapter.isDone()) {
470 pgMakeChapter.setProgress(pgMakeChapter.getMax());
471 }
472
473 words += cc.getWords();
474 story.getChapters().add(cc);
475 if (story.getMeta() != null) {
476 story.getMeta().setWords(words);
477 }
478 } finally {
479 chapIn.close();
480 }
481
482 i++;
483 }
484
485 pgChaps.setName("Extracting chapters");
486 } else {
487 pg.setProgress(80);
488 }
489
490 return story;
491
492 } finally {
493 try {
494 close();
495 } catch (IOException e) {
496 Instance.syserr(e);
497 }
498
499 if (in != null) {
500 in.close();
501 }
502
503 setCurrentReferer(null);
504 }
505 }
506
507 /**
508 * The support type.
509 *
510 * @return the type
511 */
512 public SupportType getType() {
513 return type;
514 }
515
516 /**
517 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
518 * the current {@link URL} we work on.
519 *
520 * @return the referer
521 */
522 public URL getCurrentReferer() {
523 return currentReferer;
524 }
525
526 /**
527 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
528 * the current {@link URL} we work on.
529 *
530 * @param currentReferer
531 * the new referer
532 */
533 protected void setCurrentReferer(URL currentReferer) {
534 this.currentReferer = currentReferer;
535 }
536
537 /**
538 * The support type.
539 *
540 * @param type
541 * the new type
542 *
543 * @return this
544 */
545 protected BasicSupport setType(SupportType type) {
546 this.type = type;
547 return this;
548 }
549
550 /**
551 * Prepare the support if needed before processing.
552 *
553 * @param source
554 * the source of the story
555 * @param in
556 * the input (the main resource)
557 *
558 * @throws IOException
559 * on I/O error
560 */
561 protected void preprocess(URL source, InputStream in) throws IOException {
562 }
563
564 /**
565 * Now that we have processed the {@link Story}, close the resources if any.
566 *
567 * @throws IOException
568 * on I/O error
569 */
570 protected void close() throws IOException {
571 }
572
573 /**
574 * Create a {@link Chapter} object from the given information, formatting
575 * the content as it should be.
576 *
577 * @param number
578 * the chapter number
579 * @param name
580 * the chapter name
581 * @param content
582 * the chapter content
583 * @param pg
584 * the optional progress reporter
585 *
586 * @return the {@link Chapter}
587 *
588 * @throws IOException
589 * in case of I/O error
590 */
591 protected Chapter makeChapter(URL source, int number, String name,
592 String content, Progress pg) throws IOException {
593 // Chapter name: process it correctly, then remove the possible
594 // redundant "Chapter x: " in front of it
595 String chapterName = processPara(name).getContent().trim();
596 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
597 .split(",")) {
598 String chapterWord = Instance.getConfig().getStringX(
599 Config.CHAPTER, lang);
600 if (chapterName.startsWith(chapterWord)) {
601 chapterName = chapterName.substring(chapterWord.length())
602 .trim();
603 break;
604 }
605 }
606
607 if (chapterName.startsWith(Integer.toString(number))) {
608 chapterName = chapterName.substring(
609 Integer.toString(number).length()).trim();
610 }
611
612 if (chapterName.startsWith(":")) {
613 chapterName = chapterName.substring(1).trim();
614 }
615 //
616
617 Chapter chap = new Chapter(number, chapterName);
618
619 if (content != null) {
620 List<Paragraph> paras = makeParagraphs(source, content, pg);
621 long words = 0;
622 for (Paragraph para : paras) {
623 words += para.getWords();
624 }
625 chap.setParagraphs(paras);
626 chap.setWords(words);
627 }
628
629 return chap;
630
631 }
632
633 /**
634 * Convert the given content into {@link Paragraph}s.
635 *
636 * @param source
637 * the source URL of the story
638 * @param content
639 * the textual content
640 * @param pg
641 * the optional progress reporter
642 *
643 * @return the {@link Paragraph}s
644 *
645 * @throws IOException
646 * in case of I/O error
647 */
648 protected List<Paragraph> makeParagraphs(URL source, String content,
649 Progress pg) throws IOException {
650 if (pg == null) {
651 pg = new Progress();
652 }
653
654 if (isHtml()) {
655 // Special <HR> processing:
656 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
657 "<br/>* * *<br/>");
658 }
659
660 List<Paragraph> paras = new ArrayList<Paragraph>();
661
662 if (content != null && !content.trim().isEmpty()) {
663 if (isHtml()) {
664 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
665 pg.setMinMax(0, tab.length);
666 int i = 1;
667 for (String line : tab) {
668 if (line.startsWith("[") && line.endsWith("]")) {
669 pg.setName("Extracting image " + i);
670 }
671 paras.add(makeParagraph(source, line.trim()));
672 pg.setProgress(i++);
673 }
674 pg.setName(null);
675 } else {
676 List<String> lines = new ArrayList<String>();
677 BufferedReader buff = null;
678 try {
679 buff = new BufferedReader(
680 new InputStreamReader(new ByteArrayInputStream(
681 content.getBytes("UTF-8")), "UTF-8"));
682 for (String line = buff.readLine(); line != null; line = buff
683 .readLine()) {
684 lines.add(line.trim());
685 }
686 } finally {
687 if (buff != null) {
688 buff.close();
689 }
690 }
691
692 pg.setMinMax(0, lines.size());
693 int i = 0;
694 for (String line : lines) {
695 if (line.startsWith("[") && line.endsWith("]")) {
696 pg.setName("Extracting image " + i);
697 }
698 paras.add(makeParagraph(source, line));
699 pg.setProgress(i++);
700 }
701 pg.setName(null);
702 }
703
704 // Check quotes for "bad" format
705 List<Paragraph> newParas = new ArrayList<Paragraph>();
706 for (Paragraph para : paras) {
707 newParas.addAll(requotify(para));
708 }
709 paras = newParas;
710
711 // Remove double blanks/brks
712 fixBlanksBreaks(paras);
713 }
714
715 return paras;
716 }
717
718 /**
719 * Convert the given line into a single {@link Paragraph}.
720 *
721 * @param source
722 * the source URL of the story
723 * @param line
724 * the textual content of the paragraph
725 *
726 * @return the {@link Paragraph}
727 */
728 private Paragraph makeParagraph(URL source, String line) {
729 URL image = null;
730 if (line.startsWith("[") && line.endsWith("]")) {
731 image = getImageUrl(this, source,
732 line.substring(1, line.length() - 1).trim());
733 }
734
735 if (image != null) {
736 return new Paragraph(image);
737 } else {
738 return processPara(line);
739 }
740 }
741
742 /**
743 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
744 * those {@link Paragraph}s.
745 * <p>
746 * The resulting list will not contain a starting or trailing blank/break
747 * nor 2 blanks or breaks following each other.
748 *
749 * @param paras
750 * the list of {@link Paragraph}s to fix
751 */
752 protected void fixBlanksBreaks(List<Paragraph> paras) {
753 boolean space = false;
754 boolean brk = true;
755 for (int i = 0; i < paras.size(); i++) {
756 Paragraph para = paras.get(i);
757 boolean thisSpace = para.getType() == ParagraphType.BLANK;
758 boolean thisBrk = para.getType() == ParagraphType.BREAK;
759
760 if (i > 0 && space && thisBrk) {
761 paras.remove(i - 1);
762 i--;
763 } else if ((space || brk) && (thisSpace || thisBrk)) {
764 paras.remove(i);
765 i--;
766 }
767
768 space = thisSpace;
769 brk = thisBrk;
770 }
771
772 // Remove blank/brk at start
773 if (paras.size() > 0
774 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
775 0).getType() == ParagraphType.BREAK)) {
776 paras.remove(0);
777 }
778
779 // Remove blank/brk at end
780 int last = paras.size() - 1;
781 if (paras.size() > 0
782 && (paras.get(last).getType() == ParagraphType.BLANK || paras
783 .get(last).getType() == ParagraphType.BREAK)) {
784 paras.remove(last);
785 }
786 }
787
788 /**
789 * Get the default cover related to this subject (see <tt>.info</tt> files).
790 *
791 * @param subject
792 * the subject
793 *
794 * @return the cover if any, or NULL
795 */
796 static BufferedImage getDefaultCover(String subject) {
797 if (subject != null && !subject.isEmpty()
798 && Instance.getCoverDir() != null) {
799 try {
800 File fileCover = new File(Instance.getCoverDir(), subject);
801 return getImage(null, fileCover.toURI().toURL(), subject);
802 } catch (MalformedURLException e) {
803 }
804 }
805
806 return null;
807 }
808
809 /**
810 * Return the list of supported image extensions.
811 *
812 * @param emptyAllowed
813 * TRUE to allow an empty extension on first place, which can be
814 * used when you may already have an extension in your input but
815 * are not sure about it
816 *
817 * @return the extensions
818 */
819 static String[] getImageExt(boolean emptyAllowed) {
820 if (emptyAllowed) {
821 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
822 } else {
823 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
824 }
825 }
826
827 /**
828 * Check if the given resource can be a local image or a remote image, then
829 * refresh the cache with it if it is.
830 *
831 * @param source
832 * the story source
833 * @param line
834 * the resource to check
835 *
836 * @return the image if found, or NULL
837 *
838 */
839 static BufferedImage getImage(BasicSupport support, URL source, String line) {
840 URL url = getImageUrl(support, source, line);
841 if (url != null) {
842 InputStream in = null;
843 try {
844 in = Instance.getCache().open(url, getSupport(url), true);
845 return IOUtils.toImage(in);
846 } catch (IOException e) {
847 } finally {
848 if (in != null) {
849 try {
850 in.close();
851 } catch (IOException e) {
852 }
853 }
854 }
855 }
856
857 return null;
858 }
859
860 /**
861 * Check if the given resource can be a local image or a remote image, then
862 * refresh the cache with it if it is.
863 *
864 * @param source
865 * the story source
866 * @param line
867 * the resource to check
868 *
869 * @return the image URL if found, or NULL
870 *
871 */
872 static URL getImageUrl(BasicSupport support, URL source, String line) {
873 URL url = null;
874
875 if (line != null) {
876 // try for files
877 String path = null;
878 if (source != null) {
879 path = new File(source.getFile()).getParent();
880 try {
881 String basePath = new File(new File(path), line.trim())
882 .getAbsolutePath();
883 for (String ext : getImageExt(true)) {
884 if (new File(basePath + ext).exists()) {
885 url = new File(basePath + ext).toURI().toURL();
886 }
887 }
888 } catch (Exception e) {
889 // Nothing to do here
890 }
891 }
892
893 if (url == null) {
894 // try for URLs
895 try {
896 for (String ext : getImageExt(true)) {
897 if (Instance.getCache().check(new URL(line + ext))) {
898 url = new URL(line + ext);
899 break;
900 }
901 }
902
903 // try out of cache
904 if (url == null) {
905 for (String ext : getImageExt(true)) {
906 try {
907 url = new URL(line + ext);
908 Instance.getCache().refresh(url, support, true);
909 break;
910 } catch (IOException e) {
911 // no image with this ext
912 url = null;
913 }
914 }
915 }
916 } catch (MalformedURLException e) {
917 // Not an url
918 }
919 }
920
921 // refresh the cached file
922 if (url != null) {
923 try {
924 Instance.getCache().refresh(url, support, true);
925 } catch (IOException e) {
926 // woops, broken image
927 url = null;
928 }
929 }
930 }
931
932 return url;
933 }
934
935 /**
936 * Open the input file that will be used through the support.
937 *
938 * @param source
939 * the source {@link URL}
940 *
941 * @return the {@link InputStream}
942 *
943 * @throws IOException
944 * in case of I/O error
945 */
946 protected InputStream openInput(URL source) throws IOException {
947 return Instance.getCache().open(source, this, false);
948 }
949
950 /**
951 * Reset the given {@link InputStream} and return it.
952 *
953 * @param in
954 * the {@link InputStream} to reset
955 *
956 * @return the same {@link InputStream} after reset
957 */
958 protected InputStream reset(InputStream in) {
959 try {
960 in.reset();
961 } catch (IOException e) {
962 }
963 return in;
964 }
965
966 /**
967 * Reset then return {@link BasicSupport#in}.
968 *
969 * @return {@link BasicSupport#in}
970 */
971 protected InputStream getInput() {
972 return reset(in);
973 }
974
975 /**
976 * Fix the author name if it is prefixed with some "by" {@link String}.
977 *
978 * @param author
979 * the author with a possible prefix
980 *
981 * @return the author without prefixes
982 */
983 protected String fixAuthor(String author) {
984 if (author != null) {
985 for (String suffix : new String[] { " ", ":" }) {
986 for (String byString : Instance.getConfig()
987 .getString(Config.BYS).split(",")) {
988 byString += suffix;
989 if (author.toUpperCase().startsWith(byString.toUpperCase())) {
990 author = author.substring(byString.length()).trim();
991 }
992 }
993 }
994
995 // Special case (without suffix):
996 if (author.startsWith("©")) {
997 author = author.substring(1);
998 }
999 }
1000
1001 return author;
1002 }
1003
1004 /**
1005 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
1006 * and requotify them (i.e., separate them into QUOTE paragraphs and other
1007 * paragraphs (quotes or not)).
1008 *
1009 * @param para
1010 * the paragraph to requotify (not necessarily a quote)
1011 *
1012 * @return the correctly (or so we hope) quotified paragraphs
1013 */
1014 protected List<Paragraph> requotify(Paragraph para) {
1015 List<Paragraph> newParas = new ArrayList<Paragraph>();
1016
1017 if (para.getType() == ParagraphType.QUOTE
1018 && para.getContent().length() > 2) {
1019 String line = para.getContent();
1020 boolean singleQ = line.startsWith("" + openQuote);
1021 boolean doubleQ = line.startsWith("" + openDoubleQuote);
1022
1023 // Do not try when more than one quote at a time
1024 // (some stories are not easily readable if we do)
1025 if (singleQ
1026 && line.indexOf(closeQuote, 1) < line
1027 .lastIndexOf(closeQuote)) {
1028 newParas.add(para);
1029 return newParas;
1030 }
1031 if (doubleQ
1032 && line.indexOf(closeDoubleQuote, 1) < line
1033 .lastIndexOf(closeDoubleQuote)) {
1034 newParas.add(para);
1035 return newParas;
1036 }
1037 //
1038
1039 if (!singleQ && !doubleQ) {
1040 line = openDoubleQuote + line + closeDoubleQuote;
1041 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
1042 .getWords()));
1043 } else {
1044 char open = singleQ ? openQuote : openDoubleQuote;
1045 char close = singleQ ? closeQuote : closeDoubleQuote;
1046
1047 int posDot = -1;
1048 boolean inQuote = false;
1049 int i = 0;
1050 for (char car : line.toCharArray()) {
1051 if (car == open) {
1052 inQuote = true;
1053 } else if (car == close) {
1054 inQuote = false;
1055 } else if (car == '.' && !inQuote) {
1056 posDot = i;
1057 break;
1058 }
1059 i++;
1060 }
1061
1062 if (posDot >= 0) {
1063 String rest = line.substring(posDot + 1).trim();
1064 line = line.substring(0, posDot + 1).trim();
1065 long words = 1;
1066 for (char car : line.toCharArray()) {
1067 if (car == ' ') {
1068 words++;
1069 }
1070 }
1071 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
1072 if (!rest.isEmpty()) {
1073 newParas.addAll(requotify(processPara(rest)));
1074 }
1075 } else {
1076 newParas.add(para);
1077 }
1078 }
1079 } else {
1080 newParas.add(para);
1081 }
1082
1083 return newParas;
1084 }
1085
1086 /**
1087 * Process a {@link Paragraph} from a raw line of text.
1088 * <p>
1089 * Will also fix quotes and HTML encoding if needed.
1090 *
1091 * @param line
1092 * the raw line
1093 *
1094 * @return the processed {@link Paragraph}
1095 */
1096 protected Paragraph processPara(String line) {
1097 line = ifUnhtml(line).trim();
1098
1099 boolean space = true;
1100 boolean brk = true;
1101 boolean quote = false;
1102 boolean tentativeCloseQuote = false;
1103 char prev = '\0';
1104 int dashCount = 0;
1105 long words = 1;
1106
1107 StringBuilder builder = new StringBuilder();
1108 for (char car : line.toCharArray()) {
1109 if (car != '-') {
1110 if (dashCount > 0) {
1111 // dash, ndash and mdash: - – —
1112 // currently: always use mdash
1113 builder.append(dashCount == 1 ? '-' : '—');
1114 }
1115 dashCount = 0;
1116 }
1117
1118 if (tentativeCloseQuote) {
1119 tentativeCloseQuote = false;
1120 if (Character.isLetterOrDigit(car)) {
1121 builder.append("'");
1122 } else {
1123 // handle double-single quotes as double quotes
1124 if (prev == car) {
1125 builder.append(closeDoubleQuote);
1126 continue;
1127 } else {
1128 builder.append(closeQuote);
1129 }
1130 }
1131 }
1132
1133 switch (car) {
1134 case ' ': // note: unbreakable space
1135 case ' ':
1136 case '\t':
1137 case '\n': // just in case
1138 case '\r': // just in case
1139 if (builder.length() > 0
1140 && builder.charAt(builder.length() - 1) != ' ') {
1141 words++;
1142 }
1143 builder.append(' ');
1144 break;
1145
1146 case '\'':
1147 if (space || (brk && quote)) {
1148 quote = true;
1149 // handle double-single quotes as double quotes
1150 if (prev == car) {
1151 builder.deleteCharAt(builder.length() - 1);
1152 builder.append(openDoubleQuote);
1153 } else {
1154 builder.append(openQuote);
1155 }
1156 } else if (prev == ' ' || prev == car) {
1157 // handle double-single quotes as double quotes
1158 if (prev == car) {
1159 builder.deleteCharAt(builder.length() - 1);
1160 builder.append(openDoubleQuote);
1161 } else {
1162 builder.append(openQuote);
1163 }
1164 } else {
1165 // it is a quote ("I'm off") or a 'quote' ("This
1166 // 'good' restaurant"...)
1167 tentativeCloseQuote = true;
1168 }
1169 break;
1170
1171 case '"':
1172 if (space || (brk && quote)) {
1173 quote = true;
1174 builder.append(openDoubleQuote);
1175 } else if (prev == ' ') {
1176 builder.append(openDoubleQuote);
1177 } else {
1178 builder.append(closeDoubleQuote);
1179 }
1180 break;
1181
1182 case '-':
1183 if (space) {
1184 quote = true;
1185 } else {
1186 dashCount++;
1187 }
1188 space = false;
1189 break;
1190
1191 case '*':
1192 case '~':
1193 case '/':
1194 case '\\':
1195 case '<':
1196 case '>':
1197 case '=':
1198 case '+':
1199 case '_':
1200 case '–':
1201 case '—':
1202 space = false;
1203 builder.append(car);
1204 break;
1205
1206 case '‘':
1207 case '`':
1208 case '‹':
1209 case '﹁':
1210 case '〈':
1211 case '「':
1212 if (space || (brk && quote)) {
1213 quote = true;
1214 builder.append(openQuote);
1215 } else {
1216 // handle double-single quotes as double quotes
1217 if (prev == car) {
1218 builder.deleteCharAt(builder.length() - 1);
1219 builder.append(openDoubleQuote);
1220 } else {
1221 builder.append(openQuote);
1222 }
1223 }
1224 space = false;
1225 brk = false;
1226 break;
1227
1228 case '’':
1229 case '›':
1230 case '﹂':
1231 case '〉':
1232 case '」':
1233 space = false;
1234 brk = false;
1235 // handle double-single quotes as double quotes
1236 if (prev == car) {
1237 builder.deleteCharAt(builder.length() - 1);
1238 builder.append(closeDoubleQuote);
1239 } else {
1240 builder.append(closeQuote);
1241 }
1242 break;
1243
1244 case '«':
1245 case '“':
1246 case '﹃':
1247 case '《':
1248 case '『':
1249 if (space || (brk && quote)) {
1250 quote = true;
1251 builder.append(openDoubleQuote);
1252 } else {
1253 builder.append(openDoubleQuote);
1254 }
1255 space = false;
1256 brk = false;
1257 break;
1258
1259 case '»':
1260 case '”':
1261 case '﹄':
1262 case '》':
1263 case '』':
1264 space = false;
1265 brk = false;
1266 builder.append(closeDoubleQuote);
1267 break;
1268
1269 default:
1270 space = false;
1271 brk = false;
1272 builder.append(car);
1273 break;
1274 }
1275
1276 prev = car;
1277 }
1278
1279 if (tentativeCloseQuote) {
1280 tentativeCloseQuote = false;
1281 builder.append(closeQuote);
1282 }
1283
1284 line = builder.toString().trim();
1285
1286 ParagraphType type = ParagraphType.NORMAL;
1287 if (space) {
1288 type = ParagraphType.BLANK;
1289 } else if (brk) {
1290 type = ParagraphType.BREAK;
1291 } else if (quote) {
1292 type = ParagraphType.QUOTE;
1293 }
1294
1295 return new Paragraph(type, line, words);
1296 }
1297
1298 /**
1299 * Remove the HTML from the input <b>if</b> {@link BasicSupport#isHtml()} is
1300 * true.
1301 *
1302 * @param input
1303 * the input
1304 *
1305 * @return the no html version if needed
1306 */
1307 private String ifUnhtml(String input) {
1308 if (isHtml() && input != null) {
1309 return StringUtils.unhtml(input);
1310 }
1311
1312 return input;
1313 }
1314
1315 /**
1316 * Return a {@link BasicSupport} implementation supporting the given
1317 * resource if possible.
1318 *
1319 * @param url
1320 * the story resource
1321 *
1322 * @return an implementation that supports it, or NULL
1323 */
1324 public static BasicSupport getSupport(URL url) {
1325 if (url == null) {
1326 return null;
1327 }
1328
1329 // TEXT and INFO_TEXT always support files (not URLs though)
1330 for (SupportType type : SupportType.values()) {
1331 if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
1332 BasicSupport support = getSupport(type);
1333 if (support != null && support.supports(url)) {
1334 return support;
1335 }
1336 }
1337 }
1338
1339 for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
1340 SupportType.TEXT }) {
1341 BasicSupport support = getSupport(type);
1342 if (support != null && support.supports(url)) {
1343 return support;
1344 }
1345 }
1346
1347 return null;
1348 }
1349
1350 /**
1351 * Return a {@link BasicSupport} implementation supporting the given type.
1352 *
1353 * @param type
1354 * the type
1355 *
1356 * @return an implementation that supports it, or NULL
1357 */
1358 public static BasicSupport getSupport(SupportType type) {
1359 switch (type) {
1360 case EPUB:
1361 return new Epub().setType(type);
1362 case INFO_TEXT:
1363 return new InfoText().setType(type);
1364 case FIMFICTION:
1365 return new Fimfiction().setType(type);
1366 case FANFICTION:
1367 return new Fanfiction().setType(type);
1368 case TEXT:
1369 return new Text().setType(type);
1370 case MANGAFOX:
1371 return new MangaFox().setType(type);
1372 case E621:
1373 return new E621().setType(type);
1374 case YIFFSTAR:
1375 return new YiffStar().setType(type);
1376 case CBZ:
1377 return new Cbz().setType(type);
1378 case HTML:
1379 return new Html().setType(type);
1380 }
1381
1382 return null;
1383 }
1384
1385 /**
1386 * Return the first line from the given input which correspond to the given
1387 * selectors.
1388 *
1389 * @param in
1390 * the input
1391 * @param needle
1392 * a string that must be found inside the target line (also
1393 * supports "^" at start to say "only if it starts with" the
1394 * needle)
1395 * @param relativeLine
1396 * the line to return based upon the target line position (-1 =
1397 * the line before, 0 = the target line...)
1398 *
1399 * @return the line
1400 */
1401 static String getLine(InputStream in, String needle, int relativeLine) {
1402 return getLine(in, needle, relativeLine, true);
1403 }
1404
1405 /**
1406 * Return a line from the given input which correspond to the given
1407 * selectors.
1408 *
1409 * @param in
1410 * the input
1411 * @param needle
1412 * a string that must be found inside the target line (also
1413 * supports "^" at start to say "only if it starts with" the
1414 * needle)
1415 * @param relativeLine
1416 * the line to return based upon the target line position (-1 =
1417 * the line before, 0 = the target line...)
1418 * @param first
1419 * takes the first result (as opposed to the last one, which will
1420 * also always spend the input)
1421 *
1422 * @return the line
1423 */
1424 static String getLine(InputStream in, String needle, int relativeLine,
1425 boolean first) {
1426 String rep = null;
1427
1428 try {
1429 in.reset();
1430 } catch (IOException e) {
1431 Instance.syserr(e);
1432 }
1433
1434 List<String> lines = new ArrayList<String>();
1435 @SuppressWarnings("resource")
1436 Scanner scan = new Scanner(in, "UTF-8");
1437 int index = -1;
1438 scan.useDelimiter("\\n");
1439 while (scan.hasNext()) {
1440 lines.add(scan.next());
1441
1442 if (index == -1) {
1443 if (needle.startsWith("^")) {
1444 if (lines.get(lines.size() - 1).startsWith(
1445 needle.substring(1))) {
1446 index = lines.size() - 1;
1447 }
1448
1449 } else {
1450 if (lines.get(lines.size() - 1).contains(needle)) {
1451 index = lines.size() - 1;
1452 }
1453 }
1454 }
1455
1456 if (index >= 0 && index + relativeLine < lines.size()) {
1457 rep = lines.get(index + relativeLine);
1458 if (first) {
1459 break;
1460 }
1461 }
1462 }
1463
1464 return rep;
1465 }
1466 }