Some jDoc fixes
[nikiroo-utils.git] / src / be / nikiroo / fanfix / supported / BasicSupport.java
1 package be.nikiroo.fanfix.supported;
2
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import be.nikiroo.fanfix.Instance;
import be.nikiroo.fanfix.bundles.Config;
import be.nikiroo.fanfix.bundles.StringId;
import be.nikiroo.fanfix.data.Chapter;
import be.nikiroo.fanfix.data.MetaData;
import be.nikiroo.fanfix.data.Paragraph;
import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
import be.nikiroo.fanfix.data.Story;
import be.nikiroo.utils.IOUtils;
import be.nikiroo.utils.Progress;
import be.nikiroo.utils.StringUtils;
31
32 /**
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
36 * <p>
37 * It will be used with 'resources' (usually web pages or files).
38 *
39 * @author niki
40 */
41 public abstract class BasicSupport {
42 /**
43 * The supported input types for which we can get a {@link BasicSupport}
44 * object.
45 *
46 * @author niki
47 */
48 public enum SupportType {
49 /** EPUB files created with this program */
50 EPUB,
51 /** Pure text file with some rules */
52 TEXT,
53 /** TEXT but with associated .info file */
54 INFO_TEXT,
55 /** My Little Pony fanfictions */
56 FIMFICTION,
57 /** Fanfictions from a lot of different universes */
58 FANFICTION,
59 /** Website with lots of Mangas */
60 MANGAFOX,
61 /** Furry website with comics support */
62 E621,
63 /** Furry website with stories */
64 YIFFSTAR,
65 /** Comics and images groups, mostly but not only NSFW */
66 E_HENTAI,
67 /** CBZ files */
68 CBZ,
69 /** HTML files */
70 HTML;
71
72 /**
73 * A description of this support type (more information than the
74 * {@link BasicSupport#getSourceName()}).
75 *
76 * @return the description
77 */
78 public String getDesc() {
79 String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
80 this.name());
81
82 if (desc == null) {
83 desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
84 }
85
86 return desc;
87 }
88
89 /**
90 * The name of this support type (a short version).
91 *
92 * @return the name
93 */
94 public String getSourceName() {
95 BasicSupport support = BasicSupport.getSupport(this);
96 if (support != null) {
97 return support.getSourceName();
98 }
99
100 return null;
101 }
102
103 @Override
104 public String toString() {
105 return super.toString().toLowerCase();
106 }
107
108 /**
109 * Call {@link SupportType#valueOf(String)} after conversion to upper
110 * case.
111 *
112 * @param typeName
113 * the possible type name
114 *
115 * @return NULL or the type
116 */
117 public static SupportType valueOfUC(String typeName) {
118 return SupportType.valueOf(typeName == null ? null : typeName
119 .toUpperCase());
120 }
121
122 /**
123 * Call {@link SupportType#valueOf(String)} after conversion to upper
124 * case but return NULL for NULL instead of raising exception.
125 *
126 * @param typeName
127 * the possible type name
128 *
129 * @return NULL or the type
130 */
131 public static SupportType valueOfNullOkUC(String typeName) {
132 if (typeName == null) {
133 return null;
134 }
135
136 return SupportType.valueOfUC(typeName);
137 }
138
139 /**
140 * Call {@link SupportType#valueOf(String)} after conversion to upper
141 * case but return NULL in case of error instead of raising an
142 * exception.
143 *
144 * @param typeName
145 * the possible type name
146 *
147 * @return NULL or the type
148 */
149 public static SupportType valueOfAllOkUC(String typeName) {
150 try {
151 return SupportType.valueOfUC(typeName);
152 } catch (Exception e) {
153 return null;
154 }
155 }
156 }
157
158 private InputStream in;
159 private SupportType type;
160 private URL currentReferer; // with only one 'r', as in 'HTTP'...
161
162 // quote chars
163 private char openQuote = Instance.getTrans().getCharacter(
164 StringId.OPEN_SINGLE_QUOTE);
165 private char closeQuote = Instance.getTrans().getCharacter(
166 StringId.CLOSE_SINGLE_QUOTE);
167 private char openDoubleQuote = Instance.getTrans().getCharacter(
168 StringId.OPEN_DOUBLE_QUOTE);
169 private char closeDoubleQuote = Instance.getTrans().getCharacter(
170 StringId.CLOSE_DOUBLE_QUOTE);
171
172 /**
173 * The name of this support class.
174 *
175 * @return the name
176 */
177 protected abstract String getSourceName();
178
179 /**
180 * Check if the given resource is supported by this {@link BasicSupport}.
181 *
182 * @param url
183 * the resource to check for
184 *
185 * @return TRUE if it is
186 */
187 protected abstract boolean supports(URL url);
188
189 /**
190 * Return TRUE if the support will return HTML encoded content values for
191 * the chapters content.
192 *
193 * @return TRUE for HTML
194 */
195 protected abstract boolean isHtml();
196
197 /**
198 * Return the {@link MetaData} of this story.
199 *
200 * @param source
201 * the source of the story
202 * @param in
203 * the input (the main resource)
204 *
205 * @return the associated {@link MetaData}
206 *
207 * @throws IOException
208 * in case of I/O error
209 */
210 protected abstract MetaData getMeta(URL source, InputStream in)
211 throws IOException;
212
213 /**
214 * Return the story description.
215 *
216 * @param source
217 * the source of the story
218 * @param in
219 * the input (the main resource)
220 *
221 * @return the description
222 *
223 * @throws IOException
224 * in case of I/O error
225 */
226 protected abstract String getDesc(URL source, InputStream in)
227 throws IOException;
228
229 /**
230 * Return the list of chapters (name and resource).
231 *
232 * @param source
233 * the source of the story
234 * @param in
235 * the input (the main resource)
236 * @param pg
237 * the optional progress reporter
238 *
239 * @return the chapters
240 *
241 * @throws IOException
242 * in case of I/O error
243 */
244 protected abstract List<Entry<String, URL>> getChapters(URL source,
245 InputStream in, Progress pg) throws IOException;
246
247 /**
248 * Return the content of the chapter (possibly HTML encoded, if
249 * {@link BasicSupport#isHtml()} is TRUE).
250 *
251 * @param source
252 * the source of the story
253 * @param in
254 * the input (the main resource)
255 * @param number
256 * the chapter number
257 * @param pg
258 * the optional progress reporter
259 *
260 * @return the content
261 *
262 * @throws IOException
263 * in case of I/O error
264 */
265 protected abstract String getChapterContent(URL source, InputStream in,
266 int number, Progress pg) throws IOException;
267
268 /**
269 * Log into the support (can be a no-op depending upon the support).
270 *
271 * @throws IOException
272 * in case of I/O error
273 */
274 public void login() throws IOException {
275
276 }
277
278 /**
279 * Return the list of cookies (values included) that must be used to
280 * correctly fetch the resources.
281 * <p>
282 * You are expected to call the super method implementation if you override
283 * it.
284 *
285 * @return the cookies
286 *
287 * @throws IOException
288 * in case of I/O error
289 */
290 public Map<String, String> getCookies() throws IOException {
291 return new HashMap<String, String>();
292 }
293
294 /**
295 * Return the canonical form of the main {@link URL}.
296 *
297 * @param source
298 * the source {@link URL}
299 *
300 * @return the canonical form of this {@link URL}
301 *
302 * @throws IOException
303 * in case of I/O error
304 */
305 public URL getCanonicalUrl(URL source) throws IOException {
306 return source;
307 }
308
309 /**
310 * Process the given story resource into a partially filled {@link Story}
311 * object containing the name and metadata, except for the description.
312 *
313 * @param url
314 * the story resource
315 *
316 * @return the {@link Story}
317 *
318 * @throws IOException
319 * in case of I/O error
320 */
321 public Story processMeta(URL url) throws IOException {
322 return processMeta(url, true, false, null);
323 }
324
325 /**
326 * Process the given story resource into a partially filled {@link Story}
327 * object containing the name and metadata.
328 *
329 * @param url
330 * the story resource
331 * @param close
332 * close "this" and "in" when done
333 * @param getDesc
334 * retrieve the description of the story, or not
335 * @param pg
336 * the optional progress reporter
337 *
338 * @return the {@link Story}
339 *
340 * @throws IOException
341 * in case of I/O error
342 */
343 protected Story processMeta(URL url, boolean close, boolean getDesc,
344 Progress pg) throws IOException {
345 if (pg == null) {
346 pg = new Progress();
347 } else {
348 pg.setMinMax(0, 100);
349 }
350
351 login();
352 pg.setProgress(10);
353
354 url = getCanonicalUrl(url);
355
356 setCurrentReferer(url);
357
358 in = openInput(url);
359 if (in == null) {
360 return null;
361 }
362
363 try {
364 preprocess(url, getInput());
365 pg.setProgress(30);
366
367 Story story = new Story();
368 MetaData meta = getMeta(url, getInput());
369 if (meta.getCreationDate() == null
370 || meta.getCreationDate().isEmpty()) {
371 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
372 }
373 story.setMeta(meta);
374
375 pg.setProgress(50);
376
377 if (meta != null && meta.getCover() == null) {
378 meta.setCover(getDefaultCover(meta.getSubject()));
379 }
380
381 pg.setProgress(60);
382
383 if (getDesc) {
384 String descChapterName = Instance.getTrans().getString(
385 StringId.DESCRIPTION);
386 story.getMeta().setResume(
387 makeChapter(url, 0, descChapterName,
388 getDesc(url, getInput()), null));
389 }
390
391 pg.setProgress(100);
392 return story;
393 } finally {
394 if (close) {
395 try {
396 close();
397 } catch (IOException e) {
398 Instance.syserr(e);
399 }
400
401 if (in != null) {
402 in.close();
403 }
404 }
405
406 setCurrentReferer(null);
407 }
408 }
409
410 /**
411 * Process the given story resource into a fully filled {@link Story}
412 * object.
413 *
414 * @param url
415 * the story resource
416 * @param pg
417 * the optional progress reporter
418 *
419 * @return the {@link Story}
420 *
421 * @throws IOException
422 * in case of I/O error
423 */
424 public Story process(URL url, Progress pg) throws IOException {
425 if (pg == null) {
426 pg = new Progress();
427 } else {
428 pg.setMinMax(0, 100);
429 }
430
431 url = getCanonicalUrl(url);
432 pg.setProgress(1);
433 try {
434 Progress pgMeta = new Progress();
435 pg.addProgress(pgMeta, 10);
436 Story story = processMeta(url, false, true, pgMeta);
437 if (!pgMeta.isDone()) {
438 pgMeta.setProgress(pgMeta.getMax()); // 10%
439 }
440
441 if (story == null) {
442 pg.setProgress(90);
443 return null;
444 }
445
446 pg.setName("Retrieving " + story.getMeta().getTitle());
447
448 setCurrentReferer(url);
449
450 Progress pgGetChapters = new Progress();
451 pg.addProgress(pgGetChapters, 10);
452 story.setChapters(new ArrayList<Chapter>());
453 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
454 pgGetChapters);
455 if (!pgGetChapters.isDone()) {
456 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
457 }
458
459 if (chapters != null) {
460 Progress pgChaps = new Progress("Extracting chapters", 0,
461 chapters.size() * 300);
462 pg.addProgress(pgChaps, 80);
463
464 long words = 0;
465 int i = 1;
466 for (Entry<String, URL> chap : chapters) {
467 pgChaps.setName("Extracting chapter " + i);
468 setCurrentReferer(chap.getValue());
469 InputStream chapIn = Instance.getCache().open(
470 chap.getValue(), this, true);
471 pgChaps.setProgress(i * 100);
472 try {
473 Progress pgGetChapterContent = new Progress();
474 Progress pgMakeChapter = new Progress();
475 pgChaps.addProgress(pgGetChapterContent, 100);
476 pgChaps.addProgress(pgMakeChapter, 100);
477
478 String content = getChapterContent(url, chapIn, i,
479 pgGetChapterContent);
480 if (!pgGetChapterContent.isDone()) {
481 pgGetChapterContent.setProgress(pgGetChapterContent
482 .getMax());
483 }
484
485 Chapter cc = makeChapter(url, i, chap.getKey(),
486 content, pgMakeChapter);
487 if (!pgMakeChapter.isDone()) {
488 pgMakeChapter.setProgress(pgMakeChapter.getMax());
489 }
490
491 words += cc.getWords();
492 story.getChapters().add(cc);
493 if (story.getMeta() != null) {
494 story.getMeta().setWords(words);
495 }
496 } finally {
497 chapIn.close();
498 }
499
500 i++;
501 }
502
503 pgChaps.setName("Extracting chapters");
504 } else {
505 pg.setProgress(80);
506 }
507
508 return story;
509
510 } finally {
511 try {
512 close();
513 } catch (IOException e) {
514 Instance.syserr(e);
515 }
516
517 if (in != null) {
518 in.close();
519 }
520
521 setCurrentReferer(null);
522 }
523 }
524
525 /**
526 * The support type.
527 *
528 * @return the type
529 */
530 public SupportType getType() {
531 return type;
532 }
533
534 /**
 * The current referer {@link URL} (only one 'r', as in 'HTTP'...), i.e.,
 * the current {@link URL} we work on.
537 *
538 * @return the referer
539 */
540 public URL getCurrentReferer() {
541 return currentReferer;
542 }
543
544 /**
 * The current referer {@link URL} (only one 'r', as in 'HTTP'...), i.e.,
 * the current {@link URL} we work on.
547 *
548 * @param currentReferer
549 * the new referer
550 */
551 protected void setCurrentReferer(URL currentReferer) {
552 this.currentReferer = currentReferer;
553 }
554
555 /**
556 * The support type.
557 *
558 * @param type
559 * the new type
560 *
561 * @return this
562 */
563 protected BasicSupport setType(SupportType type) {
564 this.type = type;
565 return this;
566 }
567
568 /**
569 * Prepare the support if needed before processing.
570 *
571 * @param source
572 * the source of the story
573 * @param in
574 * the input (the main resource)
575 *
576 * @throws IOException
577 * on I/O error
578 */
579 protected void preprocess(URL source, InputStream in) throws IOException {
580 }
581
582 /**
583 * Now that we have processed the {@link Story}, close the resources if any.
584 *
585 * @throws IOException
586 * on I/O error
587 */
588 protected void close() throws IOException {
589 }
590
591 /**
592 * Create a {@link Chapter} object from the given information, formatting
593 * the content as it should be.
594 *
595 * @param source
596 * the source of the story
597 * @param number
598 * the chapter number
599 * @param name
600 * the chapter name
601 * @param content
602 * the chapter content
603 * @param pg
604 * the optional progress reporter
605 *
606 * @return the {@link Chapter}
607 *
608 * @throws IOException
609 * in case of I/O error
610 */
611 protected Chapter makeChapter(URL source, int number, String name,
612 String content, Progress pg) throws IOException {
613 // Chapter name: process it correctly, then remove the possible
614 // redundant "Chapter x: " in front of it, or "-" (as in
615 // "Chapter 5: - Fun!" after the ": " was automatically added)
616 String chapterName = processPara(name).getContent().trim();
617 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
618 .split(",")) {
619 String chapterWord = Instance.getConfig().getStringX(
620 Config.CHAPTER, lang);
621 if (chapterName.startsWith(chapterWord)) {
622 chapterName = chapterName.substring(chapterWord.length())
623 .trim();
624 break;
625 }
626 }
627
628 if (chapterName.startsWith(Integer.toString(number))) {
629 chapterName = chapterName.substring(
630 Integer.toString(number).length()).trim();
631 }
632
633 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
634 chapterName = chapterName.substring(1).trim();
635 }
636 //
637
638 Chapter chap = new Chapter(number, chapterName);
639
640 if (content != null) {
641 List<Paragraph> paras = makeParagraphs(source, content, pg);
642 long words = 0;
643 for (Paragraph para : paras) {
644 words += para.getWords();
645 }
646 chap.setParagraphs(paras);
647 chap.setWords(words);
648 }
649
650 return chap;
651
652 }
653
654 /**
655 * Convert the given content into {@link Paragraph}s.
656 *
657 * @param source
658 * the source URL of the story
659 * @param content
660 * the textual content
661 * @param pg
662 * the optional progress reporter
663 *
664 * @return the {@link Paragraph}s
665 *
666 * @throws IOException
667 * in case of I/O error
668 */
669 protected List<Paragraph> makeParagraphs(URL source, String content,
670 Progress pg) throws IOException {
671 if (pg == null) {
672 pg = new Progress();
673 }
674
675 if (isHtml()) {
676 // Special <HR> processing:
677 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
678 "<br/>* * *<br/>");
679 }
680
681 List<Paragraph> paras = new ArrayList<Paragraph>();
682
683 if (content != null && !content.trim().isEmpty()) {
684 if (isHtml()) {
685 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
686 pg.setMinMax(0, tab.length);
687 int i = 1;
688 for (String line : tab) {
689 if (line.startsWith("[") && line.endsWith("]")) {
690 pg.setName("Extracting image " + i);
691 }
692 paras.add(makeParagraph(source, line.trim()));
693 pg.setProgress(i++);
694 }
695 pg.setName(null);
696 } else {
697 List<String> lines = new ArrayList<String>();
698 BufferedReader buff = null;
699 try {
700 buff = new BufferedReader(
701 new InputStreamReader(new ByteArrayInputStream(
702 content.getBytes("UTF-8")), "UTF-8"));
703 for (String line = buff.readLine(); line != null; line = buff
704 .readLine()) {
705 lines.add(line.trim());
706 }
707 } finally {
708 if (buff != null) {
709 buff.close();
710 }
711 }
712
713 pg.setMinMax(0, lines.size());
714 int i = 0;
715 for (String line : lines) {
716 if (line.startsWith("[") && line.endsWith("]")) {
717 pg.setName("Extracting image " + i);
718 }
719 paras.add(makeParagraph(source, line));
720 pg.setProgress(i++);
721 }
722 pg.setName(null);
723 }
724
725 // Check quotes for "bad" format
726 List<Paragraph> newParas = new ArrayList<Paragraph>();
727 for (Paragraph para : paras) {
728 newParas.addAll(requotify(para));
729 }
730 paras = newParas;
731
732 // Remove double blanks/brks
733 fixBlanksBreaks(paras);
734 }
735
736 return paras;
737 }
738
739 /**
740 * Convert the given line into a single {@link Paragraph}.
741 *
742 * @param source
743 * the source URL of the story
744 * @param line
745 * the textual content of the paragraph
746 *
747 * @return the {@link Paragraph}
748 */
749 private Paragraph makeParagraph(URL source, String line) {
750 URL image = null;
751 if (line.startsWith("[") && line.endsWith("]")) {
752 image = getImageUrl(this, source,
753 line.substring(1, line.length() - 1).trim());
754 }
755
756 if (image != null) {
757 return new Paragraph(image);
758 } else {
759 return processPara(line);
760 }
761 }
762
763 /**
764 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
765 * those {@link Paragraph}s.
766 * <p>
767 * The resulting list will not contain a starting or trailing blank/break
768 * nor 2 blanks or breaks following each other.
769 *
770 * @param paras
771 * the list of {@link Paragraph}s to fix
772 */
773 protected void fixBlanksBreaks(List<Paragraph> paras) {
774 boolean space = false;
775 boolean brk = true;
776 for (int i = 0; i < paras.size(); i++) {
777 Paragraph para = paras.get(i);
778 boolean thisSpace = para.getType() == ParagraphType.BLANK;
779 boolean thisBrk = para.getType() == ParagraphType.BREAK;
780
781 if (i > 0 && space && thisBrk) {
782 paras.remove(i - 1);
783 i--;
784 } else if ((space || brk) && (thisSpace || thisBrk)) {
785 paras.remove(i);
786 i--;
787 }
788
789 space = thisSpace;
790 brk = thisBrk;
791 }
792
793 // Remove blank/brk at start
794 if (paras.size() > 0
795 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
796 0).getType() == ParagraphType.BREAK)) {
797 paras.remove(0);
798 }
799
800 // Remove blank/brk at end
801 int last = paras.size() - 1;
802 if (paras.size() > 0
803 && (paras.get(last).getType() == ParagraphType.BLANK || paras
804 .get(last).getType() == ParagraphType.BREAK)) {
805 paras.remove(last);
806 }
807 }
808
809 /**
810 * Get the default cover related to this subject (see <tt>.info</tt> files).
811 *
812 * @param subject
813 * the subject
814 *
815 * @return the cover if any, or NULL
816 */
817 static BufferedImage getDefaultCover(String subject) {
818 if (subject != null && !subject.isEmpty()
819 && Instance.getCoverDir() != null) {
820 try {
821 File fileCover = new File(Instance.getCoverDir(), subject);
822 return getImage(null, fileCover.toURI().toURL(), subject);
823 } catch (MalformedURLException e) {
824 }
825 }
826
827 return null;
828 }
829
830 /**
831 * Return the list of supported image extensions.
832 *
833 * @param emptyAllowed
834 * TRUE to allow an empty extension on first place, which can be
835 * used when you may already have an extension in your input but
836 * are not sure about it
837 *
838 * @return the extensions
839 */
840 static String[] getImageExt(boolean emptyAllowed) {
841 if (emptyAllowed) {
842 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
843 } else {
844 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
845 }
846 }
847
848 /**
849 * Check if the given resource can be a local image or a remote image, then
850 * refresh the cache with it if it is.
851 *
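 * @param support
 *            the support forwarded to
 *            {@link BasicSupport#getImageUrl(BasicSupport, URL, String)}
 *            (can be NULL)
 * @param source
 *            the story source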
854 * @param line
855 * the resource to check
856 *
857 * @return the image if found, or NULL
858 *
859 */
860 static BufferedImage getImage(BasicSupport support, URL source, String line) {
861 URL url = getImageUrl(support, source, line);
862 if (url != null) {
863 InputStream in = null;
864 try {
865 in = Instance.getCache().open(url, getSupport(url), true);
866 return IOUtils.toImage(in);
867 } catch (IOException e) {
868 } finally {
869 if (in != null) {
870 try {
871 in.close();
872 } catch (IOException e) {
873 }
874 }
875 }
876 }
877
878 return null;
879 }
880
881 /**
882 * Check if the given resource can be a local image or a remote image, then
883 * refresh the cache with it if it is.
884 *
 * @param support
 *            the support given to the cache when the image is refreshed
 *            (can be NULL)
 * @param source
 *            the story source
887 * @param line
888 * the resource to check
889 *
890 * @return the image URL if found, or NULL
891 *
892 */
893 static URL getImageUrl(BasicSupport support, URL source, String line) {
894 URL url = null;
895
896 if (line != null) {
897 // try for files
898 if (source != null) {
899 try {
900
901 String relPath = null;
902 String absPath = null;
903 try {
904 String path = new File(source.getFile()).getParent();
905 relPath = new File(new File(path), line.trim())
906 .getAbsolutePath();
907 } catch (Exception e) {
908 // Cannot be converted to path (one possibility to take
909 // into account: absolute path on Windows)
910 }
911 try {
912 absPath = new File(line.trim()).getAbsolutePath();
913 } catch (Exception e) {
914 // Cannot be converted to path (at all)
915 }
916
917 for (String ext : getImageExt(true)) {
918 if (absPath != null && new File(absPath + ext).exists()) {
919 url = new File(absPath + ext).toURI().toURL();
920 } else if (relPath != null
921 && new File(relPath + ext).exists()) {
922 url = new File(relPath + ext).toURI().toURL();
923 }
924 }
925 } catch (Exception e) {
926 // Should not happen since we control the correct arguments
927 }
928 }
929
930 if (url == null) {
931 // try for URLs
932 try {
933 for (String ext : getImageExt(true)) {
934 if (Instance.getCache().check(new URL(line + ext))) {
935 url = new URL(line + ext);
936 break;
937 }
938 }
939
940 // try out of cache
941 if (url == null) {
942 for (String ext : getImageExt(true)) {
943 try {
944 url = new URL(line + ext);
945 Instance.getCache().refresh(url, support, true);
946 break;
947 } catch (IOException e) {
948 // no image with this ext
949 url = null;
950 }
951 }
952 }
953 } catch (MalformedURLException e) {
954 // Not an url
955 }
956 }
957
958 // refresh the cached file
959 if (url != null) {
960 try {
961 Instance.getCache().refresh(url, support, true);
962 } catch (IOException e) {
963 // woops, broken image
964 url = null;
965 }
966 }
967 }
968
969 return url;
970 }
971
972 /**
973 * Open the input file that will be used through the support.
974 *
975 * @param source
976 * the source {@link URL}
977 *
978 * @return the {@link InputStream}
979 *
980 * @throws IOException
981 * in case of I/O error
982 */
983 protected InputStream openInput(URL source) throws IOException {
984 return Instance.getCache().open(source, this, false);
985 }
986
987 /**
988 * Reset the given {@link InputStream} and return it.
989 *
990 * @param in
991 * the {@link InputStream} to reset
992 *
993 * @return the same {@link InputStream} after reset
994 */
995 protected InputStream reset(InputStream in) {
996 try {
997 in.reset();
998 } catch (IOException e) {
999 }
1000 return in;
1001 }
1002
1003 /**
1004 * Reset then return {@link BasicSupport#in}.
1005 *
1006 * @return {@link BasicSupport#in}
1007 */
1008 protected InputStream getInput() {
1009 return reset(in);
1010 }
1011
1012 /**
1013 * Fix the author name if it is prefixed with some "by" {@link String}.
1014 *
1015 * @param author
1016 * the author with a possible prefix
1017 *
1018 * @return the author without prefixes
1019 */
1020 protected String fixAuthor(String author) {
1021 if (author != null) {
1022 for (String suffix : new String[] { " ", ":" }) {
1023 for (String byString : Instance.getConfig()
1024 .getString(Config.BYS).split(",")) {
1025 byString += suffix;
1026 if (author.toUpperCase().startsWith(byString.toUpperCase())) {
1027 author = author.substring(byString.length()).trim();
1028 }
1029 }
1030 }
1031
1032 // Special case (without suffix):
1033 if (author.startsWith("©")) {
1034 author = author.substring(1);
1035 }
1036 }
1037
1038 return author;
1039 }
1040
1041 /**
1042 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
1043 * and requotify them (i.e., separate them into QUOTE paragraphs and other
1044 * paragraphs (quotes or not)).
1045 *
1046 * @param para
1047 * the paragraph to requotify (not necessarily a quote)
1048 *
1049 * @return the correctly (or so we hope) quotified paragraphs
1050 */
1051 protected List<Paragraph> requotify(Paragraph para) {
1052 List<Paragraph> newParas = new ArrayList<Paragraph>();
1053
1054 if (para.getType() == ParagraphType.QUOTE
1055 && para.getContent().length() > 2) {
1056 String line = para.getContent();
1057 boolean singleQ = line.startsWith("" + openQuote);
1058 boolean doubleQ = line.startsWith("" + openDoubleQuote);
1059
1060 // Do not try when more than one quote at a time
1061 // (some stories are not easily readable if we do)
1062 if (singleQ
1063 && line.indexOf(closeQuote, 1) < line
1064 .lastIndexOf(closeQuote)) {
1065 newParas.add(para);
1066 return newParas;
1067 }
1068 if (doubleQ
1069 && line.indexOf(closeDoubleQuote, 1) < line
1070 .lastIndexOf(closeDoubleQuote)) {
1071 newParas.add(para);
1072 return newParas;
1073 }
1074 //
1075
1076 if (!singleQ && !doubleQ) {
1077 line = openDoubleQuote + line + closeDoubleQuote;
1078 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
1079 .getWords()));
1080 } else {
1081 char open = singleQ ? openQuote : openDoubleQuote;
1082 char close = singleQ ? closeQuote : closeDoubleQuote;
1083
1084 int posDot = -1;
1085 boolean inQuote = false;
1086 int i = 0;
1087 for (char car : line.toCharArray()) {
1088 if (car == open) {
1089 inQuote = true;
1090 } else if (car == close) {
1091 inQuote = false;
1092 } else if (car == '.' && !inQuote) {
1093 posDot = i;
1094 break;
1095 }
1096 i++;
1097 }
1098
1099 if (posDot >= 0) {
1100 String rest = line.substring(posDot + 1).trim();
1101 line = line.substring(0, posDot + 1).trim();
1102 long words = 1;
1103 for (char car : line.toCharArray()) {
1104 if (car == ' ') {
1105 words++;
1106 }
1107 }
1108 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
1109 if (!rest.isEmpty()) {
1110 newParas.addAll(requotify(processPara(rest)));
1111 }
1112 } else {
1113 newParas.add(para);
1114 }
1115 }
1116 } else {
1117 newParas.add(para);
1118 }
1119
1120 return newParas;
1121 }
1122
1123 /**
1124 * Process a {@link Paragraph} from a raw line of text.
1125 * <p>
1126 * Will also fix quotes and HTML encoding if needed.
1127 *
1128 * @param line
1129 * the raw line
1130 *
1131 * @return the processed {@link Paragraph}
1132 */
1133 protected Paragraph processPara(String line) {
1134 line = ifUnhtml(line).trim();
1135
1136 boolean space = true;
1137 boolean brk = true;
1138 boolean quote = false;
1139 boolean tentativeCloseQuote = false;
1140 char prev = '\0';
1141 int dashCount = 0;
1142 long words = 1;
1143
1144 StringBuilder builder = new StringBuilder();
1145 for (char car : line.toCharArray()) {
1146 if (car != '-') {
1147 if (dashCount > 0) {
1148 // dash, ndash and mdash: - – —
1149 // currently: always use mdash
1150 builder.append(dashCount == 1 ? '-' : '—');
1151 }
1152 dashCount = 0;
1153 }
1154
1155 if (tentativeCloseQuote) {
1156 tentativeCloseQuote = false;
1157 if (Character.isLetterOrDigit(car)) {
1158 builder.append("'");
1159 } else {
1160 // handle double-single quotes as double quotes
1161 if (prev == car) {
1162 builder.append(closeDoubleQuote);
1163 continue;
1164 } else {
1165 builder.append(closeQuote);
1166 }
1167 }
1168 }
1169
1170 switch (car) {
1171 case ' ': // note: unbreakable space
1172 case ' ':
1173 case '\t':
1174 case '\n': // just in case
1175 case '\r': // just in case
1176 if (builder.length() > 0
1177 && builder.charAt(builder.length() - 1) != ' ') {
1178 words++;
1179 }
1180 builder.append(' ');
1181 break;
1182
1183 case '\'':
1184 if (space || (brk && quote)) {
1185 quote = true;
1186 // handle double-single quotes as double quotes
1187 if (prev == car) {
1188 builder.deleteCharAt(builder.length() - 1);
1189 builder.append(openDoubleQuote);
1190 } else {
1191 builder.append(openQuote);
1192 }
1193 } else if (prev == ' ' || prev == car) {
1194 // handle double-single quotes as double quotes
1195 if (prev == car) {
1196 builder.deleteCharAt(builder.length() - 1);
1197 builder.append(openDoubleQuote);
1198 } else {
1199 builder.append(openQuote);
1200 }
1201 } else {
1202 // it is a quote ("I'm off") or a 'quote' ("This
1203 // 'good' restaurant"...)
1204 tentativeCloseQuote = true;
1205 }
1206 break;
1207
1208 case '"':
1209 if (space || (brk && quote)) {
1210 quote = true;
1211 builder.append(openDoubleQuote);
1212 } else if (prev == ' ') {
1213 builder.append(openDoubleQuote);
1214 } else {
1215 builder.append(closeDoubleQuote);
1216 }
1217 break;
1218
1219 case '-':
1220 if (space) {
1221 quote = true;
1222 } else {
1223 dashCount++;
1224 }
1225 space = false;
1226 break;
1227
1228 case '*':
1229 case '~':
1230 case '/':
1231 case '\\':
1232 case '<':
1233 case '>':
1234 case '=':
1235 case '+':
1236 case '_':
1237 case '–':
1238 case '—':
1239 space = false;
1240 builder.append(car);
1241 break;
1242
1243 case '‘':
1244 case '`':
1245 case '‹':
1246 case '﹁':
1247 case '〈':
1248 case '「':
1249 if (space || (brk && quote)) {
1250 quote = true;
1251 builder.append(openQuote);
1252 } else {
1253 // handle double-single quotes as double quotes
1254 if (prev == car) {
1255 builder.deleteCharAt(builder.length() - 1);
1256 builder.append(openDoubleQuote);
1257 } else {
1258 builder.append(openQuote);
1259 }
1260 }
1261 space = false;
1262 brk = false;
1263 break;
1264
1265 case '’':
1266 case '›':
1267 case '﹂':
1268 case '〉':
1269 case '」':
1270 space = false;
1271 brk = false;
1272 // handle double-single quotes as double quotes
1273 if (prev == car) {
1274 builder.deleteCharAt(builder.length() - 1);
1275 builder.append(closeDoubleQuote);
1276 } else {
1277 builder.append(closeQuote);
1278 }
1279 break;
1280
1281 case '«':
1282 case '“':
1283 case '﹃':
1284 case '《':
1285 case '『':
1286 if (space || (brk && quote)) {
1287 quote = true;
1288 builder.append(openDoubleQuote);
1289 } else {
1290 builder.append(openDoubleQuote);
1291 }
1292 space = false;
1293 brk = false;
1294 break;
1295
1296 case '»':
1297 case '”':
1298 case '﹄':
1299 case '》':
1300 case '』':
1301 space = false;
1302 brk = false;
1303 builder.append(closeDoubleQuote);
1304 break;
1305
1306 default:
1307 space = false;
1308 brk = false;
1309 builder.append(car);
1310 break;
1311 }
1312
1313 prev = car;
1314 }
1315
1316 if (tentativeCloseQuote) {
1317 tentativeCloseQuote = false;
1318 builder.append(closeQuote);
1319 }
1320
1321 line = builder.toString().trim();
1322
1323 ParagraphType type = ParagraphType.NORMAL;
1324 if (space) {
1325 type = ParagraphType.BLANK;
1326 } else if (brk) {
1327 type = ParagraphType.BREAK;
1328 } else if (quote) {
1329 type = ParagraphType.QUOTE;
1330 }
1331
1332 return new Paragraph(type, line, words);
1333 }
1334
1335 /**
1336 * Remove the HTML from the input <b>if</b> {@link BasicSupport#isHtml()} is
1337 * true.
1338 *
1339 * @param input
1340 * the input
1341 *
1342 * @return the no html version if needed
1343 */
1344 private String ifUnhtml(String input) {
1345 if (isHtml() && input != null) {
1346 return StringUtils.unhtml(input);
1347 }
1348
1349 return input;
1350 }
1351
1352 /**
1353 * Return a {@link BasicSupport} implementation supporting the given
1354 * resource if possible.
1355 *
1356 * @param url
1357 * the story resource
1358 *
1359 * @return an implementation that supports it, or NULL
1360 */
1361 public static BasicSupport getSupport(URL url) {
1362 if (url == null) {
1363 return null;
1364 }
1365
1366 // TEXT and INFO_TEXT always support files (not URLs though)
1367 for (SupportType type : SupportType.values()) {
1368 if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
1369 BasicSupport support = getSupport(type);
1370 if (support != null && support.supports(url)) {
1371 return support;
1372 }
1373 }
1374 }
1375
1376 for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
1377 SupportType.TEXT }) {
1378 BasicSupport support = getSupport(type);
1379 if (support != null && support.supports(url)) {
1380 return support;
1381 }
1382 }
1383
1384 return null;
1385 }
1386
1387 /**
1388 * Return a {@link BasicSupport} implementation supporting the given type.
1389 *
1390 * @param type
1391 * the type
1392 *
1393 * @return an implementation that supports it, or NULL
1394 */
1395 public static BasicSupport getSupport(SupportType type) {
1396 switch (type) {
1397 case EPUB:
1398 return new Epub().setType(type);
1399 case INFO_TEXT:
1400 return new InfoText().setType(type);
1401 case FIMFICTION:
1402 return new Fimfiction().setType(type);
1403 case FANFICTION:
1404 return new Fanfiction().setType(type);
1405 case TEXT:
1406 return new Text().setType(type);
1407 case MANGAFOX:
1408 return new MangaFox().setType(type);
1409 case E621:
1410 return new E621().setType(type);
1411 case YIFFSTAR:
1412 return new YiffStar().setType(type);
1413 case E_HENTAI:
1414 return new EHentai().setType(type);
1415 case CBZ:
1416 return new Cbz().setType(type);
1417 case HTML:
1418 return new Html().setType(type);
1419 }
1420
1421 return null;
1422 }
1423
1424 /**
1425 * Return the first line from the given input which correspond to the given
1426 * selectors.
1427 *
1428 * @param in
1429 * the input
1430 * @param needle
1431 * a string that must be found inside the target line (also
1432 * supports "^" at start to say "only if it starts with" the
1433 * needle)
1434 * @param relativeLine
1435 * the line to return based upon the target line position (-1 =
1436 * the line before, 0 = the target line...)
1437 *
1438 * @return the line
1439 */
1440 static String getLine(InputStream in, String needle, int relativeLine) {
1441 return getLine(in, needle, relativeLine, true);
1442 }
1443
1444 /**
1445 * Return a line from the given input which correspond to the given
1446 * selectors.
1447 *
1448 * @param in
1449 * the input
1450 * @param needle
1451 * a string that must be found inside the target line (also
1452 * supports "^" at start to say "only if it starts with" the
1453 * needle)
1454 * @param relativeLine
1455 * the line to return based upon the target line position (-1 =
1456 * the line before, 0 = the target line...)
1457 * @param first
1458 * takes the first result (as opposed to the last one, which will
1459 * also always spend the input)
1460 *
1461 * @return the line
1462 */
1463 static String getLine(InputStream in, String needle, int relativeLine,
1464 boolean first) {
1465 String rep = null;
1466
1467 try {
1468 in.reset();
1469 } catch (IOException e) {
1470 Instance.syserr(e);
1471 }
1472
1473 List<String> lines = new ArrayList<String>();
1474 @SuppressWarnings("resource")
1475 Scanner scan = new Scanner(in, "UTF-8");
1476 int index = -1;
1477 scan.useDelimiter("\\n");
1478 while (scan.hasNext()) {
1479 lines.add(scan.next());
1480
1481 if (index == -1) {
1482 if (needle.startsWith("^")) {
1483 if (lines.get(lines.size() - 1).startsWith(
1484 needle.substring(1))) {
1485 index = lines.size() - 1;
1486 }
1487
1488 } else {
1489 if (lines.get(lines.size() - 1).contains(needle)) {
1490 index = lines.size() - 1;
1491 }
1492 }
1493 }
1494
1495 if (index >= 0 && index + relativeLine < lines.size()) {
1496 rep = lines.get(index + relativeLine);
1497 if (first) {
1498 break;
1499 }
1500 }
1501 }
1502
1503 return rep;
1504 }
1505
1506 /**
1507 * Return the text between the key and the endKey (and optional subKey can
1508 * be passed, in this case we will look for the key first, then take the
1509 * text between the subKey and the endKey).
1510 * <p>
1511 * Will only match the first line with the given key if more than one are
1512 * possible. Which also means that if the subKey or endKey is not found on
1513 * that line, NULL will be returned.
1514 *
1515 * @param in
1516 * the input
1517 * @param key
1518 * the key to match (also supports "^" at start to say
1519 * "only if it starts with" the key)
1520 * @param subKey
1521 * the sub key or NULL if none
1522 * @param endKey
1523 * the end key or NULL for "up to the end"
1524 * @return the text or NULL if not found
1525 */
1526 static String getKeyLine(InputStream in, String key, String subKey,
1527 String endKey) {
1528 String result = null;
1529
1530 String line = getLine(in, key, 0);
1531 if (line != null && line.contains(key)) {
1532 line = line.substring(line.indexOf(key) + key.length());
1533 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1534 if (subKey != null) {
1535 line = line.substring(line.indexOf(subKey)
1536 + subKey.length());
1537 }
1538 if (endKey == null || line.contains(endKey)) {
1539 if (endKey != null) {
1540 line = line.substring(0, line.indexOf(endKey));
1541 result = line;
1542 }
1543 }
1544 }
1545 }
1546
1547 return result;
1548 }
1549 }