Fix: "Chapter 5: - Fun!" -> "Chapter 5: Fun!"
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.awt.image.BufferedImage;
4 import java.io.BufferedReader;
5 import java.io.ByteArrayInputStream;
6 import java.io.File;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10 import java.net.MalformedURLException;
11 import java.net.URL;
12 import java.util.ArrayList;
13 import java.util.Date;
14 import java.util.HashMap;
15 import java.util.List;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Scanner;
19
20 import be.nikiroo.fanfix.Instance;
21 import be.nikiroo.fanfix.bundles.Config;
22 import be.nikiroo.fanfix.bundles.StringId;
23 import be.nikiroo.fanfix.data.Chapter;
24 import be.nikiroo.fanfix.data.MetaData;
25 import be.nikiroo.fanfix.data.Paragraph;
26 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
27 import be.nikiroo.fanfix.data.Story;
28 import be.nikiroo.utils.IOUtils;
29 import be.nikiroo.utils.Progress;
30 import be.nikiroo.utils.StringUtils;
31
32 /**
33 * This class is the base class used by the other support classes. It can be
34 * used outside of this package, and have static method that you can use to get
35 * access to the correct support class.
36 * <p>
37 * It will be used with 'resources' (usually web pages or files).
38 *
39 * @author niki
40 */
41 public abstract class BasicSupport {
42 /**
43 * The supported input types for which we can get a {@link BasicSupport}
44 * object.
45 *
46 * @author niki
47 */
48 public enum SupportType {
49 /** EPUB files created with this program */
50 EPUB,
51 /** Pure text file with some rules */
52 TEXT,
53 /** TEXT but with associated .info file */
54 INFO_TEXT,
55 /** My Little Pony fanfictions */
56 FIMFICTION,
57 /** Fanfictions from a lot of different universes */
58 FANFICTION,
59 /** Website with lots of Mangas */
60 MANGAFOX,
61 /** Furry website with comics support */
62 E621,
63 /** Furry website with stories */
64 YIFFSTAR,
65 /** Comics and images groups, mostly but not only NSFW */
66 E_HENTAI,
67 /** CBZ files */
68 CBZ,
69 /** HTML files */
70 HTML;
71
72 /**
73 * A description of this support type (more information than the
74 * {@link BasicSupport#getSourceName()}).
75 *
76 * @return the description
77 */
78 public String getDesc() {
79 String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
80 this.name());
81
82 if (desc == null) {
83 desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
84 }
85
86 return desc;
87 }
88
89 /**
90 * The name of this support type (a short version).
91 *
92 * @return the name
93 */
94 public String getSourceName() {
95 BasicSupport support = BasicSupport.getSupport(this);
96 if (support != null) {
97 return support.getSourceName();
98 }
99
100 return null;
101 }
102
103 @Override
104 public String toString() {
105 return super.toString().toLowerCase();
106 }
107
108 /**
109 * Call {@link SupportType#valueOf(String.toUpperCase())}.
110 *
111 * @param typeName
112 * the possible type name
113 *
114 * @return NULL or the type
115 */
116 public static SupportType valueOfUC(String typeName) {
117 return SupportType.valueOf(typeName == null ? null : typeName
118 .toUpperCase());
119 }
120
121 /**
122 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
123 * NULL for NULL instead of raising exception.
124 *
125 * @param typeName
126 * the possible type name
127 *
128 * @return NULL or the type
129 */
130 public static SupportType valueOfNullOkUC(String typeName) {
131 if (typeName == null) {
132 return null;
133 }
134
135 return SupportType.valueOfUC(typeName);
136 }
137
138 /**
139 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
140 * NULL in case of error instead of raising an exception.
141 *
142 * @param typeName
143 * the possible type name
144 *
145 * @return NULL or the type
146 */
147 public static SupportType valueOfAllOkUC(String typeName) {
148 try {
149 return SupportType.valueOfUC(typeName);
150 } catch (Exception e) {
151 return null;
152 }
153 }
154 }
155
// The main resource currently being processed (opened by processMeta(),
// handed out -- after a reset -- by getInput())
private InputStream in;
// The type of this support (see setType() / getType())
private SupportType type;
private URL currentReferer; // with only one 'r', as in 'HTTP'...

// quote chars (read once from the translation bundle, used when
// normalising the quotes of a paragraph)
private char openQuote = Instance.getTrans().getCharacter(
StringId.OPEN_SINGLE_QUOTE);
private char closeQuote = Instance.getTrans().getCharacter(
StringId.CLOSE_SINGLE_QUOTE);
private char openDoubleQuote = Instance.getTrans().getCharacter(
StringId.OPEN_DOUBLE_QUOTE);
private char closeDoubleQuote = Instance.getTrans().getCharacter(
StringId.CLOSE_DOUBLE_QUOTE);
169
/**
 * The name of this support class.
 *
 * @return the name
 */
protected abstract String getSourceName();

/**
 * Check if the given resource is supported by this {@link BasicSupport}.
 *
 * @param url
 *            the resource to check for
 *
 * @return TRUE if it is
 */
protected abstract boolean supports(URL url);

/**
 * Return TRUE if the support will return HTML encoded content values for
 * the chapters content.
 *
 * @return TRUE for HTML
 */
protected abstract boolean isHtml();

/**
 * Return the metadata of the story.
 * <p>
 * The caller completes a missing creation date or cover by itself, so the
 * implementation is free to leave them unset.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the metadata
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
throws IOException;

/**
 * Return the story description.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
throws IOException;

/**
 * Return the list of chapters (name and resource).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 * @param pg
 *            the optional progress reporter
 *
 * @return the chapters (NULL is accepted by the caller and means "no
 *         chapter")
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
InputStream in, Progress pg) throws IOException;

/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport#isHtml()} is TRUE).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the resource of the chapter)
 * @param number
 *            the chapter number
 * @param pg
 *            the optional progress reporter
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
int number, Progress pg) throws IOException;
252
253 /**
254 * Log into the support (can be a no-op depending upon the support).
255 *
256 * @throws IOException
257 * in case of I/O error
258 */
259 public void login() throws IOException {
260
261 }
262
263 /**
264 * Return the list of cookies (values included) that must be used to
265 * correctly fetch the resources.
266 * <p>
267 * You are expected to call the super method implementation if you override
268 * it.
269 *
270 * @return the cookies
271 *
272 * @throws IOException
273 * in case of I/O error
274 */
275 public Map<String, String> getCookies() throws IOException {
276 return new HashMap<String, String>();
277 }
278
279 /**
280 * Return the canonical form of the main {@link URL}.
281 *
282 * @param source
283 * the source {@link URL}
284 *
285 * @return the canonical form of this {@link URL}
286 *
287 * @throws IOException
288 * in case of I/O error
289 */
290 public URL getCanonicalUrl(URL source) throws IOException {
291 return source;
292 }
293
294 /**
295 * Process the given story resource into a partially filled {@link Story}
296 * object containing the name and metadata, except for the description.
297 *
298 * @param url
299 * the story resource
300 *
301 * @return the {@link Story}
302 *
303 * @throws IOException
304 * in case of I/O error
305 */
306 public Story processMeta(URL url) throws IOException {
307 return processMeta(url, true, false, null);
308 }
309
310 /**
311 * Process the given story resource into a partially filled {@link Story}
312 * object containing the name and metadata.
313 *
314 * @param url
315 * the story resource
316 *
317 * @param close
318 * close "this" and "in" when done
319 * @param pg
320 * the optional progress reporter
321 *
322 * @return the {@link Story}
323 *
324 * @throws IOException
325 * in case of I/O error
326 */
327 protected Story processMeta(URL url, boolean close, boolean getDesc,
328 Progress pg) throws IOException {
329 if (pg == null) {
330 pg = new Progress();
331 } else {
332 pg.setMinMax(0, 100);
333 }
334
335 login();
336 pg.setProgress(10);
337
338 url = getCanonicalUrl(url);
339
340 setCurrentReferer(url);
341
342 in = openInput(url);
343 if (in == null) {
344 return null;
345 }
346
347 try {
348 preprocess(url, getInput());
349 pg.setProgress(30);
350
351 Story story = new Story();
352 MetaData meta = getMeta(url, getInput());
353 if (meta.getCreationDate() == null
354 || meta.getCreationDate().isEmpty()) {
355 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
356 }
357 story.setMeta(meta);
358
359 pg.setProgress(50);
360
361 if (meta != null && meta.getCover() == null) {
362 meta.setCover(getDefaultCover(meta.getSubject()));
363 }
364
365 pg.setProgress(60);
366
367 if (getDesc) {
368 String descChapterName = Instance.getTrans().getString(
369 StringId.DESCRIPTION);
370 story.getMeta().setResume(
371 makeChapter(url, 0, descChapterName,
372 getDesc(url, getInput()), null));
373 }
374
375 pg.setProgress(100);
376 return story;
377 } finally {
378 if (close) {
379 try {
380 close();
381 } catch (IOException e) {
382 Instance.syserr(e);
383 }
384
385 if (in != null) {
386 in.close();
387 }
388 }
389
390 setCurrentReferer(null);
391 }
392 }
393
394 /**
395 * Process the given story resource into a fully filled {@link Story}
396 * object.
397 *
398 * @param url
399 * the story resource
400 * @param pg
401 * the optional progress reporter
402 *
403 * @return the {@link Story}
404 *
405 * @throws IOException
406 * in case of I/O error
407 */
408 public Story process(URL url, Progress pg) throws IOException {
409 if (pg == null) {
410 pg = new Progress();
411 } else {
412 pg.setMinMax(0, 100);
413 }
414
415 url = getCanonicalUrl(url);
416 pg.setProgress(1);
417 try {
418 Progress pgMeta = new Progress();
419 pg.addProgress(pgMeta, 10);
420 Story story = processMeta(url, false, true, pgMeta);
421 if (!pgMeta.isDone()) {
422 pgMeta.setProgress(pgMeta.getMax()); // 10%
423 }
424
425 if (story == null) {
426 pg.setProgress(90);
427 return null;
428 }
429
430 pg.setName("Retrieving " + story.getMeta().getTitle());
431
432 setCurrentReferer(url);
433
434 Progress pgGetChapters = new Progress();
435 pg.addProgress(pgGetChapters, 10);
436 story.setChapters(new ArrayList<Chapter>());
437 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
438 pgGetChapters);
439 if (!pgGetChapters.isDone()) {
440 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
441 }
442
443 if (chapters != null) {
444 Progress pgChaps = new Progress("Extracting chapters", 0,
445 chapters.size() * 300);
446 pg.addProgress(pgChaps, 80);
447
448 long words = 0;
449 int i = 1;
450 for (Entry<String, URL> chap : chapters) {
451 pgChaps.setName("Extracting chapter " + i);
452 setCurrentReferer(chap.getValue());
453 InputStream chapIn = Instance.getCache().open(
454 chap.getValue(), this, true);
455 pgChaps.setProgress(i * 100);
456 try {
457 Progress pgGetChapterContent = new Progress();
458 Progress pgMakeChapter = new Progress();
459 pgChaps.addProgress(pgGetChapterContent, 100);
460 pgChaps.addProgress(pgMakeChapter, 100);
461
462 String content = getChapterContent(url, chapIn, i,
463 pgGetChapterContent);
464 if (!pgGetChapterContent.isDone()) {
465 pgGetChapterContent.setProgress(pgGetChapterContent
466 .getMax());
467 }
468
469 Chapter cc = makeChapter(url, i, chap.getKey(),
470 content, pgMakeChapter);
471 if (!pgMakeChapter.isDone()) {
472 pgMakeChapter.setProgress(pgMakeChapter.getMax());
473 }
474
475 words += cc.getWords();
476 story.getChapters().add(cc);
477 if (story.getMeta() != null) {
478 story.getMeta().setWords(words);
479 }
480 } finally {
481 chapIn.close();
482 }
483
484 i++;
485 }
486
487 pgChaps.setName("Extracting chapters");
488 } else {
489 pg.setProgress(80);
490 }
491
492 return story;
493
494 } finally {
495 try {
496 close();
497 } catch (IOException e) {
498 Instance.syserr(e);
499 }
500
501 if (in != null) {
502 in.close();
503 }
504
505 setCurrentReferer(null);
506 }
507 }
508
509 /**
510 * The support type.
511 *
512 * @return the type
513 */
514 public SupportType getType() {
515 return type;
516 }
517
518 /**
519 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
520 * the current {@link URL} we work on.
521 *
522 * @return the referer
523 */
524 public URL getCurrentReferer() {
525 return currentReferer;
526 }
527
528 /**
529 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
530 * the current {@link URL} we work on.
531 *
532 * @param currentReferer
533 * the new referer
534 */
535 protected void setCurrentReferer(URL currentReferer) {
536 this.currentReferer = currentReferer;
537 }
538
539 /**
540 * The support type.
541 *
542 * @param type
543 * the new type
544 *
545 * @return this
546 */
547 protected BasicSupport setType(SupportType type) {
548 this.type = type;
549 return this;
550 }
551
552 /**
553 * Prepare the support if needed before processing.
554 *
555 * @param source
556 * the source of the story
557 * @param in
558 * the input (the main resource)
559 *
560 * @throws IOException
561 * on I/O error
562 */
563 protected void preprocess(URL source, InputStream in) throws IOException {
564 }
565
566 /**
567 * Now that we have processed the {@link Story}, close the resources if any.
568 *
569 * @throws IOException
570 * on I/O error
571 */
572 protected void close() throws IOException {
573 }
574
575 /**
576 * Create a {@link Chapter} object from the given information, formatting
577 * the content as it should be.
578 *
579 * @param number
580 * the chapter number
581 * @param name
582 * the chapter name
583 * @param content
584 * the chapter content
585 * @param pg
586 * the optional progress reporter
587 *
588 * @return the {@link Chapter}
589 *
590 * @throws IOException
591 * in case of I/O error
592 */
593 protected Chapter makeChapter(URL source, int number, String name,
594 String content, Progress pg) throws IOException {
595 // Chapter name: process it correctly, then remove the possible
596 // redundant "Chapter x: " in front of it, or "-" (as in
597 // "Chapter 5: - Fun!" after the ": " was automatically added)
598 String chapterName = processPara(name).getContent().trim();
599 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
600 .split(",")) {
601 String chapterWord = Instance.getConfig().getStringX(
602 Config.CHAPTER, lang);
603 if (chapterName.startsWith(chapterWord)) {
604 chapterName = chapterName.substring(chapterWord.length())
605 .trim();
606 break;
607 }
608 }
609
610 if (chapterName.startsWith(Integer.toString(number))) {
611 chapterName = chapterName.substring(
612 Integer.toString(number).length()).trim();
613 }
614
615 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
616 chapterName = chapterName.substring(1).trim();
617 }
618 //
619
620 Chapter chap = new Chapter(number, chapterName);
621
622 if (content != null) {
623 List<Paragraph> paras = makeParagraphs(source, content, pg);
624 long words = 0;
625 for (Paragraph para : paras) {
626 words += para.getWords();
627 }
628 chap.setParagraphs(paras);
629 chap.setWords(words);
630 }
631
632 return chap;
633
634 }
635
636 /**
637 * Convert the given content into {@link Paragraph}s.
638 *
639 * @param source
640 * the source URL of the story
641 * @param content
642 * the textual content
643 * @param pg
644 * the optional progress reporter
645 *
646 * @return the {@link Paragraph}s
647 *
648 * @throws IOException
649 * in case of I/O error
650 */
651 protected List<Paragraph> makeParagraphs(URL source, String content,
652 Progress pg) throws IOException {
653 if (pg == null) {
654 pg = new Progress();
655 }
656
657 if (isHtml()) {
658 // Special <HR> processing:
659 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
660 "<br/>* * *<br/>");
661 }
662
663 List<Paragraph> paras = new ArrayList<Paragraph>();
664
665 if (content != null && !content.trim().isEmpty()) {
666 if (isHtml()) {
667 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
668 pg.setMinMax(0, tab.length);
669 int i = 1;
670 for (String line : tab) {
671 if (line.startsWith("[") && line.endsWith("]")) {
672 pg.setName("Extracting image " + i);
673 }
674 paras.add(makeParagraph(source, line.trim()));
675 pg.setProgress(i++);
676 }
677 pg.setName(null);
678 } else {
679 List<String> lines = new ArrayList<String>();
680 BufferedReader buff = null;
681 try {
682 buff = new BufferedReader(
683 new InputStreamReader(new ByteArrayInputStream(
684 content.getBytes("UTF-8")), "UTF-8"));
685 for (String line = buff.readLine(); line != null; line = buff
686 .readLine()) {
687 lines.add(line.trim());
688 }
689 } finally {
690 if (buff != null) {
691 buff.close();
692 }
693 }
694
695 pg.setMinMax(0, lines.size());
696 int i = 0;
697 for (String line : lines) {
698 if (line.startsWith("[") && line.endsWith("]")) {
699 pg.setName("Extracting image " + i);
700 }
701 paras.add(makeParagraph(source, line));
702 pg.setProgress(i++);
703 }
704 pg.setName(null);
705 }
706
707 // Check quotes for "bad" format
708 List<Paragraph> newParas = new ArrayList<Paragraph>();
709 for (Paragraph para : paras) {
710 newParas.addAll(requotify(para));
711 }
712 paras = newParas;
713
714 // Remove double blanks/brks
715 fixBlanksBreaks(paras);
716 }
717
718 return paras;
719 }
720
721 /**
722 * Convert the given line into a single {@link Paragraph}.
723 *
724 * @param source
725 * the source URL of the story
726 * @param line
727 * the textual content of the paragraph
728 *
729 * @return the {@link Paragraph}
730 */
731 private Paragraph makeParagraph(URL source, String line) {
732 URL image = null;
733 if (line.startsWith("[") && line.endsWith("]")) {
734 image = getImageUrl(this, source,
735 line.substring(1, line.length() - 1).trim());
736 }
737
738 if (image != null) {
739 return new Paragraph(image);
740 } else {
741 return processPara(line);
742 }
743 }
744
745 /**
746 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
747 * those {@link Paragraph}s.
748 * <p>
749 * The resulting list will not contain a starting or trailing blank/break
750 * nor 2 blanks or breaks following each other.
751 *
752 * @param paras
753 * the list of {@link Paragraph}s to fix
754 */
755 protected void fixBlanksBreaks(List<Paragraph> paras) {
756 boolean space = false;
757 boolean brk = true;
758 for (int i = 0; i < paras.size(); i++) {
759 Paragraph para = paras.get(i);
760 boolean thisSpace = para.getType() == ParagraphType.BLANK;
761 boolean thisBrk = para.getType() == ParagraphType.BREAK;
762
763 if (i > 0 && space && thisBrk) {
764 paras.remove(i - 1);
765 i--;
766 } else if ((space || brk) && (thisSpace || thisBrk)) {
767 paras.remove(i);
768 i--;
769 }
770
771 space = thisSpace;
772 brk = thisBrk;
773 }
774
775 // Remove blank/brk at start
776 if (paras.size() > 0
777 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
778 0).getType() == ParagraphType.BREAK)) {
779 paras.remove(0);
780 }
781
782 // Remove blank/brk at end
783 int last = paras.size() - 1;
784 if (paras.size() > 0
785 && (paras.get(last).getType() == ParagraphType.BLANK || paras
786 .get(last).getType() == ParagraphType.BREAK)) {
787 paras.remove(last);
788 }
789 }
790
791 /**
792 * Get the default cover related to this subject (see <tt>.info</tt> files).
793 *
794 * @param subject
795 * the subject
796 *
797 * @return the cover if any, or NULL
798 */
799 static BufferedImage getDefaultCover(String subject) {
800 if (subject != null && !subject.isEmpty()
801 && Instance.getCoverDir() != null) {
802 try {
803 File fileCover = new File(Instance.getCoverDir(), subject);
804 return getImage(null, fileCover.toURI().toURL(), subject);
805 } catch (MalformedURLException e) {
806 }
807 }
808
809 return null;
810 }
811
812 /**
813 * Return the list of supported image extensions.
814 *
815 * @param emptyAllowed
816 * TRUE to allow an empty extension on first place, which can be
817 * used when you may already have an extension in your input but
818 * are not sure about it
819 *
820 * @return the extensions
821 */
822 static String[] getImageExt(boolean emptyAllowed) {
823 if (emptyAllowed) {
824 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
825 } else {
826 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
827 }
828 }
829
830 /**
831 * Check if the given resource can be a local image or a remote image, then
832 * refresh the cache with it if it is.
833 *
834 * @param source
835 * the story source
836 * @param line
837 * the resource to check
838 *
839 * @return the image if found, or NULL
840 *
841 */
842 static BufferedImage getImage(BasicSupport support, URL source, String line) {
843 URL url = getImageUrl(support, source, line);
844 if (url != null) {
845 InputStream in = null;
846 try {
847 in = Instance.getCache().open(url, getSupport(url), true);
848 return IOUtils.toImage(in);
849 } catch (IOException e) {
850 } finally {
851 if (in != null) {
852 try {
853 in.close();
854 } catch (IOException e) {
855 }
856 }
857 }
858 }
859
860 return null;
861 }
862
863 /**
864 * Check if the given resource can be a local image or a remote image, then
865 * refresh the cache with it if it is.
866 *
867 * @param source
868 * the story source
869 * @param line
870 * the resource to check
871 *
872 * @return the image URL if found, or NULL
873 *
874 */
875 static URL getImageUrl(BasicSupport support, URL source, String line) {
876 URL url = null;
877
878 if (line != null) {
879 // try for files
880 if (source != null) {
881 try {
882
883 String relPath = null;
884 String absPath = null;
885 try {
886 String path = new File(source.getFile()).getParent();
887 relPath = new File(new File(path), line.trim())
888 .getAbsolutePath();
889 } catch (Exception e) {
890 // Cannot be converted to path (one possibility to take
891 // into account: absolute path on Windows)
892 }
893 try {
894 absPath = new File(line.trim()).getAbsolutePath();
895 } catch (Exception e) {
896 // Cannot be converted to path (at all)
897 }
898
899 for (String ext : getImageExt(true)) {
900 if (absPath != null && new File(absPath + ext).exists()) {
901 url = new File(absPath + ext).toURI().toURL();
902 } else if (relPath != null
903 && new File(relPath + ext).exists()) {
904 url = new File(relPath + ext).toURI().toURL();
905 }
906 }
907 } catch (Exception e) {
908 // Should not happen since we control the correct arguments
909 }
910 }
911
912 if (url == null) {
913 // try for URLs
914 try {
915 for (String ext : getImageExt(true)) {
916 if (Instance.getCache().check(new URL(line + ext))) {
917 url = new URL(line + ext);
918 break;
919 }
920 }
921
922 // try out of cache
923 if (url == null) {
924 for (String ext : getImageExt(true)) {
925 try {
926 url = new URL(line + ext);
927 Instance.getCache().refresh(url, support, true);
928 break;
929 } catch (IOException e) {
930 // no image with this ext
931 url = null;
932 }
933 }
934 }
935 } catch (MalformedURLException e) {
936 // Not an url
937 }
938 }
939
940 // refresh the cached file
941 if (url != null) {
942 try {
943 Instance.getCache().refresh(url, support, true);
944 } catch (IOException e) {
945 // woops, broken image
946 url = null;
947 }
948 }
949 }
950
951 return url;
952 }
953
954 /**
955 * Open the input file that will be used through the support.
956 *
957 * @param source
958 * the source {@link URL}
959 *
960 * @return the {@link InputStream}
961 *
962 * @throws IOException
963 * in case of I/O error
964 */
965 protected InputStream openInput(URL source) throws IOException {
966 return Instance.getCache().open(source, this, false);
967 }
968
969 /**
970 * Reset the given {@link InputStream} and return it.
971 *
972 * @param in
973 * the {@link InputStream} to reset
974 *
975 * @return the same {@link InputStream} after reset
976 */
977 protected InputStream reset(InputStream in) {
978 try {
979 in.reset();
980 } catch (IOException e) {
981 }
982 return in;
983 }
984
985 /**
986 * Reset then return {@link BasicSupport#in}.
987 *
988 * @return {@link BasicSupport#in}
989 */
990 protected InputStream getInput() {
991 return reset(in);
992 }
993
994 /**
995 * Fix the author name if it is prefixed with some "by" {@link String}.
996 *
997 * @param author
998 * the author with a possible prefix
999 *
1000 * @return the author without prefixes
1001 */
1002 protected String fixAuthor(String author) {
1003 if (author != null) {
1004 for (String suffix : new String[] { " ", ":" }) {
1005 for (String byString : Instance.getConfig()
1006 .getString(Config.BYS).split(",")) {
1007 byString += suffix;
1008 if (author.toUpperCase().startsWith(byString.toUpperCase())) {
1009 author = author.substring(byString.length()).trim();
1010 }
1011 }
1012 }
1013
1014 // Special case (without suffix):
1015 if (author.startsWith("©")) {
1016 author = author.substring(1);
1017 }
1018 }
1019
1020 return author;
1021 }
1022
1023 /**
1024 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
1025 * and requotify them (i.e., separate them into QUOTE paragraphs and other
1026 * paragraphs (quotes or not)).
1027 *
1028 * @param para
1029 * the paragraph to requotify (not necessarily a quote)
1030 *
1031 * @return the correctly (or so we hope) quotified paragraphs
1032 */
1033 protected List<Paragraph> requotify(Paragraph para) {
1034 List<Paragraph> newParas = new ArrayList<Paragraph>();
1035
1036 if (para.getType() == ParagraphType.QUOTE
1037 && para.getContent().length() > 2) {
1038 String line = para.getContent();
1039 boolean singleQ = line.startsWith("" + openQuote);
1040 boolean doubleQ = line.startsWith("" + openDoubleQuote);
1041
1042 // Do not try when more than one quote at a time
1043 // (some stories are not easily readable if we do)
1044 if (singleQ
1045 && line.indexOf(closeQuote, 1) < line
1046 .lastIndexOf(closeQuote)) {
1047 newParas.add(para);
1048 return newParas;
1049 }
1050 if (doubleQ
1051 && line.indexOf(closeDoubleQuote, 1) < line
1052 .lastIndexOf(closeDoubleQuote)) {
1053 newParas.add(para);
1054 return newParas;
1055 }
1056 //
1057
1058 if (!singleQ && !doubleQ) {
1059 line = openDoubleQuote + line + closeDoubleQuote;
1060 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
1061 .getWords()));
1062 } else {
1063 char open = singleQ ? openQuote : openDoubleQuote;
1064 char close = singleQ ? closeQuote : closeDoubleQuote;
1065
1066 int posDot = -1;
1067 boolean inQuote = false;
1068 int i = 0;
1069 for (char car : line.toCharArray()) {
1070 if (car == open) {
1071 inQuote = true;
1072 } else if (car == close) {
1073 inQuote = false;
1074 } else if (car == '.' && !inQuote) {
1075 posDot = i;
1076 break;
1077 }
1078 i++;
1079 }
1080
1081 if (posDot >= 0) {
1082 String rest = line.substring(posDot + 1).trim();
1083 line = line.substring(0, posDot + 1).trim();
1084 long words = 1;
1085 for (char car : line.toCharArray()) {
1086 if (car == ' ') {
1087 words++;
1088 }
1089 }
1090 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
1091 if (!rest.isEmpty()) {
1092 newParas.addAll(requotify(processPara(rest)));
1093 }
1094 } else {
1095 newParas.add(para);
1096 }
1097 }
1098 } else {
1099 newParas.add(para);
1100 }
1101
1102 return newParas;
1103 }
1104
1105 /**
1106 * Process a {@link Paragraph} from a raw line of text.
1107 * <p>
1108 * Will also fix quotes and HTML encoding if needed.
1109 *
1110 * @param line
1111 * the raw line
1112 *
1113 * @return the processed {@link Paragraph}
1114 */
1115 protected Paragraph processPara(String line) {
1116 line = ifUnhtml(line).trim();
1117
1118 boolean space = true;
1119 boolean brk = true;
1120 boolean quote = false;
1121 boolean tentativeCloseQuote = false;
1122 char prev = '\0';
1123 int dashCount = 0;
1124 long words = 1;
1125
1126 StringBuilder builder = new StringBuilder();
1127 for (char car : line.toCharArray()) {
1128 if (car != '-') {
1129 if (dashCount > 0) {
1130 // dash, ndash and mdash: - – —
1131 // currently: always use mdash
1132 builder.append(dashCount == 1 ? '-' : '—');
1133 }
1134 dashCount = 0;
1135 }
1136
1137 if (tentativeCloseQuote) {
1138 tentativeCloseQuote = false;
1139 if (Character.isLetterOrDigit(car)) {
1140 builder.append("'");
1141 } else {
1142 // handle double-single quotes as double quotes
1143 if (prev == car) {
1144 builder.append(closeDoubleQuote);
1145 continue;
1146 } else {
1147 builder.append(closeQuote);
1148 }
1149 }
1150 }
1151
1152 switch (car) {
1153 case ' ': // note: unbreakable space
1154 case ' ':
1155 case '\t':
1156 case '\n': // just in case
1157 case '\r': // just in case
1158 if (builder.length() > 0
1159 && builder.charAt(builder.length() - 1) != ' ') {
1160 words++;
1161 }
1162 builder.append(' ');
1163 break;
1164
1165 case '\'':
1166 if (space || (brk && quote)) {
1167 quote = true;
1168 // handle double-single quotes as double quotes
1169 if (prev == car) {
1170 builder.deleteCharAt(builder.length() - 1);
1171 builder.append(openDoubleQuote);
1172 } else {
1173 builder.append(openQuote);
1174 }
1175 } else if (prev == ' ' || prev == car) {
1176 // handle double-single quotes as double quotes
1177 if (prev == car) {
1178 builder.deleteCharAt(builder.length() - 1);
1179 builder.append(openDoubleQuote);
1180 } else {
1181 builder.append(openQuote);
1182 }
1183 } else {
1184 // it is a quote ("I'm off") or a 'quote' ("This
1185 // 'good' restaurant"...)
1186 tentativeCloseQuote = true;
1187 }
1188 break;
1189
1190 case '"':
1191 if (space || (brk && quote)) {
1192 quote = true;
1193 builder.append(openDoubleQuote);
1194 } else if (prev == ' ') {
1195 builder.append(openDoubleQuote);
1196 } else {
1197 builder.append(closeDoubleQuote);
1198 }
1199 break;
1200
1201 case '-':
1202 if (space) {
1203 quote = true;
1204 } else {
1205 dashCount++;
1206 }
1207 space = false;
1208 break;
1209
1210 case '*':
1211 case '~':
1212 case '/':
1213 case '\\':
1214 case '<':
1215 case '>':
1216 case '=':
1217 case '+':
1218 case '_':
1219 case '–':
1220 case '—':
1221 space = false;
1222 builder.append(car);
1223 break;
1224
1225 case '‘':
1226 case '`':
1227 case '‹':
1228 case '﹁':
1229 case '〈':
1230 case '「':
1231 if (space || (brk && quote)) {
1232 quote = true;
1233 builder.append(openQuote);
1234 } else {
1235 // handle double-single quotes as double quotes
1236 if (prev == car) {
1237 builder.deleteCharAt(builder.length() - 1);
1238 builder.append(openDoubleQuote);
1239 } else {
1240 builder.append(openQuote);
1241 }
1242 }
1243 space = false;
1244 brk = false;
1245 break;
1246
1247 case '’':
1248 case '›':
1249 case '﹂':
1250 case '〉':
1251 case '」':
1252 space = false;
1253 brk = false;
1254 // handle double-single quotes as double quotes
1255 if (prev == car) {
1256 builder.deleteCharAt(builder.length() - 1);
1257 builder.append(closeDoubleQuote);
1258 } else {
1259 builder.append(closeQuote);
1260 }
1261 break;
1262
1263 case '«':
1264 case '“':
1265 case '﹃':
1266 case '《':
1267 case '『':
1268 if (space || (brk && quote)) {
1269 quote = true;
1270 builder.append(openDoubleQuote);
1271 } else {
1272 builder.append(openDoubleQuote);
1273 }
1274 space = false;
1275 brk = false;
1276 break;
1277
1278 case '»':
1279 case '”':
1280 case '﹄':
1281 case '》':
1282 case '』':
1283 space = false;
1284 brk = false;
1285 builder.append(closeDoubleQuote);
1286 break;
1287
1288 default:
1289 space = false;
1290 brk = false;
1291 builder.append(car);
1292 break;
1293 }
1294
1295 prev = car;
1296 }
1297
1298 if (tentativeCloseQuote) {
1299 tentativeCloseQuote = false;
1300 builder.append(closeQuote);
1301 }
1302
1303 line = builder.toString().trim();
1304
1305 ParagraphType type = ParagraphType.NORMAL;
1306 if (space) {
1307 type = ParagraphType.BLANK;
1308 } else if (brk) {
1309 type = ParagraphType.BREAK;
1310 } else if (quote) {
1311 type = ParagraphType.QUOTE;
1312 }
1313
1314 return new Paragraph(type, line, words);
1315 }
1316
1317 /**
1318 * Remove the HTML from the input <b>if</b> {@link BasicSupport#isHtml()} is
1319 * true.
1320 *
1321 * @param input
1322 * the input
1323 *
1324 * @return the no html version if needed
1325 */
1326 private String ifUnhtml(String input) {
1327 if (isHtml() && input != null) {
1328 return StringUtils.unhtml(input);
1329 }
1330
1331 return input;
1332 }
1333
1334 /**
1335 * Return a {@link BasicSupport} implementation supporting the given
1336 * resource if possible.
1337 *
1338 * @param url
1339 * the story resource
1340 *
1341 * @return an implementation that supports it, or NULL
1342 */
1343 public static BasicSupport getSupport(URL url) {
1344 if (url == null) {
1345 return null;
1346 }
1347
1348 // TEXT and INFO_TEXT always support files (not URLs though)
1349 for (SupportType type : SupportType.values()) {
1350 if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
1351 BasicSupport support = getSupport(type);
1352 if (support != null && support.supports(url)) {
1353 return support;
1354 }
1355 }
1356 }
1357
1358 for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
1359 SupportType.TEXT }) {
1360 BasicSupport support = getSupport(type);
1361 if (support != null && support.supports(url)) {
1362 return support;
1363 }
1364 }
1365
1366 return null;
1367 }
1368
1369 /**
1370 * Return a {@link BasicSupport} implementation supporting the given type.
1371 *
1372 * @param type
1373 * the type
1374 *
1375 * @return an implementation that supports it, or NULL
1376 */
1377 public static BasicSupport getSupport(SupportType type) {
1378 switch (type) {
1379 case EPUB:
1380 return new Epub().setType(type);
1381 case INFO_TEXT:
1382 return new InfoText().setType(type);
1383 case FIMFICTION:
1384 return new Fimfiction().setType(type);
1385 case FANFICTION:
1386 return new Fanfiction().setType(type);
1387 case TEXT:
1388 return new Text().setType(type);
1389 case MANGAFOX:
1390 return new MangaFox().setType(type);
1391 case E621:
1392 return new E621().setType(type);
1393 case YIFFSTAR:
1394 return new YiffStar().setType(type);
1395 case E_HENTAI:
1396 return new EHentai().setType(type);
1397 case CBZ:
1398 return new Cbz().setType(type);
1399 case HTML:
1400 return new Html().setType(type);
1401 }
1402
1403 return null;
1404 }
1405
1406 /**
1407 * Return the first line from the given input which correspond to the given
1408 * selectors.
1409 *
1410 * @param in
1411 * the input
1412 * @param needle
1413 * a string that must be found inside the target line (also
1414 * supports "^" at start to say "only if it starts with" the
1415 * needle)
1416 * @param relativeLine
1417 * the line to return based upon the target line position (-1 =
1418 * the line before, 0 = the target line...)
1419 *
1420 * @return the line
1421 */
1422 static String getLine(InputStream in, String needle, int relativeLine) {
1423 return getLine(in, needle, relativeLine, true);
1424 }
1425
1426 /**
1427 * Return a line from the given input which correspond to the given
1428 * selectors.
1429 *
1430 * @param in
1431 * the input
1432 * @param needle
1433 * a string that must be found inside the target line (also
1434 * supports "^" at start to say "only if it starts with" the
1435 * needle)
1436 * @param relativeLine
1437 * the line to return based upon the target line position (-1 =
1438 * the line before, 0 = the target line...)
1439 * @param first
1440 * takes the first result (as opposed to the last one, which will
1441 * also always spend the input)
1442 *
1443 * @return the line
1444 */
1445 static String getLine(InputStream in, String needle, int relativeLine,
1446 boolean first) {
1447 String rep = null;
1448
1449 try {
1450 in.reset();
1451 } catch (IOException e) {
1452 Instance.syserr(e);
1453 }
1454
1455 List<String> lines = new ArrayList<String>();
1456 @SuppressWarnings("resource")
1457 Scanner scan = new Scanner(in, "UTF-8");
1458 int index = -1;
1459 scan.useDelimiter("\\n");
1460 while (scan.hasNext()) {
1461 lines.add(scan.next());
1462
1463 if (index == -1) {
1464 if (needle.startsWith("^")) {
1465 if (lines.get(lines.size() - 1).startsWith(
1466 needle.substring(1))) {
1467 index = lines.size() - 1;
1468 }
1469
1470 } else {
1471 if (lines.get(lines.size() - 1).contains(needle)) {
1472 index = lines.size() - 1;
1473 }
1474 }
1475 }
1476
1477 if (index >= 0 && index + relativeLine < lines.size()) {
1478 rep = lines.get(index + relativeLine);
1479 if (first) {
1480 break;
1481 }
1482 }
1483 }
1484
1485 return rep;
1486 }
1487
1488 /**
1489 * Return the text between the key and the endKey (and optional subKey can
1490 * be passed, in this case we will look for the key first, then take the
1491 * text between the subKey and the endKey).
1492 * <p>
1493 * Will only match the first line with the given key if more than one are
1494 * possible. Which also means that if the subKey or endKey is not found on
1495 * that line, NULL will be returned.
1496 *
1497 * @param in
1498 * the input
1499 * @param key
1500 * the key to match (also supports "^" at start to say
1501 * "only if it starts with" the key)
1502 * @param subKey
1503 * the sub key or NULL if none
1504 * @param endKey
1505 * the end key or NULL for "up to the end"
1506 * @return the text or NULL if not found
1507 */
1508 static String getKeyLine(InputStream in, String key, String subKey,
1509 String endKey) {
1510 String result = null;
1511
1512 String line = getLine(in, key, 0);
1513 if (line != null && line.contains(key)) {
1514 line = line.substring(line.indexOf(key) + key.length());
1515 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1516 if (subKey != null) {
1517 line = line.substring(line.indexOf(subKey)
1518 + subKey.length());
1519 }
1520 if (endKey == null || line.contains(endKey)) {
1521 if (endKey != null) {
1522 line = line.substring(0, line.indexOf(endKey));
1523 result = line;
1524 }
1525 }
1526 }
1527 }
1528
1529 return result;
1530 }
1531 }