Wordcount (including UI), date of creation
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport.java
CommitLineData
08fe2e33
NR
1package be.nikiroo.fanfix.supported;
2
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import be.nikiroo.fanfix.Instance;
import be.nikiroo.fanfix.bundles.Config;
import be.nikiroo.fanfix.bundles.StringId;
import be.nikiroo.fanfix.data.Chapter;
import be.nikiroo.fanfix.data.MetaData;
import be.nikiroo.fanfix.data.Paragraph;
import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
import be.nikiroo.fanfix.data.Story;
import be.nikiroo.utils.IOUtils;
import be.nikiroo.utils.Progress;
import be.nikiroo.utils.StringUtils;
31
32/**
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
36 * <p>
37 * It will be used with 'resources' (usually web pages or files).
38 *
39 * @author niki
40 */
41public abstract class BasicSupport {
42 /**
43 * The supported input types for which we can get a {@link BasicSupport}
44 * object.
45 *
46 * @author niki
47 */
48 public enum SupportType {
49 /** EPUB files created with this program */
50 EPUB,
51 /** Pure text file with some rules */
52 TEXT,
53 /** TEXT but with associated .info file */
54 INFO_TEXT,
55 /** My Little Pony fanfictions */
56 FIMFICTION,
57 /** Fanfictions from a lot of different universes */
58 FANFICTION,
59 /** Website with lots of Mangas */
60 MANGAFOX,
61 /** Furry website with comics support */
62 E621,
a4143cd7
NR
63 /** Furry website with stories */
64 YIFFSTAR,
08fe2e33 65 /** CBZ files */
373da363
NR
66 CBZ,
67 /** HTML files */
68 HTML;
08fe2e33
NR
69
70 /**
71 * A description of this support type (more information than the
72 * {@link BasicSupport#getSourceName()}).
73 *
74 * @return the description
75 */
76 public String getDesc() {
77 String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
78 this.name());
79
80 if (desc == null) {
81 desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
82 }
83
84 return desc;
85 }
86
87 /**
88 * The name of this support type (a short version).
89 *
90 * @return the name
91 */
92 public String getSourceName() {
93 BasicSupport support = BasicSupport.getSupport(this);
94 if (support != null) {
95 return support.getSourceName();
96 }
97
98 return null;
99 }
100
101 @Override
102 public String toString() {
103 return super.toString().toLowerCase();
104 }
105
106 /**
107 * Call {@link SupportType#valueOf(String.toUpperCase())}.
108 *
109 * @param typeName
110 * the possible type name
111 *
112 * @return NULL or the type
113 */
114 public static SupportType valueOfUC(String typeName) {
115 return SupportType.valueOf(typeName == null ? null : typeName
116 .toUpperCase());
117 }
118
119 /**
120 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
121 * NULL for NULL instead of raising exception.
122 *
123 * @param typeName
124 * the possible type name
125 *
126 * @return NULL or the type
127 */
128 public static SupportType valueOfNullOkUC(String typeName) {
129 if (typeName == null) {
130 return null;
131 }
132
133 return SupportType.valueOfUC(typeName);
134 }
135
136 /**
137 * Call {@link SupportType#valueOf(String.toUpperCase())} but return
138 * NULL in case of error instead of raising an exception.
139 *
140 * @param typeName
141 * the possible type name
142 *
143 * @return NULL or the type
144 */
145 public static SupportType valueOfAllOkUC(String typeName) {
146 try {
147 return SupportType.valueOfUC(typeName);
148 } catch (Exception e) {
149 return null;
150 }
151 }
152 }
153
08fe2e33
NR
    // Main resource of the story; assigned by processMeta() and handed out
    // (after a reset) by getInput().
    private InputStream in;
    // Support type, assigned through setType().
    private SupportType type;
    private URL currentReferer; // with only one 'r', as in 'HTTP'...

    // quote chars (read once from the translation bundle when the support is
    // created)
    private char openQuote = Instance.getTrans().getChar(
            StringId.OPEN_SINGLE_QUOTE);
    private char closeQuote = Instance.getTrans().getChar(
            StringId.CLOSE_SINGLE_QUOTE);
    private char openDoubleQuote = Instance.getTrans().getChar(
            StringId.OPEN_DOUBLE_QUOTE);
    private char closeDoubleQuote = Instance.getTrans().getChar(
            StringId.CLOSE_DOUBLE_QUOTE);
167
    /**
     * The name of this support class.
     * <p>
     * This is the value returned by {@link SupportType#getSourceName()} for
     * the matching type.
     *
     * @return the name
     */
    protected abstract String getSourceName();
174
    /**
     * Check if the given resource is supported by this {@link BasicSupport}.
     * <p>
     * {@link BasicSupport#getSupport(URL)} uses this method to select an
     * implementation (it never calls it with a NULL {@link URL}).
     *
     * @param url
     *            the resource to check for
     *
     * @return TRUE if it is
     */
    protected abstract boolean supports(URL url);
184
    /**
     * Return TRUE if the support will return HTML encoded content values for
     * the chapters content.
     * <p>
     * When TRUE, the HTML is removed from the text before it is turned into
     * {@link Paragraph}s.
     *
     * @return TRUE for HTML
     */
    protected abstract boolean isHtml();
192
    /**
     * Return the {@link MetaData} of the story.
     * <p>
     * The result is stored as-is in the {@link Story} built by
     * {@link BasicSupport#processMeta(URL)}; a missing creation date or cover
     * is filled in there.
     *
     * @param source
     *            the source of the story
     * @param in
     *            the input (the main resource)
     *
     * @return the associated {@link MetaData}
     *
     * @throws IOException
     *             in case of I/O error
     */
    protected abstract MetaData getMeta(URL source, InputStream in)
            throws IOException;
195
    /**
     * Return the story description.
     * <p>
     * The text is turned into the "resume" chapter (chapter number 0) of the
     * story when the description is requested.
     *
     * @param source
     *            the source of the story
     * @param in
     *            the input (the main resource)
     *
     * @return the description
     *
     * @throws IOException
     *             in case of I/O error
     */
    protected abstract String getDesc(URL source, InputStream in)
            throws IOException;
211
08fe2e33
NR
    /**
     * Return the list of chapters (name and resource).
     * <p>
     * Each entry maps the chapter name to the {@link URL} from which its
     * content is opened; a NULL list is accepted and means "no chapters".
     *
     * @param source
     *            the source of the story
     * @param in
     *            the input (the main resource)
     *
     * @return the chapters
     *
     * @throws IOException
     *             in case of I/O error
     */
    protected abstract List<Entry<String, URL>> getChapters(URL source,
            InputStream in) throws IOException;
227
    /**
     * Return the content of the chapter (possibly HTML encoded, if
     * {@link BasicSupport#isHtml()} is TRUE).
     *
     * @param source
     *            the source of the story
     * @param in
     *            the input for this chapter ({@link BasicSupport#process}
     *            passes the stream opened on the chapter's own {@link URL},
     *            not the main resource)
     * @param number
     *            the chapter number (the first chapter is number 1)
     *
     * @return the content
     *
     * @throws IOException
     *             in case of I/O error
     */
    protected abstract String getChapterContent(URL source, InputStream in,
            int number) throws IOException;
246
6e06d2cc
NR
247 /**
248 * Log into the support (can be a no-op depending upon the support).
249 *
250 * @throws IOException
251 * in case of I/O error
252 */
253 public void login() throws IOException {
254
255 }
256
08fe2e33
NR
257 /**
258 * Return the list of cookies (values included) that must be used to
259 * correctly fetch the resources.
260 * <p>
261 * You are expected to call the super method implementation if you override
262 * it.
263 *
264 * @return the cookies
6e06d2cc
NR
265 *
266 * @throws IOException
267 * in case of I/O error
08fe2e33 268 */
6e06d2cc 269 public Map<String, String> getCookies() throws IOException {
08fe2e33
NR
270 return new HashMap<String, String>();
271 }
272
a4143cd7
NR
273 /**
274 * Return the canonical form of the main {@link URL}.
275 *
276 * @param source
277 * the source {@link URL}
278 *
279 * @return the canonical form of this {@link URL}
280 *
281 * @throws IOException
282 * in case of I/O error
283 */
284 public URL getCanonicalUrl(URL source) throws IOException {
285 return source;
286 }
287
08fe2e33
NR
288 /**
289 * Process the given story resource into a partially filled {@link Story}
290 * object containing the name and metadata, except for the description.
291 *
292 * @param url
293 * the story resource
294 *
295 * @return the {@link Story}
296 *
297 * @throws IOException
298 * in case of I/O error
299 */
300 public Story processMeta(URL url) throws IOException {
301 return processMeta(url, true, false);
302 }
303
304 /**
305 * Process the given story resource into a partially filled {@link Story}
306 * object containing the name and metadata.
307 *
308 * @param url
309 * the story resource
310 *
311 * @param close
312 * close "this" and "in" when done
313 *
314 * @return the {@link Story}
315 *
316 * @throws IOException
317 * in case of I/O error
318 */
319 protected Story processMeta(URL url, boolean close, boolean getDesc)
320 throws IOException {
6e06d2cc
NR
321 login();
322
a4143cd7
NR
323 url = getCanonicalUrl(url);
324
325 setCurrentReferer(url);
326
373da363 327 in = openInput(url);
08fe2e33
NR
328 if (in == null) {
329 return null;
330 }
331
332 try {
68686a37 333 preprocess(url, getInput());
08fe2e33
NR
334
335 Story story = new Story();
68686a37 336 MetaData meta = getMeta(url, getInput());
793f1071
NR
337 if (meta.getCreationDate() == null
338 || meta.getCreationDate().isEmpty()) {
339 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
340 }
68686a37
NR
341 story.setMeta(meta);
342
343 if (meta != null && meta.getCover() == null) {
344 meta.setCover(getDefaultCover(meta.getSubject()));
345 }
08fe2e33
NR
346
347 if (getDesc) {
348 String descChapterName = Instance.getTrans().getString(
349 StringId.DESCRIPTION);
350 story.getMeta().setResume(
351 makeChapter(url, 0, descChapterName,
352 getDesc(url, getInput())));
353 }
354
355 return story;
356 } finally {
357 if (close) {
358 try {
359 close();
360 } catch (IOException e) {
361 Instance.syserr(e);
362 }
363
364 if (in != null) {
365 in.close();
366 }
367 }
a4143cd7
NR
368
369 setCurrentReferer(null);
08fe2e33
NR
370 }
371 }
372
373 /**
374 * Process the given story resource into a fully filled {@link Story}
375 * object.
376 *
377 * @param url
378 * the story resource
92fb0719
NR
379 * @param pg
380 * the optional progress reporter
08fe2e33
NR
381 *
382 * @return the {@link Story}
383 *
384 * @throws IOException
385 * in case of I/O error
386 */
92fb0719
NR
387 public Story process(URL url, Progress pg) throws IOException {
388 if (pg == null) {
389 pg = new Progress();
390 } else {
391 pg.setMinMax(0, 100);
392 }
393
a4143cd7 394 url = getCanonicalUrl(url);
92fb0719 395 pg.setProgress(1);
08fe2e33
NR
396 try {
397 Story story = processMeta(url, false, true);
92fb0719 398 pg.setProgress(10);
08fe2e33 399 if (story == null) {
92fb0719 400 pg.setProgress(100);
08fe2e33
NR
401 return null;
402 }
403
a4143cd7
NR
404 setCurrentReferer(url);
405
08fe2e33
NR
406 story.setChapters(new ArrayList<Chapter>());
407
08fe2e33 408 List<Entry<String, URL>> chapters = getChapters(url, getInput());
92fb0719
NR
409 pg.setProgress(20);
410
08fe2e33
NR
411 int i = 1;
412 if (chapters != null) {
92fb0719
NR
413 Progress pgChaps = new Progress(0, chapters.size());
414 pg.addProgress(pgChaps, 80);
415
793f1071 416 long words = 0;
08fe2e33
NR
417 for (Entry<String, URL> chap : chapters) {
418 setCurrentReferer(chap.getValue());
419 InputStream chapIn = Instance.getCache().open(
420 chap.getValue(), this, true);
421 try {
793f1071
NR
422 Chapter cc = makeChapter(url, i, chap.getKey(),
423 getChapterContent(url, chapIn, i));
424 words += cc.getWords();
425 story.getChapters().add(cc);
426 if (story.getMeta() != null) {
427 story.getMeta().setWords(words);
428 }
08fe2e33
NR
429 } finally {
430 chapIn.close();
431 }
a6395bef 432
3b2b638f 433 pgChaps.setProgress(i++);
08fe2e33 434 }
92fb0719
NR
435 } else {
436 pg.setProgress(100);
08fe2e33
NR
437 }
438
439 return story;
440
441 } finally {
442 try {
443 close();
444 } catch (IOException e) {
445 Instance.syserr(e);
446 }
447
448 if (in != null) {
449 in.close();
450 }
451
a4143cd7 452 setCurrentReferer(null);
08fe2e33
NR
453 }
454 }
455
456 /**
a4143cd7 457 * The support type.
08fe2e33
NR
458 *
459 * @return the type
460 */
461 public SupportType getType() {
462 return type;
463 }
464
465 /**
466 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
467 * the current {@link URL} we work on.
468 *
469 * @return the referer
470 */
471 public URL getCurrentReferer() {
472 return currentReferer;
473 }
474
475 /**
476 * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e.,
477 * the current {@link URL} we work on.
478 *
479 * @param currentReferer
480 * the new referer
481 */
482 protected void setCurrentReferer(URL currentReferer) {
483 this.currentReferer = currentReferer;
484 }
485
486 /**
487 * The support type.
488 *
489 * @param type
490 * the new type
491 *
492 * @return this
493 */
494 protected BasicSupport setType(SupportType type) {
495 this.type = type;
496 return this;
497 }
498
499 /**
68686a37 500 * Prepare the support if needed before processing.
08fe2e33
NR
501 *
502 * @param source
503 * the source of the story
504 * @param in
505 * the input (the main resource)
506 *
08fe2e33
NR
507 * @throws IOException
508 * on I/O error
509 */
68686a37 510 protected void preprocess(URL source, InputStream in) throws IOException {
08fe2e33
NR
511 }
512
513 /**
514 * Now that we have processed the {@link Story}, close the resources if any.
515 *
516 * @throws IOException
517 * on I/O error
518 */
519 protected void close() throws IOException {
520 }
521
522 /**
523 * Create a {@link Chapter} object from the given information, formatting
524 * the content as it should be.
525 *
526 * @param number
527 * the chapter number
528 * @param name
529 * the chapter name
530 * @param content
531 * the chapter content
532 *
533 * @return the {@link Chapter}
534 *
535 * @throws IOException
536 * in case of I/O error
537 */
538 protected Chapter makeChapter(URL source, int number, String name,
539 String content) throws IOException {
08fe2e33
NR
540 // Chapter name: process it correctly, then remove the possible
541 // redundant "Chapter x: " in front of it
542 String chapterName = processPara(name).getContent().trim();
543 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
544 .split(",")) {
545 String chapterWord = Instance.getConfig().getStringX(
546 Config.CHAPTER, lang);
547 if (chapterName.startsWith(chapterWord)) {
548 chapterName = chapterName.substring(chapterWord.length())
549 .trim();
550 break;
551 }
552 }
553
554 if (chapterName.startsWith(Integer.toString(number))) {
555 chapterName = chapterName.substring(
556 Integer.toString(number).length()).trim();
557 }
558
559 if (chapterName.startsWith(":")) {
560 chapterName = chapterName.substring(1).trim();
561 }
562 //
563
564 Chapter chap = new Chapter(number, chapterName);
565
68e370a4 566 if (content != null) {
793f1071
NR
567 List<Paragraph> paras = makeParagraphs(source, content);
568 long words = 0;
569 for (Paragraph para : paras) {
570 words += para.getWords();
571 }
572 chap.setParagraphs(paras);
573 chap.setWords(words);
08fe2e33
NR
574 }
575
68e370a4
NR
576 return chap;
577
578 }
579
580 /**
581 * Convert the given content into {@link Paragraph}s.
582 *
583 * @param source
584 * the source URL of the story
585 * @param content
586 * the textual content
587 *
588 * @return the {@link Paragraph}s
589 *
590 * @throws IOException
591 * in case of I/O error
592 */
593 protected List<Paragraph> makeParagraphs(URL source, String content)
594 throws IOException {
08fe2e33
NR
595 if (isHtml()) {
596 // Special <HR> processing:
597 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
598 "\n* * *\n");
599 }
600
68e370a4 601 List<Paragraph> paras = new ArrayList<Paragraph>();
9252c65e 602 InputStream in = new ByteArrayInputStream(content.getBytes("UTF-8"));
08fe2e33 603 try {
68e370a4
NR
604 BufferedReader buff = new BufferedReader(new InputStreamReader(in,
605 "UTF-8"));
606
607 for (String encodedLine = buff.readLine(); encodedLine != null; encodedLine = buff
608 .readLine()) {
609 String lines[];
610 if (isHtml()) {
611 lines = encodedLine.split("(<p>|</p>|<br>|<br/>|\\n)");
612 } else {
613 lines = new String[] { encodedLine };
08fe2e33
NR
614 }
615
68e370a4
NR
616 for (String aline : lines) {
617 String line = aline.trim();
618
619 URL image = null;
620 if (line.startsWith("[") && line.endsWith("]")) {
621 image = getImageUrl(this, source,
622 line.substring(1, line.length() - 1).trim());
623 }
624
625 if (image != null) {
626 paras.add(new Paragraph(image));
627 } else {
628 paras.add(processPara(line));
629 }
08fe2e33
NR
630 }
631 }
68e370a4
NR
632 } finally {
633 in.close();
634 }
08fe2e33 635
68e370a4
NR
636 // Check quotes for "bad" format
637 List<Paragraph> newParas = new ArrayList<Paragraph>();
638 for (Paragraph para : paras) {
639 newParas.addAll(requotify(para));
640 }
641 paras = newParas;
08fe2e33 642
68e370a4
NR
643 // Remove double blanks/brks
644 fixBlanksBreaks(paras);
08fe2e33 645
68e370a4
NR
646 return paras;
647 }
08fe2e33 648
68e370a4
NR
649 /**
650 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
651 * those {@link Paragraph}s.
652 * <p>
653 * The resulting list will not contain a starting or trailing blank/break
654 * nor 2 blanks or breaks following each other.
655 *
656 * @param paras
657 * the list of {@link Paragraph}s to fix
658 */
659 protected void fixBlanksBreaks(List<Paragraph> paras) {
660 boolean space = false;
661 boolean brk = true;
662 for (int i = 0; i < paras.size(); i++) {
663 Paragraph para = paras.get(i);
664 boolean thisSpace = para.getType() == ParagraphType.BLANK;
665 boolean thisBrk = para.getType() == ParagraphType.BREAK;
666
667 if (i > 0 && space && thisBrk) {
668 paras.remove(i - 1);
669 i--;
670 } else if ((space || brk) && (thisSpace || thisBrk)) {
671 paras.remove(i);
672 i--;
08fe2e33
NR
673 }
674
68e370a4
NR
675 space = thisSpace;
676 brk = thisBrk;
677 }
08fe2e33 678
68e370a4
NR
679 // Remove blank/brk at start
680 if (paras.size() > 0
681 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
682 0).getType() == ParagraphType.BREAK)) {
683 paras.remove(0);
684 }
685
686 // Remove blank/brk at end
687 int last = paras.size() - 1;
688 if (paras.size() > 0
689 && (paras.get(last).getType() == ParagraphType.BLANK || paras
690 .get(last).getType() == ParagraphType.BREAK)) {
691 paras.remove(last);
08fe2e33
NR
692 }
693 }
694
68e370a4
NR
695 /**
696 * Get the default cover related to this subject (see <tt>.info</tt> files).
697 *
698 * @param subject
699 * the subject
700 *
701 * @return the cover if any, or NULL
702 */
68686a37
NR
703 static BufferedImage getDefaultCover(String subject) {
704 if (subject != null && !subject.isEmpty()
705 && Instance.getCoverDir() != null) {
706 try {
707 File fileCover = new File(Instance.getCoverDir(), subject);
333f0e7b 708 return getImage(null, fileCover.toURI().toURL(), subject);
68686a37
NR
709 } catch (MalformedURLException e) {
710 }
711 }
712
713 return null;
714 }
715
08fe2e33
NR
716 /**
717 * Return the list of supported image extensions.
718 *
a4143cd7
NR
719 * @param emptyAllowed
720 * TRUE to allow an empty extension on first place, which can be
721 * used when you may already have an extension in your input but
722 * are not sure about it
723 *
08fe2e33
NR
724 * @return the extensions
725 */
68686a37 726 static String[] getImageExt(boolean emptyAllowed) {
08fe2e33
NR
727 if (emptyAllowed) {
728 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
729 } else {
730 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
731 }
732 }
733
a4143cd7
NR
734 /**
735 * Check if the given resource can be a local image or a remote image, then
736 * refresh the cache with it if it is.
737 *
738 * @param source
739 * the story source
740 * @param line
741 * the resource to check
742 *
743 * @return the image if found, or NULL
744 *
745 */
333f0e7b
NR
746 static BufferedImage getImage(BasicSupport support, URL source, String line) {
747 URL url = getImageUrl(support, source, line);
68686a37
NR
748 if (url != null) {
749 InputStream in = null;
750 try {
751 in = Instance.getCache().open(url, getSupport(url), true);
595dfa7a 752 return IOUtils.toImage(in);
68686a37
NR
753 } catch (IOException e) {
754 } finally {
755 if (in != null) {
756 try {
757 in.close();
758 } catch (IOException e) {
759 }
760 }
761 }
762 }
763
764 return null;
765 }
766
08fe2e33
NR
767 /**
768 * Check if the given resource can be a local image or a remote image, then
769 * refresh the cache with it if it is.
770 *
771 * @param source
772 * the story source
773 * @param line
774 * the resource to check
775 *
776 * @return the image URL if found, or NULL
777 *
778 */
333f0e7b 779 static URL getImageUrl(BasicSupport support, URL source, String line) {
08fe2e33
NR
780 URL url = null;
781
68686a37
NR
782 if (line != null) {
783 // try for files
784 String path = null;
785 if (source != null) {
786 path = new File(source.getFile()).getParent();
787 try {
333f0e7b
NR
788 String basePath = new File(new File(path), line.trim())
789 .getAbsolutePath();
68686a37 790 for (String ext : getImageExt(true)) {
333f0e7b
NR
791 if (new File(basePath + ext).exists()) {
792 url = new File(basePath + ext).toURI().toURL();
68686a37 793 }
08fe2e33 794 }
68686a37
NR
795 } catch (Exception e) {
796 // Nothing to do here
08fe2e33 797 }
68686a37 798 }
08fe2e33 799
68686a37
NR
800 if (url == null) {
801 // try for URLs
802 try {
08fe2e33 803 for (String ext : getImageExt(true)) {
68686a37 804 if (Instance.getCache().check(new URL(line + ext))) {
08fe2e33 805 url = new URL(line + ext);
333f0e7b 806 break;
08fe2e33
NR
807 }
808 }
68686a37
NR
809
810 // try out of cache
811 if (url == null) {
812 for (String ext : getImageExt(true)) {
813 try {
814 url = new URL(line + ext);
333f0e7b 815 Instance.getCache().refresh(url, support, true);
68686a37
NR
816 break;
817 } catch (IOException e) {
818 // no image with this ext
819 url = null;
820 }
821 }
822 }
823 } catch (MalformedURLException e) {
824 // Not an url
08fe2e33 825 }
08fe2e33 826 }
08fe2e33 827
68686a37
NR
828 // refresh the cached file
829 if (url != null) {
830 try {
333f0e7b 831 Instance.getCache().refresh(url, support, true);
68686a37
NR
832 } catch (IOException e) {
833 // woops, broken image
834 url = null;
835 }
08fe2e33
NR
836 }
837 }
838
839 return url;
840 }
841
373da363
NR
842 /**
843 * Open the input file that will be used through the support.
844 *
845 * @param source
846 * the source {@link URL}
847 *
848 * @return the {@link InputStream}
849 *
850 * @throws IOException
851 * in case of I/O error
852 */
853 protected InputStream openInput(URL source) throws IOException {
854 return Instance.getCache().open(source, this, false);
855 }
856
a4143cd7
NR
857 /**
858 * Reset the given {@link InputStream} and return it.
859 *
860 * @param in
861 * the {@link InputStream} to reset
862 *
863 * @return the same {@link InputStream} after reset
864 */
68686a37
NR
865 protected InputStream reset(InputStream in) {
866 try {
867 in.reset();
868 } catch (IOException e) {
869 }
870 return in;
871 }
872
08fe2e33
NR
873 /**
874 * Reset then return {@link BasicSupport#in}.
875 *
876 * @return {@link BasicSupport#in}
08fe2e33 877 */
68686a37
NR
878 protected InputStream getInput() {
879 return reset(in);
08fe2e33
NR
880 }
881
882 /**
883 * Fix the author name if it is prefixed with some "by" {@link String}.
884 *
885 * @param author
886 * the author with a possible prefix
887 *
888 * @return the author without prefixes
889 */
68686a37 890 protected String fixAuthor(String author) {
08fe2e33
NR
891 if (author != null) {
892 for (String suffix : new String[] { " ", ":" }) {
893 for (String byString : Instance.getConfig()
894 .getString(Config.BYS).split(",")) {
895 byString += suffix;
896 if (author.toUpperCase().startsWith(byString.toUpperCase())) {
897 author = author.substring(byString.length()).trim();
898 }
899 }
900 }
901
902 // Special case (without suffix):
903 if (author.startsWith("©")) {
904 author = author.substring(1);
905 }
906 }
907
908 return author;
909 }
910
911 /**
912 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
913 * and requotify them (i.e., separate them into QUOTE paragraphs and other
914 * paragraphs (quotes or not)).
915 *
916 * @param para
a4143cd7 917 * the paragraph to requotify (not necessarily a quote)
08fe2e33
NR
918 *
919 * @return the correctly (or so we hope) quotified paragraphs
920 */
68e370a4 921 protected List<Paragraph> requotify(Paragraph para) {
08fe2e33
NR
922 List<Paragraph> newParas = new ArrayList<Paragraph>();
923
68686a37
NR
924 if (para.getType() == ParagraphType.QUOTE
925 && para.getContent().length() > 2) {
08fe2e33
NR
926 String line = para.getContent();
927 boolean singleQ = line.startsWith("" + openQuote);
928 boolean doubleQ = line.startsWith("" + openDoubleQuote);
929
b4dc6ab5
NR
930 // Do not try when more than one quote at a time
931 // (some stories are not easily readable if we do)
932 if (singleQ
933 && line.indexOf(closeQuote, 1) < line
934 .lastIndexOf(closeQuote)) {
935 newParas.add(para);
936 return newParas;
937 }
938 if (doubleQ
939 && line.indexOf(closeDoubleQuote, 1) < line
940 .lastIndexOf(closeDoubleQuote)) {
941 newParas.add(para);
942 return newParas;
943 }
944 //
945
08fe2e33
NR
946 if (!singleQ && !doubleQ) {
947 line = openDoubleQuote + line + closeDoubleQuote;
793f1071
NR
948 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
949 .getWords()));
08fe2e33 950 } else {
a6395bef 951 char open = singleQ ? openQuote : openDoubleQuote;
08fe2e33 952 char close = singleQ ? closeQuote : closeDoubleQuote;
a6395bef
NR
953
954 int posDot = -1;
955 boolean inQuote = false;
956 int i = 0;
957 for (char car : line.toCharArray()) {
958 if (car == open) {
959 inQuote = true;
960 } else if (car == close) {
961 inQuote = false;
962 } else if (car == '.' && !inQuote) {
963 posDot = i;
964 break;
965 }
966 i++;
08fe2e33
NR
967 }
968
969 if (posDot >= 0) {
970 String rest = line.substring(posDot + 1).trim();
971 line = line.substring(0, posDot + 1).trim();
793f1071
NR
972 long words = 1;
973 for (char car : line.toCharArray()) {
974 if (car == ' ') {
975 words++;
976 }
977 }
978 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
68686a37
NR
979 if (!rest.isEmpty()) {
980 newParas.addAll(requotify(processPara(rest)));
981 }
08fe2e33
NR
982 } else {
983 newParas.add(para);
984 }
985 }
986 } else {
987 newParas.add(para);
988 }
989
990 return newParas;
991 }
992
993 /**
994 * Process a {@link Paragraph} from a raw line of text.
995 * <p>
996 * Will also fix quotes and HTML encoding if needed.
997 *
998 * @param line
999 * the raw line
1000 *
1001 * @return the processed {@link Paragraph}
1002 */
22848428 1003 protected Paragraph processPara(String line) {
08fe2e33
NR
1004 line = ifUnhtml(line).trim();
1005
1006 boolean space = true;
1007 boolean brk = true;
1008 boolean quote = false;
1009 boolean tentativeCloseQuote = false;
1010 char prev = '\0';
1011 int dashCount = 0;
793f1071 1012 long words = 1;
08fe2e33
NR
1013
1014 StringBuilder builder = new StringBuilder();
1015 for (char car : line.toCharArray()) {
1016 if (car != '-') {
1017 if (dashCount > 0) {
1018 // dash, ndash and mdash: - – —
1019 // currently: always use mdash
1020 builder.append(dashCount == 1 ? '-' : '—');
1021 }
1022 dashCount = 0;
1023 }
1024
1025 if (tentativeCloseQuote) {
1026 tentativeCloseQuote = false;
22848428 1027 if (Character.isLetterOrDigit(car)) {
08fe2e33
NR
1028 builder.append("'");
1029 } else {
22848428
NR
1030 // handle double-single quotes as double quotes
1031 if (prev == car) {
1032 builder.append(closeDoubleQuote);
1033 continue;
1034 } else {
1035 builder.append(closeQuote);
1036 }
08fe2e33
NR
1037 }
1038 }
1039
1040 switch (car) {
1041 case ' ': // note: unbreakable space
1042 case ' ':
1043 case '\t':
1044 case '\n': // just in case
1045 case '\r': // just in case
793f1071
NR
1046 if (builder.length() > 0
1047 && builder.charAt(builder.length() - 1) != ' ') {
1048 words++;
1049 }
08fe2e33
NR
1050 builder.append(' ');
1051 break;
1052
1053 case '\'':
1054 if (space || (brk && quote)) {
1055 quote = true;
22848428
NR
1056 // handle double-single quotes as double quotes
1057 if (prev == car) {
1058 builder.deleteCharAt(builder.length() - 1);
1059 builder.append(openDoubleQuote);
1060 } else {
1061 builder.append(openQuote);
1062 }
1063 } else if (prev == ' ' || prev == car) {
1064 // handle double-single quotes as double quotes
1065 if (prev == car) {
1066 builder.deleteCharAt(builder.length() - 1);
1067 builder.append(openDoubleQuote);
1068 } else {
1069 builder.append(openQuote);
1070 }
08fe2e33
NR
1071 } else {
1072 // it is a quote ("I'm off") or a 'quote' ("This
1073 // 'good' restaurant"...)
1074 tentativeCloseQuote = true;
1075 }
1076 break;
1077
1078 case '"':
1079 if (space || (brk && quote)) {
1080 quote = true;
1081 builder.append(openDoubleQuote);
1082 } else if (prev == ' ') {
1083 builder.append(openDoubleQuote);
1084 } else {
1085 builder.append(closeDoubleQuote);
1086 }
1087 break;
1088
1089 case '-':
1090 if (space) {
1091 quote = true;
1092 } else {
1093 dashCount++;
1094 }
1095 space = false;
1096 break;
1097
1098 case '*':
1099 case '~':
1100 case '/':
1101 case '\\':
1102 case '<':
1103 case '>':
1104 case '=':
1105 case '+':
1106 case '_':
1107 case '–':
1108 case '—':
1109 space = false;
1110 builder.append(car);
1111 break;
1112
1113 case '‘':
1114 case '`':
1115 case '‹':
1116 case '﹁':
1117 case '〈':
1118 case '「':
1119 if (space || (brk && quote)) {
1120 quote = true;
1121 builder.append(openQuote);
1122 } else {
22848428
NR
1123 // handle double-single quotes as double quotes
1124 if (prev == car) {
1125 builder.deleteCharAt(builder.length() - 1);
1126 builder.append(openDoubleQuote);
1127 } else {
1128 builder.append(openQuote);
1129 }
08fe2e33
NR
1130 }
1131 space = false;
1132 brk = false;
1133 break;
1134
1135 case '’':
1136 case '›':
1137 case '﹂':
1138 case '〉':
1139 case '」':
1140 space = false;
1141 brk = false;
22848428
NR
1142 // handle double-single quotes as double quotes
1143 if (prev == car) {
1144 builder.deleteCharAt(builder.length() - 1);
1145 builder.append(closeDoubleQuote);
1146 } else {
1147 builder.append(closeQuote);
1148 }
08fe2e33
NR
1149 break;
1150
1151 case '«':
1152 case '“':
1153 case '﹃':
1154 case '《':
1155 case '『':
1156 if (space || (brk && quote)) {
1157 quote = true;
1158 builder.append(openDoubleQuote);
1159 } else {
1160 builder.append(openDoubleQuote);
1161 }
1162 space = false;
1163 brk = false;
1164 break;
1165
1166 case '»':
1167 case '”':
1168 case '﹄':
1169 case '》':
1170 case '』':
1171 space = false;
1172 brk = false;
1173 builder.append(closeDoubleQuote);
1174 break;
1175
1176 default:
1177 space = false;
1178 brk = false;
1179 builder.append(car);
1180 break;
1181 }
1182
1183 prev = car;
1184 }
1185
1186 if (tentativeCloseQuote) {
1187 tentativeCloseQuote = false;
1188 builder.append(closeQuote);
1189 }
1190
1191 line = builder.toString().trim();
1192
1193 ParagraphType type = ParagraphType.NORMAL;
1194 if (space) {
1195 type = ParagraphType.BLANK;
1196 } else if (brk) {
1197 type = ParagraphType.BREAK;
1198 } else if (quote) {
1199 type = ParagraphType.QUOTE;
1200 }
1201
793f1071 1202 return new Paragraph(type, line, words);
08fe2e33
NR
1203 }
1204
1205 /**
a4143cd7 1206 * Remove the HTML from the input <b>if</b> {@link BasicSupport#isHtml()} is
08fe2e33
NR
1207 * true.
1208 *
1209 * @param input
1210 * the input
1211 *
1212 * @return the no html version if needed
1213 */
1214 private String ifUnhtml(String input) {
1215 if (isHtml() && input != null) {
1216 return StringUtils.unhtml(input);
1217 }
1218
1219 return input;
1220 }
1221
1222 /**
1223 * Return a {@link BasicSupport} implementation supporting the given
1224 * resource if possible.
1225 *
1226 * @param url
1227 * the story resource
1228 *
1229 * @return an implementation that supports it, or NULL
1230 */
1231 public static BasicSupport getSupport(URL url) {
1232 if (url == null) {
1233 return null;
1234 }
1235
1236 // TEXT and INFO_TEXT always support files (not URLs though)
1237 for (SupportType type : SupportType.values()) {
1238 if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
1239 BasicSupport support = getSupport(type);
1240 if (support != null && support.supports(url)) {
1241 return support;
1242 }
1243 }
1244 }
1245
373da363
NR
1246 for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
1247 SupportType.TEXT }) {
08fe2e33
NR
1248 BasicSupport support = getSupport(type);
1249 if (support != null && support.supports(url)) {
1250 return support;
1251 }
1252 }
1253
1254 return null;
1255 }
1256
1257 /**
1258 * Return a {@link BasicSupport} implementation supporting the given type.
1259 *
1260 * @param type
1261 * the type
1262 *
1263 * @return an implementation that supports it, or NULL
1264 */
1265 public static BasicSupport getSupport(SupportType type) {
1266 switch (type) {
1267 case EPUB:
1268 return new Epub().setType(type);
1269 case INFO_TEXT:
1270 return new InfoText().setType(type);
1271 case FIMFICTION:
1272 return new Fimfiction().setType(type);
1273 case FANFICTION:
1274 return new Fanfiction().setType(type);
1275 case TEXT:
1276 return new Text().setType(type);
1277 case MANGAFOX:
1278 return new MangaFox().setType(type);
1279 case E621:
1280 return new E621().setType(type);
a4143cd7
NR
1281 case YIFFSTAR:
1282 return new YiffStar().setType(type);
08fe2e33
NR
1283 case CBZ:
1284 return new Cbz().setType(type);
373da363
NR
1285 case HTML:
1286 return new Html().setType(type);
08fe2e33
NR
1287 }
1288
1289 return null;
1290 }
68686a37
NR
1291
1292 /**
1293 * Return the first line from the given input which correspond to the given
1294 * selectors.
1295 *
1296 * @param in
1297 * the input
1298 * @param needle
1299 * a string that must be found inside the target line (also
1300 * supports "^" at start to say "only if it starts with" the
1301 * needle)
1302 * @param relativeLine
1303 * the line to return based upon the target line position (-1 =
1304 * the line before, 0 = the target line...)
1305 *
1306 * @return the line
1307 */
1308 static String getLine(InputStream in, String needle, int relativeLine) {
1309 return getLine(in, needle, relativeLine, true);
1310 }
1311
1312 /**
1313 * Return a line from the given input which correspond to the given
1314 * selectors.
1315 *
1316 * @param in
1317 * the input
1318 * @param needle
1319 * a string that must be found inside the target line (also
1320 * supports "^" at start to say "only if it starts with" the
1321 * needle)
1322 * @param relativeLine
1323 * the line to return based upon the target line position (-1 =
1324 * the line before, 0 = the target line...)
1325 * @param first
1326 * takes the first result (as opposed to the last one, which will
1327 * also always spend the input)
1328 *
1329 * @return the line
1330 */
1331 static String getLine(InputStream in, String needle, int relativeLine,
1332 boolean first) {
1333 String rep = null;
1334
1335 try {
1336 in.reset();
1337 } catch (IOException e) {
1338 Instance.syserr(e);
1339 }
1340
1341 List<String> lines = new ArrayList<String>();
1342 @SuppressWarnings("resource")
1343 Scanner scan = new Scanner(in, "UTF-8");
1344 int index = -1;
1345 scan.useDelimiter("\\n");
1346 while (scan.hasNext()) {
1347 lines.add(scan.next());
1348
1349 if (index == -1) {
1350 if (needle.startsWith("^")) {
1351 if (lines.get(lines.size() - 1).startsWith(
1352 needle.substring(1))) {
1353 index = lines.size() - 1;
1354 }
1355
1356 } else {
1357 if (lines.get(lines.size() - 1).contains(needle)) {
1358 index = lines.size() - 1;
1359 }
1360 }
1361 }
1362
1363 if (index >= 0 && index + relativeLine < lines.size()) {
1364 rep = lines.get(index + relativeLine);
1365 if (first) {
1366 break;
1367 }
1368 }
1369 }
1370
1371 return rep;
1372 }
08fe2e33 1373}