d1dbc00b4595d3517f20f4cd0a50dc449b4cfc36
[nikiroo-utils.git] / src / be / nikiroo / fanfix / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and has static methods that you can use to
34 * get access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
42 private InputStream in;
43
44 // quote chars
45 private char openQuote = Instance.getTrans().getCharacter(
46 StringId.OPEN_SINGLE_QUOTE);
47 private char closeQuote = Instance.getTrans().getCharacter(
48 StringId.CLOSE_SINGLE_QUOTE);
49 private char openDoubleQuote = Instance.getTrans().getCharacter(
50 StringId.OPEN_DOUBLE_QUOTE);
51 private char closeDoubleQuote = Instance.getTrans().getCharacter(
52 StringId.CLOSE_DOUBLE_QUOTE);
53
54 // New methods not used in Deprecated mode
55 @Override
56 protected String getDesc() throws IOException {
57 throw new RuntimeException("should not be used by legacy code");
58 }
59
60 @Override
61 protected MetaData getMeta() throws IOException {
62 throw new RuntimeException("should not be used by legacy code");
63 }
64
65 @Override
66 protected List<Entry<String, URL>> getChapters(Progress pg)
67 throws IOException {
68 throw new RuntimeException("should not be used by legacy code");
69 }
70
71 @Override
72 protected String getChapterContent(URL chapUrl, int number, Progress pg)
73 throws IOException {
74 throw new RuntimeException("should not be used by legacy code");
75 }
76
77 @Override
78 public Story process(Progress pg) throws IOException {
79 return process(getSource(), pg);
80 }
81
82 //
83
84 /**
85 * Return the {@link MetaData} of this story (legacy, stream based variant).
86 *
87 * @param source
88 * the source of the story (processMeta passes the canonical URL)
89 * @param in
90 * the input (the main resource), can be NULL (see openInput)
91 *
92 * @return the associated {@link MetaData}, never NULL
93 *
94 * @throws IOException
95 * in case of I/O error
96 */
97 protected abstract MetaData getMeta(URL source, InputStream in)
98 throws IOException;
99
100 /**
101 * Return the story description (legacy, stream based variant).
102 *
103 * @param source
104 * the source of the story (processMeta passes the canonical URL)
105 * @param in
106 * the input (the main resource), can be NULL (see openInput)
107 *
108 * @return the description (it is then turned into a chapter by makeChapter)
109 *
110 * @throws IOException
111 * in case of I/O error
112 */
113 protected abstract String getDesc(URL source, InputStream in)
114 throws IOException;
115
116 /**
117 * Return the list of chapters (name and resource).
118 *
119 * @param source
120 * the source of the story (process passes the canonical URL)
121 * @param in
122 * the input (the main resource), can be NULL (see openInput)
123 * @param pg
124 * the optional progress reporter
125 *
126 * @return the chapters (process accepts NULL and then extracts no chapter)
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 protected abstract List<Entry<String, URL>> getChapters(URL source,
132 InputStream in, Progress pg) throws IOException;
133
134 /**
135 * Return the content of the chapter (possibly HTML encoded, if
136 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
137 *
138 * @param source
139 * the source of the story
140 * @param in
141 * the input of this chapter (NULL when the chapter has no URL)
142 * @param number
143 * the chapter number (process starts counting at 1)
144 * @param pg
145 * the optional progress reporter
146 *
147 * @return the content
148 *
149 * @throws IOException
150 * in case of I/O error
151 */
152 protected abstract String getChapterContent(URL source, InputStream in,
153 int number, Progress pg) throws IOException;
154
155 /**
156 * Process the given story resource into a partially filled {@link Story}
157 * object containing the name and metadata, except for the description.
158 *
159 * @param url
160 * the story resource
161 *
162 * @return the {@link Story}
163 *
164 * @throws IOException
165 * in case of I/O error
166 */
167 public Story processMeta(URL url) throws IOException {
168 return processMeta(url, true, false, null);
169 }
170
171 /**
172 * Process the given story resource into a partially filled {@link Story}
173 * object containing the name and metadata.
174 *
175 * @param url
176 * the story resource
177 * @param close
178 * close "this" and "in" when done
179 * @param getDesc
180 * retrieve the description of the story, or not
181 * @param pg
182 * the optional progress reporter
183 *
184 * @return the {@link Story}, never NULL
185 *
186 * @throws IOException
187 * in case of I/O error
188 */
189 protected Story processMeta(URL url, boolean close, boolean getDesc,
190 Progress pg) throws IOException {
191 if (pg == null) {
192 pg = new Progress();
193 } else {
194 pg.setMinMax(0, 100);
195 }
196
197 login();
198 pg.setProgress(10);
199
200 url = getCanonicalUrl(url);
201
202 setCurrentReferer(url);
203
204 in = openInput(url); // NULL allowed here
205 try {
206 preprocess(url, getInput());
207 pg.setProgress(30);
208
209 Story story = new Story();
210 MetaData meta = getMeta(url, getInput());
211 if (meta.getCreationDate() == null
212 || meta.getCreationDate().isEmpty()) {
213 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
214 }
215 story.setMeta(meta);
216
217 pg.setProgress(50);
218
219 if (meta.getCover() == null) {
220 meta.setCover(getDefaultCover(meta.getSubject()));
221 }
222
223 pg.setProgress(60);
224
225 if (getDesc) {
226 String descChapterName = Instance.getTrans().getString(
227 StringId.DESCRIPTION);
228 story.getMeta().setResume(
229 makeChapter(url, 0, descChapterName,
230 getDesc(url, getInput()), null));
231 }
232
233 pg.setProgress(100);
234 return story;
235 } finally {
236 if (close) {
237 close();
238
239 if (in != null) {
240 in.close();
241 }
242 }
243 }
244 }
245
246 /**
247 * Process the given story resource into a fully filled {@link Story}
248 * object.
249 *
250 * @param url
251 * the story resource
252 * @param pg
253 * the optional progress reporter
254 *
255 * @return the {@link Story}, never NULL
256 *
257 * @throws IOException
258 * in case of I/O error
259 */
260 protected Story process(URL url, Progress pg) throws IOException {
261 if (pg == null) {
262 pg = new Progress();
263 } else {
264 pg.setMinMax(0, 100);
265 }
266
267 url = getCanonicalUrl(url);
268 pg.setProgress(1);
269 try {
270 Progress pgMeta = new Progress();
271 pg.addProgress(pgMeta, 10);
272 Story story = processMeta(url, false, true, pgMeta);
273 if (!pgMeta.isDone()) {
274 pgMeta.setProgress(pgMeta.getMax()); // 10%
275 }
276
277 pg.setName("Retrieving " + story.getMeta().getTitle());
278
279 setCurrentReferer(url);
280
281 Progress pgGetChapters = new Progress();
282 pg.addProgress(pgGetChapters, 10);
283 story.setChapters(new ArrayList<Chapter>());
284 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
285 pgGetChapters);
286 if (!pgGetChapters.isDone()) {
287 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
288 }
289
290 if (chapters != null) {
291 Progress pgChaps = new Progress("Extracting chapters", 0,
292 chapters.size() * 300);
293 pg.addProgress(pgChaps, 80);
294
295 long words = 0;
296 int i = 1;
297 for (Entry<String, URL> chap : chapters) {
298 pgChaps.setName("Extracting chapter " + i);
299 InputStream chapIn = null;
300 if (chap.getValue() != null) {
301 setCurrentReferer(chap.getValue());
302 chapIn = Instance.getCache().open(chap.getValue(),
303 this, false);
304 }
305 pgChaps.setProgress(i * 100);
306 try {
307 Progress pgGetChapterContent = new Progress();
308 Progress pgMakeChapter = new Progress();
309 pgChaps.addProgress(pgGetChapterContent, 100);
310 pgChaps.addProgress(pgMakeChapter, 100);
311
312 String content = getChapterContent(url, chapIn, i,
313 pgGetChapterContent);
314 if (!pgGetChapterContent.isDone()) {
315 pgGetChapterContent.setProgress(pgGetChapterContent
316 .getMax());
317 }
318
319 Chapter cc = makeChapter(url, i, chap.getKey(),
320 content, pgMakeChapter);
321 if (!pgMakeChapter.isDone()) {
322 pgMakeChapter.setProgress(pgMakeChapter.getMax());
323 }
324
325 words += cc.getWords();
326 story.getChapters().add(cc);
327 story.getMeta().setWords(words);
328 } finally {
329 if (chapIn != null) {
330 chapIn.close();
331 }
332 }
333
334 i++;
335 }
336
337 pgChaps.setName("Extracting chapters");
338 } else {
339 pg.setProgress(80);
340 }
341
342 return story;
343
344 } finally {
345 close();
346
347 if (in != null) {
348 in.close();
349 }
350 }
351 }
352
353 /**
354 * Prepare the support if needed before processing.
355 *
356 * @param source
357 * the source of the story
358 * @param in
359 * the input (the main resource)
360 *
361 * @throws IOException
362 * on I/O error
363 */
364 @SuppressWarnings("unused")
365 protected void preprocess(URL source, InputStream in) throws IOException {
366 }
367
368 /**
369 * Create a {@link Chapter} object from the given information, formatting
370 * the content as it should be.
371 *
372 * @param source
373 * the source of the story
374 * @param number
375 * the chapter number
376 * @param name
377 * the chapter name
378 * @param content
379 * the chapter content
380 * @param pg
381 * the optional progress reporter
382 *
383 * @return the {@link Chapter}
384 *
385 * @throws IOException
386 * in case of I/O error
387 */
388 protected Chapter makeChapter(URL source, int number, String name,
389 String content, Progress pg) throws IOException {
390 // Chapter name: process it correctly, then remove the possible
391 // redundant "Chapter x: " in front of it, or "-" (as in
392 // "Chapter 5: - Fun!" after the ": " was automatically added)
393 String chapterName = processPara(name).getContent().trim();
394 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
395 .split(",")) {
396 String chapterWord = Instance.getConfig().getStringX(
397 Config.CHAPTER, lang);
398 if (chapterName.startsWith(chapterWord)) {
399 chapterName = chapterName.substring(chapterWord.length())
400 .trim();
401 break;
402 }
403 }
404
405 if (chapterName.startsWith(Integer.toString(number))) {
406 chapterName = chapterName.substring(
407 Integer.toString(number).length()).trim();
408 }
409
410 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
411 chapterName = chapterName.substring(1).trim();
412 }
413 //
414
415 Chapter chap = new Chapter(number, chapterName);
416
417 if (content != null) {
418 List<Paragraph> paras = makeParagraphs(source, content, pg);
419 long words = 0;
420 for (Paragraph para : paras) {
421 words += para.getWords();
422 }
423 chap.setParagraphs(paras);
424 chap.setWords(words);
425 }
426
427 return chap;
428
429 }
430
431 /**
432 * Convert the given content into {@link Paragraph}s.
433 *
434 * @param source
435 * the source URL of the story
436 * @param content
437 * the textual content
438 * @param pg
439 * the optional progress reporter
440 *
441 * @return the {@link Paragraph}s
442 *
443 * @throws IOException
444 * in case of I/O error
445 */
446 protected List<Paragraph> makeParagraphs(URL source, String content,
447 Progress pg) throws IOException {
448 if (pg == null) {
449 pg = new Progress();
450 }
451
452 if (isHtml()) {
453 // Special <HR> processing:
454 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
455 "<br/>* * *<br/>");
456 }
457
458 List<Paragraph> paras = new ArrayList<Paragraph>();
459
460 if (content != null && !content.trim().isEmpty()) {
461 if (isHtml()) {
462 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
463 pg.setMinMax(0, tab.length);
464 int i = 1;
465 for (String line : tab) {
466 if (line.startsWith("[") && line.endsWith("]")) {
467 pg.setName("Extracting image " + i);
468 }
469 paras.add(makeParagraph(source, line.trim()));
470 pg.setProgress(i++);
471 }
472 pg.setName(null);
473 } else {
474 List<String> lines = new ArrayList<String>();
475 BufferedReader buff = null;
476 try {
477 buff = new BufferedReader(
478 new InputStreamReader(new ByteArrayInputStream(
479 content.getBytes("UTF-8")), "UTF-8"));
480 for (String line = buff.readLine(); line != null; line = buff
481 .readLine()) {
482 lines.add(line.trim());
483 }
484 } finally {
485 if (buff != null) {
486 buff.close();
487 }
488 }
489
490 pg.setMinMax(0, lines.size());
491 int i = 0;
492 for (String line : lines) {
493 if (line.startsWith("[") && line.endsWith("]")) {
494 pg.setName("Extracting image " + i);
495 }
496 paras.add(makeParagraph(source, line));
497 pg.setProgress(i++);
498 }
499 pg.setName(null);
500 }
501
502 // Check quotes for "bad" format
503 List<Paragraph> newParas = new ArrayList<Paragraph>();
504 for (Paragraph para : paras) {
505 newParas.addAll(requotify(para));
506 }
507 paras = newParas;
508
509 // Remove double blanks/brks
510 fixBlanksBreaks(paras);
511 }
512
513 return paras;
514 }
515
516 /**
517 * Convert the given line into a single {@link Paragraph}.
518 *
519 * @param source
520 * the source URL of the story
521 * @param line
522 * the textual content of the paragraph
523 *
524 * @return the {@link Paragraph}
525 */
526 private Paragraph makeParagraph(URL source, String line) {
527 Image image = null;
528 if (line.startsWith("[") && line.endsWith("]")) {
529 image = getImage(this, source, line.substring(1, line.length() - 1)
530 .trim());
531 }
532
533 if (image != null) {
534 return new Paragraph(image);
535 }
536
537 return processPara(line);
538 }
539
540 /**
541 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
542 * those {@link Paragraph}s (the given list is modified in place).
543 * <p>
544 * The resulting list will not contain a starting or trailing blank/break
545 * nor 2 blanks or breaks following each other.
546 *
547 * @param paras
548 * the list of {@link Paragraph}s to fix
549 */
550 protected void fixBlanksBreaks(List<Paragraph> paras) {
551 boolean space = false; // was the previously visited paragraph a BLANK?
552 boolean brk = true; // starts at TRUE so a leading blank/break is removed
553 for (int i = 0; i < paras.size(); i++) {
554 Paragraph para = paras.get(i);
555 boolean thisSpace = para.getType() == ParagraphType.BLANK;
556 boolean thisBrk = para.getType() == ParagraphType.BREAK;
557
558 if (i > 0 && space && thisBrk) { // break after a blank: drop the previous entry
559 paras.remove(i - 1);
560 i--; // the current paragraph moved one slot back
561 } else if ((space || brk) && (thisSpace || thisBrk)) { // 2nd blank/break in a row: drop it
562 paras.remove(i);
563 i--; // stay on the same slot for the next iteration
564 }
565
566 space = thisSpace; // note: also updated when the paragraph was just removed
567 brk = thisBrk;
568 }
569
570 // Remove blank/brk at start
571 if (paras.size() > 0
572 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
573 0).getType() == ParagraphType.BREAK)) {
574 paras.remove(0);
575 }
576
577 // Remove blank/brk at end
578 int last = paras.size() - 1;
579 if (paras.size() > 0
580 && (paras.get(last).getType() == ParagraphType.BLANK || paras
581 .get(last).getType() == ParagraphType.BREAK)) {
582 paras.remove(last);
583 }
584 }
585
586 /**
587 * Get the default cover related to this subject (see <tt>.info</tt> files).
588 *
589 * @param subject
590 * the subject
591 *
592 * @return the cover if any, or NULL
593 */
594 static Image getDefaultCover(String subject) {
595 if (subject != null && !subject.isEmpty()
596 && Instance.getCoverDir() != null) {
597 try {
598 File fileCover = new File(Instance.getCoverDir(), subject);
599 return getImage(null, fileCover.toURI().toURL(), subject);
600 } catch (MalformedURLException e) {
601 }
602 }
603
604 return null;
605 }
606
607 /**
608 * Return the list of supported image extensions.
609 *
610 * @param emptyAllowed
611 * TRUE to allow an empty extension on first place, which can be
612 * used when you may already have an extension in your input but
613 * are not sure about it
614 *
615 * @return the extensions
616 */
617 static String[] getImageExt(boolean emptyAllowed) {
618 if (emptyAllowed) {
619 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
620 }
621
622 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
623 }
624
625 /**
626 * Check if the given resource can be a local image or a remote image, then
627 * refresh the cache with it if it is.
628 *
629 * @param source
630 * the story source
631 * @param line
632 * the resource to check
633 *
634 * @return the image if found, or NULL
635 *
636 */
637 static Image getImage(BasicSupport_Deprecated support, URL source,
638 String line) {
639 URL url = getImageUrl(support, source, line);
640 if (url != null) {
641 if ("file".equals(url.getProtocol())) {
642 if (new File(url.getPath()).isDirectory()) {
643 return null;
644 }
645 }
646 InputStream in = null;
647 try {
648 in = Instance.getCache().open(url, getSupport(url), true);
649 return new Image(in);
650 } catch (IOException e) {
651 } finally {
652 if (in != null) {
653 try {
654 in.close();
655 } catch (IOException e) {
656 }
657 }
658 }
659 }
660
661 return null;
662 }
663
664 /**
665 * Check if the given resource can be a local image or a remote image, then
666 * refresh the cache with it if it is.
667 *
668 * @param source
669 * the story source
670 * @param line
671 * the resource to check
672 *
673 * @return the image URL if found, or NULL
674 *
675 */
676 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
677 String line) {
678 URL url = null;
679
680 if (line != null) {
681 // try for files
682 if (source != null) {
683 try {
684 String relPath = null;
685 String absPath = null;
686 try {
687 String path = new File(source.getFile()).getParent();
688 relPath = new File(new File(path), line.trim())
689 .getAbsolutePath();
690 } catch (Exception e) {
691 // Cannot be converted to path (one possibility to take
692 // into account: absolute path on Windows)
693 }
694 try {
695 absPath = new File(line.trim()).getAbsolutePath();
696 } catch (Exception e) {
697 // Cannot be converted to path (at all)
698 }
699
700 for (String ext : getImageExt(true)) {
701 File absFile = new File(absPath + ext);
702 File relFile = new File(relPath + ext);
703 if (absPath != null && absFile.exists()
704 && absFile.isFile()) {
705 url = absFile.toURI().toURL();
706 } else if (relPath != null && relFile.exists()
707 && relFile.isFile()) {
708 url = relFile.toURI().toURL();
709 }
710 }
711 } catch (Exception e) {
712 // Should not happen since we control the correct arguments
713 }
714 }
715
716 if (url == null) {
717 // try for URLs
718 try {
719 for (String ext : getImageExt(true)) {
720 if (Instance.getCache()
721 .check(new URL(line + ext), true)) {
722 url = new URL(line + ext);
723 break;
724 }
725 }
726
727 // try out of cache
728 if (url == null) {
729 for (String ext : getImageExt(true)) {
730 try {
731 url = new URL(line + ext);
732 Instance.getCache().refresh(url, support, true);
733 break;
734 } catch (IOException e) {
735 // no image with this ext
736 url = null;
737 }
738 }
739 }
740 } catch (MalformedURLException e) {
741 // Not an url
742 }
743 }
744
745 // refresh the cached file
746 if (url != null) {
747 try {
748 Instance.getCache().refresh(url, support, true);
749 } catch (IOException e) {
750 // woops, broken image
751 url = null;
752 }
753 }
754 }
755
756 return url;
757 }
758
759 /**
760 * Open the input file that will be used through the support.
761 * <p>
762 * Can return NULL, in which case you are supposed to work without an
763 * {@link InputStream}.
764 *
765 * @param source
766 * the source {@link URL}
767 *
768 * @return the {@link InputStream}
769 *
770 * @throws IOException
771 * in case of I/O error
772 */
773 protected InputStream openInput(URL source) throws IOException {
774 return Instance.getCache().open(source, this, false);
775 }
776
777 /**
778 * Reset then return {@link BasicSupport_Deprecated#in}.
779 *
780 * @return {@link BasicSupport_Deprecated#in}
781 */
782 protected InputStream getInput() {
783 return reset(in);
784 }
785
786 /**
787 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
788 * and requotify them (i.e., separate them into QUOTE paragraphs and other
789 * paragraphs (quotes or not)).
790 *
791 * @param para
792 * the paragraph to requotify (not necessarily a quote)
793 *
794 * @return the correctly (or so we hope) quotified paragraphs
795 */
796 protected List<Paragraph> requotify(Paragraph para) {
797 List<Paragraph> newParas = new ArrayList<Paragraph>();
798
799 if (para.getType() == ParagraphType.QUOTE
800 && para.getContent().length() > 2) {
801 String line = para.getContent();
802 boolean singleQ = line.startsWith("" + openQuote);
803 boolean doubleQ = line.startsWith("" + openDoubleQuote);
804
805 // Do not try when more than one quote at a time
806 // (some stories are not easily readable if we do)
807 if (singleQ
808 && line.indexOf(closeQuote, 1) < line
809 .lastIndexOf(closeQuote)) {
810 newParas.add(para);
811 return newParas;
812 }
813 if (doubleQ
814 && line.indexOf(closeDoubleQuote, 1) < line
815 .lastIndexOf(closeDoubleQuote)) {
816 newParas.add(para);
817 return newParas;
818 }
819 //
820
821 if (!singleQ && !doubleQ) {
822 line = openDoubleQuote + line + closeDoubleQuote;
823 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
824 .getWords()));
825 } else {
826 char open = singleQ ? openQuote : openDoubleQuote;
827 char close = singleQ ? closeQuote : closeDoubleQuote;
828
829 int posDot = -1;
830 boolean inQuote = false;
831 int i = 0;
832 for (char car : line.toCharArray()) {
833 if (car == open) {
834 inQuote = true;
835 } else if (car == close) {
836 inQuote = false;
837 } else if (car == '.' && !inQuote) {
838 posDot = i;
839 break;
840 }
841 i++;
842 }
843
844 if (posDot >= 0) {
845 String rest = line.substring(posDot + 1).trim();
846 line = line.substring(0, posDot + 1).trim();
847 long words = 1;
848 for (char car : line.toCharArray()) {
849 if (car == ' ') {
850 words++;
851 }
852 }
853 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
854 if (!rest.isEmpty()) {
855 newParas.addAll(requotify(processPara(rest)));
856 }
857 } else {
858 newParas.add(para);
859 }
860 }
861 } else {
862 newParas.add(para);
863 }
864
865 return newParas;
866 }
867
868 /**
869 * Process a {@link Paragraph} from a raw line of text.
870 * <p>
871 * Will also fix quotes and HTML encoding if needed.
872 *
873 * @param line
874 * the raw line
875 *
876 * @return the processed {@link Paragraph}
877 */
878 protected Paragraph processPara(String line) {
879 line = ifUnhtml(line).trim();
880
881 boolean space = true;
882 boolean brk = true;
883 boolean quote = false;
884 boolean tentativeCloseQuote = false;
885 char prev = '\0';
886 int dashCount = 0;
887 long words = 1;
888
889 StringBuilder builder = new StringBuilder();
890 for (char car : line.toCharArray()) {
891 if (car != '-') {
892 if (dashCount > 0) {
893 // dash, ndash and mdash: - – —
894 // currently: always use mdash
895 builder.append(dashCount == 1 ? '-' : '—');
896 }
897 dashCount = 0;
898 }
899
900 if (tentativeCloseQuote) {
901 tentativeCloseQuote = false;
902 if (Character.isLetterOrDigit(car)) {
903 builder.append("'");
904 } else {
905 // handle double-single quotes as double quotes
906 if (prev == car) {
907 builder.append(closeDoubleQuote);
908 continue;
909 }
910
911 builder.append(closeQuote);
912 }
913 }
914
915 switch (car) {
916 case ' ': // note: unbreakable space
917 case ' ':
918 case '\t':
919 case '\n': // just in case
920 case '\r': // just in case
921 if (builder.length() > 0
922 && builder.charAt(builder.length() - 1) != ' ') {
923 words++;
924 }
925 builder.append(' ');
926 break;
927
928 case '\'':
929 if (space || (brk && quote)) {
930 quote = true;
931 // handle double-single quotes as double quotes
932 if (prev == car) {
933 builder.deleteCharAt(builder.length() - 1);
934 builder.append(openDoubleQuote);
935 } else {
936 builder.append(openQuote);
937 }
938 } else if (prev == ' ' || prev == car) {
939 // handle double-single quotes as double quotes
940 if (prev == car) {
941 builder.deleteCharAt(builder.length() - 1);
942 builder.append(openDoubleQuote);
943 } else {
944 builder.append(openQuote);
945 }
946 } else {
947 // it is a quote ("I'm off") or a 'quote' ("This
948 // 'good' restaurant"...)
949 tentativeCloseQuote = true;
950 }
951 break;
952
953 case '"':
954 if (space || (brk && quote)) {
955 quote = true;
956 builder.append(openDoubleQuote);
957 } else if (prev == ' ') {
958 builder.append(openDoubleQuote);
959 } else {
960 builder.append(closeDoubleQuote);
961 }
962 break;
963
964 case '-':
965 if (space) {
966 quote = true;
967 } else {
968 dashCount++;
969 }
970 space = false;
971 break;
972
973 case '*':
974 case '~':
975 case '/':
976 case '\\':
977 case '<':
978 case '>':
979 case '=':
980 case '+':
981 case '_':
982 case '–':
983 case '—':
984 space = false;
985 builder.append(car);
986 break;
987
988 case '‘':
989 case '`':
990 case '‹':
991 case '﹁':
992 case '〈':
993 case '「':
994 if (space || (brk && quote)) {
995 quote = true;
996 builder.append(openQuote);
997 } else {
998 // handle double-single quotes as double quotes
999 if (prev == car) {
1000 builder.deleteCharAt(builder.length() - 1);
1001 builder.append(openDoubleQuote);
1002 } else {
1003 builder.append(openQuote);
1004 }
1005 }
1006 space = false;
1007 brk = false;
1008 break;
1009
1010 case '’':
1011 case '›':
1012 case '﹂':
1013 case '〉':
1014 case '」':
1015 space = false;
1016 brk = false;
1017 // handle double-single quotes as double quotes
1018 if (prev == car) {
1019 builder.deleteCharAt(builder.length() - 1);
1020 builder.append(closeDoubleQuote);
1021 } else {
1022 builder.append(closeQuote);
1023 }
1024 break;
1025
1026 case '«':
1027 case '“':
1028 case '﹃':
1029 case '《':
1030 case '『':
1031 if (space || (brk && quote)) {
1032 quote = true;
1033 builder.append(openDoubleQuote);
1034 } else {
1035 builder.append(openDoubleQuote);
1036 }
1037 space = false;
1038 brk = false;
1039 break;
1040
1041 case '»':
1042 case '”':
1043 case '﹄':
1044 case '》':
1045 case '』':
1046 space = false;
1047 brk = false;
1048 builder.append(closeDoubleQuote);
1049 break;
1050
1051 default:
1052 space = false;
1053 brk = false;
1054 builder.append(car);
1055 break;
1056 }
1057
1058 prev = car;
1059 }
1060
1061 if (tentativeCloseQuote) {
1062 tentativeCloseQuote = false;
1063 builder.append(closeQuote);
1064 }
1065
1066 line = builder.toString().trim();
1067
1068 ParagraphType type = ParagraphType.NORMAL;
1069 if (space) {
1070 type = ParagraphType.BLANK;
1071 } else if (brk) {
1072 type = ParagraphType.BREAK;
1073 } else if (quote) {
1074 type = ParagraphType.QUOTE;
1075 }
1076
1077 return new Paragraph(type, line, words);
1078 }
1079
1080 /**
1081 * Remove the HTML from the input <b>if</b>
1082 * {@link BasicSupport_Deprecated#isHtml()} is true.
1083 *
1084 * @param input
1085 * the input
1086 *
1087 * @return the no html version if needed
1088 */
1089 private String ifUnhtml(String input) {
1090 if (isHtml() && input != null) {
1091 return StringUtils.unhtml(input);
1092 }
1093
1094 return input;
1095 }
1096
1097 /**
1098 * Reset the given {@link InputStream} and return it.
1099 *
1100 * @param in
1101 * the {@link InputStream} to reset
1102 *
1103 * @return the same {@link InputStream} after reset
1104 */
1105 static protected InputStream reset(InputStream in) {
1106 try {
1107 if (in != null) {
1108 in.reset();
1109 }
1110 } catch (IOException e) {
1111 }
1112
1113 return in;
1114 }
1115
1116 /**
1117 * Return the first line from the given input which correspond to the given
1118 * selectors.
1119 *
1120 * @param in
1121 * the input
1122 * @param needle
1123 * a string that must be found inside the target line (also
1124 * supports "^" at start to say "only if it starts with" the
1125 * needle)
1126 * @param relativeLine
1127 * the line to return based upon the target line position (-1 =
1128 * the line before, 0 = the target line...)
1129 *
1130 * @return the line, or NULL if not found
1131 */
1132 static protected String getLine(InputStream in, String needle,
1133 int relativeLine) {
1134 return getLine(in, needle, relativeLine, true);
1135 }
1136
1137 /**
1138 * Return a line from the given input which correspond to the given
1139 * selectors.
1140 *
1141 * @param in
1142 * the input
1143 * @param needle
1144 * a string that must be found inside the target line (also
1145 * supports "^" at start to say "only if it starts with" the
1146 * needle)
1147 * @param relativeLine
1148 * the line to return based upon the target line position (-1 =
1149 * the line before, 0 = the target line...)
1150 * @param first
1151 * takes the first result (as opposed to the last one, which will
1152 * also always spend the input)
1153 *
1154 * @return the line, or NULL if not found
1155 */
1156 static protected String getLine(InputStream in, String needle,
1157 int relativeLine, boolean first) {
1158 String rep = null;
1159
1160 reset(in);
1161
1162 List<String> lines = new ArrayList<String>();
1163 @SuppressWarnings("resource")
1164 Scanner scan = new Scanner(in, "UTF-8");
1165 int index = -1;
1166 scan.useDelimiter("\\n");
1167 while (scan.hasNext()) {
1168 lines.add(scan.next());
1169
1170 if (index == -1) {
1171 if (needle.startsWith("^")) {
1172 if (lines.get(lines.size() - 1).startsWith(
1173 needle.substring(1))) {
1174 index = lines.size() - 1;
1175 }
1176
1177 } else {
1178 if (lines.get(lines.size() - 1).contains(needle)) {
1179 index = lines.size() - 1;
1180 }
1181 }
1182 }
1183
1184 if (index >= 0 && index + relativeLine < lines.size()) {
1185 rep = lines.get(index + relativeLine);
1186 if (first) {
1187 break;
1188 }
1189 }
1190 }
1191
1192 return rep;
1193 }
1194
1195 /**
1196 * Return the text between the key and the endKey (and optional subKey can
1197 * be passed, in this case we will look for the key first, then take the
1198 * text between the subKey and the endKey).
1199 * <p>
1200 * Will only match the first line with the given key if more than one are
1201 * possible. Which also means that if the subKey or endKey is not found on
1202 * that line, NULL will be returned.
1203 *
1204 * @param in
1205 * the input
1206 * @param key
1207 * the key to match (also supports "^" at start to say
1208 * "only if it starts with" the key)
1209 * @param subKey
1210 * the sub key or NULL if none
1211 * @param endKey
1212 * the end key or NULL for "up to the end"
1213 * @return the text or NULL if not found
1214 */
1215 static protected String getKeyLine(InputStream in, String key,
1216 String subKey, String endKey) {
1217 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1218 }
1219
1220 /**
1221 * Return the text between the key and the endKey (and optional subKey can
1222 * be passed, in this case we will look for the key first, then take the
1223 * text between the subKey and the endKey).
1224 *
1225 * @param in
1226 * the input
1227 * @param key
1228 * the key to match (also supports "^" at start to say
1229 * "only if it starts with" the key)
1230 * @param subKey
1231 * the sub key or NULL if none
1232 * @param endKey
1233 * the end key or NULL for "up to the end"
1234 * @return the text or NULL if not found
1235 */
1236 static protected String getKeyText(String in, String key, String subKey,
1237 String endKey) {
1238 String result = null;
1239
1240 String line = in;
1241 if (line != null && line.contains(key)) {
1242 line = line.substring(line.indexOf(key) + key.length());
1243 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1244 if (subKey != null) {
1245 line = line.substring(line.indexOf(subKey)
1246 + subKey.length());
1247 }
1248 if (endKey == null || line.contains(endKey)) {
1249 if (endKey != null) {
1250 line = line.substring(0, line.indexOf(endKey));
1251 result = line;
1252 }
1253 }
1254 }
1255 }
1256
1257 return result;
1258 }
1259
1260 /**
1261 * Return the text between the key and the endKey (optional subKeys can be
1262 * passed, in this case we will look for the subKeys first, then take the
1263 * text between the key and the endKey).
1264 *
1265 * @param in
1266 * the input
1267 * @param key
1268 * the key to match
1269 * @param endKey
1270 * the end key or NULL for "up to the end"
1271 * @param afters
1272 * the sub-keys to find before checking for key/endKey
1273 *
1274 * @return the text or NULL if not found
1275 */
1276 static protected String getKeyTextAfter(String in, String key,
1277 String endKey, String... afters) {
1278
1279 if (in != null && !in.isEmpty()) {
1280 int pos = indexOfAfter(in, 0, afters);
1281 if (pos < 0) {
1282 return null;
1283 }
1284
1285 in = in.substring(pos);
1286 }
1287
1288 return getKeyText(in, key, null, endKey);
1289 }
1290
1291 /**
1292 * Return the first index after all the given "afters" have been found in
1293 * the {@link String}, or -1 if it was not possible.
1294 *
1295 * @param in
1296 * the input
1297 * @param startAt
1298 * start at this position in the string
1299 * @param afters
1300 * the sub-keys to find before checking for key/endKey
1301 *
1302 * @return the text or NULL if not found
1303 */
1304 static protected int indexOfAfter(String in, int startAt, String... afters) {
1305 int pos = -1;
1306 if (in != null && !in.isEmpty()) {
1307 pos = startAt;
1308 if (afters != null) {
1309 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1310 String subKey = afters[i];
1311 if (!subKey.isEmpty()) {
1312 pos = in.indexOf(subKey, pos);
1313 if (pos >= 0) {
1314 pos += subKey.length();
1315 }
1316 }
1317 }
1318 }
1319 }
1320
1321 return pos;
1322 }
1323 }