Merge commit 'e6bb1700749980e69b5e913acbfd276f129c24dc'
[nikiroo-utils.git] / src / be / nikiroo / fanfix / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
42 private InputStream in;
43
44 // quote chars
45 private char openQuote = Instance.getTrans().getCharacter(
46 StringId.OPEN_SINGLE_QUOTE);
47 private char closeQuote = Instance.getTrans().getCharacter(
48 StringId.CLOSE_SINGLE_QUOTE);
49 private char openDoubleQuote = Instance.getTrans().getCharacter(
50 StringId.OPEN_DOUBLE_QUOTE);
51 private char closeDoubleQuote = Instance.getTrans().getCharacter(
52 StringId.CLOSE_DOUBLE_QUOTE);
53
54 // New methods not used in Deprecated mode
55 @Override
56 protected String getDesc() throws IOException {
57 throw new RuntimeException("should not be used by legacy code");
58 }
59
60 @Override
61 protected MetaData getMeta() throws IOException {
62 throw new RuntimeException("should not be used by legacy code");
63 }
64
65 @Override
66 protected List<Entry<String, URL>> getChapters(Progress pg)
67 throws IOException {
68 throw new RuntimeException("should not be used by legacy code");
69 }
70
71 @Override
72 protected String getChapterContent(URL chapUrl, int number, Progress pg)
73 throws IOException {
74 throw new RuntimeException("should not be used by legacy code");
75 }
76
77 @Override
78 public Story process(Progress pg) throws IOException {
79 return process(getSource(), pg);
80 }
81
82 //
83
84 /**
85 * Return the {@link MetaData} of this story.
86 *
87 * @param source
88 * the source of the story
89 * @param in
90 * the input (the main resource)
91 *
92 * @return the associated {@link MetaData}, never NULL
93 *
94 * @throws IOException
95 * in case of I/O error
96 */
97 protected abstract MetaData getMeta(URL source, InputStream in)
98 throws IOException;
99
100 /**
101 * Return the story description.
102 *
103 * @param source
104 * the source of the story
105 * @param in
106 * the input (the main resource)
107 *
108 * @return the description
109 *
110 * @throws IOException
111 * in case of I/O error
112 */
113 protected abstract String getDesc(URL source, InputStream in)
114 throws IOException;
115
116 /**
117 * Return the list of chapters (name and resource).
118 *
119 * @param source
120 * the source of the story
121 * @param in
122 * the input (the main resource)
123 * @param pg
124 * the optional progress reporter
125 *
126 * @return the chapters
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 protected abstract List<Entry<String, URL>> getChapters(URL source,
132 InputStream in, Progress pg) throws IOException;
133
134 /**
135 * Return the content of the chapter (possibly HTML encoded, if
136 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
137 *
138 * @param source
139 * the source of the story
140 * @param in
141 * the input (the main resource)
142 * @param number
143 * the chapter number
144 * @param pg
145 * the optional progress reporter
146 *
147 * @return the content
148 *
149 * @throws IOException
150 * in case of I/O error
151 */
152 protected abstract String getChapterContent(URL source, InputStream in,
153 int number, Progress pg) throws IOException;
154
155 /**
156 * Process the given story resource into a partially filled {@link Story}
157 * object containing the name and metadata, except for the description.
158 *
159 * @param url
160 * the story resource
161 *
162 * @return the {@link Story}
163 *
164 * @throws IOException
165 * in case of I/O error
166 */
167 public Story processMeta(URL url) throws IOException {
168 return processMeta(url, true, false, null);
169 }
170
171 /**
172 * Process the given story resource into a partially filled {@link Story}
173 * object containing the name and metadata.
174 *
175 * @param url
176 * the story resource
177 * @param close
178 * close "this" and "in" when done
179 * @param getDesc
180 * retrieve the description of the story, or not
181 * @param pg
182 * the optional progress reporter
183 *
184 * @return the {@link Story}, never NULL
185 *
186 * @throws IOException
187 * in case of I/O error
188 */
189 protected Story processMeta(URL url, boolean close, boolean getDesc,
190 Progress pg) throws IOException {
191 if (pg == null) {
192 pg = new Progress();
193 } else {
194 pg.setMinMax(0, 100);
195 }
196
197 login();
198 pg.setProgress(10);
199
200 url = getCanonicalUrl(url);
201
202 setCurrentReferer(url);
203
204 in = openInput(url); // NULL allowed here
205 try {
206 preprocess(url, getInput());
207 pg.setProgress(30);
208
209 Story story = new Story();
210 MetaData meta = getMeta(url, getInput());
211 if (meta.getCreationDate() == null
212 || meta.getCreationDate().isEmpty()) {
213 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
214 }
215 story.setMeta(meta);
216
217 pg.setProgress(50);
218
219 if (meta.getCover() == null) {
220 meta.setCover(getDefaultCover(meta.getSubject()));
221 }
222
223 pg.setProgress(60);
224
225 if (getDesc) {
226 String descChapterName = Instance.getTrans().getString(
227 StringId.DESCRIPTION);
228 story.getMeta().setResume(
229 makeChapter(url, 0, descChapterName,
230 getDesc(url, getInput()), null));
231 }
232
233 pg.setProgress(100);
234 return story;
235 } finally {
236 if (close) {
237 close();
238
239 if (in != null) {
240 in.close();
241 }
242 }
243 }
244 }
245
246 /**
247 * Process the given story resource into a fully filled {@link Story}
248 * object.
249 *
250 * @param url
251 * the story resource
252 * @param pg
253 * the optional progress reporter
254 *
255 * @return the {@link Story}, never NULL
256 *
257 * @throws IOException
258 * in case of I/O error
259 */
260 protected Story process(URL url, Progress pg) throws IOException {
261 if (pg == null) {
262 pg = new Progress();
263 } else {
264 pg.setMinMax(0, 100);
265 }
266
267 url = getCanonicalUrl(url);
268 pg.setProgress(1);
269 try {
270 Progress pgMeta = new Progress();
271 pg.addProgress(pgMeta, 10);
272 Story story = processMeta(url, false, true, pgMeta);
273 if (!pgMeta.isDone()) {
274 pgMeta.setProgress(pgMeta.getMax()); // 10%
275 }
276
277 pg.setName("Retrieving " + story.getMeta().getTitle());
278
279 setCurrentReferer(url);
280
281 Progress pgGetChapters = new Progress();
282 pg.addProgress(pgGetChapters, 10);
283 story.setChapters(new ArrayList<Chapter>());
284 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
285 pgGetChapters);
286 if (!pgGetChapters.isDone()) {
287 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
288 }
289
290 if (chapters != null) {
291 Progress pgChaps = new Progress("Extracting chapters", 0,
292 chapters.size() * 300);
293 pg.addProgress(pgChaps, 80);
294
295 long words = 0;
296 int i = 1;
297 for (Entry<String, URL> chap : chapters) {
298 pgChaps.setName("Extracting chapter " + i);
299 InputStream chapIn = null;
300 if (chap.getValue() != null) {
301 setCurrentReferer(chap.getValue());
302 chapIn = Instance.getCache().open(chap.getValue(),
303 this, false);
304 }
305 pgChaps.setProgress(i * 100);
306 try {
307 Progress pgGetChapterContent = new Progress();
308 Progress pgMakeChapter = new Progress();
309 pgChaps.addProgress(pgGetChapterContent, 100);
310 pgChaps.addProgress(pgMakeChapter, 100);
311
312 String content = getChapterContent(url, chapIn, i,
313 pgGetChapterContent);
314 if (!pgGetChapterContent.isDone()) {
315 pgGetChapterContent.setProgress(pgGetChapterContent
316 .getMax());
317 }
318
319 Chapter cc = makeChapter(url, i, chap.getKey(),
320 content, pgMakeChapter);
321 if (!pgMakeChapter.isDone()) {
322 pgMakeChapter.setProgress(pgMakeChapter.getMax());
323 }
324
325 words += cc.getWords();
326 story.getChapters().add(cc);
327 story.getMeta().setWords(words);
328 } finally {
329 if (chapIn != null) {
330 chapIn.close();
331 }
332 }
333
334 i++;
335 }
336
337 pgChaps.setName("Extracting chapters");
338 } else {
339 pg.setProgress(80);
340 }
341
342 return story;
343
344 } finally {
345 close();
346
347 if (in != null) {
348 in.close();
349 }
350 }
351 }
352
353 /**
354 * Prepare the support if needed before processing.
355 *
356 * @param source
357 * the source of the story
358 * @param in
359 * the input (the main resource)
360 *
361 * @throws IOException
362 * on I/O error
363 */
364 @SuppressWarnings("unused")
365 protected void preprocess(URL source, InputStream in) throws IOException {
366 }
367
368 /**
369 * Create a {@link Chapter} object from the given information, formatting
370 * the content as it should be.
371 *
372 * @param source
373 * the source of the story
374 * @param number
375 * the chapter number
376 * @param name
377 * the chapter name
378 * @param content
379 * the chapter content
380 * @param pg
381 * the optional progress reporter
382 *
383 * @return the {@link Chapter}
384 *
385 * @throws IOException
386 * in case of I/O error
387 */
388 protected Chapter makeChapter(URL source, int number, String name,
389 String content, Progress pg) throws IOException {
390 // Chapter name: process it correctly, then remove the possible
391 // redundant "Chapter x: " in front of it, or "-" (as in
392 // "Chapter 5: - Fun!" after the ": " was automatically added)
393 String chapterName = processPara(name).getContent().trim();
394 for (String lang : Instance.getConfig().getList(Config.CONF_CHAPTER)) {
395 String chapterWord = Instance.getConfig().getStringX(
396 Config.CONF_CHAPTER, lang);
397 if (chapterName.startsWith(chapterWord)) {
398 chapterName = chapterName.substring(chapterWord.length())
399 .trim();
400 break;
401 }
402 }
403
404 if (chapterName.startsWith(Integer.toString(number))) {
405 chapterName = chapterName.substring(
406 Integer.toString(number).length()).trim();
407 }
408
409 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
410 chapterName = chapterName.substring(1).trim();
411 }
412 //
413
414 Chapter chap = new Chapter(number, chapterName);
415
416 if (content != null) {
417 List<Paragraph> paras = makeParagraphs(source, content, pg);
418 long words = 0;
419 for (Paragraph para : paras) {
420 words += para.getWords();
421 }
422 chap.setParagraphs(paras);
423 chap.setWords(words);
424 }
425
426 return chap;
427
428 }
429
430 /**
431 * Convert the given content into {@link Paragraph}s.
432 *
433 * @param source
434 * the source URL of the story
435 * @param content
436 * the textual content
437 * @param pg
438 * the optional progress reporter
439 *
440 * @return the {@link Paragraph}s
441 *
442 * @throws IOException
443 * in case of I/O error
444 */
445 protected List<Paragraph> makeParagraphs(URL source, String content,
446 Progress pg) throws IOException {
447 if (pg == null) {
448 pg = new Progress();
449 }
450
451 if (isHtml()) {
452 // Special <HR> processing:
453 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
454 "<br/>* * *<br/>");
455 }
456
457 List<Paragraph> paras = new ArrayList<Paragraph>();
458
459 if (content != null && !content.trim().isEmpty()) {
460 if (isHtml()) {
461 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
462 pg.setMinMax(0, tab.length);
463 int i = 1;
464 for (String line : tab) {
465 if (line.startsWith("[") && line.endsWith("]")) {
466 pg.setName("Extracting image " + i);
467 }
468 paras.add(makeParagraph(source, line.trim()));
469 pg.setProgress(i++);
470 }
471 pg.setName(null);
472 } else {
473 List<String> lines = new ArrayList<String>();
474 BufferedReader buff = null;
475 try {
476 buff = new BufferedReader(
477 new InputStreamReader(new ByteArrayInputStream(
478 content.getBytes("UTF-8")), "UTF-8"));
479 for (String line = buff.readLine(); line != null; line = buff
480 .readLine()) {
481 lines.add(line.trim());
482 }
483 } finally {
484 if (buff != null) {
485 buff.close();
486 }
487 }
488
489 pg.setMinMax(0, lines.size());
490 int i = 0;
491 for (String line : lines) {
492 if (line.startsWith("[") && line.endsWith("]")) {
493 pg.setName("Extracting image " + i);
494 }
495 paras.add(makeParagraph(source, line));
496 pg.setProgress(i++);
497 }
498 pg.setName(null);
499 }
500
501 // Check quotes for "bad" format
502 List<Paragraph> newParas = new ArrayList<Paragraph>();
503 for (Paragraph para : paras) {
504 newParas.addAll(requotify(para));
505 }
506 paras = newParas;
507
508 // Remove double blanks/brks
509 fixBlanksBreaks(paras);
510 }
511
512 return paras;
513 }
514
515 /**
516 * Convert the given line into a single {@link Paragraph}.
517 *
518 * @param source
519 * the source URL of the story
520 * @param line
521 * the textual content of the paragraph
522 *
523 * @return the {@link Paragraph}
524 */
525 private Paragraph makeParagraph(URL source, String line) {
526 Image image = null;
527 if (line.startsWith("[") && line.endsWith("]")) {
528 image = getImage(this, source, line.substring(1, line.length() - 1)
529 .trim());
530 }
531
532 if (image != null) {
533 return new Paragraph(image);
534 }
535
536 return processPara(line);
537 }
538
539 /**
540 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
541 * those {@link Paragraph}s.
542 * <p>
543 * The resulting list will not contain a starting or trailing blank/break
544 * nor 2 blanks or breaks following each other.
545 *
546 * @param paras
547 * the list of {@link Paragraph}s to fix
548 */
549 protected void fixBlanksBreaks(List<Paragraph> paras) {
550 boolean space = false;
551 boolean brk = true;
552 for (int i = 0; i < paras.size(); i++) {
553 Paragraph para = paras.get(i);
554 boolean thisSpace = para.getType() == ParagraphType.BLANK;
555 boolean thisBrk = para.getType() == ParagraphType.BREAK;
556
557 if (i > 0 && space && thisBrk) {
558 paras.remove(i - 1);
559 i--;
560 } else if ((space || brk) && (thisSpace || thisBrk)) {
561 paras.remove(i);
562 i--;
563 }
564
565 space = thisSpace;
566 brk = thisBrk;
567 }
568
569 // Remove blank/brk at start
570 if (paras.size() > 0
571 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
572 0).getType() == ParagraphType.BREAK)) {
573 paras.remove(0);
574 }
575
576 // Remove blank/brk at end
577 int last = paras.size() - 1;
578 if (paras.size() > 0
579 && (paras.get(last).getType() == ParagraphType.BLANK || paras
580 .get(last).getType() == ParagraphType.BREAK)) {
581 paras.remove(last);
582 }
583 }
584
585 /**
586 * Get the default cover related to this subject (see <tt>.info</tt> files).
587 *
588 * @param subject
589 * the subject
590 *
591 * @return the cover if any, or NULL
592 */
593 static Image getDefaultCover(String subject) {
594 if (subject != null && !subject.isEmpty()
595 && Instance.getCoverDir() != null) {
596 try {
597 File fileCover = new File(Instance.getCoverDir(), subject);
598 return getImage(null, fileCover.toURI().toURL(), subject);
599 } catch (MalformedURLException e) {
600 }
601 }
602
603 return null;
604 }
605
606 /**
607 * Return the list of supported image extensions.
608 *
609 * @param emptyAllowed
610 * TRUE to allow an empty extension on first place, which can be
611 * used when you may already have an extension in your input but
612 * are not sure about it
613 *
614 * @return the extensions
615 */
616 static String[] getImageExt(boolean emptyAllowed) {
617 if (emptyAllowed) {
618 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
619 }
620
621 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
622 }
623
624 /**
625 * Check if the given resource can be a local image or a remote image, then
626 * refresh the cache with it if it is.
627 *
628 * @param source
629 * the story source
630 * @param line
631 * the resource to check
632 *
633 * @return the image if found, or NULL
634 *
635 */
636 static Image getImage(BasicSupport_Deprecated support, URL source,
637 String line) {
638 URL url = getImageUrl(support, source, line);
639 if (url != null) {
640 if ("file".equals(url.getProtocol())) {
641 if (new File(url.getPath()).isDirectory()) {
642 return null;
643 }
644 }
645 InputStream in = null;
646 try {
647 in = Instance.getCache().open(url, getSupport(url), true);
648 return new Image(in);
649 } catch (IOException e) {
650 } finally {
651 if (in != null) {
652 try {
653 in.close();
654 } catch (IOException e) {
655 }
656 }
657 }
658 }
659
660 return null;
661 }
662
663 /**
664 * Check if the given resource can be a local image or a remote image, then
665 * refresh the cache with it if it is.
666 *
667 * @param source
668 * the story source
669 * @param line
670 * the resource to check
671 *
672 * @return the image URL if found, or NULL
673 *
674 */
675 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
676 String line) {
677 URL url = null;
678
679 if (line != null) {
680 // try for files
681 if (source != null) {
682 try {
683 String relPath = null;
684 String absPath = null;
685 try {
686 String path = new File(source.getFile()).getParent();
687 relPath = new File(new File(path), line.trim())
688 .getAbsolutePath();
689 } catch (Exception e) {
690 // Cannot be converted to path (one possibility to take
691 // into account: absolute path on Windows)
692 }
693 try {
694 absPath = new File(line.trim()).getAbsolutePath();
695 } catch (Exception e) {
696 // Cannot be converted to path (at all)
697 }
698
699 for (String ext : getImageExt(true)) {
700 File absFile = new File(absPath + ext);
701 File relFile = new File(relPath + ext);
702 if (absPath != null && absFile.exists()
703 && absFile.isFile()) {
704 url = absFile.toURI().toURL();
705 } else if (relPath != null && relFile.exists()
706 && relFile.isFile()) {
707 url = relFile.toURI().toURL();
708 }
709 }
710 } catch (Exception e) {
711 // Should not happen since we control the correct arguments
712 }
713 }
714
715 if (url == null) {
716 // try for URLs
717 try {
718 for (String ext : getImageExt(true)) {
719 if (Instance.getCache()
720 .check(new URL(line + ext), true)) {
721 url = new URL(line + ext);
722 break;
723 }
724 }
725
726 // try out of cache
727 if (url == null) {
728 for (String ext : getImageExt(true)) {
729 try {
730 url = new URL(line + ext);
731 Instance.getCache().refresh(url, support, true);
732 break;
733 } catch (IOException e) {
734 // no image with this ext
735 url = null;
736 }
737 }
738 }
739 } catch (MalformedURLException e) {
740 // Not an url
741 }
742 }
743
744 // refresh the cached file
745 if (url != null) {
746 try {
747 Instance.getCache().refresh(url, support, true);
748 } catch (IOException e) {
749 // woops, broken image
750 url = null;
751 }
752 }
753 }
754
755 return url;
756 }
757
758 /**
759 * Open the input file that will be used through the support.
760 * <p>
761 * Can return NULL, in which case you are supposed to work without an
762 * {@link InputStream}.
763 *
764 * @param source
765 * the source {@link URL}
766 *
767 * @return the {@link InputStream}
768 *
769 * @throws IOException
770 * in case of I/O error
771 */
772 protected InputStream openInput(URL source) throws IOException {
773 return Instance.getCache().open(source, this, false);
774 }
775
776 /**
777 * Reset then return {@link BasicSupport_Deprecated#in}.
778 *
779 * @return {@link BasicSupport_Deprecated#in}
780 */
781 protected InputStream getInput() {
782 return reset(in);
783 }
784
785 /**
786 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
787 * and requotify them (i.e., separate them into QUOTE paragraphs and other
788 * paragraphs (quotes or not)).
789 *
790 * @param para
791 * the paragraph to requotify (not necessarily a quote)
792 *
793 * @return the correctly (or so we hope) quotified paragraphs
794 */
795 protected List<Paragraph> requotify(Paragraph para) {
796 List<Paragraph> newParas = new ArrayList<Paragraph>();
797
798 if (para.getType() == ParagraphType.QUOTE
799 && para.getContent().length() > 2) {
800 String line = para.getContent();
801 boolean singleQ = line.startsWith("" + openQuote);
802 boolean doubleQ = line.startsWith("" + openDoubleQuote);
803
804 // Do not try when more than one quote at a time
805 // (some stories are not easily readable if we do)
806 if (singleQ
807 && line.indexOf(closeQuote, 1) < line
808 .lastIndexOf(closeQuote)) {
809 newParas.add(para);
810 return newParas;
811 }
812 if (doubleQ
813 && line.indexOf(closeDoubleQuote, 1) < line
814 .lastIndexOf(closeDoubleQuote)) {
815 newParas.add(para);
816 return newParas;
817 }
818 //
819
820 if (!singleQ && !doubleQ) {
821 line = openDoubleQuote + line + closeDoubleQuote;
822 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
823 .getWords()));
824 } else {
825 char open = singleQ ? openQuote : openDoubleQuote;
826 char close = singleQ ? closeQuote : closeDoubleQuote;
827
828 int posDot = -1;
829 boolean inQuote = false;
830 int i = 0;
831 for (char car : line.toCharArray()) {
832 if (car == open) {
833 inQuote = true;
834 } else if (car == close) {
835 inQuote = false;
836 } else if (car == '.' && !inQuote) {
837 posDot = i;
838 break;
839 }
840 i++;
841 }
842
843 if (posDot >= 0) {
844 String rest = line.substring(posDot + 1).trim();
845 line = line.substring(0, posDot + 1).trim();
846 long words = 1;
847 for (char car : line.toCharArray()) {
848 if (car == ' ') {
849 words++;
850 }
851 }
852 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
853 if (!rest.isEmpty()) {
854 newParas.addAll(requotify(processPara(rest)));
855 }
856 } else {
857 newParas.add(para);
858 }
859 }
860 } else {
861 newParas.add(para);
862 }
863
864 return newParas;
865 }
866
867 /**
868 * Process a {@link Paragraph} from a raw line of text.
869 * <p>
870 * Will also fix quotes and HTML encoding if needed.
871 *
872 * @param line
873 * the raw line
874 *
875 * @return the processed {@link Paragraph}
876 */
877 protected Paragraph processPara(String line) {
878 line = ifUnhtml(line).trim();
879
880 boolean space = true;
881 boolean brk = true;
882 boolean quote = false;
883 boolean tentativeCloseQuote = false;
884 char prev = '\0';
885 int dashCount = 0;
886 long words = 1;
887
888 StringBuilder builder = new StringBuilder();
889 for (char car : line.toCharArray()) {
890 if (car != '-') {
891 if (dashCount > 0) {
892 // dash, ndash and mdash: - – —
893 // currently: always use mdash
894 builder.append(dashCount == 1 ? '-' : '—');
895 }
896 dashCount = 0;
897 }
898
899 if (tentativeCloseQuote) {
900 tentativeCloseQuote = false;
901 if (Character.isLetterOrDigit(car)) {
902 builder.append("'");
903 } else {
904 // handle double-single quotes as double quotes
905 if (prev == car) {
906 builder.append(closeDoubleQuote);
907 continue;
908 }
909
910 builder.append(closeQuote);
911 }
912 }
913
914 switch (car) {
915 case ' ': // note: unbreakable space
916 case ' ':
917 case '\t':
918 case '\n': // just in case
919 case '\r': // just in case
920 if (builder.length() > 0
921 && builder.charAt(builder.length() - 1) != ' ') {
922 words++;
923 }
924 builder.append(' ');
925 break;
926
927 case '\'':
928 if (space || (brk && quote)) {
929 quote = true;
930 // handle double-single quotes as double quotes
931 if (prev == car) {
932 builder.deleteCharAt(builder.length() - 1);
933 builder.append(openDoubleQuote);
934 } else {
935 builder.append(openQuote);
936 }
937 } else if (prev == ' ' || prev == car) {
938 // handle double-single quotes as double quotes
939 if (prev == car) {
940 builder.deleteCharAt(builder.length() - 1);
941 builder.append(openDoubleQuote);
942 } else {
943 builder.append(openQuote);
944 }
945 } else {
946 // it is a quote ("I'm off") or a 'quote' ("This
947 // 'good' restaurant"...)
948 tentativeCloseQuote = true;
949 }
950 break;
951
952 case '"':
953 if (space || (brk && quote)) {
954 quote = true;
955 builder.append(openDoubleQuote);
956 } else if (prev == ' ') {
957 builder.append(openDoubleQuote);
958 } else {
959 builder.append(closeDoubleQuote);
960 }
961 break;
962
963 case '-':
964 if (space) {
965 quote = true;
966 } else {
967 dashCount++;
968 }
969 space = false;
970 break;
971
972 case '*':
973 case '~':
974 case '/':
975 case '\\':
976 case '<':
977 case '>':
978 case '=':
979 case '+':
980 case '_':
981 case '–':
982 case '—':
983 space = false;
984 builder.append(car);
985 break;
986
987 case '‘':
988 case '`':
989 case '‹':
990 case '﹁':
991 case '〈':
992 case '「':
993 if (space || (brk && quote)) {
994 quote = true;
995 builder.append(openQuote);
996 } else {
997 // handle double-single quotes as double quotes
998 if (prev == car) {
999 builder.deleteCharAt(builder.length() - 1);
1000 builder.append(openDoubleQuote);
1001 } else {
1002 builder.append(openQuote);
1003 }
1004 }
1005 space = false;
1006 brk = false;
1007 break;
1008
1009 case '’':
1010 case '›':
1011 case '﹂':
1012 case '〉':
1013 case '」':
1014 space = false;
1015 brk = false;
1016 // handle double-single quotes as double quotes
1017 if (prev == car) {
1018 builder.deleteCharAt(builder.length() - 1);
1019 builder.append(closeDoubleQuote);
1020 } else {
1021 builder.append(closeQuote);
1022 }
1023 break;
1024
1025 case '«':
1026 case '“':
1027 case '﹃':
1028 case '《':
1029 case '『':
1030 if (space || (brk && quote)) {
1031 quote = true;
1032 builder.append(openDoubleQuote);
1033 } else {
1034 builder.append(openDoubleQuote);
1035 }
1036 space = false;
1037 brk = false;
1038 break;
1039
1040 case '»':
1041 case '”':
1042 case '﹄':
1043 case '》':
1044 case '』':
1045 space = false;
1046 brk = false;
1047 builder.append(closeDoubleQuote);
1048 break;
1049
1050 default:
1051 space = false;
1052 brk = false;
1053 builder.append(car);
1054 break;
1055 }
1056
1057 prev = car;
1058 }
1059
1060 if (tentativeCloseQuote) {
1061 tentativeCloseQuote = false;
1062 builder.append(closeQuote);
1063 }
1064
1065 line = builder.toString().trim();
1066
1067 ParagraphType type = ParagraphType.NORMAL;
1068 if (space) {
1069 type = ParagraphType.BLANK;
1070 } else if (brk) {
1071 type = ParagraphType.BREAK;
1072 } else if (quote) {
1073 type = ParagraphType.QUOTE;
1074 }
1075
1076 return new Paragraph(type, line, words);
1077 }
1078
1079 /**
1080 * Remove the HTML from the input <b>if</b>
1081 * {@link BasicSupport_Deprecated#isHtml()} is true.
1082 *
1083 * @param input
1084 * the input
1085 *
1086 * @return the no html version if needed
1087 */
1088 private String ifUnhtml(String input) {
1089 if (isHtml() && input != null) {
1090 return StringUtils.unhtml(input);
1091 }
1092
1093 return input;
1094 }
1095
1096 /**
1097 * Reset the given {@link InputStream} and return it.
1098 *
1099 * @param in
1100 * the {@link InputStream} to reset
1101 *
1102 * @return the same {@link InputStream} after reset
1103 */
1104 static protected InputStream reset(InputStream in) {
1105 try {
1106 if (in != null) {
1107 in.reset();
1108 }
1109 } catch (IOException e) {
1110 }
1111
1112 return in;
1113 }
1114
1115 /**
1116 * Return the first line from the given input which correspond to the given
1117 * selectors.
1118 *
1119 * @param in
1120 * the input
1121 * @param needle
1122 * a string that must be found inside the target line (also
1123 * supports "^" at start to say "only if it starts with" the
1124 * needle)
1125 * @param relativeLine
1126 * the line to return based upon the target line position (-1 =
1127 * the line before, 0 = the target line...)
1128 *
1129 * @return the line, or NULL if not found
1130 */
1131 static protected String getLine(InputStream in, String needle,
1132 int relativeLine) {
1133 return getLine(in, needle, relativeLine, true);
1134 }
1135
1136 /**
1137 * Return a line from the given input which correspond to the given
1138 * selectors.
1139 *
1140 * @param in
1141 * the input
1142 * @param needle
1143 * a string that must be found inside the target line (also
1144 * supports "^" at start to say "only if it starts with" the
1145 * needle)
1146 * @param relativeLine
1147 * the line to return based upon the target line position (-1 =
1148 * the line before, 0 = the target line...)
1149 * @param first
1150 * takes the first result (as opposed to the last one, which will
1151 * also always spend the input)
1152 *
1153 * @return the line, or NULL if not found
1154 */
1155 static protected String getLine(InputStream in, String needle,
1156 int relativeLine, boolean first) {
1157 String rep = null;
1158
1159 reset(in);
1160
1161 List<String> lines = new ArrayList<String>();
1162 @SuppressWarnings("resource")
1163 Scanner scan = new Scanner(in, "UTF-8");
1164 int index = -1;
1165 scan.useDelimiter("\\n");
1166 while (scan.hasNext()) {
1167 lines.add(scan.next());
1168
1169 if (index == -1) {
1170 if (needle.startsWith("^")) {
1171 if (lines.get(lines.size() - 1).startsWith(
1172 needle.substring(1))) {
1173 index = lines.size() - 1;
1174 }
1175
1176 } else {
1177 if (lines.get(lines.size() - 1).contains(needle)) {
1178 index = lines.size() - 1;
1179 }
1180 }
1181 }
1182
1183 if (index >= 0 && index + relativeLine < lines.size()) {
1184 rep = lines.get(index + relativeLine);
1185 if (first) {
1186 break;
1187 }
1188 }
1189 }
1190
1191 return rep;
1192 }
1193
1194 /**
1195 * Return the text between the key and the endKey (and optional subKey can
1196 * be passed, in this case we will look for the key first, then take the
1197 * text between the subKey and the endKey).
1198 * <p>
1199 * Will only match the first line with the given key if more than one are
1200 * possible. Which also means that if the subKey or endKey is not found on
1201 * that line, NULL will be returned.
1202 *
1203 * @param in
1204 * the input
1205 * @param key
1206 * the key to match (also supports "^" at start to say
1207 * "only if it starts with" the key)
1208 * @param subKey
1209 * the sub key or NULL if none
1210 * @param endKey
1211 * the end key or NULL for "up to the end"
1212 * @return the text or NULL if not found
1213 */
1214 static protected String getKeyLine(InputStream in, String key,
1215 String subKey, String endKey) {
1216 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1217 }
1218
1219 /**
1220 * Return the text between the key and the endKey (and optional subKey can
1221 * be passed, in this case we will look for the key first, then take the
1222 * text between the subKey and the endKey).
1223 *
1224 * @param in
1225 * the input
1226 * @param key
1227 * the key to match (also supports "^" at start to say
1228 * "only if it starts with" the key)
1229 * @param subKey
1230 * the sub key or NULL if none
1231 * @param endKey
1232 * the end key or NULL for "up to the end"
1233 * @return the text or NULL if not found
1234 */
1235 static protected String getKeyText(String in, String key, String subKey,
1236 String endKey) {
1237 String result = null;
1238
1239 String line = in;
1240 if (line != null && line.contains(key)) {
1241 line = line.substring(line.indexOf(key) + key.length());
1242 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1243 if (subKey != null) {
1244 line = line.substring(line.indexOf(subKey)
1245 + subKey.length());
1246 }
1247 if (endKey == null || line.contains(endKey)) {
1248 if (endKey != null) {
1249 line = line.substring(0, line.indexOf(endKey));
1250 result = line;
1251 }
1252 }
1253 }
1254 }
1255
1256 return result;
1257 }
1258
1259 /**
1260 * Return the text between the key and the endKey (optional subKeys can be
1261 * passed, in this case we will look for the subKeys first, then take the
1262 * text between the key and the endKey).
1263 *
1264 * @param in
1265 * the input
1266 * @param key
1267 * the key to match
1268 * @param endKey
1269 * the end key or NULL for "up to the end"
1270 * @param afters
1271 * the sub-keys to find before checking for key/endKey
1272 *
1273 * @return the text or NULL if not found
1274 */
1275 static protected String getKeyTextAfter(String in, String key,
1276 String endKey, String... afters) {
1277
1278 if (in != null && !in.isEmpty()) {
1279 int pos = indexOfAfter(in, 0, afters);
1280 if (pos < 0) {
1281 return null;
1282 }
1283
1284 in = in.substring(pos);
1285 }
1286
1287 return getKeyText(in, key, null, endKey);
1288 }
1289
1290 /**
1291 * Return the first index after all the given "afters" have been found in
1292 * the {@link String}, or -1 if it was not possible.
1293 *
1294 * @param in
1295 * the input
1296 * @param startAt
1297 * start at this position in the string
1298 * @param afters
1299 * the sub-keys to find before checking for key/endKey
1300 *
1301 * @return the text or NULL if not found
1302 */
1303 static protected int indexOfAfter(String in, int startAt, String... afters) {
1304 int pos = -1;
1305 if (in != null && !in.isEmpty()) {
1306 pos = startAt;
1307 if (afters != null) {
1308 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1309 String subKey = afters[i];
1310 if (!subKey.isEmpty()) {
1311 pos = in.indexOf(subKey, pos);
1312 if (pos >= 0) {
1313 pos += subKey.length();
1314 }
1315 }
1316 }
1317 }
1318 }
1319
1320 return pos;
1321 }
1322 }