Merge branch 'master' into subtree
[nikiroo-utils.git] / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to
 * get access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
// Main resource of the story: assigned by processMeta() from openInput()
// (can be NULL) and handed out, after a reset, by getInput()
private InputStream in;

// quote chars (typographic quotes read from the translation bundle; used by
// processPara() and requotify() when normalising quotes)
private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
/**
 * Return the {@link MetaData} of this story.
 * <p>
 * Called by
 * {@link BasicSupport_Deprecated#processMeta(URL, boolean, boolean, Progress)}
 * with the canonical URL of the story and the (reset) main input; the
 * returned object is then completed with the type, source, publisher and,
 * if missing, the creation date and cover.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link BasicSupport_Deprecated#openInput(URL)} returned NULL
 *
 * @return the associated {@link MetaData}, never NULL
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
        throws IOException;
95
/**
 * Return the story description.
 * <p>
 * The result is turned into the "resume" {@link Chapter} (chapter number 0)
 * of the story by
 * {@link BasicSupport_Deprecated#processMeta(URL, boolean, boolean, Progress)}
 * when the description is requested.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link BasicSupport_Deprecated#openInput(URL)} returned NULL
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
        throws IOException;
111
/**
 * Return the list of chapters (name and resource).
 * <p>
 * Each entry maps the chapter name to the {@link URL} of the chapter; a
 * NULL {@link URL} is accepted (no input is then opened for this chapter).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link BasicSupport_Deprecated#openInput(URL)} returned NULL
 * @param pg
 *            the optional progress reporter
 *
 * @return the chapters (a NULL result is handled as "no chapters")
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
        InputStream in, Progress pg) throws IOException;
129
/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input of the chapter:
 *            {@link BasicSupport_Deprecated#process(URL, Progress)} passes
 *            the stream opened on the chapter {@link URL}, or NULL when
 *            the chapter has no {@link URL}
 * @param number
 *            the chapter number (the first chapter is number 1)
 * @param pg
 *            the optional progress reporter
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
        int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206
207 MetaData meta = getMeta(url, getInput());
208 meta.setType(getType().toString());
209 meta.setSource(getType().getSourceName());
210 meta.setPublisher(getType().getSourceName());
211
212 if (meta.getCreationDate() == null
213 || meta.getCreationDate().trim().isEmpty()) {
214 meta.setCreationDate(bsHelper.formatDate(
215 StringUtils.fromTime(new Date().getTime())));
216 }
217 story.setMeta(meta);
218 pg.put("meta", meta);
219
220 pg.setProgress(50);
221
222 if (meta.getCover() == null) {
223 meta.setCover(getDefaultCover(meta.getSubject()));
224 }
225
226 pg.setProgress(60);
227
228 if (getDesc) {
229 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
230 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
231 }
232
233 pg.setProgress(100);
234 return story;
235 } finally {
236 if (close) {
237 close();
238
239 if (in != null) {
240 in.close();
241 }
242 }
243 }
244 }
245
246 /**
247 * Process the given story resource into a fully filled {@link Story}
248 * object.
249 *
250 * @param url
251 * the story resource
252 * @param pg
253 * the optional progress reporter
254 *
255 * @return the {@link Story}, never NULL
256 *
257 * @throws IOException
258 * in case of I/O error
259 */
260 protected Story process(URL url, Progress pg) throws IOException {
261 if (pg == null) {
262 pg = new Progress();
263 } else {
264 pg.setMinMax(0, 100);
265 }
266
267 url = getCanonicalUrl(url);
268 pg.setProgress(1);
269 try {
270 Progress pgMeta = new Progress();
271 pg.addProgress(pgMeta, 10);
272 Story story = processMeta(url, false, true, pgMeta);
273 pg.put("meta", story.getMeta());
274 if (!pgMeta.isDone()) {
275 pgMeta.setProgress(pgMeta.getMax()); // 10%
276 }
277
278 setCurrentReferer(url);
279
280 Progress pgGetChapters = new Progress();
281 pg.addProgress(pgGetChapters, 10);
282 story.setChapters(new ArrayList<Chapter>());
283 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
284 pgGetChapters);
285 if (!pgGetChapters.isDone()) {
286 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
287 }
288
289 if (chapters != null) {
290 Progress pgChaps = new Progress("Extracting chapters", 0,
291 chapters.size() * 300);
292 pg.addProgress(pgChaps, 80);
293
294 long words = 0;
295 int i = 1;
296 for (Entry<String, URL> chap : chapters) {
297 pgChaps.setName("Extracting chapter " + i);
298 InputStream chapIn = null;
299 if (chap.getValue() != null) {
300 setCurrentReferer(chap.getValue());
301 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
302 }
303 pgChaps.setProgress(i * 100);
304 try {
305 Progress pgGetChapterContent = new Progress();
306 Progress pgMakeChapter = new Progress();
307 pgChaps.addProgress(pgGetChapterContent, 100);
308 pgChaps.addProgress(pgMakeChapter, 100);
309
310 String content = getChapterContent(url, chapIn, i,
311 pgGetChapterContent);
312 if (!pgGetChapterContent.isDone()) {
313 pgGetChapterContent.setProgress(pgGetChapterContent
314 .getMax());
315 }
316
317 Chapter cc = makeChapter(url, i, chap.getKey(),
318 content, pgMakeChapter);
319 if (!pgMakeChapter.isDone()) {
320 pgMakeChapter.setProgress(pgMakeChapter.getMax());
321 }
322
323 words += cc.getWords();
324 story.getChapters().add(cc);
325 } finally {
326 if (chapIn != null) {
327 chapIn.close();
328 }
329 }
330
331 i++;
332 }
333
334 story.getMeta().setWords(words);
335
336 pgChaps.setName("Extracting chapters");
337 } else {
338 pg.setProgress(80);
339 }
340
341 // Check for "no chapters" stories
342 if (story.getChapters().isEmpty()
343 && story.getMeta().getResume() != null
344 && !story.getMeta().getResume().getParagraphs().isEmpty()) {
345 Chapter resume = story.getMeta().getResume();
346 resume.setName("");
347 resume.setNumber(1);
348 story.getChapters().add(resume);
349 story.getMeta().setWords(resume.getWords());
350
351 String descChapterName = Instance.getInstance().getTrans()
352 .getString(StringId.DESCRIPTION);
353 resume = new Chapter(0, descChapterName);
354 story.getMeta().setResume(resume);
355 }
356
357 return story;
358 } finally {
359 close();
360
361 if (in != null) {
362 in.close();
363 }
364 }
365 }
366
/**
 * Prepare the support if needed before processing.
 * <p>
 * This default implementation does nothing; it is called by
 * {@link BasicSupport_Deprecated#processMeta(URL, boolean, boolean, Progress)}
 * right after the main input has been opened and before the metadata is
 * read.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @throws IOException
 *             on I/O error
 */
@SuppressWarnings("unused")
protected void preprocess(URL source, InputStream in) throws IOException {
    // nothing to prepare by default
}
381
382 /**
383 * Create a {@link Chapter} object from the given information, formatting
384 * the content as it should be.
385 *
386 * @param source
387 * the source of the story
388 * @param number
389 * the chapter number
390 * @param name
391 * the chapter name
392 * @param content
393 * the chapter content
394 * @param pg
395 * the optional progress reporter
396 *
397 * @return the {@link Chapter}, never NULL
398 *
399 * @throws IOException
400 * in case of I/O error
401 */
402 protected Chapter makeChapter(URL source, int number, String name,
403 String content, Progress pg) throws IOException {
404 // Chapter name: process it correctly, then remove the possible
405 // redundant "Chapter x: " in front of it, or "-" (as in
406 // "Chapter 5: - Fun!" after the ": " was automatically added)
407 String chapterName = processPara(name).getContent().trim();
408 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
409 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
410 if (chapterName.startsWith(chapterWord)) {
411 chapterName = chapterName.substring(chapterWord.length())
412 .trim();
413 break;
414 }
415 }
416
417 if (chapterName.startsWith(Integer.toString(number))) {
418 chapterName = chapterName.substring(
419 Integer.toString(number).length()).trim();
420 }
421
422 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
423 chapterName = chapterName.substring(1).trim();
424 }
425 //
426
427 Chapter chap = new Chapter(number, chapterName);
428
429 if (content != null) {
430 List<Paragraph> paras = makeParagraphs(source, content, pg);
431 long words = 0;
432 for (Paragraph para : paras) {
433 words += para.getWords();
434 }
435 chap.setParagraphs(paras);
436 chap.setWords(words);
437 }
438
439 return chap;
440
441 }
442
443 /**
444 * Convert the given content into {@link Paragraph}s.
445 *
446 * @param source
447 * the source URL of the story
448 * @param content
449 * the textual content
450 * @param pg
451 * the optional progress reporter
452 *
453 * @return the {@link Paragraph}s (can be empty, but never NULL)
454 *
455 * @throws IOException
456 * in case of I/O error
457 */
458 protected List<Paragraph> makeParagraphs(URL source, String content,
459 Progress pg) throws IOException {
460 if (pg == null) {
461 pg = new Progress();
462 }
463
464 if (isHtml()) {
465 // Special <HR> processing:
466 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
467 "<br/>* * *<br/>");
468 }
469
470 List<Paragraph> paras = new ArrayList<Paragraph>();
471 if (content != null && !content.trim().isEmpty()) {
472 if (isHtml()) {
473 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
474 pg.setMinMax(0, tab.length);
475 int i = 1;
476 for (String line : tab) {
477 if (line.startsWith("[") && line.endsWith("]")) {
478 pg.setName("Extracting image " + i);
479 }
480 paras.add(makeParagraph(source, line.trim()));
481 pg.setProgress(i++);
482 }
483 pg.setName(null);
484 } else {
485 List<String> lines = new ArrayList<String>();
486 BufferedReader buff = null;
487 try {
488 buff = new BufferedReader(
489 new InputStreamReader(new ByteArrayInputStream(
490 content.getBytes("UTF-8")), "UTF-8"));
491 for (String line = buff.readLine(); line != null; line = buff
492 .readLine()) {
493 lines.add(line.trim());
494 }
495 } finally {
496 if (buff != null) {
497 buff.close();
498 }
499 }
500
501 pg.setMinMax(0, lines.size());
502 int i = 0;
503 for (String line : lines) {
504 if (line.startsWith("[") && line.endsWith("]")) {
505 pg.setName("Extracting image " + i);
506 }
507 paras.add(makeParagraph(source, line));
508 pg.setProgress(i++);
509 }
510 pg.setName(null);
511 }
512
513 // Check quotes for "bad" format
514 List<Paragraph> newParas = new ArrayList<Paragraph>();
515 for (Paragraph para : paras) {
516 newParas.addAll(requotify(para));
517 }
518 paras = newParas;
519
520 // Remove double blanks/brks
521 fixBlanksBreaks(paras);
522 }
523
524 return paras;
525 }
526
527 /**
528 * Convert the given line into a single {@link Paragraph}.
529 *
530 * @param source
531 * the source URL of the story
532 * @param line
533 * the textual content of the paragraph
534 *
535 * @return the {@link Paragraph}, never NULL
536 */
537 private Paragraph makeParagraph(URL source, String line) {
538 Image image = null;
539 if (line.startsWith("[") && line.endsWith("]")) {
540 image = getImage(this, source, line.substring(1, line.length() - 1)
541 .trim());
542 }
543
544 if (image != null) {
545 return new Paragraph(image);
546 }
547
548 return processPara(line);
549 }
550
551 /**
552 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
553 * those {@link Paragraph}s.
554 * <p>
555 * The resulting list will not contain a starting or trailing blank/break
556 * nor 2 blanks or breaks following each other.
557 *
558 * @param paras
559 * the list of {@link Paragraph}s to fix
560 */
561 protected void fixBlanksBreaks(List<Paragraph> paras) {
562 boolean space = false;
563 boolean brk = true;
564 for (int i = 0; i < paras.size(); i++) {
565 Paragraph para = paras.get(i);
566 boolean thisSpace = para.getType() == ParagraphType.BLANK;
567 boolean thisBrk = para.getType() == ParagraphType.BREAK;
568
569 if (i > 0 && space && thisBrk) {
570 paras.remove(i - 1);
571 i--;
572 } else if ((space || brk) && (thisSpace || thisBrk)) {
573 paras.remove(i);
574 i--;
575 }
576
577 space = thisSpace;
578 brk = thisBrk;
579 }
580
581 // Remove blank/brk at start
582 if (paras.size() > 0
583 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
584 0).getType() == ParagraphType.BREAK)) {
585 paras.remove(0);
586 }
587
588 // Remove blank/brk at end
589 int last = paras.size() - 1;
590 if (paras.size() > 0
591 && (paras.get(last).getType() == ParagraphType.BLANK || paras
592 .get(last).getType() == ParagraphType.BREAK)) {
593 paras.remove(last);
594 }
595 }
596
597 /**
598 * Get the default cover related to this subject (see <tt>.info</tt> files).
599 *
600 * @param subject
601 * the subject
602 *
603 * @return the cover if any, or NULL
604 */
605 static Image getDefaultCover(String subject) {
606 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
607 try {
608 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
609 return getImage(null, fileCover.toURI().toURL(), subject);
610 } catch (MalformedURLException e) {
611 }
612 }
613
614 return null;
615 }
616
617 /**
618 * Return the list of supported image extensions.
619 *
620 * @param emptyAllowed
621 * TRUE to allow an empty extension on first place, which can be
622 * used when you may already have an extension in your input but
623 * are not sure about it
624 *
625 * @return the extensions
626 */
627 static String[] getImageExt(boolean emptyAllowed) {
628 if (emptyAllowed) {
629 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
630 }
631
632 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
633 }
634
635 /**
636 * Check if the given resource can be a local image or a remote image, then
637 * refresh the cache with it if it is.
638 *
639 * @param source
640 * the story source
641 * @param line
642 * the resource to check
643 *
644 * @return the image if found, or NULL
645 *
646 */
647 static Image getImage(BasicSupport_Deprecated support, URL source,
648 String line) {
649 URL url = getImageUrl(support, source, line);
650 if (url != null) {
651 if ("file".equals(url.getProtocol())) {
652 if (new File(url.getPath()).isDirectory()) {
653 return null;
654 }
655 }
656 InputStream in = null;
657 try {
658 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
659 Image img = new Image(in);
660 if (img.getSize() == 0) {
661 img.close();
662 throw new IOException(
663 "Empty image not accepted");
664 }
665 return img;
666 } catch (IOException e) {
667 } finally {
668 if (in != null) {
669 try {
670 in.close();
671 } catch (IOException e) {
672 }
673 }
674 }
675 }
676
677 return null;
678 }
679
680 /**
681 * Check if the given resource can be a local image or a remote image, then
682 * refresh the cache with it if it is.
683 *
684 * @param source
685 * the story source
686 * @param line
687 * the resource to check
688 *
689 * @return the image URL if found, or NULL
690 *
691 */
692 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
693 String line) {
694 URL url = null;
695
696 if (line != null) {
697 // try for files
698 if (source != null) {
699 try {
700 String relPath = null;
701 String absPath = null;
702 try {
703 String path = new File(source.getFile()).getParent();
704 relPath = new File(new File(path), line.trim())
705 .getAbsolutePath();
706 } catch (Exception e) {
707 // Cannot be converted to path (one possibility to take
708 // into account: absolute path on Windows)
709 }
710 try {
711 absPath = new File(line.trim()).getAbsolutePath();
712 } catch (Exception e) {
713 // Cannot be converted to path (at all)
714 }
715
716 for (String ext : getImageExt(true)) {
717 File absFile = new File(absPath + ext);
718 File relFile = new File(relPath + ext);
719 if (absPath != null && absFile.exists()
720 && absFile.isFile()) {
721 url = absFile.toURI().toURL();
722 } else if (relPath != null && relFile.exists()
723 && relFile.isFile()) {
724 url = relFile.toURI().toURL();
725 }
726 }
727 } catch (Exception e) {
728 // Should not happen since we control the correct arguments
729 }
730 }
731
732 if (url == null) {
733 // try for URLs
734 try {
735 for (String ext : getImageExt(true)) {
736 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
737 url = new URL(line + ext);
738 break;
739 }
740 }
741
742 // try out of cache
743 if (url == null) {
744 for (String ext : getImageExt(true)) {
745 try {
746 url = new URL(line + ext);
747 Instance.getInstance().getCache().refresh(url, support, true);
748 break;
749 } catch (IOException e) {
750 // no image with this ext
751 url = null;
752 }
753 }
754 }
755 } catch (MalformedURLException e) {
756 // Not an url
757 }
758 }
759
760 // refresh the cached file
761 if (url != null) {
762 try {
763 Instance.getInstance().getCache().refresh(url, support, true);
764 } catch (IOException e) {
765 // woops, broken image
766 url = null;
767 }
768 }
769 }
770
771 return url;
772 }
773
774 /**
775 * Open the input file that will be used through the support.
776 * <p>
777 * Can return NULL, in which case you are supposed to work without an
778 * {@link InputStream}.
779 *
780 * @param source
781 * the source {@link URL}
782 *
783 * @return the {@link InputStream}
784 *
785 * @throws IOException
786 * in case of I/O error
787 */
788 protected InputStream openInput(URL source) throws IOException {
789 return Instance.getInstance().getCache().open(source, this, false);
790 }
791
792 /**
793 * Reset then return {@link BasicSupport_Deprecated#in}.
794 *
795 * @return {@link BasicSupport_Deprecated#in}
796 */
797 protected InputStream getInput() {
798 return reset(in);
799 }
800
/**
 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
 * and requotify them (i.e., separate them into QUOTE paragraphs and other
 * paragraphs (quotes or not)).
 * <p>
 * Only {@link ParagraphType#QUOTE} paragraphs of more than 2 characters
 * are inspected; any other paragraph is returned as is, alone in the list.
 *
 * @param para
 *            the paragraph to requotify (not necessarily a quote)
 *
 * @return the correctly (or so we hope) quotified paragraphs
 */
protected List<Paragraph> requotify(Paragraph para) {
    List<Paragraph> newParas = new ArrayList<Paragraph>();

    if (para.getType() == ParagraphType.QUOTE
            && para.getContent().length() > 2) {
        String line = para.getContent();
        boolean singleQ = line.startsWith("" + openQuote);
        boolean doubleQ = line.startsWith("" + openDoubleQuote);

        // Do not try when more than one quote at a time
        // (some stories are not easily readable if we do)
        // i.e., when the first closing char (after position 0) is not also
        // the last one
        if (singleQ
                && line.indexOf(closeQuote, 1) < line
                        .lastIndexOf(closeQuote)) {
            newParas.add(para);
            return newParas;
        }
        if (doubleQ
                && line.indexOf(closeDoubleQuote, 1) < line
                        .lastIndexOf(closeDoubleQuote)) {
            newParas.add(para);
            return newParas;
        }
        //

        if (!singleQ && !doubleQ) {
            // a QUOTE without quote chars: wrap it in double quotes (the
            // word count is kept)
            line = openDoubleQuote + line + closeDoubleQuote;
            newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
                    .getWords()));
        } else {
            char open = singleQ ? openQuote : openDoubleQuote;
            char close = singleQ ? closeQuote : closeDoubleQuote;

            // Look for the first '.' found outside of the quote chars
            int posDot = -1;
            boolean inQuote = false;
            int i = 0;
            for (char car : line.toCharArray()) {
                if (car == open) {
                    inQuote = true;
                } else if (car == close) {
                    inQuote = false;
                } else if (car == '.' && !inQuote) {
                    posDot = i;
                    break;
                }
                i++;
            }

            if (posDot >= 0) {
                // Split after the dot: the first part stays a QUOTE, the
                // rest is processed again as a new raw line
                String rest = line.substring(posDot + 1).trim();
                line = line.substring(0, posDot + 1).trim();
                // word count of the first part: 1 + number of spaces
                long words = 1;
                for (char car : line.toCharArray()) {
                    if (car == ' ') {
                        words++;
                    }
                }
                newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
                if (!rest.isEmpty()) {
                    newParas.addAll(requotify(processPara(rest)));
                }
            } else {
                newParas.add(para);
            }
        }
    } else {
        newParas.add(para);
    }

    return newParas;
}
882
883 /**
884 * Process a {@link Paragraph} from a raw line of text.
885 * <p>
886 * Will also fix quotes and HTML encoding if needed.
887 *
888 * @param line
889 * the raw line
890 *
891 * @return the processed {@link Paragraph}, never NULL
892 */
893 protected Paragraph processPara(String line) {
894 line = ifUnhtml(line).trim();
895
896 boolean space = true;
897 boolean brk = true;
898 boolean quote = false;
899 boolean tentativeCloseQuote = false;
900 char prev = '\0';
901 int dashCount = 0;
902 long words = 1;
903
904 StringBuilder builder = new StringBuilder();
905 for (char car : line.toCharArray()) {
906 if (car != '-') {
907 if (dashCount > 0) {
908 // dash, ndash and mdash: - – —
909 // currently: always use mdash
910 builder.append(dashCount == 1 ? '-' : '—');
911 }
912 dashCount = 0;
913 }
914
915 if (tentativeCloseQuote) {
916 tentativeCloseQuote = false;
917 if (Character.isLetterOrDigit(car)) {
918 builder.append("'");
919 } else {
920 // handle double-single quotes as double quotes
921 if (prev == car) {
922 builder.append(closeDoubleQuote);
923 continue;
924 }
925
926 builder.append(closeQuote);
927 }
928 }
929
930 switch (car) {
931 case ' ': // note: unbreakable space
932 case ' ':
933 case '\t':
934 case '\n': // just in case
935 case '\r': // just in case
936 if (builder.length() > 0
937 && builder.charAt(builder.length() - 1) != ' ') {
938 words++;
939 }
940 builder.append(' ');
941 break;
942
943 case '\'':
944 if (space || (brk && quote)) {
945 quote = true;
946 // handle double-single quotes as double quotes
947 if (prev == car) {
948 builder.deleteCharAt(builder.length() - 1);
949 builder.append(openDoubleQuote);
950 } else {
951 builder.append(openQuote);
952 }
953 } else if (prev == ' ' || prev == car) {
954 // handle double-single quotes as double quotes
955 if (prev == car) {
956 builder.deleteCharAt(builder.length() - 1);
957 builder.append(openDoubleQuote);
958 } else {
959 builder.append(openQuote);
960 }
961 } else {
962 // it is a quote ("I'm off") or a 'quote' ("This
963 // 'good' restaurant"...)
964 tentativeCloseQuote = true;
965 }
966 break;
967
968 case '"':
969 if (space || (brk && quote)) {
970 quote = true;
971 builder.append(openDoubleQuote);
972 } else if (prev == ' ') {
973 builder.append(openDoubleQuote);
974 } else {
975 builder.append(closeDoubleQuote);
976 }
977 break;
978
979 case '-':
980 if (space) {
981 quote = true;
982 } else {
983 dashCount++;
984 }
985 space = false;
986 break;
987
988 case '*':
989 case '~':
990 case '/':
991 case '\\':
992 case '<':
993 case '>':
994 case '=':
995 case '+':
996 case '_':
997 case '–':
998 case '—':
999 space = false;
1000 builder.append(car);
1001 break;
1002
1003 case '‘':
1004 case '`':
1005 case '‹':
1006 case '﹁':
1007 case '〈':
1008 case '「':
1009 if (space || (brk && quote)) {
1010 quote = true;
1011 builder.append(openQuote);
1012 } else {
1013 // handle double-single quotes as double quotes
1014 if (prev == car) {
1015 builder.deleteCharAt(builder.length() - 1);
1016 builder.append(openDoubleQuote);
1017 } else {
1018 builder.append(openQuote);
1019 }
1020 }
1021 space = false;
1022 brk = false;
1023 break;
1024
1025 case '’':
1026 case '›':
1027 case '﹂':
1028 case '〉':
1029 case '」':
1030 space = false;
1031 brk = false;
1032 // handle double-single quotes as double quotes
1033 if (prev == car) {
1034 builder.deleteCharAt(builder.length() - 1);
1035 builder.append(closeDoubleQuote);
1036 } else {
1037 builder.append(closeQuote);
1038 }
1039 break;
1040
1041 case '«':
1042 case '“':
1043 case '﹃':
1044 case '《':
1045 case '『':
1046 if (space || (brk && quote)) {
1047 quote = true;
1048 builder.append(openDoubleQuote);
1049 } else {
1050 builder.append(openDoubleQuote);
1051 }
1052 space = false;
1053 brk = false;
1054 break;
1055
1056 case '»':
1057 case '”':
1058 case '﹄':
1059 case '》':
1060 case '』':
1061 space = false;
1062 brk = false;
1063 builder.append(closeDoubleQuote);
1064 break;
1065
1066 default:
1067 space = false;
1068 brk = false;
1069 builder.append(car);
1070 break;
1071 }
1072
1073 prev = car;
1074 }
1075
1076 if (tentativeCloseQuote) {
1077 tentativeCloseQuote = false;
1078 builder.append(closeQuote);
1079 }
1080
1081 line = builder.toString().trim();
1082
1083 ParagraphType type = ParagraphType.NORMAL;
1084 if (space) {
1085 type = ParagraphType.BLANK;
1086 } else if (brk) {
1087 type = ParagraphType.BREAK;
1088 } else if (quote) {
1089 type = ParagraphType.QUOTE;
1090 }
1091
1092 return new Paragraph(type, line, words);
1093 }
1094
1095 /**
1096 * Remove the HTML from the input <b>if</b>
1097 * {@link BasicSupport_Deprecated#isHtml()} is true.
1098 *
1099 * @param input
1100 * the input
1101 *
1102 * @return the no html version if needed
1103 */
1104 private String ifUnhtml(String input) {
1105 if (isHtml() && input != null) {
1106 return StringUtils.unhtml(input);
1107 }
1108
1109 return input;
1110 }
1111
1112 /**
1113 * Reset the given {@link InputStream} and return it.
1114 *
1115 * @param in
1116 * the {@link InputStream} to reset
1117 *
1118 * @return the same {@link InputStream} after reset
1119 */
1120 static protected InputStream reset(InputStream in) {
1121 try {
1122 if (in != null) {
1123 in.reset();
1124 }
1125 } catch (IOException e) {
1126 }
1127
1128 return in;
1129 }
1130
1131 /**
1132 * Return the first line from the given input which correspond to the given
1133 * selectors.
1134 *
1135 * @param in
1136 * the input
1137 * @param needle
1138 * a string that must be found inside the target line (also
1139 * supports "^" at start to say "only if it starts with" the
1140 * needle)
1141 * @param relativeLine
1142 * the line to return based upon the target line position (-1 =
1143 * the line before, 0 = the target line...)
1144 *
1145 * @return the line, or NULL if not found
1146 */
1147 static protected String getLine(InputStream in, String needle,
1148 int relativeLine) {
1149 return getLine(in, needle, relativeLine, true);
1150 }
1151
1152 /**
1153 * Return a line from the given input which correspond to the given
1154 * selectors.
1155 *
1156 * @param in
1157 * the input
1158 * @param needle
1159 * a string that must be found inside the target line (also
1160 * supports "^" at start to say "only if it starts with" the
1161 * needle)
1162 * @param relativeLine
1163 * the line to return based upon the target line position (-1 =
1164 * the line before, 0 = the target line...)
1165 * @param first
1166 * takes the first result (as opposed to the last one, which will
1167 * also always spend the input)
1168 *
1169 * @return the line, or NULL if not found
1170 */
1171 static protected String getLine(InputStream in, String needle,
1172 int relativeLine, boolean first) {
1173 String rep = null;
1174
1175 reset(in);
1176
1177 List<String> lines = new ArrayList<String>();
1178 @SuppressWarnings("resource")
1179 Scanner scan = new Scanner(in, "UTF-8");
1180 int index = -1;
1181 scan.useDelimiter("\\n");
1182 while (scan.hasNext()) {
1183 lines.add(scan.next());
1184
1185 if (index == -1) {
1186 if (needle.startsWith("^")) {
1187 if (lines.get(lines.size() - 1).startsWith(
1188 needle.substring(1))) {
1189 index = lines.size() - 1;
1190 }
1191
1192 } else {
1193 if (lines.get(lines.size() - 1).contains(needle)) {
1194 index = lines.size() - 1;
1195 }
1196 }
1197 }
1198
1199 if (index >= 0 && index + relativeLine < lines.size()) {
1200 rep = lines.get(index + relativeLine);
1201 if (first) {
1202 break;
1203 }
1204 }
1205 }
1206
1207 return rep;
1208 }
1209
1210 /**
1211 * Return the text between the key and the endKey (and optional subKey can
1212 * be passed, in this case we will look for the key first, then take the
1213 * text between the subKey and the endKey).
1214 * <p>
1215 * Will only match the first line with the given key if more than one are
1216 * possible. Which also means that if the subKey or endKey is not found on
1217 * that line, NULL will be returned.
1218 *
1219 * @param in
1220 * the input
1221 * @param key
1222 * the key to match (also supports "^" at start to say
1223 * "only if it starts with" the key)
1224 * @param subKey
1225 * the sub key or NULL if none
1226 * @param endKey
1227 * the end key or NULL for "up to the end"
1228 * @return the text or NULL if not found
1229 */
1230 static protected String getKeyLine(InputStream in, String key,
1231 String subKey, String endKey) {
1232 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1233 }
1234
1235 /**
1236 * Return the text between the key and the endKey (and optional subKey can
1237 * be passed, in this case we will look for the key first, then take the
1238 * text between the subKey and the endKey).
1239 *
1240 * @param in
1241 * the input
1242 * @param key
1243 * the key to match (also supports "^" at start to say
1244 * "only if it starts with" the key)
1245 * @param subKey
1246 * the sub key or NULL if none
1247 * @param endKey
1248 * the end key or NULL for "up to the end"
1249 * @return the text or NULL if not found
1250 */
1251 static protected String getKeyText(String in, String key, String subKey,
1252 String endKey) {
1253 String result = null;
1254
1255 String line = in;
1256 if (line != null && line.contains(key)) {
1257 line = line.substring(line.indexOf(key) + key.length());
1258 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1259 if (subKey != null) {
1260 line = line.substring(line.indexOf(subKey)
1261 + subKey.length());
1262 }
1263 if (endKey == null || line.contains(endKey)) {
1264 if (endKey != null) {
1265 line = line.substring(0, line.indexOf(endKey));
1266 result = line;
1267 }
1268 }
1269 }
1270 }
1271
1272 return result;
1273 }
1274
1275 /**
1276 * Return the text between the key and the endKey (optional subKeys can be
1277 * passed, in this case we will look for the subKeys first, then take the
1278 * text between the key and the endKey).
1279 *
1280 * @param in
1281 * the input
1282 * @param key
1283 * the key to match
1284 * @param endKey
1285 * the end key or NULL for "up to the end"
1286 * @param afters
1287 * the sub-keys to find before checking for key/endKey
1288 *
1289 * @return the text or NULL if not found
1290 */
1291 static protected String getKeyTextAfter(String in, String key,
1292 String endKey, String... afters) {
1293
1294 if (in != null && !in.isEmpty()) {
1295 int pos = indexOfAfter(in, 0, afters);
1296 if (pos < 0) {
1297 return null;
1298 }
1299
1300 in = in.substring(pos);
1301 }
1302
1303 return getKeyText(in, key, null, endKey);
1304 }
1305
1306 /**
1307 * Return the first index after all the given "afters" have been found in
1308 * the {@link String}, or -1 if it was not possible.
1309 *
1310 * @param in
1311 * the input
1312 * @param startAt
1313 * start at this position in the string
1314 * @param afters
1315 * the sub-keys to find before checking for key/endKey
1316 *
1317 * @return the text or NULL if not found
1318 */
1319 static protected int indexOfAfter(String in, int startAt, String... afters) {
1320 int pos = -1;
1321 if (in != null && !in.isEmpty()) {
1322 pos = startAt;
1323 if (afters != null) {
1324 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1325 String subKey = afters[i];
1326 if (!subKey.isEmpty()) {
1327 pos = in.indexOf(subKey, pos);
1328 if (pos >= 0) {
1329 pos += subKey.length();
1330 }
1331 }
1332 }
1333 }
1334 }
1335
1336 return pos;
1337 }
1338 }