fix library caching issues and change get-by-source, by-author.. into result-list
[fanfix.git] / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and has static methods that you can use to get
34 * access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
42 private InputStream in;
43
44 // quote chars
45 private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
46 private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
47 private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
48 private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
80 /**
81 * Return the {@link MetaData} of this story.
82 *
83 * @param source
84 * the source of the story
85 * @param in
86 * the input (the main resource)
87 *
88 * @return the associated {@link MetaData}, never NULL
89 *
90 * @throws IOException
91 * in case of I/O error
92 */
93 protected abstract MetaData getMeta(URL source, InputStream in)
94 throws IOException;
95
96 /**
97 * Return the story description.
98 *
99 * @param source
100 * the source of the story
101 * @param in
102 * the input (the main resource)
103 *
104 * @return the description
105 *
106 * @throws IOException
107 * in case of I/O error
108 */
109 protected abstract String getDesc(URL source, InputStream in)
110 throws IOException;
111
112 /**
113 * Return the list of chapters (name and resource).
114 *
115 * @param source
116 * the source of the story
117 * @param in
118 * the input (the main resource)
119 * @param pg
120 * the optional progress reporter
121 *
122 * @return the chapters
123 *
124 * @throws IOException
125 * in case of I/O error
126 */
127 protected abstract List<Entry<String, URL>> getChapters(URL source,
128 InputStream in, Progress pg) throws IOException;
129
130 /**
131 * Return the content of the chapter (possibly HTML encoded, if
132 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
133 *
134 * @param source
135 * the source of the story
136 * @param in
137 * the input (the main resource)
138 * @param number
139 * the chapter number
140 * @param pg
141 * the optional progress reporter
142 *
143 * @return the content
144 *
145 * @throws IOException
146 * in case of I/O error
147 */
148 protected abstract String getChapterContent(URL source, InputStream in,
149 int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206 MetaData meta = getMeta(url, getInput());
207 if (meta.getCreationDate() == null
208 || meta.getCreationDate().isEmpty()) {
209 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
210 }
211 story.setMeta(meta);
212
213 pg.setProgress(50);
214
215 if (meta.getCover() == null) {
216 meta.setCover(getDefaultCover(meta.getSubject()));
217 }
218
219 pg.setProgress(60);
220
221 if (getDesc) {
222 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
223 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
224 }
225
226 pg.setProgress(100);
227 return story;
228 } finally {
229 if (close) {
230 close();
231
232 if (in != null) {
233 in.close();
234 }
235 }
236 }
237 }
238
239 /**
240 * Process the given story resource into a fully filled {@link Story}
241 * object.
242 *
243 * @param url
244 * the story resource
245 * @param pg
246 * the optional progress reporter
247 *
248 * @return the {@link Story}, never NULL
249 *
250 * @throws IOException
251 * in case of I/O error
252 */
253 protected Story process(URL url, Progress pg) throws IOException {
254 if (pg == null) {
255 pg = new Progress();
256 } else {
257 pg.setMinMax(0, 100);
258 }
259
260 url = getCanonicalUrl(url);
261 pg.setProgress(1);
262 try {
263 Progress pgMeta = new Progress();
264 pg.addProgress(pgMeta, 10);
265 Story story = processMeta(url, false, true, pgMeta);
266 if (!pgMeta.isDone()) {
267 pgMeta.setProgress(pgMeta.getMax()); // 10%
268 }
269
270 pg.setName("Retrieving " + story.getMeta().getTitle());
271
272 setCurrentReferer(url);
273
274 Progress pgGetChapters = new Progress();
275 pg.addProgress(pgGetChapters, 10);
276 story.setChapters(new ArrayList<Chapter>());
277 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
278 pgGetChapters);
279 if (!pgGetChapters.isDone()) {
280 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
281 }
282
283 if (chapters != null) {
284 Progress pgChaps = new Progress("Extracting chapters", 0,
285 chapters.size() * 300);
286 pg.addProgress(pgChaps, 80);
287
288 long words = 0;
289 int i = 1;
290 for (Entry<String, URL> chap : chapters) {
291 pgChaps.setName("Extracting chapter " + i);
292 InputStream chapIn = null;
293 if (chap.getValue() != null) {
294 setCurrentReferer(chap.getValue());
295 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
296 }
297 pgChaps.setProgress(i * 100);
298 try {
299 Progress pgGetChapterContent = new Progress();
300 Progress pgMakeChapter = new Progress();
301 pgChaps.addProgress(pgGetChapterContent, 100);
302 pgChaps.addProgress(pgMakeChapter, 100);
303
304 String content = getChapterContent(url, chapIn, i,
305 pgGetChapterContent);
306 if (!pgGetChapterContent.isDone()) {
307 pgGetChapterContent.setProgress(pgGetChapterContent
308 .getMax());
309 }
310
311 Chapter cc = makeChapter(url, i, chap.getKey(),
312 content, pgMakeChapter);
313 if (!pgMakeChapter.isDone()) {
314 pgMakeChapter.setProgress(pgMakeChapter.getMax());
315 }
316
317 words += cc.getWords();
318 story.getChapters().add(cc);
319 story.getMeta().setWords(words);
320 } finally {
321 if (chapIn != null) {
322 chapIn.close();
323 }
324 }
325
326 i++;
327 }
328
329 pgChaps.setName("Extracting chapters");
330 } else {
331 pg.setProgress(80);
332 }
333
334 return story;
335
336 } finally {
337 close();
338
339 if (in != null) {
340 in.close();
341 }
342 }
343 }
344
345 /**
346 * Prepare the support if needed before processing.
347 *
348 * @param source
349 * the source of the story
350 * @param in
351 * the input (the main resource)
352 *
353 * @throws IOException
354 * on I/O error
355 */
356 @SuppressWarnings("unused")
357 protected void preprocess(URL source, InputStream in) throws IOException {
358 }
359
360 /**
361 * Create a {@link Chapter} object from the given information, formatting
362 * the content as it should be.
363 *
364 * @param source
365 * the source of the story
366 * @param number
367 * the chapter number
368 * @param name
369 * the chapter name
370 * @param content
371 * the chapter content
372 * @param pg
373 * the optional progress reporter
374 *
375 * @return the {@link Chapter}
376 *
377 * @throws IOException
378 * in case of I/O error
379 */
380 protected Chapter makeChapter(URL source, int number, String name,
381 String content, Progress pg) throws IOException {
382 // Chapter name: process it correctly, then remove the possible
383 // redundant "Chapter x: " in front of it, or "-" (as in
384 // "Chapter 5: - Fun!" after the ": " was automatically added)
385 String chapterName = processPara(name).getContent().trim();
386 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
387 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
388 if (chapterName.startsWith(chapterWord)) {
389 chapterName = chapterName.substring(chapterWord.length())
390 .trim();
391 break;
392 }
393 }
394
395 if (chapterName.startsWith(Integer.toString(number))) {
396 chapterName = chapterName.substring(
397 Integer.toString(number).length()).trim();
398 }
399
400 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
401 chapterName = chapterName.substring(1).trim();
402 }
403 //
404
405 Chapter chap = new Chapter(number, chapterName);
406
407 if (content != null) {
408 List<Paragraph> paras = makeParagraphs(source, content, pg);
409 long words = 0;
410 for (Paragraph para : paras) {
411 words += para.getWords();
412 }
413 chap.setParagraphs(paras);
414 chap.setWords(words);
415 }
416
417 return chap;
418
419 }
420
421 /**
422 * Convert the given content into {@link Paragraph}s.
423 *
424 * @param source
425 * the source URL of the story
426 * @param content
427 * the textual content
428 * @param pg
429 * the optional progress reporter
430 *
431 * @return the {@link Paragraph}s
432 *
433 * @throws IOException
434 * in case of I/O error
435 */
436 protected List<Paragraph> makeParagraphs(URL source, String content,
437 Progress pg) throws IOException {
438 if (pg == null) {
439 pg = new Progress();
440 }
441
442 if (isHtml()) {
443 // Special <HR> processing:
444 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
445 "<br/>* * *<br/>");
446 }
447
448 List<Paragraph> paras = new ArrayList<Paragraph>();
449
450 if (content != null && !content.trim().isEmpty()) {
451 if (isHtml()) {
452 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
453 pg.setMinMax(0, tab.length);
454 int i = 1;
455 for (String line : tab) {
456 if (line.startsWith("[") && line.endsWith("]")) {
457 pg.setName("Extracting image " + i);
458 }
459 paras.add(makeParagraph(source, line.trim()));
460 pg.setProgress(i++);
461 }
462 pg.setName(null);
463 } else {
464 List<String> lines = new ArrayList<String>();
465 BufferedReader buff = null;
466 try {
467 buff = new BufferedReader(
468 new InputStreamReader(new ByteArrayInputStream(
469 content.getBytes("UTF-8")), "UTF-8"));
470 for (String line = buff.readLine(); line != null; line = buff
471 .readLine()) {
472 lines.add(line.trim());
473 }
474 } finally {
475 if (buff != null) {
476 buff.close();
477 }
478 }
479
480 pg.setMinMax(0, lines.size());
481 int i = 0;
482 for (String line : lines) {
483 if (line.startsWith("[") && line.endsWith("]")) {
484 pg.setName("Extracting image " + i);
485 }
486 paras.add(makeParagraph(source, line));
487 pg.setProgress(i++);
488 }
489 pg.setName(null);
490 }
491
492 // Check quotes for "bad" format
493 List<Paragraph> newParas = new ArrayList<Paragraph>();
494 for (Paragraph para : paras) {
495 newParas.addAll(requotify(para));
496 }
497 paras = newParas;
498
499 // Remove double blanks/brks
500 fixBlanksBreaks(paras);
501 }
502
503 return paras;
504 }
505
506 /**
507 * Convert the given line into a single {@link Paragraph}.
508 *
509 * @param source
510 * the source URL of the story
511 * @param line
512 * the textual content of the paragraph
513 *
514 * @return the {@link Paragraph}
515 */
516 private Paragraph makeParagraph(URL source, String line) {
517 Image image = null;
518 if (line.startsWith("[") && line.endsWith("]")) {
519 image = getImage(this, source, line.substring(1, line.length() - 1)
520 .trim());
521 }
522
523 if (image != null) {
524 return new Paragraph(image);
525 }
526
527 return processPara(line);
528 }
529
530 /**
531 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
532 * those {@link Paragraph}s.
533 * <p>
534 * The resulting list will not contain a starting or trailing blank/break
535 * nor 2 blanks or breaks following each other.
536 *
537 * @param paras
538 * the list of {@link Paragraph}s to fix
539 */
540 protected void fixBlanksBreaks(List<Paragraph> paras) {
541 boolean space = false;
542 boolean brk = true;
543 for (int i = 0; i < paras.size(); i++) {
544 Paragraph para = paras.get(i);
545 boolean thisSpace = para.getType() == ParagraphType.BLANK;
546 boolean thisBrk = para.getType() == ParagraphType.BREAK;
547
548 if (i > 0 && space && thisBrk) {
549 paras.remove(i - 1);
550 i--;
551 } else if ((space || brk) && (thisSpace || thisBrk)) {
552 paras.remove(i);
553 i--;
554 }
555
556 space = thisSpace;
557 brk = thisBrk;
558 }
559
560 // Remove blank/brk at start
561 if (paras.size() > 0
562 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
563 0).getType() == ParagraphType.BREAK)) {
564 paras.remove(0);
565 }
566
567 // Remove blank/brk at end
568 int last = paras.size() - 1;
569 if (paras.size() > 0
570 && (paras.get(last).getType() == ParagraphType.BLANK || paras
571 .get(last).getType() == ParagraphType.BREAK)) {
572 paras.remove(last);
573 }
574 }
575
576 /**
577 * Get the default cover related to this subject (see <tt>.info</tt> files).
578 *
579 * @param subject
580 * the subject
581 *
582 * @return the cover if any, or NULL
583 */
584 static Image getDefaultCover(String subject) {
585 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
586 try {
587 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
588 return getImage(null, fileCover.toURI().toURL(), subject);
589 } catch (MalformedURLException e) {
590 }
591 }
592
593 return null;
594 }
595
596 /**
597 * Return the list of supported image extensions.
598 *
599 * @param emptyAllowed
600 * TRUE to allow an empty extension on first place, which can be
601 * used when you may already have an extension in your input but
602 * are not sure about it
603 *
604 * @return the extensions
605 */
606 static String[] getImageExt(boolean emptyAllowed) {
607 if (emptyAllowed) {
608 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
609 }
610
611 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
612 }
613
614 /**
615 * Check if the given resource can be a local image or a remote image, then
616 * refresh the cache with it if it is.
617 *
618 * @param source
619 * the story source
620 * @param line
621 * the resource to check
622 *
623 * @return the image if found, or NULL
624 *
625 */
626 static Image getImage(BasicSupport_Deprecated support, URL source,
627 String line) {
628 URL url = getImageUrl(support, source, line);
629 if (url != null) {
630 if ("file".equals(url.getProtocol())) {
631 if (new File(url.getPath()).isDirectory()) {
632 return null;
633 }
634 }
635 InputStream in = null;
636 try {
637 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
638 return new Image(in);
639 } catch (IOException e) {
640 } finally {
641 if (in != null) {
642 try {
643 in.close();
644 } catch (IOException e) {
645 }
646 }
647 }
648 }
649
650 return null;
651 }
652
653 /**
654 * Check if the given resource can be a local image or a remote image, then
655 * refresh the cache with it if it is.
656 *
657 * @param source
658 * the story source
659 * @param line
660 * the resource to check
661 *
662 * @return the image URL if found, or NULL
663 *
664 */
665 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
666 String line) {
667 URL url = null;
668
669 if (line != null) {
670 // try for files
671 if (source != null) {
672 try {
673 String relPath = null;
674 String absPath = null;
675 try {
676 String path = new File(source.getFile()).getParent();
677 relPath = new File(new File(path), line.trim())
678 .getAbsolutePath();
679 } catch (Exception e) {
680 // Cannot be converted to path (one possibility to take
681 // into account: absolute path on Windows)
682 }
683 try {
684 absPath = new File(line.trim()).getAbsolutePath();
685 } catch (Exception e) {
686 // Cannot be converted to path (at all)
687 }
688
689 for (String ext : getImageExt(true)) {
690 File absFile = new File(absPath + ext);
691 File relFile = new File(relPath + ext);
692 if (absPath != null && absFile.exists()
693 && absFile.isFile()) {
694 url = absFile.toURI().toURL();
695 } else if (relPath != null && relFile.exists()
696 && relFile.isFile()) {
697 url = relFile.toURI().toURL();
698 }
699 }
700 } catch (Exception e) {
701 // Should not happen since we control the correct arguments
702 }
703 }
704
705 if (url == null) {
706 // try for URLs
707 try {
708 for (String ext : getImageExt(true)) {
709 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
710 url = new URL(line + ext);
711 break;
712 }
713 }
714
715 // try out of cache
716 if (url == null) {
717 for (String ext : getImageExt(true)) {
718 try {
719 url = new URL(line + ext);
720 Instance.getInstance().getCache().refresh(url, support, true);
721 break;
722 } catch (IOException e) {
723 // no image with this ext
724 url = null;
725 }
726 }
727 }
728 } catch (MalformedURLException e) {
729 // Not an url
730 }
731 }
732
733 // refresh the cached file
734 if (url != null) {
735 try {
736 Instance.getInstance().getCache().refresh(url, support, true);
737 } catch (IOException e) {
738 // woops, broken image
739 url = null;
740 }
741 }
742 }
743
744 return url;
745 }
746
747 /**
748 * Open the input file that will be used through the support.
749 * <p>
750 * Can return NULL, in which case you are supposed to work without an
751 * {@link InputStream}.
752 *
753 * @param source
754 * the source {@link URL}
755 *
756 * @return the {@link InputStream}
757 *
758 * @throws IOException
759 * in case of I/O error
760 */
761 protected InputStream openInput(URL source) throws IOException {
762 return Instance.getInstance().getCache().open(source, this, false);
763 }
764
765 /**
766 * Reset then return {@link BasicSupport_Deprecated#in}.
767 *
768 * @return {@link BasicSupport_Deprecated#in}
769 */
770 protected InputStream getInput() {
771 return reset(in);
772 }
773
774 /**
775 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
776 * and requotify them (i.e., separate them into QUOTE paragraphs and other
777 * paragraphs (quotes or not)).
778 *
779 * @param para
780 * the paragraph to requotify (not necessarily a quote)
781 *
782 * @return the correctly (or so we hope) quotified paragraphs
783 */
784 protected List<Paragraph> requotify(Paragraph para) {
785 List<Paragraph> newParas = new ArrayList<Paragraph>();
786
787 if (para.getType() == ParagraphType.QUOTE
788 && para.getContent().length() > 2) {
789 String line = para.getContent();
790 boolean singleQ = line.startsWith("" + openQuote);
791 boolean doubleQ = line.startsWith("" + openDoubleQuote);
792
793 // Do not try when more than one quote at a time
794 // (some stories are not easily readable if we do)
795 if (singleQ
796 && line.indexOf(closeQuote, 1) < line
797 .lastIndexOf(closeQuote)) {
798 newParas.add(para);
799 return newParas;
800 }
801 if (doubleQ
802 && line.indexOf(closeDoubleQuote, 1) < line
803 .lastIndexOf(closeDoubleQuote)) {
804 newParas.add(para);
805 return newParas;
806 }
807 //
808
809 if (!singleQ && !doubleQ) {
810 line = openDoubleQuote + line + closeDoubleQuote;
811 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
812 .getWords()));
813 } else {
814 char open = singleQ ? openQuote : openDoubleQuote;
815 char close = singleQ ? closeQuote : closeDoubleQuote;
816
817 int posDot = -1;
818 boolean inQuote = false;
819 int i = 0;
820 for (char car : line.toCharArray()) {
821 if (car == open) {
822 inQuote = true;
823 } else if (car == close) {
824 inQuote = false;
825 } else if (car == '.' && !inQuote) {
826 posDot = i;
827 break;
828 }
829 i++;
830 }
831
832 if (posDot >= 0) {
833 String rest = line.substring(posDot + 1).trim();
834 line = line.substring(0, posDot + 1).trim();
835 long words = 1;
836 for (char car : line.toCharArray()) {
837 if (car == ' ') {
838 words++;
839 }
840 }
841 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
842 if (!rest.isEmpty()) {
843 newParas.addAll(requotify(processPara(rest)));
844 }
845 } else {
846 newParas.add(para);
847 }
848 }
849 } else {
850 newParas.add(para);
851 }
852
853 return newParas;
854 }
855
856 /**
857 * Process a {@link Paragraph} from a raw line of text.
858 * <p>
859 * Will also fix quotes and HTML encoding if needed.
860 *
861 * @param line
862 * the raw line
863 *
864 * @return the processed {@link Paragraph}
865 */
866 protected Paragraph processPara(String line) {
867 line = ifUnhtml(line).trim();
868
869 boolean space = true;
870 boolean brk = true;
871 boolean quote = false;
872 boolean tentativeCloseQuote = false;
873 char prev = '\0';
874 int dashCount = 0;
875 long words = 1;
876
877 StringBuilder builder = new StringBuilder();
878 for (char car : line.toCharArray()) {
879 if (car != '-') {
880 if (dashCount > 0) {
881 // dash, ndash and mdash: - – —
882 // currently: always use mdash
883 builder.append(dashCount == 1 ? '-' : '—');
884 }
885 dashCount = 0;
886 }
887
888 if (tentativeCloseQuote) {
889 tentativeCloseQuote = false;
890 if (Character.isLetterOrDigit(car)) {
891 builder.append("'");
892 } else {
893 // handle double-single quotes as double quotes
894 if (prev == car) {
895 builder.append(closeDoubleQuote);
896 continue;
897 }
898
899 builder.append(closeQuote);
900 }
901 }
902
903 switch (car) {
904 case ' ': // note: unbreakable space
905 case ' ':
906 case '\t':
907 case '\n': // just in case
908 case '\r': // just in case
909 if (builder.length() > 0
910 && builder.charAt(builder.length() - 1) != ' ') {
911 words++;
912 }
913 builder.append(' ');
914 break;
915
916 case '\'':
917 if (space || (brk && quote)) {
918 quote = true;
919 // handle double-single quotes as double quotes
920 if (prev == car) {
921 builder.deleteCharAt(builder.length() - 1);
922 builder.append(openDoubleQuote);
923 } else {
924 builder.append(openQuote);
925 }
926 } else if (prev == ' ' || prev == car) {
927 // handle double-single quotes as double quotes
928 if (prev == car) {
929 builder.deleteCharAt(builder.length() - 1);
930 builder.append(openDoubleQuote);
931 } else {
932 builder.append(openQuote);
933 }
934 } else {
935 // it is a quote ("I'm off") or a 'quote' ("This
936 // 'good' restaurant"...)
937 tentativeCloseQuote = true;
938 }
939 break;
940
941 case '"':
942 if (space || (brk && quote)) {
943 quote = true;
944 builder.append(openDoubleQuote);
945 } else if (prev == ' ') {
946 builder.append(openDoubleQuote);
947 } else {
948 builder.append(closeDoubleQuote);
949 }
950 break;
951
952 case '-':
953 if (space) {
954 quote = true;
955 } else {
956 dashCount++;
957 }
958 space = false;
959 break;
960
961 case '*':
962 case '~':
963 case '/':
964 case '\\':
965 case '<':
966 case '>':
967 case '=':
968 case '+':
969 case '_':
970 case '–':
971 case '—':
972 space = false;
973 builder.append(car);
974 break;
975
976 case '‘':
977 case '`':
978 case '‹':
979 case '﹁':
980 case '〈':
981 case '「':
982 if (space || (brk && quote)) {
983 quote = true;
984 builder.append(openQuote);
985 } else {
986 // handle double-single quotes as double quotes
987 if (prev == car) {
988 builder.deleteCharAt(builder.length() - 1);
989 builder.append(openDoubleQuote);
990 } else {
991 builder.append(openQuote);
992 }
993 }
994 space = false;
995 brk = false;
996 break;
997
998 case '’':
999 case '›':
1000 case '﹂':
1001 case '〉':
1002 case '」':
1003 space = false;
1004 brk = false;
1005 // handle double-single quotes as double quotes
1006 if (prev == car) {
1007 builder.deleteCharAt(builder.length() - 1);
1008 builder.append(closeDoubleQuote);
1009 } else {
1010 builder.append(closeQuote);
1011 }
1012 break;
1013
1014 case '«':
1015 case '“':
1016 case '﹃':
1017 case '《':
1018 case '『':
1019 if (space || (brk && quote)) {
1020 quote = true;
1021 builder.append(openDoubleQuote);
1022 } else {
1023 builder.append(openDoubleQuote);
1024 }
1025 space = false;
1026 brk = false;
1027 break;
1028
1029 case '»':
1030 case '”':
1031 case '﹄':
1032 case '》':
1033 case '』':
1034 space = false;
1035 brk = false;
1036 builder.append(closeDoubleQuote);
1037 break;
1038
1039 default:
1040 space = false;
1041 brk = false;
1042 builder.append(car);
1043 break;
1044 }
1045
1046 prev = car;
1047 }
1048
1049 if (tentativeCloseQuote) {
1050 tentativeCloseQuote = false;
1051 builder.append(closeQuote);
1052 }
1053
1054 line = builder.toString().trim();
1055
1056 ParagraphType type = ParagraphType.NORMAL;
1057 if (space) {
1058 type = ParagraphType.BLANK;
1059 } else if (brk) {
1060 type = ParagraphType.BREAK;
1061 } else if (quote) {
1062 type = ParagraphType.QUOTE;
1063 }
1064
1065 return new Paragraph(type, line, words);
1066 }
1067
1068 /**
1069 * Remove the HTML from the input <b>if</b>
1070 * {@link BasicSupport_Deprecated#isHtml()} is true.
1071 *
1072 * @param input
1073 * the input
1074 *
1075 * @return the no html version if needed
1076 */
1077 private String ifUnhtml(String input) {
1078 if (isHtml() && input != null) {
1079 return StringUtils.unhtml(input);
1080 }
1081
1082 return input;
1083 }
1084
1085 /**
1086 * Reset the given {@link InputStream} and return it.
1087 *
1088 * @param in
1089 * the {@link InputStream} to reset
1090 *
1091 * @return the same {@link InputStream} after reset
1092 */
1093 static protected InputStream reset(InputStream in) {
1094 try {
1095 if (in != null) {
1096 in.reset();
1097 }
1098 } catch (IOException e) {
1099 }
1100
1101 return in;
1102 }
1103
1104 /**
1105 * Return the first line from the given input which correspond to the given
1106 * selectors.
1107 *
1108 * @param in
1109 * the input
1110 * @param needle
1111 * a string that must be found inside the target line (also
1112 * supports "^" at start to say "only if it starts with" the
1113 * needle)
1114 * @param relativeLine
1115 * the line to return based upon the target line position (-1 =
1116 * the line before, 0 = the target line...)
1117 *
1118 * @return the line, or NULL if not found
1119 */
1120 static protected String getLine(InputStream in, String needle,
1121 int relativeLine) {
1122 return getLine(in, needle, relativeLine, true);
1123 }
1124
1125 /**
1126 * Return a line from the given input which correspond to the given
1127 * selectors.
1128 *
1129 * @param in
1130 * the input
1131 * @param needle
1132 * a string that must be found inside the target line (also
1133 * supports "^" at start to say "only if it starts with" the
1134 * needle)
1135 * @param relativeLine
1136 * the line to return based upon the target line position (-1 =
1137 * the line before, 0 = the target line...)
1138 * @param first
1139 * takes the first result (as opposed to the last one, which will
1140 * also always spend the input)
1141 *
1142 * @return the line, or NULL if not found
1143 */
1144 static protected String getLine(InputStream in, String needle,
1145 int relativeLine, boolean first) {
1146 String rep = null;
1147
1148 reset(in);
1149
1150 List<String> lines = new ArrayList<String>();
1151 @SuppressWarnings("resource")
1152 Scanner scan = new Scanner(in, "UTF-8");
1153 int index = -1;
1154 scan.useDelimiter("\\n");
1155 while (scan.hasNext()) {
1156 lines.add(scan.next());
1157
1158 if (index == -1) {
1159 if (needle.startsWith("^")) {
1160 if (lines.get(lines.size() - 1).startsWith(
1161 needle.substring(1))) {
1162 index = lines.size() - 1;
1163 }
1164
1165 } else {
1166 if (lines.get(lines.size() - 1).contains(needle)) {
1167 index = lines.size() - 1;
1168 }
1169 }
1170 }
1171
1172 if (index >= 0 && index + relativeLine < lines.size()) {
1173 rep = lines.get(index + relativeLine);
1174 if (first) {
1175 break;
1176 }
1177 }
1178 }
1179
1180 return rep;
1181 }
1182
1183 /**
1184 * Return the text between the key and the endKey (and optional subKey can
1185 * be passed, in this case we will look for the key first, then take the
1186 * text between the subKey and the endKey).
1187 * <p>
1188 * Will only match the first line with the given key if more than one are
1189 * possible. Which also means that if the subKey or endKey is not found on
1190 * that line, NULL will be returned.
1191 *
1192 * @param in
1193 * the input
1194 * @param key
1195 * the key to match (also supports "^" at start to say
1196 * "only if it starts with" the key)
1197 * @param subKey
1198 * the sub key or NULL if none
1199 * @param endKey
1200 * the end key or NULL for "up to the end"
1201 * @return the text or NULL if not found
1202 */
1203 static protected String getKeyLine(InputStream in, String key,
1204 String subKey, String endKey) {
1205 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1206 }
1207
1208 /**
1209 * Return the text between the key and the endKey (and optional subKey can
1210 * be passed, in this case we will look for the key first, then take the
1211 * text between the subKey and the endKey).
1212 *
1213 * @param in
1214 * the input
1215 * @param key
1216 * the key to match (also supports "^" at start to say
1217 * "only if it starts with" the key)
1218 * @param subKey
1219 * the sub key or NULL if none
1220 * @param endKey
1221 * the end key or NULL for "up to the end"
1222 * @return the text or NULL if not found
1223 */
1224 static protected String getKeyText(String in, String key, String subKey,
1225 String endKey) {
1226 String result = null;
1227
1228 String line = in;
1229 if (line != null && line.contains(key)) {
1230 line = line.substring(line.indexOf(key) + key.length());
1231 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1232 if (subKey != null) {
1233 line = line.substring(line.indexOf(subKey)
1234 + subKey.length());
1235 }
1236 if (endKey == null || line.contains(endKey)) {
1237 if (endKey != null) {
1238 line = line.substring(0, line.indexOf(endKey));
1239 result = line;
1240 }
1241 }
1242 }
1243 }
1244
1245 return result;
1246 }
1247
1248 /**
1249 * Return the text between the key and the endKey (optional subKeys can be
1250 * passed, in this case we will look for the subKeys first, then take the
1251 * text between the key and the endKey).
1252 *
1253 * @param in
1254 * the input
1255 * @param key
1256 * the key to match
1257 * @param endKey
1258 * the end key or NULL for "up to the end"
1259 * @param afters
1260 * the sub-keys to find before checking for key/endKey
1261 *
1262 * @return the text or NULL if not found
1263 */
1264 static protected String getKeyTextAfter(String in, String key,
1265 String endKey, String... afters) {
1266
1267 if (in != null && !in.isEmpty()) {
1268 int pos = indexOfAfter(in, 0, afters);
1269 if (pos < 0) {
1270 return null;
1271 }
1272
1273 in = in.substring(pos);
1274 }
1275
1276 return getKeyText(in, key, null, endKey);
1277 }
1278
1279 /**
1280 * Return the first index after all the given "afters" have been found in
1281 * the {@link String}, or -1 if it was not possible.
1282 *
1283 * @param in
1284 * the input
1285 * @param startAt
1286 * start at this position in the string
1287 * @param afters
1288 * the sub-keys to find before checking for key/endKey
1289 *
1290 * @return the text or NULL if not found
1291 */
1292 static protected int indexOfAfter(String in, int startAt, String... afters) {
1293 int pos = -1;
1294 if (in != null && !in.isEmpty()) {
1295 pos = startAt;
1296 if (afters != null) {
1297 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1298 String subKey = afters[i];
1299 if (!subKey.isEmpty()) {
1300 pos = in.indexOf(subKey, pos);
1301 if (pos >= 0) {
1302 pos += subKey.length();
1303 }
1304 }
1305 }
1306 }
1307 }
1308
1309 return pos;
1310 }
1311 }