fix creation date format
[nikiroo-utils.git] / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and has static methods that you can use to
34 * get access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
42 private InputStream in;
43
44 // quote chars
45 private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
46 private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
47 private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
48 private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
80 /**
81 * Return the {@link MetaData} of this story.
82 *
83 * @param source
84 * the source of the story
85 * @param in
86 * the input (the main resource)
87 *
88 * @return the associated {@link MetaData}, never NULL
89 *
90 * @throws IOException
91 * in case of I/O error
92 */
93 protected abstract MetaData getMeta(URL source, InputStream in)
94 throws IOException;
95
96 /**
97 * Return the story description.
98 *
99 * @param source
100 * the source of the story
101 * @param in
102 * the input (the main resource)
103 *
104 * @return the description
105 *
106 * @throws IOException
107 * in case of I/O error
108 */
109 protected abstract String getDesc(URL source, InputStream in)
110 throws IOException;
111
112 /**
113 * Return the list of chapters (name and resource).
114 *
115 * @param source
116 * the source of the story
117 * @param in
118 * the input (the main resource)
119 * @param pg
120 * the optional progress reporter
121 *
122 * @return the chapters
123 *
124 * @throws IOException
125 * in case of I/O error
126 */
127 protected abstract List<Entry<String, URL>> getChapters(URL source,
128 InputStream in, Progress pg) throws IOException;
129
130 /**
131 * Return the content of the chapter (possibly HTML encoded, if
132 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
133 *
134 * @param source
135 * the source of the story
136 * @param in
137 * the input (the main resource)
138 * @param number
139 * the chapter number
140 * @param pg
141 * the optional progress reporter
142 *
143 * @return the content
144 *
145 * @throws IOException
146 * in case of I/O error
147 */
148 protected abstract String getChapterContent(URL source, InputStream in,
149 int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206 MetaData meta = getMeta(url, getInput());
207 if (meta.getCreationDate() == null
208 || meta.getCreationDate().trim().isEmpty()) {
209 meta.setCreationDate(bsHelper.formatDate(
210 StringUtils.fromTime(new Date().getTime())));
211 }
212 story.setMeta(meta);
213 pg.put("meta", meta);
214
215 pg.setProgress(50);
216
217 if (meta.getCover() == null) {
218 meta.setCover(getDefaultCover(meta.getSubject()));
219 }
220
221 pg.setProgress(60);
222
223 if (getDesc) {
224 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
225 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
226 }
227
228 pg.setProgress(100);
229 return story;
230 } finally {
231 if (close) {
232 close();
233
234 if (in != null) {
235 in.close();
236 }
237 }
238 }
239 }
240
241 /**
242 * Process the given story resource into a fully filled {@link Story}
243 * object.
244 *
245 * @param url
246 * the story resource
247 * @param pg
248 * the optional progress reporter
249 *
250 * @return the {@link Story}, never NULL
251 *
252 * @throws IOException
253 * in case of I/O error
254 */
255 protected Story process(URL url, Progress pg) throws IOException {
256 if (pg == null) {
257 pg = new Progress();
258 } else {
259 pg.setMinMax(0, 100);
260 }
261
262 url = getCanonicalUrl(url);
263 pg.setProgress(1);
264 try {
265 Progress pgMeta = new Progress();
266 pg.addProgress(pgMeta, 10);
267 Story story = processMeta(url, false, true, pgMeta);
268 pg.put("meta", story.getMeta());
269 if (!pgMeta.isDone()) {
270 pgMeta.setProgress(pgMeta.getMax()); // 10%
271 }
272
273 setCurrentReferer(url);
274
275 Progress pgGetChapters = new Progress();
276 pg.addProgress(pgGetChapters, 10);
277 story.setChapters(new ArrayList<Chapter>());
278 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
279 pgGetChapters);
280 if (!pgGetChapters.isDone()) {
281 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
282 }
283
284 if (chapters != null) {
285 Progress pgChaps = new Progress("Extracting chapters", 0,
286 chapters.size() * 300);
287 pg.addProgress(pgChaps, 80);
288
289 long words = 0;
290 int i = 1;
291 for (Entry<String, URL> chap : chapters) {
292 pgChaps.setName("Extracting chapter " + i);
293 InputStream chapIn = null;
294 if (chap.getValue() != null) {
295 setCurrentReferer(chap.getValue());
296 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
297 }
298 pgChaps.setProgress(i * 100);
299 try {
300 Progress pgGetChapterContent = new Progress();
301 Progress pgMakeChapter = new Progress();
302 pgChaps.addProgress(pgGetChapterContent, 100);
303 pgChaps.addProgress(pgMakeChapter, 100);
304
305 String content = getChapterContent(url, chapIn, i,
306 pgGetChapterContent);
307 if (!pgGetChapterContent.isDone()) {
308 pgGetChapterContent.setProgress(pgGetChapterContent
309 .getMax());
310 }
311
312 Chapter cc = makeChapter(url, i, chap.getKey(),
313 content, pgMakeChapter);
314 if (!pgMakeChapter.isDone()) {
315 pgMakeChapter.setProgress(pgMakeChapter.getMax());
316 }
317
318 words += cc.getWords();
319 story.getChapters().add(cc);
320 story.getMeta().setWords(words);
321 } finally {
322 if (chapIn != null) {
323 chapIn.close();
324 }
325 }
326
327 i++;
328 }
329
330 pgChaps.setName("Extracting chapters");
331 } else {
332 pg.setProgress(80);
333 }
334
335 return story;
336
337 } finally {
338 close();
339
340 if (in != null) {
341 in.close();
342 }
343 }
344 }
345
346 /**
347 * Prepare the support if needed before processing.
348 *
349 * @param source
350 * the source of the story
351 * @param in
352 * the input (the main resource)
353 *
354 * @throws IOException
355 * on I/O error
356 */
357 @SuppressWarnings("unused")
358 protected void preprocess(URL source, InputStream in) throws IOException {
359 }
360
361 /**
362 * Create a {@link Chapter} object from the given information, formatting
363 * the content as it should be.
364 *
365 * @param source
366 * the source of the story
367 * @param number
368 * the chapter number
369 * @param name
370 * the chapter name
371 * @param content
372 * the chapter content
373 * @param pg
374 * the optional progress reporter
375 *
376 * @return the {@link Chapter}
377 *
378 * @throws IOException
379 * in case of I/O error
380 */
381 protected Chapter makeChapter(URL source, int number, String name,
382 String content, Progress pg) throws IOException {
383 // Chapter name: process it correctly, then remove the possible
384 // redundant "Chapter x: " in front of it, or "-" (as in
385 // "Chapter 5: - Fun!" after the ": " was automatically added)
386 String chapterName = processPara(name).getContent().trim();
387 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
388 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
389 if (chapterName.startsWith(chapterWord)) {
390 chapterName = chapterName.substring(chapterWord.length())
391 .trim();
392 break;
393 }
394 }
395
396 if (chapterName.startsWith(Integer.toString(number))) {
397 chapterName = chapterName.substring(
398 Integer.toString(number).length()).trim();
399 }
400
401 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
402 chapterName = chapterName.substring(1).trim();
403 }
404 //
405
406 Chapter chap = new Chapter(number, chapterName);
407
408 if (content != null) {
409 List<Paragraph> paras = makeParagraphs(source, content, pg);
410 long words = 0;
411 for (Paragraph para : paras) {
412 words += para.getWords();
413 }
414 chap.setParagraphs(paras);
415 chap.setWords(words);
416 }
417
418 return chap;
419
420 }
421
422 /**
423 * Convert the given content into {@link Paragraph}s.
424 *
425 * @param source
426 * the source URL of the story
427 * @param content
428 * the textual content
429 * @param pg
430 * the optional progress reporter
431 *
432 * @return the {@link Paragraph}s
433 *
434 * @throws IOException
435 * in case of I/O error
436 */
437 protected List<Paragraph> makeParagraphs(URL source, String content,
438 Progress pg) throws IOException {
439 if (pg == null) {
440 pg = new Progress();
441 }
442
443 if (isHtml()) {
444 // Special <HR> processing:
445 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
446 "<br/>* * *<br/>");
447 }
448
449 List<Paragraph> paras = new ArrayList<Paragraph>();
450
451 if (content != null && !content.trim().isEmpty()) {
452 if (isHtml()) {
453 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
454 pg.setMinMax(0, tab.length);
455 int i = 1;
456 for (String line : tab) {
457 if (line.startsWith("[") && line.endsWith("]")) {
458 pg.setName("Extracting image " + i);
459 }
460 paras.add(makeParagraph(source, line.trim()));
461 pg.setProgress(i++);
462 }
463 pg.setName(null);
464 } else {
465 List<String> lines = new ArrayList<String>();
466 BufferedReader buff = null;
467 try {
468 buff = new BufferedReader(
469 new InputStreamReader(new ByteArrayInputStream(
470 content.getBytes("UTF-8")), "UTF-8"));
471 for (String line = buff.readLine(); line != null; line = buff
472 .readLine()) {
473 lines.add(line.trim());
474 }
475 } finally {
476 if (buff != null) {
477 buff.close();
478 }
479 }
480
481 pg.setMinMax(0, lines.size());
482 int i = 0;
483 for (String line : lines) {
484 if (line.startsWith("[") && line.endsWith("]")) {
485 pg.setName("Extracting image " + i);
486 }
487 paras.add(makeParagraph(source, line));
488 pg.setProgress(i++);
489 }
490 pg.setName(null);
491 }
492
493 // Check quotes for "bad" format
494 List<Paragraph> newParas = new ArrayList<Paragraph>();
495 for (Paragraph para : paras) {
496 newParas.addAll(requotify(para));
497 }
498 paras = newParas;
499
500 // Remove double blanks/brks
501 fixBlanksBreaks(paras);
502 }
503
504 return paras;
505 }
506
507 /**
508 * Convert the given line into a single {@link Paragraph}.
509 *
510 * @param source
511 * the source URL of the story
512 * @param line
513 * the textual content of the paragraph
514 *
515 * @return the {@link Paragraph}
516 */
517 private Paragraph makeParagraph(URL source, String line) {
518 Image image = null;
519 if (line.startsWith("[") && line.endsWith("]")) {
520 image = getImage(this, source, line.substring(1, line.length() - 1)
521 .trim());
522 }
523
524 if (image != null) {
525 return new Paragraph(image);
526 }
527
528 return processPara(line);
529 }
530
531 /**
532 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
533 * those {@link Paragraph}s.
534 * <p>
535 * The resulting list will not contain a starting or trailing blank/break
536 * nor 2 blanks or breaks following each other.
537 *
538 * @param paras
539 * the list of {@link Paragraph}s to fix
540 */
541 protected void fixBlanksBreaks(List<Paragraph> paras) {
542 boolean space = false;
543 boolean brk = true;
544 for (int i = 0; i < paras.size(); i++) {
545 Paragraph para = paras.get(i);
546 boolean thisSpace = para.getType() == ParagraphType.BLANK;
547 boolean thisBrk = para.getType() == ParagraphType.BREAK;
548
549 if (i > 0 && space && thisBrk) {
550 paras.remove(i - 1);
551 i--;
552 } else if ((space || brk) && (thisSpace || thisBrk)) {
553 paras.remove(i);
554 i--;
555 }
556
557 space = thisSpace;
558 brk = thisBrk;
559 }
560
561 // Remove blank/brk at start
562 if (paras.size() > 0
563 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
564 0).getType() == ParagraphType.BREAK)) {
565 paras.remove(0);
566 }
567
568 // Remove blank/brk at end
569 int last = paras.size() - 1;
570 if (paras.size() > 0
571 && (paras.get(last).getType() == ParagraphType.BLANK || paras
572 .get(last).getType() == ParagraphType.BREAK)) {
573 paras.remove(last);
574 }
575 }
576
577 /**
578 * Get the default cover related to this subject (see <tt>.info</tt> files).
579 *
580 * @param subject
581 * the subject
582 *
583 * @return the cover if any, or NULL
584 */
585 static Image getDefaultCover(String subject) {
586 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
587 try {
588 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
589 return getImage(null, fileCover.toURI().toURL(), subject);
590 } catch (MalformedURLException e) {
591 }
592 }
593
594 return null;
595 }
596
597 /**
598 * Return the list of supported image extensions.
599 *
600 * @param emptyAllowed
601 * TRUE to allow an empty extension on first place, which can be
602 * used when you may already have an extension in your input but
603 * are not sure about it
604 *
605 * @return the extensions
606 */
607 static String[] getImageExt(boolean emptyAllowed) {
608 if (emptyAllowed) {
609 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
610 }
611
612 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
613 }
614
615 /**
616 * Check if the given resource can be a local image or a remote image, then
617 * refresh the cache with it if it is.
618 *
619 * @param source
620 * the story source
621 * @param line
622 * the resource to check
623 *
624 * @return the image if found, or NULL
625 *
626 */
627 static Image getImage(BasicSupport_Deprecated support, URL source,
628 String line) {
629 URL url = getImageUrl(support, source, line);
630 if (url != null) {
631 if ("file".equals(url.getProtocol())) {
632 if (new File(url.getPath()).isDirectory()) {
633 return null;
634 }
635 }
636 InputStream in = null;
637 try {
638 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
639 return new Image(in);
640 } catch (IOException e) {
641 } finally {
642 if (in != null) {
643 try {
644 in.close();
645 } catch (IOException e) {
646 }
647 }
648 }
649 }
650
651 return null;
652 }
653
654 /**
655 * Check if the given resource can be a local image or a remote image, then
656 * refresh the cache with it if it is.
657 *
658 * @param source
659 * the story source
660 * @param line
661 * the resource to check
662 *
663 * @return the image URL if found, or NULL
664 *
665 */
666 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
667 String line) {
668 URL url = null;
669
670 if (line != null) {
671 // try for files
672 if (source != null) {
673 try {
674 String relPath = null;
675 String absPath = null;
676 try {
677 String path = new File(source.getFile()).getParent();
678 relPath = new File(new File(path), line.trim())
679 .getAbsolutePath();
680 } catch (Exception e) {
681 // Cannot be converted to path (one possibility to take
682 // into account: absolute path on Windows)
683 }
684 try {
685 absPath = new File(line.trim()).getAbsolutePath();
686 } catch (Exception e) {
687 // Cannot be converted to path (at all)
688 }
689
690 for (String ext : getImageExt(true)) {
691 File absFile = new File(absPath + ext);
692 File relFile = new File(relPath + ext);
693 if (absPath != null && absFile.exists()
694 && absFile.isFile()) {
695 url = absFile.toURI().toURL();
696 } else if (relPath != null && relFile.exists()
697 && relFile.isFile()) {
698 url = relFile.toURI().toURL();
699 }
700 }
701 } catch (Exception e) {
702 // Should not happen since we control the correct arguments
703 }
704 }
705
706 if (url == null) {
707 // try for URLs
708 try {
709 for (String ext : getImageExt(true)) {
710 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
711 url = new URL(line + ext);
712 break;
713 }
714 }
715
716 // try out of cache
717 if (url == null) {
718 for (String ext : getImageExt(true)) {
719 try {
720 url = new URL(line + ext);
721 Instance.getInstance().getCache().refresh(url, support, true);
722 break;
723 } catch (IOException e) {
724 // no image with this ext
725 url = null;
726 }
727 }
728 }
729 } catch (MalformedURLException e) {
730 // Not an url
731 }
732 }
733
734 // refresh the cached file
735 if (url != null) {
736 try {
737 Instance.getInstance().getCache().refresh(url, support, true);
738 } catch (IOException e) {
739 // woops, broken image
740 url = null;
741 }
742 }
743 }
744
745 return url;
746 }
747
748 /**
749 * Open the input file that will be used through the support.
750 * <p>
751 * Can return NULL, in which case you are supposed to work without an
752 * {@link InputStream}.
753 *
754 * @param source
755 * the source {@link URL}
756 *
757 * @return the {@link InputStream}
758 *
759 * @throws IOException
760 * in case of I/O error
761 */
762 protected InputStream openInput(URL source) throws IOException {
763 return Instance.getInstance().getCache().open(source, this, false);
764 }
765
766 /**
767 * Reset then return {@link BasicSupport_Deprecated#in}.
768 *
769 * @return {@link BasicSupport_Deprecated#in}
770 */
771 protected InputStream getInput() {
772 return reset(in);
773 }
774
775 /**
776 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
777 * and requotify them (i.e., separate them into QUOTE paragraphs and other
778 * paragraphs (quotes or not)).
779 *
780 * @param para
781 * the paragraph to requotify (not necessarily a quote)
782 *
783 * @return the correctly (or so we hope) quotified paragraphs
784 */
785 protected List<Paragraph> requotify(Paragraph para) {
786 List<Paragraph> newParas = new ArrayList<Paragraph>();
787
788 if (para.getType() == ParagraphType.QUOTE
789 && para.getContent().length() > 2) {
790 String line = para.getContent();
791 boolean singleQ = line.startsWith("" + openQuote);
792 boolean doubleQ = line.startsWith("" + openDoubleQuote);
793
794 // Do not try when more than one quote at a time
795 // (some stories are not easily readable if we do)
796 if (singleQ
797 && line.indexOf(closeQuote, 1) < line
798 .lastIndexOf(closeQuote)) {
799 newParas.add(para);
800 return newParas;
801 }
802 if (doubleQ
803 && line.indexOf(closeDoubleQuote, 1) < line
804 .lastIndexOf(closeDoubleQuote)) {
805 newParas.add(para);
806 return newParas;
807 }
808 //
809
810 if (!singleQ && !doubleQ) {
811 line = openDoubleQuote + line + closeDoubleQuote;
812 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
813 .getWords()));
814 } else {
815 char open = singleQ ? openQuote : openDoubleQuote;
816 char close = singleQ ? closeQuote : closeDoubleQuote;
817
818 int posDot = -1;
819 boolean inQuote = false;
820 int i = 0;
821 for (char car : line.toCharArray()) {
822 if (car == open) {
823 inQuote = true;
824 } else if (car == close) {
825 inQuote = false;
826 } else if (car == '.' && !inQuote) {
827 posDot = i;
828 break;
829 }
830 i++;
831 }
832
833 if (posDot >= 0) {
834 String rest = line.substring(posDot + 1).trim();
835 line = line.substring(0, posDot + 1).trim();
836 long words = 1;
837 for (char car : line.toCharArray()) {
838 if (car == ' ') {
839 words++;
840 }
841 }
842 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
843 if (!rest.isEmpty()) {
844 newParas.addAll(requotify(processPara(rest)));
845 }
846 } else {
847 newParas.add(para);
848 }
849 }
850 } else {
851 newParas.add(para);
852 }
853
854 return newParas;
855 }
856
857 /**
858 * Process a {@link Paragraph} from a raw line of text.
859 * <p>
860 * Will also fix quotes and HTML encoding if needed.
861 *
862 * @param line
863 * the raw line
864 *
865 * @return the processed {@link Paragraph}
866 */
867 protected Paragraph processPara(String line) {
868 line = ifUnhtml(line).trim();
869
870 boolean space = true;
871 boolean brk = true;
872 boolean quote = false;
873 boolean tentativeCloseQuote = false;
874 char prev = '\0';
875 int dashCount = 0;
876 long words = 1;
877
878 StringBuilder builder = new StringBuilder();
879 for (char car : line.toCharArray()) {
880 if (car != '-') {
881 if (dashCount > 0) {
882 // dash, ndash and mdash: - – —
883 // currently: always use mdash
884 builder.append(dashCount == 1 ? '-' : '—');
885 }
886 dashCount = 0;
887 }
888
889 if (tentativeCloseQuote) {
890 tentativeCloseQuote = false;
891 if (Character.isLetterOrDigit(car)) {
892 builder.append("'");
893 } else {
894 // handle double-single quotes as double quotes
895 if (prev == car) {
896 builder.append(closeDoubleQuote);
897 continue;
898 }
899
900 builder.append(closeQuote);
901 }
902 }
903
904 switch (car) {
905 case ' ': // note: unbreakable space
906 case ' ':
907 case '\t':
908 case '\n': // just in case
909 case '\r': // just in case
910 if (builder.length() > 0
911 && builder.charAt(builder.length() - 1) != ' ') {
912 words++;
913 }
914 builder.append(' ');
915 break;
916
917 case '\'':
918 if (space || (brk && quote)) {
919 quote = true;
920 // handle double-single quotes as double quotes
921 if (prev == car) {
922 builder.deleteCharAt(builder.length() - 1);
923 builder.append(openDoubleQuote);
924 } else {
925 builder.append(openQuote);
926 }
927 } else if (prev == ' ' || prev == car) {
928 // handle double-single quotes as double quotes
929 if (prev == car) {
930 builder.deleteCharAt(builder.length() - 1);
931 builder.append(openDoubleQuote);
932 } else {
933 builder.append(openQuote);
934 }
935 } else {
936 // it is a quote ("I'm off") or a 'quote' ("This
937 // 'good' restaurant"...)
938 tentativeCloseQuote = true;
939 }
940 break;
941
942 case '"':
943 if (space || (brk && quote)) {
944 quote = true;
945 builder.append(openDoubleQuote);
946 } else if (prev == ' ') {
947 builder.append(openDoubleQuote);
948 } else {
949 builder.append(closeDoubleQuote);
950 }
951 break;
952
953 case '-':
954 if (space) {
955 quote = true;
956 } else {
957 dashCount++;
958 }
959 space = false;
960 break;
961
962 case '*':
963 case '~':
964 case '/':
965 case '\\':
966 case '<':
967 case '>':
968 case '=':
969 case '+':
970 case '_':
971 case '–':
972 case '—':
973 space = false;
974 builder.append(car);
975 break;
976
977 case '‘':
978 case '`':
979 case '‹':
980 case '﹁':
981 case '〈':
982 case '「':
983 if (space || (brk && quote)) {
984 quote = true;
985 builder.append(openQuote);
986 } else {
987 // handle double-single quotes as double quotes
988 if (prev == car) {
989 builder.deleteCharAt(builder.length() - 1);
990 builder.append(openDoubleQuote);
991 } else {
992 builder.append(openQuote);
993 }
994 }
995 space = false;
996 brk = false;
997 break;
998
999 case '’':
1000 case '›':
1001 case '﹂':
1002 case '〉':
1003 case '」':
1004 space = false;
1005 brk = false;
1006 // handle double-single quotes as double quotes
1007 if (prev == car) {
1008 builder.deleteCharAt(builder.length() - 1);
1009 builder.append(closeDoubleQuote);
1010 } else {
1011 builder.append(closeQuote);
1012 }
1013 break;
1014
1015 case '«':
1016 case '“':
1017 case '﹃':
1018 case '《':
1019 case '『':
1020 if (space || (brk && quote)) {
1021 quote = true;
1022 builder.append(openDoubleQuote);
1023 } else {
1024 builder.append(openDoubleQuote);
1025 }
1026 space = false;
1027 brk = false;
1028 break;
1029
1030 case '»':
1031 case '”':
1032 case '﹄':
1033 case '》':
1034 case '』':
1035 space = false;
1036 brk = false;
1037 builder.append(closeDoubleQuote);
1038 break;
1039
1040 default:
1041 space = false;
1042 brk = false;
1043 builder.append(car);
1044 break;
1045 }
1046
1047 prev = car;
1048 }
1049
1050 if (tentativeCloseQuote) {
1051 tentativeCloseQuote = false;
1052 builder.append(closeQuote);
1053 }
1054
1055 line = builder.toString().trim();
1056
1057 ParagraphType type = ParagraphType.NORMAL;
1058 if (space) {
1059 type = ParagraphType.BLANK;
1060 } else if (brk) {
1061 type = ParagraphType.BREAK;
1062 } else if (quote) {
1063 type = ParagraphType.QUOTE;
1064 }
1065
1066 return new Paragraph(type, line, words);
1067 }
1068
1069 /**
1070 * Remove the HTML from the input <b>if</b>
1071 * {@link BasicSupport_Deprecated#isHtml()} is true.
1072 *
1073 * @param input
1074 * the input
1075 *
1076 * @return the no html version if needed
1077 */
1078 private String ifUnhtml(String input) {
1079 if (isHtml() && input != null) {
1080 return StringUtils.unhtml(input);
1081 }
1082
1083 return input;
1084 }
1085
1086 /**
1087 * Reset the given {@link InputStream} and return it.
1088 *
1089 * @param in
1090 * the {@link InputStream} to reset
1091 *
1092 * @return the same {@link InputStream} after reset
1093 */
1094 static protected InputStream reset(InputStream in) {
1095 try {
1096 if (in != null) {
1097 in.reset();
1098 }
1099 } catch (IOException e) {
1100 }
1101
1102 return in;
1103 }
1104
1105 /**
1106 * Return the first line from the given input which correspond to the given
1107 * selectors.
1108 *
1109 * @param in
1110 * the input
1111 * @param needle
1112 * a string that must be found inside the target line (also
1113 * supports "^" at start to say "only if it starts with" the
1114 * needle)
1115 * @param relativeLine
1116 * the line to return based upon the target line position (-1 =
1117 * the line before, 0 = the target line...)
1118 *
1119 * @return the line, or NULL if not found
1120 */
1121 static protected String getLine(InputStream in, String needle,
1122 int relativeLine) {
1123 return getLine(in, needle, relativeLine, true);
1124 }
1125
1126 /**
1127 * Return a line from the given input which correspond to the given
1128 * selectors.
1129 *
1130 * @param in
1131 * the input
1132 * @param needle
1133 * a string that must be found inside the target line (also
1134 * supports "^" at start to say "only if it starts with" the
1135 * needle)
1136 * @param relativeLine
1137 * the line to return based upon the target line position (-1 =
1138 * the line before, 0 = the target line...)
1139 * @param first
1140 * takes the first result (as opposed to the last one, which will
1141 * also always spend the input)
1142 *
1143 * @return the line, or NULL if not found
1144 */
1145 static protected String getLine(InputStream in, String needle,
1146 int relativeLine, boolean first) {
1147 String rep = null;
1148
1149 reset(in);
1150
1151 List<String> lines = new ArrayList<String>();
1152 @SuppressWarnings("resource")
1153 Scanner scan = new Scanner(in, "UTF-8");
1154 int index = -1;
1155 scan.useDelimiter("\\n");
1156 while (scan.hasNext()) {
1157 lines.add(scan.next());
1158
1159 if (index == -1) {
1160 if (needle.startsWith("^")) {
1161 if (lines.get(lines.size() - 1).startsWith(
1162 needle.substring(1))) {
1163 index = lines.size() - 1;
1164 }
1165
1166 } else {
1167 if (lines.get(lines.size() - 1).contains(needle)) {
1168 index = lines.size() - 1;
1169 }
1170 }
1171 }
1172
1173 if (index >= 0 && index + relativeLine < lines.size()) {
1174 rep = lines.get(index + relativeLine);
1175 if (first) {
1176 break;
1177 }
1178 }
1179 }
1180
1181 return rep;
1182 }
1183
1184 /**
1185 * Return the text between the key and the endKey (and optional subKey can
1186 * be passed, in this case we will look for the key first, then take the
1187 * text between the subKey and the endKey).
1188 * <p>
1189 * Will only match the first line with the given key if more than one are
1190 * possible. Which also means that if the subKey or endKey is not found on
1191 * that line, NULL will be returned.
1192 *
1193 * @param in
1194 * the input
1195 * @param key
1196 * the key to match (also supports "^" at start to say
1197 * "only if it starts with" the key)
1198 * @param subKey
1199 * the sub key or NULL if none
1200 * @param endKey
1201 * the end key or NULL for "up to the end"
1202 * @return the text or NULL if not found
1203 */
1204 static protected String getKeyLine(InputStream in, String key,
1205 String subKey, String endKey) {
1206 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1207 }
1208
1209 /**
1210 * Return the text between the key and the endKey (and optional subKey can
1211 * be passed, in this case we will look for the key first, then take the
1212 * text between the subKey and the endKey).
1213 *
1214 * @param in
1215 * the input
1216 * @param key
1217 * the key to match (also supports "^" at start to say
1218 * "only if it starts with" the key)
1219 * @param subKey
1220 * the sub key or NULL if none
1221 * @param endKey
1222 * the end key or NULL for "up to the end"
1223 * @return the text or NULL if not found
1224 */
1225 static protected String getKeyText(String in, String key, String subKey,
1226 String endKey) {
1227 String result = null;
1228
1229 String line = in;
1230 if (line != null && line.contains(key)) {
1231 line = line.substring(line.indexOf(key) + key.length());
1232 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1233 if (subKey != null) {
1234 line = line.substring(line.indexOf(subKey)
1235 + subKey.length());
1236 }
1237 if (endKey == null || line.contains(endKey)) {
1238 if (endKey != null) {
1239 line = line.substring(0, line.indexOf(endKey));
1240 result = line;
1241 }
1242 }
1243 }
1244 }
1245
1246 return result;
1247 }
1248
1249 /**
1250 * Return the text between the key and the endKey (optional subKeys can be
1251 * passed, in this case we will look for the subKeys first, then take the
1252 * text between the key and the endKey).
1253 *
1254 * @param in
1255 * the input
1256 * @param key
1257 * the key to match
1258 * @param endKey
1259 * the end key or NULL for "up to the end"
1260 * @param afters
1261 * the sub-keys to find before checking for key/endKey
1262 *
1263 * @return the text or NULL if not found
1264 */
1265 static protected String getKeyTextAfter(String in, String key,
1266 String endKey, String... afters) {
1267
1268 if (in != null && !in.isEmpty()) {
1269 int pos = indexOfAfter(in, 0, afters);
1270 if (pos < 0) {
1271 return null;
1272 }
1273
1274 in = in.substring(pos);
1275 }
1276
1277 return getKeyText(in, key, null, endKey);
1278 }
1279
1280 /**
1281 * Return the first index after all the given "afters" have been found in
1282 * the {@link String}, or -1 if it was not possible.
1283 *
1284 * @param in
1285 * the input
1286 * @param startAt
1287 * start at this position in the string
1288 * @param afters
1289 * the sub-keys to find before checking for key/endKey
1290 *
1291 * @return the text or NULL if not found
1292 */
1293 static protected int indexOfAfter(String in, int startAt, String... afters) {
1294 int pos = -1;
1295 if (in != null && !in.isEmpty()) {
1296 pos = startAt;
1297 if (afters != null) {
1298 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1299 String subKey = afters[i];
1300 if (!subKey.isEmpty()) {
1301 pos = in.indexOf(subKey, pos);
1302 if (pos >= 0) {
1303 pos += subKey.length();
1304 }
1305 }
1306 }
1307 }
1308 }
1309
1310 return pos;
1311 }
1312 }