Support for no-chapter stories or stories with a description before the first chapter
[nikiroo-utils.git] / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and have static method that you can use to get
34 * access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
// Main input of the story being processed: assigned by processMeta(...)
// from openInput(...), may be NULL, handed out (after a reset) by getInput()
private InputStream in;

// quote chars: the typographic quote characters read from the translation
// bundle; processPara(...) and requotify(...) use them when rewriting quotes
private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
/**
 * Return the {@link MetaData} of this story.
 * <p>
 * Called by {@link #processMeta(URL, boolean, boolean, Progress)} with the
 * canonical URL and the (reset) main input; the creation date and the
 * cover are filled in by the caller when this method leaves them empty.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link #openInput(URL)} returned NULL
 *
 * @return the associated {@link MetaData}, never NULL
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
        throws IOException;
95
/**
 * Return the story description.
 * <p>
 * The returned text is turned into the "resume" chapter (chapter number 0)
 * by {@link #processMeta(URL, boolean, boolean, Progress)}.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link #openInput(URL)} returned NULL
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
        throws IOException;
111
/**
 * Return the list of chapters (name and resource).
 * <p>
 * {@link #process(URL, Progress)} walks this list in order, numbering the
 * chapters from 1; an entry with a NULL {@link URL} is accepted (no
 * chapter input is opened for it) and a NULL list means "no chapters".
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), can be NULL if
 *            {@link #openInput(URL)} returned NULL
 * @param pg
 *            the optional progress reporter
 *
 * @return the chapters
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
        InputStream in, Progress pg) throws IOException;
129
/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
 * <p>
 * Note that {@link #process(URL, Progress)} passes the input opened for
 * the <b>chapter</b> URL as <tt>in</tt> (NULL when the chapter has no
 * URL), not the main story input.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input to read the chapter from, can be NULL
 * @param number
 *            the chapter number
 * @param pg
 *            the optional progress reporter
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
        int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206 MetaData meta = getMeta(url, getInput());
207 if (meta.getCreationDate() == null
208 || meta.getCreationDate().trim().isEmpty()) {
209 meta.setCreationDate(bsHelper.formatDate(
210 StringUtils.fromTime(new Date().getTime())));
211 }
212 story.setMeta(meta);
213 pg.put("meta", meta);
214
215 pg.setProgress(50);
216
217 if (meta.getCover() == null) {
218 meta.setCover(getDefaultCover(meta.getSubject()));
219 }
220
221 pg.setProgress(60);
222
223 if (getDesc) {
224 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
225 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
226 }
227
228 pg.setProgress(100);
229 return story;
230 } finally {
231 if (close) {
232 close();
233
234 if (in != null) {
235 in.close();
236 }
237 }
238 }
239 }
240
241 /**
242 * Process the given story resource into a fully filled {@link Story}
243 * object.
244 *
245 * @param url
246 * the story resource
247 * @param pg
248 * the optional progress reporter
249 *
250 * @return the {@link Story}, never NULL
251 *
252 * @throws IOException
253 * in case of I/O error
254 */
255 protected Story process(URL url, Progress pg) throws IOException {
256 if (pg == null) {
257 pg = new Progress();
258 } else {
259 pg.setMinMax(0, 100);
260 }
261
262 url = getCanonicalUrl(url);
263 pg.setProgress(1);
264 try {
265 Progress pgMeta = new Progress();
266 pg.addProgress(pgMeta, 10);
267 Story story = processMeta(url, false, true, pgMeta);
268 pg.put("meta", story.getMeta());
269 if (!pgMeta.isDone()) {
270 pgMeta.setProgress(pgMeta.getMax()); // 10%
271 }
272
273 setCurrentReferer(url);
274
275 Progress pgGetChapters = new Progress();
276 pg.addProgress(pgGetChapters, 10);
277 story.setChapters(new ArrayList<Chapter>());
278 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
279 pgGetChapters);
280 if (!pgGetChapters.isDone()) {
281 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
282 }
283
284 if (chapters != null) {
285 Progress pgChaps = new Progress("Extracting chapters", 0,
286 chapters.size() * 300);
287 pg.addProgress(pgChaps, 80);
288
289 long words = 0;
290 int i = 1;
291 for (Entry<String, URL> chap : chapters) {
292 pgChaps.setName("Extracting chapter " + i);
293 InputStream chapIn = null;
294 if (chap.getValue() != null) {
295 setCurrentReferer(chap.getValue());
296 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
297 }
298 pgChaps.setProgress(i * 100);
299 try {
300 Progress pgGetChapterContent = new Progress();
301 Progress pgMakeChapter = new Progress();
302 pgChaps.addProgress(pgGetChapterContent, 100);
303 pgChaps.addProgress(pgMakeChapter, 100);
304
305 String content = getChapterContent(url, chapIn, i,
306 pgGetChapterContent);
307 if (!pgGetChapterContent.isDone()) {
308 pgGetChapterContent.setProgress(pgGetChapterContent
309 .getMax());
310 }
311
312 Chapter cc = makeChapter(url, i, chap.getKey(),
313 content, pgMakeChapter);
314 if (!pgMakeChapter.isDone()) {
315 pgMakeChapter.setProgress(pgMakeChapter.getMax());
316 }
317
318 words += cc.getWords();
319 story.getChapters().add(cc);
320 story.getMeta().setWords(words);
321 } finally {
322 if (chapIn != null) {
323 chapIn.close();
324 }
325 }
326
327 i++;
328 }
329
330 pgChaps.setName("Extracting chapters");
331 } else {
332 pg.setProgress(80);
333 }
334
335 // Check for "no chapters" stories
336 if (story.getChapters().isEmpty()
337 && story.getMeta().getResume() != null
338 && !story.getMeta().getResume().getParagraphs().isEmpty()) {
339 Chapter resume = story.getMeta().getResume();
340 resume.setName("");
341 resume.setNumber(1);
342 story.getChapters().add(resume);
343
344 String descChapterName = Instance.getInstance().getTrans()
345 .getString(StringId.DESCRIPTION);
346 resume = new Chapter(0, descChapterName);
347 story.getMeta().setResume(resume);
348 }
349
350 return story;
351 } finally {
352 close();
353
354 if (in != null) {
355 in.close();
356 }
357 }
358 }
359
360 /**
361 * Prepare the support if needed before processing.
362 *
363 * @param source
364 * the source of the story
365 * @param in
366 * the input (the main resource)
367 *
368 * @throws IOException
369 * on I/O error
370 */
371 @SuppressWarnings("unused")
372 protected void preprocess(URL source, InputStream in) throws IOException {
373 }
374
375 /**
376 * Create a {@link Chapter} object from the given information, formatting
377 * the content as it should be.
378 *
379 * @param source
380 * the source of the story
381 * @param number
382 * the chapter number
383 * @param name
384 * the chapter name
385 * @param content
386 * the chapter content
387 * @param pg
388 * the optional progress reporter
389 *
390 * @return the {@link Chapter}, never NULL
391 *
392 * @throws IOException
393 * in case of I/O error
394 */
395 protected Chapter makeChapter(URL source, int number, String name,
396 String content, Progress pg) throws IOException {
397 // Chapter name: process it correctly, then remove the possible
398 // redundant "Chapter x: " in front of it, or "-" (as in
399 // "Chapter 5: - Fun!" after the ": " was automatically added)
400 String chapterName = processPara(name).getContent().trim();
401 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
402 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
403 if (chapterName.startsWith(chapterWord)) {
404 chapterName = chapterName.substring(chapterWord.length())
405 .trim();
406 break;
407 }
408 }
409
410 if (chapterName.startsWith(Integer.toString(number))) {
411 chapterName = chapterName.substring(
412 Integer.toString(number).length()).trim();
413 }
414
415 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
416 chapterName = chapterName.substring(1).trim();
417 }
418 //
419
420 Chapter chap = new Chapter(number, chapterName);
421
422 if (content != null) {
423 List<Paragraph> paras = makeParagraphs(source, content, pg);
424 long words = 0;
425 for (Paragraph para : paras) {
426 words += para.getWords();
427 }
428 chap.setParagraphs(paras);
429 chap.setWords(words);
430 }
431
432 return chap;
433
434 }
435
436 /**
437 * Convert the given content into {@link Paragraph}s.
438 *
439 * @param source
440 * the source URL of the story
441 * @param content
442 * the textual content
443 * @param pg
444 * the optional progress reporter
445 *
446 * @return the {@link Paragraph}s (can be empty, but never NULL)
447 *
448 * @throws IOException
449 * in case of I/O error
450 */
451 protected List<Paragraph> makeParagraphs(URL source, String content,
452 Progress pg) throws IOException {
453 if (pg == null) {
454 pg = new Progress();
455 }
456
457 if (isHtml()) {
458 // Special <HR> processing:
459 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
460 "<br/>* * *<br/>");
461 }
462
463 List<Paragraph> paras = new ArrayList<Paragraph>();
464 if (content != null && !content.trim().isEmpty()) {
465 if (isHtml()) {
466 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
467 pg.setMinMax(0, tab.length);
468 int i = 1;
469 for (String line : tab) {
470 if (line.startsWith("[") && line.endsWith("]")) {
471 pg.setName("Extracting image " + i);
472 }
473 paras.add(makeParagraph(source, line.trim()));
474 pg.setProgress(i++);
475 }
476 pg.setName(null);
477 } else {
478 List<String> lines = new ArrayList<String>();
479 BufferedReader buff = null;
480 try {
481 buff = new BufferedReader(
482 new InputStreamReader(new ByteArrayInputStream(
483 content.getBytes("UTF-8")), "UTF-8"));
484 for (String line = buff.readLine(); line != null; line = buff
485 .readLine()) {
486 lines.add(line.trim());
487 }
488 } finally {
489 if (buff != null) {
490 buff.close();
491 }
492 }
493
494 pg.setMinMax(0, lines.size());
495 int i = 0;
496 for (String line : lines) {
497 if (line.startsWith("[") && line.endsWith("]")) {
498 pg.setName("Extracting image " + i);
499 }
500 paras.add(makeParagraph(source, line));
501 pg.setProgress(i++);
502 }
503 pg.setName(null);
504 }
505
506 // Check quotes for "bad" format
507 List<Paragraph> newParas = new ArrayList<Paragraph>();
508 for (Paragraph para : paras) {
509 newParas.addAll(requotify(para));
510 }
511 paras = newParas;
512
513 // Remove double blanks/brks
514 fixBlanksBreaks(paras);
515 }
516
517 return paras;
518 }
519
520 /**
521 * Convert the given line into a single {@link Paragraph}.
522 *
523 * @param source
524 * the source URL of the story
525 * @param line
526 * the textual content of the paragraph
527 *
528 * @return the {@link Paragraph}, never NULL
529 */
530 private Paragraph makeParagraph(URL source, String line) {
531 Image image = null;
532 if (line.startsWith("[") && line.endsWith("]")) {
533 image = getImage(this, source, line.substring(1, line.length() - 1)
534 .trim());
535 }
536
537 if (image != null) {
538 return new Paragraph(image);
539 }
540
541 return processPara(line);
542 }
543
544 /**
545 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
546 * those {@link Paragraph}s.
547 * <p>
548 * The resulting list will not contain a starting or trailing blank/break
549 * nor 2 blanks or breaks following each other.
550 *
551 * @param paras
552 * the list of {@link Paragraph}s to fix
553 */
554 protected void fixBlanksBreaks(List<Paragraph> paras) {
555 boolean space = false;
556 boolean brk = true;
557 for (int i = 0; i < paras.size(); i++) {
558 Paragraph para = paras.get(i);
559 boolean thisSpace = para.getType() == ParagraphType.BLANK;
560 boolean thisBrk = para.getType() == ParagraphType.BREAK;
561
562 if (i > 0 && space && thisBrk) {
563 paras.remove(i - 1);
564 i--;
565 } else if ((space || brk) && (thisSpace || thisBrk)) {
566 paras.remove(i);
567 i--;
568 }
569
570 space = thisSpace;
571 brk = thisBrk;
572 }
573
574 // Remove blank/brk at start
575 if (paras.size() > 0
576 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
577 0).getType() == ParagraphType.BREAK)) {
578 paras.remove(0);
579 }
580
581 // Remove blank/brk at end
582 int last = paras.size() - 1;
583 if (paras.size() > 0
584 && (paras.get(last).getType() == ParagraphType.BLANK || paras
585 .get(last).getType() == ParagraphType.BREAK)) {
586 paras.remove(last);
587 }
588 }
589
590 /**
591 * Get the default cover related to this subject (see <tt>.info</tt> files).
592 *
593 * @param subject
594 * the subject
595 *
596 * @return the cover if any, or NULL
597 */
598 static Image getDefaultCover(String subject) {
599 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
600 try {
601 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
602 return getImage(null, fileCover.toURI().toURL(), subject);
603 } catch (MalformedURLException e) {
604 }
605 }
606
607 return null;
608 }
609
610 /**
611 * Return the list of supported image extensions.
612 *
613 * @param emptyAllowed
614 * TRUE to allow an empty extension on first place, which can be
615 * used when you may already have an extension in your input but
616 * are not sure about it
617 *
618 * @return the extensions
619 */
620 static String[] getImageExt(boolean emptyAllowed) {
621 if (emptyAllowed) {
622 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
623 }
624
625 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
626 }
627
628 /**
629 * Check if the given resource can be a local image or a remote image, then
630 * refresh the cache with it if it is.
631 *
632 * @param source
633 * the story source
634 * @param line
635 * the resource to check
636 *
637 * @return the image if found, or NULL
638 *
639 */
640 static Image getImage(BasicSupport_Deprecated support, URL source,
641 String line) {
642 URL url = getImageUrl(support, source, line);
643 if (url != null) {
644 if ("file".equals(url.getProtocol())) {
645 if (new File(url.getPath()).isDirectory()) {
646 return null;
647 }
648 }
649 InputStream in = null;
650 try {
651 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
652 return new Image(in);
653 } catch (IOException e) {
654 } finally {
655 if (in != null) {
656 try {
657 in.close();
658 } catch (IOException e) {
659 }
660 }
661 }
662 }
663
664 return null;
665 }
666
667 /**
668 * Check if the given resource can be a local image or a remote image, then
669 * refresh the cache with it if it is.
670 *
671 * @param source
672 * the story source
673 * @param line
674 * the resource to check
675 *
676 * @return the image URL if found, or NULL
677 *
678 */
679 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
680 String line) {
681 URL url = null;
682
683 if (line != null) {
684 // try for files
685 if (source != null) {
686 try {
687 String relPath = null;
688 String absPath = null;
689 try {
690 String path = new File(source.getFile()).getParent();
691 relPath = new File(new File(path), line.trim())
692 .getAbsolutePath();
693 } catch (Exception e) {
694 // Cannot be converted to path (one possibility to take
695 // into account: absolute path on Windows)
696 }
697 try {
698 absPath = new File(line.trim()).getAbsolutePath();
699 } catch (Exception e) {
700 // Cannot be converted to path (at all)
701 }
702
703 for (String ext : getImageExt(true)) {
704 File absFile = new File(absPath + ext);
705 File relFile = new File(relPath + ext);
706 if (absPath != null && absFile.exists()
707 && absFile.isFile()) {
708 url = absFile.toURI().toURL();
709 } else if (relPath != null && relFile.exists()
710 && relFile.isFile()) {
711 url = relFile.toURI().toURL();
712 }
713 }
714 } catch (Exception e) {
715 // Should not happen since we control the correct arguments
716 }
717 }
718
719 if (url == null) {
720 // try for URLs
721 try {
722 for (String ext : getImageExt(true)) {
723 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
724 url = new URL(line + ext);
725 break;
726 }
727 }
728
729 // try out of cache
730 if (url == null) {
731 for (String ext : getImageExt(true)) {
732 try {
733 url = new URL(line + ext);
734 Instance.getInstance().getCache().refresh(url, support, true);
735 break;
736 } catch (IOException e) {
737 // no image with this ext
738 url = null;
739 }
740 }
741 }
742 } catch (MalformedURLException e) {
743 // Not an url
744 }
745 }
746
747 // refresh the cached file
748 if (url != null) {
749 try {
750 Instance.getInstance().getCache().refresh(url, support, true);
751 } catch (IOException e) {
752 // woops, broken image
753 url = null;
754 }
755 }
756 }
757
758 return url;
759 }
760
761 /**
762 * Open the input file that will be used through the support.
763 * <p>
764 * Can return NULL, in which case you are supposed to work without an
765 * {@link InputStream}.
766 *
767 * @param source
768 * the source {@link URL}
769 *
770 * @return the {@link InputStream}
771 *
772 * @throws IOException
773 * in case of I/O error
774 */
775 protected InputStream openInput(URL source) throws IOException {
776 return Instance.getInstance().getCache().open(source, this, false);
777 }
778
779 /**
780 * Reset then return {@link BasicSupport_Deprecated#in}.
781 *
782 * @return {@link BasicSupport_Deprecated#in}
783 */
784 protected InputStream getInput() {
785 return reset(in);
786 }
787
788 /**
789 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
790 * and requotify them (i.e., separate them into QUOTE paragraphs and other
791 * paragraphs (quotes or not)).
792 *
793 * @param para
794 * the paragraph to requotify (not necessarily a quote)
795 *
796 * @return the correctly (or so we hope) quotified paragraphs
797 */
798 protected List<Paragraph> requotify(Paragraph para) {
799 List<Paragraph> newParas = new ArrayList<Paragraph>();
800
801 if (para.getType() == ParagraphType.QUOTE
802 && para.getContent().length() > 2) {
803 String line = para.getContent();
804 boolean singleQ = line.startsWith("" + openQuote);
805 boolean doubleQ = line.startsWith("" + openDoubleQuote);
806
807 // Do not try when more than one quote at a time
808 // (some stories are not easily readable if we do)
809 if (singleQ
810 && line.indexOf(closeQuote, 1) < line
811 .lastIndexOf(closeQuote)) {
812 newParas.add(para);
813 return newParas;
814 }
815 if (doubleQ
816 && line.indexOf(closeDoubleQuote, 1) < line
817 .lastIndexOf(closeDoubleQuote)) {
818 newParas.add(para);
819 return newParas;
820 }
821 //
822
823 if (!singleQ && !doubleQ) {
824 line = openDoubleQuote + line + closeDoubleQuote;
825 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
826 .getWords()));
827 } else {
828 char open = singleQ ? openQuote : openDoubleQuote;
829 char close = singleQ ? closeQuote : closeDoubleQuote;
830
831 int posDot = -1;
832 boolean inQuote = false;
833 int i = 0;
834 for (char car : line.toCharArray()) {
835 if (car == open) {
836 inQuote = true;
837 } else if (car == close) {
838 inQuote = false;
839 } else if (car == '.' && !inQuote) {
840 posDot = i;
841 break;
842 }
843 i++;
844 }
845
846 if (posDot >= 0) {
847 String rest = line.substring(posDot + 1).trim();
848 line = line.substring(0, posDot + 1).trim();
849 long words = 1;
850 for (char car : line.toCharArray()) {
851 if (car == ' ') {
852 words++;
853 }
854 }
855 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
856 if (!rest.isEmpty()) {
857 newParas.addAll(requotify(processPara(rest)));
858 }
859 } else {
860 newParas.add(para);
861 }
862 }
863 } else {
864 newParas.add(para);
865 }
866
867 return newParas;
868 }
869
870 /**
871 * Process a {@link Paragraph} from a raw line of text.
872 * <p>
873 * Will also fix quotes and HTML encoding if needed.
874 *
875 * @param line
876 * the raw line
877 *
878 * @return the processed {@link Paragraph}, never NULL
879 */
880 protected Paragraph processPara(String line) {
881 line = ifUnhtml(line).trim();
882
883 boolean space = true;
884 boolean brk = true;
885 boolean quote = false;
886 boolean tentativeCloseQuote = false;
887 char prev = '\0';
888 int dashCount = 0;
889 long words = 1;
890
891 StringBuilder builder = new StringBuilder();
892 for (char car : line.toCharArray()) {
893 if (car != '-') {
894 if (dashCount > 0) {
895 // dash, ndash and mdash: - – —
896 // currently: always use mdash
897 builder.append(dashCount == 1 ? '-' : '—');
898 }
899 dashCount = 0;
900 }
901
902 if (tentativeCloseQuote) {
903 tentativeCloseQuote = false;
904 if (Character.isLetterOrDigit(car)) {
905 builder.append("'");
906 } else {
907 // handle double-single quotes as double quotes
908 if (prev == car) {
909 builder.append(closeDoubleQuote);
910 continue;
911 }
912
913 builder.append(closeQuote);
914 }
915 }
916
917 switch (car) {
918 case ' ': // note: unbreakable space
919 case ' ':
920 case '\t':
921 case '\n': // just in case
922 case '\r': // just in case
923 if (builder.length() > 0
924 && builder.charAt(builder.length() - 1) != ' ') {
925 words++;
926 }
927 builder.append(' ');
928 break;
929
930 case '\'':
931 if (space || (brk && quote)) {
932 quote = true;
933 // handle double-single quotes as double quotes
934 if (prev == car) {
935 builder.deleteCharAt(builder.length() - 1);
936 builder.append(openDoubleQuote);
937 } else {
938 builder.append(openQuote);
939 }
940 } else if (prev == ' ' || prev == car) {
941 // handle double-single quotes as double quotes
942 if (prev == car) {
943 builder.deleteCharAt(builder.length() - 1);
944 builder.append(openDoubleQuote);
945 } else {
946 builder.append(openQuote);
947 }
948 } else {
949 // it is a quote ("I'm off") or a 'quote' ("This
950 // 'good' restaurant"...)
951 tentativeCloseQuote = true;
952 }
953 break;
954
955 case '"':
956 if (space || (brk && quote)) {
957 quote = true;
958 builder.append(openDoubleQuote);
959 } else if (prev == ' ') {
960 builder.append(openDoubleQuote);
961 } else {
962 builder.append(closeDoubleQuote);
963 }
964 break;
965
966 case '-':
967 if (space) {
968 quote = true;
969 } else {
970 dashCount++;
971 }
972 space = false;
973 break;
974
975 case '*':
976 case '~':
977 case '/':
978 case '\\':
979 case '<':
980 case '>':
981 case '=':
982 case '+':
983 case '_':
984 case '–':
985 case '—':
986 space = false;
987 builder.append(car);
988 break;
989
990 case '‘':
991 case '`':
992 case '‹':
993 case '﹁':
994 case '〈':
995 case '「':
996 if (space || (brk && quote)) {
997 quote = true;
998 builder.append(openQuote);
999 } else {
1000 // handle double-single quotes as double quotes
1001 if (prev == car) {
1002 builder.deleteCharAt(builder.length() - 1);
1003 builder.append(openDoubleQuote);
1004 } else {
1005 builder.append(openQuote);
1006 }
1007 }
1008 space = false;
1009 brk = false;
1010 break;
1011
1012 case '’':
1013 case '›':
1014 case '﹂':
1015 case '〉':
1016 case '」':
1017 space = false;
1018 brk = false;
1019 // handle double-single quotes as double quotes
1020 if (prev == car) {
1021 builder.deleteCharAt(builder.length() - 1);
1022 builder.append(closeDoubleQuote);
1023 } else {
1024 builder.append(closeQuote);
1025 }
1026 break;
1027
1028 case '«':
1029 case '“':
1030 case '﹃':
1031 case '《':
1032 case '『':
1033 if (space || (brk && quote)) {
1034 quote = true;
1035 builder.append(openDoubleQuote);
1036 } else {
1037 builder.append(openDoubleQuote);
1038 }
1039 space = false;
1040 brk = false;
1041 break;
1042
1043 case '»':
1044 case '”':
1045 case '﹄':
1046 case '》':
1047 case '』':
1048 space = false;
1049 brk = false;
1050 builder.append(closeDoubleQuote);
1051 break;
1052
1053 default:
1054 space = false;
1055 brk = false;
1056 builder.append(car);
1057 break;
1058 }
1059
1060 prev = car;
1061 }
1062
1063 if (tentativeCloseQuote) {
1064 tentativeCloseQuote = false;
1065 builder.append(closeQuote);
1066 }
1067
1068 line = builder.toString().trim();
1069
1070 ParagraphType type = ParagraphType.NORMAL;
1071 if (space) {
1072 type = ParagraphType.BLANK;
1073 } else if (brk) {
1074 type = ParagraphType.BREAK;
1075 } else if (quote) {
1076 type = ParagraphType.QUOTE;
1077 }
1078
1079 return new Paragraph(type, line, words);
1080 }
1081
1082 /**
1083 * Remove the HTML from the input <b>if</b>
1084 * {@link BasicSupport_Deprecated#isHtml()} is true.
1085 *
1086 * @param input
1087 * the input
1088 *
1089 * @return the no html version if needed
1090 */
1091 private String ifUnhtml(String input) {
1092 if (isHtml() && input != null) {
1093 return StringUtils.unhtml(input);
1094 }
1095
1096 return input;
1097 }
1098
1099 /**
1100 * Reset the given {@link InputStream} and return it.
1101 *
1102 * @param in
1103 * the {@link InputStream} to reset
1104 *
1105 * @return the same {@link InputStream} after reset
1106 */
1107 static protected InputStream reset(InputStream in) {
1108 try {
1109 if (in != null) {
1110 in.reset();
1111 }
1112 } catch (IOException e) {
1113 }
1114
1115 return in;
1116 }
1117
1118 /**
1119 * Return the first line from the given input which correspond to the given
1120 * selectors.
1121 *
1122 * @param in
1123 * the input
1124 * @param needle
1125 * a string that must be found inside the target line (also
1126 * supports "^" at start to say "only if it starts with" the
1127 * needle)
1128 * @param relativeLine
1129 * the line to return based upon the target line position (-1 =
1130 * the line before, 0 = the target line...)
1131 *
1132 * @return the line, or NULL if not found
1133 */
1134 static protected String getLine(InputStream in, String needle,
1135 int relativeLine) {
1136 return getLine(in, needle, relativeLine, true);
1137 }
1138
1139 /**
1140 * Return a line from the given input which correspond to the given
1141 * selectors.
1142 *
1143 * @param in
1144 * the input
1145 * @param needle
1146 * a string that must be found inside the target line (also
1147 * supports "^" at start to say "only if it starts with" the
1148 * needle)
1149 * @param relativeLine
1150 * the line to return based upon the target line position (-1 =
1151 * the line before, 0 = the target line...)
1152 * @param first
1153 * takes the first result (as opposed to the last one, which will
1154 * also always spend the input)
1155 *
1156 * @return the line, or NULL if not found
1157 */
1158 static protected String getLine(InputStream in, String needle,
1159 int relativeLine, boolean first) {
1160 String rep = null;
1161
1162 reset(in);
1163
1164 List<String> lines = new ArrayList<String>();
1165 @SuppressWarnings("resource")
1166 Scanner scan = new Scanner(in, "UTF-8");
1167 int index = -1;
1168 scan.useDelimiter("\\n");
1169 while (scan.hasNext()) {
1170 lines.add(scan.next());
1171
1172 if (index == -1) {
1173 if (needle.startsWith("^")) {
1174 if (lines.get(lines.size() - 1).startsWith(
1175 needle.substring(1))) {
1176 index = lines.size() - 1;
1177 }
1178
1179 } else {
1180 if (lines.get(lines.size() - 1).contains(needle)) {
1181 index = lines.size() - 1;
1182 }
1183 }
1184 }
1185
1186 if (index >= 0 && index + relativeLine < lines.size()) {
1187 rep = lines.get(index + relativeLine);
1188 if (first) {
1189 break;
1190 }
1191 }
1192 }
1193
1194 return rep;
1195 }
1196
1197 /**
1198 * Return the text between the key and the endKey (and optional subKey can
1199 * be passed, in this case we will look for the key first, then take the
1200 * text between the subKey and the endKey).
1201 * <p>
1202 * Will only match the first line with the given key if more than one are
1203 * possible. Which also means that if the subKey or endKey is not found on
1204 * that line, NULL will be returned.
1205 *
1206 * @param in
1207 * the input
1208 * @param key
1209 * the key to match (also supports "^" at start to say
1210 * "only if it starts with" the key)
1211 * @param subKey
1212 * the sub key or NULL if none
1213 * @param endKey
1214 * the end key or NULL for "up to the end"
1215 * @return the text or NULL if not found
1216 */
1217 static protected String getKeyLine(InputStream in, String key,
1218 String subKey, String endKey) {
1219 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1220 }
1221
1222 /**
1223 * Return the text between the key and the endKey (and optional subKey can
1224 * be passed, in this case we will look for the key first, then take the
1225 * text between the subKey and the endKey).
1226 *
1227 * @param in
1228 * the input
1229 * @param key
1230 * the key to match (also supports "^" at start to say
1231 * "only if it starts with" the key)
1232 * @param subKey
1233 * the sub key or NULL if none
1234 * @param endKey
1235 * the end key or NULL for "up to the end"
1236 * @return the text or NULL if not found
1237 */
1238 static protected String getKeyText(String in, String key, String subKey,
1239 String endKey) {
1240 String result = null;
1241
1242 String line = in;
1243 if (line != null && line.contains(key)) {
1244 line = line.substring(line.indexOf(key) + key.length());
1245 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1246 if (subKey != null) {
1247 line = line.substring(line.indexOf(subKey)
1248 + subKey.length());
1249 }
1250 if (endKey == null || line.contains(endKey)) {
1251 if (endKey != null) {
1252 line = line.substring(0, line.indexOf(endKey));
1253 result = line;
1254 }
1255 }
1256 }
1257 }
1258
1259 return result;
1260 }
1261
1262 /**
1263 * Return the text between the key and the endKey (optional subKeys can be
1264 * passed, in this case we will look for the subKeys first, then take the
1265 * text between the key and the endKey).
1266 *
1267 * @param in
1268 * the input
1269 * @param key
1270 * the key to match
1271 * @param endKey
1272 * the end key or NULL for "up to the end"
1273 * @param afters
1274 * the sub-keys to find before checking for key/endKey
1275 *
1276 * @return the text or NULL if not found
1277 */
1278 static protected String getKeyTextAfter(String in, String key,
1279 String endKey, String... afters) {
1280
1281 if (in != null && !in.isEmpty()) {
1282 int pos = indexOfAfter(in, 0, afters);
1283 if (pos < 0) {
1284 return null;
1285 }
1286
1287 in = in.substring(pos);
1288 }
1289
1290 return getKeyText(in, key, null, endKey);
1291 }
1292
1293 /**
1294 * Return the first index after all the given "afters" have been found in
1295 * the {@link String}, or -1 if it was not possible.
1296 *
1297 * @param in
1298 * the input
1299 * @param startAt
1300 * start at this position in the string
1301 * @param afters
1302 * the sub-keys to find before checking for key/endKey
1303 *
1304 * @return the text or NULL if not found
1305 */
1306 static protected int indexOfAfter(String in, int startAt, String... afters) {
1307 int pos = -1;
1308 if (in != null && !in.isEmpty()) {
1309 pos = startAt;
1310 if (afters != null) {
1311 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1312 String subKey = afters[i];
1313 if (!subKey.isEmpty()) {
1314 pos = in.indexOf(subKey, pos);
1315 if (pos >= 0) {
1316 pos += subKey.length();
1317 }
1318 }
1319 }
1320 }
1321 }
1322
1323 return pos;
1324 }
1325 }