98b74db5d0567f9d415b6da23a3d849a79faa8ed
[nikiroo-utils.git] / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and has static methods that you can use to get
34 * access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
// The main resource stream: assigned in processMeta(URL, boolean, boolean,
// Progress) from openInput(URL) (NULL is allowed) and handed out, after a
// reset, by getInput().
42 private InputStream in;
43
44 // Quote characters, each fetched once per instance through
// Instance.getInstance().getTrans(); processPara(String) writes them and
// requotify(Paragraph) looks for them.
45 private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
46 private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
47 private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
48 private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
80 /**
81 * Return the {@link MetaData} of this story.
82 *
83 * @param source
84 * the source of the story
85 * @param in
86 * the input (the main resource)
87 *
88 * @return the associated {@link MetaData}, never NULL
89 *
90 * @throws IOException
91 * in case of I/O error
92 */
93 protected abstract MetaData getMeta(URL source, InputStream in)
94 throws IOException;
95
96 /**
97 * Return the story description.
98 *
99 * @param source
100 * the source of the story
101 * @param in
102 * the input (the main resource)
103 *
104 * @return the description
105 *
106 * @throws IOException
107 * in case of I/O error
108 */
109 protected abstract String getDesc(URL source, InputStream in)
110 throws IOException;
111
112 /**
113 * Return the list of chapters (name and resource).
114 *
115 * @param source
116 * the source of the story
117 * @param in
118 * the input (the main resource)
119 * @param pg
120 * the optional progress reporter
121 *
122 * @return the chapters
123 *
124 * @throws IOException
125 * in case of I/O error
126 */
127 protected abstract List<Entry<String, URL>> getChapters(URL source,
128 InputStream in, Progress pg) throws IOException;
129
130 /**
131 * Return the content of the chapter (possibly HTML encoded, if
132 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
133 *
134 * @param source
135 * the source of the story
136 * @param in
137 * the input (the main resource)
138 * @param number
139 * the chapter number
140 * @param pg
141 * the optional progress reporter
142 *
143 * @return the content
144 *
145 * @throws IOException
146 * in case of I/O error
147 */
148 protected abstract String getChapterContent(URL source, InputStream in,
149 int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206 MetaData meta = getMeta(url, getInput());
207 if (meta.getCreationDate() == null
208 || meta.getCreationDate().isEmpty()) {
209 meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
210 }
211 story.setMeta(meta);
212 pg.put("meta", meta);
213
214 pg.setProgress(50);
215
216 if (meta.getCover() == null) {
217 meta.setCover(getDefaultCover(meta.getSubject()));
218 }
219
220 pg.setProgress(60);
221
222 if (getDesc) {
223 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
224 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
225 }
226
227 pg.setProgress(100);
228 return story;
229 } finally {
230 if (close) {
231 close();
232
233 if (in != null) {
234 in.close();
235 }
236 }
237 }
238 }
239
240 /**
241 * Process the given story resource into a fully filled {@link Story}
242 * object.
243 *
244 * @param url
245 * the story resource
246 * @param pg
247 * the optional progress reporter
248 *
249 * @return the {@link Story}, never NULL
250 *
251 * @throws IOException
252 * in case of I/O error
253 */
254 protected Story process(URL url, Progress pg) throws IOException {
255 if (pg == null) {
256 pg = new Progress();
257 } else {
258 pg.setMinMax(0, 100);
259 }
260
261 url = getCanonicalUrl(url);
262 pg.setProgress(1);
263 try {
264 Progress pgMeta = new Progress();
265 pg.addProgress(pgMeta, 10);
266 Story story = processMeta(url, false, true, pgMeta);
267 pg.put("meta", story.getMeta());
268 if (!pgMeta.isDone()) {
269 pgMeta.setProgress(pgMeta.getMax()); // 10%
270 }
271
272 pg.setName("Retrieving " + story.getMeta().getTitle());
273
274 setCurrentReferer(url);
275
276 Progress pgGetChapters = new Progress();
277 pg.addProgress(pgGetChapters, 10);
278 story.setChapters(new ArrayList<Chapter>());
279 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
280 pgGetChapters);
281 if (!pgGetChapters.isDone()) {
282 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
283 }
284
285 if (chapters != null) {
286 Progress pgChaps = new Progress("Extracting chapters", 0,
287 chapters.size() * 300);
288 pg.addProgress(pgChaps, 80);
289
290 long words = 0;
291 int i = 1;
292 for (Entry<String, URL> chap : chapters) {
293 pgChaps.setName("Extracting chapter " + i);
294 InputStream chapIn = null;
295 if (chap.getValue() != null) {
296 setCurrentReferer(chap.getValue());
297 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
298 }
299 pgChaps.setProgress(i * 100);
300 try {
301 Progress pgGetChapterContent = new Progress();
302 Progress pgMakeChapter = new Progress();
303 pgChaps.addProgress(pgGetChapterContent, 100);
304 pgChaps.addProgress(pgMakeChapter, 100);
305
306 String content = getChapterContent(url, chapIn, i,
307 pgGetChapterContent);
308 if (!pgGetChapterContent.isDone()) {
309 pgGetChapterContent.setProgress(pgGetChapterContent
310 .getMax());
311 }
312
313 Chapter cc = makeChapter(url, i, chap.getKey(),
314 content, pgMakeChapter);
315 if (!pgMakeChapter.isDone()) {
316 pgMakeChapter.setProgress(pgMakeChapter.getMax());
317 }
318
319 words += cc.getWords();
320 story.getChapters().add(cc);
321 story.getMeta().setWords(words);
322 } finally {
323 if (chapIn != null) {
324 chapIn.close();
325 }
326 }
327
328 i++;
329 }
330
331 pgChaps.setName("Extracting chapters");
332 } else {
333 pg.setProgress(80);
334 }
335
336 return story;
337
338 } finally {
339 close();
340
341 if (in != null) {
342 in.close();
343 }
344 }
345 }
346
347 /**
348 * Prepare the support if needed before processing.
349 *
350 * @param source
351 * the source of the story
352 * @param in
353 * the input (the main resource)
354 *
355 * @throws IOException
356 * on I/O error
357 */
358 @SuppressWarnings("unused")
359 protected void preprocess(URL source, InputStream in) throws IOException {
360 }
361
362 /**
363 * Create a {@link Chapter} object from the given information, formatting
364 * the content as it should be.
365 *
366 * @param source
367 * the source of the story
368 * @param number
369 * the chapter number
370 * @param name
371 * the chapter name
372 * @param content
373 * the chapter content
374 * @param pg
375 * the optional progress reporter
376 *
377 * @return the {@link Chapter}
378 *
379 * @throws IOException
380 * in case of I/O error
381 */
382 protected Chapter makeChapter(URL source, int number, String name,
383 String content, Progress pg) throws IOException {
384 // Chapter name: process it correctly, then remove the possible
385 // redundant "Chapter x: " in front of it, or "-" (as in
386 // "Chapter 5: - Fun!" after the ": " was automatically added)
387 String chapterName = processPara(name).getContent().trim();
388 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
389 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
390 if (chapterName.startsWith(chapterWord)) {
391 chapterName = chapterName.substring(chapterWord.length())
392 .trim();
393 break;
394 }
395 }
396
397 if (chapterName.startsWith(Integer.toString(number))) {
398 chapterName = chapterName.substring(
399 Integer.toString(number).length()).trim();
400 }
401
402 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
403 chapterName = chapterName.substring(1).trim();
404 }
405 //
406
407 Chapter chap = new Chapter(number, chapterName);
408
409 if (content != null) {
410 List<Paragraph> paras = makeParagraphs(source, content, pg);
411 long words = 0;
412 for (Paragraph para : paras) {
413 words += para.getWords();
414 }
415 chap.setParagraphs(paras);
416 chap.setWords(words);
417 }
418
419 return chap;
420
421 }
422
423 /**
424 * Convert the given content into {@link Paragraph}s.
425 *
426 * @param source
427 * the source URL of the story
428 * @param content
429 * the textual content
430 * @param pg
431 * the optional progress reporter
432 *
433 * @return the {@link Paragraph}s
434 *
435 * @throws IOException
436 * in case of I/O error
437 */
438 protected List<Paragraph> makeParagraphs(URL source, String content,
439 Progress pg) throws IOException {
440 if (pg == null) {
441 pg = new Progress();
442 }
443
444 if (isHtml()) {
445 // Special <HR> processing:
446 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
447 "<br/>* * *<br/>");
448 }
449
450 List<Paragraph> paras = new ArrayList<Paragraph>();
451
452 if (content != null && !content.trim().isEmpty()) {
453 if (isHtml()) {
454 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
455 pg.setMinMax(0, tab.length);
456 int i = 1;
457 for (String line : tab) {
458 if (line.startsWith("[") && line.endsWith("]")) {
459 pg.setName("Extracting image " + i);
460 }
461 paras.add(makeParagraph(source, line.trim()));
462 pg.setProgress(i++);
463 }
464 pg.setName(null);
465 } else {
466 List<String> lines = new ArrayList<String>();
467 BufferedReader buff = null;
468 try {
469 buff = new BufferedReader(
470 new InputStreamReader(new ByteArrayInputStream(
471 content.getBytes("UTF-8")), "UTF-8"));
472 for (String line = buff.readLine(); line != null; line = buff
473 .readLine()) {
474 lines.add(line.trim());
475 }
476 } finally {
477 if (buff != null) {
478 buff.close();
479 }
480 }
481
482 pg.setMinMax(0, lines.size());
483 int i = 0;
484 for (String line : lines) {
485 if (line.startsWith("[") && line.endsWith("]")) {
486 pg.setName("Extracting image " + i);
487 }
488 paras.add(makeParagraph(source, line));
489 pg.setProgress(i++);
490 }
491 pg.setName(null);
492 }
493
494 // Check quotes for "bad" format
495 List<Paragraph> newParas = new ArrayList<Paragraph>();
496 for (Paragraph para : paras) {
497 newParas.addAll(requotify(para));
498 }
499 paras = newParas;
500
501 // Remove double blanks/brks
502 fixBlanksBreaks(paras);
503 }
504
505 return paras;
506 }
507
508 /**
509 * Convert the given line into a single {@link Paragraph}.
510 *
511 * @param source
512 * the source URL of the story
513 * @param line
514 * the textual content of the paragraph
515 *
516 * @return the {@link Paragraph}
517 */
518 private Paragraph makeParagraph(URL source, String line) {
519 Image image = null;
520 if (line.startsWith("[") && line.endsWith("]")) {
521 image = getImage(this, source, line.substring(1, line.length() - 1)
522 .trim());
523 }
524
525 if (image != null) {
526 return new Paragraph(image);
527 }
528
529 return processPara(line);
530 }
531
532 /**
533 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
534 * those {@link Paragraph}s.
535 * <p>
536 * The resulting list will not contain a starting or trailing blank/break
537 * nor 2 blanks or breaks following each other.
538 *
539 * @param paras
540 * the list of {@link Paragraph}s to fix
541 */
542 protected void fixBlanksBreaks(List<Paragraph> paras) {
543 boolean space = false;
544 boolean brk = true;
545 for (int i = 0; i < paras.size(); i++) {
546 Paragraph para = paras.get(i);
547 boolean thisSpace = para.getType() == ParagraphType.BLANK;
548 boolean thisBrk = para.getType() == ParagraphType.BREAK;
549
550 if (i > 0 && space && thisBrk) {
551 paras.remove(i - 1);
552 i--;
553 } else if ((space || brk) && (thisSpace || thisBrk)) {
554 paras.remove(i);
555 i--;
556 }
557
558 space = thisSpace;
559 brk = thisBrk;
560 }
561
562 // Remove blank/brk at start
563 if (paras.size() > 0
564 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
565 0).getType() == ParagraphType.BREAK)) {
566 paras.remove(0);
567 }
568
569 // Remove blank/brk at end
570 int last = paras.size() - 1;
571 if (paras.size() > 0
572 && (paras.get(last).getType() == ParagraphType.BLANK || paras
573 .get(last).getType() == ParagraphType.BREAK)) {
574 paras.remove(last);
575 }
576 }
577
578 /**
579 * Get the default cover related to this subject (see <tt>.info</tt> files).
580 *
581 * @param subject
582 * the subject
583 *
584 * @return the cover if any, or NULL
585 */
586 static Image getDefaultCover(String subject) {
587 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
588 try {
589 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
590 return getImage(null, fileCover.toURI().toURL(), subject);
591 } catch (MalformedURLException e) {
592 }
593 }
594
595 return null;
596 }
597
598 /**
599 * Return the list of supported image extensions.
600 *
601 * @param emptyAllowed
602 * TRUE to allow an empty extension on first place, which can be
603 * used when you may already have an extension in your input but
604 * are not sure about it
605 *
606 * @return the extensions
607 */
608 static String[] getImageExt(boolean emptyAllowed) {
609 if (emptyAllowed) {
610 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
611 }
612
613 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
614 }
615
616 /**
617 * Check if the given resource can be a local image or a remote image, then
618 * refresh the cache with it if it is.
619 *
620 * @param source
621 * the story source
622 * @param line
623 * the resource to check
624 *
625 * @return the image if found, or NULL
626 *
627 */
628 static Image getImage(BasicSupport_Deprecated support, URL source,
629 String line) {
630 URL url = getImageUrl(support, source, line);
631 if (url != null) {
632 if ("file".equals(url.getProtocol())) {
633 if (new File(url.getPath()).isDirectory()) {
634 return null;
635 }
636 }
637 InputStream in = null;
638 try {
639 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
640 return new Image(in);
641 } catch (IOException e) {
642 } finally {
643 if (in != null) {
644 try {
645 in.close();
646 } catch (IOException e) {
647 }
648 }
649 }
650 }
651
652 return null;
653 }
654
655 /**
656 * Check if the given resource can be a local image or a remote image, then
657 * refresh the cache with it if it is.
658 *
659 * @param source
660 * the story source
661 * @param line
662 * the resource to check
663 *
664 * @return the image URL if found, or NULL
665 *
666 */
667 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
668 String line) {
669 URL url = null;
670
671 if (line != null) {
672 // try for files
673 if (source != null) {
674 try {
675 String relPath = null;
676 String absPath = null;
677 try {
678 String path = new File(source.getFile()).getParent();
679 relPath = new File(new File(path), line.trim())
680 .getAbsolutePath();
681 } catch (Exception e) {
682 // Cannot be converted to path (one possibility to take
683 // into account: absolute path on Windows)
684 }
685 try {
686 absPath = new File(line.trim()).getAbsolutePath();
687 } catch (Exception e) {
688 // Cannot be converted to path (at all)
689 }
690
691 for (String ext : getImageExt(true)) {
692 File absFile = new File(absPath + ext);
693 File relFile = new File(relPath + ext);
694 if (absPath != null && absFile.exists()
695 && absFile.isFile()) {
696 url = absFile.toURI().toURL();
697 } else if (relPath != null && relFile.exists()
698 && relFile.isFile()) {
699 url = relFile.toURI().toURL();
700 }
701 }
702 } catch (Exception e) {
703 // Should not happen since we control the correct arguments
704 }
705 }
706
707 if (url == null) {
708 // try for URLs
709 try {
710 for (String ext : getImageExt(true)) {
711 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
712 url = new URL(line + ext);
713 break;
714 }
715 }
716
717 // try out of cache
718 if (url == null) {
719 for (String ext : getImageExt(true)) {
720 try {
721 url = new URL(line + ext);
722 Instance.getInstance().getCache().refresh(url, support, true);
723 break;
724 } catch (IOException e) {
725 // no image with this ext
726 url = null;
727 }
728 }
729 }
730 } catch (MalformedURLException e) {
731 // Not an url
732 }
733 }
734
735 // refresh the cached file
736 if (url != null) {
737 try {
738 Instance.getInstance().getCache().refresh(url, support, true);
739 } catch (IOException e) {
740 // woops, broken image
741 url = null;
742 }
743 }
744 }
745
746 return url;
747 }
748
749 /**
750 * Open the input file that will be used through the support.
751 * <p>
752 * Can return NULL, in which case you are supposed to work without an
753 * {@link InputStream}.
754 *
755 * @param source
756 * the source {@link URL}
757 *
758 * @return the {@link InputStream}
759 *
760 * @throws IOException
761 * in case of I/O error
762 */
763 protected InputStream openInput(URL source) throws IOException {
764 return Instance.getInstance().getCache().open(source, this, false);
765 }
766
767 /**
768 * Reset then return {@link BasicSupport_Deprecated#in}.
769 *
770 * @return {@link BasicSupport_Deprecated#in}
771 */
772 protected InputStream getInput() {
773 return reset(in);
774 }
775
776 /**
777 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
778 * and requotify them (i.e., separate them into QUOTE paragraphs and other
779 * paragraphs (quotes or not)).
780 *
781 * @param para
782 * the paragraph to requotify (not necessarily a quote)
783 *
784 * @return the correctly (or so we hope) quotified paragraphs
785 */
786 protected List<Paragraph> requotify(Paragraph para) {
787 List<Paragraph> newParas = new ArrayList<Paragraph>();
788
789 if (para.getType() == ParagraphType.QUOTE
790 && para.getContent().length() > 2) {
791 String line = para.getContent();
792 boolean singleQ = line.startsWith("" + openQuote);
793 boolean doubleQ = line.startsWith("" + openDoubleQuote);
794
795 // Do not try when more than one quote at a time
796 // (some stories are not easily readable if we do)
797 if (singleQ
798 && line.indexOf(closeQuote, 1) < line
799 .lastIndexOf(closeQuote)) {
800 newParas.add(para);
801 return newParas;
802 }
803 if (doubleQ
804 && line.indexOf(closeDoubleQuote, 1) < line
805 .lastIndexOf(closeDoubleQuote)) {
806 newParas.add(para);
807 return newParas;
808 }
809 //
810
811 if (!singleQ && !doubleQ) {
812 line = openDoubleQuote + line + closeDoubleQuote;
813 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
814 .getWords()));
815 } else {
816 char open = singleQ ? openQuote : openDoubleQuote;
817 char close = singleQ ? closeQuote : closeDoubleQuote;
818
819 int posDot = -1;
820 boolean inQuote = false;
821 int i = 0;
822 for (char car : line.toCharArray()) {
823 if (car == open) {
824 inQuote = true;
825 } else if (car == close) {
826 inQuote = false;
827 } else if (car == '.' && !inQuote) {
828 posDot = i;
829 break;
830 }
831 i++;
832 }
833
834 if (posDot >= 0) {
835 String rest = line.substring(posDot + 1).trim();
836 line = line.substring(0, posDot + 1).trim();
837 long words = 1;
838 for (char car : line.toCharArray()) {
839 if (car == ' ') {
840 words++;
841 }
842 }
843 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
844 if (!rest.isEmpty()) {
845 newParas.addAll(requotify(processPara(rest)));
846 }
847 } else {
848 newParas.add(para);
849 }
850 }
851 } else {
852 newParas.add(para);
853 }
854
855 return newParas;
856 }
857
858 /**
859 * Process a {@link Paragraph} from a raw line of text.
860 * <p>
861 * Will also fix quotes and HTML encoding if needed.
 * <p>
 * The line is un-HTML-ised if needed and trimmed, then scanned character by
 * character: whitespace characters are written as plain spaces, a single
 * '-' is kept while a run of several '-' becomes one '—', and the various
 * quote characters are replaced by the open/close quote characters of this
 * instance (two identical single quotes in a row count as a double quote).
 * <p>
 * The type of the result is BLANK if nothing but whitespace, straight
 * quotes or leading dashes was seen, BREAK if only "separator" characters
 * (such as '*', '~', '=', '_'...) were seen, QUOTE if the line started
 * with a quote character or a dash, and NORMAL otherwise.
862 *
863 * @param line
864 * the raw line
865 *
866 * @return the processed {@link Paragraph}
867 */
868 protected Paragraph processPara(String line) {
869 line = ifUnhtml(line).trim();
870
// space: TRUE as long as nothing has cleared it (see the cases below)
// brk: TRUE as long as no case clearing it (letters, typographic quotes...)
// was hit
// quote: TRUE once a quote character or a dash was seen while "space" (or
// "brk" after a first quote) still held
871 boolean space = true;
872 boolean brk = true;
873 boolean quote = false;
// a straight single quote was seen and could be an apostrophe or a closing
// quote: the decision is taken on the next character
874 boolean tentativeCloseQuote = false;
875 char prev = '\0';
// number of '-' met in a row and not written to the builder yet
876 int dashCount = 0;
877 long words = 1;
878
879 StringBuilder builder = new StringBuilder();
880 for (char car : line.toCharArray()) {
// Flush the pending run of dashes as soon as another character shows up
881 if (car != '-') {
882 if (dashCount > 0) {
883 // dash, ndash and mdash: - – —
884 // currently: always use mdash
885 builder.append(dashCount == 1 ? '-' : '—');
886 }
887 dashCount = 0;
888 }
889
// Resolve the pending straight single quote: an apostrophe when followed
// by a letter or a digit, a closing quote otherwise
890 if (tentativeCloseQuote) {
891 tentativeCloseQuote = false;
892 if (Character.isLetterOrDigit(car)) {
893 builder.append("'");
894 } else {
895 // handle double-single quotes as double quotes
896 if (prev == car) {
897 builder.append(closeDoubleQuote);
// the current quote is consumed here: "prev" is left untouched
898 continue;
899 }
900
901 builder.append(closeQuote);
902 }
903 }
904
905 switch (car) {
906 case ' ': // note: unbreakable space
907 case ' ':
908 case '\t':
909 case '\n': // just in case
910 case '\r': // just in case
// a new word is counted only for the first whitespace after some text
911 if (builder.length() > 0
912 && builder.charAt(builder.length() - 1) != ' ') {
913 words++;
914 }
915 builder.append(' ');
916 break;
917
// NOTE(review): the two straight-quote cases below never clear "space" nor
// "brk" — confirm this is intended
918 case '\'':
919 if (space || (brk && quote)) {
920 quote = true;
921 // handle double-single quotes as double quotes
922 if (prev == car) {
923 builder.deleteCharAt(builder.length() - 1);
924 builder.append(openDoubleQuote);
925 } else {
926 builder.append(openQuote);
927 }
928 } else if (prev == ' ' || prev == car) {
929 // handle double-single quotes as double quotes
930 if (prev == car) {
931 builder.deleteCharAt(builder.length() - 1);
932 builder.append(openDoubleQuote);
933 } else {
934 builder.append(openQuote);
935 }
936 } else {
937 // it is a quote ("I'm off") or a 'quote' ("This
938 // 'good' restaurant"...)
939 tentativeCloseQuote = true;
940 }
941 break;
942
943 case '"':
944 if (space || (brk && quote)) {
945 quote = true;
946 builder.append(openDoubleQuote);
947 } else if (prev == ' ') {
948 builder.append(openDoubleQuote);
949 } else {
950 builder.append(closeDoubleQuote);
951 }
952 break;
953
954 case '-':
// a dash met while "space" still holds marks a quote and is not written;
// the other dashes are counted and flushed at the top of the loop
955 if (space) {
956 quote = true;
957 } else {
958 dashCount++;
959 }
960 space = false;
961 break;
962
// "separator" characters: they clear "space" but leave "brk" untouched
963 case '*':
964 case '~':
965 case '/':
966 case '\\':
967 case '<':
968 case '>':
969 case '=':
970 case '+':
971 case '_':
972 case '–':
973 case '—':
974 space = false;
975 builder.append(car);
976 break;
977
// opening single quotes
978 case '‘':
979 case '`':
980 case '‹':
981 case '﹁':
982 case '〈':
983 case '「':
984 if (space || (brk && quote)) {
985 quote = true;
986 builder.append(openQuote);
987 } else {
988 // handle double-single quotes as double quotes
989 if (prev == car) {
990 builder.deleteCharAt(builder.length() - 1);
991 builder.append(openDoubleQuote);
992 } else {
993 builder.append(openQuote);
994 }
995 }
996 space = false;
997 brk = false;
998 break;
999
// closing single quotes
1000 case '’':
1001 case '›':
1002 case '﹂':
1003 case '〉':
1004 case '」':
1005 space = false;
1006 brk = false;
1007 // handle double-single quotes as double quotes
1008 if (prev == car) {
1009 builder.deleteCharAt(builder.length() - 1);
1010 builder.append(closeDoubleQuote);
1011 } else {
1012 builder.append(closeQuote);
1013 }
1014 break;
1015
// opening double quotes
1016 case '«':
1017 case '“':
1018 case '﹃':
1019 case '《':
1020 case '『':
1021 if (space || (brk && quote)) {
1022 quote = true;
1023 builder.append(openDoubleQuote);
1024 } else {
1025 builder.append(openDoubleQuote);
1026 }
1027 space = false;
1028 brk = false;
1029 break;
1030
// closing double quotes
1031 case '»':
1032 case '”':
1033 case '﹄':
1034 case '》':
1035 case '』':
1036 space = false;
1037 brk = false;
1038 builder.append(closeDoubleQuote);
1039 break;
1040
1041 default:
1042 space = false;
1043 brk = false;
1044 builder.append(car);
1045 break;
1046 }
1047
1048 prev = car;
1049 }
1050
// a straight single quote ending the line is a closing quote
// NOTE(review): a run of dashes ending the line is never appended (dashCount
// is only flushed inside the loop) — confirm this is intended
1051 if (tentativeCloseQuote) {
1052 tentativeCloseQuote = false;
1053 builder.append(closeQuote);
1054 }
1055
1056 line = builder.toString().trim();
1057
1058 ParagraphType type = ParagraphType.NORMAL;
1059 if (space) {
1060 type = ParagraphType.BLANK;
1061 } else if (brk) {
1062 type = ParagraphType.BREAK;
1063 } else if (quote) {
1064 type = ParagraphType.QUOTE;
1065 }
1066
1067 return new Paragraph(type, line, words);
1068 }
1069
1070 /**
1071 * Remove the HTML from the input <b>if</b>
1072 * {@link BasicSupport_Deprecated#isHtml()} is true.
1073 *
1074 * @param input
1075 * the input
1076 *
1077 * @return the no html version if needed
1078 */
1079 private String ifUnhtml(String input) {
1080 if (isHtml() && input != null) {
1081 return StringUtils.unhtml(input);
1082 }
1083
1084 return input;
1085 }
1086
1087 /**
1088 * Reset the given {@link InputStream} and return it.
1089 *
1090 * @param in
1091 * the {@link InputStream} to reset
1092 *
1093 * @return the same {@link InputStream} after reset
1094 */
1095 static protected InputStream reset(InputStream in) {
1096 try {
1097 if (in != null) {
1098 in.reset();
1099 }
1100 } catch (IOException e) {
1101 }
1102
1103 return in;
1104 }
1105
1106 /**
1107 * Return the first line from the given input which correspond to the given
1108 * selectors.
1109 *
1110 * @param in
1111 * the input
1112 * @param needle
1113 * a string that must be found inside the target line (also
1114 * supports "^" at start to say "only if it starts with" the
1115 * needle)
1116 * @param relativeLine
1117 * the line to return based upon the target line position (-1 =
1118 * the line before, 0 = the target line...)
1119 *
1120 * @return the line, or NULL if not found
1121 */
1122 static protected String getLine(InputStream in, String needle,
1123 int relativeLine) {
1124 return getLine(in, needle, relativeLine, true);
1125 }
1126
1127 /**
1128 * Return a line from the given input which correspond to the given
1129 * selectors.
1130 *
1131 * @param in
1132 * the input
1133 * @param needle
1134 * a string that must be found inside the target line (also
1135 * supports "^" at start to say "only if it starts with" the
1136 * needle)
1137 * @param relativeLine
1138 * the line to return based upon the target line position (-1 =
1139 * the line before, 0 = the target line...)
1140 * @param first
1141 * takes the first result (as opposed to the last one, which will
1142 * also always spend the input)
1143 *
1144 * @return the line, or NULL if not found
1145 */
1146 static protected String getLine(InputStream in, String needle,
1147 int relativeLine, boolean first) {
1148 String rep = null;
1149
1150 reset(in);
1151
1152 List<String> lines = new ArrayList<String>();
1153 @SuppressWarnings("resource")
1154 Scanner scan = new Scanner(in, "UTF-8");
1155 int index = -1;
1156 scan.useDelimiter("\\n");
1157 while (scan.hasNext()) {
1158 lines.add(scan.next());
1159
1160 if (index == -1) {
1161 if (needle.startsWith("^")) {
1162 if (lines.get(lines.size() - 1).startsWith(
1163 needle.substring(1))) {
1164 index = lines.size() - 1;
1165 }
1166
1167 } else {
1168 if (lines.get(lines.size() - 1).contains(needle)) {
1169 index = lines.size() - 1;
1170 }
1171 }
1172 }
1173
1174 if (index >= 0 && index + relativeLine < lines.size()) {
1175 rep = lines.get(index + relativeLine);
1176 if (first) {
1177 break;
1178 }
1179 }
1180 }
1181
1182 return rep;
1183 }
1184
1185 /**
1186 * Return the text between the key and the endKey (and optional subKey can
1187 * be passed, in this case we will look for the key first, then take the
1188 * text between the subKey and the endKey).
1189 * <p>
1190 * Will only match the first line with the given key if more than one are
1191 * possible. Which also means that if the subKey or endKey is not found on
1192 * that line, NULL will be returned.
1193 *
1194 * @param in
1195 * the input
1196 * @param key
1197 * the key to match (also supports "^" at start to say
1198 * "only if it starts with" the key)
1199 * @param subKey
1200 * the sub key or NULL if none
1201 * @param endKey
1202 * the end key or NULL for "up to the end"
1203 * @return the text or NULL if not found
1204 */
1205 static protected String getKeyLine(InputStream in, String key,
1206 String subKey, String endKey) {
1207 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1208 }
1209
1210 /**
1211 * Return the text between the key and the endKey (and optional subKey can
1212 * be passed, in this case we will look for the key first, then take the
1213 * text between the subKey and the endKey).
1214 *
1215 * @param in
1216 * the input
1217 * @param key
1218 * the key to match (also supports "^" at start to say
1219 * "only if it starts with" the key)
1220 * @param subKey
1221 * the sub key or NULL if none
1222 * @param endKey
1223 * the end key or NULL for "up to the end"
1224 * @return the text or NULL if not found
1225 */
1226 static protected String getKeyText(String in, String key, String subKey,
1227 String endKey) {
1228 String result = null;
1229
1230 String line = in;
1231 if (line != null && line.contains(key)) {
1232 line = line.substring(line.indexOf(key) + key.length());
1233 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1234 if (subKey != null) {
1235 line = line.substring(line.indexOf(subKey)
1236 + subKey.length());
1237 }
1238 if (endKey == null || line.contains(endKey)) {
1239 if (endKey != null) {
1240 line = line.substring(0, line.indexOf(endKey));
1241 result = line;
1242 }
1243 }
1244 }
1245 }
1246
1247 return result;
1248 }
1249
1250 /**
1251 * Return the text between the key and the endKey (optional subKeys can be
1252 * passed, in this case we will look for the subKeys first, then take the
1253 * text between the key and the endKey).
1254 *
1255 * @param in
1256 * the input
1257 * @param key
1258 * the key to match
1259 * @param endKey
1260 * the end key or NULL for "up to the end"
1261 * @param afters
1262 * the sub-keys to find before checking for key/endKey
1263 *
1264 * @return the text or NULL if not found
1265 */
1266 static protected String getKeyTextAfter(String in, String key,
1267 String endKey, String... afters) {
1268
1269 if (in != null && !in.isEmpty()) {
1270 int pos = indexOfAfter(in, 0, afters);
1271 if (pos < 0) {
1272 return null;
1273 }
1274
1275 in = in.substring(pos);
1276 }
1277
1278 return getKeyText(in, key, null, endKey);
1279 }
1280
1281 /**
1282 * Return the first index after all the given "afters" have been found in
1283 * the {@link String}, or -1 if it was not possible.
1284 *
1285 * @param in
1286 * the input
1287 * @param startAt
1288 * start at this position in the string
1289 * @param afters
1290 * the sub-keys to find before checking for key/endKey
1291 *
1292 * @return the text or NULL if not found
1293 */
1294 static protected int indexOfAfter(String in, int startAt, String... afters) {
1295 int pos = -1;
1296 if (in != null && !in.isEmpty()) {
1297 pos = startAt;
1298 if (afters != null) {
1299 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1300 String subKey = afters[i];
1301 if (!subKey.isEmpty()) {
1302 pos = in.indexOf(subKey, pos);
1303 if (pos >= 0) {
1304 pos += subKey.length();
1305 }
1306 }
1307 }
1308 }
1309 }
1310
1311 return pos;
1312 }
1313 }