keep publisher on re-import
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
 * This class is the base class used by the other support classes. It can be
 * used outside of this package, and has static methods that you can use to get
 * access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
/** The main input of the story being processed (can be NULL, see openInput). */
private InputStream in;

// Quote characters used by processPara/requotify when normalising quotes;
// they are looked up once per instance through Instance.getInstance().getTrans()
private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
/**
 * Return the {@link MetaData} of this story.
 * <p>
 * The result is completed by processMeta (type, source, default publisher,
 * creation date and cover), so it must not be NULL.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), as returned by openInput (which
 *            is allowed to return NULL)
 *
 * @return the associated {@link MetaData}, never NULL
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract MetaData getMeta(URL source, InputStream in)
        throws IOException;
95
/**
 * Return the story description.
 * <p>
 * The description is only requested when processMeta is asked to retrieve
 * it; it is then converted into the "resume" chapter of the metadata.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), as returned by openInput (which
 *            is allowed to return NULL)
 *
 * @return the description
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getDesc(URL source, InputStream in)
        throws IOException;
111
/**
 * Return the list of chapters (name and resource).
 * <p>
 * Each entry maps the chapter name to the URL of its resource; an entry
 * with a NULL URL is processed without opening any chapter input.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource), as returned by openInput (which
 *            is allowed to return NULL)
 * @param pg
 *            the optional progress reporter
 *
 * @return the chapters (a NULL list is handled by process as "no chapters
 *         to extract")
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract List<Entry<String, URL>> getChapters(URL source,
        InputStream in, Progress pg) throws IOException;
129
/**
 * Return the content of the chapter (possibly HTML encoded, if
 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input of the chapter: process opens it from the chapter
 *            URL and passes NULL when the chapter has no URL
 * @param number
 *            the chapter number (process starts counting at 1)
 * @param pg
 *            the optional progress reporter
 *
 * @return the content
 *
 * @throws IOException
 *             in case of I/O error
 */
protected abstract String getChapterContent(URL source, InputStream in,
        int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206
207 MetaData meta = getMeta(url, getInput());
208 meta.setType(getType().toString());
209 meta.setSource(getType().getSourceName());
210 if (meta.getPublisher() == null) {
211 meta.setPublisher(getType().getSourceName());
212 }
213
214 if (meta.getCreationDate() == null
215 || meta.getCreationDate().trim().isEmpty()) {
216 meta.setCreationDate(bsHelper.formatDate(
217 StringUtils.fromTime(new Date().getTime())));
218 }
219 story.setMeta(meta);
220 pg.put("meta", meta);
221
222 pg.setProgress(50);
223
224 if (meta.getCover() == null) {
225 meta.setCover(getDefaultCover(meta.getSubject()));
226 }
227
228 pg.setProgress(60);
229
230 if (getDesc) {
231 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
232 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
233 }
234
235 pg.setProgress(100);
236 return story;
237 } finally {
238 if (close) {
239 close();
240
241 if (in != null) {
242 in.close();
243 }
244 }
245 }
246 }
247
248 /**
249 * Process the given story resource into a fully filled {@link Story}
250 * object.
251 *
252 * @param url
253 * the story resource
254 * @param pg
255 * the optional progress reporter
256 *
257 * @return the {@link Story}, never NULL
258 *
259 * @throws IOException
260 * in case of I/O error
261 */
262 protected Story process(URL url, Progress pg) throws IOException {
263 if (pg == null) {
264 pg = new Progress();
265 } else {
266 pg.setMinMax(0, 100);
267 }
268
269 url = getCanonicalUrl(url);
270 pg.setProgress(1);
271 try {
272 Progress pgMeta = new Progress();
273 pg.addProgress(pgMeta, 10);
274 Story story = processMeta(url, false, true, pgMeta);
275 pg.put("meta", story.getMeta());
276 if (!pgMeta.isDone()) {
277 pgMeta.setProgress(pgMeta.getMax()); // 10%
278 }
279
280 setCurrentReferer(url);
281
282 Progress pgGetChapters = new Progress();
283 pg.addProgress(pgGetChapters, 10);
284 story.setChapters(new ArrayList<Chapter>());
285 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
286 pgGetChapters);
287 if (!pgGetChapters.isDone()) {
288 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
289 }
290
291 if (chapters != null) {
292 Progress pgChaps = new Progress("Extracting chapters", 0,
293 chapters.size() * 300);
294 pg.addProgress(pgChaps, 80);
295
296 long words = 0;
297 int i = 1;
298 for (Entry<String, URL> chap : chapters) {
299 pgChaps.setName("Extracting chapter " + i);
300 InputStream chapIn = null;
301 if (chap.getValue() != null) {
302 setCurrentReferer(chap.getValue());
303 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
304 }
305 pgChaps.setProgress(i * 100);
306 try {
307 Progress pgGetChapterContent = new Progress();
308 Progress pgMakeChapter = new Progress();
309 pgChaps.addProgress(pgGetChapterContent, 100);
310 pgChaps.addProgress(pgMakeChapter, 100);
311
312 String content = getChapterContent(url, chapIn, i,
313 pgGetChapterContent);
314 if (!pgGetChapterContent.isDone()) {
315 pgGetChapterContent.setProgress(pgGetChapterContent
316 .getMax());
317 }
318
319 Chapter cc = makeChapter(url, i, chap.getKey(),
320 content, pgMakeChapter);
321 if (!pgMakeChapter.isDone()) {
322 pgMakeChapter.setProgress(pgMakeChapter.getMax());
323 }
324
325 words += cc.getWords();
326 story.getChapters().add(cc);
327 } finally {
328 if (chapIn != null) {
329 chapIn.close();
330 }
331 }
332
333 i++;
334 }
335
336 story.getMeta().setWords(words);
337
338 pgChaps.setName("Extracting chapters");
339 } else {
340 pg.setProgress(80);
341 }
342
343 // Check for "no chapters" stories
344 if (story.getChapters().isEmpty()
345 && story.getMeta().getResume() != null
346 && !story.getMeta().getResume().getParagraphs().isEmpty()) {
347 Chapter resume = story.getMeta().getResume();
348 resume.setName("");
349 resume.setNumber(1);
350 story.getChapters().add(resume);
351 story.getMeta().setWords(resume.getWords());
352
353 String descChapterName = Instance.getInstance().getTrans()
354 .getString(StringId.DESCRIPTION);
355 resume = new Chapter(0, descChapterName);
356 story.getMeta().setResume(resume);
357 }
358
359 return story;
360 } finally {
361 close();
362
363 if (in != null) {
364 in.close();
365 }
366 }
367 }
368
/**
 * Prepare the support if needed before processing.
 * <p>
 * This hook is called by processMeta once the main input has been opened
 * and before the metadata are read; this default implementation does
 * nothing.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @throws IOException
 *             on I/O error
 */
@SuppressWarnings("unused")
protected void preprocess(URL source, InputStream in) throws IOException {
    // nothing to prepare by default
}
383
384 /**
385 * Create a {@link Chapter} object from the given information, formatting
386 * the content as it should be.
387 *
388 * @param source
389 * the source of the story
390 * @param number
391 * the chapter number
392 * @param name
393 * the chapter name
394 * @param content
395 * the chapter content
396 * @param pg
397 * the optional progress reporter
398 *
399 * @return the {@link Chapter}, never NULL
400 *
401 * @throws IOException
402 * in case of I/O error
403 */
404 protected Chapter makeChapter(URL source, int number, String name,
405 String content, Progress pg) throws IOException {
406 // Chapter name: process it correctly, then remove the possible
407 // redundant "Chapter x: " in front of it, or "-" (as in
408 // "Chapter 5: - Fun!" after the ": " was automatically added)
409 String chapterName = processPara(name).getContent().trim();
410 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
411 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
412 if (chapterName.startsWith(chapterWord)) {
413 chapterName = chapterName.substring(chapterWord.length())
414 .trim();
415 break;
416 }
417 }
418
419 if (chapterName.startsWith(Integer.toString(number))) {
420 chapterName = chapterName.substring(
421 Integer.toString(number).length()).trim();
422 }
423
424 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
425 chapterName = chapterName.substring(1).trim();
426 }
427 //
428
429 Chapter chap = new Chapter(number, chapterName);
430
431 if (content != null) {
432 List<Paragraph> paras = makeParagraphs(source, content, pg);
433 long words = 0;
434 for (Paragraph para : paras) {
435 words += para.getWords();
436 }
437 chap.setParagraphs(paras);
438 chap.setWords(words);
439 }
440
441 return chap;
442
443 }
444
445 /**
446 * Convert the given content into {@link Paragraph}s.
447 *
448 * @param source
449 * the source URL of the story
450 * @param content
451 * the textual content
452 * @param pg
453 * the optional progress reporter
454 *
455 * @return the {@link Paragraph}s (can be empty, but never NULL)
456 *
457 * @throws IOException
458 * in case of I/O error
459 */
460 protected List<Paragraph> makeParagraphs(URL source, String content,
461 Progress pg) throws IOException {
462 if (pg == null) {
463 pg = new Progress();
464 }
465
466 if (isHtml()) {
467 // Special <HR> processing:
468 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
469 "<br/>* * *<br/>");
470 }
471
472 List<Paragraph> paras = new ArrayList<Paragraph>();
473 if (content != null && !content.trim().isEmpty()) {
474 if (isHtml()) {
475 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
476 pg.setMinMax(0, tab.length);
477 int i = 1;
478 for (String line : tab) {
479 if (line.startsWith("[") && line.endsWith("]")) {
480 pg.setName("Extracting image " + i);
481 }
482 paras.add(makeParagraph(source, line.trim()));
483 pg.setProgress(i++);
484 }
485 pg.setName(null);
486 } else {
487 List<String> lines = new ArrayList<String>();
488 BufferedReader buff = null;
489 try {
490 buff = new BufferedReader(
491 new InputStreamReader(new ByteArrayInputStream(
492 content.getBytes("UTF-8")), "UTF-8"));
493 for (String line = buff.readLine(); line != null; line = buff
494 .readLine()) {
495 lines.add(line.trim());
496 }
497 } finally {
498 if (buff != null) {
499 buff.close();
500 }
501 }
502
503 pg.setMinMax(0, lines.size());
504 int i = 0;
505 for (String line : lines) {
506 if (line.startsWith("[") && line.endsWith("]")) {
507 pg.setName("Extracting image " + i);
508 }
509 paras.add(makeParagraph(source, line));
510 pg.setProgress(i++);
511 }
512 pg.setName(null);
513 }
514
515 // Check quotes for "bad" format
516 List<Paragraph> newParas = new ArrayList<Paragraph>();
517 for (Paragraph para : paras) {
518 newParas.addAll(requotify(para));
519 }
520 paras = newParas;
521
522 // Remove double blanks/brks
523 fixBlanksBreaks(paras);
524 }
525
526 return paras;
527 }
528
529 /**
530 * Convert the given line into a single {@link Paragraph}.
531 *
532 * @param source
533 * the source URL of the story
534 * @param line
535 * the textual content of the paragraph
536 *
537 * @return the {@link Paragraph}, never NULL
538 */
539 private Paragraph makeParagraph(URL source, String line) {
540 Image image = null;
541 if (line.startsWith("[") && line.endsWith("]")) {
542 image = getImage(this, source, line.substring(1, line.length() - 1)
543 .trim());
544 }
545
546 if (image != null) {
547 return new Paragraph(image);
548 }
549
550 return processPara(line);
551 }
552
/**
 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
 * those {@link Paragraph}s.
 * <p>
 * The resulting list will not contain a starting or trailing blank/break
 * nor 2 blanks or breaks following each other.
 * <p>
 * The list is modified in place.
 *
 * @param paras
 *            the list of {@link Paragraph}s to fix
 */
protected void fixBlanksBreaks(List<Paragraph> paras) {
    // state of the previously visited paragraph; starting with "brk" set
    // makes a blank/break at the very start count as a duplicate
    boolean space = false;
    boolean brk = true;
    for (int i = 0; i < paras.size(); i++) {
        Paragraph para = paras.get(i);
        boolean thisSpace = para.getType() == ParagraphType.BLANK;
        boolean thisBrk = para.getType() == ParagraphType.BREAK;

        if (i > 0 && space && thisBrk) {
            // blank then break: drop the element before the break
            paras.remove(i - 1);
            i--;
        } else if ((space || brk) && (thisSpace || thisBrk)) {
            // blank/break right after a blank/break: drop this one
            paras.remove(i);
            i--;
        }

        // the state follows the visited paragraph, even if it was removed
        space = thisSpace;
        brk = thisBrk;
    }

    // Remove blank/brk at start
    if (paras.size() > 0
            && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
                    0).getType() == ParagraphType.BREAK)) {
        paras.remove(0);
    }

    // Remove blank/brk at end
    int last = paras.size() - 1;
    if (paras.size() > 0
            && (paras.get(last).getType() == ParagraphType.BLANK || paras
                    .get(last).getType() == ParagraphType.BREAK)) {
        paras.remove(last);
    }
}
598
599 /**
600 * Get the default cover related to this subject (see <tt>.info</tt> files).
601 *
602 * @param subject
603 * the subject
604 *
605 * @return the cover if any, or NULL
606 */
607 static Image getDefaultCover(String subject) {
608 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
609 try {
610 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
611 return getImage(null, fileCover.toURI().toURL(), subject);
612 } catch (MalformedURLException e) {
613 }
614 }
615
616 return null;
617 }
618
619 /**
620 * Return the list of supported image extensions.
621 *
622 * @param emptyAllowed
623 * TRUE to allow an empty extension on first place, which can be
624 * used when you may already have an extension in your input but
625 * are not sure about it
626 *
627 * @return the extensions
628 */
629 static String[] getImageExt(boolean emptyAllowed) {
630 if (emptyAllowed) {
631 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
632 }
633
634 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
635 }
636
637 /**
638 * Check if the given resource can be a local image or a remote image, then
639 * refresh the cache with it if it is.
640 *
641 * @param source
642 * the story source
643 * @param line
644 * the resource to check
645 *
646 * @return the image if found, or NULL
647 *
648 */
649 static Image getImage(BasicSupport_Deprecated support, URL source,
650 String line) {
651 URL url = getImageUrl(support, source, line);
652 if (url != null) {
653 if ("file".equals(url.getProtocol())) {
654 if (new File(url.getPath()).isDirectory()) {
655 return null;
656 }
657 }
658 InputStream in = null;
659 try {
660 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
661 Image img = new Image(in);
662 if (img.getSize() == 0) {
663 img.close();
664 throw new IOException(
665 "Empty image not accepted");
666 }
667 return img;
668 } catch (IOException e) {
669 } finally {
670 if (in != null) {
671 try {
672 in.close();
673 } catch (IOException e) {
674 }
675 }
676 }
677 }
678
679 return null;
680 }
681
682 /**
683 * Check if the given resource can be a local image or a remote image, then
684 * refresh the cache with it if it is.
685 *
686 * @param source
687 * the story source
688 * @param line
689 * the resource to check
690 *
691 * @return the image URL if found, or NULL
692 *
693 */
694 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
695 String line) {
696 URL url = null;
697
698 if (line != null) {
699 // try for files
700 if (source != null) {
701 try {
702 String relPath = null;
703 String absPath = null;
704 try {
705 String path = new File(source.getFile()).getParent();
706 relPath = new File(new File(path), line.trim())
707 .getAbsolutePath();
708 } catch (Exception e) {
709 // Cannot be converted to path (one possibility to take
710 // into account: absolute path on Windows)
711 }
712 try {
713 absPath = new File(line.trim()).getAbsolutePath();
714 } catch (Exception e) {
715 // Cannot be converted to path (at all)
716 }
717
718 for (String ext : getImageExt(true)) {
719 File absFile = new File(absPath + ext);
720 File relFile = new File(relPath + ext);
721 if (absPath != null && absFile.exists()
722 && absFile.isFile()) {
723 url = absFile.toURI().toURL();
724 } else if (relPath != null && relFile.exists()
725 && relFile.isFile()) {
726 url = relFile.toURI().toURL();
727 }
728 }
729 } catch (Exception e) {
730 // Should not happen since we control the correct arguments
731 }
732 }
733
734 if (url == null) {
735 // try for URLs
736 try {
737 for (String ext : getImageExt(true)) {
738 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
739 url = new URL(line + ext);
740 break;
741 }
742 }
743
744 // try out of cache
745 if (url == null) {
746 for (String ext : getImageExt(true)) {
747 try {
748 url = new URL(line + ext);
749 Instance.getInstance().getCache().refresh(url, support, true);
750 break;
751 } catch (IOException e) {
752 // no image with this ext
753 url = null;
754 }
755 }
756 }
757 } catch (MalformedURLException e) {
758 // Not an url
759 }
760 }
761
762 // refresh the cached file
763 if (url != null) {
764 try {
765 Instance.getInstance().getCache().refresh(url, support, true);
766 } catch (IOException e) {
767 // woops, broken image
768 url = null;
769 }
770 }
771 }
772
773 return url;
774 }
775
776 /**
777 * Open the input file that will be used through the support.
778 * <p>
779 * Can return NULL, in which case you are supposed to work without an
780 * {@link InputStream}.
781 *
782 * @param source
783 * the source {@link URL}
784 *
785 * @return the {@link InputStream}
786 *
787 * @throws IOException
788 * in case of I/O error
789 */
790 protected InputStream openInput(URL source) throws IOException {
791 return Instance.getInstance().getCache().open(source, this, false);
792 }
793
794 /**
795 * Reset then return {@link BasicSupport_Deprecated#in}.
796 *
797 * @return {@link BasicSupport_Deprecated#in}
798 */
799 protected InputStream getInput() {
800 return reset(in);
801 }
802
803 /**
804 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
805 * and requotify them (i.e., separate them into QUOTE paragraphs and other
806 * paragraphs (quotes or not)).
807 *
808 * @param para
809 * the paragraph to requotify (not necessarily a quote)
810 *
811 * @return the correctly (or so we hope) quotified paragraphs
812 */
813 protected List<Paragraph> requotify(Paragraph para) {
814 List<Paragraph> newParas = new ArrayList<Paragraph>();
815
816 if (para.getType() == ParagraphType.QUOTE
817 && para.getContent().length() > 2) {
818 String line = para.getContent();
819 boolean singleQ = line.startsWith("" + openQuote);
820 boolean doubleQ = line.startsWith("" + openDoubleQuote);
821
822 // Do not try when more than one quote at a time
823 // (some stories are not easily readable if we do)
824 if (singleQ
825 && line.indexOf(closeQuote, 1) < line
826 .lastIndexOf(closeQuote)) {
827 newParas.add(para);
828 return newParas;
829 }
830 if (doubleQ
831 && line.indexOf(closeDoubleQuote, 1) < line
832 .lastIndexOf(closeDoubleQuote)) {
833 newParas.add(para);
834 return newParas;
835 }
836 //
837
838 if (!singleQ && !doubleQ) {
839 line = openDoubleQuote + line + closeDoubleQuote;
840 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
841 .getWords()));
842 } else {
843 char open = singleQ ? openQuote : openDoubleQuote;
844 char close = singleQ ? closeQuote : closeDoubleQuote;
845
846 int posDot = -1;
847 boolean inQuote = false;
848 int i = 0;
849 for (char car : line.toCharArray()) {
850 if (car == open) {
851 inQuote = true;
852 } else if (car == close) {
853 inQuote = false;
854 } else if (car == '.' && !inQuote) {
855 posDot = i;
856 break;
857 }
858 i++;
859 }
860
861 if (posDot >= 0) {
862 String rest = line.substring(posDot + 1).trim();
863 line = line.substring(0, posDot + 1).trim();
864 long words = 1;
865 for (char car : line.toCharArray()) {
866 if (car == ' ') {
867 words++;
868 }
869 }
870 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
871 if (!rest.isEmpty()) {
872 newParas.addAll(requotify(processPara(rest)));
873 }
874 } else {
875 newParas.add(para);
876 }
877 }
878 } else {
879 newParas.add(para);
880 }
881
882 return newParas;
883 }
884
885 /**
886 * Process a {@link Paragraph} from a raw line of text.
887 * <p>
888 * Will also fix quotes and HTML encoding if needed.
889 *
890 * @param line
891 * the raw line
892 *
893 * @return the processed {@link Paragraph}, never NULL
894 */
895 protected Paragraph processPara(String line) {
896 line = ifUnhtml(line).trim();
897
898 boolean space = true;
899 boolean brk = true;
900 boolean quote = false;
901 boolean tentativeCloseQuote = false;
902 char prev = '\0';
903 int dashCount = 0;
904 long words = 1;
905
906 StringBuilder builder = new StringBuilder();
907 for (char car : line.toCharArray()) {
908 if (car != '-') {
909 if (dashCount > 0) {
910 // dash, ndash and mdash: - – —
911 // currently: always use mdash
912 builder.append(dashCount == 1 ? '-' : '—');
913 }
914 dashCount = 0;
915 }
916
917 if (tentativeCloseQuote) {
918 tentativeCloseQuote = false;
919 if (Character.isLetterOrDigit(car)) {
920 builder.append("'");
921 } else {
922 // handle double-single quotes as double quotes
923 if (prev == car) {
924 builder.append(closeDoubleQuote);
925 continue;
926 }
927
928 builder.append(closeQuote);
929 }
930 }
931
932 switch (car) {
933 case ' ': // note: unbreakable space
934 case ' ':
935 case '\t':
936 case '\n': // just in case
937 case '\r': // just in case
938 if (builder.length() > 0
939 && builder.charAt(builder.length() - 1) != ' ') {
940 words++;
941 }
942 builder.append(' ');
943 break;
944
945 case '\'':
946 if (space || (brk && quote)) {
947 quote = true;
948 // handle double-single quotes as double quotes
949 if (prev == car) {
950 builder.deleteCharAt(builder.length() - 1);
951 builder.append(openDoubleQuote);
952 } else {
953 builder.append(openQuote);
954 }
955 } else if (prev == ' ' || prev == car) {
956 // handle double-single quotes as double quotes
957 if (prev == car) {
958 builder.deleteCharAt(builder.length() - 1);
959 builder.append(openDoubleQuote);
960 } else {
961 builder.append(openQuote);
962 }
963 } else {
964 // it is a quote ("I'm off") or a 'quote' ("This
965 // 'good' restaurant"...)
966 tentativeCloseQuote = true;
967 }
968 break;
969
970 case '"':
971 if (space || (brk && quote)) {
972 quote = true;
973 builder.append(openDoubleQuote);
974 } else if (prev == ' ') {
975 builder.append(openDoubleQuote);
976 } else {
977 builder.append(closeDoubleQuote);
978 }
979 break;
980
981 case '-':
982 if (space) {
983 quote = true;
984 } else {
985 dashCount++;
986 }
987 space = false;
988 break;
989
990 case '*':
991 case '~':
992 case '/':
993 case '\\':
994 case '<':
995 case '>':
996 case '=':
997 case '+':
998 case '_':
999 case '–':
1000 case '—':
1001 space = false;
1002 builder.append(car);
1003 break;
1004
1005 case '‘':
1006 case '`':
1007 case '‹':
1008 case '﹁':
1009 case '〈':
1010 case '「':
1011 if (space || (brk && quote)) {
1012 quote = true;
1013 builder.append(openQuote);
1014 } else {
1015 // handle double-single quotes as double quotes
1016 if (prev == car) {
1017 builder.deleteCharAt(builder.length() - 1);
1018 builder.append(openDoubleQuote);
1019 } else {
1020 builder.append(openQuote);
1021 }
1022 }
1023 space = false;
1024 brk = false;
1025 break;
1026
1027 case '’':
1028 case '›':
1029 case '﹂':
1030 case '〉':
1031 case '」':
1032 space = false;
1033 brk = false;
1034 // handle double-single quotes as double quotes
1035 if (prev == car) {
1036 builder.deleteCharAt(builder.length() - 1);
1037 builder.append(closeDoubleQuote);
1038 } else {
1039 builder.append(closeQuote);
1040 }
1041 break;
1042
1043 case '«':
1044 case '“':
1045 case '﹃':
1046 case '《':
1047 case '『':
1048 if (space || (brk && quote)) {
1049 quote = true;
1050 builder.append(openDoubleQuote);
1051 } else {
1052 builder.append(openDoubleQuote);
1053 }
1054 space = false;
1055 brk = false;
1056 break;
1057
1058 case '»':
1059 case '”':
1060 case '﹄':
1061 case '》':
1062 case '』':
1063 space = false;
1064 brk = false;
1065 builder.append(closeDoubleQuote);
1066 break;
1067
1068 default:
1069 space = false;
1070 brk = false;
1071 builder.append(car);
1072 break;
1073 }
1074
1075 prev = car;
1076 }
1077
1078 if (tentativeCloseQuote) {
1079 tentativeCloseQuote = false;
1080 builder.append(closeQuote);
1081 }
1082
1083 line = builder.toString().trim();
1084
1085 ParagraphType type = ParagraphType.NORMAL;
1086 if (space) {
1087 type = ParagraphType.BLANK;
1088 } else if (brk) {
1089 type = ParagraphType.BREAK;
1090 } else if (quote) {
1091 type = ParagraphType.QUOTE;
1092 }
1093
1094 return new Paragraph(type, line, words);
1095 }
1096
1097 /**
1098 * Remove the HTML from the input <b>if</b>
1099 * {@link BasicSupport_Deprecated#isHtml()} is true.
1100 *
1101 * @param input
1102 * the input
1103 *
1104 * @return the no html version if needed
1105 */
1106 private String ifUnhtml(String input) {
1107 if (isHtml() && input != null) {
1108 return StringUtils.unhtml(input);
1109 }
1110
1111 return input;
1112 }
1113
1114 /**
1115 * Reset the given {@link InputStream} and return it.
1116 *
1117 * @param in
1118 * the {@link InputStream} to reset
1119 *
1120 * @return the same {@link InputStream} after reset
1121 */
1122 static protected InputStream reset(InputStream in) {
1123 try {
1124 if (in != null) {
1125 in.reset();
1126 }
1127 } catch (IOException e) {
1128 }
1129
1130 return in;
1131 }
1132
1133 /**
1134 * Return the first line from the given input which correspond to the given
1135 * selectors.
1136 *
1137 * @param in
1138 * the input
1139 * @param needle
1140 * a string that must be found inside the target line (also
1141 * supports "^" at start to say "only if it starts with" the
1142 * needle)
1143 * @param relativeLine
1144 * the line to return based upon the target line position (-1 =
1145 * the line before, 0 = the target line...)
1146 *
1147 * @return the line, or NULL if not found
1148 */
1149 static protected String getLine(InputStream in, String needle,
1150 int relativeLine) {
1151 return getLine(in, needle, relativeLine, true);
1152 }
1153
1154 /**
1155 * Return a line from the given input which correspond to the given
1156 * selectors.
1157 *
1158 * @param in
1159 * the input
1160 * @param needle
1161 * a string that must be found inside the target line (also
1162 * supports "^" at start to say "only if it starts with" the
1163 * needle)
1164 * @param relativeLine
1165 * the line to return based upon the target line position (-1 =
1166 * the line before, 0 = the target line...)
1167 * @param first
1168 * takes the first result (as opposed to the last one, which will
1169 * also always spend the input)
1170 *
1171 * @return the line, or NULL if not found
1172 */
1173 static protected String getLine(InputStream in, String needle,
1174 int relativeLine, boolean first) {
1175 String rep = null;
1176
1177 reset(in);
1178
1179 List<String> lines = new ArrayList<String>();
1180 @SuppressWarnings("resource")
1181 Scanner scan = new Scanner(in, "UTF-8");
1182 int index = -1;
1183 scan.useDelimiter("\\n");
1184 while (scan.hasNext()) {
1185 lines.add(scan.next());
1186
1187 if (index == -1) {
1188 if (needle.startsWith("^")) {
1189 if (lines.get(lines.size() - 1).startsWith(
1190 needle.substring(1))) {
1191 index = lines.size() - 1;
1192 }
1193
1194 } else {
1195 if (lines.get(lines.size() - 1).contains(needle)) {
1196 index = lines.size() - 1;
1197 }
1198 }
1199 }
1200
1201 if (index >= 0 && index + relativeLine < lines.size()) {
1202 rep = lines.get(index + relativeLine);
1203 if (first) {
1204 break;
1205 }
1206 }
1207 }
1208
1209 return rep;
1210 }
1211
1212 /**
1213 * Return the text between the key and the endKey (and optional subKey can
1214 * be passed, in this case we will look for the key first, then take the
1215 * text between the subKey and the endKey).
1216 * <p>
1217 * Will only match the first line with the given key if more than one are
1218 * possible. Which also means that if the subKey or endKey is not found on
1219 * that line, NULL will be returned.
1220 *
1221 * @param in
1222 * the input
1223 * @param key
1224 * the key to match (also supports "^" at start to say
1225 * "only if it starts with" the key)
1226 * @param subKey
1227 * the sub key or NULL if none
1228 * @param endKey
1229 * the end key or NULL for "up to the end"
1230 * @return the text or NULL if not found
1231 */
1232 static protected String getKeyLine(InputStream in, String key,
1233 String subKey, String endKey) {
1234 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1235 }
1236
1237 /**
1238 * Return the text between the key and the endKey (and optional subKey can
1239 * be passed, in this case we will look for the key first, then take the
1240 * text between the subKey and the endKey).
1241 *
1242 * @param in
1243 * the input
1244 * @param key
1245 * the key to match (also supports "^" at start to say
1246 * "only if it starts with" the key)
1247 * @param subKey
1248 * the sub key or NULL if none
1249 * @param endKey
1250 * the end key or NULL for "up to the end"
1251 * @return the text or NULL if not found
1252 */
1253 static protected String getKeyText(String in, String key, String subKey,
1254 String endKey) {
1255 String result = null;
1256
1257 String line = in;
1258 if (line != null && line.contains(key)) {
1259 line = line.substring(line.indexOf(key) + key.length());
1260 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1261 if (subKey != null) {
1262 line = line.substring(line.indexOf(subKey)
1263 + subKey.length());
1264 }
1265 if (endKey == null || line.contains(endKey)) {
1266 if (endKey != null) {
1267 line = line.substring(0, line.indexOf(endKey));
1268 result = line;
1269 }
1270 }
1271 }
1272 }
1273
1274 return result;
1275 }
1276
1277 /**
1278 * Return the text between the key and the endKey (optional subKeys can be
1279 * passed, in this case we will look for the subKeys first, then take the
1280 * text between the key and the endKey).
1281 *
1282 * @param in
1283 * the input
1284 * @param key
1285 * the key to match
1286 * @param endKey
1287 * the end key or NULL for "up to the end"
1288 * @param afters
1289 * the sub-keys to find before checking for key/endKey
1290 *
1291 * @return the text or NULL if not found
1292 */
1293 static protected String getKeyTextAfter(String in, String key,
1294 String endKey, String... afters) {
1295
1296 if (in != null && !in.isEmpty()) {
1297 int pos = indexOfAfter(in, 0, afters);
1298 if (pos < 0) {
1299 return null;
1300 }
1301
1302 in = in.substring(pos);
1303 }
1304
1305 return getKeyText(in, key, null, endKey);
1306 }
1307
1308 /**
1309 * Return the first index after all the given "afters" have been found in
1310 * the {@link String}, or -1 if it was not possible.
1311 *
1312 * @param in
1313 * the input
1314 * @param startAt
1315 * start at this position in the string
1316 * @param afters
1317 * the sub-keys to find before checking for key/endKey
1318 *
1319 * @return the text or NULL if not found
1320 */
1321 static protected int indexOfAfter(String in, int startAt, String... afters) {
1322 int pos = -1;
1323 if (in != null && !in.isEmpty()) {
1324 pos = startAt;
1325 if (afters != null) {
1326 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1327 String subKey = afters[i];
1328 if (!subKey.isEmpty()) {
1329 pos = in.indexOf(subKey, pos);
1330 if (pos >= 0) {
1331 pos += subKey.length();
1332 }
1333 }
1334 }
1335 }
1336 }
1337
1338 return pos;
1339 }
1340 }