do not allow empty cover images
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupport_Deprecated.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.File;
6 import java.io.IOException;
7 import java.io.InputStream;
8 import java.io.InputStreamReader;
9 import java.net.MalformedURLException;
10 import java.net.URL;
11 import java.util.ArrayList;
12 import java.util.Date;
13 import java.util.List;
14 import java.util.Map.Entry;
15 import java.util.Scanner;
16
17 import be.nikiroo.fanfix.Instance;
18 import be.nikiroo.fanfix.bundles.Config;
19 import be.nikiroo.fanfix.bundles.StringId;
20 import be.nikiroo.fanfix.data.Chapter;
21 import be.nikiroo.fanfix.data.MetaData;
22 import be.nikiroo.fanfix.data.Paragraph;
23 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
24 import be.nikiroo.fanfix.data.Story;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * DEPRECATED: use the new Jsoup 'Node' system.
31 * <p>
32 * This class is the base class used by the other support classes. It can be
33 * used outside of this package, and have static method that you can use to get
34 * access to the correct support class.
35 * <p>
36 * It will be used with 'resources' (usually web pages or files).
37 *
38 * @author niki
39 */
40 @Deprecated
41 public abstract class BasicSupport_Deprecated extends BasicSupport {
42 private InputStream in;
43
44 // quote chars
45 private char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
46 private char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
47 private char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
48 private char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
49
50 // New methods not used in Deprecated mode
51 @Override
52 protected String getDesc() throws IOException {
53 throw new RuntimeException("should not be used by legacy code");
54 }
55
56 @Override
57 protected MetaData getMeta() throws IOException {
58 throw new RuntimeException("should not be used by legacy code");
59 }
60
61 @Override
62 protected List<Entry<String, URL>> getChapters(Progress pg)
63 throws IOException {
64 throw new RuntimeException("should not be used by legacy code");
65 }
66
67 @Override
68 protected String getChapterContent(URL chapUrl, int number, Progress pg)
69 throws IOException {
70 throw new RuntimeException("should not be used by legacy code");
71 }
72
73 @Override
74 public Story process(Progress pg) throws IOException {
75 return process(getSource(), pg);
76 }
77
78 //
79
80 /**
81 * Return the {@link MetaData} of this story.
82 *
83 * @param source
84 * the source of the story
85 * @param in
86 * the input (the main resource)
87 *
88 * @return the associated {@link MetaData}, never NULL
89 *
90 * @throws IOException
91 * in case of I/O error
92 */
93 protected abstract MetaData getMeta(URL source, InputStream in)
94 throws IOException;
95
96 /**
97 * Return the story description.
98 *
99 * @param source
100 * the source of the story
101 * @param in
102 * the input (the main resource)
103 *
104 * @return the description
105 *
106 * @throws IOException
107 * in case of I/O error
108 */
109 protected abstract String getDesc(URL source, InputStream in)
110 throws IOException;
111
112 /**
113 * Return the list of chapters (name and resource).
114 *
115 * @param source
116 * the source of the story
117 * @param in
118 * the input (the main resource)
119 * @param pg
120 * the optional progress reporter
121 *
122 * @return the chapters
123 *
124 * @throws IOException
125 * in case of I/O error
126 */
127 protected abstract List<Entry<String, URL>> getChapters(URL source,
128 InputStream in, Progress pg) throws IOException;
129
130 /**
131 * Return the content of the chapter (possibly HTML encoded, if
132 * {@link BasicSupport_Deprecated#isHtml()} is TRUE).
133 *
134 * @param source
135 * the source of the story
136 * @param in
137 * the input (the main resource)
138 * @param number
139 * the chapter number
140 * @param pg
141 * the optional progress reporter
142 *
143 * @return the content
144 *
145 * @throws IOException
146 * in case of I/O error
147 */
148 protected abstract String getChapterContent(URL source, InputStream in,
149 int number, Progress pg) throws IOException;
150
151 /**
152 * Process the given story resource into a partially filled {@link Story}
153 * object containing the name and metadata, except for the description.
154 *
155 * @param url
156 * the story resource
157 *
158 * @return the {@link Story}
159 *
160 * @throws IOException
161 * in case of I/O error
162 */
163 public Story processMeta(URL url) throws IOException {
164 return processMeta(url, true, false, null);
165 }
166
167 /**
168 * Process the given story resource into a partially filled {@link Story}
169 * object containing the name and metadata.
170 *
171 * @param url
172 * the story resource
173 * @param close
174 * close "this" and "in" when done
175 * @param getDesc
176 * retrieve the description of the story, or not
177 * @param pg
178 * the optional progress reporter
179 *
180 * @return the {@link Story}, never NULL
181 *
182 * @throws IOException
183 * in case of I/O error
184 */
185 protected Story processMeta(URL url, boolean close, boolean getDesc,
186 Progress pg) throws IOException {
187 if (pg == null) {
188 pg = new Progress();
189 } else {
190 pg.setMinMax(0, 100);
191 }
192
193 login();
194 pg.setProgress(10);
195
196 url = getCanonicalUrl(url);
197
198 setCurrentReferer(url);
199
200 in = openInput(url); // NULL allowed here
201 try {
202 preprocess(url, getInput());
203 pg.setProgress(30);
204
205 Story story = new Story();
206 MetaData meta = getMeta(url, getInput());
207 if (meta.getCreationDate() == null
208 || meta.getCreationDate().trim().isEmpty()) {
209 meta.setCreationDate(bsHelper.formatDate(
210 StringUtils.fromTime(new Date().getTime())));
211 }
212 story.setMeta(meta);
213 pg.put("meta", meta);
214
215 pg.setProgress(50);
216
217 if (meta.getCover() == null) {
218 meta.setCover(getDefaultCover(meta.getSubject()));
219 }
220
221 pg.setProgress(60);
222
223 if (getDesc) {
224 String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
225 story.getMeta().setResume(makeChapter(url, 0, descChapterName, getDesc(url, getInput()), null));
226 }
227
228 pg.setProgress(100);
229 return story;
230 } finally {
231 if (close) {
232 close();
233
234 if (in != null) {
235 in.close();
236 }
237 }
238 }
239 }
240
241 /**
242 * Process the given story resource into a fully filled {@link Story}
243 * object.
244 *
245 * @param url
246 * the story resource
247 * @param pg
248 * the optional progress reporter
249 *
250 * @return the {@link Story}, never NULL
251 *
252 * @throws IOException
253 * in case of I/O error
254 */
255 protected Story process(URL url, Progress pg) throws IOException {
256 if (pg == null) {
257 pg = new Progress();
258 } else {
259 pg.setMinMax(0, 100);
260 }
261
262 url = getCanonicalUrl(url);
263 pg.setProgress(1);
264 try {
265 Progress pgMeta = new Progress();
266 pg.addProgress(pgMeta, 10);
267 Story story = processMeta(url, false, true, pgMeta);
268 pg.put("meta", story.getMeta());
269 if (!pgMeta.isDone()) {
270 pgMeta.setProgress(pgMeta.getMax()); // 10%
271 }
272
273 setCurrentReferer(url);
274
275 Progress pgGetChapters = new Progress();
276 pg.addProgress(pgGetChapters, 10);
277 story.setChapters(new ArrayList<Chapter>());
278 List<Entry<String, URL>> chapters = getChapters(url, getInput(),
279 pgGetChapters);
280 if (!pgGetChapters.isDone()) {
281 pgGetChapters.setProgress(pgGetChapters.getMax()); // 20%
282 }
283
284 if (chapters != null) {
285 Progress pgChaps = new Progress("Extracting chapters", 0,
286 chapters.size() * 300);
287 pg.addProgress(pgChaps, 80);
288
289 long words = 0;
290 int i = 1;
291 for (Entry<String, URL> chap : chapters) {
292 pgChaps.setName("Extracting chapter " + i);
293 InputStream chapIn = null;
294 if (chap.getValue() != null) {
295 setCurrentReferer(chap.getValue());
296 chapIn = Instance.getInstance().getCache().open(chap.getValue(), this, false);
297 }
298 pgChaps.setProgress(i * 100);
299 try {
300 Progress pgGetChapterContent = new Progress();
301 Progress pgMakeChapter = new Progress();
302 pgChaps.addProgress(pgGetChapterContent, 100);
303 pgChaps.addProgress(pgMakeChapter, 100);
304
305 String content = getChapterContent(url, chapIn, i,
306 pgGetChapterContent);
307 if (!pgGetChapterContent.isDone()) {
308 pgGetChapterContent.setProgress(pgGetChapterContent
309 .getMax());
310 }
311
312 Chapter cc = makeChapter(url, i, chap.getKey(),
313 content, pgMakeChapter);
314 if (!pgMakeChapter.isDone()) {
315 pgMakeChapter.setProgress(pgMakeChapter.getMax());
316 }
317
318 words += cc.getWords();
319 story.getChapters().add(cc);
320 } finally {
321 if (chapIn != null) {
322 chapIn.close();
323 }
324 }
325
326 i++;
327 }
328
329 story.getMeta().setWords(words);
330
331 pgChaps.setName("Extracting chapters");
332 } else {
333 pg.setProgress(80);
334 }
335
336 // Check for "no chapters" stories
337 if (story.getChapters().isEmpty()
338 && story.getMeta().getResume() != null
339 && !story.getMeta().getResume().getParagraphs().isEmpty()) {
340 Chapter resume = story.getMeta().getResume();
341 resume.setName("");
342 resume.setNumber(1);
343 story.getChapters().add(resume);
344 story.getMeta().setWords(resume.getWords());
345
346 String descChapterName = Instance.getInstance().getTrans()
347 .getString(StringId.DESCRIPTION);
348 resume = new Chapter(0, descChapterName);
349 story.getMeta().setResume(resume);
350 }
351
352 return story;
353 } finally {
354 close();
355
356 if (in != null) {
357 in.close();
358 }
359 }
360 }
361
362 /**
363 * Prepare the support if needed before processing.
364 *
365 * @param source
366 * the source of the story
367 * @param in
368 * the input (the main resource)
369 *
370 * @throws IOException
371 * on I/O error
372 */
373 @SuppressWarnings("unused")
374 protected void preprocess(URL source, InputStream in) throws IOException {
375 }
376
377 /**
378 * Create a {@link Chapter} object from the given information, formatting
379 * the content as it should be.
380 *
381 * @param source
382 * the source of the story
383 * @param number
384 * the chapter number
385 * @param name
386 * the chapter name
387 * @param content
388 * the chapter content
389 * @param pg
390 * the optional progress reporter
391 *
392 * @return the {@link Chapter}, never NULL
393 *
394 * @throws IOException
395 * in case of I/O error
396 */
397 protected Chapter makeChapter(URL source, int number, String name,
398 String content, Progress pg) throws IOException {
399 // Chapter name: process it correctly, then remove the possible
400 // redundant "Chapter x: " in front of it, or "-" (as in
401 // "Chapter 5: - Fun!" after the ": " was automatically added)
402 String chapterName = processPara(name).getContent().trim();
403 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
404 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
405 if (chapterName.startsWith(chapterWord)) {
406 chapterName = chapterName.substring(chapterWord.length())
407 .trim();
408 break;
409 }
410 }
411
412 if (chapterName.startsWith(Integer.toString(number))) {
413 chapterName = chapterName.substring(
414 Integer.toString(number).length()).trim();
415 }
416
417 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
418 chapterName = chapterName.substring(1).trim();
419 }
420 //
421
422 Chapter chap = new Chapter(number, chapterName);
423
424 if (content != null) {
425 List<Paragraph> paras = makeParagraphs(source, content, pg);
426 long words = 0;
427 for (Paragraph para : paras) {
428 words += para.getWords();
429 }
430 chap.setParagraphs(paras);
431 chap.setWords(words);
432 }
433
434 return chap;
435
436 }
437
438 /**
439 * Convert the given content into {@link Paragraph}s.
440 *
441 * @param source
442 * the source URL of the story
443 * @param content
444 * the textual content
445 * @param pg
446 * the optional progress reporter
447 *
448 * @return the {@link Paragraph}s (can be empty, but never NULL)
449 *
450 * @throws IOException
451 * in case of I/O error
452 */
453 protected List<Paragraph> makeParagraphs(URL source, String content,
454 Progress pg) throws IOException {
455 if (pg == null) {
456 pg = new Progress();
457 }
458
459 if (isHtml()) {
460 // Special <HR> processing:
461 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
462 "<br/>* * *<br/>");
463 }
464
465 List<Paragraph> paras = new ArrayList<Paragraph>();
466 if (content != null && !content.trim().isEmpty()) {
467 if (isHtml()) {
468 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
469 pg.setMinMax(0, tab.length);
470 int i = 1;
471 for (String line : tab) {
472 if (line.startsWith("[") && line.endsWith("]")) {
473 pg.setName("Extracting image " + i);
474 }
475 paras.add(makeParagraph(source, line.trim()));
476 pg.setProgress(i++);
477 }
478 pg.setName(null);
479 } else {
480 List<String> lines = new ArrayList<String>();
481 BufferedReader buff = null;
482 try {
483 buff = new BufferedReader(
484 new InputStreamReader(new ByteArrayInputStream(
485 content.getBytes("UTF-8")), "UTF-8"));
486 for (String line = buff.readLine(); line != null; line = buff
487 .readLine()) {
488 lines.add(line.trim());
489 }
490 } finally {
491 if (buff != null) {
492 buff.close();
493 }
494 }
495
496 pg.setMinMax(0, lines.size());
497 int i = 0;
498 for (String line : lines) {
499 if (line.startsWith("[") && line.endsWith("]")) {
500 pg.setName("Extracting image " + i);
501 }
502 paras.add(makeParagraph(source, line));
503 pg.setProgress(i++);
504 }
505 pg.setName(null);
506 }
507
508 // Check quotes for "bad" format
509 List<Paragraph> newParas = new ArrayList<Paragraph>();
510 for (Paragraph para : paras) {
511 newParas.addAll(requotify(para));
512 }
513 paras = newParas;
514
515 // Remove double blanks/brks
516 fixBlanksBreaks(paras);
517 }
518
519 return paras;
520 }
521
522 /**
523 * Convert the given line into a single {@link Paragraph}.
524 *
525 * @param source
526 * the source URL of the story
527 * @param line
528 * the textual content of the paragraph
529 *
530 * @return the {@link Paragraph}, never NULL
531 */
532 private Paragraph makeParagraph(URL source, String line) {
533 Image image = null;
534 if (line.startsWith("[") && line.endsWith("]")) {
535 image = getImage(this, source, line.substring(1, line.length() - 1)
536 .trim());
537 }
538
539 if (image != null) {
540 return new Paragraph(image);
541 }
542
543 return processPara(line);
544 }
545
546 /**
547 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
548 * those {@link Paragraph}s.
549 * <p>
550 * The resulting list will not contain a starting or trailing blank/break
551 * nor 2 blanks or breaks following each other.
552 *
553 * @param paras
554 * the list of {@link Paragraph}s to fix
555 */
556 protected void fixBlanksBreaks(List<Paragraph> paras) {
557 boolean space = false;
558 boolean brk = true;
559 for (int i = 0; i < paras.size(); i++) {
560 Paragraph para = paras.get(i);
561 boolean thisSpace = para.getType() == ParagraphType.BLANK;
562 boolean thisBrk = para.getType() == ParagraphType.BREAK;
563
564 if (i > 0 && space && thisBrk) {
565 paras.remove(i - 1);
566 i--;
567 } else if ((space || brk) && (thisSpace || thisBrk)) {
568 paras.remove(i);
569 i--;
570 }
571
572 space = thisSpace;
573 brk = thisBrk;
574 }
575
576 // Remove blank/brk at start
577 if (paras.size() > 0
578 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
579 0).getType() == ParagraphType.BREAK)) {
580 paras.remove(0);
581 }
582
583 // Remove blank/brk at end
584 int last = paras.size() - 1;
585 if (paras.size() > 0
586 && (paras.get(last).getType() == ParagraphType.BLANK || paras
587 .get(last).getType() == ParagraphType.BREAK)) {
588 paras.remove(last);
589 }
590 }
591
592 /**
593 * Get the default cover related to this subject (see <tt>.info</tt> files).
594 *
595 * @param subject
596 * the subject
597 *
598 * @return the cover if any, or NULL
599 */
600 static Image getDefaultCover(String subject) {
601 if (subject != null && !subject.isEmpty() && Instance.getInstance().getCoverDir() != null) {
602 try {
603 File fileCover = new File(Instance.getInstance().getCoverDir(), subject);
604 return getImage(null, fileCover.toURI().toURL(), subject);
605 } catch (MalformedURLException e) {
606 }
607 }
608
609 return null;
610 }
611
612 /**
613 * Return the list of supported image extensions.
614 *
615 * @param emptyAllowed
616 * TRUE to allow an empty extension on first place, which can be
617 * used when you may already have an extension in your input but
618 * are not sure about it
619 *
620 * @return the extensions
621 */
622 static String[] getImageExt(boolean emptyAllowed) {
623 if (emptyAllowed) {
624 return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
625 }
626
627 return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
628 }
629
630 /**
631 * Check if the given resource can be a local image or a remote image, then
632 * refresh the cache with it if it is.
633 *
634 * @param source
635 * the story source
636 * @param line
637 * the resource to check
638 *
639 * @return the image if found, or NULL
640 *
641 */
642 static Image getImage(BasicSupport_Deprecated support, URL source,
643 String line) {
644 URL url = getImageUrl(support, source, line);
645 if (url != null) {
646 if ("file".equals(url.getProtocol())) {
647 if (new File(url.getPath()).isDirectory()) {
648 return null;
649 }
650 }
651 InputStream in = null;
652 try {
653 in = Instance.getInstance().getCache().open(url, getSupport(url), true);
654 Image img = new Image(in);
655 if (img.getSize() == 0) {
656 img.close();
657 throw new IOException(
658 "Empty image not accepted");
659 }
660 return img;
661 } catch (IOException e) {
662 } finally {
663 if (in != null) {
664 try {
665 in.close();
666 } catch (IOException e) {
667 }
668 }
669 }
670 }
671
672 return null;
673 }
674
675 /**
676 * Check if the given resource can be a local image or a remote image, then
677 * refresh the cache with it if it is.
678 *
679 * @param source
680 * the story source
681 * @param line
682 * the resource to check
683 *
684 * @return the image URL if found, or NULL
685 *
686 */
687 static URL getImageUrl(BasicSupport_Deprecated support, URL source,
688 String line) {
689 URL url = null;
690
691 if (line != null) {
692 // try for files
693 if (source != null) {
694 try {
695 String relPath = null;
696 String absPath = null;
697 try {
698 String path = new File(source.getFile()).getParent();
699 relPath = new File(new File(path), line.trim())
700 .getAbsolutePath();
701 } catch (Exception e) {
702 // Cannot be converted to path (one possibility to take
703 // into account: absolute path on Windows)
704 }
705 try {
706 absPath = new File(line.trim()).getAbsolutePath();
707 } catch (Exception e) {
708 // Cannot be converted to path (at all)
709 }
710
711 for (String ext : getImageExt(true)) {
712 File absFile = new File(absPath + ext);
713 File relFile = new File(relPath + ext);
714 if (absPath != null && absFile.exists()
715 && absFile.isFile()) {
716 url = absFile.toURI().toURL();
717 } else if (relPath != null && relFile.exists()
718 && relFile.isFile()) {
719 url = relFile.toURI().toURL();
720 }
721 }
722 } catch (Exception e) {
723 // Should not happen since we control the correct arguments
724 }
725 }
726
727 if (url == null) {
728 // try for URLs
729 try {
730 for (String ext : getImageExt(true)) {
731 if (Instance.getInstance().getCache().check(new URL(line + ext), true)) {
732 url = new URL(line + ext);
733 break;
734 }
735 }
736
737 // try out of cache
738 if (url == null) {
739 for (String ext : getImageExt(true)) {
740 try {
741 url = new URL(line + ext);
742 Instance.getInstance().getCache().refresh(url, support, true);
743 break;
744 } catch (IOException e) {
745 // no image with this ext
746 url = null;
747 }
748 }
749 }
750 } catch (MalformedURLException e) {
751 // Not an url
752 }
753 }
754
755 // refresh the cached file
756 if (url != null) {
757 try {
758 Instance.getInstance().getCache().refresh(url, support, true);
759 } catch (IOException e) {
760 // woops, broken image
761 url = null;
762 }
763 }
764 }
765
766 return url;
767 }
768
769 /**
770 * Open the input file that will be used through the support.
771 * <p>
772 * Can return NULL, in which case you are supposed to work without an
773 * {@link InputStream}.
774 *
775 * @param source
776 * the source {@link URL}
777 *
778 * @return the {@link InputStream}
779 *
780 * @throws IOException
781 * in case of I/O error
782 */
783 protected InputStream openInput(URL source) throws IOException {
784 return Instance.getInstance().getCache().open(source, this, false);
785 }
786
787 /**
788 * Reset then return {@link BasicSupport_Deprecated#in}.
789 *
790 * @return {@link BasicSupport_Deprecated#in}
791 */
792 protected InputStream getInput() {
793 return reset(in);
794 }
795
796 /**
797 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
798 * and requotify them (i.e., separate them into QUOTE paragraphs and other
799 * paragraphs (quotes or not)).
800 *
801 * @param para
802 * the paragraph to requotify (not necessarily a quote)
803 *
804 * @return the correctly (or so we hope) quotified paragraphs
805 */
806 protected List<Paragraph> requotify(Paragraph para) {
807 List<Paragraph> newParas = new ArrayList<Paragraph>();
808
809 if (para.getType() == ParagraphType.QUOTE
810 && para.getContent().length() > 2) {
811 String line = para.getContent();
812 boolean singleQ = line.startsWith("" + openQuote);
813 boolean doubleQ = line.startsWith("" + openDoubleQuote);
814
815 // Do not try when more than one quote at a time
816 // (some stories are not easily readable if we do)
817 if (singleQ
818 && line.indexOf(closeQuote, 1) < line
819 .lastIndexOf(closeQuote)) {
820 newParas.add(para);
821 return newParas;
822 }
823 if (doubleQ
824 && line.indexOf(closeDoubleQuote, 1) < line
825 .lastIndexOf(closeDoubleQuote)) {
826 newParas.add(para);
827 return newParas;
828 }
829 //
830
831 if (!singleQ && !doubleQ) {
832 line = openDoubleQuote + line + closeDoubleQuote;
833 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
834 .getWords()));
835 } else {
836 char open = singleQ ? openQuote : openDoubleQuote;
837 char close = singleQ ? closeQuote : closeDoubleQuote;
838
839 int posDot = -1;
840 boolean inQuote = false;
841 int i = 0;
842 for (char car : line.toCharArray()) {
843 if (car == open) {
844 inQuote = true;
845 } else if (car == close) {
846 inQuote = false;
847 } else if (car == '.' && !inQuote) {
848 posDot = i;
849 break;
850 }
851 i++;
852 }
853
854 if (posDot >= 0) {
855 String rest = line.substring(posDot + 1).trim();
856 line = line.substring(0, posDot + 1).trim();
857 long words = 1;
858 for (char car : line.toCharArray()) {
859 if (car == ' ') {
860 words++;
861 }
862 }
863 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
864 if (!rest.isEmpty()) {
865 newParas.addAll(requotify(processPara(rest)));
866 }
867 } else {
868 newParas.add(para);
869 }
870 }
871 } else {
872 newParas.add(para);
873 }
874
875 return newParas;
876 }
877
878 /**
879 * Process a {@link Paragraph} from a raw line of text.
880 * <p>
881 * Will also fix quotes and HTML encoding if needed.
882 *
883 * @param line
884 * the raw line
885 *
886 * @return the processed {@link Paragraph}, never NULL
887 */
888 protected Paragraph processPara(String line) {
889 line = ifUnhtml(line).trim();
890
891 boolean space = true;
892 boolean brk = true;
893 boolean quote = false;
894 boolean tentativeCloseQuote = false;
895 char prev = '\0';
896 int dashCount = 0;
897 long words = 1;
898
899 StringBuilder builder = new StringBuilder();
900 for (char car : line.toCharArray()) {
901 if (car != '-') {
902 if (dashCount > 0) {
903 // dash, ndash and mdash: - – —
904 // currently: always use mdash
905 builder.append(dashCount == 1 ? '-' : '—');
906 }
907 dashCount = 0;
908 }
909
910 if (tentativeCloseQuote) {
911 tentativeCloseQuote = false;
912 if (Character.isLetterOrDigit(car)) {
913 builder.append("'");
914 } else {
915 // handle double-single quotes as double quotes
916 if (prev == car) {
917 builder.append(closeDoubleQuote);
918 continue;
919 }
920
921 builder.append(closeQuote);
922 }
923 }
924
925 switch (car) {
926 case ' ': // note: unbreakable space
927 case ' ':
928 case '\t':
929 case '\n': // just in case
930 case '\r': // just in case
931 if (builder.length() > 0
932 && builder.charAt(builder.length() - 1) != ' ') {
933 words++;
934 }
935 builder.append(' ');
936 break;
937
938 case '\'':
939 if (space || (brk && quote)) {
940 quote = true;
941 // handle double-single quotes as double quotes
942 if (prev == car) {
943 builder.deleteCharAt(builder.length() - 1);
944 builder.append(openDoubleQuote);
945 } else {
946 builder.append(openQuote);
947 }
948 } else if (prev == ' ' || prev == car) {
949 // handle double-single quotes as double quotes
950 if (prev == car) {
951 builder.deleteCharAt(builder.length() - 1);
952 builder.append(openDoubleQuote);
953 } else {
954 builder.append(openQuote);
955 }
956 } else {
957 // it is a quote ("I'm off") or a 'quote' ("This
958 // 'good' restaurant"...)
959 tentativeCloseQuote = true;
960 }
961 break;
962
963 case '"':
964 if (space || (brk && quote)) {
965 quote = true;
966 builder.append(openDoubleQuote);
967 } else if (prev == ' ') {
968 builder.append(openDoubleQuote);
969 } else {
970 builder.append(closeDoubleQuote);
971 }
972 break;
973
974 case '-':
975 if (space) {
976 quote = true;
977 } else {
978 dashCount++;
979 }
980 space = false;
981 break;
982
983 case '*':
984 case '~':
985 case '/':
986 case '\\':
987 case '<':
988 case '>':
989 case '=':
990 case '+':
991 case '_':
992 case '–':
993 case '—':
994 space = false;
995 builder.append(car);
996 break;
997
998 case '‘':
999 case '`':
1000 case '‹':
1001 case '﹁':
1002 case '〈':
1003 case '「':
1004 if (space || (brk && quote)) {
1005 quote = true;
1006 builder.append(openQuote);
1007 } else {
1008 // handle double-single quotes as double quotes
1009 if (prev == car) {
1010 builder.deleteCharAt(builder.length() - 1);
1011 builder.append(openDoubleQuote);
1012 } else {
1013 builder.append(openQuote);
1014 }
1015 }
1016 space = false;
1017 brk = false;
1018 break;
1019
1020 case '’':
1021 case '›':
1022 case '﹂':
1023 case '〉':
1024 case '」':
1025 space = false;
1026 brk = false;
1027 // handle double-single quotes as double quotes
1028 if (prev == car) {
1029 builder.deleteCharAt(builder.length() - 1);
1030 builder.append(closeDoubleQuote);
1031 } else {
1032 builder.append(closeQuote);
1033 }
1034 break;
1035
1036 case '«':
1037 case '“':
1038 case '﹃':
1039 case '《':
1040 case '『':
1041 if (space || (brk && quote)) {
1042 quote = true;
1043 builder.append(openDoubleQuote);
1044 } else {
1045 builder.append(openDoubleQuote);
1046 }
1047 space = false;
1048 brk = false;
1049 break;
1050
1051 case '»':
1052 case '”':
1053 case '﹄':
1054 case '》':
1055 case '』':
1056 space = false;
1057 brk = false;
1058 builder.append(closeDoubleQuote);
1059 break;
1060
1061 default:
1062 space = false;
1063 brk = false;
1064 builder.append(car);
1065 break;
1066 }
1067
1068 prev = car;
1069 }
1070
1071 if (tentativeCloseQuote) {
1072 tentativeCloseQuote = false;
1073 builder.append(closeQuote);
1074 }
1075
1076 line = builder.toString().trim();
1077
1078 ParagraphType type = ParagraphType.NORMAL;
1079 if (space) {
1080 type = ParagraphType.BLANK;
1081 } else if (brk) {
1082 type = ParagraphType.BREAK;
1083 } else if (quote) {
1084 type = ParagraphType.QUOTE;
1085 }
1086
1087 return new Paragraph(type, line, words);
1088 }
1089
1090 /**
1091 * Remove the HTML from the input <b>if</b>
1092 * {@link BasicSupport_Deprecated#isHtml()} is true.
1093 *
1094 * @param input
1095 * the input
1096 *
1097 * @return the no html version if needed
1098 */
1099 private String ifUnhtml(String input) {
1100 if (isHtml() && input != null) {
1101 return StringUtils.unhtml(input);
1102 }
1103
1104 return input;
1105 }
1106
1107 /**
1108 * Reset the given {@link InputStream} and return it.
1109 *
1110 * @param in
1111 * the {@link InputStream} to reset
1112 *
1113 * @return the same {@link InputStream} after reset
1114 */
1115 static protected InputStream reset(InputStream in) {
1116 try {
1117 if (in != null) {
1118 in.reset();
1119 }
1120 } catch (IOException e) {
1121 }
1122
1123 return in;
1124 }
1125
1126 /**
1127 * Return the first line from the given input which correspond to the given
1128 * selectors.
1129 *
1130 * @param in
1131 * the input
1132 * @param needle
1133 * a string that must be found inside the target line (also
1134 * supports "^" at start to say "only if it starts with" the
1135 * needle)
1136 * @param relativeLine
1137 * the line to return based upon the target line position (-1 =
1138 * the line before, 0 = the target line...)
1139 *
1140 * @return the line, or NULL if not found
1141 */
1142 static protected String getLine(InputStream in, String needle,
1143 int relativeLine) {
1144 return getLine(in, needle, relativeLine, true);
1145 }
1146
1147 /**
1148 * Return a line from the given input which correspond to the given
1149 * selectors.
1150 *
1151 * @param in
1152 * the input
1153 * @param needle
1154 * a string that must be found inside the target line (also
1155 * supports "^" at start to say "only if it starts with" the
1156 * needle)
1157 * @param relativeLine
1158 * the line to return based upon the target line position (-1 =
1159 * the line before, 0 = the target line...)
1160 * @param first
1161 * takes the first result (as opposed to the last one, which will
1162 * also always spend the input)
1163 *
1164 * @return the line, or NULL if not found
1165 */
1166 static protected String getLine(InputStream in, String needle,
1167 int relativeLine, boolean first) {
1168 String rep = null;
1169
1170 reset(in);
1171
1172 List<String> lines = new ArrayList<String>();
1173 @SuppressWarnings("resource")
1174 Scanner scan = new Scanner(in, "UTF-8");
1175 int index = -1;
1176 scan.useDelimiter("\\n");
1177 while (scan.hasNext()) {
1178 lines.add(scan.next());
1179
1180 if (index == -1) {
1181 if (needle.startsWith("^")) {
1182 if (lines.get(lines.size() - 1).startsWith(
1183 needle.substring(1))) {
1184 index = lines.size() - 1;
1185 }
1186
1187 } else {
1188 if (lines.get(lines.size() - 1).contains(needle)) {
1189 index = lines.size() - 1;
1190 }
1191 }
1192 }
1193
1194 if (index >= 0 && index + relativeLine < lines.size()) {
1195 rep = lines.get(index + relativeLine);
1196 if (first) {
1197 break;
1198 }
1199 }
1200 }
1201
1202 return rep;
1203 }
1204
1205 /**
1206 * Return the text between the key and the endKey (and optional subKey can
1207 * be passed, in this case we will look for the key first, then take the
1208 * text between the subKey and the endKey).
1209 * <p>
1210 * Will only match the first line with the given key if more than one are
1211 * possible. Which also means that if the subKey or endKey is not found on
1212 * that line, NULL will be returned.
1213 *
1214 * @param in
1215 * the input
1216 * @param key
1217 * the key to match (also supports "^" at start to say
1218 * "only if it starts with" the key)
1219 * @param subKey
1220 * the sub key or NULL if none
1221 * @param endKey
1222 * the end key or NULL for "up to the end"
1223 * @return the text or NULL if not found
1224 */
1225 static protected String getKeyLine(InputStream in, String key,
1226 String subKey, String endKey) {
1227 return getKeyText(getLine(in, key, 0), key, subKey, endKey);
1228 }
1229
1230 /**
1231 * Return the text between the key and the endKey (and optional subKey can
1232 * be passed, in this case we will look for the key first, then take the
1233 * text between the subKey and the endKey).
1234 *
1235 * @param in
1236 * the input
1237 * @param key
1238 * the key to match (also supports "^" at start to say
1239 * "only if it starts with" the key)
1240 * @param subKey
1241 * the sub key or NULL if none
1242 * @param endKey
1243 * the end key or NULL for "up to the end"
1244 * @return the text or NULL if not found
1245 */
1246 static protected String getKeyText(String in, String key, String subKey,
1247 String endKey) {
1248 String result = null;
1249
1250 String line = in;
1251 if (line != null && line.contains(key)) {
1252 line = line.substring(line.indexOf(key) + key.length());
1253 if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
1254 if (subKey != null) {
1255 line = line.substring(line.indexOf(subKey)
1256 + subKey.length());
1257 }
1258 if (endKey == null || line.contains(endKey)) {
1259 if (endKey != null) {
1260 line = line.substring(0, line.indexOf(endKey));
1261 result = line;
1262 }
1263 }
1264 }
1265 }
1266
1267 return result;
1268 }
1269
1270 /**
1271 * Return the text between the key and the endKey (optional subKeys can be
1272 * passed, in this case we will look for the subKeys first, then take the
1273 * text between the key and the endKey).
1274 *
1275 * @param in
1276 * the input
1277 * @param key
1278 * the key to match
1279 * @param endKey
1280 * the end key or NULL for "up to the end"
1281 * @param afters
1282 * the sub-keys to find before checking for key/endKey
1283 *
1284 * @return the text or NULL if not found
1285 */
1286 static protected String getKeyTextAfter(String in, String key,
1287 String endKey, String... afters) {
1288
1289 if (in != null && !in.isEmpty()) {
1290 int pos = indexOfAfter(in, 0, afters);
1291 if (pos < 0) {
1292 return null;
1293 }
1294
1295 in = in.substring(pos);
1296 }
1297
1298 return getKeyText(in, key, null, endKey);
1299 }
1300
1301 /**
1302 * Return the first index after all the given "afters" have been found in
1303 * the {@link String}, or -1 if it was not possible.
1304 *
1305 * @param in
1306 * the input
1307 * @param startAt
1308 * start at this position in the string
1309 * @param afters
1310 * the sub-keys to find before checking for key/endKey
1311 *
1312 * @return the text or NULL if not found
1313 */
1314 static protected int indexOfAfter(String in, int startAt, String... afters) {
1315 int pos = -1;
1316 if (in != null && !in.isEmpty()) {
1317 pos = startAt;
1318 if (afters != null) {
1319 for (int i = 0; pos >= 0 && i < afters.length; i++) {
1320 String subKey = afters[i];
1321 if (!subKey.isEmpty()) {
1322 pos = in.indexOf(subKey, pos);
1323 if (pos >= 0) {
1324 pos += subKey.length();
1325 }
1326 }
1327 }
1328 }
1329 }
1330
1331 return pos;
1332 }
1333 }