Support for no-chapter stories or stories with description before Chapter
[nikiroo-utils.git] / supported / BasicSupportPara.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.URL;
8 import java.util.ArrayList;
9 import java.util.List;
10
11 import be.nikiroo.fanfix.Instance;
12 import be.nikiroo.fanfix.bundles.Config;
13 import be.nikiroo.fanfix.bundles.StringId;
14 import be.nikiroo.fanfix.data.Chapter;
15 import be.nikiroo.fanfix.data.Paragraph;
16 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17 import be.nikiroo.utils.Image;
18 import be.nikiroo.utils.Progress;
19 import be.nikiroo.utils.StringUtils;
20
21 /**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
23 * and text formatting for the {@link BasicSupport} class.
24 *
25 * @author niki
26 */
27 public class BasicSupportPara {
28 // quote chars
29 private static char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
30 private static char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
31 private static char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
32 private static char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
33
34 // used by this class:
35 BasicSupportHelper bsHelper;
36 BasicSupportImages bsImages;
37
38 public BasicSupportPara(BasicSupportHelper bsHelper, BasicSupportImages bsImages) {
39 this.bsHelper = bsHelper;
40 this.bsImages = bsImages;
41 }
42
43 /**
44 * Create a {@link Chapter} object from the given information, formatting
45 * the content as it should be.
46 *
47 * @param support
48 * the linked {@link BasicSupport}
49 * @param source
50 * the source of the story (for image lookup in the same path if
51 * the source is a file, can be NULL)
52 * @param number
53 * the chapter number
54 * @param name
55 * the chapter name
56 * @param content
57 * the chapter content
58 * @param pg
59 * the optional progress reporter
60 * @param html
61 * TRUE if the input content is in HTML mode
62 *
63 * @return the {@link Chapter}, never NULL
64 *
65 * @throws IOException
66 * in case of I/O error
67 */
68 public Chapter makeChapter(BasicSupport support, URL source,
69 int number, String name, String content, boolean html, Progress pg)
70 throws IOException {
71 // Chapter name: process it correctly, then remove the possible
72 // redundant "Chapter x: " in front of it, or "-" (as in
73 // "Chapter 5: - Fun!" after the ": " was automatically added)
74 String chapterName = processPara(name, false)
75 .getContent().trim();
76 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
77 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
78 if (chapterName.startsWith(chapterWord)) {
79 chapterName = chapterName.substring(chapterWord.length())
80 .trim();
81 break;
82 }
83 }
84
85 if (chapterName.startsWith(Integer.toString(number))) {
86 chapterName = chapterName.substring(
87 Integer.toString(number).length()).trim();
88 }
89
90 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
91 chapterName = chapterName.substring(1).trim();
92 }
93 //
94
95 Chapter chap = new Chapter(number, chapterName);
96
97 if (content != null) {
98 List<Paragraph> paras = makeParagraphs(support, source, content,
99 html, pg);
100 long words = 0;
101 for (Paragraph para : paras) {
102 words += para.getWords();
103 }
104 chap.setParagraphs(paras);
105 chap.setWords(words);
106 }
107
108 return chap;
109 }
110
111 /**
112 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
113 * and requotify them (i.e., separate them into QUOTE paragraphs and other
114 * paragraphs (quotes or not)).
115 *
116 * @param para
117 * the paragraph to requotify (not necessarily a quote)
118 * @param html
119 * TRUE if the input content is in HTML mode
120 *
121 * @return the correctly (or so we hope) quotified paragraphs
122 */
123 protected List<Paragraph> requotify(Paragraph para, boolean html) {
124 List<Paragraph> newParas = new ArrayList<Paragraph>();
125
126 if (para.getType() == ParagraphType.QUOTE
127 && para.getContent().length() > 2) {
128 String line = para.getContent();
129 boolean singleQ = line.startsWith("" + openQuote);
130 boolean doubleQ = line.startsWith("" + openDoubleQuote);
131
132 // Do not try when more than one quote at a time
133 // (some stories are not easily readable if we do)
134 if (singleQ
135 && line.indexOf(closeQuote, 1) < line
136 .lastIndexOf(closeQuote)) {
137 newParas.add(para);
138 return newParas;
139 }
140 if (doubleQ
141 && line.indexOf(closeDoubleQuote, 1) < line
142 .lastIndexOf(closeDoubleQuote)) {
143 newParas.add(para);
144 return newParas;
145 }
146 //
147
148 if (!singleQ && !doubleQ) {
149 line = openDoubleQuote + line + closeDoubleQuote;
150 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
151 .getWords()));
152 } else {
153 char open = singleQ ? openQuote : openDoubleQuote;
154 char close = singleQ ? closeQuote : closeDoubleQuote;
155
156 int posDot = -1;
157 boolean inQuote = false;
158 int i = 0;
159 for (char car : line.toCharArray()) {
160 if (car == open) {
161 inQuote = true;
162 } else if (car == close) {
163 inQuote = false;
164 } else if (car == '.' && !inQuote) {
165 posDot = i;
166 break;
167 }
168 i++;
169 }
170
171 if (posDot >= 0) {
172 String rest = line.substring(posDot + 1).trim();
173 line = line.substring(0, posDot + 1).trim();
174 long words = 1;
175 for (char car : line.toCharArray()) {
176 if (car == ' ') {
177 words++;
178 }
179 }
180 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
181 if (!rest.isEmpty()) {
182 newParas.addAll(requotify(processPara(rest, html), html));
183 }
184 } else {
185 newParas.add(para);
186 }
187 }
188 } else {
189 newParas.add(para);
190 }
191
192 return newParas;
193 }
194
195 /**
196 * Process a {@link Paragraph} from a raw line of text.
197 * <p>
198 * Will also fix quotes and HTML encoding if needed.
199 *
200 * @param line
201 * the raw line
202 * @param html
203 * TRUE if the input content is in HTML mode
204 *
205 * @return the processed {@link Paragraph}, never NULL
206 */
207 protected Paragraph processPara(String line, boolean html) {
208 if (html) {
209 line = StringUtils.unhtml(line).trim();
210 }
211 boolean space = true;
212 boolean brk = true;
213 boolean quote = false;
214 boolean tentativeCloseQuote = false;
215 char prev = '\0';
216 int dashCount = 0;
217 long words = 1;
218
219 StringBuilder builder = new StringBuilder();
220 for (char car : line.toCharArray()) {
221 if (car != '-') {
222 if (dashCount > 0) {
223 // dash, ndash and mdash: - – —
224 // currently: always use mdash
225 builder.append(dashCount == 1 ? '-' : '—');
226 }
227 dashCount = 0;
228 }
229
230 if (tentativeCloseQuote) {
231 tentativeCloseQuote = false;
232 if (Character.isLetterOrDigit(car)) {
233 builder.append("'");
234 } else {
235 // handle double-single quotes as double quotes
236 if (prev == car) {
237 builder.append(closeDoubleQuote);
238 continue;
239 }
240
241 builder.append(closeQuote);
242 }
243 }
244
245 switch (car) {
246 case ' ': // note: unbreakable space
247 case ' ':
248 case '\t':
249 case '\n': // just in case
250 case '\r': // just in case
251 if (builder.length() > 0
252 && builder.charAt(builder.length() - 1) != ' ') {
253 words++;
254 }
255 builder.append(' ');
256 break;
257
258 case '\'':
259 if (space || (brk && quote)) {
260 quote = true;
261 // handle double-single quotes as double quotes
262 if (prev == car) {
263 builder.deleteCharAt(builder.length() - 1);
264 builder.append(openDoubleQuote);
265 } else {
266 builder.append(openQuote);
267 }
268 } else if (prev == ' ' || prev == car) {
269 // handle double-single quotes as double quotes
270 if (prev == car) {
271 builder.deleteCharAt(builder.length() - 1);
272 builder.append(openDoubleQuote);
273 } else {
274 builder.append(openQuote);
275 }
276 } else {
277 // it is a quote ("I'm off") or a 'quote' ("This
278 // 'good' restaurant"...)
279 tentativeCloseQuote = true;
280 }
281 break;
282
283 case '"':
284 if (space || (brk && quote)) {
285 quote = true;
286 builder.append(openDoubleQuote);
287 } else if (prev == ' ') {
288 builder.append(openDoubleQuote);
289 } else {
290 builder.append(closeDoubleQuote);
291 }
292 break;
293
294 case '-':
295 if (space) {
296 quote = true;
297 } else {
298 dashCount++;
299 }
300 space = false;
301 break;
302
303 case '*':
304 case '~':
305 case '/':
306 case '\\':
307 case '<':
308 case '>':
309 case '=':
310 case '+':
311 case '_':
312 case '–':
313 case '—':
314 space = false;
315 builder.append(car);
316 break;
317
318 case '‘':
319 case '`':
320 case '‹':
321 case '﹁':
322 case '〈':
323 case '「':
324 if (space || (brk && quote)) {
325 quote = true;
326 builder.append(openQuote);
327 } else {
328 // handle double-single quotes as double quotes
329 if (prev == car) {
330 builder.deleteCharAt(builder.length() - 1);
331 builder.append(openDoubleQuote);
332 } else {
333 builder.append(openQuote);
334 }
335 }
336 space = false;
337 brk = false;
338 break;
339
340 case '’':
341 case '›':
342 case '﹂':
343 case '〉':
344 case '」':
345 space = false;
346 brk = false;
347 // handle double-single quotes as double quotes
348 if (prev == car) {
349 builder.deleteCharAt(builder.length() - 1);
350 builder.append(closeDoubleQuote);
351 } else {
352 builder.append(closeQuote);
353 }
354 break;
355
356 case '«':
357 case '“':
358 case '﹃':
359 case '《':
360 case '『':
361 if (space || (brk && quote)) {
362 quote = true;
363 builder.append(openDoubleQuote);
364 } else {
365 builder.append(openDoubleQuote);
366 }
367 space = false;
368 brk = false;
369 break;
370
371 case '»':
372 case '”':
373 case '﹄':
374 case '》':
375 case '』':
376 space = false;
377 brk = false;
378 builder.append(closeDoubleQuote);
379 break;
380
381 default:
382 space = false;
383 brk = false;
384 builder.append(car);
385 break;
386 }
387
388 prev = car;
389 }
390
391 if (tentativeCloseQuote) {
392 tentativeCloseQuote = false;
393 builder.append(closeQuote);
394 }
395
396 line = builder.toString().trim();
397
398 ParagraphType type = ParagraphType.NORMAL;
399 if (space) {
400 type = ParagraphType.BLANK;
401 } else if (brk) {
402 type = ParagraphType.BREAK;
403 } else if (quote) {
404 type = ParagraphType.QUOTE;
405 }
406
407 return new Paragraph(type, line, words);
408 }
409
410 /**
411 * Convert the given content into {@link Paragraph}s.
412 *
413 * @param support
414 * the linked {@link BasicSupport} (can be NULL), used to
415 * download optional image content in []
416 * @param source
417 * the source URL of the story (for image lookup in the same path
418 * if the source is a file, can be NULL)
419 * @param content
420 * the textual content
421 * @param html
422 * TRUE if the input content is in HTML mode
423 * @param pg
424 * the optional progress reporter
425 *
426 * @return the {@link Paragraph}s (can be empty but never NULL)
427 *
428 * @throws IOException
429 * in case of I/O error
430 */
431 protected List<Paragraph> makeParagraphs(BasicSupport support,
432 URL source, String content, boolean html, Progress pg)
433 throws IOException {
434 if (pg == null) {
435 pg = new Progress();
436 }
437
438 if (html) {
439 // Special <HR> processing:
440 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
441 "<br/>* * *<br/>");
442 }
443
444 List<Paragraph> paras = new ArrayList<Paragraph>();
445
446 if (content != null && !content.trim().isEmpty()) {
447 if (html) {
448 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
449 pg.setMinMax(0, tab.length);
450 int i = 1;
451 for (String line : tab) {
452 if (line.startsWith("[") && line.endsWith("]")) {
453 pg.setName("Extracting image " + i);
454 }
455 paras.add(makeParagraph(support, source, line.trim(), html));
456 pg.setProgress(i++);
457 }
458 } else {
459 List<String> lines = new ArrayList<String>();
460 BufferedReader buff = null;
461 try {
462 buff = new BufferedReader(
463 new InputStreamReader(new ByteArrayInputStream(
464 content.getBytes("UTF-8")), "UTF-8"));
465 for (String line = buff.readLine(); line != null; line = buff
466 .readLine()) {
467 lines.add(line.trim());
468 }
469 } finally {
470 if (buff != null) {
471 buff.close();
472 }
473 }
474
475 pg.setMinMax(0, lines.size());
476 int i = 0;
477 for (String line : lines) {
478 if (line.startsWith("[") && line.endsWith("]")) {
479 pg.setName("Extracting image " + i);
480 }
481 paras.add(makeParagraph(support, source, line, html));
482 pg.setProgress(i++);
483 }
484 }
485
486 pg.done();
487 pg.setName(null);
488
489 // Check quotes for "bad" format
490 List<Paragraph> newParas = new ArrayList<Paragraph>();
491 for (Paragraph para : paras) {
492 newParas.addAll(requotify(para, html));
493 }
494 paras = newParas;
495
496 // Remove double blanks/brks
497 fixBlanksBreaks(paras);
498 }
499
500 return paras;
501 }
502
503 /**
504 * Convert the given line into a single {@link Paragraph}.
505 *
506 * @param support
507 * the linked {@link BasicSupport} (can be NULL), used to
508 * download optional image content in []
509 * @param source
510 * the source URL of the story (for image lookup in the same path
511 * if the source is a file, can be NULL)
512 * @param line
513 * the textual content of the paragraph
514 * @param html
515 * TRUE if the input content is in HTML mode
516 *
517 * @return the {@link Paragraph}, never NULL
518 */
519 protected Paragraph makeParagraph(BasicSupport support, URL source,
520 String line, boolean html) {
521 Image image = null;
522 if (line.startsWith("[") && line.endsWith("]")) {
523 image = bsHelper.getImage(support, source, line
524 .substring(1, line.length() - 1).trim());
525 }
526
527 if (image != null) {
528 return new Paragraph(image);
529 }
530
531 return processPara(line, html);
532 }
533
534 /**
535 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
536 * those {@link Paragraph}s.
537 * <p>
538 * The resulting list will not contain a starting or trailing blank/break
539 * nor 2 blanks or breaks following each other.
540 *
541 * @param paras
542 * the list of {@link Paragraph}s to fix
543 */
544 protected void fixBlanksBreaks(List<Paragraph> paras) {
545 boolean space = false;
546 boolean brk = true;
547 for (int i = 0; i < paras.size(); i++) {
548 Paragraph para = paras.get(i);
549 boolean thisSpace = para.getType() == ParagraphType.BLANK;
550 boolean thisBrk = para.getType() == ParagraphType.BREAK;
551
552 if (i > 0 && space && thisBrk) {
553 paras.remove(i - 1);
554 i--;
555 } else if ((space || brk) && (thisSpace || thisBrk)) {
556 paras.remove(i);
557 i--;
558 }
559
560 space = thisSpace;
561 brk = thisBrk;
562 }
563
564 // Remove blank/brk at start
565 if (paras.size() > 0
566 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
567 0).getType() == ParagraphType.BREAK)) {
568 paras.remove(0);
569 }
570
571 // Remove blank/brk at end
572 int last = paras.size() - 1;
573 if (paras.size() > 0
574 && (paras.get(last).getType() == ParagraphType.BLANK || paras
575 .get(last).getType() == ParagraphType.BREAK)) {
576 paras.remove(last);
577 }
578 }
579 }