doc
[nikiroo-utils.git] / supported / BasicSupportPara.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.URL;
8 import java.util.ArrayList;
9 import java.util.List;
10
11 import be.nikiroo.fanfix.Instance;
12 import be.nikiroo.fanfix.bundles.Config;
13 import be.nikiroo.fanfix.bundles.StringId;
14 import be.nikiroo.fanfix.data.Chapter;
15 import be.nikiroo.fanfix.data.Paragraph;
16 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17 import be.nikiroo.utils.Image;
18 import be.nikiroo.utils.Progress;
19 import be.nikiroo.utils.StringUtils;
20
21 /**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
23 * and text formatting for the {@link BasicSupport} class.
24 *
25 * @author niki
26 */
27 public class BasicSupportPara {
28 // quote chars
29 private static char openQuote = Instance.getTrans().getCharacter(
30 StringId.OPEN_SINGLE_QUOTE);
31 private static char closeQuote = Instance.getTrans().getCharacter(
32 StringId.CLOSE_SINGLE_QUOTE);
33 private static char openDoubleQuote = Instance.getTrans().getCharacter(
34 StringId.OPEN_DOUBLE_QUOTE);
35 private static char closeDoubleQuote = Instance.getTrans().getCharacter(
36 StringId.CLOSE_DOUBLE_QUOTE);
37
38 // used by this class:
39 BasicSupportHelper bsHelper;
40 BasicSupportImages bsImages;
41
/**
 * Create a new paragraph helper working with the given collaborators.
 *
 * @param bsHelper
 *            the helper used to look up the images referenced by
 *            "[...]" lines
 * @param bsImages
 *            the images helper to keep alongside this instance
 */
public BasicSupportPara(BasicSupportHelper bsHelper, BasicSupportImages bsImages) {
    this.bsImages = bsImages;
    this.bsHelper = bsHelper;
}
46
47 /**
48 * Create a {@link Chapter} object from the given information, formatting
49 * the content as it should be.
50 *
51 * @param support
52 * the linked {@link BasicSupport}
53 * @param source
54 * the source of the story (for image lookup in the same path if
55 * the source is a file, can be NULL)
56 * @param number
57 * the chapter number
58 * @param name
59 * the chapter name
60 * @param content
61 * the chapter content
62 * @param pg
63 * the optional progress reporter
64 * @param html
65 * TRUE if the input content is in HTML mode
66 *
67 * @return the {@link Chapter}
68 *
69 * @throws IOException
70 * in case of I/O error
71 */
72 public Chapter makeChapter(BasicSupport support, URL source,
73 int number, String name, String content, boolean html, Progress pg)
74 throws IOException {
75 // Chapter name: process it correctly, then remove the possible
76 // redundant "Chapter x: " in front of it, or "-" (as in
77 // "Chapter 5: - Fun!" after the ": " was automatically added)
78 String chapterName = processPara(name, false)
79 .getContent().trim();
80 for (String lang : Instance.getConfig().getList(Config.CONF_CHAPTER)) {
81 String chapterWord = Instance.getConfig().getStringX(
82 Config.CONF_CHAPTER, lang);
83 if (chapterName.startsWith(chapterWord)) {
84 chapterName = chapterName.substring(chapterWord.length())
85 .trim();
86 break;
87 }
88 }
89
90 if (chapterName.startsWith(Integer.toString(number))) {
91 chapterName = chapterName.substring(
92 Integer.toString(number).length()).trim();
93 }
94
95 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
96 chapterName = chapterName.substring(1).trim();
97 }
98 //
99
100 Chapter chap = new Chapter(number, chapterName);
101
102 if (content != null) {
103 List<Paragraph> paras = makeParagraphs(support, source, content,
104 html, pg);
105 long words = 0;
106 for (Paragraph para : paras) {
107 words += para.getWords();
108 }
109 chap.setParagraphs(paras);
110 chap.setWords(words);
111 }
112
113 return chap;
114 }
115
116 /**
117 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
118 * and requotify them (i.e., separate them into QUOTE paragraphs and other
119 * paragraphs (quotes or not)).
120 *
121 * @param para
122 * the paragraph to requotify (not necessarily a quote)
123 * @param html
124 * TRUE if the input content is in HTML mode
125 *
126 * @return the correctly (or so we hope) quotified paragraphs
127 */
128 protected List<Paragraph> requotify(Paragraph para, boolean html) {
129 List<Paragraph> newParas = new ArrayList<Paragraph>();
130
131 if (para.getType() == ParagraphType.QUOTE
132 && para.getContent().length() > 2) {
133 String line = para.getContent();
134 boolean singleQ = line.startsWith("" + openQuote);
135 boolean doubleQ = line.startsWith("" + openDoubleQuote);
136
137 // Do not try when more than one quote at a time
138 // (some stories are not easily readable if we do)
139 if (singleQ
140 && line.indexOf(closeQuote, 1) < line
141 .lastIndexOf(closeQuote)) {
142 newParas.add(para);
143 return newParas;
144 }
145 if (doubleQ
146 && line.indexOf(closeDoubleQuote, 1) < line
147 .lastIndexOf(closeDoubleQuote)) {
148 newParas.add(para);
149 return newParas;
150 }
151 //
152
153 if (!singleQ && !doubleQ) {
154 line = openDoubleQuote + line + closeDoubleQuote;
155 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
156 .getWords()));
157 } else {
158 char open = singleQ ? openQuote : openDoubleQuote;
159 char close = singleQ ? closeQuote : closeDoubleQuote;
160
161 int posDot = -1;
162 boolean inQuote = false;
163 int i = 0;
164 for (char car : line.toCharArray()) {
165 if (car == open) {
166 inQuote = true;
167 } else if (car == close) {
168 inQuote = false;
169 } else if (car == '.' && !inQuote) {
170 posDot = i;
171 break;
172 }
173 i++;
174 }
175
176 if (posDot >= 0) {
177 String rest = line.substring(posDot + 1).trim();
178 line = line.substring(0, posDot + 1).trim();
179 long words = 1;
180 for (char car : line.toCharArray()) {
181 if (car == ' ') {
182 words++;
183 }
184 }
185 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
186 if (!rest.isEmpty()) {
187 newParas.addAll(requotify(processPara(rest, html), html));
188 }
189 } else {
190 newParas.add(para);
191 }
192 }
193 } else {
194 newParas.add(para);
195 }
196
197 return newParas;
198 }
199
200 /**
201 * Process a {@link Paragraph} from a raw line of text.
202 * <p>
203 * Will also fix quotes and HTML encoding if needed.
204 *
205 * @param line
206 * the raw line
207 * @param html
208 * TRUE if the input content is in HTML mode
209 *
210 * @return the processed {@link Paragraph}
211 */
212 protected Paragraph processPara(String line, boolean html) {
213 if (html) {
214 line = StringUtils.unhtml(line).trim();
215 }
216 boolean space = true;
217 boolean brk = true;
218 boolean quote = false;
219 boolean tentativeCloseQuote = false;
220 char prev = '\0';
221 int dashCount = 0;
222 long words = 1;
223
224 StringBuilder builder = new StringBuilder();
225 for (char car : line.toCharArray()) {
226 if (car != '-') {
227 if (dashCount > 0) {
228 // dash, ndash and mdash: - – —
229 // currently: always use mdash
230 builder.append(dashCount == 1 ? '-' : '—');
231 }
232 dashCount = 0;
233 }
234
235 if (tentativeCloseQuote) {
236 tentativeCloseQuote = false;
237 if (Character.isLetterOrDigit(car)) {
238 builder.append("'");
239 } else {
240 // handle double-single quotes as double quotes
241 if (prev == car) {
242 builder.append(closeDoubleQuote);
243 continue;
244 }
245
246 builder.append(closeQuote);
247 }
248 }
249
250 switch (car) {
251 case ' ': // note: unbreakable space
252 case ' ':
253 case '\t':
254 case '\n': // just in case
255 case '\r': // just in case
256 if (builder.length() > 0
257 && builder.charAt(builder.length() - 1) != ' ') {
258 words++;
259 }
260 builder.append(' ');
261 break;
262
263 case '\'':
264 if (space || (brk && quote)) {
265 quote = true;
266 // handle double-single quotes as double quotes
267 if (prev == car) {
268 builder.deleteCharAt(builder.length() - 1);
269 builder.append(openDoubleQuote);
270 } else {
271 builder.append(openQuote);
272 }
273 } else if (prev == ' ' || prev == car) {
274 // handle double-single quotes as double quotes
275 if (prev == car) {
276 builder.deleteCharAt(builder.length() - 1);
277 builder.append(openDoubleQuote);
278 } else {
279 builder.append(openQuote);
280 }
281 } else {
282 // it is a quote ("I'm off") or a 'quote' ("This
283 // 'good' restaurant"...)
284 tentativeCloseQuote = true;
285 }
286 break;
287
288 case '"':
289 if (space || (brk && quote)) {
290 quote = true;
291 builder.append(openDoubleQuote);
292 } else if (prev == ' ') {
293 builder.append(openDoubleQuote);
294 } else {
295 builder.append(closeDoubleQuote);
296 }
297 break;
298
299 case '-':
300 if (space) {
301 quote = true;
302 } else {
303 dashCount++;
304 }
305 space = false;
306 break;
307
308 case '*':
309 case '~':
310 case '/':
311 case '\\':
312 case '<':
313 case '>':
314 case '=':
315 case '+':
316 case '_':
317 case '–':
318 case '—':
319 space = false;
320 builder.append(car);
321 break;
322
323 case '‘':
324 case '`':
325 case '‹':
326 case 'īš':
327 case '〈':
328 case '「':
329 if (space || (brk && quote)) {
330 quote = true;
331 builder.append(openQuote);
332 } else {
333 // handle double-single quotes as double quotes
334 if (prev == car) {
335 builder.deleteCharAt(builder.length() - 1);
336 builder.append(openDoubleQuote);
337 } else {
338 builder.append(openQuote);
339 }
340 }
341 space = false;
342 brk = false;
343 break;
344
345 case '’':
346 case 'â€ē':
347 case 'īš‚':
348 case '〉':
349 case '」':
350 space = false;
351 brk = false;
352 // handle double-single quotes as double quotes
353 if (prev == car) {
354 builder.deleteCharAt(builder.length() - 1);
355 builder.append(closeDoubleQuote);
356 } else {
357 builder.append(closeQuote);
358 }
359 break;
360
361 case 'ÂĢ':
362 case '“':
363 case 'īšƒ':
364 case '《':
365 case '『':
366 if (space || (brk && quote)) {
367 quote = true;
368 builder.append(openDoubleQuote);
369 } else {
370 builder.append(openDoubleQuote);
371 }
372 space = false;
373 brk = false;
374 break;
375
376 case 'Âģ':
377 case '”':
378 case 'īš„':
379 case '》':
380 case '』':
381 space = false;
382 brk = false;
383 builder.append(closeDoubleQuote);
384 break;
385
386 default:
387 space = false;
388 brk = false;
389 builder.append(car);
390 break;
391 }
392
393 prev = car;
394 }
395
396 if (tentativeCloseQuote) {
397 tentativeCloseQuote = false;
398 builder.append(closeQuote);
399 }
400
401 line = builder.toString().trim();
402
403 ParagraphType type = ParagraphType.NORMAL;
404 if (space) {
405 type = ParagraphType.BLANK;
406 } else if (brk) {
407 type = ParagraphType.BREAK;
408 } else if (quote) {
409 type = ParagraphType.QUOTE;
410 }
411
412 return new Paragraph(type, line, words);
413 }
414
415 /**
416 * Convert the given content into {@link Paragraph}s.
417 *
418 * @param support
419 * the linked {@link BasicSupport} (can be NULL), used to
420 * download optional image content in []
421 * @param source
422 * the source URL of the story (for image lookup in the same path
423 * if the source is a file, can be NULL)
424 * @param content
425 * the textual content
426 * @param html
427 * TRUE if the input content is in HTML mode
428 * @param pg
429 * the optional progress reporter
430 *
431 * @return the {@link Paragraph}s
432 *
433 * @throws IOException
434 * in case of I/O error
435 */
436 protected List<Paragraph> makeParagraphs(BasicSupport support,
437 URL source, String content, boolean html, Progress pg)
438 throws IOException {
439 if (pg == null) {
440 pg = new Progress();
441 }
442
443 if (html) {
444 // Special <HR> processing:
445 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
446 "<br/>* * *<br/>");
447 }
448
449 List<Paragraph> paras = new ArrayList<Paragraph>();
450
451 if (content != null && !content.trim().isEmpty()) {
452 if (html) {
453 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
454 pg.setMinMax(0, tab.length);
455 int i = 1;
456 for (String line : tab) {
457 if (line.startsWith("[") && line.endsWith("]")) {
458 pg.setName("Extracting image " + i);
459 }
460 paras.add(makeParagraph(support, source, line.trim(), html));
461 pg.setProgress(i++);
462 }
463 } else {
464 List<String> lines = new ArrayList<String>();
465 BufferedReader buff = null;
466 try {
467 buff = new BufferedReader(
468 new InputStreamReader(new ByteArrayInputStream(
469 content.getBytes("UTF-8")), "UTF-8"));
470 for (String line = buff.readLine(); line != null; line = buff
471 .readLine()) {
472 lines.add(line.trim());
473 }
474 } finally {
475 if (buff != null) {
476 buff.close();
477 }
478 }
479
480 pg.setMinMax(0, lines.size());
481 int i = 0;
482 for (String line : lines) {
483 if (line.startsWith("[") && line.endsWith("]")) {
484 pg.setName("Extracting image " + i);
485 }
486 paras.add(makeParagraph(support, source, line, html));
487 pg.setProgress(i++);
488 }
489 }
490
491 pg.done();
492 pg.setName(null);
493
494 // Check quotes for "bad" format
495 List<Paragraph> newParas = new ArrayList<Paragraph>();
496 for (Paragraph para : paras) {
497 newParas.addAll(requotify(para, html));
498 }
499 paras = newParas;
500
501 // Remove double blanks/brks
502 fixBlanksBreaks(paras);
503 }
504
505 return paras;
506 }
507
508 /**
509 * Convert the given line into a single {@link Paragraph}.
510 *
511 * @param support
512 * the linked {@link BasicSupport} (can be NULL), used to
513 * download optional image content in []
514 * @param source
515 * the source URL of the story (for image lookup in the same path
516 * if the source is a file, can be NULL)
517 * @param line
518 * the textual content of the paragraph
519 * @param html
520 * TRUE if the input content is in HTML mode
521 *
522 * @return the {@link Paragraph}
523 */
524 protected Paragraph makeParagraph(BasicSupport support, URL source,
525 String line, boolean html) {
526 Image image = null;
527 if (line.startsWith("[") && line.endsWith("]")) {
528 image = bsHelper.getImage(support, source, line
529 .substring(1, line.length() - 1).trim());
530 }
531
532 if (image != null) {
533 return new Paragraph(image);
534 }
535
536 return processPara(line, html);
537 }
538
539 /**
540 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
541 * those {@link Paragraph}s.
542 * <p>
543 * The resulting list will not contain a starting or trailing blank/break
544 * nor 2 blanks or breaks following each other.
545 *
546 * @param paras
547 * the list of {@link Paragraph}s to fix
548 */
549 protected void fixBlanksBreaks(List<Paragraph> paras) {
550 boolean space = false;
551 boolean brk = true;
552 for (int i = 0; i < paras.size(); i++) {
553 Paragraph para = paras.get(i);
554 boolean thisSpace = para.getType() == ParagraphType.BLANK;
555 boolean thisBrk = para.getType() == ParagraphType.BREAK;
556
557 if (i > 0 && space && thisBrk) {
558 paras.remove(i - 1);
559 i--;
560 } else if ((space || brk) && (thisSpace || thisBrk)) {
561 paras.remove(i);
562 i--;
563 }
564
565 space = thisSpace;
566 brk = thisBrk;
567 }
568
569 // Remove blank/brk at start
570 if (paras.size() > 0
571 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
572 0).getType() == ParagraphType.BREAK)) {
573 paras.remove(0);
574 }
575
576 // Remove blank/brk at end
577 int last = paras.size() - 1;
578 if (paras.size() > 0
579 && (paras.get(last).getType() == ParagraphType.BLANK || paras
580 .get(last).getType() == ParagraphType.BREAK)) {
581 paras.remove(last);
582 }
583 }
584 }