Merge branch 'master' into subtree
[nikiroo-utils.git] / supported / BasicSupportPara.java
CommitLineData
0ffa4754
NR
1package be.nikiroo.fanfix.supported;
2
3import java.io.BufferedReader;
4import java.io.ByteArrayInputStream;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.net.URL;
8import java.util.ArrayList;
9import java.util.List;
10
11import be.nikiroo.fanfix.Instance;
12import be.nikiroo.fanfix.bundles.Config;
13import be.nikiroo.fanfix.bundles.StringId;
14import be.nikiroo.fanfix.data.Chapter;
15import be.nikiroo.fanfix.data.Paragraph;
16import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17import be.nikiroo.utils.Image;
18import be.nikiroo.utils.Progress;
19import be.nikiroo.utils.StringUtils;
20
21/**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
8d59ce07 23 * and text formatting for the {@link BasicSupport} class.
0ffa4754
NR
24 *
25 * @author niki
26 */
8d59ce07 27public class BasicSupportPara {
0ffa4754 28 // quote chars
d66deb8d
NR
29 private static char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE);
30 private static char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE);
31 private static char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE);
32 private static char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE);
0ffa4754 33
8d59ce07
NR
34 // used by this class:
35 BasicSupportHelper bsHelper;
36 BasicSupportImages bsImages;
37
e992c260
NR
38 /**
39 * Create a new {@link BasicSupportPara}.
40 * <p>
41 * Note that you need an instance of both {@link BasicSupportHelper} and
42 * {@link BasicSupportImages} for it to work.
43 *
44 * @param bsHelper
45 * the required {@link BasicSupportHelper}
46 * @param bsImages
47 * the required {@link BasicSupportImages}
48 */
8d59ce07
NR
49 public BasicSupportPara(BasicSupportHelper bsHelper, BasicSupportImages bsImages) {
50 this.bsHelper = bsHelper;
51 this.bsImages = bsImages;
52 }
53
0ffa4754
NR
54 /**
55 * Create a {@link Chapter} object from the given information, formatting
56 * the content as it should be.
57 *
58 * @param support
e992c260 59 * the linked {@link BasicSupport} (can be NULL)
0ffa4754 60 * @param source
4642806a
NR
61 * the source of the story (for image lookup in the same path if
62 * the source is a file, can be NULL)
0ffa4754
NR
63 * @param number
64 * the chapter number
65 * @param name
66 * the chapter name
67 * @param content
68 * the chapter content
69 * @param pg
70 * the optional progress reporter
71 * @param html
72 * TRUE if the input content is in HTML mode
73 *
75a6a3ea 74 * @return the {@link Chapter}, never NULL
0ffa4754
NR
75 *
76 * @throws IOException
77 * in case of I/O error
78 */
8d59ce07 79 public Chapter makeChapter(BasicSupport support, URL source,
0ffa4754
NR
80 int number, String name, String content, boolean html, Progress pg)
81 throws IOException {
82 // Chapter name: process it correctly, then remove the possible
83 // redundant "Chapter x: " in front of it, or "-" (as in
84 // "Chapter 5: - Fun!" after the ": " was automatically added)
8d59ce07 85 String chapterName = processPara(name, false)
0ffa4754 86 .getContent().trim();
d66deb8d
NR
87 for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) {
88 String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang);
0ffa4754
NR
89 if (chapterName.startsWith(chapterWord)) {
90 chapterName = chapterName.substring(chapterWord.length())
91 .trim();
92 break;
93 }
94 }
95
96 if (chapterName.startsWith(Integer.toString(number))) {
97 chapterName = chapterName.substring(
98 Integer.toString(number).length()).trim();
99 }
100
101 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
102 chapterName = chapterName.substring(1).trim();
103 }
104 //
105
106 Chapter chap = new Chapter(number, chapterName);
107
108 if (content != null) {
109 List<Paragraph> paras = makeParagraphs(support, source, content,
110 html, pg);
111 long words = 0;
112 for (Paragraph para : paras) {
113 words += para.getWords();
114 }
115 chap.setParagraphs(paras);
116 chap.setWords(words);
117 }
118
119 return chap;
120 }
121
122 /**
123 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
124 * and requotify them (i.e., separate them into QUOTE paragraphs and other
125 * paragraphs (quotes or not)).
126 *
127 * @param para
128 * the paragraph to requotify (not necessarily a quote)
129 * @param html
130 * TRUE if the input content is in HTML mode
131 *
132 * @return the correctly (or so we hope) quotified paragraphs
133 */
8d59ce07 134 protected List<Paragraph> requotify(Paragraph para, boolean html) {
0ffa4754
NR
135 List<Paragraph> newParas = new ArrayList<Paragraph>();
136
137 if (para.getType() == ParagraphType.QUOTE
138 && para.getContent().length() > 2) {
139 String line = para.getContent();
140 boolean singleQ = line.startsWith("" + openQuote);
141 boolean doubleQ = line.startsWith("" + openDoubleQuote);
142
143 // Do not try when more than one quote at a time
144 // (some stories are not easily readable if we do)
145 if (singleQ
146 && line.indexOf(closeQuote, 1) < line
147 .lastIndexOf(closeQuote)) {
148 newParas.add(para);
149 return newParas;
150 }
151 if (doubleQ
152 && line.indexOf(closeDoubleQuote, 1) < line
153 .lastIndexOf(closeDoubleQuote)) {
154 newParas.add(para);
155 return newParas;
156 }
157 //
158
159 if (!singleQ && !doubleQ) {
160 line = openDoubleQuote + line + closeDoubleQuote;
161 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
162 .getWords()));
163 } else {
164 char open = singleQ ? openQuote : openDoubleQuote;
165 char close = singleQ ? closeQuote : closeDoubleQuote;
166
167 int posDot = -1;
168 boolean inQuote = false;
169 int i = 0;
170 for (char car : line.toCharArray()) {
171 if (car == open) {
172 inQuote = true;
173 } else if (car == close) {
174 inQuote = false;
175 } else if (car == '.' && !inQuote) {
176 posDot = i;
177 break;
178 }
179 i++;
180 }
181
182 if (posDot >= 0) {
183 String rest = line.substring(posDot + 1).trim();
184 line = line.substring(0, posDot + 1).trim();
185 long words = 1;
186 for (char car : line.toCharArray()) {
187 if (car == ' ') {
188 words++;
189 }
190 }
191 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
192 if (!rest.isEmpty()) {
193 newParas.addAll(requotify(processPara(rest, html), html));
194 }
195 } else {
196 newParas.add(para);
197 }
198 }
199 } else {
200 newParas.add(para);
201 }
202
203 return newParas;
204 }
205
206 /**
207 * Process a {@link Paragraph} from a raw line of text.
208 * <p>
209 * Will also fix quotes and HTML encoding if needed.
210 *
211 * @param line
212 * the raw line
213 * @param html
214 * TRUE if the input content is in HTML mode
215 *
75a6a3ea 216 * @return the processed {@link Paragraph}, never NULL
0ffa4754 217 */
8d59ce07 218 protected Paragraph processPara(String line, boolean html) {
0ffa4754
NR
219 if (html) {
220 line = StringUtils.unhtml(line).trim();
221 }
222 boolean space = true;
223 boolean brk = true;
224 boolean quote = false;
225 boolean tentativeCloseQuote = false;
226 char prev = '\0';
227 int dashCount = 0;
228 long words = 1;
229
230 StringBuilder builder = new StringBuilder();
231 for (char car : line.toCharArray()) {
232 if (car != '-') {
233 if (dashCount > 0) {
234 // dash, ndash and mdash: - – —
235 // currently: always use mdash
236 builder.append(dashCount == 1 ? '-' : '—');
237 }
238 dashCount = 0;
239 }
240
241 if (tentativeCloseQuote) {
242 tentativeCloseQuote = false;
243 if (Character.isLetterOrDigit(car)) {
244 builder.append("'");
245 } else {
246 // handle double-single quotes as double quotes
247 if (prev == car) {
248 builder.append(closeDoubleQuote);
249 continue;
250 }
251
252 builder.append(closeQuote);
253 }
254 }
255
256 switch (car) {
257 case ' ': // note: unbreakable space
258 case ' ':
259 case '\t':
260 case '\n': // just in case
261 case '\r': // just in case
262 if (builder.length() > 0
263 && builder.charAt(builder.length() - 1) != ' ') {
264 words++;
265 }
266 builder.append(' ');
267 break;
268
269 case '\'':
270 if (space || (brk && quote)) {
271 quote = true;
272 // handle double-single quotes as double quotes
273 if (prev == car) {
274 builder.deleteCharAt(builder.length() - 1);
275 builder.append(openDoubleQuote);
276 } else {
277 builder.append(openQuote);
278 }
279 } else if (prev == ' ' || prev == car) {
280 // handle double-single quotes as double quotes
281 if (prev == car) {
282 builder.deleteCharAt(builder.length() - 1);
283 builder.append(openDoubleQuote);
284 } else {
285 builder.append(openQuote);
286 }
287 } else {
288 // it is a quote ("I'm off") or a 'quote' ("This
289 // 'good' restaurant"...)
290 tentativeCloseQuote = true;
291 }
292 break;
293
294 case '"':
295 if (space || (brk && quote)) {
296 quote = true;
297 builder.append(openDoubleQuote);
298 } else if (prev == ' ') {
299 builder.append(openDoubleQuote);
300 } else {
301 builder.append(closeDoubleQuote);
302 }
303 break;
304
305 case '-':
306 if (space) {
307 quote = true;
308 } else {
309 dashCount++;
310 }
311 space = false;
312 break;
313
314 case '*':
315 case '~':
316 case '/':
317 case '\\':
318 case '<':
319 case '>':
320 case '=':
321 case '+':
322 case '_':
323 case '–':
324 case '—':
325 space = false;
326 builder.append(car);
327 break;
328
329 case '‘':
330 case '`':
331 case '‹':
332 case 'īš':
333 case '〈':
334 case '「':
335 if (space || (brk && quote)) {
336 quote = true;
337 builder.append(openQuote);
338 } else {
339 // handle double-single quotes as double quotes
340 if (prev == car) {
341 builder.deleteCharAt(builder.length() - 1);
342 builder.append(openDoubleQuote);
343 } else {
344 builder.append(openQuote);
345 }
346 }
347 space = false;
348 brk = false;
349 break;
350
351 case '’':
352 case 'â€ē':
353 case 'īš‚':
354 case '〉':
355 case '」':
356 space = false;
357 brk = false;
358 // handle double-single quotes as double quotes
359 if (prev == car) {
360 builder.deleteCharAt(builder.length() - 1);
361 builder.append(closeDoubleQuote);
362 } else {
363 builder.append(closeQuote);
364 }
365 break;
366
367 case 'ÂĢ':
368 case '“':
369 case 'īšƒ':
370 case '《':
371 case '『':
372 if (space || (brk && quote)) {
373 quote = true;
374 builder.append(openDoubleQuote);
375 } else {
376 builder.append(openDoubleQuote);
377 }
378 space = false;
379 brk = false;
380 break;
381
382 case 'Âģ':
383 case '”':
384 case 'īš„':
385 case '》':
386 case '』':
387 space = false;
388 brk = false;
389 builder.append(closeDoubleQuote);
390 break;
391
392 default:
393 space = false;
394 brk = false;
395 builder.append(car);
396 break;
397 }
398
399 prev = car;
400 }
401
402 if (tentativeCloseQuote) {
403 tentativeCloseQuote = false;
404 builder.append(closeQuote);
405 }
406
407 line = builder.toString().trim();
408
409 ParagraphType type = ParagraphType.NORMAL;
410 if (space) {
411 type = ParagraphType.BLANK;
412 } else if (brk) {
413 type = ParagraphType.BREAK;
414 } else if (quote) {
415 type = ParagraphType.QUOTE;
416 }
417
418 return new Paragraph(type, line, words);
419 }
420
421 /**
422 * Convert the given content into {@link Paragraph}s.
423 *
424 * @param support
4642806a
NR
425 * the linked {@link BasicSupport} (can be NULL), used to
426 * download optional image content in []
0ffa4754 427 * @param source
4642806a
NR
428 * the source URL of the story (for image lookup in the same path
429 * if the source is a file, can be NULL)
0ffa4754
NR
430 * @param content
431 * the textual content
432 * @param html
433 * TRUE if the input content is in HTML mode
434 * @param pg
435 * the optional progress reporter
436 *
75a6a3ea 437 * @return the {@link Paragraph}s (can be empty but never NULL)
0ffa4754
NR
438 *
439 * @throws IOException
440 * in case of I/O error
441 */
8d59ce07 442 protected List<Paragraph> makeParagraphs(BasicSupport support,
0ffa4754
NR
443 URL source, String content, boolean html, Progress pg)
444 throws IOException {
445 if (pg == null) {
446 pg = new Progress();
447 }
448
449 if (html) {
450 // Special <HR> processing:
451 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
452 "<br/>* * *<br/>");
453 }
454
455 List<Paragraph> paras = new ArrayList<Paragraph>();
456
457 if (content != null && !content.trim().isEmpty()) {
458 if (html) {
459 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
460 pg.setMinMax(0, tab.length);
461 int i = 1;
462 for (String line : tab) {
463 if (line.startsWith("[") && line.endsWith("]")) {
464 pg.setName("Extracting image " + i);
465 }
466 paras.add(makeParagraph(support, source, line.trim(), html));
467 pg.setProgress(i++);
468 }
0ffa4754
NR
469 } else {
470 List<String> lines = new ArrayList<String>();
471 BufferedReader buff = null;
472 try {
473 buff = new BufferedReader(
474 new InputStreamReader(new ByteArrayInputStream(
475 content.getBytes("UTF-8")), "UTF-8"));
476 for (String line = buff.readLine(); line != null; line = buff
477 .readLine()) {
478 lines.add(line.trim());
479 }
480 } finally {
481 if (buff != null) {
482 buff.close();
483 }
484 }
485
486 pg.setMinMax(0, lines.size());
487 int i = 0;
488 for (String line : lines) {
489 if (line.startsWith("[") && line.endsWith("]")) {
490 pg.setName("Extracting image " + i);
491 }
492 paras.add(makeParagraph(support, source, line, html));
493 pg.setProgress(i++);
494 }
0ffa4754
NR
495 }
496
68328e17
NR
497 pg.done();
498 pg.setName(null);
499
0ffa4754
NR
500 // Check quotes for "bad" format
501 List<Paragraph> newParas = new ArrayList<Paragraph>();
502 for (Paragraph para : paras) {
8d59ce07 503 newParas.addAll(requotify(para, html));
0ffa4754
NR
504 }
505 paras = newParas;
506
507 // Remove double blanks/brks
508 fixBlanksBreaks(paras);
509 }
510
511 return paras;
512 }
513
514 /**
515 * Convert the given line into a single {@link Paragraph}.
516 *
517 * @param support
4642806a
NR
518 * the linked {@link BasicSupport} (can be NULL), used to
519 * download optional image content in []
0ffa4754 520 * @param source
4642806a
NR
521 * the source URL of the story (for image lookup in the same path
522 * if the source is a file, can be NULL)
0ffa4754
NR
523 * @param line
524 * the textual content of the paragraph
525 * @param html
526 * TRUE if the input content is in HTML mode
527 *
75a6a3ea 528 * @return the {@link Paragraph}, never NULL
0ffa4754 529 */
8d59ce07 530 protected Paragraph makeParagraph(BasicSupport support, URL source,
0ffa4754
NR
531 String line, boolean html) {
532 Image image = null;
533 if (line.startsWith("[") && line.endsWith("]")) {
8d59ce07 534 image = bsHelper.getImage(support, source, line
0ffa4754
NR
535 .substring(1, line.length() - 1).trim());
536 }
537
538 if (image != null) {
539 return new Paragraph(image);
540 }
541
8d59ce07 542 return processPara(line, html);
0ffa4754
NR
543 }
544
545 /**
546 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
547 * those {@link Paragraph}s.
548 * <p>
549 * The resulting list will not contain a starting or trailing blank/break
550 * nor 2 blanks or breaks following each other.
551 *
552 * @param paras
553 * the list of {@link Paragraph}s to fix
554 */
8d59ce07 555 protected void fixBlanksBreaks(List<Paragraph> paras) {
0ffa4754
NR
556 boolean space = false;
557 boolean brk = true;
558 for (int i = 0; i < paras.size(); i++) {
559 Paragraph para = paras.get(i);
560 boolean thisSpace = para.getType() == ParagraphType.BLANK;
561 boolean thisBrk = para.getType() == ParagraphType.BREAK;
562
563 if (i > 0 && space && thisBrk) {
564 paras.remove(i - 1);
565 i--;
566 } else if ((space || brk) && (thisSpace || thisBrk)) {
567 paras.remove(i);
568 i--;
569 }
570
571 space = thisSpace;
572 brk = thisBrk;
573 }
574
575 // Remove blank/brk at start
576 if (paras.size() > 0
577 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
578 0).getType() == ParagraphType.BREAK)) {
579 paras.remove(0);
580 }
581
582 // Remove blank/brk at end
583 int last = paras.size() - 1;
584 if (paras.size() > 0
585 && (paras.get(last).getType() == ParagraphType.BLANK || paras
586 .get(last).getType() == ParagraphType.BREAK)) {
587 paras.remove(last);
588 }
589 }
590}