tests: fix NPE, add BasicSupportUtilities tests
[nikiroo-utils.git] / src / be / nikiroo / fanfix / supported / BasicSupportPara.java
CommitLineData
0ffa4754
NR
1package be.nikiroo.fanfix.supported;
2
3import java.io.BufferedReader;
4import java.io.ByteArrayInputStream;
5import java.io.IOException;
6import java.io.InputStreamReader;
7import java.net.URL;
8import java.util.ArrayList;
9import java.util.List;
10
11import be.nikiroo.fanfix.Instance;
12import be.nikiroo.fanfix.bundles.Config;
13import be.nikiroo.fanfix.bundles.StringId;
14import be.nikiroo.fanfix.data.Chapter;
15import be.nikiroo.fanfix.data.Paragraph;
16import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17import be.nikiroo.utils.Image;
18import be.nikiroo.utils.Progress;
19import be.nikiroo.utils.StringUtils;
20
21/**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
8d59ce07 23 * and text formatting for the {@link BasicSupport} class.
0ffa4754
NR
24 *
25 * @author niki
26 */
8d59ce07 27public class BasicSupportPara {
0ffa4754
NR
28 // quote chars
29 private static char openQuote = Instance.getTrans().getCharacter(
30 StringId.OPEN_SINGLE_QUOTE);
31 private static char closeQuote = Instance.getTrans().getCharacter(
32 StringId.CLOSE_SINGLE_QUOTE);
33 private static char openDoubleQuote = Instance.getTrans().getCharacter(
34 StringId.OPEN_DOUBLE_QUOTE);
35 private static char closeDoubleQuote = Instance.getTrans().getCharacter(
36 StringId.CLOSE_DOUBLE_QUOTE);
37
8d59ce07
NR
38 // used by this class:
39 BasicSupportHelper bsHelper;
40 BasicSupportImages bsImages;
41
42 public BasicSupportPara(BasicSupportHelper bsHelper, BasicSupportImages bsImages) {
43 this.bsHelper = bsHelper;
44 this.bsImages = bsImages;
45 }
46
0ffa4754
NR
47 /**
48 * Create a {@link Chapter} object from the given information, formatting
49 * the content as it should be.
50 *
51 * @param support
52 * the linked {@link BasicSupport}
53 * @param source
54 * the source of the story
55 * @param number
56 * the chapter number
57 * @param name
58 * the chapter name
59 * @param content
60 * the chapter content
61 * @param pg
62 * the optional progress reporter
63 * @param html
64 * TRUE if the input content is in HTML mode
65 *
66 * @return the {@link Chapter}
67 *
68 * @throws IOException
69 * in case of I/O error
70 */
8d59ce07 71 public Chapter makeChapter(BasicSupport support, URL source,
0ffa4754
NR
72 int number, String name, String content, boolean html, Progress pg)
73 throws IOException {
74 // Chapter name: process it correctly, then remove the possible
75 // redundant "Chapter x: " in front of it, or "-" (as in
76 // "Chapter 5: - Fun!" after the ": " was automatically added)
8d59ce07 77 String chapterName = processPara(name, false)
0ffa4754 78 .getContent().trim();
13fdb89a 79 for (String lang : Instance.getConfig().getList(Config.CONF_CHAPTER)) {
0ffa4754 80 String chapterWord = Instance.getConfig().getStringX(
13fdb89a 81 Config.CONF_CHAPTER, lang);
0ffa4754
NR
82 if (chapterName.startsWith(chapterWord)) {
83 chapterName = chapterName.substring(chapterWord.length())
84 .trim();
85 break;
86 }
87 }
88
89 if (chapterName.startsWith(Integer.toString(number))) {
90 chapterName = chapterName.substring(
91 Integer.toString(number).length()).trim();
92 }
93
94 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
95 chapterName = chapterName.substring(1).trim();
96 }
97 //
98
99 Chapter chap = new Chapter(number, chapterName);
100
101 if (content != null) {
102 List<Paragraph> paras = makeParagraphs(support, source, content,
103 html, pg);
104 long words = 0;
105 for (Paragraph para : paras) {
106 words += para.getWords();
107 }
108 chap.setParagraphs(paras);
109 chap.setWords(words);
110 }
111
112 return chap;
113 }
114
115 /**
116 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
117 * and requotify them (i.e., separate them into QUOTE paragraphs and other
118 * paragraphs (quotes or not)).
119 *
120 * @param para
121 * the paragraph to requotify (not necessarily a quote)
122 * @param html
123 * TRUE if the input content is in HTML mode
124 *
125 * @return the correctly (or so we hope) quotified paragraphs
126 */
8d59ce07 127 protected List<Paragraph> requotify(Paragraph para, boolean html) {
0ffa4754
NR
128 List<Paragraph> newParas = new ArrayList<Paragraph>();
129
130 if (para.getType() == ParagraphType.QUOTE
131 && para.getContent().length() > 2) {
132 String line = para.getContent();
133 boolean singleQ = line.startsWith("" + openQuote);
134 boolean doubleQ = line.startsWith("" + openDoubleQuote);
135
136 // Do not try when more than one quote at a time
137 // (some stories are not easily readable if we do)
138 if (singleQ
139 && line.indexOf(closeQuote, 1) < line
140 .lastIndexOf(closeQuote)) {
141 newParas.add(para);
142 return newParas;
143 }
144 if (doubleQ
145 && line.indexOf(closeDoubleQuote, 1) < line
146 .lastIndexOf(closeDoubleQuote)) {
147 newParas.add(para);
148 return newParas;
149 }
150 //
151
152 if (!singleQ && !doubleQ) {
153 line = openDoubleQuote + line + closeDoubleQuote;
154 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
155 .getWords()));
156 } else {
157 char open = singleQ ? openQuote : openDoubleQuote;
158 char close = singleQ ? closeQuote : closeDoubleQuote;
159
160 int posDot = -1;
161 boolean inQuote = false;
162 int i = 0;
163 for (char car : line.toCharArray()) {
164 if (car == open) {
165 inQuote = true;
166 } else if (car == close) {
167 inQuote = false;
168 } else if (car == '.' && !inQuote) {
169 posDot = i;
170 break;
171 }
172 i++;
173 }
174
175 if (posDot >= 0) {
176 String rest = line.substring(posDot + 1).trim();
177 line = line.substring(0, posDot + 1).trim();
178 long words = 1;
179 for (char car : line.toCharArray()) {
180 if (car == ' ') {
181 words++;
182 }
183 }
184 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
185 if (!rest.isEmpty()) {
186 newParas.addAll(requotify(processPara(rest, html), html));
187 }
188 } else {
189 newParas.add(para);
190 }
191 }
192 } else {
193 newParas.add(para);
194 }
195
196 return newParas;
197 }
198
199 /**
200 * Process a {@link Paragraph} from a raw line of text.
201 * <p>
202 * Will also fix quotes and HTML encoding if needed.
203 *
204 * @param line
205 * the raw line
206 * @param html
207 * TRUE if the input content is in HTML mode
208 *
209 * @return the processed {@link Paragraph}
210 */
8d59ce07 211 protected Paragraph processPara(String line, boolean html) {
0ffa4754
NR
212 if (html) {
213 line = StringUtils.unhtml(line).trim();
214 }
215 boolean space = true;
216 boolean brk = true;
217 boolean quote = false;
218 boolean tentativeCloseQuote = false;
219 char prev = '\0';
220 int dashCount = 0;
221 long words = 1;
222
223 StringBuilder builder = new StringBuilder();
224 for (char car : line.toCharArray()) {
225 if (car != '-') {
226 if (dashCount > 0) {
227 // dash, ndash and mdash: - – —
228 // currently: always use mdash
229 builder.append(dashCount == 1 ? '-' : '—');
230 }
231 dashCount = 0;
232 }
233
234 if (tentativeCloseQuote) {
235 tentativeCloseQuote = false;
236 if (Character.isLetterOrDigit(car)) {
237 builder.append("'");
238 } else {
239 // handle double-single quotes as double quotes
240 if (prev == car) {
241 builder.append(closeDoubleQuote);
242 continue;
243 }
244
245 builder.append(closeQuote);
246 }
247 }
248
249 switch (car) {
250 case ' ': // note: unbreakable space
251 case ' ':
252 case '\t':
253 case '\n': // just in case
254 case '\r': // just in case
255 if (builder.length() > 0
256 && builder.charAt(builder.length() - 1) != ' ') {
257 words++;
258 }
259 builder.append(' ');
260 break;
261
262 case '\'':
263 if (space || (brk && quote)) {
264 quote = true;
265 // handle double-single quotes as double quotes
266 if (prev == car) {
267 builder.deleteCharAt(builder.length() - 1);
268 builder.append(openDoubleQuote);
269 } else {
270 builder.append(openQuote);
271 }
272 } else if (prev == ' ' || prev == car) {
273 // handle double-single quotes as double quotes
274 if (prev == car) {
275 builder.deleteCharAt(builder.length() - 1);
276 builder.append(openDoubleQuote);
277 } else {
278 builder.append(openQuote);
279 }
280 } else {
281 // it is a quote ("I'm off") or a 'quote' ("This
282 // 'good' restaurant"...)
283 tentativeCloseQuote = true;
284 }
285 break;
286
287 case '"':
288 if (space || (brk && quote)) {
289 quote = true;
290 builder.append(openDoubleQuote);
291 } else if (prev == ' ') {
292 builder.append(openDoubleQuote);
293 } else {
294 builder.append(closeDoubleQuote);
295 }
296 break;
297
298 case '-':
299 if (space) {
300 quote = true;
301 } else {
302 dashCount++;
303 }
304 space = false;
305 break;
306
307 case '*':
308 case '~':
309 case '/':
310 case '\\':
311 case '<':
312 case '>':
313 case '=':
314 case '+':
315 case '_':
316 case '–':
317 case '—':
318 space = false;
319 builder.append(car);
320 break;
321
322 case '‘':
323 case '`':
324 case '‹':
325 case '﹁':
326 case '〈':
327 case '「':
328 if (space || (brk && quote)) {
329 quote = true;
330 builder.append(openQuote);
331 } else {
332 // handle double-single quotes as double quotes
333 if (prev == car) {
334 builder.deleteCharAt(builder.length() - 1);
335 builder.append(openDoubleQuote);
336 } else {
337 builder.append(openQuote);
338 }
339 }
340 space = false;
341 brk = false;
342 break;
343
344 case '’':
345 case '›':
346 case '﹂':
347 case '〉':
348 case '」':
349 space = false;
350 brk = false;
351 // handle double-single quotes as double quotes
352 if (prev == car) {
353 builder.deleteCharAt(builder.length() - 1);
354 builder.append(closeDoubleQuote);
355 } else {
356 builder.append(closeQuote);
357 }
358 break;
359
360 case '«':
361 case '“':
362 case '﹃':
363 case '《':
364 case '『':
365 if (space || (brk && quote)) {
366 quote = true;
367 builder.append(openDoubleQuote);
368 } else {
369 builder.append(openDoubleQuote);
370 }
371 space = false;
372 brk = false;
373 break;
374
375 case '»':
376 case '”':
377 case '﹄':
378 case '》':
379 case '』':
380 space = false;
381 brk = false;
382 builder.append(closeDoubleQuote);
383 break;
384
385 default:
386 space = false;
387 brk = false;
388 builder.append(car);
389 break;
390 }
391
392 prev = car;
393 }
394
395 if (tentativeCloseQuote) {
396 tentativeCloseQuote = false;
397 builder.append(closeQuote);
398 }
399
400 line = builder.toString().trim();
401
402 ParagraphType type = ParagraphType.NORMAL;
403 if (space) {
404 type = ParagraphType.BLANK;
405 } else if (brk) {
406 type = ParagraphType.BREAK;
407 } else if (quote) {
408 type = ParagraphType.QUOTE;
409 }
410
411 return new Paragraph(type, line, words);
412 }
413
414 /**
415 * Convert the given content into {@link Paragraph}s.
416 *
417 * @param support
8d59ce07
NR
418 * the linked {@link BasicSupport} (can be NULL),
419 * used to download optional image content in []
0ffa4754
NR
420 * @param source
421 * the source URL of the story
422 * @param content
423 * the textual content
424 * @param html
425 * TRUE if the input content is in HTML mode
426 * @param pg
427 * the optional progress reporter
428 *
429 * @return the {@link Paragraph}s
430 *
431 * @throws IOException
432 * in case of I/O error
433 */
8d59ce07 434 protected List<Paragraph> makeParagraphs(BasicSupport support,
0ffa4754
NR
435 URL source, String content, boolean html, Progress pg)
436 throws IOException {
437 if (pg == null) {
438 pg = new Progress();
439 }
440
441 if (html) {
442 // Special <HR> processing:
443 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
444 "<br/>* * *<br/>");
445 }
446
447 List<Paragraph> paras = new ArrayList<Paragraph>();
448
449 if (content != null && !content.trim().isEmpty()) {
450 if (html) {
451 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
452 pg.setMinMax(0, tab.length);
453 int i = 1;
454 for (String line : tab) {
455 if (line.startsWith("[") && line.endsWith("]")) {
456 pg.setName("Extracting image " + i);
457 }
458 paras.add(makeParagraph(support, source, line.trim(), html));
459 pg.setProgress(i++);
460 }
0ffa4754
NR
461 } else {
462 List<String> lines = new ArrayList<String>();
463 BufferedReader buff = null;
464 try {
465 buff = new BufferedReader(
466 new InputStreamReader(new ByteArrayInputStream(
467 content.getBytes("UTF-8")), "UTF-8"));
468 for (String line = buff.readLine(); line != null; line = buff
469 .readLine()) {
470 lines.add(line.trim());
471 }
472 } finally {
473 if (buff != null) {
474 buff.close();
475 }
476 }
477
478 pg.setMinMax(0, lines.size());
479 int i = 0;
480 for (String line : lines) {
481 if (line.startsWith("[") && line.endsWith("]")) {
482 pg.setName("Extracting image " + i);
483 }
484 paras.add(makeParagraph(support, source, line, html));
485 pg.setProgress(i++);
486 }
0ffa4754
NR
487 }
488
68328e17
NR
489 pg.done();
490 pg.setName(null);
491
0ffa4754
NR
492 // Check quotes for "bad" format
493 List<Paragraph> newParas = new ArrayList<Paragraph>();
494 for (Paragraph para : paras) {
8d59ce07 495 newParas.addAll(requotify(para, html));
0ffa4754
NR
496 }
497 paras = newParas;
498
499 // Remove double blanks/brks
500 fixBlanksBreaks(paras);
501 }
502
503 return paras;
504 }
505
506 /**
507 * Convert the given line into a single {@link Paragraph}.
508 *
509 * @param support
8d59ce07
NR
510 * the linked {@link BasicSupport} (can be NULL),
511 * used to download optional image content in []
0ffa4754
NR
512 * @param source
513 * the source URL of the story
514 * @param line
515 * the textual content of the paragraph
516 * @param html
517 * TRUE if the input content is in HTML mode
518 *
519 * @return the {@link Paragraph}
520 */
8d59ce07 521 protected Paragraph makeParagraph(BasicSupport support, URL source,
0ffa4754
NR
522 String line, boolean html) {
523 Image image = null;
524 if (line.startsWith("[") && line.endsWith("]")) {
8d59ce07 525 image = bsHelper.getImage(support, source, line
0ffa4754
NR
526 .substring(1, line.length() - 1).trim());
527 }
528
529 if (image != null) {
530 return new Paragraph(image);
531 }
532
8d59ce07 533 return processPara(line, html);
0ffa4754
NR
534 }
535
536 /**
537 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
538 * those {@link Paragraph}s.
539 * <p>
540 * The resulting list will not contain a starting or trailing blank/break
541 * nor 2 blanks or breaks following each other.
542 *
543 * @param paras
544 * the list of {@link Paragraph}s to fix
545 */
8d59ce07 546 protected void fixBlanksBreaks(List<Paragraph> paras) {
0ffa4754
NR
547 boolean space = false;
548 boolean brk = true;
549 for (int i = 0; i < paras.size(); i++) {
550 Paragraph para = paras.get(i);
551 boolean thisSpace = para.getType() == ParagraphType.BLANK;
552 boolean thisBrk = para.getType() == ParagraphType.BREAK;
553
554 if (i > 0 && space && thisBrk) {
555 paras.remove(i - 1);
556 i--;
557 } else if ((space || brk) && (thisSpace || thisBrk)) {
558 paras.remove(i);
559 i--;
560 }
561
562 space = thisSpace;
563 brk = thisBrk;
564 }
565
566 // Remove blank/brk at start
567 if (paras.size() > 0
568 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
569 0).getType() == ParagraphType.BREAK)) {
570 paras.remove(0);
571 }
572
573 // Remove blank/brk at end
574 int last = paras.size() - 1;
575 if (paras.size() > 0
576 && (paras.get(last).getType() == ParagraphType.BLANK || paras
577 .get(last).getType() == ParagraphType.BREAK)) {
578 paras.remove(last);
579 }
580 }
581}