change config bundle to better work with new nikiroo-utils
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupportPara.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.URL;
8 import java.util.ArrayList;
9 import java.util.List;
10
11 import be.nikiroo.fanfix.Instance;
12 import be.nikiroo.fanfix.bundles.Config;
13 import be.nikiroo.fanfix.bundles.StringId;
14 import be.nikiroo.fanfix.data.Chapter;
15 import be.nikiroo.fanfix.data.Paragraph;
16 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17 import be.nikiroo.utils.Image;
18 import be.nikiroo.utils.Progress;
19 import be.nikiroo.utils.StringUtils;
20
21 /**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
23 * and text formatting for the {@link BasicSupport} class itself (not its
24 * children).
25 *
26 * @author niki
27 */
28 class BasicSupportPara {
29 // quote chars
30 private static char openQuote = Instance.getTrans().getCharacter(
31 StringId.OPEN_SINGLE_QUOTE);
32 private static char closeQuote = Instance.getTrans().getCharacter(
33 StringId.CLOSE_SINGLE_QUOTE);
34 private static char openDoubleQuote = Instance.getTrans().getCharacter(
35 StringId.OPEN_DOUBLE_QUOTE);
36 private static char closeDoubleQuote = Instance.getTrans().getCharacter(
37 StringId.CLOSE_DOUBLE_QUOTE);
38
39 /**
40 * Create a {@link Chapter} object from the given information, formatting
41 * the content as it should be.
42 *
43 * @param support
44 * the linked {@link BasicSupport}
45 * @param source
46 * the source of the story
47 * @param number
48 * the chapter number
49 * @param name
50 * the chapter name
51 * @param content
52 * the chapter content
53 * @param pg
54 * the optional progress reporter
55 * @param html
56 * TRUE if the input content is in HTML mode
57 *
58 * @return the {@link Chapter}
59 *
60 * @throws IOException
61 * in case of I/O error
62 */
63 public static Chapter makeChapter(BasicSupport support, URL source,
64 int number, String name, String content, boolean html, Progress pg)
65 throws IOException {
66 // Chapter name: process it correctly, then remove the possible
67 // redundant "Chapter x: " in front of it, or "-" (as in
68 // "Chapter 5: - Fun!" after the ": " was automatically added)
69 String chapterName = BasicSupportPara.processPara(name, false)
70 .getContent().trim();
71 for (String lang : Instance.getConfig().getList(Config.CHAPTER)) {
72 String chapterWord = Instance.getConfig().getStringX(
73 Config.CHAPTER, lang);
74 if (chapterName.startsWith(chapterWord)) {
75 chapterName = chapterName.substring(chapterWord.length())
76 .trim();
77 break;
78 }
79 }
80
81 if (chapterName.startsWith(Integer.toString(number))) {
82 chapterName = chapterName.substring(
83 Integer.toString(number).length()).trim();
84 }
85
86 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
87 chapterName = chapterName.substring(1).trim();
88 }
89 //
90
91 Chapter chap = new Chapter(number, chapterName);
92
93 if (content != null) {
94 List<Paragraph> paras = makeParagraphs(support, source, content,
95 html, pg);
96 long words = 0;
97 for (Paragraph para : paras) {
98 words += para.getWords();
99 }
100 chap.setParagraphs(paras);
101 chap.setWords(words);
102 }
103
104 return chap;
105 }
106
107 /**
108 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
109 * and requotify them (i.e., separate them into QUOTE paragraphs and other
110 * paragraphs (quotes or not)).
111 *
112 * @param para
113 * the paragraph to requotify (not necessarily a quote)
114 * @param html
115 * TRUE if the input content is in HTML mode
116 *
117 * @return the correctly (or so we hope) quotified paragraphs
118 */
119 private static List<Paragraph> requotify(Paragraph para, boolean html) {
120 List<Paragraph> newParas = new ArrayList<Paragraph>();
121
122 if (para.getType() == ParagraphType.QUOTE
123 && para.getContent().length() > 2) {
124 String line = para.getContent();
125 boolean singleQ = line.startsWith("" + openQuote);
126 boolean doubleQ = line.startsWith("" + openDoubleQuote);
127
128 // Do not try when more than one quote at a time
129 // (some stories are not easily readable if we do)
130 if (singleQ
131 && line.indexOf(closeQuote, 1) < line
132 .lastIndexOf(closeQuote)) {
133 newParas.add(para);
134 return newParas;
135 }
136 if (doubleQ
137 && line.indexOf(closeDoubleQuote, 1) < line
138 .lastIndexOf(closeDoubleQuote)) {
139 newParas.add(para);
140 return newParas;
141 }
142 //
143
144 if (!singleQ && !doubleQ) {
145 line = openDoubleQuote + line + closeDoubleQuote;
146 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
147 .getWords()));
148 } else {
149 char open = singleQ ? openQuote : openDoubleQuote;
150 char close = singleQ ? closeQuote : closeDoubleQuote;
151
152 int posDot = -1;
153 boolean inQuote = false;
154 int i = 0;
155 for (char car : line.toCharArray()) {
156 if (car == open) {
157 inQuote = true;
158 } else if (car == close) {
159 inQuote = false;
160 } else if (car == '.' && !inQuote) {
161 posDot = i;
162 break;
163 }
164 i++;
165 }
166
167 if (posDot >= 0) {
168 String rest = line.substring(posDot + 1).trim();
169 line = line.substring(0, posDot + 1).trim();
170 long words = 1;
171 for (char car : line.toCharArray()) {
172 if (car == ' ') {
173 words++;
174 }
175 }
176 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
177 if (!rest.isEmpty()) {
178 newParas.addAll(requotify(processPara(rest, html), html));
179 }
180 } else {
181 newParas.add(para);
182 }
183 }
184 } else {
185 newParas.add(para);
186 }
187
188 return newParas;
189 }
190
191 /**
192 * Process a {@link Paragraph} from a raw line of text.
193 * <p>
194 * Will also fix quotes and HTML encoding if needed.
195 *
196 * @param line
197 * the raw line
198 * @param html
199 * TRUE if the input content is in HTML mode
200 *
201 * @return the processed {@link Paragraph}
202 */
203 private static Paragraph processPara(String line, boolean html) {
204 if (html) {
205 line = StringUtils.unhtml(line).trim();
206 }
207 boolean space = true;
208 boolean brk = true;
209 boolean quote = false;
210 boolean tentativeCloseQuote = false;
211 char prev = '\0';
212 int dashCount = 0;
213 long words = 1;
214
215 StringBuilder builder = new StringBuilder();
216 for (char car : line.toCharArray()) {
217 if (car != '-') {
218 if (dashCount > 0) {
219 // dash, ndash and mdash: - – —
220 // currently: always use mdash
221 builder.append(dashCount == 1 ? '-' : '—');
222 }
223 dashCount = 0;
224 }
225
226 if (tentativeCloseQuote) {
227 tentativeCloseQuote = false;
228 if (Character.isLetterOrDigit(car)) {
229 builder.append("'");
230 } else {
231 // handle double-single quotes as double quotes
232 if (prev == car) {
233 builder.append(closeDoubleQuote);
234 continue;
235 }
236
237 builder.append(closeQuote);
238 }
239 }
240
241 switch (car) {
242 case ' ': // note: unbreakable space
243 case ' ':
244 case '\t':
245 case '\n': // just in case
246 case '\r': // just in case
247 if (builder.length() > 0
248 && builder.charAt(builder.length() - 1) != ' ') {
249 words++;
250 }
251 builder.append(' ');
252 break;
253
254 case '\'':
255 if (space || (brk && quote)) {
256 quote = true;
257 // handle double-single quotes as double quotes
258 if (prev == car) {
259 builder.deleteCharAt(builder.length() - 1);
260 builder.append(openDoubleQuote);
261 } else {
262 builder.append(openQuote);
263 }
264 } else if (prev == ' ' || prev == car) {
265 // handle double-single quotes as double quotes
266 if (prev == car) {
267 builder.deleteCharAt(builder.length() - 1);
268 builder.append(openDoubleQuote);
269 } else {
270 builder.append(openQuote);
271 }
272 } else {
273 // it is a quote ("I'm off") or a 'quote' ("This
274 // 'good' restaurant"...)
275 tentativeCloseQuote = true;
276 }
277 break;
278
279 case '"':
280 if (space || (brk && quote)) {
281 quote = true;
282 builder.append(openDoubleQuote);
283 } else if (prev == ' ') {
284 builder.append(openDoubleQuote);
285 } else {
286 builder.append(closeDoubleQuote);
287 }
288 break;
289
290 case '-':
291 if (space) {
292 quote = true;
293 } else {
294 dashCount++;
295 }
296 space = false;
297 break;
298
299 case '*':
300 case '~':
301 case '/':
302 case '\\':
303 case '<':
304 case '>':
305 case '=':
306 case '+':
307 case '_':
308 case '–':
309 case '—':
310 space = false;
311 builder.append(car);
312 break;
313
314 case '‘':
315 case '`':
316 case '‹':
317 case '﹁':
318 case '〈':
319 case '「':
320 if (space || (brk && quote)) {
321 quote = true;
322 builder.append(openQuote);
323 } else {
324 // handle double-single quotes as double quotes
325 if (prev == car) {
326 builder.deleteCharAt(builder.length() - 1);
327 builder.append(openDoubleQuote);
328 } else {
329 builder.append(openQuote);
330 }
331 }
332 space = false;
333 brk = false;
334 break;
335
336 case '’':
337 case '›':
338 case '﹂':
339 case '〉':
340 case '」':
341 space = false;
342 brk = false;
343 // handle double-single quotes as double quotes
344 if (prev == car) {
345 builder.deleteCharAt(builder.length() - 1);
346 builder.append(closeDoubleQuote);
347 } else {
348 builder.append(closeQuote);
349 }
350 break;
351
352 case '«':
353 case '“':
354 case '﹃':
355 case '《':
356 case '『':
357 if (space || (brk && quote)) {
358 quote = true;
359 builder.append(openDoubleQuote);
360 } else {
361 builder.append(openDoubleQuote);
362 }
363 space = false;
364 brk = false;
365 break;
366
367 case '»':
368 case '”':
369 case '﹄':
370 case '》':
371 case '』':
372 space = false;
373 brk = false;
374 builder.append(closeDoubleQuote);
375 break;
376
377 default:
378 space = false;
379 brk = false;
380 builder.append(car);
381 break;
382 }
383
384 prev = car;
385 }
386
387 if (tentativeCloseQuote) {
388 tentativeCloseQuote = false;
389 builder.append(closeQuote);
390 }
391
392 line = builder.toString().trim();
393
394 ParagraphType type = ParagraphType.NORMAL;
395 if (space) {
396 type = ParagraphType.BLANK;
397 } else if (brk) {
398 type = ParagraphType.BREAK;
399 } else if (quote) {
400 type = ParagraphType.QUOTE;
401 }
402
403 return new Paragraph(type, line, words);
404 }
405
406 /**
407 * Convert the given content into {@link Paragraph}s.
408 *
409 * @param support
410 * the linked {@link BasicSupport}
411 * @param source
412 * the source URL of the story
413 * @param content
414 * the textual content
415 * @param html
416 * TRUE if the input content is in HTML mode
417 * @param pg
418 * the optional progress reporter
419 *
420 * @return the {@link Paragraph}s
421 *
422 * @throws IOException
423 * in case of I/O error
424 */
425 private static List<Paragraph> makeParagraphs(BasicSupport support,
426 URL source, String content, boolean html, Progress pg)
427 throws IOException {
428 if (pg == null) {
429 pg = new Progress();
430 }
431
432 if (html) {
433 // Special <HR> processing:
434 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
435 "<br/>* * *<br/>");
436 }
437
438 List<Paragraph> paras = new ArrayList<Paragraph>();
439
440 if (content != null && !content.trim().isEmpty()) {
441 if (html) {
442 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
443 pg.setMinMax(0, tab.length);
444 int i = 1;
445 for (String line : tab) {
446 if (line.startsWith("[") && line.endsWith("]")) {
447 pg.setName("Extracting image " + i);
448 }
449 paras.add(makeParagraph(support, source, line.trim(), html));
450 pg.setProgress(i++);
451 }
452 } else {
453 List<String> lines = new ArrayList<String>();
454 BufferedReader buff = null;
455 try {
456 buff = new BufferedReader(
457 new InputStreamReader(new ByteArrayInputStream(
458 content.getBytes("UTF-8")), "UTF-8"));
459 for (String line = buff.readLine(); line != null; line = buff
460 .readLine()) {
461 lines.add(line.trim());
462 }
463 } finally {
464 if (buff != null) {
465 buff.close();
466 }
467 }
468
469 pg.setMinMax(0, lines.size());
470 int i = 0;
471 for (String line : lines) {
472 if (line.startsWith("[") && line.endsWith("]")) {
473 pg.setName("Extracting image " + i);
474 }
475 paras.add(makeParagraph(support, source, line, html));
476 pg.setProgress(i++);
477 }
478 }
479
480 pg.done();
481 pg.setName(null);
482
483 // Check quotes for "bad" format
484 List<Paragraph> newParas = new ArrayList<Paragraph>();
485 for (Paragraph para : paras) {
486 newParas.addAll(BasicSupportPara.requotify(para, html));
487 }
488 paras = newParas;
489
490 // Remove double blanks/brks
491 fixBlanksBreaks(paras);
492 }
493
494 return paras;
495 }
496
497 /**
498 * Convert the given line into a single {@link Paragraph}.
499 *
500 * @param support
501 * the linked {@link BasicSupport}
502 * @param source
503 * the source URL of the story
504 * @param line
505 * the textual content of the paragraph
506 * @param html
507 * TRUE if the input content is in HTML mode
508 *
509 * @return the {@link Paragraph}
510 */
511 private static Paragraph makeParagraph(BasicSupport support, URL source,
512 String line, boolean html) {
513 Image image = null;
514 if (line.startsWith("[") && line.endsWith("]")) {
515 image = BasicSupportHelper.getImage(support, source, line
516 .substring(1, line.length() - 1).trim());
517 }
518
519 if (image != null) {
520 return new Paragraph(image);
521 }
522
523 return BasicSupportPara.processPara(line, html);
524 }
525
526 /**
527 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
528 * those {@link Paragraph}s.
529 * <p>
530 * The resulting list will not contain a starting or trailing blank/break
531 * nor 2 blanks or breaks following each other.
532 *
533 * @param paras
534 * the list of {@link Paragraph}s to fix
535 */
536 private static void fixBlanksBreaks(List<Paragraph> paras) {
537 boolean space = false;
538 boolean brk = true;
539 for (int i = 0; i < paras.size(); i++) {
540 Paragraph para = paras.get(i);
541 boolean thisSpace = para.getType() == ParagraphType.BLANK;
542 boolean thisBrk = para.getType() == ParagraphType.BREAK;
543
544 if (i > 0 && space && thisBrk) {
545 paras.remove(i - 1);
546 i--;
547 } else if ((space || brk) && (thisSpace || thisBrk)) {
548 paras.remove(i);
549 i--;
550 }
551
552 space = thisSpace;
553 brk = thisBrk;
554 }
555
556 // Remove blank/brk at start
557 if (paras.size() > 0
558 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
559 0).getType() == ParagraphType.BREAK)) {
560 paras.remove(0);
561 }
562
563 // Remove blank/brk at end
564 int last = paras.size() - 1;
565 if (paras.size() > 0
566 && (paras.get(last).getType() == ParagraphType.BLANK || paras
567 .get(last).getType() == ParagraphType.BREAK)) {
568 paras.remove(last);
569 }
570 }
571 }