d3beea53b47c39b251c5b31647397f54628ac54f
[fanfix.git] / src / be / nikiroo / fanfix / supported / BasicSupportPara.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.BufferedReader;
4 import java.io.ByteArrayInputStream;
5 import java.io.IOException;
6 import java.io.InputStreamReader;
7 import java.net.URL;
8 import java.util.ArrayList;
9 import java.util.List;
10
11 import be.nikiroo.fanfix.Instance;
12 import be.nikiroo.fanfix.bundles.Config;
13 import be.nikiroo.fanfix.bundles.StringId;
14 import be.nikiroo.fanfix.data.Chapter;
15 import be.nikiroo.fanfix.data.Paragraph;
16 import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
17 import be.nikiroo.utils.Image;
18 import be.nikiroo.utils.Progress;
19 import be.nikiroo.utils.StringUtils;
20
21 /**
22 * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph}
23 * and text formatting for the {@link BasicSupport} class itself (not its
24 * children).
25 *
26 * @author niki
27 */
28 class BasicSupportPara {
29 // quote chars
30 private static char openQuote = Instance.getTrans().getCharacter(
31 StringId.OPEN_SINGLE_QUOTE);
32 private static char closeQuote = Instance.getTrans().getCharacter(
33 StringId.CLOSE_SINGLE_QUOTE);
34 private static char openDoubleQuote = Instance.getTrans().getCharacter(
35 StringId.OPEN_DOUBLE_QUOTE);
36 private static char closeDoubleQuote = Instance.getTrans().getCharacter(
37 StringId.CLOSE_DOUBLE_QUOTE);
38
39 /**
40 * Create a {@link Chapter} object from the given information, formatting
41 * the content as it should be.
42 *
43 * @param support
44 * the linked {@link BasicSupport}
45 * @param source
46 * the source of the story
47 * @param number
48 * the chapter number
49 * @param name
50 * the chapter name
51 * @param content
52 * the chapter content
53 * @param pg
54 * the optional progress reporter
55 * @param html
56 * TRUE if the input content is in HTML mode
57 *
58 * @return the {@link Chapter}
59 *
60 * @throws IOException
61 * in case of I/O error
62 */
63 public static Chapter makeChapter(BasicSupport support, URL source,
64 int number, String name, String content, boolean html, Progress pg)
65 throws IOException {
66 // Chapter name: process it correctly, then remove the possible
67 // redundant "Chapter x: " in front of it, or "-" (as in
68 // "Chapter 5: - Fun!" after the ": " was automatically added)
69 String chapterName = BasicSupportPara.processPara(name, false)
70 .getContent().trim();
71 for (String lang : Instance.getConfig().getString(Config.CHAPTER)
72 .split(",")) {
73 String chapterWord = Instance.getConfig().getStringX(
74 Config.CHAPTER, lang);
75 if (chapterName.startsWith(chapterWord)) {
76 chapterName = chapterName.substring(chapterWord.length())
77 .trim();
78 break;
79 }
80 }
81
82 if (chapterName.startsWith(Integer.toString(number))) {
83 chapterName = chapterName.substring(
84 Integer.toString(number).length()).trim();
85 }
86
87 while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
88 chapterName = chapterName.substring(1).trim();
89 }
90 //
91
92 Chapter chap = new Chapter(number, chapterName);
93
94 if (content != null) {
95 List<Paragraph> paras = makeParagraphs(support, source, content,
96 html, pg);
97 long words = 0;
98 for (Paragraph para : paras) {
99 words += para.getWords();
100 }
101 chap.setParagraphs(paras);
102 chap.setWords(words);
103 }
104
105 return chap;
106 }
107
108 /**
109 * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
110 * and requotify them (i.e., separate them into QUOTE paragraphs and other
111 * paragraphs (quotes or not)).
112 *
113 * @param para
114 * the paragraph to requotify (not necessarily a quote)
115 * @param html
116 * TRUE if the input content is in HTML mode
117 *
118 * @return the correctly (or so we hope) quotified paragraphs
119 */
120 private static List<Paragraph> requotify(Paragraph para, boolean html) {
121 List<Paragraph> newParas = new ArrayList<Paragraph>();
122
123 if (para.getType() == ParagraphType.QUOTE
124 && para.getContent().length() > 2) {
125 String line = para.getContent();
126 boolean singleQ = line.startsWith("" + openQuote);
127 boolean doubleQ = line.startsWith("" + openDoubleQuote);
128
129 // Do not try when more than one quote at a time
130 // (some stories are not easily readable if we do)
131 if (singleQ
132 && line.indexOf(closeQuote, 1) < line
133 .lastIndexOf(closeQuote)) {
134 newParas.add(para);
135 return newParas;
136 }
137 if (doubleQ
138 && line.indexOf(closeDoubleQuote, 1) < line
139 .lastIndexOf(closeDoubleQuote)) {
140 newParas.add(para);
141 return newParas;
142 }
143 //
144
145 if (!singleQ && !doubleQ) {
146 line = openDoubleQuote + line + closeDoubleQuote;
147 newParas.add(new Paragraph(ParagraphType.QUOTE, line, para
148 .getWords()));
149 } else {
150 char open = singleQ ? openQuote : openDoubleQuote;
151 char close = singleQ ? closeQuote : closeDoubleQuote;
152
153 int posDot = -1;
154 boolean inQuote = false;
155 int i = 0;
156 for (char car : line.toCharArray()) {
157 if (car == open) {
158 inQuote = true;
159 } else if (car == close) {
160 inQuote = false;
161 } else if (car == '.' && !inQuote) {
162 posDot = i;
163 break;
164 }
165 i++;
166 }
167
168 if (posDot >= 0) {
169 String rest = line.substring(posDot + 1).trim();
170 line = line.substring(0, posDot + 1).trim();
171 long words = 1;
172 for (char car : line.toCharArray()) {
173 if (car == ' ') {
174 words++;
175 }
176 }
177 newParas.add(new Paragraph(ParagraphType.QUOTE, line, words));
178 if (!rest.isEmpty()) {
179 newParas.addAll(requotify(processPara(rest, html), html));
180 }
181 } else {
182 newParas.add(para);
183 }
184 }
185 } else {
186 newParas.add(para);
187 }
188
189 return newParas;
190 }
191
192 /**
193 * Process a {@link Paragraph} from a raw line of text.
194 * <p>
195 * Will also fix quotes and HTML encoding if needed.
196 *
197 * @param line
198 * the raw line
199 * @param html
200 * TRUE if the input content is in HTML mode
201 *
202 * @return the processed {@link Paragraph}
203 */
204 private static Paragraph processPara(String line, boolean html) {
205 if (html) {
206 line = StringUtils.unhtml(line).trim();
207 }
208 boolean space = true;
209 boolean brk = true;
210 boolean quote = false;
211 boolean tentativeCloseQuote = false;
212 char prev = '\0';
213 int dashCount = 0;
214 long words = 1;
215
216 StringBuilder builder = new StringBuilder();
217 for (char car : line.toCharArray()) {
218 if (car != '-') {
219 if (dashCount > 0) {
220 // dash, ndash and mdash: - – —
221 // currently: always use mdash
222 builder.append(dashCount == 1 ? '-' : '—');
223 }
224 dashCount = 0;
225 }
226
227 if (tentativeCloseQuote) {
228 tentativeCloseQuote = false;
229 if (Character.isLetterOrDigit(car)) {
230 builder.append("'");
231 } else {
232 // handle double-single quotes as double quotes
233 if (prev == car) {
234 builder.append(closeDoubleQuote);
235 continue;
236 }
237
238 builder.append(closeQuote);
239 }
240 }
241
242 switch (car) {
243 case ' ': // note: unbreakable space
244 case ' ':
245 case '\t':
246 case '\n': // just in case
247 case '\r': // just in case
248 if (builder.length() > 0
249 && builder.charAt(builder.length() - 1) != ' ') {
250 words++;
251 }
252 builder.append(' ');
253 break;
254
255 case '\'':
256 if (space || (brk && quote)) {
257 quote = true;
258 // handle double-single quotes as double quotes
259 if (prev == car) {
260 builder.deleteCharAt(builder.length() - 1);
261 builder.append(openDoubleQuote);
262 } else {
263 builder.append(openQuote);
264 }
265 } else if (prev == ' ' || prev == car) {
266 // handle double-single quotes as double quotes
267 if (prev == car) {
268 builder.deleteCharAt(builder.length() - 1);
269 builder.append(openDoubleQuote);
270 } else {
271 builder.append(openQuote);
272 }
273 } else {
274 // it is a quote ("I'm off") or a 'quote' ("This
275 // 'good' restaurant"...)
276 tentativeCloseQuote = true;
277 }
278 break;
279
280 case '"':
281 if (space || (brk && quote)) {
282 quote = true;
283 builder.append(openDoubleQuote);
284 } else if (prev == ' ') {
285 builder.append(openDoubleQuote);
286 } else {
287 builder.append(closeDoubleQuote);
288 }
289 break;
290
291 case '-':
292 if (space) {
293 quote = true;
294 } else {
295 dashCount++;
296 }
297 space = false;
298 break;
299
300 case '*':
301 case '~':
302 case '/':
303 case '\\':
304 case '<':
305 case '>':
306 case '=':
307 case '+':
308 case '_':
309 case '–':
310 case '—':
311 space = false;
312 builder.append(car);
313 break;
314
315 case '‘':
316 case '`':
317 case '‹':
318 case 'īš':
319 case '〈':
320 case '「':
321 if (space || (brk && quote)) {
322 quote = true;
323 builder.append(openQuote);
324 } else {
325 // handle double-single quotes as double quotes
326 if (prev == car) {
327 builder.deleteCharAt(builder.length() - 1);
328 builder.append(openDoubleQuote);
329 } else {
330 builder.append(openQuote);
331 }
332 }
333 space = false;
334 brk = false;
335 break;
336
337 case '’':
338 case 'â€ē':
339 case 'īš‚':
340 case '〉':
341 case '」':
342 space = false;
343 brk = false;
344 // handle double-single quotes as double quotes
345 if (prev == car) {
346 builder.deleteCharAt(builder.length() - 1);
347 builder.append(closeDoubleQuote);
348 } else {
349 builder.append(closeQuote);
350 }
351 break;
352
353 case 'ÂĢ':
354 case '“':
355 case 'īšƒ':
356 case '《':
357 case '『':
358 if (space || (brk && quote)) {
359 quote = true;
360 builder.append(openDoubleQuote);
361 } else {
362 builder.append(openDoubleQuote);
363 }
364 space = false;
365 brk = false;
366 break;
367
368 case 'Âģ':
369 case '”':
370 case 'īš„':
371 case '》':
372 case '』':
373 space = false;
374 brk = false;
375 builder.append(closeDoubleQuote);
376 break;
377
378 default:
379 space = false;
380 brk = false;
381 builder.append(car);
382 break;
383 }
384
385 prev = car;
386 }
387
388 if (tentativeCloseQuote) {
389 tentativeCloseQuote = false;
390 builder.append(closeQuote);
391 }
392
393 line = builder.toString().trim();
394
395 ParagraphType type = ParagraphType.NORMAL;
396 if (space) {
397 type = ParagraphType.BLANK;
398 } else if (brk) {
399 type = ParagraphType.BREAK;
400 } else if (quote) {
401 type = ParagraphType.QUOTE;
402 }
403
404 return new Paragraph(type, line, words);
405 }
406
407 /**
408 * Convert the given content into {@link Paragraph}s.
409 *
410 * @param support
411 * the linked {@link BasicSupport}
412 * @param source
413 * the source URL of the story
414 * @param content
415 * the textual content
416 * @param html
417 * TRUE if the input content is in HTML mode
418 * @param pg
419 * the optional progress reporter
420 *
421 * @return the {@link Paragraph}s
422 *
423 * @throws IOException
424 * in case of I/O error
425 */
426 private static List<Paragraph> makeParagraphs(BasicSupport support,
427 URL source, String content, boolean html, Progress pg)
428 throws IOException {
429 if (pg == null) {
430 pg = new Progress();
431 }
432
433 if (html) {
434 // Special <HR> processing:
435 content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
436 "<br/>* * *<br/>");
437 }
438
439 List<Paragraph> paras = new ArrayList<Paragraph>();
440
441 if (content != null && !content.trim().isEmpty()) {
442 if (html) {
443 String[] tab = content.split("(<p>|</p>|<br>|<br/>)");
444 pg.setMinMax(0, tab.length);
445 int i = 1;
446 for (String line : tab) {
447 if (line.startsWith("[") && line.endsWith("]")) {
448 pg.setName("Extracting image " + i);
449 }
450 paras.add(makeParagraph(support, source, line.trim(), html));
451 pg.setProgress(i++);
452 }
453 } else {
454 List<String> lines = new ArrayList<String>();
455 BufferedReader buff = null;
456 try {
457 buff = new BufferedReader(
458 new InputStreamReader(new ByteArrayInputStream(
459 content.getBytes("UTF-8")), "UTF-8"));
460 for (String line = buff.readLine(); line != null; line = buff
461 .readLine()) {
462 lines.add(line.trim());
463 }
464 } finally {
465 if (buff != null) {
466 buff.close();
467 }
468 }
469
470 pg.setMinMax(0, lines.size());
471 int i = 0;
472 for (String line : lines) {
473 if (line.startsWith("[") && line.endsWith("]")) {
474 pg.setName("Extracting image " + i);
475 }
476 paras.add(makeParagraph(support, source, line, html));
477 pg.setProgress(i++);
478 }
479 }
480
481 pg.done();
482 pg.setName(null);
483
484 // Check quotes for "bad" format
485 List<Paragraph> newParas = new ArrayList<Paragraph>();
486 for (Paragraph para : paras) {
487 newParas.addAll(BasicSupportPara.requotify(para, html));
488 }
489 paras = newParas;
490
491 // Remove double blanks/brks
492 fixBlanksBreaks(paras);
493 }
494
495 return paras;
496 }
497
498 /**
499 * Convert the given line into a single {@link Paragraph}.
500 *
501 * @param support
502 * the linked {@link BasicSupport}
503 * @param source
504 * the source URL of the story
505 * @param line
506 * the textual content of the paragraph
507 * @param html
508 * TRUE if the input content is in HTML mode
509 *
510 * @return the {@link Paragraph}
511 */
512 private static Paragraph makeParagraph(BasicSupport support, URL source,
513 String line, boolean html) {
514 Image image = null;
515 if (line.startsWith("[") && line.endsWith("]")) {
516 image = BasicSupportHelper.getImage(support, source, line
517 .substring(1, line.length() - 1).trim());
518 }
519
520 if (image != null) {
521 return new Paragraph(image);
522 }
523
524 return BasicSupportPara.processPara(line, html);
525 }
526
527 /**
528 * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of
529 * those {@link Paragraph}s.
530 * <p>
531 * The resulting list will not contain a starting or trailing blank/break
532 * nor 2 blanks or breaks following each other.
533 *
534 * @param paras
535 * the list of {@link Paragraph}s to fix
536 */
537 private static void fixBlanksBreaks(List<Paragraph> paras) {
538 boolean space = false;
539 boolean brk = true;
540 for (int i = 0; i < paras.size(); i++) {
541 Paragraph para = paras.get(i);
542 boolean thisSpace = para.getType() == ParagraphType.BLANK;
543 boolean thisBrk = para.getType() == ParagraphType.BREAK;
544
545 if (i > 0 && space && thisBrk) {
546 paras.remove(i - 1);
547 i--;
548 } else if ((space || brk) && (thisSpace || thisBrk)) {
549 paras.remove(i);
550 i--;
551 }
552
553 space = thisSpace;
554 brk = thisBrk;
555 }
556
557 // Remove blank/brk at start
558 if (paras.size() > 0
559 && (paras.get(0).getType() == ParagraphType.BLANK || paras.get(
560 0).getType() == ParagraphType.BREAK)) {
561 paras.remove(0);
562 }
563
564 // Remove blank/brk at end
565 int last = paras.size() - 1;
566 if (paras.size() > 0
567 && (paras.get(last).getType() == ParagraphType.BLANK || paras
568 .get(last).getType() == ParagraphType.BREAK)) {
569 paras.remove(last);
570 }
571 }
572 }