Commit | Line | Data |
---|---|---|
0ffa4754 NR |
1 | package be.nikiroo.fanfix.supported; |
2 | ||
3 | import java.io.BufferedReader; | |
4 | import java.io.ByteArrayInputStream; | |
5 | import java.io.IOException; | |
6 | import java.io.InputStreamReader; | |
7 | import java.net.URL; | |
8 | import java.util.ArrayList; | |
9 | import java.util.List; | |
10 | ||
11 | import be.nikiroo.fanfix.Instance; | |
12 | import be.nikiroo.fanfix.bundles.Config; | |
13 | import be.nikiroo.fanfix.bundles.StringId; | |
14 | import be.nikiroo.fanfix.data.Chapter; | |
15 | import be.nikiroo.fanfix.data.Paragraph; | |
16 | import be.nikiroo.fanfix.data.Paragraph.ParagraphType; | |
17 | import be.nikiroo.utils.Image; | |
18 | import be.nikiroo.utils.Progress; | |
19 | import be.nikiroo.utils.StringUtils; | |
20 | ||
21 | /** | |
22 | * Helper class for {@link BasicSupport}, mostly dedicated to {@link Paragraph} | |
8d59ce07 | 23 | * and text formatting for the {@link BasicSupport} class. |
0ffa4754 NR |
24 | * |
25 | * @author niki | |
26 | */ | |
8d59ce07 | 27 | public class BasicSupportPara { |
0ffa4754 | 28 | // quote chars |
d66deb8d NR |
29 | private static char openQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_SINGLE_QUOTE); |
30 | private static char closeQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_SINGLE_QUOTE); | |
31 | private static char openDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.OPEN_DOUBLE_QUOTE); | |
32 | private static char closeDoubleQuote = Instance.getInstance().getTrans().getCharacter(StringId.CLOSE_DOUBLE_QUOTE); | |
0ffa4754 | 33 | |
8d59ce07 NR |
34 | // used by this class: |
35 | BasicSupportHelper bsHelper; | |
36 | BasicSupportImages bsImages; | |
37 | ||
e992c260 NR |
/**
 * Build a {@link BasicSupportPara} around its two collaborators.
 * <p>
 * Both a {@link BasicSupportHelper} and a {@link BasicSupportImages} are
 * expected; they are stored as given, without any check.
 *
 * @param bsHelper
 *            the {@link BasicSupportHelper} to use
 * @param bsImages
 *            the {@link BasicSupportImages} to use
 */
public BasicSupportPara(BasicSupportHelper bsHelper, BasicSupportImages bsImages) {
    this.bsImages = bsImages;
    this.bsHelper = bsHelper;
}
53 | ||
0ffa4754 NR |
54 | /** |
55 | * Create a {@link Chapter} object from the given information, formatting | |
56 | * the content as it should be. | |
57 | * | |
58 | * @param support | |
e992c260 | 59 | * the linked {@link BasicSupport} (can be NULL) |
0ffa4754 | 60 | * @param source |
4642806a NR |
61 | * the source of the story (for image lookup in the same path if |
62 | * the source is a file, can be NULL) | |
0ffa4754 NR |
63 | * @param number |
64 | * the chapter number | |
65 | * @param name | |
66 | * the chapter name | |
67 | * @param content | |
68 | * the chapter content | |
69 | * @param pg | |
70 | * the optional progress reporter | |
71 | * @param html | |
72 | * TRUE if the input content is in HTML mode | |
73 | * | |
75a6a3ea | 74 | * @return the {@link Chapter}, never NULL |
0ffa4754 NR |
75 | * |
76 | * @throws IOException | |
77 | * in case of I/O error | |
78 | */ | |
8d59ce07 | 79 | public Chapter makeChapter(BasicSupport support, URL source, |
0ffa4754 NR |
80 | int number, String name, String content, boolean html, Progress pg) |
81 | throws IOException { | |
82 | // Chapter name: process it correctly, then remove the possible | |
83 | // redundant "Chapter x: " in front of it, or "-" (as in | |
84 | // "Chapter 5: - Fun!" after the ": " was automatically added) | |
8d59ce07 | 85 | String chapterName = processPara(name, false) |
0ffa4754 | 86 | .getContent().trim(); |
d66deb8d NR |
87 | for (String lang : Instance.getInstance().getConfig().getList(Config.CONF_CHAPTER)) { |
88 | String chapterWord = Instance.getInstance().getConfig().getStringX(Config.CONF_CHAPTER, lang); | |
0ffa4754 NR |
89 | if (chapterName.startsWith(chapterWord)) { |
90 | chapterName = chapterName.substring(chapterWord.length()) | |
91 | .trim(); | |
92 | break; | |
93 | } | |
94 | } | |
95 | ||
96 | if (chapterName.startsWith(Integer.toString(number))) { | |
97 | chapterName = chapterName.substring( | |
98 | Integer.toString(number).length()).trim(); | |
99 | } | |
100 | ||
101 | while (chapterName.startsWith(":") || chapterName.startsWith("-")) { | |
102 | chapterName = chapterName.substring(1).trim(); | |
103 | } | |
104 | // | |
105 | ||
106 | Chapter chap = new Chapter(number, chapterName); | |
107 | ||
108 | if (content != null) { | |
109 | List<Paragraph> paras = makeParagraphs(support, source, content, | |
110 | html, pg); | |
111 | long words = 0; | |
112 | for (Paragraph para : paras) { | |
113 | words += para.getWords(); | |
114 | } | |
115 | chap.setParagraphs(paras); | |
116 | chap.setWords(words); | |
117 | } | |
118 | ||
119 | return chap; | |
120 | } | |
121 | ||
122 | /** | |
123 | * Check quotes for bad format (i.e., quotes with normal paragraphs inside) | |
124 | * and requotify them (i.e., separate them into QUOTE paragraphs and other | |
125 | * paragraphs (quotes or not)). | |
126 | * | |
127 | * @param para | |
128 | * the paragraph to requotify (not necessarily a quote) | |
129 | * @param html | |
130 | * TRUE if the input content is in HTML mode | |
131 | * | |
132 | * @return the correctly (or so we hope) quotified paragraphs | |
133 | */ | |
8d59ce07 | 134 | protected List<Paragraph> requotify(Paragraph para, boolean html) { |
0ffa4754 NR |
135 | List<Paragraph> newParas = new ArrayList<Paragraph>(); |
136 | ||
137 | if (para.getType() == ParagraphType.QUOTE | |
138 | && para.getContent().length() > 2) { | |
139 | String line = para.getContent(); | |
140 | boolean singleQ = line.startsWith("" + openQuote); | |
141 | boolean doubleQ = line.startsWith("" + openDoubleQuote); | |
142 | ||
143 | // Do not try when more than one quote at a time | |
144 | // (some stories are not easily readable if we do) | |
145 | if (singleQ | |
146 | && line.indexOf(closeQuote, 1) < line | |
147 | .lastIndexOf(closeQuote)) { | |
148 | newParas.add(para); | |
149 | return newParas; | |
150 | } | |
151 | if (doubleQ | |
152 | && line.indexOf(closeDoubleQuote, 1) < line | |
153 | .lastIndexOf(closeDoubleQuote)) { | |
154 | newParas.add(para); | |
155 | return newParas; | |
156 | } | |
157 | // | |
158 | ||
159 | if (!singleQ && !doubleQ) { | |
160 | line = openDoubleQuote + line + closeDoubleQuote; | |
161 | newParas.add(new Paragraph(ParagraphType.QUOTE, line, para | |
162 | .getWords())); | |
163 | } else { | |
164 | char open = singleQ ? openQuote : openDoubleQuote; | |
165 | char close = singleQ ? closeQuote : closeDoubleQuote; | |
166 | ||
167 | int posDot = -1; | |
168 | boolean inQuote = false; | |
169 | int i = 0; | |
170 | for (char car : line.toCharArray()) { | |
171 | if (car == open) { | |
172 | inQuote = true; | |
173 | } else if (car == close) { | |
174 | inQuote = false; | |
175 | } else if (car == '.' && !inQuote) { | |
176 | posDot = i; | |
177 | break; | |
178 | } | |
179 | i++; | |
180 | } | |
181 | ||
182 | if (posDot >= 0) { | |
183 | String rest = line.substring(posDot + 1).trim(); | |
184 | line = line.substring(0, posDot + 1).trim(); | |
185 | long words = 1; | |
186 | for (char car : line.toCharArray()) { | |
187 | if (car == ' ') { | |
188 | words++; | |
189 | } | |
190 | } | |
191 | newParas.add(new Paragraph(ParagraphType.QUOTE, line, words)); | |
192 | if (!rest.isEmpty()) { | |
193 | newParas.addAll(requotify(processPara(rest, html), html)); | |
194 | } | |
195 | } else { | |
196 | newParas.add(para); | |
197 | } | |
198 | } | |
199 | } else { | |
200 | newParas.add(para); | |
201 | } | |
202 | ||
203 | return newParas; | |
204 | } | |
205 | ||
206 | /** | |
207 | * Process a {@link Paragraph} from a raw line of text. | |
208 | * <p> | |
209 | * Will also fix quotes and HTML encoding if needed. | |
210 | * | |
211 | * @param line | |
212 | * the raw line | |
213 | * @param html | |
214 | * TRUE if the input content is in HTML mode | |
215 | * | |
75a6a3ea | 216 | * @return the processed {@link Paragraph}, never NULL |
0ffa4754 | 217 | */ |
8d59ce07 | 218 | protected Paragraph processPara(String line, boolean html) { |
0ffa4754 NR |
219 | if (html) { |
220 | line = StringUtils.unhtml(line).trim(); | |
221 | } | |
222 | boolean space = true; | |
223 | boolean brk = true; | |
224 | boolean quote = false; | |
225 | boolean tentativeCloseQuote = false; | |
226 | char prev = '\0'; | |
227 | int dashCount = 0; | |
228 | long words = 1; | |
229 | ||
230 | StringBuilder builder = new StringBuilder(); | |
231 | for (char car : line.toCharArray()) { | |
232 | if (car != '-') { | |
233 | if (dashCount > 0) { | |
234 | // dash, ndash and mdash: - â â | |
235 | // currently: always use mdash | |
236 | builder.append(dashCount == 1 ? '-' : 'â'); | |
237 | } | |
238 | dashCount = 0; | |
239 | } | |
240 | ||
241 | if (tentativeCloseQuote) { | |
242 | tentativeCloseQuote = false; | |
243 | if (Character.isLetterOrDigit(car)) { | |
244 | builder.append("'"); | |
245 | } else { | |
246 | // handle double-single quotes as double quotes | |
247 | if (prev == car) { | |
248 | builder.append(closeDoubleQuote); | |
249 | continue; | |
250 | } | |
251 | ||
252 | builder.append(closeQuote); | |
253 | } | |
254 | } | |
255 | ||
256 | switch (car) { | |
257 | case 'Â ': // note: unbreakable space | |
258 | case ' ': | |
259 | case '\t': | |
260 | case '\n': // just in case | |
261 | case '\r': // just in case | |
262 | if (builder.length() > 0 | |
263 | && builder.charAt(builder.length() - 1) != ' ') { | |
264 | words++; | |
265 | } | |
266 | builder.append(' '); | |
267 | break; | |
268 | ||
269 | case '\'': | |
270 | if (space || (brk && quote)) { | |
271 | quote = true; | |
272 | // handle double-single quotes as double quotes | |
273 | if (prev == car) { | |
274 | builder.deleteCharAt(builder.length() - 1); | |
275 | builder.append(openDoubleQuote); | |
276 | } else { | |
277 | builder.append(openQuote); | |
278 | } | |
279 | } else if (prev == ' ' || prev == car) { | |
280 | // handle double-single quotes as double quotes | |
281 | if (prev == car) { | |
282 | builder.deleteCharAt(builder.length() - 1); | |
283 | builder.append(openDoubleQuote); | |
284 | } else { | |
285 | builder.append(openQuote); | |
286 | } | |
287 | } else { | |
288 | // it is a quote ("I'm off") or a 'quote' ("This | |
289 | // 'good' restaurant"...) | |
290 | tentativeCloseQuote = true; | |
291 | } | |
292 | break; | |
293 | ||
294 | case '"': | |
295 | if (space || (brk && quote)) { | |
296 | quote = true; | |
297 | builder.append(openDoubleQuote); | |
298 | } else if (prev == ' ') { | |
299 | builder.append(openDoubleQuote); | |
300 | } else { | |
301 | builder.append(closeDoubleQuote); | |
302 | } | |
303 | break; | |
304 | ||
305 | case '-': | |
306 | if (space) { | |
307 | quote = true; | |
308 | } else { | |
309 | dashCount++; | |
310 | } | |
311 | space = false; | |
312 | break; | |
313 | ||
314 | case '*': | |
315 | case '~': | |
316 | case '/': | |
317 | case '\\': | |
318 | case '<': | |
319 | case '>': | |
320 | case '=': | |
321 | case '+': | |
322 | case '_': | |
323 | case 'â': | |
324 | case 'â': | |
325 | space = false; | |
326 | builder.append(car); | |
327 | break; | |
328 | ||
329 | case 'â': | |
330 | case '`': | |
331 | case 'âš': | |
332 | case 'īš': | |
333 | case 'ã': | |
334 | case 'ã': | |
335 | if (space || (brk && quote)) { | |
336 | quote = true; | |
337 | builder.append(openQuote); | |
338 | } else { | |
339 | // handle double-single quotes as double quotes | |
340 | if (prev == car) { | |
341 | builder.deleteCharAt(builder.length() - 1); | |
342 | builder.append(openDoubleQuote); | |
343 | } else { | |
344 | builder.append(openQuote); | |
345 | } | |
346 | } | |
347 | space = false; | |
348 | brk = false; | |
349 | break; | |
350 | ||
351 | case 'â': | |
352 | case 'âē': | |
353 | case 'īš': | |
354 | case 'ã': | |
355 | case 'ã': | |
356 | space = false; | |
357 | brk = false; | |
358 | // handle double-single quotes as double quotes | |
359 | if (prev == car) { | |
360 | builder.deleteCharAt(builder.length() - 1); | |
361 | builder.append(closeDoubleQuote); | |
362 | } else { | |
363 | builder.append(closeQuote); | |
364 | } | |
365 | break; | |
366 | ||
367 | case 'ÂĢ': | |
368 | case 'â': | |
369 | case 'īš': | |
370 | case 'ã': | |
371 | case 'ã': | |
372 | if (space || (brk && quote)) { | |
373 | quote = true; | |
374 | builder.append(openDoubleQuote); | |
375 | } else { | |
376 | builder.append(openDoubleQuote); | |
377 | } | |
378 | space = false; | |
379 | brk = false; | |
380 | break; | |
381 | ||
382 | case 'Âģ': | |
383 | case 'â': | |
384 | case 'īš': | |
385 | case 'ã': | |
386 | case 'ã': | |
387 | space = false; | |
388 | brk = false; | |
389 | builder.append(closeDoubleQuote); | |
390 | break; | |
391 | ||
392 | default: | |
393 | space = false; | |
394 | brk = false; | |
395 | builder.append(car); | |
396 | break; | |
397 | } | |
398 | ||
399 | prev = car; | |
400 | } | |
401 | ||
402 | if (tentativeCloseQuote) { | |
403 | tentativeCloseQuote = false; | |
404 | builder.append(closeQuote); | |
405 | } | |
406 | ||
407 | line = builder.toString().trim(); | |
408 | ||
409 | ParagraphType type = ParagraphType.NORMAL; | |
410 | if (space) { | |
411 | type = ParagraphType.BLANK; | |
412 | } else if (brk) { | |
413 | type = ParagraphType.BREAK; | |
414 | } else if (quote) { | |
415 | type = ParagraphType.QUOTE; | |
416 | } | |
417 | ||
418 | return new Paragraph(type, line, words); | |
419 | } | |
420 | ||
421 | /** | |
422 | * Convert the given content into {@link Paragraph}s. | |
423 | * | |
424 | * @param support | |
4642806a NR |
425 | * the linked {@link BasicSupport} (can be NULL), used to |
426 | * download optional image content in [] | |
0ffa4754 | 427 | * @param source |
4642806a NR |
428 | * the source URL of the story (for image lookup in the same path |
429 | * if the source is a file, can be NULL) | |
0ffa4754 NR |
430 | * @param content |
431 | * the textual content | |
432 | * @param html | |
433 | * TRUE if the input content is in HTML mode | |
434 | * @param pg | |
435 | * the optional progress reporter | |
436 | * | |
75a6a3ea | 437 | * @return the {@link Paragraph}s (can be empty but never NULL) |
0ffa4754 NR |
438 | * |
439 | * @throws IOException | |
440 | * in case of I/O error | |
441 | */ | |
8d59ce07 | 442 | protected List<Paragraph> makeParagraphs(BasicSupport support, |
0ffa4754 NR |
443 | URL source, String content, boolean html, Progress pg) |
444 | throws IOException { | |
445 | if (pg == null) { | |
446 | pg = new Progress(); | |
447 | } | |
448 | ||
449 | if (html) { | |
450 | // Special <HR> processing: | |
451 | content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)", | |
452 | "<br/>* * *<br/>"); | |
453 | } | |
454 | ||
455 | List<Paragraph> paras = new ArrayList<Paragraph>(); | |
456 | ||
457 | if (content != null && !content.trim().isEmpty()) { | |
458 | if (html) { | |
459 | String[] tab = content.split("(<p>|</p>|<br>|<br/>)"); | |
460 | pg.setMinMax(0, tab.length); | |
461 | int i = 1; | |
462 | for (String line : tab) { | |
463 | if (line.startsWith("[") && line.endsWith("]")) { | |
464 | pg.setName("Extracting image " + i); | |
465 | } | |
466 | paras.add(makeParagraph(support, source, line.trim(), html)); | |
467 | pg.setProgress(i++); | |
468 | } | |
0ffa4754 NR |
469 | } else { |
470 | List<String> lines = new ArrayList<String>(); | |
471 | BufferedReader buff = null; | |
472 | try { | |
473 | buff = new BufferedReader( | |
474 | new InputStreamReader(new ByteArrayInputStream( | |
475 | content.getBytes("UTF-8")), "UTF-8")); | |
476 | for (String line = buff.readLine(); line != null; line = buff | |
477 | .readLine()) { | |
478 | lines.add(line.trim()); | |
479 | } | |
480 | } finally { | |
481 | if (buff != null) { | |
482 | buff.close(); | |
483 | } | |
484 | } | |
485 | ||
486 | pg.setMinMax(0, lines.size()); | |
487 | int i = 0; | |
488 | for (String line : lines) { | |
489 | if (line.startsWith("[") && line.endsWith("]")) { | |
490 | pg.setName("Extracting image " + i); | |
491 | } | |
492 | paras.add(makeParagraph(support, source, line, html)); | |
493 | pg.setProgress(i++); | |
494 | } | |
0ffa4754 NR |
495 | } |
496 | ||
68328e17 NR |
497 | pg.done(); |
498 | pg.setName(null); | |
499 | ||
0ffa4754 NR |
500 | // Check quotes for "bad" format |
501 | List<Paragraph> newParas = new ArrayList<Paragraph>(); | |
502 | for (Paragraph para : paras) { | |
8d59ce07 | 503 | newParas.addAll(requotify(para, html)); |
0ffa4754 NR |
504 | } |
505 | paras = newParas; | |
506 | ||
507 | // Remove double blanks/brks | |
508 | fixBlanksBreaks(paras); | |
509 | } | |
510 | ||
511 | return paras; | |
512 | } | |
513 | ||
514 | /** | |
515 | * Convert the given line into a single {@link Paragraph}. | |
516 | * | |
517 | * @param support | |
4642806a NR |
518 | * the linked {@link BasicSupport} (can be NULL), used to |
519 | * download optional image content in [] | |
0ffa4754 | 520 | * @param source |
4642806a NR |
521 | * the source URL of the story (for image lookup in the same path |
522 | * if the source is a file, can be NULL) | |
0ffa4754 NR |
523 | * @param line |
524 | * the textual content of the paragraph | |
525 | * @param html | |
526 | * TRUE if the input content is in HTML mode | |
527 | * | |
75a6a3ea | 528 | * @return the {@link Paragraph}, never NULL |
0ffa4754 | 529 | */ |
8d59ce07 | 530 | protected Paragraph makeParagraph(BasicSupport support, URL source, |
0ffa4754 NR |
531 | String line, boolean html) { |
532 | Image image = null; | |
533 | if (line.startsWith("[") && line.endsWith("]")) { | |
8d59ce07 | 534 | image = bsHelper.getImage(support, source, line |
0ffa4754 NR |
535 | .substring(1, line.length() - 1).trim()); |
536 | } | |
537 | ||
538 | if (image != null) { | |
539 | return new Paragraph(image); | |
540 | } | |
541 | ||
8d59ce07 | 542 | return processPara(line, html); |
0ffa4754 NR |
543 | } |
544 | ||
545 | /** | |
546 | * Fix the {@link ParagraphType#BLANK}s and {@link ParagraphType#BREAK}s of | |
547 | * those {@link Paragraph}s. | |
548 | * <p> | |
549 | * The resulting list will not contain a starting or trailing blank/break | |
550 | * nor 2 blanks or breaks following each other. | |
551 | * | |
552 | * @param paras | |
553 | * the list of {@link Paragraph}s to fix | |
554 | */ | |
8d59ce07 | 555 | protected void fixBlanksBreaks(List<Paragraph> paras) { |
0ffa4754 NR |
556 | boolean space = false; |
557 | boolean brk = true; | |
558 | for (int i = 0; i < paras.size(); i++) { | |
559 | Paragraph para = paras.get(i); | |
560 | boolean thisSpace = para.getType() == ParagraphType.BLANK; | |
561 | boolean thisBrk = para.getType() == ParagraphType.BREAK; | |
562 | ||
563 | if (i > 0 && space && thisBrk) { | |
564 | paras.remove(i - 1); | |
565 | i--; | |
566 | } else if ((space || brk) && (thisSpace || thisBrk)) { | |
567 | paras.remove(i); | |
568 | i--; | |
569 | } | |
570 | ||
571 | space = thisSpace; | |
572 | brk = thisBrk; | |
573 | } | |
574 | ||
575 | // Remove blank/brk at start | |
576 | if (paras.size() > 0 | |
577 | && (paras.get(0).getType() == ParagraphType.BLANK || paras.get( | |
578 | 0).getType() == ParagraphType.BREAK)) { | |
579 | paras.remove(0); | |
580 | } | |
581 | ||
582 | // Remove blank/brk at end | |
583 | int last = paras.size() - 1; | |
584 | if (paras.size() > 0 | |
585 | && (paras.get(last).getType() == ParagraphType.BLANK || paras | |
586 | .get(last).getType() == ParagraphType.BREAK)) { | |
587 | paras.remove(last); | |
588 | } | |
589 | } | |
590 | } |