- return newParas;
- }
-
- /**
- * Process a {@link Paragraph} from a raw line of text.
- * <p>
- * Will also fix quotes and HTML encoding if needed.
- *
- * @param line
- * the raw line
- *
- * @return the processed {@link Paragraph}
- */
- protected Paragraph processPara(String line) {
- line = ifUnhtml(line).trim();
-
- boolean space = true;
- boolean brk = true;
- boolean quote = false;
- boolean tentativeCloseQuote = false;
- char prev = '\0';
- int dashCount = 0;
-
- StringBuilder builder = new StringBuilder();
- for (char car : line.toCharArray()) {
- if (car != '-') {
- if (dashCount > 0) {
- // dash, ndash and mdash: - – —
- // currently: always use mdash
- builder.append(dashCount == 1 ? '-' : '—');
- }
- dashCount = 0;
- }
-
- if (tentativeCloseQuote) {
- tentativeCloseQuote = false;
- if (Character.isLetterOrDigit(car)) {
- builder.append("'");
- } else {
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.append(closeDoubleQuote);
- continue;
- } else {
- builder.append(closeQuote);
- }
- }
- }
-
- switch (car) {
- case ' ': // note: unbreakable space
- case ' ':
- case '\t':
- case '\n': // just in case
- case '\r': // just in case
- builder.append(' ');
- break;
-
- case '\'':
- if (space || (brk && quote)) {
- quote = true;
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.deleteCharAt(builder.length() - 1);
- builder.append(openDoubleQuote);
- } else {
- builder.append(openQuote);
- }
- } else if (prev == ' ' || prev == car) {
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.deleteCharAt(builder.length() - 1);
- builder.append(openDoubleQuote);
- } else {
- builder.append(openQuote);
- }
- } else {
- // it is a quote ("I'm off") or a 'quote' ("This
- // 'good' restaurant"...)
- tentativeCloseQuote = true;
- }
- break;
-
- case '"':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openDoubleQuote);
- } else if (prev == ' ') {
- builder.append(openDoubleQuote);
- } else {
- builder.append(closeDoubleQuote);
- }
- break;
-
- case '-':
- if (space) {
- quote = true;
- } else {
- dashCount++;
- }
- space = false;
- break;
-
- case '*':
- case '~':
- case '/':
- case '\\':
- case '<':
- case '>':
- case '=':
- case '+':
- case '_':
- case '–':
- case '—':
- space = false;
- builder.append(car);
- break;
-
- case '‘':
- case '`':
- case '‹':
- case '﹁':
- case '〈':
- case '「':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openQuote);
- } else {
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.deleteCharAt(builder.length() - 1);
- builder.append(openDoubleQuote);
- } else {
- builder.append(openQuote);
- }
+ pg.setProgress(1);
+ Progress pgMeta = new Progress();
+ pg.addProgress(pgMeta, 10);
+ Story story = processMeta(true, pgMeta);
+ pgMeta.done(); // 10%
+ pg.put("meta", story.getMeta());
+
+ Progress pgGetChapters = new Progress();
+ pg.addProgress(pgGetChapters, 10);
+ story.setChapters(new ArrayList<Chapter>());
+ List<Entry<String, URL>> chapters = getChapters(pgGetChapters);
+ pgGetChapters.done(); // 20%
+
+ if (chapters != null) {
+ Progress pgChaps = new Progress("Extracting chapters", 0,
+ chapters.size() * 300);
+ pg.addProgress(pgChaps, 80);
+
+ long words = 0;
+ int i = 1;
+ for (Entry<String, URL> chap : chapters) {
+ pgChaps.setName("Extracting chapter " + i);
+ URL chapUrl = chap.getValue();
+ String chapName = chap.getKey();
+ if (chapUrl != null) {
+ setCurrentReferer(chapUrl);