X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Ffanfix%2Fsupported%2FBasicSupport.java;h=56a3bb80cb6d51fe40a9c4d830e12cb275102d76;hb=14c0debd37e7de3fa8b5acb81c234f96452ec2a2;hp=0c127aa21a17d59631f6d3200e31ea08a582f077;hpb=6930dfa8bc58fd89b2927e72f9115d14c602704d;p=nikiroo-utils.git diff --git a/src/be/nikiroo/fanfix/supported/BasicSupport.java b/src/be/nikiroo/fanfix/supported/BasicSupport.java index 0c127aa..56a3bb8 100644 --- a/src/be/nikiroo/fanfix/supported/BasicSupport.java +++ b/src/be/nikiroo/fanfix/supported/BasicSupport.java @@ -1,11 +1,7 @@ package be.nikiroo.fanfix.supported; -import java.io.BufferedReader; -import java.io.ByteArrayInputStream; -import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; @@ -13,18 +9,21 @@ import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Scanner; +import java.util.Map.Entry; + +import org.json.JSONException; +import org.json.JSONObject; +import org.jsoup.helper.DataUtil; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; import be.nikiroo.fanfix.Instance; -import be.nikiroo.fanfix.bundles.Config; import be.nikiroo.fanfix.bundles.StringId; import be.nikiroo.fanfix.data.Chapter; import be.nikiroo.fanfix.data.MetaData; -import be.nikiroo.fanfix.data.Paragraph; -import be.nikiroo.fanfix.data.Paragraph.ParagraphType; import be.nikiroo.fanfix.data.Story; -import be.nikiroo.utils.Image; import be.nikiroo.utils.Progress; import be.nikiroo.utils.StringUtils; @@ -38,142 +37,14 @@ import be.nikiroo.utils.StringUtils; * @author niki */ public abstract class BasicSupport { - /** - * The supported input types for which we can get a {@link BasicSupport} - * object. 
- * - * @author niki - */ - public enum SupportType { - /** EPUB files created with this program */ - EPUB, - /** Pure text file with some rules */ - TEXT, - /** TEXT but with associated .info file */ - INFO_TEXT, - /** My Little Pony fanfictions */ - FIMFICTION, - /** Fanfictions from a lot of different universes */ - FANFICTION, - /** Website with lots of Mangas */ - MANGAFOX, - /** Furry website with comics support */ - E621, - /** Furry website with stories */ - YIFFSTAR, - /** Comics and images groups, mostly but not only NSFW */ - E_HENTAI, - /** CBZ files */ - CBZ, - /** HTML files */ - HTML; - - /** - * A description of this support type (more information than the - * {@link BasicSupport#getSourceName()}). - * - * @return the description - */ - public String getDesc() { - String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC, - this.name()); - - if (desc == null) { - desc = Instance.getTrans().getString(StringId.INPUT_DESC, this); - } - - return desc; - } - - /** - * The name of this support type (a short version). - * - * @return the name - */ - public String getSourceName() { - BasicSupport support = BasicSupport.getSupport(this); - if (support != null) { - return support.getSourceName(); - } - - return null; - } - - @Override - public String toString() { - return super.toString().toLowerCase(); - } - - /** - * Call {@link SupportType#valueOf(String)} after conversion to upper - * case. - * - * @param typeName - * the possible type name - * - * @return NULL or the type - */ - public static SupportType valueOfUC(String typeName) { - return SupportType.valueOf(typeName == null ? null : typeName - .toUpperCase()); - } - - /** - * Call {@link SupportType#valueOf(String)} after conversion to upper - * case but return NULL for NULL instead of raising exception. 
- * - * @param typeName - * the possible type name - * - * @return NULL or the type - */ - public static SupportType valueOfNullOkUC(String typeName) { - if (typeName == null) { - return null; - } - - return SupportType.valueOfUC(typeName); - } - - /** - * Call {@link SupportType#valueOf(String)} after conversion to upper - * case but return NULL in case of error instead of raising an - * exception. - * - * @param typeName - * the possible type name - * - * @return NULL or the type - */ - public static SupportType valueOfAllOkUC(String typeName) { - try { - return SupportType.valueOfUC(typeName); - } catch (Exception e) { - return null; - } - } - } - - private InputStream in; + private Document sourceNode; + private URL source; private SupportType type; private URL currentReferer; // with only one 'r', as in 'HTTP'... - - // quote chars - private char openQuote = Instance.getTrans().getCharacter( - StringId.OPEN_SINGLE_QUOTE); - private char closeQuote = Instance.getTrans().getCharacter( - StringId.CLOSE_SINGLE_QUOTE); - private char openDoubleQuote = Instance.getTrans().getCharacter( - StringId.OPEN_DOUBLE_QUOTE); - private char closeDoubleQuote = Instance.getTrans().getCharacter( - StringId.CLOSE_DOUBLE_QUOTE); - - /** - * The name of this support class. - * - * @return the name - */ - protected abstract String getSourceName(); + + static protected BasicSupportHelper bsHelper = new BasicSupportHelper(); + static protected BasicSupportImages bsImages = new BasicSupportImages(); + static protected BasicSupportPara bsPara = new BasicSupportPara(new BasicSupportHelper(), new BasicSupportImages()); /** * Check if the given resource is supported by this {@link BasicSupport}. @@ -196,61 +67,45 @@ public abstract class BasicSupport { /** * Return the {@link MetaData} of this story. 
* - * @param source - * the source of the story - * @param in - * the input (the main resource) - * * @return the associated {@link MetaData}, never NULL * * @throws IOException * in case of I/O error */ - protected abstract MetaData getMeta(URL source, InputStream in) - throws IOException; + protected abstract MetaData getMeta() throws IOException; /** * Return the story description. * - * @param source - * the source of the story - * @param in - * the input (the main resource) - * * @return the description * * @throws IOException * in case of I/O error */ - protected abstract String getDesc(URL source, InputStream in) - throws IOException; + protected abstract String getDesc() throws IOException; /** * Return the list of chapters (name and resource). + *
+ * Can be NULL if this {@link BasicSupport} does not use chapters.
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
* @param pg
* the optional progress reporter
*
- * @return the chapters
+ * @return the chapters or NULL
*
* @throws IOException
* in case of I/O error
*/
- protected abstract List
+ * Can return NULL, in which case you are supposed to work without a source
+ * node.
*
* @param source
- * the source of the story
- * @param in
- * the input (the main resource)
+ * the source {@link URL}
+ *
+ * @return the loaded {@link Document}, or NULL if there is no source node
*
* @throws IOException
- * on I/O error
+ * in case of I/O error
*/
- @SuppressWarnings("unused")
- protected void preprocess(URL source, InputStream in) throws IOException {
+ protected Document loadDocument(URL source) throws IOException {
+ String url = getCanonicalUrl(source).toString();
+ return DataUtil.load(Instance.getInstance().getCache().open(source, this, false), "UTF-8", url.toString());
}
/**
- * Now that we have processed the {@link Story}, close the resources if any.
+ * Log into the support (can be a no-op depending upon the support).
*
* @throws IOException
- * on I/O error
+ * in case of I/O error
*/
- @SuppressWarnings("unused")
- protected void close() throws IOException {
+ protected void login() throws IOException {
}
/**
- * Create a {@link Chapter} object from the given information, formatting
- * the content as it should be.
- *
- * @param source
- * the source of the story
- * @param number
- * the chapter number
- * @param name
- * the chapter name
- * @param content
- * the chapter content
- * @param pg
- * the optional progress reporter
- *
- * @return the {@link Chapter}
- *
- * @throws IOException
- * in case of I/O error
+ * Now that we have processed the {@link Story}, close the resources if any.
*/
- protected Chapter makeChapter(URL source, int number, String name,
- String content, Progress pg) throws IOException {
- // Chapter name: process it correctly, then remove the possible
- // redundant "Chapter x: " in front of it, or "-" (as in
- // "Chapter 5: - Fun!" after the ": " was automatically added)
- String chapterName = processPara(name).getContent().trim();
- for (String lang : Instance.getConfig().getString(Config.CHAPTER)
- .split(",")) {
- String chapterWord = Instance.getConfig().getStringX(
- Config.CHAPTER, lang);
- if (chapterName.startsWith(chapterWord)) {
- chapterName = chapterName.substring(chapterWord.length())
- .trim();
- break;
- }
- }
-
- if (chapterName.startsWith(Integer.toString(number))) {
- chapterName = chapterName.substring(
- Integer.toString(number).length()).trim();
- }
-
- while (chapterName.startsWith(":") || chapterName.startsWith("-")) {
- chapterName = chapterName.substring(1).trim();
- }
- //
-
- Chapter chap = new Chapter(number, chapterName);
-
- if (content != null) {
- List |
- * The resulting list will not contain a starting or trailing blank/break
- * nor 2 blanks or breaks following each other.
- *
- * @param paras
- * the list of {@link Paragraph}s to fix
- */
- protected void fixBlanksBreaks(List
+ * Note that this method expects small JSON files (everything is copied into
+ * memory at least twice).
*
- * @return the cover if any, or NULL
- */
- static Image getDefaultCover(String subject) {
- if (subject != null && !subject.isEmpty()
- && Instance.getCoverDir() != null) {
- try {
- File fileCover = new File(Instance.getCoverDir(), subject);
- return getImage(null, fileCover.toURI().toURL(), subject);
- } catch (MalformedURLException e) {
- }
- }
-
- return null;
- }
-
- /**
- * Return the list of supported image extensions.
+ * @param url
+ * the URL to parse
+ * @param stable
+ * TRUE for more stable resources, FALSE when they often change
*
- * @param emptyAllowed
- * TRUE to allow an empty extension on first place, which can be
- * used when you may already have an extension in your input but
- * are not sure about it
+ * @return the JSON object
*
- * @return the extensions
+ * @throws IOException
+ * in case of I/O error
*/
- static String[] getImageExt(boolean emptyAllowed) {
- if (emptyAllowed) {
- return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
+ protected JSONObject getJson(String url, boolean stable)
+ throws IOException {
+ try {
+ return getJson(new URL(url), stable);
+ } catch (MalformedURLException e) {
+ throw new IOException("Malformed URL: " + url, e);
}
-
- return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
}
/**
- * Check if the given resource can be a local image or a remote image, then
- * refresh the cache with it if it is.
+ * Utility method to convert the given URL into a JSON object.
+ *
+ * Note that this method expects small JSON files (everything is copied into
+ * memory at least twice).
*
- * @param source
- * the story source
- * @param line
- * the resource to check
+ * @param url
+ * the URL to parse
+ * @param stable
+ * TRUE for more stable resources, FALSE when they often change
*
- * @return the image if found, or NULL
+ * @return the JSON object
*
+ * @throws IOException
+ * in case of I/O error
*/
- static Image getImage(BasicSupport support, URL source, String line) {
- URL url = getImageUrl(support, source, line);
- if (url != null) {
- if ("file".equals(url.getProtocol())) {
- if (new File(url.getPath()).isDirectory()) {
- return null;
- }
- }
- InputStream in = null;
+ protected JSONObject getJson(URL url, boolean stable) throws IOException {
+ InputStream in = Instance.getInstance().getCache().open(url, null,
+ stable);
+ try {
+ Scanner scan = new Scanner(in);
+ scan.useDelimiter("\0");
try {
- in = Instance.getCache().open(url, getSupport(url), true);
- return new Image(in);
- } catch (IOException e) {
+ return new JSONObject(scan.next());
+ } catch (JSONException e) {
+ throw new IOException(e);
} finally {
- if (in != null) {
- try {
- in.close();
- } catch (IOException e) {
- }
- }
+ scan.close();
}
+ } finally {
+ in.close();
}
-
- return null;
}
/**
- * Check if the given resource can be a local image or a remote image, then
- * refresh the cache with it if it is.
+ * Process the given story resource into a fully filled {@link Story}
+ * object.
*
- * @param source
- * the story source
- * @param line
- * the resource to check
+ * @param pg
+ * the optional progress reporter
*
- * @return the image URL if found, or NULL
+ * @return the {@link Story}, never NULL
*
+ * @throws IOException
+ * in case of I/O error
*/
- static URL getImageUrl(BasicSupport support, URL source, String line) {
- URL url = null;
-
- if (line != null) {
- // try for files
- if (source != null) {
- try {
-
- String relPath = null;
- String absPath = null;
- try {
- String path = new File(source.getFile()).getParent();
- relPath = new File(new File(path), line.trim())
- .getAbsolutePath();
- } catch (Exception e) {
- // Cannot be converted to path (one possibility to take
- // into account: absolute path on Windows)
- }
- try {
- absPath = new File(line.trim()).getAbsolutePath();
- } catch (Exception e) {
- // Cannot be converted to path (at all)
- }
-
- for (String ext : getImageExt(true)) {
- File absFile = new File(absPath + ext);
- File relFile = new File(relPath + ext);
- if (absPath != null && absFile.exists()
- && absFile.isFile()) {
- url = absFile.toURI().toURL();
- } else if (relPath != null && relFile.exists()
- && relFile.isFile()) {
- url = relFile.toURI().toURL();
- }
- }
- } catch (Exception e) {
- // Should not happen since we control the correct arguments
- }
- }
-
- if (url == null) {
- // try for URLs
- try {
- for (String ext : getImageExt(true)) {
- if (Instance.getCache()
- .check(new URL(line + ext), true)) {
- url = new URL(line + ext);
- break;
- }
- }
-
- // try out of cache
- if (url == null) {
- for (String ext : getImageExt(true)) {
- try {
- url = new URL(line + ext);
- Instance.getCache().refresh(url, support, true);
- break;
- } catch (IOException e) {
- // no image with this ext
- url = null;
- }
- }
- }
- } catch (MalformedURLException e) {
- // Not an url
- }
- }
+ // TODO: ADD final when BasicSupport_Deprecated is gone
+ public Story process(Progress pg) throws IOException {
+ setCurrentReferer(source);
+ login();
+ sourceNode = loadDocument(source);
- // refresh the cached file
- if (url != null) {
- try {
- Instance.getCache().refresh(url, support, true);
- } catch (IOException e) {
- // woops, broken image
- url = null;
- }
- }
+ try {
+ Story story = doProcess(pg);
+
+ // Check for "no chapters" stories
+ if (story.getChapters().isEmpty()
+ && story.getMeta().getResume() != null
+ && !story.getMeta().getResume().getParagraphs().isEmpty()) {
+ Chapter resume = story.getMeta().getResume();
+ resume.setName("");
+ resume.setNumber(1);
+ story.getChapters().add(resume);
+ story.getMeta().setWords(resume.getWords());
+
+ String descChapterName = Instance.getInstance().getTrans()
+ .getString(StringId.DESCRIPTION);
+ resume = new Chapter(0, descChapterName);
+ story.getMeta().setResume(resume);
+ }
+
+ return story;
+ } finally {
+ close();
}
-
- return url;
}
/**
- * Open the input file that will be used through the support.
+ * Actual processing step, without the calls to other methods.
*
- * Can return NULL, in which case you are supposed to work without an
- * {@link InputStream}.
+ * Will convert the story resource into a fully filled {@link Story} object.
*
- * @param source
- * the source {@link URL}
+ * @param pg
+ * the optional progress reporter
*
- * @return the {@link InputStream}
+ * @return the {@link Story}, never NULL
*
* @throws IOException
* in case of I/O error
*/
- protected InputStream openInput(URL source) throws IOException {
- return Instance.getCache().open(source, this, false);
- }
-
- /**
- * Reset then return {@link BasicSupport#in}.
- *
- * @return {@link BasicSupport#in}
- */
- protected InputStream getInput() {
- return reset(in);
- }
-
- /**
- * Fix the author name if it is prefixed with some "by" {@link String}.
- *
- * @param author
- * the author with a possible prefix
- *
- * @return the author without prefixes
- */
- protected String fixAuthor(String author) {
- if (author != null) {
- for (String suffix : new String[] { " ", ":" }) {
- for (String byString : Instance.getConfig()
- .getString(Config.BYS).split(",")) {
- byString += suffix;
- if (author.toUpperCase().startsWith(byString.toUpperCase())) {
- author = author.substring(byString.length()).trim();
- }
- }
- }
-
- // Special case (without suffix):
- if (author.startsWith("©")) {
- author = author.substring(1);
- }
- }
-
- return author;
- }
-
- /**
- * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
- * and requotify them (i.e., separate them into QUOTE paragraphs and other
- * paragraphs (quotes or not)).
- *
- * @param para
- * the paragraph to requotify (not necessarily a quote)
- *
- * @return the correctly (or so we hope) quotified paragraphs
- */
- protected List
- * Will also fix quotes and HTML encoding if needed.
- *
- * @param line
- * the raw line
- *
- * @return the processed {@link Paragraph}
- */
- protected Paragraph processPara(String line) {
- line = ifUnhtml(line).trim();
-
- boolean space = true;
- boolean brk = true;
- boolean quote = false;
- boolean tentativeCloseQuote = false;
- char prev = '\0';
- int dashCount = 0;
- long words = 1;
-
- StringBuilder builder = new StringBuilder();
- for (char car : line.toCharArray()) {
- if (car != '-') {
- if (dashCount > 0) {
- // dash, ndash and mdash: - – —
- // currently: always use mdash
- builder.append(dashCount == 1 ? '-' : 'â');
- }
- dashCount = 0;
- }
-
- if (tentativeCloseQuote) {
- tentativeCloseQuote = false;
- if (Character.isLetterOrDigit(car)) {
- builder.append("'");
- } else {
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.append(closeDoubleQuote);
- continue;
- }
-
- builder.append(closeQuote);
- }
- }
-
- switch (car) {
- case 'Â ': // note: unbreakable space
- case ' ':
- case '\t':
- case '\n': // just in case
- case '\r': // just in case
- if (builder.length() > 0
- && builder.charAt(builder.length() - 1) != ' ') {
- words++;
- }
- builder.append(' ');
- break;
-
- case '\'':
- if (space || (brk && quote)) {
- quote = true;
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.deleteCharAt(builder.length() - 1);
- builder.append(openDoubleQuote);
- } else {
- builder.append(openQuote);
- }
- } else if (prev == ' ' || prev == car) {
- // handle double-single quotes as double quotes
- if (prev == car) {
- builder.deleteCharAt(builder.length() - 1);
- builder.append(openDoubleQuote);
- } else {
- builder.append(openQuote);
- }
- } else {
- // it is a quote ("I'm off") or a 'quote' ("This
- // 'good' restaurant"...)
- tentativeCloseQuote = true;
- }
- break;
-
- case '"':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openDoubleQuote);
- } else if (prev == ' ') {
- builder.append(openDoubleQuote);
- } else {
- builder.append(closeDoubleQuote);
- }
- break;
-
- case '-':
- if (space) {
- quote = true;
- } else {
- dashCount++;
- }
- space = false;
- break;
-
- case '*':
- case '~':
- case '/':
- case '\\':
- case '<':
- case '>':
- case '=':
- case '+':
- case '_':
- case 'â':
- case 'â':
- space = false;
- builder.append(car);
- break;
+ pg.setProgress(1);
+ Progress pgMeta = new Progress();
+ pg.addProgress(pgMeta, 10);
+ Story story = processMeta(true, pgMeta);
+ pgMeta.done(); // 10%
+ pg.put("meta", story.getMeta());
+
+ Progress pgGetChapters = new Progress();
+ pg.addProgress(pgGetChapters, 10);
+ story.setChapters(new ArrayList
- * Will only match the first line with the given key if more than one are
- * possible. Which also means that if the subKey or endKey is not found on
- * that line, NULL will be returned.
- *
- * @param in
- * the input
- * @param key
- * the key to match (also supports "^" at start to say
- * "only if it starts with" the key)
- * @param subKey
- * the sub key or NULL if none
- * @param endKey
- * the end key or NULL for "up to the end"
- * @return the text or NULL if not found
- */
- static protected String getKeyLine(InputStream in, String key,
- String subKey, String endKey) {
- return getKeyText(getLine(in, key, 0), key, subKey, endKey);
- }
-
- /**
- * Return the text between the key and the endKey (and optional subKey can
- * be passed, in this case we will look for the key first, then take the
- * text between the subKey and the endKey).
- *
- * @param in
- * the input
- * @param key
- * the key to match (also supports "^" at start to say
- * "only if it starts with" the key)
- * @param subKey
- * the sub key or NULL if none
- * @param endKey
- * the end key or NULL for "up to the end"
- * @return the text or NULL if not found
- */
- static protected String getKeyText(String in, String key, String subKey,
- String endKey) {
- String result = null;
-
- String line = in;
- if (line != null && line.contains(key)) {
- line = line.substring(line.indexOf(key) + key.length());
- if (subKey == null || subKey.isEmpty() || line.contains(subKey)) {
- if (subKey != null) {
- line = line.substring(line.indexOf(subKey)
- + subKey.length());
- }
- if (endKey == null || line.contains(endKey)) {
- if (endKey != null) {
- line = line.substring(0, line.indexOf(endKey));
- result = line;
- }
- }
- }
- }
-
- return result;
- }
-
- /**
- * Return the text between the key and the endKey (optional subKeys can be
- * passed, in this case we will look for the subKeys first, then take the
- * text between the key and the endKey).
- *
- * @param in
- * the input
- * @param key
- * the key to match
- * @param endKey
- * the end key or NULL for "up to the end"
- * @param afters
- * the sub-keys to find before checking for key/endKey
- *
- * @return the text or NULL if not found
- */
- static protected String getKeyTextAfter(String in, String key,
- String endKey, String... afters) {
-
- if (in != null && !in.isEmpty()) {
- int pos = indexOfAfter(in, 0, afters);
- if (pos < 0) {
- return null;
- }
-
- in = in.substring(pos);
- }
-
- return getKeyText(in, key, null, endKey);
- }
-
- /**
- * Return the first index after all the given "afters" have been found in
- * the {@link String}, or -1 if it was not possible.
- *
- * @param in
- * the input
- * @param startAt
- * start at this position in the string
- * @param afters
- * the sub-keys to find before checking for key/endKey
- *
- * @return the text or NULL if not found
- */
- static protected int indexOfAfter(String in, int startAt, String... afters) {
- int pos = -1;
- if (in != null && !in.isEmpty()) {
- pos = startAt;
- if (afters != null) {
- for (int i = 0; pos >= 0 && i < afters.length; i++) {
- String subKey = afters[i];
- if (!subKey.isEmpty()) {
- pos = in.indexOf(subKey, pos);
- if (pos >= 0) {
- pos += subKey.length();
- }
- }
- }
- }
+ if (support != null) {
+ support.setType(type);
+ support.source = support.getCanonicalUrl(url);
}
- return pos;
+ return support;
}
}
processing:
- content = content.replaceAll("(
]*>)|(
)|(
)",
- "
* * *
");
- }
-
- List
|
)");
- pg.setMinMax(0, tab.length);
- int i = 1;
- for (String line : tab) {
- if (line.startsWith("[") && line.endsWith("]")) {
- pg.setName("Extracting image " + i);
- }
- paras.add(makeParagraph(source, line.trim()));
- pg.setProgress(i++);
- }
- pg.setName(null);
- } else {
- List