package be.nikiroo.fanfix.supported; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import be.nikiroo.fanfix.Instance; import be.nikiroo.fanfix.bundles.Config; import be.nikiroo.fanfix.bundles.StringId; import be.nikiroo.fanfix.data.Chapter; import be.nikiroo.fanfix.data.MetaData; import be.nikiroo.fanfix.data.Paragraph; import be.nikiroo.fanfix.data.Story; import be.nikiroo.fanfix.data.Paragraph.ParagraphType; import be.nikiroo.utils.StringUtils; /** * This class is the base class used by the other support classes. It can be * used outside of this package, and have static method that you can use to get * access to the correct support class. *

* It will be used with 'resources' (usually web pages or files). * * @author niki */ public abstract class BasicSupport { /** * The supported input types for which we can get a {@link BasicSupport} * object. * * @author niki */ public enum SupportType { /** EPUB files created with this program */ EPUB, /** Pure text file with some rules */ TEXT, /** TEXT but with associated .info file */ INFO_TEXT, /** My Little Pony fanfictions */ FIMFICTION, /** Fanfictions from a lot of different universes */ FANFICTION, /** Website with lots of Mangas */ MANGAFOX, /** Furry website with comics support */ E621, /** CBZ files */ CBZ; /** * A description of this support type (more information than the * {@link BasicSupport#getSourceName()}). * * @return the description */ public String getDesc() { String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC, this.name()); if (desc == null) { desc = Instance.getTrans().getString(StringId.INPUT_DESC, this); } return desc; } /** * The name of this support type (a short version). * * @return the name */ public String getSourceName() { BasicSupport support = BasicSupport.getSupport(this); if (support != null) { return support.getSourceName(); } return null; } @Override public String toString() { return super.toString().toLowerCase(); } /** * Call {@link SupportType#valueOf(String.toUpperCase())}. * * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfUC(String typeName) { return SupportType.valueOf(typeName == null ? null : typeName .toUpperCase()); } /** * Call {@link SupportType#valueOf(String.toUpperCase())} but return * NULL for NULL instead of raising exception. * * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfNullOkUC(String typeName) { if (typeName == null) { return null; } return SupportType.valueOfUC(typeName); } /** * Call {@link SupportType#valueOf(String.toUpperCase())} but return * NULL in case of error instead of raising an exception. * * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfAllOkUC(String typeName) { try { return SupportType.valueOfUC(typeName); } catch (Exception e) { return null; } } } /** Only used by {@link BasicSupport#getInput()} just so it is always reset. */ private InputStream in; private SupportType type; private URL currentReferer; // with on 'r', as in 'HTTP'... // quote chars private char openQuote = Instance.getTrans().getChar( StringId.OPEN_SINGLE_QUOTE); private char closeQuote = Instance.getTrans().getChar( StringId.CLOSE_SINGLE_QUOTE); private char openDoubleQuote = Instance.getTrans().getChar( StringId.OPEN_DOUBLE_QUOTE); private char closeDoubleQuote = Instance.getTrans().getChar( StringId.CLOSE_DOUBLE_QUOTE); /** * The name of this support class. * * @return the name */ protected abstract String getSourceName(); /** * Check if the given resource is supported by this {@link BasicSupport}. * * @param url * the resource to check for * * @return TRUE if it is */ protected abstract boolean supports(URL url); /** * Return TRUE if the support will return HTML encoded content values for * the chapters content. * * @return TRUE for HTML */ protected abstract boolean isHtml(); /** * Return the story title. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the title * * @throws IOException * in case of I/O error */ protected abstract String getTitle(URL source, InputStream in) throws IOException; /** * Return the story author. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the author * * @throws IOException * in case of I/O error */ protected abstract String getAuthor(URL source, InputStream in) throws IOException; /** * Return the story publication date. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the date * * @throws IOException * in case of I/O error */ protected abstract String getDate(URL source, InputStream in) throws IOException; /** * Return the subject of the story (for instance, if it is a fanfiction, * what is the original work; if it is a technical text, what is the * technical subject...). * * @param source * the source of the story * @param in * the input (the main resource) * * @return the subject * * @throws IOException * in case of I/O error */ protected abstract String getSubject(URL source, InputStream in) throws IOException; /** * Return the story description. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the description * * @throws IOException * in case of I/O error */ protected abstract String getDesc(URL source, InputStream in) throws IOException; /** * Return the story cover resource if any, or NULL if none. *

* The default cover should not be checked for here. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the cover or NULL * * @throws IOException * in case of I/O error */ protected abstract URL getCover(URL source, InputStream in) throws IOException; /** * Return the list of chapters (name and resource). * * @param source * the source of the story * @param in * the input (the main resource) * * @return the chapters * * @throws IOException * in case of I/O error */ protected abstract List> getChapters(URL source, InputStream in) throws IOException; /** * Return the content of the chapter (possibly HTML encoded, if * {@link BasicSupport#isHtml()} is TRUE). * * @param source * the source of the story * @param in * the input (the main resource) * @param number * the chapter number * * @return the content * * @throws IOException * in case of I/O error */ protected abstract String getChapterContent(URL source, InputStream in, int number) throws IOException; /** * Check if this {@link BasicSupport} is mainly catered to image files. * * @return TRUE if it is */ public boolean isImageDocument(URL source, InputStream in) throws IOException { return false; } /** * Return the list of cookies (values included) that must be used to * correctly fetch the resources. *

* You are expected to call the super method implementation if you override * it. * * @return the cookies */ public Map getCookies() { return new HashMap(); } /** * Process the given story resource into a partially filled {@link Story} * object containing the name and metadata, except for the description. * * @param url * the story resource * * @return the {@link Story} * * @throws IOException * in case of I/O error */ public Story processMeta(URL url) throws IOException { return processMeta(url, true, false); } /** * Process the given story resource into a partially filled {@link Story} * object containing the name and metadata. * * @param url * the story resource * * @param close * close "this" and "in" when done * * @return the {@link Story} * * @throws IOException * in case of I/O error */ protected Story processMeta(URL url, boolean close, boolean getDesc) throws IOException { in = Instance.getCache().open(url, this, false); if (in == null) { return null; } try { preprocess(getInput()); Story story = new Story(); story.setMeta(new MetaData()); story.getMeta().setTitle(ifUnhtml(getTitle(url, getInput()))); story.getMeta().setAuthor( fixAuthor(ifUnhtml(getAuthor(url, getInput())))); story.getMeta().setDate(ifUnhtml(getDate(url, getInput()))); story.getMeta().setTags(getTags(url, getInput())); story.getMeta().setSource(getSourceName()); story.getMeta().setPublisher( ifUnhtml(getPublisher(url, getInput()))); story.getMeta().setUuid(getUuid(url, getInput())); story.getMeta().setLuid(getLuid(url, getInput())); story.getMeta().setLang(getLang(url, getInput())); story.getMeta().setSubject(ifUnhtml(getSubject(url, getInput()))); story.getMeta().setImageDocument(isImageDocument(url, getInput())); if (getDesc) { String descChapterName = Instance.getTrans().getString( StringId.DESCRIPTION); story.getMeta().setResume( makeChapter(url, 0, descChapterName, getDesc(url, getInput()))); } return story; } finally { if (close) { try { close(); } catch (IOException e) { Instance.syserr(e); } if (in != null) { in.close(); } } } } /** * Process the given story resource into a fully filled {@link Story} * object. * * @param url * the story resource * * @return the {@link Story} * * @throws IOException * in case of I/O error */ public Story process(URL url) throws IOException { setCurrentReferer(url); try { Story story = processMeta(url, false, true); if (story == null) { return null; } story.setChapters(new ArrayList()); URL cover = getCover(url, getInput()); if (cover == null) { String subject = story.getMeta() == null ? null : story .getMeta().getSubject(); if (subject != null && !subject.isEmpty() && Instance.getCoverDir() != null) { File fileCover = new File(Instance.getCoverDir(), subject); cover = getImage(fileCover.toURI().toURL(), subject); } } if (cover != null) { InputStream coverIn = null; try { coverIn = Instance.getCache().open(cover, this, true); story.getMeta().setCover(StringUtils.toImage(coverIn)); } catch (IOException e) { Instance.syserr(new IOException(Instance.getTrans() .getString(StringId.ERR_BS_NO_COVER, cover), e)); } finally { if (coverIn != null) coverIn.close(); } } List> chapters = getChapters(url, getInput()); int i = 1; if (chapters != null) { for (Entry chap : chapters) { setCurrentReferer(chap.getValue()); InputStream chapIn = Instance.getCache().open( chap.getValue(), this, true); try { story.getChapters().add( makeChapter(url, i, chap.getKey(), getChapterContent(url, chapIn, i))); } finally { chapIn.close(); } i++; } } return story; } finally { try { close(); } catch (IOException e) { Instance.syserr(e); } if (in != null) { in.close(); } currentReferer = null; } } /** * The support type.$ * * @return the type */ public SupportType getType() { return type; } /** * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e., * the current {@link URL} we work on. * * @return the referer */ public URL getCurrentReferer() { return currentReferer; } /** * The current referer {@link URL} (only one 'r', as in 'HTML'...), i.e., * the current {@link URL} we work on. * * @param currentReferer * the new referer */ protected void setCurrentReferer(URL currentReferer) { this.currentReferer = currentReferer; } /** * The support type. * * @param type * the new type * * @return this */ protected BasicSupport setType(SupportType type) { this.type = type; return this; } /** * Return the story publisher (by default, * {@link BasicSupport#getSourceName()}). * * @param source * the source of the story * @param in * the input (the main resource) * * @return the publisher * * @throws IOException * in case of I/O error */ protected String getPublisher(URL source, InputStream in) throws IOException { return getSourceName(); } /** * Return the story UUID, a unique value representing the story (it is often * an URL). *

* By default, this is the {@link URL} of the resource. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the uuid * * @throws IOException * in case of I/O error */ protected String getUuid(URL source, InputStream in) throws IOException { return source.toString(); } /** * Return the story Library UID, a unique value representing the story (it * is often a number) in the local library. *

* By default, this is empty. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the id * * @throws IOException * in case of I/O error */ protected String getLuid(URL source, InputStream in) throws IOException { return ""; } /** * Return the 2-letter language code of this story. *

* By default, this is 'EN'. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the language * * @throws IOException * in case of I/O error */ protected String getLang(URL source, InputStream in) throws IOException { return "EN"; } /** * Return the list of tags for this story. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the tags * * @throws IOException * in case of I/O error */ protected List getTags(URL source, InputStream in) throws IOException { return new ArrayList(); } /** * Return the first line from the given input which correspond to the given * selectors. *

* Do not reset the input, which will be pointing at the line just after the * result (input will be spent if no result is found). * * @param in * the input * @param needle * a string that must be found inside the target line * @param relativeLine * the line to return based upon the target line position (-1 = * the line before, 0 = the target line...) * * @return the line */ protected String getLine(InputStream in, String needle, int relativeLine) { return getLine(in, needle, relativeLine, true); } /** * Return a line from the given input which correspond to the given * selectors. *

* Do not reset the input, which will be pointing at the line just after the * result (input will be spent if no result is found) when first is TRUE, * and will always be spent if first is FALSE. * * @param in * the input * @param needle * a string that must be found inside the target line * @param relativeLine * the line to return based upon the target line position (-1 = * the line before, 0 = the target line...) * @param first * takes the first result (as opposed to the last one, which will * also always spend the input) * * @return the line */ protected String getLine(InputStream in, String needle, int relativeLine, boolean first) { String rep = null; List lines = new ArrayList(); @SuppressWarnings("resource") Scanner scan = new Scanner(in, "UTF-8"); int index = -1; scan.useDelimiter("\\n"); while (scan.hasNext()) { lines.add(scan.next()); if (index == -1 && lines.get(lines.size() - 1).contains(needle)) { index = lines.size() - 1; } if (index >= 0 && index + relativeLine < lines.size()) { rep = lines.get(index + relativeLine); if (first) { break; } } } return rep; } /** * Prepare the support if needed before processing. * * @throws IOException * on I/O error */ protected void preprocess(InputStream in) throws IOException { } /** * Now that we have processed the {@link Story}, close the resources if any. * * @throws IOException * on I/O error */ protected void close() throws IOException { } /** * Create a {@link Chapter} object from the given information, formatting * the content as it should be. * * @param number * the chapter number * @param name * the chapter name * @param content * the chapter content * * @return the {@link Chapter} * * @throws IOException * in case of I/O error */ protected Chapter makeChapter(URL source, int number, String name, String content) throws IOException { // Chapter name: process it correctly, then remove the possible // redundant "Chapter x: " in front of it String chapterName = processPara(name).getContent().trim(); for (String lang : Instance.getConfig().getString(Config.CHAPTER) .split(",")) { String chapterWord = Instance.getConfig().getStringX( Config.CHAPTER, lang); if (chapterName.startsWith(chapterWord)) { chapterName = chapterName.substring(chapterWord.length()) .trim(); break; } } if (chapterName.startsWith(Integer.toString(number))) { chapterName = chapterName.substring( Integer.toString(number).length()).trim(); } if (chapterName.startsWith(":")) { chapterName = chapterName.substring(1).trim(); } // Chapter chap = new Chapter(number, chapterName); if (content == null) { return chap; } if (isHtml()) { // Special


processing: content = content.replaceAll("(
]*>)|(
)|(
)", "\n* * *\n"); } InputStream in = new ByteArrayInputStream( content.getBytes(StandardCharsets.UTF_8)); try { @SuppressWarnings("resource") Scanner scan = new Scanner(in, "UTF-8"); scan.useDelimiter("(\\n|

)"); // \n for test,

for html List paras = new ArrayList(); while (scan.hasNext()) { String line = scan.next().trim(); boolean image = false; if (line.startsWith("[") && line.endsWith("]")) { URL url = getImage(source, line.substring(1, line.length() - 1).trim()); if (url != null) { paras.add(new Paragraph(url)); image = true; } } if (!image) { paras.add(processPara(line)); } } // Check quotes for "bad" format List newParas = new ArrayList(); for (Paragraph para : paras) { newParas.addAll(requotify(para)); } paras = newParas; // Remove double blanks/brks boolean space = false; boolean brk = true; for (int i = 0; i < paras.size(); i++) { Paragraph para = paras.get(i); boolean thisSpace = para.getType() == ParagraphType.BLANK; boolean thisBrk = para.getType() == ParagraphType.BREAK; if (space && thisBrk) { paras.remove(i - 1); i--; } else if ((space || brk) && (thisSpace || thisBrk)) { paras.remove(i); i--; } space = thisSpace; brk = thisBrk; } // Remove blank/brk at start if (paras.size() > 0 && (paras.get(0).getType() == ParagraphType.BLANK || paras .get(0).getType() == ParagraphType.BREAK)) { paras.remove(0); } // Remove blank/brk at end int last = paras.size() - 1; if (paras.size() > 0 && (paras.get(last).getType() == ParagraphType.BLANK || paras .get(last).getType() == ParagraphType.BREAK)) { paras.remove(last); } chap.setParagraphs(paras); return chap; } finally { in.close(); } } /** * Return the list of supported image extensions. * * @return the extensions */ protected String[] getImageExt(boolean emptyAllowed) { if (emptyAllowed) { return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" }; } else { return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" }; } } /** * Check if the given resource can be a local image or a remote image, then * refresh the cache with it if it is. * * @param source * the story source * @param line * the resource to check * * @return the image URL if found, or NULL * */ protected URL getImage(URL source, String line) { String path = new File(source.getFile()).getParent(); URL url = null; // try for files try { String urlBase = new File(new File(path), line.trim()).toURI() .toURL().toString(); for (String ext : getImageExt(true)) { if (new File(urlBase + ext).exists()) { url = new File(urlBase + ext).toURI().toURL(); } } } catch (Exception e) { // Nothing to do here } if (url == null) { // try for URLs try { for (String ext : getImageExt(true)) { if (Instance.getCache().check(new URL(line + ext))) { url = new URL(line + ext); } } // try out of cache if (url == null) { for (String ext : getImageExt(true)) { try { url = new URL(line + ext); Instance.getCache().refresh(url, this, true); break; } catch (IOException e) { // no image with this ext url = null; } } } } catch (MalformedURLException e) { // Not an url } } // refresh the cached file if (url != null) { try { Instance.getCache().refresh(url, this, true); } catch (IOException e) { // woops, broken image url = null; } } return url; } /** * Reset then return {@link BasicSupport#in}. * * @return {@link BasicSupport#in} * * @throws IOException * in case of I/O error */ protected InputStream getInput() throws IOException { in.reset(); return in; } /** * Fix the author name if it is prefixed with some "by" {@link String}. * * @param author * the author with a possible prefix * * @return the author without prefixes */ private String fixAuthor(String author) { if (author != null) { for (String suffix : new String[] { " ", ":" }) { for (String byString : Instance.getConfig() .getString(Config.BYS).split(",")) { byString += suffix; if (author.toUpperCase().startsWith(byString.toUpperCase())) { author = author.substring(byString.length()).trim(); } } } // Special case (without suffix): if (author.startsWith("©")) { author = author.substring(1); } } return author; } /** * Check quotes for bad format (i.e., quotes with normal paragraphs inside) * and requotify them (i.e., separate them into QUOTE paragraphs and other * paragraphs (quotes or not)). * * @param para * the paragraph to requotify (not necessaraly a quote) * * @return the correctly (or so we hope) quotified paragraphs */ private List requotify(Paragraph para) { List newParas = new ArrayList(); if (para.getType() == ParagraphType.QUOTE) { String line = para.getContent(); boolean singleQ = line.startsWith("" + openQuote); boolean doubleQ = line.startsWith("" + openDoubleQuote); if (!singleQ && !doubleQ) { line = openDoubleQuote + line + closeDoubleQuote; newParas.add(new Paragraph(ParagraphType.QUOTE, line)); } else { char close = singleQ ? closeQuote : closeDoubleQuote; int posClose = line.indexOf(close); int posDot = line.indexOf("."); while (posDot >= 0 && posDot < posClose) { posDot = line.indexOf(".", posDot + 1); } if (posDot >= 0) { String rest = line.substring(posDot + 1).trim(); line = line.substring(0, posDot + 1).trim(); newParas.add(new Paragraph(ParagraphType.QUOTE, line)); newParas.addAll(requotify(processPara(rest))); } else { newParas.add(para); } } } else { newParas.add(para); } return newParas; } /** * Process a {@link Paragraph} from a raw line of text. *

* Will also fix quotes and HTML encoding if needed. * * @param line * the raw line * * @return the processed {@link Paragraph} */ private Paragraph processPara(String line) { line = ifUnhtml(line).trim(); boolean space = true; boolean brk = true; boolean quote = false; boolean tentativeCloseQuote = false; char prev = '\0'; int dashCount = 0; StringBuilder builder = new StringBuilder(); for (char car : line.toCharArray()) { if (car != '-') { if (dashCount > 0) { // dash, ndash and mdash: - – — // currently: always use mdash builder.append(dashCount == 1 ? '-' : '—'); } dashCount = 0; } if (tentativeCloseQuote) { tentativeCloseQuote = false; if ((car >= 'a' && car <= 'z') || (car >= 'A' && car <= 'Z') || (car >= '0' && car <= '9')) { builder.append("'"); } else { builder.append(closeQuote); } } switch (car) { case ' ': // note: unbreakable space case ' ': case '\t': case '\n': // just in case case '\r': // just in case builder.append(' '); break; case '\'': if (space || (brk && quote)) { quote = true; builder.append(openQuote); } else if (prev == ' ') { builder.append(openQuote); } else { // it is a quote ("I'm off") or a 'quote' ("This // 'good' restaurant"...) tentativeCloseQuote = true; } break; case '"': if (space || (brk && quote)) { quote = true; builder.append(openDoubleQuote); } else if (prev == ' ') { builder.append(openDoubleQuote); } else { builder.append(closeDoubleQuote); } break; case '-': if (space) { quote = true; } else { dashCount++; } space = false; break; case '*': case '~': case '/': case '\\': case '<': case '>': case '=': case '+': case '_': case '–': case '—': space = false; builder.append(car); break; case '‘': case '`': case '‹': case '﹁': case '〈': case '「': if (space || (brk && quote)) { quote = true; builder.append(openQuote); } else { builder.append(openQuote); } space = false; brk = false; break; case '’': case '›': case '﹂': case '〉': case '」': space = false; brk = false; builder.append(closeQuote); break; case '«': case '“': case '﹃': case '《': case '『': if (space || (brk && quote)) { quote = true; builder.append(openDoubleQuote); } else { builder.append(openDoubleQuote); } space = false; brk = false; break; case '»': case '”': case '﹄': case '》': case '』': space = false; brk = false; builder.append(closeDoubleQuote); break; default: space = false; brk = false; builder.append(car); break; } prev = car; } if (tentativeCloseQuote) { tentativeCloseQuote = false; builder.append(closeQuote); } line = builder.toString().trim(); ParagraphType type = ParagraphType.NORMAL; if (space) { type = ParagraphType.BLANK; } else if (brk) { type = ParagraphType.BREAK; } else if (quote) { type = ParagraphType.QUOTE; } return new Paragraph(type, line); } /** * Remove the HTML from the inpit if {@link BasicSupport#isHtml()} is * true. * * @param input * the input * * @return the no html version if needed */ private String ifUnhtml(String input) { if (isHtml() && input != null) { return StringUtils.unhtml(input); } return input; } /** * Return a {@link BasicSupport} implementation supporting the given * resource if possible. * * @param url * the story resource * * @return an implementation that supports it, or NULL */ public static BasicSupport getSupport(URL url) { if (url == null) { return null; } // TEXT and INFO_TEXT always support files (not URLs though) for (SupportType type : SupportType.values()) { if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) { BasicSupport support = getSupport(type); if (support != null && support.supports(url)) { return support; } } } for (SupportType type : new SupportType[] { SupportType.TEXT, SupportType.INFO_TEXT }) { BasicSupport support = getSupport(type); if (support != null && support.supports(url)) { return support; } } return null; } /** * Return a {@link BasicSupport} implementation supporting the given type. * * @param type * the type * * @return an implementation that supports it, or NULL */ public static BasicSupport getSupport(SupportType type) { switch (type) { case EPUB: return new Epub().setType(type); case INFO_TEXT: return new InfoText().setType(type); case FIMFICTION: return new Fimfiction().setType(type); case FANFICTION: return new Fanfiction().setType(type); case TEXT: return new Text().setType(type); case MANGAFOX: return new MangaFox().setType(type); case E621: return new E621().setType(type); case CBZ: return new Cbz().setType(type); } return null; } }