package be.nikiroo.fanfix.supported;
-import java.io.ByteArrayInputStream;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
-import java.net.MalformedURLException;
import java.net.URL;
-import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
-import java.util.Scanner;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
import be.nikiroo.fanfix.Instance;
-import be.nikiroo.fanfix.bundles.Config;
import be.nikiroo.fanfix.bundles.StringId;
import be.nikiroo.fanfix.data.Chapter;
import be.nikiroo.fanfix.data.MetaData;
-import be.nikiroo.fanfix.data.Paragraph;
import be.nikiroo.fanfix.data.Story;
-import be.nikiroo.fanfix.data.Paragraph.ParagraphType;
+import be.nikiroo.utils.Progress;
import be.nikiroo.utils.StringUtils;
/**
* @author niki
*/
public abstract class BasicSupport {
- /**
- * The supported input types for which we can get a {@link BasicSupport}
- * object.
- *
- * @author niki
- */
- public enum SupportType {
- /** EPUB files created with this program */
- EPUB,
- /** Pure text file with some rules */
- TEXT,
- /** TEXT but with associated .info file */
- INFO_TEXT,
- /** My Little Pony fanfictions */
- FIMFICTION,
- /** Fanfictions from a lot of different universes */
- FANFICTION,
- /** Website with lots of Mangas */
- MANGAFOX,
- /** Furry website with comics support */
- E621,
- /** CBZ files */
- CBZ;
-
- /**
- * A description of this support type (more information than the
- * {@link BasicSupport#getSourceName()}).
- *
- * @return the description
- */
- public String getDesc() {
- String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC,
- this.name());
-
- if (desc == null) {
- desc = Instance.getTrans().getString(StringId.INPUT_DESC, this);
- }
-
- return desc;
- }
-
- /**
- * The name of this support type (a short version).
- *
- * @return the name
- */
- public String getSourceName() {
- BasicSupport support = BasicSupport.getSupport(this);
- if (support != null) {
- return support.getSourceName();
- }
-
- return null;
- }
-
- @Override
- public String toString() {
- return super.toString().toLowerCase();
- }
-
- /**
- * Call {@link SupportType#valueOf(String.toUpperCase())}.
- *
- * @param typeName
- * the possible type name
- *
- * @return NULL or the type
- */
- public static SupportType valueOfUC(String typeName) {
- return SupportType.valueOf(typeName == null ? null : typeName
- .toUpperCase());
- }
-
- /**
- * Call {@link SupportType#valueOf(String.toUpperCase())} but return
- * NULL for NULL instead of raising exception.
- *
- * @param typeName
- * the possible type name
- *
- * @return NULL or the type
- */
- public static SupportType valueOfNullOkUC(String typeName) {
- if (typeName == null) {
- return null;
- }
-
- return SupportType.valueOfUC(typeName);
- }
-
- /**
- * Call {@link SupportType#valueOf(String.toUpperCase())} but return
- * NULL in case of error instead of raising an exception.
- *
- * @param typeName
- * the possible type name
- *
- * @return NULL or the type
- */
- public static SupportType valueOfAllOkUC(String typeName) {
- try {
- return SupportType.valueOfUC(typeName);
- } catch (Exception e) {
- return null;
- }
- }
- }
-
- /** Only used by {@link BasicSupport#getInput()} just so it is always reset. */
- private InputStream in;
+ private Document sourceNode;
+ private URL source;
private SupportType type;
- private URL currentReferer; // with on 'r', as in 'HTTP'...
-
- // quote chars
- private char openQuote = Instance.getTrans().getChar(
- StringId.OPEN_SINGLE_QUOTE);
- private char closeQuote = Instance.getTrans().getChar(
- StringId.CLOSE_SINGLE_QUOTE);
- private char openDoubleQuote = Instance.getTrans().getChar(
- StringId.OPEN_DOUBLE_QUOTE);
- private char closeDoubleQuote = Instance.getTrans().getChar(
- StringId.CLOSE_DOUBLE_QUOTE);
-
- /**
- * The name of this support class.
- *
- * @return the name
- */
- protected abstract String getSourceName();
+ private URL currentReferer; // with only one 'r', as in 'HTTP'...
+
+ static protected BasicSupportHelper bsHelper = new BasicSupportHelper();
+ static protected BasicSupportImages bsImages = new BasicSupportImages();
+ static protected BasicSupportPara bsPara = new BasicSupportPara(new BasicSupportHelper(), new BasicSupportImages());
/**
* Check if the given resource is supported by this {@link BasicSupport}.
protected abstract boolean isHtml();
/**
- * Return the story title.
+ * Return the {@link MetaData} of this story.
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the title
+ * @return the associated {@link MetaData}, never NULL
*
* @throws IOException
* in case of I/O error
*/
- protected abstract String getTitle(URL source, InputStream in)
- throws IOException;
-
- /**
- * Return the story author.
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the author
- *
- * @throws IOException
- * in case of I/O error
- */
- protected abstract String getAuthor(URL source, InputStream in)
- throws IOException;
-
- /**
- * Return the story publication date.
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the date
- *
- * @throws IOException
- * in case of I/O error
- */
- protected abstract String getDate(URL source, InputStream in)
- throws IOException;
-
- /**
- * Return the subject of the story (for instance, if it is a fanfiction,
- * what is the original work; if it is a technical text, what is the
- * technical subject...).
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the subject
- *
- * @throws IOException
- * in case of I/O error
- */
- protected abstract String getSubject(URL source, InputStream in)
- throws IOException;
+ protected abstract MetaData getMeta() throws IOException;
/**
* Return the story description.
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
* @return the description
*
* @throws IOException
* in case of I/O error
*/
- protected abstract String getDesc(URL source, InputStream in)
- throws IOException;
+ protected abstract String getDesc() throws IOException;
/**
- * Return the story cover resource if any, or NULL if none.
+ * Return the list of chapters (name and resource).
* <p>
- * The default cover should not be checked for here.
+ * Can be NULL if this {@link BasicSupport} do no use chapters.
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
+ * @param pg
+ * the optional progress reporter
*
- * @return the cover or NULL
+ * @return the chapters or NULL
*
* @throws IOException
* in case of I/O error
*/
- protected abstract URL getCover(URL source, InputStream in)
+ protected abstract List<Entry<String, URL>> getChapters(Progress pg)
throws IOException;
- /**
- * Return the list of chapters (name and resource).
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the chapters
- *
- * @throws IOException
- * in case of I/O error
- */
- protected abstract List<Entry<String, URL>> getChapters(URL source,
- InputStream in) throws IOException;
-
/**
* Return the content of the chapter (possibly HTML encoded, if
* {@link BasicSupport#isHtml()} is TRUE).
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
+ * @param chapUrl
+ * the chapter {@link URL}
* @param number
* the chapter number
+ * @param pg
+ * the optional progress reporter
*
* @return the content
*
* @throws IOException
* in case of I/O error
*/
- protected abstract String getChapterContent(URL source, InputStream in,
- int number) throws IOException;
-
- /**
- * Check if this {@link BasicSupport} is mainly catered to image files.
- *
- * @return TRUE if it is
- */
- public boolean isImageDocument(URL source, InputStream in)
- throws IOException {
- return false;
- }
+ protected abstract String getChapterContent(URL chapUrl, int number,
+ Progress pg) throws IOException;
/**
* Return the list of cookies (values included) that must be used to
}
/**
- * Process the given story resource into a partially filled {@link Story}
- * object containing the name and metadata, except for the description.
+ * OAuth authorisation (aka, "bearer XXXXXXX").
*
- * @param url
- * the story resource
- *
- * @return the {@link Story}
- *
- * @throws IOException
- * in case of I/O error
+ * @return the OAuth string
*/
- public Story processMeta(URL url) throws IOException {
- return processMeta(url, true, false);
+ public String getOAuth() {
+ return null;
}
/**
- * Process the given story resource into a partially filled {@link Story}
- * object containing the name and metadata.
- *
- * @param url
- * the story resource
+ * Return the canonical form of the main {@link URL}.
*
- * @param close
- * close "this" and "in" when done
- *
- * @return the {@link Story}
+ * @param source
+ * the source {@link URL}, which can be NULL
*
- * @throws IOException
- * in case of I/O error
+ * @return the canonical form of this {@link URL} or NULL if the source was
+ * NULL
*/
- protected Story processMeta(URL url, boolean close, boolean getDesc)
- throws IOException {
- in = Instance.getCache().open(url, this, false);
- if (in == null) {
- return null;
- }
-
- try {
- preprocess(getInput());
-
- Story story = new Story();
- story.setMeta(new MetaData());
- story.getMeta().setTitle(ifUnhtml(getTitle(url, getInput())));
- story.getMeta().setAuthor(
- fixAuthor(ifUnhtml(getAuthor(url, getInput()))));
- story.getMeta().setDate(ifUnhtml(getDate(url, getInput())));
- story.getMeta().setTags(getTags(url, getInput()));
- story.getMeta().setSource(getSourceName());
- story.getMeta().setPublisher(
- ifUnhtml(getPublisher(url, getInput())));
- story.getMeta().setUuid(getUuid(url, getInput()));
- story.getMeta().setLuid(getLuid(url, getInput()));
- story.getMeta().setLang(getLang(url, getInput()));
- story.getMeta().setSubject(ifUnhtml(getSubject(url, getInput())));
- story.getMeta().setImageDocument(isImageDocument(url, getInput()));
-
- if (getDesc) {
- String descChapterName = Instance.getTrans().getString(
- StringId.DESCRIPTION);
- story.getMeta().setResume(
- makeChapter(url, 0, descChapterName,
- getDesc(url, getInput())));
- }
-
- return story;
- } finally {
- if (close) {
- try {
- close();
- } catch (IOException e) {
- Instance.syserr(e);
- }
-
- if (in != null) {
- in.close();
- }
- }
- }
+ protected URL getCanonicalUrl(URL source) {
+ return source;
}
/**
- * Process the given story resource into a fully filled {@link Story}
- * object.
+ * The main {@link Node} for this {@link Story}.
*
- * @param url
- * the story resource
- *
- * @return the {@link Story}
- *
- * @throws IOException
- * in case of I/O error
+ * @return the node
*/
- public Story process(URL url) throws IOException {
- setCurrentReferer(url);
-
- try {
- Story story = processMeta(url, false, true);
- if (story == null) {
- return null;
- }
-
- story.setChapters(new ArrayList<Chapter>());
-
- URL cover = getCover(url, getInput());
- if (cover == null) {
- String subject = story.getMeta() == null ? null : story
- .getMeta().getSubject();
- if (subject != null && !subject.isEmpty()
- && Instance.getCoverDir() != null) {
- File fileCover = new File(Instance.getCoverDir(), subject);
- cover = getImage(fileCover.toURI().toURL(), subject);
- }
- }
-
- if (cover != null) {
- InputStream coverIn = null;
- try {
- coverIn = Instance.getCache().open(cover, this, true);
- story.getMeta().setCover(StringUtils.toImage(coverIn));
- } catch (IOException e) {
- Instance.syserr(new IOException(Instance.getTrans()
- .getString(StringId.ERR_BS_NO_COVER, cover), e));
- } finally {
- if (coverIn != null)
- coverIn.close();
- }
- }
-
- List<Entry<String, URL>> chapters = getChapters(url, getInput());
- int i = 1;
- if (chapters != null) {
- for (Entry<String, URL> chap : chapters) {
- setCurrentReferer(chap.getValue());
- InputStream chapIn = Instance.getCache().open(
- chap.getValue(), this, true);
- try {
- story.getChapters().add(
- makeChapter(url, i, chap.getKey(),
- getChapterContent(url, chapIn, i)));
- } finally {
- chapIn.close();
- }
- i++;
- }
- }
-
- return story;
-
- } finally {
- try {
- close();
- } catch (IOException e) {
- Instance.syserr(e);
- }
-
- if (in != null) {
- in.close();
- }
-
- currentReferer = null;
- }
+ protected Element getSourceNode() {
+ return sourceNode;
}
/**
- * The support type.$
+ * The main {@link URL} for this {@link Story}.
*
- * @return the type
+ * @return the URL
*/
- public SupportType getType() {
- return type;
+ protected URL getSource() {
+ return source;
}
/**
/**
* The support type.
*
- * @param type
- * the new type
- *
- * @return this
+ * @return the type
*/
- protected BasicSupport setType(SupportType type) {
- this.type = type;
- return this;
+ public SupportType getType() {
+ return type;
}
/**
- * Return the story publisher (by default,
- * {@link BasicSupport#getSourceName()}).
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the publisher
+ * The support type.
*
- * @throws IOException
- * in case of I/O error
+ * @param type
+ * the new type
*/
- protected String getPublisher(URL source, InputStream in)
- throws IOException {
- return getSourceName();
+ protected void setType(SupportType type) {
+ this.type = type;
}
/**
- * Return the story UUID, a unique value representing the story (it is often
- * an URL).
+ * Open an input link that will be used for the support.
* <p>
- * By default, this is the {@link URL} of the resource.
+ * Can return NULL, in which case you are supposed to work without a source
+ * node.
*
* @param source
- * the source of the story
- * @param in
- * the input (the main resource)
+ * the source {@link URL}
*
- * @return the uuid
+ * @return the {@link InputStream}
*
* @throws IOException
* in case of I/O error
*/
- protected String getUuid(URL source, InputStream in) throws IOException {
- return source.toString();
+ protected Document loadDocument(URL source) throws IOException {
+ String url = getCanonicalUrl(source).toString();
+ return DataUtil.load(Instance.getInstance().getCache().open(source, this, false), "UTF-8", url.toString());
}
/**
- * Return the story Library UID, a unique value representing the story (it
- * is often a number) in the local library.
- * <p>
- * By default, this is empty.
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the id
+ * Log into the support (can be a no-op depending upon the support).
*
* @throws IOException
* in case of I/O error
*/
- protected String getLuid(URL source, InputStream in) throws IOException {
- return "";
+ protected void login() throws IOException {
}
/**
- * Return the 2-letter language code of this story.
- * <p>
- * By default, this is 'EN'.
- *
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
- *
- * @return the language
- *
- * @throws IOException
- * in case of I/O error
+ * Now that we have processed the {@link Story}, close the resources if any.
*/
- protected String getLang(URL source, InputStream in) throws IOException {
- return "EN";
+ protected void close() {
+ setCurrentReferer(null);
}
/**
- * Return the list of tags for this story.
+ * Process the given story resource into a partially filled {@link Story}
+ * object containing the name and metadata.
*
- * @param source
- * the source of the story
- * @param in
- * the input (the main resource)
+ * @param getDesc
+ * retrieve the description of the story, or not
+ * @param pg
+ * the optional progress reporter
*
- * @return the tags
+ * @return the {@link Story}, never NULL
*
* @throws IOException
* in case of I/O error
*/
- protected List<String> getTags(URL source, InputStream in)
+ protected Story processMeta(boolean getDesc, Progress pg)
throws IOException {
- return new ArrayList<String>();
- }
+ if (pg == null) {
+ pg = new Progress();
+ } else {
+ pg.setMinMax(0, 100);
+ }
- /**
- * Return the first line from the given input which correspond to the given
- * selectors.
- * <p>
- * Do not reset the input, which will be pointing at the line just after the
- * result (input will be spent if no result is found).
- *
- * @param in
- * the input
- * @param needle
- * a string that must be found inside the target line (also
- * supports "^" at start to say "only if it starts with" the
- * needle)
- * @param relativeLine
- * the line to return based upon the target line position (-1 =
- * the line before, 0 = the target line...)
- *
- * @return the line
- */
- protected String getLine(InputStream in, String needle, int relativeLine) {
- return getLine(in, needle, relativeLine, true);
- }
+ pg.setProgress(30);
- /**
- * Return a line from the given input which correspond to the given
- * selectors.
- * <p>
- * Do not reset the input, which will be pointing at the line just after the
- * result (input will be spent if no result is found) when first is TRUE,
- * and will always be spent if first is FALSE.
- *
- * @param in
- * the input
- * @param needle
- * a string that must be found inside the target line (also
- * supports "^" at start to say "only if it starts with" the
- * needle)
- * @param relativeLine
- * the line to return based upon the target line position (-1 =
- * the line before, 0 = the target line...)
- * @param first
- * takes the first result (as opposed to the last one, which will
- * also always spend the input)
- *
- * @return the line
- */
- protected String getLine(InputStream in, String needle, int relativeLine,
- boolean first) {
- String rep = null;
+ Story story = new Story();
+ MetaData meta = getMeta();
+ if (meta.getCreationDate() == null || meta.getCreationDate().isEmpty()) {
+ meta.setCreationDate(StringUtils.fromTime(new Date().getTime()));
+ }
+ story.setMeta(meta);
- List<String> lines = new ArrayList<String>();
- @SuppressWarnings("resource")
- Scanner scan = new Scanner(in, "UTF-8");
- int index = -1;
- scan.useDelimiter("\\n");
- while (scan.hasNext()) {
- lines.add(scan.next());
+ pg.setProgress(50);
- if (index == -1) {
- if (needle.startsWith("^")) {
- if (lines.get(lines.size() - 1).startsWith(
- needle.substring(1))) {
- index = lines.size() - 1;
- }
+ if (meta.getCover() == null) {
+ meta.setCover(bsHelper.getDefaultCover(meta.getSubject()));
+ }
- } else {
- if (lines.get(lines.size() - 1).contains(needle)) {
- index = lines.size() - 1;
- }
- }
- }
+ pg.setProgress(60);
- if (index >= 0 && index + relativeLine < lines.size()) {
- rep = lines.get(index + relativeLine);
- if (first) {
- break;
- }
- }
+ if (getDesc) {
+ String descChapterName = Instance.getInstance().getTrans().getString(StringId.DESCRIPTION);
+ story.getMeta().setResume(bsPara.makeChapter(this, source, 0, descChapterName, //
+ getDesc(), isHtml(), null));
}
- return rep;
- }
-
- /**
- * Prepare the support if needed before processing.
- *
- * @throws IOException
- * on I/O error
- */
- protected void preprocess(InputStream in) throws IOException {
+ pg.done();
+ return story;
}
/**
- * Now that we have processed the {@link Story}, close the resources if any.
+ * Process the given story resource into a fully filled {@link Story}
+ * object.
*
- * @throws IOException
- * on I/O error
- */
- protected void close() throws IOException {
- }
-
- /**
- * Create a {@link Chapter} object from the given information, formatting
- * the content as it should be.
+ * @param pg
+ * the optional progress reporter
*
- * @param number
- * the chapter number
- * @param name
- * the chapter name
- * @param content
- * the chapter content
- *
- * @return the {@link Chapter}
+ * @return the {@link Story}, never NULL
*
* @throws IOException
* in case of I/O error
*/
- protected Chapter makeChapter(URL source, int number, String name,
- String content) throws IOException {
-
- // Chapter name: process it correctly, then remove the possible
- // redundant "Chapter x: " in front of it
- String chapterName = processPara(name).getContent().trim();
- for (String lang : Instance.getConfig().getString(Config.CHAPTER)
- .split(",")) {
- String chapterWord = Instance.getConfig().getStringX(
- Config.CHAPTER, lang);
- if (chapterName.startsWith(chapterWord)) {
- chapterName = chapterName.substring(chapterWord.length())
- .trim();
- break;
- }
- }
+ // TODO: ADD final when BasicSupport_Deprecated is gone
+ public Story process(Progress pg) throws IOException {
+ setCurrentReferer(source);
+ login();
+ sourceNode = loadDocument(source);
- if (chapterName.startsWith(Integer.toString(number))) {
- chapterName = chapterName.substring(
- Integer.toString(number).length()).trim();
- }
-
- if (chapterName.startsWith(":")) {
- chapterName = chapterName.substring(1).trim();
- }
- //
-
- Chapter chap = new Chapter(number, chapterName);
-
- if (content == null) {
- return chap;
- }
-
- if (isHtml()) {
- // Special <HR> processing:
- content = content.replaceAll("(<hr [^>]*>)|(<hr/>)|(<hr>)",
- "\n* * *\n");
- }
-
- InputStream in = new ByteArrayInputStream(
- content.getBytes(StandardCharsets.UTF_8));
try {
- @SuppressWarnings("resource")
- Scanner scan = new Scanner(in, "UTF-8");
- scan.useDelimiter("(\\n|</p>)"); // \n for test, </p> for html
-
- List<Paragraph> paras = new ArrayList<Paragraph>();
- while (scan.hasNext()) {
- String line = scan.next().trim();
- boolean image = false;
- if (line.startsWith("[") && line.endsWith("]")) {
- URL url = getImage(source,
- line.substring(1, line.length() - 1).trim());
- if (url != null) {
- paras.add(new Paragraph(url));
- image = true;
- }
- }
-
- if (!image) {
- paras.add(processPara(line));
- }
- }
-
- // Check quotes for "bad" format
- List<Paragraph> newParas = new ArrayList<Paragraph>();
- for (Paragraph para : paras) {
- newParas.addAll(requotify(para));
- }
- paras = newParas;
-
- // Remove double blanks/brks
- boolean space = false;
- boolean brk = true;
- for (int i = 0; i < paras.size(); i++) {
- Paragraph para = paras.get(i);
- boolean thisSpace = para.getType() == ParagraphType.BLANK;
- boolean thisBrk = para.getType() == ParagraphType.BREAK;
-
- if (space && thisBrk) {
- paras.remove(i - 1);
- i--;
- } else if ((space || brk) && (thisSpace || thisBrk)) {
- paras.remove(i);
- i--;
- }
-
- space = thisSpace;
- brk = thisBrk;
- }
-
- // Remove blank/brk at start
- if (paras.size() > 0
- && (paras.get(0).getType() == ParagraphType.BLANK || paras
- .get(0).getType() == ParagraphType.BREAK)) {
- paras.remove(0);
- }
-
- // Remove blank/brk at end
- int last = paras.size() - 1;
- if (paras.size() > 0
- && (paras.get(last).getType() == ParagraphType.BLANK || paras
- .get(last).getType() == ParagraphType.BREAK)) {
- paras.remove(last);
- }
-
- chap.setParagraphs(paras);
-
- return chap;
+ return doProcess(pg);
} finally {
- in.close();
- }
- }
-
- /**
- * Return the list of supported image extensions.
- *
- * @return the extensions
- */
- protected String[] getImageExt(boolean emptyAllowed) {
- if (emptyAllowed) {
- return new String[] { "", ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
- } else {
- return new String[] { ".png", ".jpg", ".jpeg", ".gif", ".bmp" };
+ close();
}
}
/**
- * Check if the given resource can be a local image or a remote image, then
- * refresh the cache with it if it is.
+ * Actual processing step, without the calls to other methods.
+ * <p>
+ * Will convert the story resource into a fully filled {@link Story} object.
*
- * @param source
- * the story source
- * @param line
- * the resource to check
+ * @param pg
+ * the optional progress reporter
*
- * @return the image URL if found, or NULL
- *
- */
- protected URL getImage(URL source, String line) {
- String path = new File(source.getFile()).getParent();
- URL url = null;
-
- // try for files
- try {
- String urlBase = new File(new File(path), line.trim()).toURI()
- .toURL().toString();
- for (String ext : getImageExt(true)) {
- if (new File(urlBase + ext).exists()) {
- url = new File(urlBase + ext).toURI().toURL();
- }
- }
- } catch (Exception e) {
- // Nothing to do here
- }
-
- if (url == null) {
- // try for URLs
- try {
- for (String ext : getImageExt(true)) {
- if (Instance.getCache().check(new URL(line + ext))) {
- url = new URL(line + ext);
- }
- }
-
- // try out of cache
- if (url == null) {
- for (String ext : getImageExt(true)) {
- try {
- url = new URL(line + ext);
- Instance.getCache().refresh(url, this, true);
- break;
- } catch (IOException e) {
- // no image with this ext
- url = null;
- }
- }
- }
- } catch (MalformedURLException e) {
- // Not an url
- }
- }
-
- // refresh the cached file
- if (url != null) {
- try {
- Instance.getCache().refresh(url, this, true);
- } catch (IOException e) {
- // woops, broken image
- url = null;
- }
- }
-
- return url;
- }
-
- /**
- * Reset then return {@link BasicSupport#in}.
- *
- * @return {@link BasicSupport#in}
+ * @return the {@link Story}, never NULL
*
* @throws IOException
* in case of I/O error
*/
- protected InputStream getInput() throws IOException {
- in.reset();
- return in;
- }
-
- /**
- * Fix the author name if it is prefixed with some "by" {@link String}.
- *
- * @param author
- * the author with a possible prefix
- *
- * @return the author without prefixes
- */
- private String fixAuthor(String author) {
- if (author != null) {
- for (String suffix : new String[] { " ", ":" }) {
- for (String byString : Instance.getConfig()
- .getString(Config.BYS).split(",")) {
- byString += suffix;
- if (author.toUpperCase().startsWith(byString.toUpperCase())) {
- author = author.substring(byString.length()).trim();
- }
- }
- }
-
- // Special case (without suffix):
- if (author.startsWith("©")) {
- author = author.substring(1);
- }
- }
-
- return author;
- }
-
- /**
- * Check quotes for bad format (i.e., quotes with normal paragraphs inside)
- * and requotify them (i.e., separate them into QUOTE paragraphs and other
- * paragraphs (quotes or not)).
- *
- * @param para
- * the paragraph to requotify (not necessaraly a quote)
- *
- * @return the correctly (or so we hope) quotified paragraphs
- */
- private List<Paragraph> requotify(Paragraph para) {
- List<Paragraph> newParas = new ArrayList<Paragraph>();
-
- if (para.getType() == ParagraphType.QUOTE) {
- String line = para.getContent();
- boolean singleQ = line.startsWith("" + openQuote);
- boolean doubleQ = line.startsWith("" + openDoubleQuote);
-
- if (!singleQ && !doubleQ) {
- line = openDoubleQuote + line + closeDoubleQuote;
- newParas.add(new Paragraph(ParagraphType.QUOTE, line));
- } else {
- char close = singleQ ? closeQuote : closeDoubleQuote;
- int posClose = line.indexOf(close);
- int posDot = line.indexOf(".");
- while (posDot >= 0 && posDot < posClose) {
- posDot = line.indexOf(".", posDot + 1);
- }
-
- if (posDot >= 0) {
- String rest = line.substring(posDot + 1).trim();
- line = line.substring(0, posDot + 1).trim();
- newParas.add(new Paragraph(ParagraphType.QUOTE, line));
- newParas.addAll(requotify(processPara(rest)));
- } else {
- newParas.add(para);
- }
- }
+ protected Story doProcess(Progress pg) throws IOException {
+ if (pg == null) {
+ pg = new Progress();
} else {
- newParas.add(para);
+ pg.setMinMax(0, 100);
}
- return newParas;
- }
-
- /**
- * Process a {@link Paragraph} from a raw line of text.
- * <p>
- * Will also fix quotes and HTML encoding if needed.
- *
- * @param line
- * the raw line
- *
- * @return the processed {@link Paragraph}
- */
- private Paragraph processPara(String line) {
- line = ifUnhtml(line).trim();
+ pg.setProgress(1);
+ Progress pgMeta = new Progress();
+ pg.addProgress(pgMeta, 10);
+ Story story = processMeta(true, pgMeta);
+ pgMeta.done(); // 10%
- boolean space = true;
- boolean brk = true;
- boolean quote = false;
- boolean tentativeCloseQuote = false;
- char prev = '\0';
- int dashCount = 0;
+ pg.setName("Retrieving " + story.getMeta().getTitle());
- StringBuilder builder = new StringBuilder();
- for (char car : line.toCharArray()) {
- if (car != '-') {
- if (dashCount > 0) {
- // dash, ndash and mdash: - – —
- // currently: always use mdash
- builder.append(dashCount == 1 ? '-' : '—');
- }
- dashCount = 0;
- }
+ Progress pgGetChapters = new Progress();
+ pg.addProgress(pgGetChapters, 10);
+ story.setChapters(new ArrayList<Chapter>());
+ List<Entry<String, URL>> chapters = getChapters(pgGetChapters);
+ pgGetChapters.done(); // 20%
- if (tentativeCloseQuote) {
- tentativeCloseQuote = false;
- if ((car >= 'a' && car <= 'z') || (car >= 'A' && car <= 'Z')
- || (car >= '0' && car <= '9')) {
- builder.append("'");
- } else {
- builder.append(closeQuote);
- }
- }
+ if (chapters != null) {
+ Progress pgChaps = new Progress("Extracting chapters", 0,
+ chapters.size() * 300);
+ pg.addProgress(pgChaps, 80);
- switch (car) {
- case ' ': // note: unbreakable space
- case ' ':
- case '\t':
- case '\n': // just in case
- case '\r': // just in case
- builder.append(' ');
- break;
-
- case '\'':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openQuote);
- } else if (prev == ' ') {
- builder.append(openQuote);
- } else {
- // it is a quote ("I'm off") or a 'quote' ("This
- // 'good' restaurant"...)
- tentativeCloseQuote = true;
- }
- break;
-
- case '"':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openDoubleQuote);
- } else if (prev == ' ') {
- builder.append(openDoubleQuote);
- } else {
- builder.append(closeDoubleQuote);
- }
- break;
-
- case '-':
- if (space) {
- quote = true;
- } else {
- dashCount++;
- }
- space = false;
- break;
-
- case '*':
- case '~':
- case '/':
- case '\\':
- case '<':
- case '>':
- case '=':
- case '+':
- case '_':
- case '–':
- case '—':
- space = false;
- builder.append(car);
- break;
-
- case '‘':
- case '`':
- case '‹':
- case '﹁':
- case '〈':
- case '「':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openQuote);
- } else {
- builder.append(openQuote);
+ long words = 0;
+ int i = 1;
+ for (Entry<String, URL> chap : chapters) {
+ pgChaps.setName("Extracting chapter " + i);
+ URL chapUrl = chap.getValue();
+ String chapName = chap.getKey();
+ if (chapUrl != null) {
+ setCurrentReferer(chapUrl);
}
- space = false;
- brk = false;
- break;
- case '’':
- case '›':
- case '﹂':
- case '〉':
- case '」':
- space = false;
- brk = false;
- builder.append(closeQuote);
- break;
+ pgChaps.setProgress(i * 100);
+ Progress pgGetChapterContent = new Progress();
+ Progress pgMakeChapter = new Progress();
+ pgChaps.addProgress(pgGetChapterContent, 100);
+ pgChaps.addProgress(pgMakeChapter, 100);
- case '«':
- case '“':
- case '﹃':
- case '《':
- case '『':
- if (space || (brk && quote)) {
- quote = true;
- builder.append(openDoubleQuote);
- } else {
- builder.append(openDoubleQuote);
- }
- space = false;
- brk = false;
- break;
+ String content = getChapterContent(chapUrl, i,
+ pgGetChapterContent);
+ pgGetChapterContent.done();
+ Chapter cc = bsPara.makeChapter(this, chapUrl, i,
+ chapName, content, isHtml(), pgMakeChapter);
+ pgMakeChapter.done();
- case '»':
- case '”':
- case '﹄':
- case '》':
- case '』':
- space = false;
- brk = false;
- builder.append(closeDoubleQuote);
- break;
+ words += cc.getWords();
+ story.getChapters().add(cc);
+ story.getMeta().setWords(words);
- default:
- space = false;
- brk = false;
- builder.append(car);
- break;
+ i++;
}
- prev = car;
+ pgChaps.setName("Extracting chapters");
+ pgChaps.done();
}
- if (tentativeCloseQuote) {
- tentativeCloseQuote = false;
- builder.append(closeQuote);
- }
+ pg.done();
- line = builder.toString().trim();
-
- ParagraphType type = ParagraphType.NORMAL;
- if (space) {
- type = ParagraphType.BLANK;
- } else if (brk) {
- type = ParagraphType.BREAK;
- } else if (quote) {
- type = ParagraphType.QUOTE;
- }
-
- return new Paragraph(type, line);
+ return story;
}
/**
- * Remove the HTML from the inpit <b>if</b> {@link BasicSupport#isHtml()} is
- * true.
+ * Create a chapter from the given data.
*
- * @param input
- * the input
+ * @param source
+ * the source URL for this content, which can be used to try and
+ * find images if images are present in the format [image-url]
+ * @param number
+ * the chapter number (0 = description)
+ * @param name
+ * the chapter name
+ * @param content
+ * the content of the chapter
+ * @return the {@link Chapter}
*
- * @return the no html version if needed
+ * @throws IOException
+ * in case of I/O error
*/
- private String ifUnhtml(String input) {
- if (isHtml() && input != null) {
- return StringUtils.unhtml(input);
- }
-
- return input;
+ public Chapter makeChapter(URL source, int number, String name,
+ String content) throws IOException {
+ return bsPara.makeChapter(this, source, number, name,
+ content, isHtml(), null);
}
/**
// TEXT and INFO_TEXT always support files (not URLs though)
for (SupportType type : SupportType.values()) {
if (type != SupportType.TEXT && type != SupportType.INFO_TEXT) {
- BasicSupport support = getSupport(type);
+ BasicSupport support = getSupport(type, url);
if (support != null && support.supports(url)) {
return support;
}
}
}
- for (SupportType type : new SupportType[] { SupportType.TEXT,
- SupportType.INFO_TEXT }) {
- BasicSupport support = getSupport(type);
+ for (SupportType type : new SupportType[] { SupportType.INFO_TEXT,
+ SupportType.TEXT }) {
+ BasicSupport support = getSupport(type, url);
if (support != null && support.supports(url)) {
return support;
}
* Return a {@link BasicSupport} implementation supporting the given type.
*
* @param type
- * the type
+ * the type, must not be NULL
+ * @param url
+ * the {@link URL} to support (can be NULL to get an
+ * "abstract support"; if not NULL, will be used as the source
+ * URL)
*
* @return an implementation that supports it, or NULL
*/
- public static BasicSupport getSupport(SupportType type) {
+ public static BasicSupport getSupport(SupportType type, URL url) {
+ BasicSupport support = null;
+
switch (type) {
case EPUB:
- return new Epub().setType(type);
+ support = new Epub();
+ break;
case INFO_TEXT:
- return new InfoText().setType(type);
+ support = new InfoText();
+ break;
case FIMFICTION:
- return new Fimfiction().setType(type);
+ try {
+ // Can fail if no client key or NO in options
+ support = new FimfictionApi();
+ } catch (IOException e) {
+ support = new Fimfiction();
+ }
+ break;
case FANFICTION:
- return new Fanfiction().setType(type);
+ support = new Fanfiction();
+ break;
case TEXT:
- return new Text().setType(type);
- case MANGAFOX:
- return new MangaFox().setType(type);
+ support = new Text();
+ break;
+ case MANGAHUB:
+ support = new MangaHub();
+ break;
case E621:
- return new E621().setType(type);
+ support = new E621();
+ break;
+ case YIFFSTAR:
+ support = new YiffStar();
+ break;
+ case E_HENTAI:
+ support = new EHentai();
+ break;
+ case MANGA_LEL:
+ support = new MangaLel();
+ break;
case CBZ:
- return new Cbz().setType(type);
+ support = new Cbz();
+ break;
+ case HTML:
+ support = new Html();
+ break;
}
- return null;
+ if (support != null) {
+ support.setType(type);
+ support.source = support.getCanonicalUrl(url);
+ }
+
+ return support;
}
}