package be.nikiroo.fanfix.supported; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import be.nikiroo.fanfix.Instance; import be.nikiroo.fanfix.bundles.Config; import be.nikiroo.fanfix.bundles.StringId; import be.nikiroo.fanfix.data.Chapter; import be.nikiroo.fanfix.data.MetaData; import be.nikiroo.fanfix.data.Paragraph; import be.nikiroo.fanfix.data.Story; import be.nikiroo.fanfix.data.Paragraph.ParagraphType; import be.nikiroo.utils.StringUtils; /** * This class is the base class used by the other support classes. It can be * used outside of this package, and have static method that you can use to get * access to the correct support class. *
* It will be used with 'resources' (usually web pages or files). * * @author niki */ public abstract class BasicSupport { /** * The supported input types for which we can get a {@link BasicSupport} * object. * * @author niki */ public enum SupportType { /** EPUB files created with this program */ EPUB, /** Pure text file with some rules */ TEXT, /** TEXT but with associated .info file */ INFO_TEXT, /** My Little Pony fanfictions */ FIMFICTION, /** Fanfictions from a lot of different universes */ FANFICTION, /** Website with lots of Mangas */ MANGAFOX, /** Furry website with comics support */ E621, /** CBZ files */ CBZ; /** * A description of this support type (more information than the * {@link BasicSupport#getSourceName()}). * * @return the description */ public String getDesc() { String desc = Instance.getTrans().getStringX(StringId.INPUT_DESC, this.name()); if (desc == null) { desc = Instance.getTrans().getString(StringId.INPUT_DESC, this); } return desc; } /** * The name of this support type (a short version). * * @return the name */ public String getSourceName() { BasicSupport support = BasicSupport.getSupport(this); if (support != null) { return support.getSourceName(); } return null; } @Override public String toString() { return super.toString().toLowerCase(); } /** * Call {@link SupportType#valueOf(String.toUpperCase())}. * * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfUC(String typeName) { return SupportType.valueOf(typeName == null ? null : typeName .toUpperCase()); } /** * Call {@link SupportType#valueOf(String.toUpperCase())} but return * NULL for NULL instead of raising exception. 
* * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfNullOkUC(String typeName) { if (typeName == null) { return null; } return SupportType.valueOfUC(typeName); } /** * Call {@link SupportType#valueOf(String.toUpperCase())} but return * NULL in case of error instead of raising an exception. * * @param typeName * the possible type name * * @return NULL or the type */ public static SupportType valueOfAllOkUC(String typeName) { try { return SupportType.valueOfUC(typeName); } catch (Exception e) { return null; } } } /** Only used by {@link BasicSupport#getInput()} just so it is always reset. */ private InputStream in; private SupportType type; private URL currentReferer; // with on 'r', as in 'HTTP'... // quote chars private char openQuote = Instance.getTrans().getChar( StringId.OPEN_SINGLE_QUOTE); private char closeQuote = Instance.getTrans().getChar( StringId.CLOSE_SINGLE_QUOTE); private char openDoubleQuote = Instance.getTrans().getChar( StringId.OPEN_DOUBLE_QUOTE); private char closeDoubleQuote = Instance.getTrans().getChar( StringId.CLOSE_DOUBLE_QUOTE); /** * The name of this support class. * * @return the name */ protected abstract String getSourceName(); /** * Check if the given resource is supported by this {@link BasicSupport}. * * @param url * the resource to check for * * @return TRUE if it is */ protected abstract boolean supports(URL url); /** * Return TRUE if the support will return HTML encoded content values for * the chapters content. * * @return TRUE for HTML */ protected abstract boolean isHtml(); /** * Return the story title. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the title * * @throws IOException * in case of I/O error */ protected abstract String getTitle(URL source, InputStream in) throws IOException; /** * Return the story author. 
* * @param source * the source of the story * @param in * the input (the main resource) * * @return the author * * @throws IOException * in case of I/O error */ protected abstract String getAuthor(URL source, InputStream in) throws IOException; /** * Return the story publication date. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the date * * @throws IOException * in case of I/O error */ protected abstract String getDate(URL source, InputStream in) throws IOException; /** * Return the subject of the story (for instance, if it is a fanfiction, * what is the original work; if it is a technical text, what is the * technical subject...). * * @param source * the source of the story * @param in * the input (the main resource) * * @return the subject * * @throws IOException * in case of I/O error */ protected abstract String getSubject(URL source, InputStream in) throws IOException; /** * Return the story description. * * @param source * the source of the story * @param in * the input (the main resource) * * @return the description * * @throws IOException * in case of I/O error */ protected abstract String getDesc(URL source, InputStream in) throws IOException; /** * Return the story cover resource if any, or NULL if none. *
* The default cover should not be checked for here.
*
* @param source
* the source of the story
* @param in
* the input (the main resource)
*
* @return the cover or NULL
*
* @throws IOException
* in case of I/O error
*/
protected abstract URL getCover(URL source, InputStream in)
throws IOException;
/**
* Return the list of chapters (name and resource).
*
* @param source
* the source of the story
* @param in
* the input (the main resource)
*
* @return the chapters
*
* @throws IOException
* in case of I/O error
*/
protected abstract List
* You are expected to call the super method implementation if you override
* it.
*
* @return the cookies
*/
public Map
* By default, this is the {@link URL} of the resource.
*
* @param source
* the source of the story
* @param in
* the input (the main resource)
*
* @return the uuid
*
* @throws IOException
* in case of I/O error
*/
protected String getUuid(URL source, InputStream in) throws IOException {
	// The textual (external) form of the resource URL is used as the
	// identifier; the input stream is not consulted here.
	return source.toExternalForm();
}
/**
 * Give the Library UID of the story, i.e., the value (often a number) that
 * identifies the story inside the local library.
 * <p>
 * This base implementation does not know any such identifier and always
 * answers with an empty {@link String}; neither parameter is used here.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the id (empty in this implementation)
 *
 * @throws IOException
 *             in case of I/O error
 */
protected String getLuid(URL source, InputStream in) throws IOException {
	final String noLuid = "";
	return noLuid;
}
/**
 * Give the 2-letter language code of the story.
 * <p>
 * This base implementation always answers 'EN'; neither parameter is used
 * here.
 *
 * @param source
 *            the source of the story
 * @param in
 *            the input (the main resource)
 *
 * @return the language ('EN' in this implementation)
 *
 * @throws IOException
 *             in case of I/O error
 */
protected String getLang(URL source, InputStream in) throws IOException {
	final String defaultLang = "EN";
	return defaultLang;
}
/**
* Return the list of tags for this story.
*
* @param source
* the source of the story
* @param in
* the input (the main resource)
*
* @return the tags
*
* @throws IOException
* in case of I/O error
*/
protected List
* Do not reset the input, which will be pointing at the line just after the
* result (input will be spent if no result is found).
*
* @param in
* the input
* @param needle
* a string that must be found inside the target line (also
* supports "^" at start to say "only if it starts with" the
* needle)
* @param relativeLine
* the line to return based upon the target line position (-1 =
* the line before, 0 = the target line...)
*
* @return the line
*/
protected String getLine(InputStream in, String needle, int relativeLine) {
	// Convenience overload: same search as the 4-argument version, asking
	// for the first matching line.
	final boolean takeFirst = true;
	return getLine(in, needle, relativeLine, takeFirst);
}
/**
* Return a line from the given input which corresponds to the given
* selectors.
*
* Do not reset the input, which will be pointing at the line just after the
* result (input will be spent if no result is found) when first is TRUE,
* and will always be spent if first is FALSE.
*
* @param in
* the input
* @param needle
* a string that must be found inside the target line (also
* supports "^" at start to say "only if it starts with" the
* needle)
* @param relativeLine
* the line to return based upon the target line position (-1 =
* the line before, 0 = the target line...)
* @param first
* takes the first result (as opposed to the last one, which will
* also always spend the input)
*
* @return the line
*/
protected String getLine(InputStream in, String needle, int relativeLine,
boolean first) {
String rep = null;
List
* Will also fix quotes and HTML encoding if needed.
*
* @param line
* the raw line
*
* @return the processed {@link Paragraph}
*/
private Paragraph processPara(String line) {
	// Turns one raw line into a typed Paragraph: the HTML is removed when
	// needed (see ifUnhtml), whitespace characters become plain spaces,
	// quote characters are mapped to the configured open/close quotes, and
	// runs of '-' are collapsed (one dash stays a hyphen, two or more
	// become an em dash). The paragraph type is then derived from what has
	// been seen on the line.
	line = ifUnhtml(line).trim();

	// true until a character other than whitespace or a straight quote
	// (' or ") is met
	boolean space = true;
	// true until a "real" content character is met (dashes and the
	// decoration characters handled below do not count as content)
	boolean brk = true;
	// true once the line has been recognised as a quote/dialogue line
	boolean quote = false;
	// a straight ' was met in a closing position: what to emit for it is
	// decided when the next character is known
	boolean tentativeCloseQuote = false;
	char prev = '\0';
	// number of consecutive '-' not yet written to the builder
	int dashCount = 0;

	StringBuilder builder = new StringBuilder();
	for (char car : line.toCharArray()) {
		if (car != '-') {
			if (dashCount > 0) {
				// end of a run of dashes: a single one is kept as a
				// hyphen, a longer run becomes an em dash
				builder.append(dashCount == 1 ? '-' : '—');
			}
			dashCount = 0;
		}

		if (tentativeCloseQuote) {
			tentativeCloseQuote = false;
			if ((car >= 'a' && car <= 'z') || (car >= 'A' && car <= 'Z')
					|| (car >= '0' && car <= '9')) {
				// followed by a letter or digit: it was an apostrophe
				// ("I'm off")
				builder.append("'");
			} else {
				builder.append(closeQuote);
			}
		}

		switch (car) {
		case '\u00A0': // non-breaking space, written as an escape so it
						// cannot be confused with the plain space below
		case ' ':
		case '\t':
		case '\n': // just in case
		case '\r': // just in case
			builder.append(' ');
			break;

		case '\'':
			if (space || (brk && quote)) {
				quote = true;
				builder.append(openQuote);
			} else if (prev == ' ') {
				builder.append(openQuote);
			} else {
				// it is a quote ("I'm off") or a 'quote' ("This
				// 'good' restaurant"...)
				tentativeCloseQuote = true;
			}
			break;

		case '"':
			if (space || (brk && quote)) {
				quote = true;
				builder.append(openDoubleQuote);
			} else if (prev == ' ') {
				builder.append(openDoubleQuote);
			} else {
				builder.append(closeDoubleQuote);
			}
			break;

		case '-':
			if (space) {
				// a dash opening the line marks a quote line and is not
				// copied to the output
				quote = true;
			} else {
				dashCount++;
			}
			space = false;
			break;

		case '*':
		case '~':
		case '/':
		case '\\':
		case '<':
		case '>':
		case '=':
		case '+':
		case '_':
		case '–':
		case '—':
			// decoration characters: copied as-is, but they do not turn
			// a break line into a normal one
			space = false;
			builder.append(car);
			break;

		case '‘':
		case '`':
		case '‹':
		case '﹁':
		case '〈':
		case '「':
			if (space || (brk && quote)) {
				quote = true;
			}
			builder.append(openQuote);
			space = false;
			brk = false;
			break;

		case '’':
		case '›':
		case '﹂':
		case '〉':
		case '」':
			space = false;
			brk = false;
			builder.append(closeQuote);
			break;

		case '«':
		case '“':
		case '﹃':
		case '《':
		case '『':
			if (space || (brk && quote)) {
				quote = true;
			}
			builder.append(openDoubleQuote);
			space = false;
			brk = false;
			break;

		case '»':
		case '”':
		case '﹄':
		case '》':
		case '』':
			space = false;
			brk = false;
			builder.append(closeDoubleQuote);
			break;

		default:
			space = false;
			brk = false;
			builder.append(car);
			break;
		}

		prev = car;
	}

	// A run of dashes ending the line is still pending here, since the
	// loop only writes a run when the following character arrives; without
	// this, "Wait--" would silently lose its dashes. Lines still flagged
	// as breaks keep their previous output.
	if (dashCount > 0 && !brk) {
		builder.append(dashCount == 1 ? '-' : '—');
	}

	// A straight ' ending the line can only be a closing quote
	if (tentativeCloseQuote) {
		tentativeCloseQuote = false;
		builder.append(closeQuote);
	}

	line = builder.toString().trim();

	ParagraphType type = ParagraphType.NORMAL;
	if (space) {
		type = ParagraphType.BLANK;
	} else if (brk) {
		type = ParagraphType.BREAK;
	} else if (quote) {
		type = ParagraphType.QUOTE;
	}

	return new Paragraph(type, line);
}
/**
 * Strip the HTML from the input, but only when
 * {@link BasicSupport#isHtml()} is true.
 * <p>
 * A NULL input is returned as is.
 *
 * @param input
 *            the input
 *
 * @return the input without HTML if this support yields HTML, the
 *         untouched input otherwise
 */
private String ifUnhtml(String input) {
	boolean mustConvert = isHtml() && input != null;
	return mustConvert ? StringUtils.unhtml(input) : input;
}
/**
 * Look for a {@link BasicSupport} implementation able to handle the given
 * resource.
 * <p>
 * Every type except {@link SupportType#TEXT} and
 * {@link SupportType#INFO_TEXT} is tried first, in declaration order; those
 * two are only tried afterwards, in that order. The first implementation
 * whose {@link BasicSupport#supports(URL)} accepts the resource wins.
 *
 * @param url
 *            the story resource (NULL is accepted and yields NULL)
 *
 * @return an implementation that supports it, or NULL
 */
public static BasicSupport getSupport(URL url) {
	if (url == null) {
		return null;
	}

	// TEXT and INFO_TEXT always support files (not URLs though), so they
	// are queued after all the other types
	List<SupportType> candidates = new ArrayList<SupportType>();
	for (SupportType candidate : SupportType.values()) {
		boolean textBased = candidate == SupportType.TEXT
				|| candidate == SupportType.INFO_TEXT;
		if (!textBased) {
			candidates.add(candidate);
		}
	}
	candidates.add(SupportType.TEXT);
	candidates.add(SupportType.INFO_TEXT);

	for (SupportType candidate : candidates) {
		BasicSupport handler = getSupport(candidate);
		if (handler == null) {
			continue;
		}

		if (handler.supports(url)) {
			return handler;
		}
	}

	return null;
}
/**
 * Return a new {@link BasicSupport} implementation for the given type.
 * <p>
 * Each call creates a fresh instance and sets the requested type on it.
 *
 * @param type
 *            the type (NULL is accepted and yields NULL)
 *
 * @return an implementation that supports it, or NULL
 */
public static BasicSupport getSupport(SupportType type) {
	// Switching on a NULL enum value throws a NullPointerException, while
	// the contract of this method is to answer NULL when there is no
	// matching implementation
	if (type == null) {
		return null;
	}

	switch (type) {
	case EPUB:
		return new Epub().setType(type);
	case INFO_TEXT:
		return new InfoText().setType(type);
	case FIMFICTION:
		return new Fimfiction().setType(type);
	case FANFICTION:
		return new Fanfiction().setType(type);
	case TEXT:
		return new Text().setType(type);
	case MANGAFOX:
		return new MangaFox().setType(type);
	case E621:
		return new E621().setType(type);
	case CBZ:
		return new Cbz().setType(type);
	}

	return null;
}
}
processing:
content = content.replaceAll("(
]*>)|(
)|(
)",
"\n* * *\n");
}
InputStream in = new ByteArrayInputStream(
content.getBytes(StandardCharsets.UTF_8));
try {
@SuppressWarnings("resource")
Scanner scan = new Scanner(in, "UTF-8");
scan.useDelimiter("(\\n|