X-Git-Url: http://git.nikiroo.be/?p=nikiroo-utils.git;a=blobdiff_plain;f=supported%2FE621.java;h=f1660e18cb34a9da6373ac03e6918ccc630ff25f;hp=dfa9e5ed6a60e4694fc8494dcf196b4328259ee6;hb=2e1300b9580ae1b6dded5a734c617a66a116c16d;hpb=669a62833b4458bad0772debdd06921080500221

diff --git a/supported/E621.java b/supported/E621.java
index dfa9e5e..f1660e1 100644
--- a/supported/E621.java
+++ b/supported/E621.java
@@ -9,22 +9,25 @@ import java.net.URLDecoder;
 import java.util.AbstractMap;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Date;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map.Entry;
-import java.util.Scanner;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 
 import be.nikiroo.fanfix.Instance;
-import be.nikiroo.fanfix.data.Chapter;
 import be.nikiroo.fanfix.data.MetaData;
-import be.nikiroo.fanfix.data.Story;
+import be.nikiroo.utils.IOUtils;
 import be.nikiroo.utils.Image;
 import be.nikiroo.utils.Progress;
 import be.nikiroo.utils.StringUtils;
 
 /**
- * Support class for <a href="http://e621.net/">e621.net</a> and <a
- * href="http://e926.net/">e926.net</a>, a Furry website supporting comics,
+ * Support class for <a href="http://e621.net/">e621.net</a> and
+ * <a href="http://e926.net/">e926.net</a>, a Furry website supporting comics,
  * including some of MLP.
  * <p>
  * <a href="http://e926.net/">e926.net</a> only shows the "clean" images and
@@ -32,335 +35,226 @@ import be.nikiroo.utils.StringUtils;
  * 
  * @author niki
  */
-class E621 extends BasicSupport_Deprecated {
+class E621 extends BasicSupport {
+	@Override
+	protected boolean supports(URL url) {
+		// the "www." prefix is optional, compare on the bare host name
+		String name = url.getHost();
+		String bare = name.startsWith("www.") ? name.substring("www.".length()) : name;
+		boolean knownSite = "e621.net".equals(bare) || "e926.net".equals(bare);
+
+		return knownSite && (isPool(url) || isSearchOrSet(url));
+	}
+
 	@Override
-	protected MetaData getMeta(URL source, InputStream in) throws IOException {
+	protected boolean isHtml() {
+		return true;
+	}
+
+	@Override
+	protected MetaData getMeta() throws IOException {
 		MetaData meta = new MetaData();
 
-		meta.setTitle(getTitle(reset(in)));
-		meta.setAuthor(getAuthor(source, reset(in)));
+		meta.setTitle(getTitle());
+		meta.setAuthor(getAuthor());
 		meta.setDate("");
-		meta.setTags(getTags(source, reset(in), false));
+		meta.setTags(getTags());
 		meta.setSource(getType().getSourceName());
-		meta.setUrl(source.toString());
+		meta.setUrl(getSource().toString());
 		meta.setPublisher(getType().getSourceName());
-		meta.setUuid(source.toString());
+		meta.setUuid(getSource().toString());
 		meta.setLuid("");
 		meta.setLang("en");
 		meta.setSubject("Furry");
 		meta.setType(getType().toString());
 		meta.setImageDocument(true);
-		meta.setCover(getCover(source, reset(in)));
+		meta.setCover(getCover());
 		meta.setFakeCover(true);
 
 		return meta;
 	}
 
-	private List<String> getTags(URL source, InputStream in, boolean authors) {
-		List<String> tags = new ArrayList<String>();
-
-		if (isSearch(source)) {
-			String tagLine = getLine(in, "id=\"tag-sidebar\"", 1);
-			if (tagLine != null) {
-				String key = "href=\"";
-				for (int pos = tagLine.indexOf(key); pos >= 0; pos = tagLine
-						.indexOf(key, pos + 1)) {
-					int end = tagLine.indexOf("\"", pos + key.length());
-					if (end >= 0) {
-						String href = tagLine.substring(pos, end);
-						String subkey;
-						if (authors)
-							subkey = "?name=";
-						else
-							subkey = "?title=";
-						if (href.contains(subkey)) {
-							String tag = href.substring(href.indexOf(subkey)
-									+ subkey.length());
-							try {
-								tags.add(URLDecoder.decode(tag, "UTF-8"));
-							} catch (UnsupportedEncodingException e) {
-								// supported JVMs must have UTF-8 support
-								e.printStackTrace();
-							}
-						}
-					}
-				}
-
-			}
-		}
-
-		return tags;
-	}
-
 	@Override
-	public Story process(URL url, Progress pg) throws IOException {
-		// There is no chapters on e621, just pagination...
-		Story story = super.process(url, pg);
+	protected String getDesc() throws IOException {
+		if (isSearchOrSet(getSource())) {
+			StringBuilder builder = new StringBuilder();
+			builder.append("A collection of images from ").append(getSource().getHost()).append("\n") //
+					.append("\tTime of creation: " + StringUtils.fromTime(new Date().getTime())).append("\n") //
+					.append("\tTags: ");//
+			for (String tag : getTags()) {
+				builder.append("\t\t").append(tag);
+			}
 
-		Chapter only = new Chapter(1, null);
-		for (Chapter chap : story) {
-			only.getParagraphs().addAll(chap.getParagraphs());
+			return builder.toString();
 		}
 
-		story.getChapters().clear();
-		story.getChapters().add(only);
-
-		return story;
-	}
-
-	@Override
-	protected boolean supports(URL url) {
-		String host = url.getHost();
-		if (host.startsWith("www.")) {
-			host = host.substring("www.".length());
+		if (isPool(getSource())) {
+			Element el = getSourceNode().getElementById("description");
+			if (el != null) {
+				return el.text();
+			}
 		}
 
-		return ("e621.net".equals(host) || "e926.net".equals(host))
-				&& (isPool(url) || isSearch(url));
+		return null;
 	}
 
 	@Override
-	protected boolean isHtml() {
-		return true;
-	}
-
-	private Image getCover(URL source, InputStream in) throws IOException {
-		URL urlForCover = source;
-		if (isPool(source)) {
-			urlForCover = new URL(source.toString() + "?page=1");
-		}
-
-		String images = getChapterContent(urlForCover, in, 1, null);
-		if (!images.isEmpty()) {
-			int pos = images.indexOf("<br/>");
-			if (pos >= 0) {
-				images = images.substring(1, pos - 1);
-				return getImage(this, null, images);
-			}
+	protected List<Entry<String, URL>> getChapters(Progress pg) throws IOException {
+		if (isPool(getSource())) {
+			String baseUrl = "https://e621.net" + getSource().getPath() + "?page="; // getPath() already starts with "/"
+			return getChapters(getSource(), pg, baseUrl, "");
+		} else if (isSearchOrSet(getSource())) {
+			String baseUrl = "https://e621.net/posts/?page=";
+			String search = "&tags=" + getTagsFromUrl(getSource());
+			return getChapters(getSource(), pg, baseUrl, search);
 		}
 
-		return null;
+		return new LinkedList<Entry<String, URL>>();
 	}
 
-	private String getAuthor(URL source, InputStream in) {
-		if (isSearch(source)) {
-			StringBuilder builder = new StringBuilder();
-			for (String author : getTags(source, in, true)) {
-				if (builder.length() > 0)
-					builder.append(", ");
-				builder.append(author);
-			}
+	private List<Entry<String, URL>> getChapters(URL source, Progress pg, String baseUrl, String parameters)
+			throws IOException {
+		List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
 
-			return builder.toString();
+		if (source.getHost().contains("e926")) {
+			baseUrl = baseUrl.replace("e621", "e926");
 		}
 
-		String author = getLine(in, "href=\"/post/show/", 0);
-		if (author != null) {
-			String key = "href=\"";
-			int pos = author.indexOf(key);
-			if (pos >= 0) {
-				author = author.substring(pos + key.length());
-				pos = author.indexOf("\"");
-				if (pos >= 0) {
-					author = author.substring(0, pos - 1);
-					String page = source.getProtocol() + "://"
-							+ source.getHost() + author;
-					try {
-						InputStream pageIn = Instance.getCache().open(
-								new URL(page), this, false);
-						try {
-							key = "class=\"tag-type-artist\"";
-							author = getLine(pageIn, key, 0);
-							if (author != null) {
-								pos = author.indexOf("<a href=\"");
-								if (pos >= 0) {
-									author = author.substring(pos);
-									pos = author.indexOf("</a>");
-									if (pos >= 0) {
-										author = author.substring(0, pos);
-										return StringUtils.unhtml(author);
-									}
-								}
-							}
-						} finally {
-							pageIn.close();
-						}
-					} catch (Exception e) {
-						// No author found
+		for (int i = 1; true; i++) {
+			URL url = new URL(baseUrl + i + parameters);
+			try {
+				InputStream pageI = Instance.getInstance().getCache().open(url, this, false);
+				try {
+					if (IOUtils.readSmallStream(pageI).contains("Nobody here but us chickens!")) {
+						break;
 					}
+					urls.add(new AbstractMap.SimpleEntry<String, URL>("Page " + Integer.toString(i), url));
+				} finally {
+					pageI.close();
 				}
+			} catch (Exception e) {
+				break;
 			}
 		}
 
-		return null;
+		// They are sorted in reverse order on the website
+		Collections.reverse(urls);
+		return urls;
 	}
 
-	private String getTitle(InputStream in) {
-		String title = getLine(in, "<title>", 0);
-		if (title != null) {
-			int pos = title.indexOf('>');
-			if (pos >= 0) {
-				title = title.substring(pos + 1);
-				pos = title.indexOf('<');
-				if (pos >= 0) {
-					title = title.substring(0, pos);
-				}
-			}
-
-			if (title.startsWith("Pool:")) {
-				title = title.substring("Pool:".length());
-			}
-
-			title = StringUtils.unhtml(title).trim();
+	@Override
+	protected String getChapterContent(URL chapUrl, int number, Progress pg) throws IOException {
+		StringBuilder builder = new StringBuilder();
+		for (Element el : loadDocument(chapUrl).getElementsByTag("article")) {
+			String fileUrl = el.attr("data-file-url"); // "" when the attribute is missing
+			if (!fileUrl.isEmpty()) {
+				builder.append("[").append(fileUrl).append("]<br/>");
+			}
 		}
 
-		return title;
+		return builder.toString();
 	}
 
 	@Override
-	protected String getDesc(URL source, InputStream in) throws IOException {
-		String desc = getLine(in, "margin-bottom: 2em;", 0);
-
-		if (desc != null) {
-			StringBuilder builder = new StringBuilder();
-
-			boolean inTags = false;
-			for (char car : desc.toCharArray()) {
-				if ((inTags && car == '>') || (!inTags && car == '<')) {
-					inTags = !inTags;
-				}
-
-				if (inTags) {
-					builder.append(car);
+	protected URL getCanonicalUrl(URL source) {
+		if (isSetOriginalUrl(source)) {
+			try {
+				Document doc = DataUtil.load(Instance.getInstance().getCache().open(source, this, false), "UTF-8", source.toString());
+				for (Element shortname : doc.getElementsByClass("set-shortname")) {
+					for (Element el : shortname.getElementsByTag("a")) {
+						if (!el.attr("href").isEmpty())
+							return new URL(el.absUrl("href"));
+					}
 				}
+			} catch (IOException e) {
+				Instance.getInstance().getTraceHandler().error(e);
 			}
-
-			return builder.toString().trim();
 		}
 
-		return null;
-	}
-
-	@Override
-	protected List<Entry<String, URL>> getChapters(URL source, InputStream in,
-			Progress pg) throws IOException {
 		if (isPool(source)) {
-			return getChaptersPool(source, in, pg);
-		} else if (isSearch(source)) {
-			return getChaptersSearch(source, in, pg);
+			try {
+				return new URL(source.toString().replace("/pool/show/", "/pools/"));
+			} catch (MalformedURLException e) {
+			}
 		}
 
-		return new LinkedList<Entry<String, URL>>();
+		return super.getCanonicalUrl(source);
 	}
 
-	private List<Entry<String, URL>> getChaptersSearch(URL source,
-			InputStream in, Progress pg) throws IOException {
-		List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
+	// returns "xxx+ddd+ggg" if "tags=xxx+ddd+ggg" was present in the query, "" otherwise
+	private String getTagsFromUrl(URL url) {
+		String tags = (url == null || url.getQuery() == null) ? "" : url.getQuery(); // getQuery() is null without "?..."
+		int pos = tags.indexOf("tags=");
 
-		String search = source.getPath();
-		if (search.endsWith("/")) {
-			search = search.substring(0, search.length() - 1);
+		if (pos >= 0) {
+			tags = tags.substring(pos).substring("tags=".length());
+		} else {
+			return "";
 		}
 
-		int pos = search.lastIndexOf('/');
-		if (pos >= 0) {
-			search = search.substring(pos + 1);
+		pos = tags.indexOf('&'); // next query parameter, possibly right after an empty "tags="
+		if (pos >= 0) {
+			tags = tags.substring(0, pos);
+		}
+		pos = tags.indexOf('/');
+		if (pos >= 0) {
+			tags = tags.substring(0, pos);
 		}
 
-		String baseUrl = "https://e621.net/post/index/";
-		if (source.getHost().contains("e926")) {
-			baseUrl = baseUrl.replace("e621", "e926");
+		return tags;
+	}
+
+	private String getTitle() {
+		String title = "";
+
+		Element el = getSourceNode().getElementsByTag("title").first();
+		if (el != null) {
+			title = el.text().trim();
 		}
 
-		for (int i = 1; true; i++) {
-			URL url = new URL(baseUrl + i + "/" + search + "/");
-			try {
-				InputStream pageI = Instance.getCache().open(url, this, false);
-				try {
-					if (getLine(pageI, "No posts matched your search.", 0) != null)
-						break;
-					urls.add(new AbstractMap.SimpleEntry<String, URL>("Page "
-							+ Integer.toString(i), url));
-				} finally {
-					pageI.close();
-				}
-			} catch (Exception e) {
-				break;
+		for (String s : new String[] { "e621", "-", "e621" }) {
+			if (title.startsWith(s)) {
+				title = title.substring(s.length()).trim();
+			}
+			if (title.endsWith(s)) {
+				title = title.substring(0, title.length() - s.length()).trim();
 			}
+
 		}
 
-		// They are sorted in reverse order on the website
-		Collections.reverse(urls);
-		return urls;
+		if (isSearchOrSet(getSource())) {
+			title = title.isEmpty() ? "e621" : "[e621] " + title;
+		}
+		return title;
 	}
 
-	private List<Entry<String, URL>> getChaptersPool(URL source,
-			InputStream in, Progress pg) throws IOException {
-		List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
-		int last = 1; // no pool/show when only one page
-
-		@SuppressWarnings("resource")
-		Scanner scan = new Scanner(in, "UTF-8");
-		scan.useDelimiter("\\n");
-		while (scan.hasNext()) {
-			String line = scan.next();
-			for (int pos = line.indexOf(source.getPath()); pos >= 0; pos = line
-					.indexOf(source.getPath(), pos + source.getPath().length())) {
-				int equalPos = line.indexOf("=", pos);
-				int quotePos = line.indexOf("\"", pos);
-				if (equalPos >= 0 && quotePos > equalPos) {
-					String snum = line.substring(equalPos + 1, quotePos);
-					try {
-						int num = Integer.parseInt(snum);
-						if (num > last) {
-							last = num;
-						}
-					} catch (NumberFormatException e) {
+	private String getAuthor() throws IOException {
+		StringBuilder builder = new StringBuilder();
+
+		if (isSearchOrSet(getSource())) {
+			for (Element el : getSourceNode().getElementsByClass("search-tag")) {
+				if (el.attr("itemprop").equals("author")) {
+					if (builder.length() > 0) {
+						builder.append(", ");
 					}
+					builder.append(el.text().trim());
 				}
 			}
 		}
 
-		for (int i = 1; i <= last; i++) {
-			urls.add(new AbstractMap.SimpleEntry<String, URL>(Integer
-					.toString(i), new URL(source.toString() + "?page=" + i)));
-		}
+		if (isPool(getSource())) {
+			String desc = getDesc(); // null when the pool page has no description
+			String descL = desc == null ? "" : desc.toLowerCase();
 
-		return urls;
-	}
-
-	@Override
-	protected String getChapterContent(URL source, InputStream in, int number,
-			Progress pg) throws IOException {
-		StringBuilder builder = new StringBuilder();
-		String staticSite = "https://static1.e621.net";
-		if (source.getHost().contains("e926")) {
-			staticSite = staticSite.replace("e621", "e926");
-		}
+			if (descL.startsWith("by:") || descL.startsWith("by ")) {
+				desc = desc.substring(3).trim();
+				desc = desc.split("\n")[0];
 
-		String key = staticSite + "/data/preview/";
-
-		@SuppressWarnings("resource")
-		Scanner scan = new Scanner(in, "UTF-8");
-		scan.useDelimiter("\\n");
-		while (scan.hasNext()) {
-			String line = scan.next();
-			if (line.contains("class=\"preview")) {
-				for (int pos = line.indexOf(key); pos >= 0; pos = line.indexOf(
-						key, pos + key.length())) {
-					int endPos = line.indexOf("\"", pos);
-					if (endPos >= 0) {
-						String id = line.substring(pos + key.length(), endPos);
-						id = staticSite + "/data/" + id;
-
-						int dotPos = id.lastIndexOf(".");
-						if (dotPos >= 0) {
-							id = id.substring(0, dotPos);
-							builder.append("[");
-							builder.append(id);
-							builder.append("]<br/>");
-						}
-					}
+				String tab[] = desc.split(" "); // at most the 5 first words, up to a link
+				for (int i = 0; i < Math.min(tab.length, 5); i++) {
+					if (tab[i].startsWith("http"))
+						break;
+					builder.append(builder.length() > 0 ? " " : "").append(tab[i]); // no leading space
 				}
 			}
 		}
@@ -368,42 +262,52 @@ class E621 extends BasicSupport_Deprecated {
 		return builder.toString();
 	}
 
-	@Override
-	protected URL getCanonicalUrl(URL source) {
-		if (isSearch(source)) {
-			// /post?tags=tag1+tag2 -> ../post/index/1/tag1%32tag2
-			String key = "?tags=";
-			if (source.toString().contains(key)) {
-				int pos = source.toString().indexOf(key);
-				String tags = source.toString().substring(pos + key.length());
-				tags = tags.replace("+", "%20");
-
-				String base = source.toString().substring(0, pos);
-				if (!base.endsWith("/")) {
-					base += "/";
-				}
-				if (base.endsWith("/search/")) {
-					base = base.substring(0, base.indexOf("/search/") + 1);
-				}
-
+	// no tags for pools
+	private List<String> getTags() {
+		List<String> tags = new ArrayList<String>();
+		if (isSearchOrSet(getSource())) {
+			String str = getTagsFromUrl(getSource());
+			for (String tag : str.split("\\+")) {
 				try {
-					return new URL(base + "index/1/" + tags);
-				} catch (MalformedURLException e) {
-					Instance.getTraceHandler().error(e);
+					tags.add(URLDecoder.decode(tag.trim(), "UTF-8").trim());
+				} catch (UnsupportedEncodingException e) {
 				}
 			}
 		}
 
-		return super.getCanonicalUrl(source);
+		return tags;
+	}
+
+	private Image getCover() throws IOException {
+		List<Entry<String, URL>> pages = getChapters(null);
+		if (pages.isEmpty()) {
+			return null;
+		}
+
+		// a page content is a list of "[url]<br/>", keep the first url only
+		String content = getChapterContent(pages.get(0).getValue(), 1, null);
+		if (content.isEmpty()) {
+			return null;
+		}
+		String firstUrl = content.split("]")[0].substring(1).trim();
+		return bsImages.getImage(this, new URL(firstUrl));
+	}
+
+	// true for an original set URL (path under "/post_sets/"); note: will be removed at getCanonicalUrl()
+	private boolean isSetOriginalUrl(URL originalUrl) {
+		return originalUrl.getPath().startsWith("/post_sets/");
 	}
 
 	private boolean isPool(URL url) {
-		return url.getPath().startsWith("/pool/");
+		return url.getPath().startsWith("/pools/") || url.getPath().startsWith("/pool/show/"); // getCanonicalUrl() rewrites "/pool/show/" into "/pools/"
 	}
 
-	private boolean isSearch(URL url) {
-		return url.getPath().startsWith("/post/index/")
-				|| (url.getPath().equals("/post/search") && url.getQuery()
-						.startsWith("tags="));
+	// set will be renamed into search by canonical url; getQuery() is null
+	// when the URL has no query part, so it is checked before being used
+	private boolean isSearchOrSet(URL url) {
+		String query = url.getQuery();
+		boolean search = url.getPath().equals("/posts") && query != null && query.contains("tags=");
+
+		return search || isSetOriginalUrl(url);
 	}
 }