e621: improve title search for pools
[fanfix.git] / supported / E621.java
1 package be.nikiroo.fanfix.supported;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.UnsupportedEncodingException;
6 import java.net.MalformedURLException;
7 import java.net.URL;
8 import java.net.URLDecoder;
9 import java.util.AbstractMap;
10 import java.util.ArrayList;
11 import java.util.Collections;
12 import java.util.Date;
13 import java.util.LinkedList;
14 import java.util.List;
15 import java.util.Map.Entry;
16
17 import org.jsoup.helper.DataUtil;
18 import org.jsoup.nodes.Document;
19 import org.jsoup.nodes.Element;
20 import org.jsoup.select.Elements;
21
22 import be.nikiroo.fanfix.Instance;
23 import be.nikiroo.fanfix.data.MetaData;
24 import be.nikiroo.utils.IOUtils;
25 import be.nikiroo.utils.Image;
26 import be.nikiroo.utils.Progress;
27 import be.nikiroo.utils.StringUtils;
28
29 /**
30 * Support class for <a href="http://e621.net/">e621.net</a> and
31 * <a href="http://e926.net/">e926.net</a>, a Furry website supporting comics,
32 * including some of MLP.
33 * <p>
34 * <a href="http://e926.net/">e926.net</a> only shows the "clean" images and
35 * comics, but it can be difficult to browse.
36 *
37 * @author niki
38 */
39 class E621 extends BasicSupport {
40 @Override
41 protected boolean supports(URL url) {
42 String host = url.getHost();
43 if (host.startsWith("www.")) {
44 host = host.substring("www.".length());
45 }
46
47 return ("e621.net".equals(host) || "e926.net".equals(host)) && (isPool(url) || isSearchOrSet(url));
48 }
49
50 @Override
51 protected boolean isHtml() {
52 return true;
53 }
54
55 @Override
56 protected MetaData getMeta() throws IOException {
57 MetaData meta = new MetaData();
58
59 meta.setTitle(getTitle());
60 meta.setAuthor(getAuthor());
61 meta.setDate("");
62 meta.setTags(getTags());
63 meta.setSource(getType().getSourceName());
64 meta.setUrl(getSource().toString());
65 meta.setPublisher(getType().getSourceName());
66 meta.setUuid(getSource().toString());
67 meta.setLuid("");
68 meta.setLang("en");
69 meta.setSubject("Furry");
70 meta.setType(getType().toString());
71 meta.setImageDocument(true);
72 meta.setCover(getCover());
73 meta.setFakeCover(true);
74
75 return meta;
76 }
77
78 @Override
79 protected String getDesc() throws IOException {
80 if (isSearchOrSet(getSource())) {
81 StringBuilder builder = new StringBuilder();
82 builder.append("A collection of images from ").append(getSource().getHost()).append("\n") //
83 .append("\tTime of creation: " + StringUtils.fromTime(new Date().getTime())).append("\n") //
84 .append("\tTags: ");//
85 for (String tag : getTags()) {
86 builder.append("\t\t").append(tag);
87 }
88
89 return builder.toString();
90 }
91
92 if (isPool(getSource())) {
93 Element el = getSourceNode().getElementById("description");
94 if (el != null) {
95 return el.text();
96 }
97 }
98
99 return null;
100 }
101
102 @Override
103 protected List<Entry<String, URL>> getChapters(Progress pg) throws IOException {
104 List<Entry<String, URL>> chapters = new LinkedList<Entry<String, URL>>();
105
106 if (isPool(getSource())) {
107 String baseUrl = "https://e621.net/" + getSource().getPath() + "?page=";
108 chapters = getChapters(getSource(), pg, baseUrl, "");
109 } else if (isSearchOrSet(getSource())) {
110 String baseUrl = "https://e621.net/posts/?page=";
111 String search = "&tags=" + getTagsFromUrl(getSource());
112
113 chapters = getChapters(getSource(), pg,
114 baseUrl, search);
115 }
116
117 // sets and some pools are sorted in reverse order on the website
118 if (getSource().getPath().startsWith("/posts")) {
119 Collections.reverse(chapters);
120 }
121
122 return chapters;
123 }
124
125 private List<Entry<String, URL>> getChapters(URL source, Progress pg, String baseUrl, String parameters)
126 throws IOException {
127 List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
128
129 if (source.getHost().contains("e926")) {
130 baseUrl = baseUrl.replace("e621", "e926");
131 }
132
133 for (int i = 1; true; i++) {
134 URL url = new URL(baseUrl + i + parameters);
135 try {
136 InputStream pageI = Instance.getInstance().getCache().open(url, this, false);
137 try {
138 if (IOUtils.readSmallStream(pageI).contains("Nobody here but us chickens!")) {
139 break;
140 }
141 urls.add(new AbstractMap.SimpleEntry<String, URL>("Page " + Integer.toString(i), url));
142 } finally {
143 pageI.close();
144 }
145 } catch (Exception e) {
146 break;
147 }
148 }
149
150 return urls;
151 }
152
153 @Override
154 protected String getChapterContent(URL chapUrl, int number, Progress pg) throws IOException {
155 StringBuilder builder = new StringBuilder();
156 Document chapterNode = loadDocument(chapUrl);
157
158 Elements articles = chapterNode.getElementsByTag("article");
159
160 // sets and some pools are sorted in reverse order on the website
161 if (getSource().getPath().startsWith("/posts")) {
162 Collections.reverse(articles);
163 }
164
165 for (Element el : articles) {
166 builder.append("[");
167 builder.append(el.attr("data-file-url"));
168 builder.append("]<br/>");
169 }
170
171 return builder.toString();
172 }
173
174 @Override
175 protected URL getCanonicalUrl(URL source) {
176 // Convert search-pools into proper pools
177 if (source.getPath().equals("/posts") && source.getQuery() != null
178 && source.getQuery().startsWith("tags=pool%3A")) {
179 String poolNumber = source.getQuery()
180 .substring("tags=pool%3A".length());
181 try {
182 Integer.parseInt(poolNumber);
183 String base = source.getProtocol() + "://" + source.getHost();
184 if (source.getPort() != -1) {
185 base = base + ":" + source.getPort();
186 }
187 source = new URL(base + "/pools/" + poolNumber);
188 } catch (NumberFormatException e) {
189 // Not a simple pool, skip
190 } catch (MalformedURLException e) {
191 // Cannot happen
192 }
193 }
194
195 if (isSetOriginalUrl(source)) {
196 try {
197 Document doc = DataUtil.load(Instance.getInstance().getCache().open(source, this, false), "UTF-8", source.toString());
198 for (Element shortname : doc.getElementsByClass("set-shortname")) {
199 for (Element el : shortname.getElementsByTag("a")) {
200 if (!el.attr("href").isEmpty())
201 return new URL(el.absUrl("href"));
202 }
203 }
204 } catch (IOException e) {
205 Instance.getInstance().getTraceHandler().error(e);
206 }
207 }
208
209 if (isPool(source)) {
210 try {
211 return new URL(source.toString().replace("/pool/show/", "/pools/"));
212 } catch (MalformedURLException e) {
213 }
214 }
215
216 return super.getCanonicalUrl(source);
217 }
218
219 // returns "xxx+ddd+ggg" if "tags=xxx+ddd+ggg" was present in the query
220 private String getTagsFromUrl(URL url) {
221 String tags = url == null ? "" : url.getQuery();
222 int pos = tags.indexOf("tags=");
223
224 if (pos >= 0) {
225 tags = tags.substring(pos).substring("tags=".length());
226 } else {
227 return "";
228 }
229
230 pos = tags.indexOf('&');
231 if (pos > 0) {
232 tags = tags.substring(0, pos);
233 }
234 pos = tags.indexOf('/');
235 if (pos > 0) {
236 tags = tags.substring(0, pos);
237 }
238
239 return tags;
240 }
241
242 private String getTitle() {
243 String title = "";
244
245 Element el = getSourceNode().getElementsByTag("title").first();
246 if (el != null) {
247 title = el.text().trim();
248 }
249
250 for (String s : new String[] { "e621", "-", "e621", "Pool", "-" }) {
251 if (title.startsWith(s)) {
252 title = title.substring(s.length()).trim();
253 }
254 if (title.endsWith(s)) {
255 title = title.substring(0, title.length() - s.length()).trim();
256 }
257 }
258
259 if (isSearchOrSet(getSource())) {
260 title = title.isEmpty() ? "e621" : "[e621] " + title;
261 }
262
263 return title;
264 }
265
266 private String getAuthor() throws IOException {
267 StringBuilder builder = new StringBuilder();
268
269 if (isSearchOrSet(getSource())) {
270 for (Element el : getSourceNode().getElementsByClass("search-tag")) {
271 if (el.attr("itemprop").equals("author")) {
272 if (builder.length() > 0) {
273 builder.append(", ");
274 }
275 builder.append(el.text().trim());
276 }
277 }
278 }
279
280 if (isPool(getSource())) {
281 String desc = getDesc();
282 String descL = desc.toLowerCase();
283
284 if (descL.startsWith("by:") || descL.startsWith("by ")) {
285 desc = desc.substring(3).trim();
286 desc = desc.split("\n")[0];
287
288 String tab[] = desc.split(" ");
289 for (int i = 0; i < Math.min(tab.length, 5); i++) {
290 if (tab[i].startsWith("http"))
291 break;
292 builder.append(" ").append(tab[i]);
293 }
294 }
295
296 if (builder.length() == 0) {
297 String url = "https://e621.net/" + getSource().getPath()
298 + "?page=1";
299 Document page1 = DataUtil.load(Instance.getInstance().getCache()
300 .open(getSource(), this, false), "UTF-8",
301 url.toString());
302 for (Element el : page1.getElementsByClass("search-tag")) {
303 if (el.attr("itemprop").equals("author")) {
304 if (builder.length() > 0) {
305 builder.append(", ");
306 }
307 builder.append(el.text().trim());
308 }
309 }
310 }
311 }
312
313 return builder.toString();
314 }
315
316 // no tags for pools
317 private List<String> getTags() {
318 List<String> tags = new ArrayList<String>();
319 if (isSearchOrSet(getSource())) {
320 String str = getTagsFromUrl(getSource());
321 for (String tag : str.split("\\+")) {
322 try {
323 tags.add(URLDecoder.decode(tag.trim(), "UTF-8").trim());
324 } catch (UnsupportedEncodingException e) {
325 }
326 }
327 }
328
329 return tags;
330 }
331
332 private Image getCover() throws IOException {
333 Image image = null;
334 List<Entry<String, URL>> chapters = getChapters(null);
335 if (!chapters.isEmpty()) {
336 URL chap1Url = chapters.get(0).getValue();
337 String imgsChap1 = getChapterContent(chap1Url, 1, null);
338 if (!imgsChap1.isEmpty()) {
339 imgsChap1 = imgsChap1.split("]")[0].substring(1).trim();
340 image = bsImages.getImage(this, new URL(imgsChap1));
341 }
342 }
343
344 return image;
345 }
346
347 // note: will be removed at getCanonicalUrl()
348 private boolean isSetOriginalUrl(URL originalUrl) {
349 return originalUrl.getPath().startsWith("/post_sets/");
350 }
351
352 private boolean isPool(URL url) {
353 return url.getPath().startsWith("/pools/") || url.getPath().startsWith("/pool/show/");
354 }
355
356 // set will be renamed into search by canonical url
357 private boolean isSearchOrSet(URL url) {
358 return
359 // search:
360 (url.getPath().equals("/posts") && url.getQuery().contains("tags="))
361 // or set:
362 || isSetOriginalUrl(url);
363 }
364 }