e621: fix detection of "?tags" urls
[nikiroo-utils.git] / src / be / nikiroo / fanfix / supported / E621.java
CommitLineData
08fe2e33
NR
1package be.nikiroo.fanfix.supported;
2
3import java.io.IOException;
4import java.io.InputStream;
b5e9855b 5import java.io.UnsupportedEncodingException;
9b863b20 6import java.net.MalformedURLException;
08fe2e33 7import java.net.URL;
b5e9855b 8import java.net.URLDecoder;
ce297a79 9import java.util.AbstractMap;
08fe2e33 10import java.util.ArrayList;
9b863b20 11import java.util.Collections;
b5e9855b 12import java.util.LinkedList;
08fe2e33
NR
13import java.util.List;
14import java.util.Map.Entry;
15import java.util.Scanner;
16
17import be.nikiroo.fanfix.Instance;
18import be.nikiroo.fanfix.data.Chapter;
68686a37 19import be.nikiroo.fanfix.data.MetaData;
08fe2e33 20import be.nikiroo.fanfix.data.Story;
16a81ef7 21import be.nikiroo.utils.Image;
3b2b638f 22import be.nikiroo.utils.Progress;
08fe2e33
NR
23import be.nikiroo.utils.StringUtils;
24
25/**
26 * Support class for <a href="http://e621.net/">e621.net</a> and <a
27 * href="http://e926.net/">e926.net</a>, a Furry website supporting comics,
28 * including some of MLP.
29 * <p>
30 * <a href="http://e926.net/">e926.net</a> only shows the "clean" images and
31 * comics, but it can be difficult to browse.
32 *
33 * @author niki
34 */
0ffa4754 35class E621 extends BasicSupport_Deprecated {
08fe2e33
NR
36 @Override
37 public String getSourceName() {
38 return "e621.net";
39 }
40
41 @Override
68686a37
NR
42 protected MetaData getMeta(URL source, InputStream in) throws IOException {
43 MetaData meta = new MetaData();
44
45 meta.setTitle(getTitle(reset(in)));
46 meta.setAuthor(getAuthor(source, reset(in)));
47 meta.setDate("");
b5e9855b 48 meta.setTags(getTags(source, reset(in), false));
68686a37 49 meta.setSource(getSourceName());
2206ef66 50 meta.setUrl(source.toString());
68686a37
NR
51 meta.setPublisher(getSourceName());
52 meta.setUuid(source.toString());
53 meta.setLuid("");
276f95c6 54 meta.setLang("en");
a4143cd7 55 meta.setSubject("Furry");
68686a37
NR
56 meta.setType(getType().toString());
57 meta.setImageDocument(true);
b5e9855b 58 meta.setCover(getCover(source, reset(in)));
a9eb3f46 59 meta.setFakeCover(true);
68686a37
NR
60
61 return meta;
08fe2e33
NR
62 }
63
b5e9855b
NR
64 private List<String> getTags(URL source, InputStream in, boolean authors) {
65 List<String> tags = new ArrayList<String>();
66
67 if (isSearch(source)) {
68 String tagLine = getLine(in, "id=\"tag-sidebar\"", 1);
69 if (tagLine != null) {
70 String key = "href=\"";
71 for (int pos = tagLine.indexOf(key); pos >= 0; pos = tagLine
72 .indexOf(key, pos + 1)) {
73 int end = tagLine.indexOf("\"", pos + key.length());
74 if (end >= 0) {
75 String href = tagLine.substring(pos, end);
76 String subkey;
77 if (authors)
78 subkey = "?name=";
79 else
80 subkey = "?title=";
81 if (href.contains(subkey)) {
82 String tag = href.substring(href.indexOf(subkey)
83 + subkey.length());
84 try {
85 tags.add(URLDecoder.decode(tag, "UTF-8"));
86 } catch (UnsupportedEncodingException e) {
87 // supported JVMs must have UTF-8 support
88 e.printStackTrace();
89 }
90 }
91 }
92 }
93
94 }
95 }
96
97 return tags;
98 }
99
08fe2e33 100 @Override
92fb0719 101 public Story process(URL url, Progress pg) throws IOException {
08fe2e33 102 // There is no chapters on e621, just pagination...
92fb0719 103 Story story = super.process(url, pg);
08fe2e33
NR
104
105 Chapter only = new Chapter(1, null);
106 for (Chapter chap : story) {
107 only.getParagraphs().addAll(chap.getParagraphs());
108 }
109
110 story.getChapters().clear();
111 story.getChapters().add(only);
112
113 return story;
114 }
115
116 @Override
117 protected boolean supports(URL url) {
118 String host = url.getHost();
119 if (host.startsWith("www.")) {
120 host = host.substring("www.".length());
121 }
122
123 return ("e621.net".equals(host) || "e926.net".equals(host))
b5e9855b 124 && (isPool(url) || isSearch(url));
08fe2e33
NR
125 }
126
127 @Override
128 protected boolean isHtml() {
129 return true;
130 }
131
b5e9855b 132 private Image getCover(URL source, InputStream in) throws IOException {
678390e0
NR
133 URL urlForCover = source;
134 if (isPool(source)) {
135 urlForCover = new URL(source.toString() + "?page=1");
136 }
b5e9855b 137
678390e0 138 String images = getChapterContent(urlForCover, in, 1, null);
595dfa7a 139 if (!images.isEmpty()) {
406447a4 140 int pos = images.indexOf("<br/>");
595dfa7a
NR
141 if (pos >= 0) {
142 images = images.substring(1, pos - 1);
143 return getImage(this, null, images);
144 }
145 }
146
147 return null;
148 }
149
211f7ddb 150 private String getAuthor(URL source, InputStream in) {
b5e9855b
NR
151 if (isSearch(source)) {
152 StringBuilder builder = new StringBuilder();
153 for (String author : getTags(source, in, true)) {
154 if (builder.length() > 0)
155 builder.append(", ");
156 builder.append(author);
157 }
158
159 return builder.toString();
160 }
161
08fe2e33
NR
162 String author = getLine(in, "href=\"/post/show/", 0);
163 if (author != null) {
164 String key = "href=\"";
165 int pos = author.indexOf(key);
166 if (pos >= 0) {
167 author = author.substring(pos + key.length());
168 pos = author.indexOf("\"");
169 if (pos >= 0) {
170 author = author.substring(0, pos - 1);
171 String page = source.getProtocol() + "://"
172 + source.getHost() + author;
08fe2e33 173 try {
7d0d2be6
NR
174 InputStream pageIn = Instance.getCache().open(
175 new URL(page), this, false);
176 try {
177 key = "class=\"tag-type-artist\"";
178 author = getLine(pageIn, key, 0);
179 if (author != null) {
180 pos = author.indexOf("<a href=\"");
08fe2e33 181 if (pos >= 0) {
7d0d2be6
NR
182 author = author.substring(pos);
183 pos = author.indexOf("</a>");
184 if (pos >= 0) {
185 author = author.substring(0, pos);
186 return StringUtils.unhtml(author);
187 }
08fe2e33
NR
188 }
189 }
7d0d2be6
NR
190 } finally {
191 pageIn.close();
08fe2e33 192 }
7d0d2be6
NR
193 } catch (Exception e) {
194 // No author found
08fe2e33
NR
195 }
196 }
197 }
198 }
199
200 return null;
201 }
202
211f7ddb 203 private String getTitle(InputStream in) {
08fe2e33
NR
204 String title = getLine(in, "<title>", 0);
205 if (title != null) {
206 int pos = title.indexOf('>');
207 if (pos >= 0) {
208 title = title.substring(pos + 1);
209 pos = title.indexOf('<');
210 if (pos >= 0) {
211 title = title.substring(0, pos);
212 }
213 }
214
215 if (title.startsWith("Pool:")) {
216 title = title.substring("Pool:".length());
217 }
218
68686a37 219 title = StringUtils.unhtml(title).trim();
08fe2e33
NR
220 }
221
222 return title;
223 }
224
225 @Override
226 protected String getDesc(URL source, InputStream in) throws IOException {
227 String desc = getLine(in, "margin-bottom: 2em;", 0);
228
229 if (desc != null) {
230 StringBuilder builder = new StringBuilder();
231
232 boolean inTags = false;
233 for (char car : desc.toCharArray()) {
234 if ((inTags && car == '>') || (!inTags && car == '<')) {
235 inTags = !inTags;
236 }
237
238 if (inTags) {
239 builder.append(car);
240 }
241 }
242
243 return builder.toString().trim();
244 }
245
246 return null;
247 }
248
249 @Override
ed08c171
NR
250 protected List<Entry<String, URL>> getChapters(URL source, InputStream in,
251 Progress pg) throws IOException {
b5e9855b
NR
252 if (isPool(source)) {
253 return getChaptersPool(source, in, pg);
254 } else if (isSearch(source)) {
255 return getChaptersSearch(source, in, pg);
256 }
257
258 return new LinkedList<Entry<String, URL>>();
259 }
260
261 private List<Entry<String, URL>> getChaptersSearch(URL source,
262 InputStream in, Progress pg) throws IOException {
263 List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
264
265 String search = source.getPath();
266 if (search.endsWith("/")) {
267 search = search.substring(0, search.length() - 1);
268 }
269
270 int pos = search.lastIndexOf('/');
271 if (pos >= 0) {
272 search = search.substring(pos + 1);
273 }
274
275 String baseUrl = "https://e621.net/post/index/";
276 if (source.getHost().contains("e926")) {
277 baseUrl = baseUrl.replace("e621", "e926");
278 }
279
280 for (int i = 1; true; i++) {
281 URL url = new URL(baseUrl + i + "/" + search + "/");
282 try {
283 InputStream pageI = Instance.getCache().open(url, this, false);
284 try {
285 if (getLine(pageI, "No posts matched your search.", 0) != null)
286 break;
9b863b20
NR
287 urls.add(new AbstractMap.SimpleEntry<String, URL>("Page "
288 + Integer.toString(i), url));
b5e9855b
NR
289 } finally {
290 pageI.close();
291 }
292 } catch (Exception e) {
293 break;
294 }
295 }
296
9b863b20
NR
297 // They are sorted in reverse order on the website
298 Collections.reverse(urls);
b5e9855b
NR
299 return urls;
300 }
301
302 private List<Entry<String, URL>> getChaptersPool(URL source,
303 InputStream in, Progress pg) throws IOException {
08fe2e33
NR
304 List<Entry<String, URL>> urls = new ArrayList<Entry<String, URL>>();
305 int last = 1; // no pool/show when only one page
306
307 @SuppressWarnings("resource")
308 Scanner scan = new Scanner(in, "UTF-8");
309 scan.useDelimiter("\\n");
310 while (scan.hasNext()) {
311 String line = scan.next();
312 for (int pos = line.indexOf(source.getPath()); pos >= 0; pos = line
313 .indexOf(source.getPath(), pos + source.getPath().length())) {
314 int equalPos = line.indexOf("=", pos);
315 int quotePos = line.indexOf("\"", pos);
316 if (equalPos >= 0 && quotePos > equalPos) {
317 String snum = line.substring(equalPos + 1, quotePos);
318 try {
319 int num = Integer.parseInt(snum);
320 if (num > last) {
321 last = num;
322 }
323 } catch (NumberFormatException e) {
324 }
325 }
326 }
327 }
328
329 for (int i = 1; i <= last; i++) {
ce297a79
NR
330 urls.add(new AbstractMap.SimpleEntry<String, URL>(Integer
331 .toString(i), new URL(source.toString() + "?page=" + i)));
08fe2e33
NR
332 }
333
334 return urls;
335 }
336
337 @Override
ed08c171
NR
338 protected String getChapterContent(URL source, InputStream in, int number,
339 Progress pg) throws IOException {
08fe2e33
NR
340 StringBuilder builder = new StringBuilder();
341 String staticSite = "https://static1.e621.net";
342 if (source.getHost().contains("e926")) {
343 staticSite = staticSite.replace("e621", "e926");
344 }
345
346 String key = staticSite + "/data/preview/";
347
348 @SuppressWarnings("resource")
349 Scanner scan = new Scanner(in, "UTF-8");
350 scan.useDelimiter("\\n");
351 while (scan.hasNext()) {
352 String line = scan.next();
d98a2900 353 if (line.contains("class=\"preview")) {
08fe2e33
NR
354 for (int pos = line.indexOf(key); pos >= 0; pos = line.indexOf(
355 key, pos + key.length())) {
356 int endPos = line.indexOf("\"", pos);
357 if (endPos >= 0) {
358 String id = line.substring(pos + key.length(), endPos);
359 id = staticSite + "/data/" + id;
360
361 int dotPos = id.lastIndexOf(".");
362 if (dotPos >= 0) {
363 id = id.substring(0, dotPos);
364 builder.append("[");
365 builder.append(id);
406447a4 366 builder.append("]<br/>");
08fe2e33
NR
367 }
368 }
369 }
370 }
371 }
372
373 return builder.toString();
374 }
b5e9855b 375
9b863b20
NR
376 @Override
377 protected URL getCanonicalUrl(URL source) {
378 if (isSearch(source)) {
379 // /post?tags=tag1+tag2 -> ../post/index/1/tag1%32tag2
380 String key = "post?tags=";
381 if (source.toString().contains(key)) {
382 int pos = source.toString().indexOf(key);
383 String tags = source.toString().substring(pos + key.length());
1822d603 384 tags = tags.replace("+", "%20");
9b863b20
NR
385 try {
386 return new URL(source.toString().substring(0, pos)
387 + "post/index/1/" + tags);
388 } catch (MalformedURLException e) {
389 Instance.getTraceHandler().error(e);
390 }
391 }
392 }
393 return super.getCanonicalUrl(source);
394 }
395
b5e9855b
NR
396 private boolean isPool(URL url) {
397 return url.getPath().startsWith("/pool/");
398 }
399
400 private boolean isSearch(URL url) {
9b863b20 401 return url.getPath().startsWith("/post/index/")
14d8be1c
NR
402 || (url.getPath().equals("/post/search") && url.getQuery()
403 .startsWith("tags="));
b5e9855b 404 }
08fe2e33 405}