back to dev
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
CommitLineData
8816d2f7
NR
1package be.nikiroo.utils;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.io.OutputStreamWriter;
6import java.net.CookieHandler;
7import java.net.CookieManager;
8import java.net.CookiePolicy;
9import java.net.CookieStore;
10import java.net.HttpCookie;
11import java.net.HttpURLConnection;
12import java.net.URL;
13import java.net.URLConnection;
14import java.net.URLEncoder;
15import java.util.Map;
16import java.util.zip.GZIPInputStream;
17
18/**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28public class Downloader {
29 private String UA;
30 private CookieManager cookies;
530d4062 31 private TraceHandler tracer = new TraceHandler();
f6e8d60d 32 private Cache cache;
1002f328 33 private boolean offline;
8816d2f7
NR
34
35 /**
36 * Create a new {@link Downloader}.
37 *
38 * @param UA
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!)
43 */
44 public Downloader(String UA) {
f6e8d60d
NR
45 this(UA, null);
46 }
47
48 /**
49 * Create a new {@link Downloader}.
50 *
51 * @param UA
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!)
56 * @param cache
57 * the {@link Cache} to use for all access (can be NULL)
58 */
59 public Downloader(String UA, Cache cache) {
8816d2f7
NR
60 this.UA = UA;
61
15f13472 62 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
8816d2f7 63 CookieHandler.setDefault(cookies);
f6e8d60d 64
15f13472 65 setCache(cache);
8816d2f7 66 }
1002f328
NR
67
68 /**
69 * This {@link Downloader} is forbidden to try and connect to the network.
70 * <p>
71 * If TRUE, it will only check the cache if any.
72 * <p>
73 * Default is FALSE.
74 *
75 * @return TRUE if offline
76 */
77 public boolean isOffline() {
78 return offline;
79 }
80
81 /**
82 * This {@link Downloader} is forbidden to try and connect to the network.
83 * <p>
84 * If TRUE, it will only check the cache if any.
85 * <p>
86 * Default is FALSE.
87 *
88 * @param offline TRUE for offline, FALSE for online
89 */
90 public void setOffline(boolean offline) {
91 this.offline = offline;
92 }
8816d2f7 93
530d4062
NR
94 /**
95 * The traces handler for this {@link Cache}.
96 *
97 * @return the traces handler
98 */
99 public TraceHandler getTraceHandler() {
100 return tracer;
101 }
102
103 /**
104 * The traces handler for this {@link Cache}.
105 *
106 * @param tracer
107 * the new traces handler
108 */
109 public void setTraceHandler(TraceHandler tracer) {
80500544
NR
110 if (tracer == null) {
111 tracer = new TraceHandler(false, false, false);
112 }
113
530d4062
NR
114 this.tracer = tracer;
115 }
116
3052163b
NR
117 /**
118 * The {@link Cache} to use for all access (can be NULL).
119 *
120 * @return the cache
121 */
122 public Cache getCache() {
123 return cache;
124 }
125
126 /**
127 * The {@link Cache} to use for all access (can be NULL).
128 *
129 * @param cache
130 * the new cache
131 */
132 public void setCache(Cache cache) {
133 this.cache = cache;
134 }
135
8816d2f7
NR
136 /**
137 * Clear all the cookies currently in the jar.
138 * <p>
139 * As long as you don't, the cookies are kept.
140 */
141 public void clearCookies() {
142 cookies.getCookieStore().removeAll();
143 }
144
145 /**
146 * Open the given {@link URL} and update the cookies.
147 *
148 * @param url
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
151 *
152 * @throws IOException
153 * in case of I/O error
154 **/
155 public InputStream open(URL url) throws IOException {
f6e8d60d
NR
156 return open(url, false);
157 }
158
159 /**
160 * Open the given {@link URL} and update the cookies.
161 *
162 * @param url
163 * the {@link URL} to open
164 * @param stable
165 * stable a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
168 *
169 * @return the {@link InputStream} of the opened page
170 *
171 * @throws IOException
172 * in case of I/O error
173 **/
174 public InputStream open(URL url, boolean stable) throws IOException {
175 return open(url, url, url, null, null, null, null, stable);
8816d2f7
NR
176 }
177
178 /**
179 * Open the given {@link URL} and update the cookies.
180 *
181 * @param url
182 * the {@link URL} to open
530d4062
NR
183 * @param currentReferer
184 * the current referer, for websites that needs this info
185 * @param cookiesValues
186 * the cookies
8816d2f7
NR
187 * @param postParams
188 * the POST parameters
189 * @param getParams
190 * the GET parameters (priority over POST)
191 * @param oauth
192 * OAuth authorization (aka, "bearer XXXXXXX")
193 *
194 * @return the {@link InputStream} of the opened page
195 *
196 * @throws IOException
1002f328 197 * in case of I/O error (including offline mode + not in cache)
8816d2f7
NR
198 */
199 public InputStream open(URL url, URL currentReferer,
200 Map<String, String> cookiesValues, Map<String, String> postParams,
201 Map<String, String> getParams, String oauth) throws IOException {
f6e8d60d
NR
202 return open(url, currentReferer, cookiesValues, postParams, getParams,
203 oauth, false);
204 }
205
206 /**
207 * Open the given {@link URL} and update the cookies.
208 *
209 * @param url
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that needs this info
213 * @param cookiesValues
214 * the cookies
215 * @param postParams
216 * the POST parameters
217 * @param getParams
218 * the GET parameters (priority over POST)
219 * @param oauth
220 * OAuth authorization (aka, "bearer XXXXXXX")
221 * @param stable
222 * stable a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
225 *
226 * @return the {@link InputStream} of the opened page
227 *
228 * @throws IOException
1002f328 229 * in case of I/O error (including offline mode + not in cache)
f6e8d60d
NR
230 */
231 public InputStream open(URL url, URL currentReferer,
232 Map<String, String> cookiesValues, Map<String, String> postParams,
233 Map<String, String> getParams, String oauth, boolean stable)
234 throws IOException {
8816d2f7 235 return open(url, url, currentReferer, cookiesValues, postParams,
f6e8d60d 236 getParams, oauth, stable);
8816d2f7
NR
237 }
238
8816d2f7
NR
239 /**
240 * Open the given {@link URL} and update the cookies.
241 *
242 * @param url
243 * the {@link URL} to open
244 * @param originalUrl
ae7d1a83
NR
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that needs this info
250 * @param cookiesValues
251 * the cookies
8816d2f7
NR
252 * @param postParams
253 * the POST parameters
254 * @param getParams
255 * the GET parameters (priority over POST)
256 * @param oauth
257 * OAuth authorisation (aka, "bearer XXXXXXX")
f6e8d60d
NR
258 * @param stable
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
261 * cache
262 *
8816d2f7
NR
263 * @return the {@link InputStream} of the opened page
264 *
265 * @throws IOException
1002f328 266 * in case of I/O error (including offline mode + not in cache)
8816d2f7 267 */
ae7d1a83
NR
268 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
269 Map<String, String> cookiesValues, Map<String, String> postParams,
270 Map<String, String> getParams, String oauth, boolean stable)
271 throws IOException {
f6e8d60d
NR
272
273 tracer.trace("Request: " + url);
274
275 if (cache != null) {
ae7d1a83 276 InputStream in = cache.load(originalUrl, false, stable);
f6e8d60d 277 if (in != null) {
223aa0d4
NR
278 tracer.trace("Use the cache: " + url);
279 tracer.trace("Original URL : " + originalUrl);
f6e8d60d
NR
280 return in;
281 }
282 }
8816d2f7 283
1002f328
NR
284 if (offline) {
285 tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url);
286 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url);
287 }
288
530d4062 289 tracer.trace("Download: " + url);
8816d2f7
NR
290
291 URLConnection conn = openConnectionWithCookies(url, currentReferer,
292 cookiesValues);
293
294 // Priority: GET over POST
295 Map<String, String> params = getParams;
296 if (getParams == null) {
297 params = postParams;
298 }
299
15f13472 300 StringBuilder requestData = null;
8816d2f7
NR
301 if ((params != null || oauth != null)
302 && conn instanceof HttpURLConnection) {
8816d2f7
NR
303 if (params != null) {
304 requestData = new StringBuilder();
305 for (Map.Entry<String, String> param : params.entrySet()) {
306 if (requestData.length() != 0)
307 requestData.append('&');
308 requestData.append(URLEncoder.encode(param.getKey(),
309 "UTF-8"));
310 requestData.append('=');
311 requestData.append(URLEncoder.encode(
312 String.valueOf(param.getValue()), "UTF-8"));
313 }
314
8816d2f7
NR
315 if (getParams == null && postParams != null) {
316 ((HttpURLConnection) conn).setRequestMethod("POST");
317 }
318
319 conn.setRequestProperty("Content-Type",
320 "application/x-www-form-urlencoded");
15f13472
NR
321 conn.setRequestProperty("Content-Length",
322 Integer.toString(requestData.length()));
8816d2f7
NR
323 }
324
325 if (oauth != null) {
326 conn.setRequestProperty("Authorization", oauth);
327 }
328
329 if (requestData != null) {
15f13472
NR
330 conn.setDoOutput(true);
331 OutputStreamWriter writer = new OutputStreamWriter(
332 conn.getOutputStream());
0988831f 333 try {
0988831f
NR
334 writer.write(requestData.toString());
335 writer.flush();
336 } finally {
15f13472 337 writer.close();
0988831f 338 }
8816d2f7
NR
339 }
340 }
341
15f13472
NR
342 // Manual redirection, much better for POST data
343 if (conn instanceof HttpURLConnection) {
344 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
345 }
346
8816d2f7
NR
347 conn.connect();
348
349 // Check if redirect
59654e2a
NR
350 // BEWARE! POST data cannot be redirected (some webservers complain) for
351 // HTTP codes 302 and 303
6149689f
NR
352 if (conn instanceof HttpURLConnection) {
353 int repCode = 0;
354 try {
355 // Can fail in some circumstances
356 repCode = ((HttpURLConnection) conn).getResponseCode();
357 } catch (IOException e) {
358 }
359
360 if (repCode / 100 == 3) {
361 String newUrl = conn.getHeaderField("Location");
362 return open(new URL(newUrl), originalUrl, currentReferer,
59654e2a
NR
363 cookiesValues, //
364 (repCode == 302 || repCode == 303) ? null : postParams, //
365 getParams, oauth, stable);
6149689f 366 }
8816d2f7
NR
367 }
368
59654e2a
NR
369 try {
370 InputStream in = conn.getInputStream();
371 if ("gzip".equals(conn.getContentEncoding())) {
372 in = new GZIPInputStream(in);
373 }
8816d2f7 374
59654e2a
NR
375 if (in == null) {
376 throw new IOException("No InputStream!");
377 }
378
379 if (cache != null) {
eee36623
NR
380 String size = conn.getContentLength() < 0 ? "unknown size"
381 : StringUtils.formatNumber(conn.getContentLength())
59654e2a
NR
382 + "bytes";
383 tracer.trace("Save to cache (" + size + "): " + originalUrl);
eb6dcdbf 384 try {
59654e2a
NR
385 try {
386 long bytes = cache.save(in, originalUrl);
387 tracer.trace("Saved to cache: "
388 + StringUtils.formatNumber(bytes) + "bytes");
389 } finally {
390 in.close();
391 }
392 in = cache.load(originalUrl, true, true);
393 } catch (IOException e) {
394 tracer.error(new IOException(
395 "Cannot save URL to cache, will ignore cache: "
396 + url, e));
eb6dcdbf 397 }
f6e8d60d 398 }
f6e8d60d 399
59654e2a
NR
400 return in;
401 } catch (IOException e) {
402 throw new IOException(String.format(
403 "Cannot find %s (current URL: %s)", originalUrl, url), e);
404 }
8816d2f7
NR
405 }
406
407 /**
408 * Open a connection on the given {@link URL}, and manage the cookies that
409 * come with it.
410 *
411 * @param url
412 * the {@link URL} to open
413 *
414 * @return the connection
415 *
416 * @throws IOException
417 * in case of I/O error
418 */
419 private URLConnection openConnectionWithCookies(URL url,
420 URL currentReferer, Map<String, String> cookiesValues)
421 throws IOException {
422 URLConnection conn = url.openConnection();
423
15f13472
NR
424 String cookies = generateCookies(cookiesValues);
425 if (cookies != null && !cookies.isEmpty()) {
426 conn.setRequestProperty("Cookie", cookies);
427 }
428
8816d2f7 429 conn.setRequestProperty("User-Agent", UA);
8816d2f7 430 conn.setRequestProperty("Accept-Encoding", "gzip");
15f13472 431 conn.setRequestProperty("Accept", "*/*");
59654e2a 432 conn.setRequestProperty("Charset", "utf-8");
15f13472 433
8816d2f7
NR
434 if (currentReferer != null) {
435 conn.setRequestProperty("Referer", currentReferer.toString());
436 conn.setRequestProperty("Host", currentReferer.getHost());
437 }
438
439 return conn;
440 }
441
442 /**
443 * Generate the cookie {@link String} from the local {@link CookieStore} so
444 * it is ready to be passed.
445 *
446 * @return the cookie
447 */
448 private String generateCookies(Map<String, String> cookiesValues) {
449 StringBuilder builder = new StringBuilder();
450 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
451 if (builder.length() > 0) {
452 builder.append(';');
453 }
454
8816d2f7
NR
455 builder.append(cookie.toString());
456 }
457
458 if (cookiesValues != null) {
459 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
460 if (builder.length() > 0) {
461 builder.append(';');
462 }
463 builder.append(set.getKey());
464 builder.append('=');
465 builder.append(set.getValue());
466 }
467 }
468
469 return builder.toString();
470 }
471}