cache save/load workflow
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStreamWriter;
6 import java.net.CookieHandler;
7 import java.net.CookieManager;
8 import java.net.CookiePolicy;
9 import java.net.CookieStore;
10 import java.net.HttpCookie;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.net.URLEncoder;
15 import java.util.Map;
16 import java.util.zip.GZIPInputStream;
17
18 /**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28 public class Downloader {
29 private String UA;
30 private CookieManager cookies;
31 private TraceHandler tracer = new TraceHandler();
32 private Cache cache;
33 private boolean offline;
34
35 /**
36 * Create a new {@link Downloader}.
37 *
38 * @param UA
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!)
43 */
44 public Downloader(String UA) {
45 this(UA, null);
46 }
47
48 /**
49 * Create a new {@link Downloader}.
50 *
51 * @param UA
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!)
56 * @param cache
57 * the {@link Cache} to use for all access (can be NULL)
58 */
59 public Downloader(String UA, Cache cache) {
60 this.UA = UA;
61
62 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
64
65 setCache(cache);
66 }
67
68 /**
69 * This {@link Downloader} is forbidden to try and connect to the network.
70 * <p>
71 * If TRUE, it will only check the cache if any.
72 * <p>
73 * Default is FALSE.
74 *
75 * @return TRUE if offline
76 */
77 public boolean isOffline() {
78 return offline;
79 }
80
81 /**
82 * This {@link Downloader} is forbidden to try and connect to the network.
83 * <p>
84 * If TRUE, it will only check the cache if any.
85 * <p>
86 * Default is FALSE.
87 *
88 * @param offline TRUE for offline, FALSE for online
89 */
90 public void setOffline(boolean offline) {
91 this.offline = offline;
92 }
93
94 /**
95 * The traces handler for this {@link Cache}.
96 *
97 * @return the traces handler
98 */
99 public TraceHandler getTraceHandler() {
100 return tracer;
101 }
102
103 /**
104 * The traces handler for this {@link Cache}.
105 *
106 * @param tracer
107 * the new traces handler
108 */
109 public void setTraceHandler(TraceHandler tracer) {
110 if (tracer == null) {
111 tracer = new TraceHandler(false, false, false);
112 }
113
114 this.tracer = tracer;
115 }
116
117 /**
118 * The {@link Cache} to use for all access (can be NULL).
119 *
120 * @return the cache
121 */
122 public Cache getCache() {
123 return cache;
124 }
125
126 /**
127 * The {@link Cache} to use for all access (can be NULL).
128 *
129 * @param cache
130 * the new cache
131 */
132 public void setCache(Cache cache) {
133 this.cache = cache;
134 }
135
136 /**
137 * Clear all the cookies currently in the jar.
138 * <p>
139 * As long as you don't, the cookies are kept.
140 */
141 public void clearCookies() {
142 cookies.getCookieStore().removeAll();
143 }
144
145 /**
146 * Open the given {@link URL} and update the cookies.
147 *
148 * @param url
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
151 *
152 * @throws IOException
153 * in case of I/O error
154 **/
155 public InputStream open(URL url) throws IOException {
156 return open(url, false);
157 }
158
159 /**
160 * Open the given {@link URL} and update the cookies.
161 *
162 * @param url
163 * the {@link URL} to open
164 * @param stable
165 * stable a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
168 *
169 * @return the {@link InputStream} of the opened page
170 *
171 * @throws IOException
172 * in case of I/O error
173 **/
174 public InputStream open(URL url, boolean stable) throws IOException {
175 return open(url, url, url, null, null, null, null, stable);
176 }
177
178 /**
179 * Open the given {@link URL} and update the cookies.
180 *
181 * @param url
182 * the {@link URL} to open
183 * @param currentReferer
184 * the current referer, for websites that needs this info
185 * @param cookiesValues
186 * the cookies
187 * @param postParams
188 * the POST parameters
189 * @param getParams
190 * the GET parameters (priority over POST)
191 * @param oauth
192 * OAuth authorization (aka, "bearer XXXXXXX")
193 *
194 * @return the {@link InputStream} of the opened page
195 *
196 * @throws IOException
197 * in case of I/O error (including offline mode + not in cache)
198 */
199 public InputStream open(URL url, URL currentReferer,
200 Map<String, String> cookiesValues, Map<String, String> postParams,
201 Map<String, String> getParams, String oauth) throws IOException {
202 return open(url, currentReferer, cookiesValues, postParams, getParams,
203 oauth, false);
204 }
205
206 /**
207 * Open the given {@link URL} and update the cookies.
208 *
209 * @param url
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that needs this info
213 * @param cookiesValues
214 * the cookies
215 * @param postParams
216 * the POST parameters
217 * @param getParams
218 * the GET parameters (priority over POST)
219 * @param oauth
220 * OAuth authorization (aka, "bearer XXXXXXX")
221 * @param stable
222 * stable a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
225 *
226 * @return the {@link InputStream} of the opened page
227 *
228 * @throws IOException
229 * in case of I/O error (including offline mode + not in cache)
230 */
231 public InputStream open(URL url, URL currentReferer,
232 Map<String, String> cookiesValues, Map<String, String> postParams,
233 Map<String, String> getParams, String oauth, boolean stable)
234 throws IOException {
235 return open(url, url, currentReferer, cookiesValues, postParams,
236 getParams, oauth, stable);
237 }
238
239 /**
240 * Open the given {@link URL} and update the cookies.
241 *
242 * @param url
243 * the {@link URL} to open
244 * @param originalUrl
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that needs this info
250 * @param cookiesValues
251 * the cookies
252 * @param postParams
253 * the POST parameters
254 * @param getParams
255 * the GET parameters (priority over POST)
256 * @param oauth
257 * OAuth authorisation (aka, "bearer XXXXXXX")
258 * @param stable
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
261 * cache
262 *
263 * @return the {@link InputStream} of the opened page
264 *
265 * @throws IOException
266 * in case of I/O error (including offline mode + not in cache)
267 */
268 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
269 Map<String, String> cookiesValues, Map<String, String> postParams,
270 Map<String, String> getParams, String oauth, boolean stable)
271 throws IOException {
272
273 tracer.trace("Request: " + url);
274
275 if (cache != null) {
276 InputStream in = cache.load(originalUrl, false, stable);
277 if (in != null) {
278 tracer.trace("Use the cache: " + url);
279 tracer.trace("Original URL : " + originalUrl);
280 return in;
281 }
282 }
283
284 if (offline) {
285 tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url);
286 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url);
287 }
288
289 tracer.trace("Download: " + url);
290
291 URLConnection conn = openConnectionWithCookies(url, currentReferer,
292 cookiesValues);
293
294 // Priority: GET over POST
295 Map<String, String> params = getParams;
296 if (getParams == null) {
297 params = postParams;
298 }
299
300 StringBuilder requestData = null;
301 if ((params != null || oauth != null)
302 && conn instanceof HttpURLConnection) {
303 if (params != null) {
304 requestData = new StringBuilder();
305 for (Map.Entry<String, String> param : params.entrySet()) {
306 if (requestData.length() != 0)
307 requestData.append('&');
308 requestData.append(URLEncoder.encode(param.getKey(),
309 "UTF-8"));
310 requestData.append('=');
311 requestData.append(URLEncoder.encode(
312 String.valueOf(param.getValue()), "UTF-8"));
313 }
314
315 if (getParams == null && postParams != null) {
316 ((HttpURLConnection) conn).setRequestMethod("POST");
317 }
318
319 conn.setRequestProperty("Content-Type",
320 "application/x-www-form-urlencoded");
321 conn.setRequestProperty("Content-Length",
322 Integer.toString(requestData.length()));
323 }
324
325 if (oauth != null) {
326 conn.setRequestProperty("Authorization", oauth);
327 }
328
329 if (requestData != null) {
330 conn.setDoOutput(true);
331 OutputStreamWriter writer = new OutputStreamWriter(
332 conn.getOutputStream());
333 try {
334 writer.write(requestData.toString());
335 writer.flush();
336 } finally {
337 writer.close();
338 }
339 }
340 }
341
342 // Manual redirection, much better for POST data
343 if (conn instanceof HttpURLConnection) {
344 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
345 }
346
347 conn.connect();
348
349 // Check if redirect
350 // BEWARE! POST data cannot be redirected (some webservers complain) for
351 // HTTP codes 302 and 303
352 if (conn instanceof HttpURLConnection) {
353 int repCode = 0;
354 try {
355 // Can fail in some circumstances
356 repCode = ((HttpURLConnection) conn).getResponseCode();
357 } catch (IOException e) {
358 }
359
360 if (repCode / 100 == 3) {
361 String newUrl = conn.getHeaderField("Location");
362 return open(new URL(newUrl), originalUrl, currentReferer,
363 cookiesValues, //
364 (repCode == 302 || repCode == 303) ? null : postParams, //
365 getParams, oauth, stable);
366 }
367 }
368
369 try {
370 InputStream in = conn.getInputStream();
371 if ("gzip".equals(conn.getContentEncoding())) {
372 in = new GZIPInputStream(in);
373 }
374
375 if (in == null) {
376 throw new IOException("No InputStream!");
377 }
378
379 if (cache != null) {
380 String size = conn.getContentLength() < 0 ? "unknown size"
381 : StringUtils.formatNumber(conn.getContentLength())
382 + "bytes";
383 tracer.trace("Save to cache (" + size + "): " + originalUrl);
384 try {
385 try {
386 long bytes = cache.save(in, originalUrl);
387 tracer.trace("Saved to cache: "
388 + StringUtils.formatNumber(bytes) + "bytes");
389 } finally {
390 in.close();
391 }
392 in = cache.load(originalUrl, true, true);
393 } catch (IOException e) {
394 tracer.error(new IOException(
395 "Cannot save URL to cache, will ignore cache: "
396 + url, e));
397 }
398 }
399
400 if (in == null) {
401 throw new IOException(
402 "Cannot retrieve the file after storing it in the cache (??)");
403 }
404
405 return in;
406 } catch (IOException e) {
407 throw new IOException(String.format(
408 "Cannot find %s (current URL: %s)", originalUrl, url), e);
409 }
410 }
411
412 /**
413 * Open a connection on the given {@link URL}, and manage the cookies that
414 * come with it.
415 *
416 * @param url
417 * the {@link URL} to open
418 *
419 * @return the connection
420 *
421 * @throws IOException
422 * in case of I/O error
423 */
424 private URLConnection openConnectionWithCookies(URL url,
425 URL currentReferer, Map<String, String> cookiesValues)
426 throws IOException {
427 URLConnection conn = url.openConnection();
428
429 String cookies = generateCookies(cookiesValues);
430 if (cookies != null && !cookies.isEmpty()) {
431 conn.setRequestProperty("Cookie", cookies);
432 }
433
434 conn.setRequestProperty("User-Agent", UA);
435 conn.setRequestProperty("Accept-Encoding", "gzip");
436 conn.setRequestProperty("Accept", "*/*");
437 conn.setRequestProperty("Charset", "utf-8");
438
439 if (currentReferer != null) {
440 conn.setRequestProperty("Referer", currentReferer.toString());
441 conn.setRequestProperty("Host", currentReferer.getHost());
442 }
443
444 return conn;
445 }
446
447 /**
448 * Generate the cookie {@link String} from the local {@link CookieStore} so
449 * it is ready to be passed.
450 *
451 * @return the cookie
452 */
453 private String generateCookies(Map<String, String> cookiesValues) {
454 StringBuilder builder = new StringBuilder();
455 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
456 if (builder.length() > 0) {
457 builder.append(';');
458 }
459
460 builder.append(cookie.toString());
461 }
462
463 if (cookiesValues != null) {
464 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
465 if (builder.length() > 0) {
466 builder.append(';');
467 }
468 builder.append(set.getKey());
469 builder.append('=');
470 builder.append(set.getValue());
471 }
472 }
473
474 return builder.toString();
475 }
476 }