Merge commit '712ddafb749aada41daab85c36ac12f657b2307e'
[nikiroo-utils.git] / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStreamWriter;
6 import java.net.CookieHandler;
7 import java.net.CookieManager;
8 import java.net.CookiePolicy;
9 import java.net.CookieStore;
10 import java.net.HttpCookie;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.net.URLEncoder;
15 import java.util.Map;
16 import java.util.zip.GZIPInputStream;
17
18 /**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28 public class Downloader {
29 private String UA;
30 private CookieManager cookies;
31 private TraceHandler tracer = new TraceHandler();
32 private Cache cache;
33 private boolean offline;
34
35 /**
36 * Create a new {@link Downloader}.
37 *
38 * @param UA
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!) -- can be NULL
43 */
44 public Downloader(String UA) {
45 this(UA, null);
46 }
47
48 /**
49 * Create a new {@link Downloader}.
50 *
51 * @param UA
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!) -- can be NULL
56 * @param cache
57 * the {@link Cache} to use for all access (can be NULL)
58 */
59 public Downloader(String UA, Cache cache) {
60 this.UA = UA;
61
62 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
64
65 setCache(cache);
66 }
67
68 /**
69 * This {@link Downloader} is forbidden to try and connect to the network.
70 * <p>
71 * If TRUE, it will only check the cache if any.
72 * <p>
73 * Default is FALSE.
74 *
75 * @return TRUE if offline
76 */
77 public boolean isOffline() {
78 return offline;
79 }
80
81 /**
82 * This {@link Downloader} is forbidden to try and connect to the network.
83 * <p>
84 * If TRUE, it will only check the cache if any.
85 * <p>
86 * Default is FALSE.
87 *
88 * @param offline TRUE for offline, FALSE for online
89 */
90 public void setOffline(boolean offline) {
91 this.offline = offline;
92 }
93
94 /**
95 * The traces handler for this {@link Cache}.
96 *
97 * @return the traces handler
98 */
99 public TraceHandler getTraceHandler() {
100 return tracer;
101 }
102
103 /**
104 * The traces handler for this {@link Cache}.
105 *
106 * @param tracer
107 * the new traces handler
108 */
109 public void setTraceHandler(TraceHandler tracer) {
110 if (tracer == null) {
111 tracer = new TraceHandler(false, false, false);
112 }
113
114 this.tracer = tracer;
115 }
116
117 /**
118 * The {@link Cache} to use for all access (can be NULL).
119 *
120 * @return the cache
121 */
122 public Cache getCache() {
123 return cache;
124 }
125
126 /**
127 * The {@link Cache} to use for all access (can be NULL).
128 *
129 * @param cache
130 * the new cache
131 */
132 public void setCache(Cache cache) {
133 this.cache = cache;
134 }
135
136 /**
137 * Clear all the cookies currently in the jar.
138 * <p>
139 * As long as you don't, the cookies are kept.
140 */
141 public void clearCookies() {
142 cookies.getCookieStore().removeAll();
143 }
144
145 /**
146 * Open the given {@link URL} and update the cookies.
147 *
148 * @param url
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
151 *
152 * @throws IOException
153 * in case of I/O error
154 **/
155 public InputStream open(URL url) throws IOException {
156 return open(url, false);
157 }
158
159 /**
160 * Open the given {@link URL} and update the cookies.
161 *
162 * @param url
163 * the {@link URL} to open
164 * @param stable
165 * stable a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
168 *
169 * @return the {@link InputStream} of the opened page
170 *
171 * @throws IOException
172 * in case of I/O error
173 **/
174 public InputStream open(URL url, boolean stable) throws IOException {
175 return open(url, url, url, null, null, null, null, stable);
176 }
177
178 /**
179 * Open the given {@link URL} and update the cookies.
180 *
181 * @param url
182 * the {@link URL} to open
183 * @param currentReferer
184 * the current referer, for websites that needs this info
185 * @param cookiesValues
186 * the cookies
187 * @param postParams
188 * the POST parameters
189 * @param getParams
190 * the GET parameters (priority over POST)
191 * @param oauth
192 * OAuth authorization (aka, "bearer XXXXXXX")
193 *
194 * @return the {@link InputStream} of the opened page
195 *
196 * @throws IOException
197 * in case of I/O error (including offline mode + not in cache)
198 */
199 public InputStream open(URL url, URL currentReferer,
200 Map<String, String> cookiesValues, Map<String, String> postParams,
201 Map<String, String> getParams, String oauth) throws IOException {
202 return open(url, currentReferer, cookiesValues, postParams, getParams,
203 oauth, false);
204 }
205
206 /**
207 * Open the given {@link URL} and update the cookies.
208 *
209 * @param url
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that needs this info
213 * @param cookiesValues
214 * the cookies
215 * @param postParams
216 * the POST parameters
217 * @param getParams
218 * the GET parameters (priority over POST)
219 * @param oauth
220 * OAuth authorization (aka, "bearer XXXXXXX")
221 * @param stable
222 * stable a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
225 *
226 * @return the {@link InputStream} of the opened page
227 *
228 * @throws IOException
229 * in case of I/O error (including offline mode + not in cache)
230 */
231 public InputStream open(URL url, URL currentReferer,
232 Map<String, String> cookiesValues, Map<String, String> postParams,
233 Map<String, String> getParams, String oauth, boolean stable)
234 throws IOException {
235 return open(url, url, currentReferer, cookiesValues, postParams,
236 getParams, oauth, stable);
237 }
238
239 /**
240 * Open the given {@link URL} and update the cookies.
241 *
242 * @param url
243 * the {@link URL} to open
244 * @param originalUrl
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that needs this info
250 * @param cookiesValues
251 * the cookies
252 * @param postParams
253 * the POST parameters
254 * @param getParams
255 * the GET parameters (priority over POST)
256 * @param oauth
257 * OAuth authorisation (aka, "bearer XXXXXXX")
258 * @param stable
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
261 * cache
262 *
263 * @return the {@link InputStream} of the opened page
264 *
265 * @throws IOException
266 * in case of I/O error (including offline mode + not in cache)
267 */
268 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
269 Map<String, String> cookiesValues, Map<String, String> postParams,
270 Map<String, String> getParams, String oauth, boolean stable)
271 throws IOException {
272
273 tracer.trace("Request: " + url);
274
275 if (cache != null) {
276 InputStream in = cache.load(originalUrl, false, stable);
277 if (in != null) {
278 tracer.trace("Use the cache: " + url);
279 tracer.trace("Original URL : " + originalUrl);
280 return in;
281 }
282 }
283
284 String protocol = originalUrl == null ? null : originalUrl
285 .getProtocol();
286 if (isOffline() && !"file".equalsIgnoreCase(protocol)) {
287 tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url);
288 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url);
289 }
290
291 tracer.trace("Download: " + url);
292
293 URLConnection conn = openConnectionWithCookies(url, currentReferer,
294 cookiesValues);
295
296 // Priority: GET over POST
297 Map<String, String> params = getParams;
298 if (getParams == null) {
299 params = postParams;
300 }
301
302 StringBuilder requestData = null;
303 if ((params != null || oauth != null)
304 && conn instanceof HttpURLConnection) {
305 if (params != null) {
306 requestData = new StringBuilder();
307 for (Map.Entry<String, String> param : params.entrySet()) {
308 if (requestData.length() != 0)
309 requestData.append('&');
310 requestData.append(URLEncoder.encode(param.getKey(),
311 "UTF-8"));
312 requestData.append('=');
313 requestData.append(URLEncoder.encode(
314 String.valueOf(param.getValue()), "UTF-8"));
315 }
316
317 if (getParams == null && postParams != null) {
318 ((HttpURLConnection) conn).setRequestMethod("POST");
319 }
320
321 conn.setRequestProperty("Content-Type",
322 "application/x-www-form-urlencoded");
323 conn.setRequestProperty("Content-Length",
324 Integer.toString(requestData.length()));
325 }
326
327 if (oauth != null) {
328 conn.setRequestProperty("Authorization", oauth);
329 }
330
331 if (requestData != null) {
332 conn.setDoOutput(true);
333 OutputStreamWriter writer = new OutputStreamWriter(
334 conn.getOutputStream());
335 try {
336 writer.write(requestData.toString());
337 writer.flush();
338 } finally {
339 writer.close();
340 }
341 }
342 }
343
344 // Manual redirection, much better for POST data
345 if (conn instanceof HttpURLConnection) {
346 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
347 }
348
349 conn.connect();
350
351 // Check if redirect
352 // BEWARE! POST data cannot be redirected (some webservers complain) for
353 // HTTP codes 302 and 303
354 if (conn instanceof HttpURLConnection) {
355 int repCode = 0;
356 try {
357 // Can fail in some circumstances
358 repCode = ((HttpURLConnection) conn).getResponseCode();
359 } catch (IOException e) {
360 }
361
362 if (repCode / 100 == 3) {
363 String newUrl = conn.getHeaderField("Location");
364 return open(new URL(newUrl), originalUrl, currentReferer,
365 cookiesValues, //
366 (repCode == 302 || repCode == 303) ? null : postParams, //
367 getParams, oauth, stable);
368 }
369 }
370
371 try {
372 InputStream in = conn.getInputStream();
373 if ("gzip".equals(conn.getContentEncoding())) {
374 in = new GZIPInputStream(in);
375 }
376
377 if (in == null) {
378 throw new IOException("No InputStream!");
379 }
380
381 if (cache != null) {
382 String size = conn.getContentLength() < 0 ? "unknown size"
383 : StringUtils.formatNumber(conn.getContentLength())
384 + "bytes";
385 tracer.trace("Save to cache (" + size + "): " + originalUrl);
386 try {
387 try {
388 long bytes = cache.save(in, originalUrl);
389 tracer.trace("Saved to cache: "
390 + StringUtils.formatNumber(bytes) + "bytes");
391 } finally {
392 in.close();
393 }
394 in = cache.load(originalUrl, true, true);
395 } catch (IOException e) {
396 tracer.error(new IOException(
397 "Cannot save URL to cache, will ignore cache: "
398 + url, e));
399 }
400 }
401
402 if (in == null) {
403 throw new IOException(
404 "Cannot retrieve the file after storing it in the cache (??)");
405 }
406
407 return in;
408 } catch (IOException e) {
409 throw new IOException(String.format(
410 "Cannot find %s (current URL: %s)", originalUrl, url), e);
411 }
412 }
413
414 /**
415 * Open a connection on the given {@link URL}, and manage the cookies that
416 * come with it.
417 *
418 * @param url
419 * the {@link URL} to open
420 *
421 * @return the connection
422 *
423 * @throws IOException
424 * in case of I/O error
425 */
426 private URLConnection openConnectionWithCookies(URL url,
427 URL currentReferer, Map<String, String> cookiesValues)
428 throws IOException {
429 URLConnection conn = url.openConnection();
430
431 String cookies = generateCookies(cookiesValues);
432 if (cookies != null && !cookies.isEmpty()) {
433 conn.setRequestProperty("Cookie", cookies);
434 }
435
436 if (UA != null) {
437 conn.setRequestProperty("User-Agent", UA);
438 }
439 conn.setRequestProperty("Accept-Encoding", "gzip");
440 conn.setRequestProperty("Accept", "*/*");
441 conn.setRequestProperty("Charset", "utf-8");
442
443 if (currentReferer != null) {
444 conn.setRequestProperty("Referer", currentReferer.toString());
445 conn.setRequestProperty("Host", currentReferer.getHost());
446 }
447
448 return conn;
449 }
450
451 /**
452 * Generate the cookie {@link String} from the local {@link CookieStore} so
453 * it is ready to be passed.
454 *
455 * @return the cookie
456 */
457 private String generateCookies(Map<String, String> cookiesValues) {
458 StringBuilder builder = new StringBuilder();
459 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
460 if (builder.length() > 0) {
461 builder.append(';');
462 }
463
464 builder.append(cookie.toString());
465 }
466
467 if (cookiesValues != null) {
468 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
469 if (builder.length() > 0) {
470 builder.append(';');
471 }
472 builder.append(set.getKey());
473 builder.append('=');
474 builder.append(set.getValue());
475 }
476 }
477
478 return builder.toString();
479 }
480 }