Add 'src/jexer/' from commit 'cf01c92f5809a0732409e280fb0f32f27393618d'
[nikiroo-utils.git] / src / be / nikiroo / utils / Downloader.java
CommitLineData
8816d2f7
NR
1package be.nikiroo.utils;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.io.OutputStreamWriter;
6import java.net.CookieHandler;
7import java.net.CookieManager;
8import java.net.CookiePolicy;
9import java.net.CookieStore;
10import java.net.HttpCookie;
11import java.net.HttpURLConnection;
12import java.net.URL;
13import java.net.URLConnection;
14import java.net.URLEncoder;
15import java.util.Map;
16import java.util.zip.GZIPInputStream;
17
18/**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28public class Downloader {
29 private String UA;
30 private CookieManager cookies;
530d4062 31 private TraceHandler tracer = new TraceHandler();
f6e8d60d 32 private Cache cache;
1002f328 33 private boolean offline;
8816d2f7
NR
34
35 /**
36 * Create a new {@link Downloader}.
37 *
38 * @param UA
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!)
43 */
44 public Downloader(String UA) {
f6e8d60d
NR
45 this(UA, null);
46 }
47
48 /**
49 * Create a new {@link Downloader}.
50 *
51 * @param UA
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!)
56 * @param cache
57 * the {@link Cache} to use for all access (can be NULL)
58 */
59 public Downloader(String UA, Cache cache) {
8816d2f7
NR
60 this.UA = UA;
61
15f13472 62 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
8816d2f7 63 CookieHandler.setDefault(cookies);
f6e8d60d 64
15f13472 65 setCache(cache);
8816d2f7 66 }
1002f328
NR
67
68 /**
69 * This {@link Downloader} is forbidden to try and connect to the network.
70 * <p>
71 * If TRUE, it will only check the cache if any.
72 * <p>
73 * Default is FALSE.
74 *
75 * @return TRUE if offline
76 */
77 public boolean isOffline() {
78 return offline;
79 }
80
81 /**
82 * This {@link Downloader} is forbidden to try and connect to the network.
83 * <p>
84 * If TRUE, it will only check the cache if any.
85 * <p>
86 * Default is FALSE.
87 *
88 * @param offline TRUE for offline, FALSE for online
89 */
90 public void setOffline(boolean offline) {
91 this.offline = offline;
92 }
8816d2f7 93
530d4062
NR
94 /**
95 * The traces handler for this {@link Cache}.
96 *
97 * @return the traces handler
98 */
99 public TraceHandler getTraceHandler() {
100 return tracer;
101 }
102
103 /**
104 * The traces handler for this {@link Cache}.
105 *
106 * @param tracer
107 * the new traces handler
108 */
109 public void setTraceHandler(TraceHandler tracer) {
80500544
NR
110 if (tracer == null) {
111 tracer = new TraceHandler(false, false, false);
112 }
113
530d4062
NR
114 this.tracer = tracer;
115 }
116
3052163b
NR
117 /**
118 * The {@link Cache} to use for all access (can be NULL).
119 *
120 * @return the cache
121 */
122 public Cache getCache() {
123 return cache;
124 }
125
126 /**
127 * The {@link Cache} to use for all access (can be NULL).
128 *
129 * @param cache
130 * the new cache
131 */
132 public void setCache(Cache cache) {
133 this.cache = cache;
134 }
135
8816d2f7
NR
136 /**
137 * Clear all the cookies currently in the jar.
138 * <p>
139 * As long as you don't, the cookies are kept.
140 */
141 public void clearCookies() {
142 cookies.getCookieStore().removeAll();
143 }
144
145 /**
146 * Open the given {@link URL} and update the cookies.
147 *
148 * @param url
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
151 *
152 * @throws IOException
153 * in case of I/O error
154 **/
155 public InputStream open(URL url) throws IOException {
f6e8d60d
NR
156 return open(url, false);
157 }
158
159 /**
160 * Open the given {@link URL} and update the cookies.
161 *
162 * @param url
163 * the {@link URL} to open
164 * @param stable
165 * stable a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
168 *
169 * @return the {@link InputStream} of the opened page
170 *
171 * @throws IOException
172 * in case of I/O error
173 **/
174 public InputStream open(URL url, boolean stable) throws IOException {
175 return open(url, url, url, null, null, null, null, stable);
8816d2f7
NR
176 }
177
178 /**
179 * Open the given {@link URL} and update the cookies.
180 *
181 * @param url
182 * the {@link URL} to open
530d4062
NR
183 * @param currentReferer
184 * the current referer, for websites that needs this info
185 * @param cookiesValues
186 * the cookies
8816d2f7
NR
187 * @param postParams
188 * the POST parameters
189 * @param getParams
190 * the GET parameters (priority over POST)
191 * @param oauth
192 * OAuth authorization (aka, "bearer XXXXXXX")
193 *
194 * @return the {@link InputStream} of the opened page
195 *
196 * @throws IOException
1002f328 197 * in case of I/O error (including offline mode + not in cache)
8816d2f7
NR
198 */
199 public InputStream open(URL url, URL currentReferer,
200 Map<String, String> cookiesValues, Map<String, String> postParams,
201 Map<String, String> getParams, String oauth) throws IOException {
f6e8d60d
NR
202 return open(url, currentReferer, cookiesValues, postParams, getParams,
203 oauth, false);
204 }
205
206 /**
207 * Open the given {@link URL} and update the cookies.
208 *
209 * @param url
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that needs this info
213 * @param cookiesValues
214 * the cookies
215 * @param postParams
216 * the POST parameters
217 * @param getParams
218 * the GET parameters (priority over POST)
219 * @param oauth
220 * OAuth authorization (aka, "bearer XXXXXXX")
221 * @param stable
222 * stable a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
225 *
226 * @return the {@link InputStream} of the opened page
227 *
228 * @throws IOException
1002f328 229 * in case of I/O error (including offline mode + not in cache)
f6e8d60d
NR
230 */
231 public InputStream open(URL url, URL currentReferer,
232 Map<String, String> cookiesValues, Map<String, String> postParams,
233 Map<String, String> getParams, String oauth, boolean stable)
234 throws IOException {
8816d2f7 235 return open(url, url, currentReferer, cookiesValues, postParams,
f6e8d60d 236 getParams, oauth, stable);
8816d2f7
NR
237 }
238
8816d2f7
NR
239 /**
240 * Open the given {@link URL} and update the cookies.
241 *
242 * @param url
243 * the {@link URL} to open
244 * @param originalUrl
ae7d1a83
NR
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that needs this info
250 * @param cookiesValues
251 * the cookies
8816d2f7
NR
252 * @param postParams
253 * the POST parameters
254 * @param getParams
255 * the GET parameters (priority over POST)
256 * @param oauth
257 * OAuth authorisation (aka, "bearer XXXXXXX")
f6e8d60d
NR
258 * @param stable
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
261 * cache
262 *
8816d2f7
NR
263 * @return the {@link InputStream} of the opened page
264 *
265 * @throws IOException
1002f328 266 * in case of I/O error (including offline mode + not in cache)
8816d2f7 267 */
ae7d1a83
NR
268 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
269 Map<String, String> cookiesValues, Map<String, String> postParams,
270 Map<String, String> getParams, String oauth, boolean stable)
271 throws IOException {
f6e8d60d
NR
272
273 tracer.trace("Request: " + url);
274
275 if (cache != null) {
ae7d1a83 276 InputStream in = cache.load(originalUrl, false, stable);
f6e8d60d 277 if (in != null) {
223aa0d4
NR
278 tracer.trace("Use the cache: " + url);
279 tracer.trace("Original URL : " + originalUrl);
f6e8d60d
NR
280 return in;
281 }
282 }
8816d2f7 283
291f7dbd
NR
284 String protocol = originalUrl == null ? null : originalUrl
285 .getProtocol();
286 if (isOffline() && !"file".equalsIgnoreCase(protocol)) {
1002f328
NR
287 tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url);
288 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url);
289 }
290
530d4062 291 tracer.trace("Download: " + url);
8816d2f7
NR
292
293 URLConnection conn = openConnectionWithCookies(url, currentReferer,
294 cookiesValues);
295
296 // Priority: GET over POST
297 Map<String, String> params = getParams;
298 if (getParams == null) {
299 params = postParams;
300 }
301
15f13472 302 StringBuilder requestData = null;
8816d2f7
NR
303 if ((params != null || oauth != null)
304 && conn instanceof HttpURLConnection) {
8816d2f7
NR
305 if (params != null) {
306 requestData = new StringBuilder();
307 for (Map.Entry<String, String> param : params.entrySet()) {
308 if (requestData.length() != 0)
309 requestData.append('&');
310 requestData.append(URLEncoder.encode(param.getKey(),
311 "UTF-8"));
312 requestData.append('=');
313 requestData.append(URLEncoder.encode(
314 String.valueOf(param.getValue()), "UTF-8"));
315 }
316
8816d2f7
NR
317 if (getParams == null && postParams != null) {
318 ((HttpURLConnection) conn).setRequestMethod("POST");
319 }
320
321 conn.setRequestProperty("Content-Type",
322 "application/x-www-form-urlencoded");
15f13472
NR
323 conn.setRequestProperty("Content-Length",
324 Integer.toString(requestData.length()));
8816d2f7
NR
325 }
326
327 if (oauth != null) {
328 conn.setRequestProperty("Authorization", oauth);
329 }
330
331 if (requestData != null) {
15f13472
NR
332 conn.setDoOutput(true);
333 OutputStreamWriter writer = new OutputStreamWriter(
334 conn.getOutputStream());
0988831f 335 try {
0988831f
NR
336 writer.write(requestData.toString());
337 writer.flush();
338 } finally {
15f13472 339 writer.close();
0988831f 340 }
8816d2f7
NR
341 }
342 }
343
15f13472
NR
344 // Manual redirection, much better for POST data
345 if (conn instanceof HttpURLConnection) {
346 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
347 }
348
8816d2f7
NR
349 conn.connect();
350
351 // Check if redirect
59654e2a
NR
352 // BEWARE! POST data cannot be redirected (some webservers complain) for
353 // HTTP codes 302 and 303
6149689f
NR
354 if (conn instanceof HttpURLConnection) {
355 int repCode = 0;
356 try {
357 // Can fail in some circumstances
358 repCode = ((HttpURLConnection) conn).getResponseCode();
359 } catch (IOException e) {
360 }
361
362 if (repCode / 100 == 3) {
363 String newUrl = conn.getHeaderField("Location");
364 return open(new URL(newUrl), originalUrl, currentReferer,
59654e2a
NR
365 cookiesValues, //
366 (repCode == 302 || repCode == 303) ? null : postParams, //
367 getParams, oauth, stable);
6149689f 368 }
8816d2f7
NR
369 }
370
59654e2a
NR
371 try {
372 InputStream in = conn.getInputStream();
373 if ("gzip".equals(conn.getContentEncoding())) {
374 in = new GZIPInputStream(in);
375 }
8816d2f7 376
59654e2a
NR
377 if (in == null) {
378 throw new IOException("No InputStream!");
379 }
380
381 if (cache != null) {
eee36623
NR
382 String size = conn.getContentLength() < 0 ? "unknown size"
383 : StringUtils.formatNumber(conn.getContentLength())
59654e2a
NR
384 + "bytes";
385 tracer.trace("Save to cache (" + size + "): " + originalUrl);
eb6dcdbf 386 try {
59654e2a
NR
387 try {
388 long bytes = cache.save(in, originalUrl);
389 tracer.trace("Saved to cache: "
390 + StringUtils.formatNumber(bytes) + "bytes");
391 } finally {
392 in.close();
393 }
394 in = cache.load(originalUrl, true, true);
395 } catch (IOException e) {
396 tracer.error(new IOException(
397 "Cannot save URL to cache, will ignore cache: "
398 + url, e));
eb6dcdbf 399 }
f6e8d60d 400 }
f6e8d60d 401
47c88873
NR
402 if (in == null) {
403 throw new IOException(
404 "Cannot retrieve the file after storing it in the cache (??)");
405 }
406
59654e2a
NR
407 return in;
408 } catch (IOException e) {
409 throw new IOException(String.format(
410 "Cannot find %s (current URL: %s)", originalUrl, url), e);
411 }
8816d2f7
NR
412 }
413
414 /**
415 * Open a connection on the given {@link URL}, and manage the cookies that
416 * come with it.
417 *
418 * @param url
419 * the {@link URL} to open
420 *
421 * @return the connection
422 *
423 * @throws IOException
424 * in case of I/O error
425 */
426 private URLConnection openConnectionWithCookies(URL url,
427 URL currentReferer, Map<String, String> cookiesValues)
428 throws IOException {
429 URLConnection conn = url.openConnection();
430
15f13472
NR
431 String cookies = generateCookies(cookiesValues);
432 if (cookies != null && !cookies.isEmpty()) {
433 conn.setRequestProperty("Cookie", cookies);
434 }
435
8816d2f7 436 conn.setRequestProperty("User-Agent", UA);
8816d2f7 437 conn.setRequestProperty("Accept-Encoding", "gzip");
15f13472 438 conn.setRequestProperty("Accept", "*/*");
59654e2a 439 conn.setRequestProperty("Charset", "utf-8");
15f13472 440
8816d2f7
NR
441 if (currentReferer != null) {
442 conn.setRequestProperty("Referer", currentReferer.toString());
443 conn.setRequestProperty("Host", currentReferer.getHost());
444 }
445
446 return conn;
447 }
448
449 /**
450 * Generate the cookie {@link String} from the local {@link CookieStore} so
451 * it is ready to be passed.
452 *
453 * @return the cookie
454 */
455 private String generateCookies(Map<String, String> cookiesValues) {
456 StringBuilder builder = new StringBuilder();
457 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
458 if (builder.length() > 0) {
459 builder.append(';');
460 }
461
8816d2f7
NR
462 builder.append(cookie.toString());
463 }
464
465 if (cookiesValues != null) {
466 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
467 if (builder.length() > 0) {
468 builder.append(';');
469 }
470 builder.append(set.getKey());
471 builder.append('=');
472 builder.append(set.getValue());
473 }
474 }
475
476 return builder.toString();
477 }
478}