Commit | Line | Data |
---|---|---|
8816d2f7 NR |
1 | package be.nikiroo.utils; |
2 | ||
3 | import java.io.IOException; | |
4 | import java.io.InputStream; | |
5 | import java.io.OutputStreamWriter; | |
6 | import java.net.CookieHandler; | |
7 | import java.net.CookieManager; | |
8 | import java.net.CookiePolicy; | |
9 | import java.net.CookieStore; | |
10 | import java.net.HttpCookie; | |
11 | import java.net.HttpURLConnection; | |
12 | import java.net.URL; | |
13 | import java.net.URLConnection; | |
14 | import java.net.URLEncoder; | |
15 | import java.util.Map; | |
16 | import java.util.zip.GZIPInputStream; | |
17 | ||
18 | /** | |
19 | * This class will help you download content from Internet Sites ({@link URL} | |
20 | * based). | |
21 | * <p> | |
22 | * It allows you to control some options often required on web sites that do not | |
23 | * want to simply serve HTML, but actively makes your life difficult with stupid | |
24 | * checks. | |
25 | * | |
26 | * @author niki | |
27 | */ | |
28 | public class Downloader { | |
29 | private String UA; | |
30 | private CookieManager cookies; | |
530d4062 | 31 | private TraceHandler tracer = new TraceHandler(); |
f6e8d60d | 32 | private Cache cache; |
1002f328 | 33 | private boolean offline; |
8816d2f7 NR |
34 | |
35 | /** | |
36 | * Create a new {@link Downloader}. | |
37 | * | |
38 | * @param UA | |
39 | * the User-Agent to use to download the resources -- note that | |
40 | * some websites require one, some actively blacklist real UAs | |
41 | * like the one from wget, some whitelist a couple of browsers | |
42 | * only (!) | |
43 | */ | |
44 | public Downloader(String UA) { | |
f6e8d60d NR |
45 | this(UA, null); |
46 | } | |
47 | ||
48 | /** | |
49 | * Create a new {@link Downloader}. | |
50 | * | |
51 | * @param UA | |
52 | * the User-Agent to use to download the resources -- note that | |
53 | * some websites require one, some actively blacklist real UAs | |
54 | * like the one from wget, some whitelist a couple of browsers | |
55 | * only (!) | |
56 | * @param cache | |
57 | * the {@link Cache} to use for all access (can be NULL) | |
58 | */ | |
59 | public Downloader(String UA, Cache cache) { | |
8816d2f7 NR |
60 | this.UA = UA; |
61 | ||
15f13472 | 62 | cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL); |
8816d2f7 | 63 | CookieHandler.setDefault(cookies); |
f6e8d60d | 64 | |
15f13472 | 65 | setCache(cache); |
8816d2f7 | 66 | } |
1002f328 NR |
67 | |
68 | /** | |
69 | * This {@link Downloader} is forbidden to try and connect to the network. | |
70 | * <p> | |
71 | * If TRUE, it will only check the cache if any. | |
72 | * <p> | |
73 | * Default is FALSE. | |
74 | * | |
75 | * @return TRUE if offline | |
76 | */ | |
77 | public boolean isOffline() { | |
78 | return offline; | |
79 | } | |
80 | ||
81 | /** | |
82 | * This {@link Downloader} is forbidden to try and connect to the network. | |
83 | * <p> | |
84 | * If TRUE, it will only check the cache if any. | |
85 | * <p> | |
86 | * Default is FALSE. | |
87 | * | |
88 | * @param offline TRUE for offline, FALSE for online | |
89 | */ | |
90 | public void setOffline(boolean offline) { | |
91 | this.offline = offline; | |
92 | } | |
8816d2f7 | 93 | |
530d4062 NR |
94 | /** |
95 | * The traces handler for this {@link Cache}. | |
96 | * | |
97 | * @return the traces handler | |
98 | */ | |
99 | public TraceHandler getTraceHandler() { | |
100 | return tracer; | |
101 | } | |
102 | ||
103 | /** | |
104 | * The traces handler for this {@link Cache}. | |
105 | * | |
106 | * @param tracer | |
107 | * the new traces handler | |
108 | */ | |
109 | public void setTraceHandler(TraceHandler tracer) { | |
80500544 NR |
110 | if (tracer == null) { |
111 | tracer = new TraceHandler(false, false, false); | |
112 | } | |
113 | ||
530d4062 NR |
114 | this.tracer = tracer; |
115 | } | |
116 | ||
3052163b NR |
117 | /** |
118 | * The {@link Cache} to use for all access (can be NULL). | |
119 | * | |
120 | * @return the cache | |
121 | */ | |
122 | public Cache getCache() { | |
123 | return cache; | |
124 | } | |
125 | ||
126 | /** | |
127 | * The {@link Cache} to use for all access (can be NULL). | |
128 | * | |
129 | * @param cache | |
130 | * the new cache | |
131 | */ | |
132 | public void setCache(Cache cache) { | |
133 | this.cache = cache; | |
134 | } | |
135 | ||
8816d2f7 NR |
136 | /** |
137 | * Clear all the cookies currently in the jar. | |
138 | * <p> | |
139 | * As long as you don't, the cookies are kept. | |
140 | */ | |
141 | public void clearCookies() { | |
142 | cookies.getCookieStore().removeAll(); | |
143 | } | |
144 | ||
145 | /** | |
146 | * Open the given {@link URL} and update the cookies. | |
147 | * | |
148 | * @param url | |
149 | * the {@link URL} to open | |
150 | * @return the {@link InputStream} of the opened page | |
151 | * | |
152 | * @throws IOException | |
153 | * in case of I/O error | |
154 | **/ | |
155 | public InputStream open(URL url) throws IOException { | |
f6e8d60d NR |
156 | return open(url, false); |
157 | } | |
158 | ||
159 | /** | |
160 | * Open the given {@link URL} and update the cookies. | |
161 | * | |
162 | * @param url | |
163 | * the {@link URL} to open | |
164 | * @param stable | |
165 | * stable a stable file (that doesn't change too often) -- | |
166 | * parameter used to check if the file is too old to keep or not | |
167 | * in the cache (default is false) | |
168 | * | |
169 | * @return the {@link InputStream} of the opened page | |
170 | * | |
171 | * @throws IOException | |
172 | * in case of I/O error | |
173 | **/ | |
174 | public InputStream open(URL url, boolean stable) throws IOException { | |
175 | return open(url, url, url, null, null, null, null, stable); | |
8816d2f7 NR |
176 | } |
177 | ||
178 | /** | |
179 | * Open the given {@link URL} and update the cookies. | |
180 | * | |
181 | * @param url | |
182 | * the {@link URL} to open | |
530d4062 NR |
183 | * @param currentReferer |
184 | * the current referer, for websites that needs this info | |
185 | * @param cookiesValues | |
186 | * the cookies | |
8816d2f7 NR |
187 | * @param postParams |
188 | * the POST parameters | |
189 | * @param getParams | |
190 | * the GET parameters (priority over POST) | |
191 | * @param oauth | |
192 | * OAuth authorization (aka, "bearer XXXXXXX") | |
193 | * | |
194 | * @return the {@link InputStream} of the opened page | |
195 | * | |
196 | * @throws IOException | |
1002f328 | 197 | * in case of I/O error (including offline mode + not in cache) |
8816d2f7 NR |
198 | */ |
199 | public InputStream open(URL url, URL currentReferer, | |
200 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
201 | Map<String, String> getParams, String oauth) throws IOException { | |
f6e8d60d NR |
202 | return open(url, currentReferer, cookiesValues, postParams, getParams, |
203 | oauth, false); | |
204 | } | |
205 | ||
206 | /** | |
207 | * Open the given {@link URL} and update the cookies. | |
208 | * | |
209 | * @param url | |
210 | * the {@link URL} to open | |
211 | * @param currentReferer | |
212 | * the current referer, for websites that needs this info | |
213 | * @param cookiesValues | |
214 | * the cookies | |
215 | * @param postParams | |
216 | * the POST parameters | |
217 | * @param getParams | |
218 | * the GET parameters (priority over POST) | |
219 | * @param oauth | |
220 | * OAuth authorization (aka, "bearer XXXXXXX") | |
221 | * @param stable | |
222 | * stable a stable file (that doesn't change too often) -- | |
223 | * parameter used to check if the file is too old to keep or not | |
224 | * in the cache (default is false) | |
225 | * | |
226 | * @return the {@link InputStream} of the opened page | |
227 | * | |
228 | * @throws IOException | |
1002f328 | 229 | * in case of I/O error (including offline mode + not in cache) |
f6e8d60d NR |
230 | */ |
231 | public InputStream open(URL url, URL currentReferer, | |
232 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
233 | Map<String, String> getParams, String oauth, boolean stable) | |
234 | throws IOException { | |
8816d2f7 | 235 | return open(url, url, currentReferer, cookiesValues, postParams, |
f6e8d60d | 236 | getParams, oauth, stable); |
8816d2f7 NR |
237 | } |
238 | ||
8816d2f7 NR |
239 | /** |
240 | * Open the given {@link URL} and update the cookies. | |
241 | * | |
242 | * @param url | |
243 | * the {@link URL} to open | |
244 | * @param originalUrl | |
ae7d1a83 NR |
245 | * the original {@link URL} before any redirection occurs, which |
246 | * is also used for the cache ID if needed (so we can retrieve | |
247 | * the content with this URL if needed) | |
248 | * @param currentReferer | |
249 | * the current referer, for websites that needs this info | |
250 | * @param cookiesValues | |
251 | * the cookies | |
8816d2f7 NR |
252 | * @param postParams |
253 | * the POST parameters | |
254 | * @param getParams | |
255 | * the GET parameters (priority over POST) | |
256 | * @param oauth | |
257 | * OAuth authorisation (aka, "bearer XXXXXXX") | |
f6e8d60d NR |
258 | * @param stable |
259 | * a stable file (that doesn't change too often) -- parameter | |
260 | * used to check if the file is too old to keep or not in the | |
261 | * cache | |
262 | * | |
8816d2f7 NR |
263 | * @return the {@link InputStream} of the opened page |
264 | * | |
265 | * @throws IOException | |
1002f328 | 266 | * in case of I/O error (including offline mode + not in cache) |
8816d2f7 | 267 | */ |
ae7d1a83 NR |
268 | public InputStream open(URL url, final URL originalUrl, URL currentReferer, |
269 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
270 | Map<String, String> getParams, String oauth, boolean stable) | |
271 | throws IOException { | |
f6e8d60d NR |
272 | |
273 | tracer.trace("Request: " + url); | |
274 | ||
275 | if (cache != null) { | |
ae7d1a83 | 276 | InputStream in = cache.load(originalUrl, false, stable); |
f6e8d60d | 277 | if (in != null) { |
223aa0d4 NR |
278 | tracer.trace("Use the cache: " + url); |
279 | tracer.trace("Original URL : " + originalUrl); | |
f6e8d60d NR |
280 | return in; |
281 | } | |
282 | } | |
8816d2f7 | 283 | |
291f7dbd NR |
284 | String protocol = originalUrl == null ? null : originalUrl |
285 | .getProtocol(); | |
286 | if (isOffline() && !"file".equalsIgnoreCase(protocol)) { | |
1002f328 NR |
287 | tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url); |
288 | throw new IOException("Downloader is currently OFFLINE, cannot download: " + url); | |
289 | } | |
290 | ||
530d4062 | 291 | tracer.trace("Download: " + url); |
8816d2f7 NR |
292 | |
293 | URLConnection conn = openConnectionWithCookies(url, currentReferer, | |
294 | cookiesValues); | |
295 | ||
296 | // Priority: GET over POST | |
297 | Map<String, String> params = getParams; | |
298 | if (getParams == null) { | |
299 | params = postParams; | |
300 | } | |
301 | ||
15f13472 | 302 | StringBuilder requestData = null; |
8816d2f7 NR |
303 | if ((params != null || oauth != null) |
304 | && conn instanceof HttpURLConnection) { | |
8816d2f7 NR |
305 | if (params != null) { |
306 | requestData = new StringBuilder(); | |
307 | for (Map.Entry<String, String> param : params.entrySet()) { | |
308 | if (requestData.length() != 0) | |
309 | requestData.append('&'); | |
310 | requestData.append(URLEncoder.encode(param.getKey(), | |
311 | "UTF-8")); | |
312 | requestData.append('='); | |
313 | requestData.append(URLEncoder.encode( | |
314 | String.valueOf(param.getValue()), "UTF-8")); | |
315 | } | |
316 | ||
8816d2f7 NR |
317 | if (getParams == null && postParams != null) { |
318 | ((HttpURLConnection) conn).setRequestMethod("POST"); | |
319 | } | |
320 | ||
321 | conn.setRequestProperty("Content-Type", | |
322 | "application/x-www-form-urlencoded"); | |
15f13472 NR |
323 | conn.setRequestProperty("Content-Length", |
324 | Integer.toString(requestData.length())); | |
8816d2f7 NR |
325 | } |
326 | ||
327 | if (oauth != null) { | |
328 | conn.setRequestProperty("Authorization", oauth); | |
329 | } | |
330 | ||
331 | if (requestData != null) { | |
15f13472 NR |
332 | conn.setDoOutput(true); |
333 | OutputStreamWriter writer = new OutputStreamWriter( | |
334 | conn.getOutputStream()); | |
0988831f | 335 | try { |
0988831f NR |
336 | writer.write(requestData.toString()); |
337 | writer.flush(); | |
338 | } finally { | |
15f13472 | 339 | writer.close(); |
0988831f | 340 | } |
8816d2f7 NR |
341 | } |
342 | } | |
343 | ||
15f13472 NR |
344 | // Manual redirection, much better for POST data |
345 | if (conn instanceof HttpURLConnection) { | |
346 | ((HttpURLConnection) conn).setInstanceFollowRedirects(false); | |
347 | } | |
348 | ||
8816d2f7 NR |
349 | conn.connect(); |
350 | ||
351 | // Check if redirect | |
59654e2a NR |
352 | // BEWARE! POST data cannot be redirected (some webservers complain) for |
353 | // HTTP codes 302 and 303 | |
6149689f NR |
354 | if (conn instanceof HttpURLConnection) { |
355 | int repCode = 0; | |
356 | try { | |
357 | // Can fail in some circumstances | |
358 | repCode = ((HttpURLConnection) conn).getResponseCode(); | |
359 | } catch (IOException e) { | |
360 | } | |
361 | ||
362 | if (repCode / 100 == 3) { | |
363 | String newUrl = conn.getHeaderField("Location"); | |
364 | return open(new URL(newUrl), originalUrl, currentReferer, | |
59654e2a NR |
365 | cookiesValues, // |
366 | (repCode == 302 || repCode == 303) ? null : postParams, // | |
367 | getParams, oauth, stable); | |
6149689f | 368 | } |
8816d2f7 NR |
369 | } |
370 | ||
59654e2a NR |
371 | try { |
372 | InputStream in = conn.getInputStream(); | |
373 | if ("gzip".equals(conn.getContentEncoding())) { | |
374 | in = new GZIPInputStream(in); | |
375 | } | |
8816d2f7 | 376 | |
59654e2a NR |
377 | if (in == null) { |
378 | throw new IOException("No InputStream!"); | |
379 | } | |
380 | ||
381 | if (cache != null) { | |
eee36623 NR |
382 | String size = conn.getContentLength() < 0 ? "unknown size" |
383 | : StringUtils.formatNumber(conn.getContentLength()) | |
59654e2a NR |
384 | + "bytes"; |
385 | tracer.trace("Save to cache (" + size + "): " + originalUrl); | |
eb6dcdbf | 386 | try { |
59654e2a NR |
387 | try { |
388 | long bytes = cache.save(in, originalUrl); | |
389 | tracer.trace("Saved to cache: " | |
390 | + StringUtils.formatNumber(bytes) + "bytes"); | |
391 | } finally { | |
392 | in.close(); | |
393 | } | |
394 | in = cache.load(originalUrl, true, true); | |
395 | } catch (IOException e) { | |
396 | tracer.error(new IOException( | |
397 | "Cannot save URL to cache, will ignore cache: " | |
398 | + url, e)); | |
eb6dcdbf | 399 | } |
f6e8d60d | 400 | } |
f6e8d60d | 401 | |
47c88873 NR |
402 | if (in == null) { |
403 | throw new IOException( | |
404 | "Cannot retrieve the file after storing it in the cache (??)"); | |
405 | } | |
406 | ||
59654e2a NR |
407 | return in; |
408 | } catch (IOException e) { | |
409 | throw new IOException(String.format( | |
410 | "Cannot find %s (current URL: %s)", originalUrl, url), e); | |
411 | } | |
8816d2f7 NR |
412 | } |
413 | ||
414 | /** | |
415 | * Open a connection on the given {@link URL}, and manage the cookies that | |
416 | * come with it. | |
417 | * | |
418 | * @param url | |
419 | * the {@link URL} to open | |
420 | * | |
421 | * @return the connection | |
422 | * | |
423 | * @throws IOException | |
424 | * in case of I/O error | |
425 | */ | |
426 | private URLConnection openConnectionWithCookies(URL url, | |
427 | URL currentReferer, Map<String, String> cookiesValues) | |
428 | throws IOException { | |
429 | URLConnection conn = url.openConnection(); | |
430 | ||
15f13472 NR |
431 | String cookies = generateCookies(cookiesValues); |
432 | if (cookies != null && !cookies.isEmpty()) { | |
433 | conn.setRequestProperty("Cookie", cookies); | |
434 | } | |
435 | ||
8816d2f7 | 436 | conn.setRequestProperty("User-Agent", UA); |
8816d2f7 | 437 | conn.setRequestProperty("Accept-Encoding", "gzip"); |
15f13472 | 438 | conn.setRequestProperty("Accept", "*/*"); |
59654e2a | 439 | conn.setRequestProperty("Charset", "utf-8"); |
15f13472 | 440 | |
8816d2f7 NR |
441 | if (currentReferer != null) { |
442 | conn.setRequestProperty("Referer", currentReferer.toString()); | |
443 | conn.setRequestProperty("Host", currentReferer.getHost()); | |
444 | } | |
445 | ||
446 | return conn; | |
447 | } | |
448 | ||
449 | /** | |
450 | * Generate the cookie {@link String} from the local {@link CookieStore} so | |
451 | * it is ready to be passed. | |
452 | * | |
453 | * @return the cookie | |
454 | */ | |
455 | private String generateCookies(Map<String, String> cookiesValues) { | |
456 | StringBuilder builder = new StringBuilder(); | |
457 | for (HttpCookie cookie : cookies.getCookieStore().getCookies()) { | |
458 | if (builder.length() > 0) { | |
459 | builder.append(';'); | |
460 | } | |
461 | ||
8816d2f7 NR |
462 | builder.append(cookie.toString()); |
463 | } | |
464 | ||
465 | if (cookiesValues != null) { | |
466 | for (Map.Entry<String, String> set : cookiesValues.entrySet()) { | |
467 | if (builder.length() > 0) { | |
468 | builder.append(';'); | |
469 | } | |
470 | builder.append(set.getKey()); | |
471 | builder.append('='); | |
472 | builder.append(set.getValue()); | |
473 | } | |
474 | } | |
475 | ||
476 | return builder.toString(); | |
477 | } | |
478 | } |