Version 5.0.0
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStreamWriter;
6 import java.net.CookieHandler;
7 import java.net.CookieManager;
8 import java.net.CookiePolicy;
9 import java.net.CookieStore;
10 import java.net.HttpCookie;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.net.URLEncoder;
15 import java.util.Map;
16 import java.util.zip.GZIPInputStream;
17
18 /**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28 public class Downloader {
29 private String UA;
30 private CookieManager cookies;
31 private TraceHandler tracer = new TraceHandler();
32 private Cache cache;
33
34 /**
35 * Create a new {@link Downloader}.
36 *
37 * @param UA
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
41 * only (!)
42 */
43 public Downloader(String UA) {
44 this(UA, null);
45 }
46
47 /**
48 * Create a new {@link Downloader}.
49 *
50 * @param UA
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
54 * only (!)
55 * @param cache
56 * the {@link Cache} to use for all access (can be NULL)
57 */
58 public Downloader(String UA, Cache cache) {
59 this.UA = UA;
60
61 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
62 CookieHandler.setDefault(cookies);
63
64 setCache(cache);
65 }
66
67 /**
68 * The traces handler for this {@link Cache}.
69 *
70 * @return the traces handler
71 */
72 public TraceHandler getTraceHandler() {
73 return tracer;
74 }
75
76 /**
77 * The traces handler for this {@link Cache}.
78 *
79 * @param tracer
80 * the new traces handler
81 */
82 public void setTraceHandler(TraceHandler tracer) {
83 if (tracer == null) {
84 tracer = new TraceHandler(false, false, false);
85 }
86
87 this.tracer = tracer;
88 }
89
90 /**
91 * The {@link Cache} to use for all access (can be NULL).
92 *
93 * @return the cache
94 */
95 public Cache getCache() {
96 return cache;
97 }
98
99 /**
100 * The {@link Cache} to use for all access (can be NULL).
101 *
102 * @param cache
103 * the new cache
104 */
105 public void setCache(Cache cache) {
106 this.cache = cache;
107 }
108
109 /**
110 * Clear all the cookies currently in the jar.
111 * <p>
112 * As long as you don't, the cookies are kept.
113 */
114 public void clearCookies() {
115 cookies.getCookieStore().removeAll();
116 }
117
118 /**
119 * Open the given {@link URL} and update the cookies.
120 *
121 * @param url
122 * the {@link URL} to open
123 * @return the {@link InputStream} of the opened page
124 *
125 * @throws IOException
126 * in case of I/O error
127 **/
128 public InputStream open(URL url) throws IOException {
129 return open(url, false);
130 }
131
132 /**
133 * Open the given {@link URL} and update the cookies.
134 *
135 * @param url
136 * the {@link URL} to open
137 * @param stable
138 * stable a stable file (that doesn't change too often) --
139 * parameter used to check if the file is too old to keep or not
140 * in the cache (default is false)
141 *
142 * @return the {@link InputStream} of the opened page
143 *
144 * @throws IOException
145 * in case of I/O error
146 **/
147 public InputStream open(URL url, boolean stable) throws IOException {
148 return open(url, url, url, null, null, null, null, stable);
149 }
150
151 /**
152 * Open the given {@link URL} and update the cookies.
153 *
154 * @param url
155 * the {@link URL} to open
156 * @param currentReferer
157 * the current referer, for websites that needs this info
158 * @param cookiesValues
159 * the cookies
160 * @param postParams
161 * the POST parameters
162 * @param getParams
163 * the GET parameters (priority over POST)
164 * @param oauth
165 * OAuth authorization (aka, "bearer XXXXXXX")
166 *
167 * @return the {@link InputStream} of the opened page
168 *
169 * @throws IOException
170 * in case of I/O error
171 */
172 public InputStream open(URL url, URL currentReferer,
173 Map<String, String> cookiesValues, Map<String, String> postParams,
174 Map<String, String> getParams, String oauth) throws IOException {
175 return open(url, currentReferer, cookiesValues, postParams, getParams,
176 oauth, false);
177 }
178
179 /**
180 * Open the given {@link URL} and update the cookies.
181 *
182 * @param url
183 * the {@link URL} to open
184 * @param currentReferer
185 * the current referer, for websites that needs this info
186 * @param cookiesValues
187 * the cookies
188 * @param postParams
189 * the POST parameters
190 * @param getParams
191 * the GET parameters (priority over POST)
192 * @param oauth
193 * OAuth authorization (aka, "bearer XXXXXXX")
194 * @param stable
195 * stable a stable file (that doesn't change too often) --
196 * parameter used to check if the file is too old to keep or not
197 * in the cache (default is false)
198 *
199 * @return the {@link InputStream} of the opened page
200 *
201 * @throws IOException
202 * in case of I/O error
203 */
204 public InputStream open(URL url, URL currentReferer,
205 Map<String, String> cookiesValues, Map<String, String> postParams,
206 Map<String, String> getParams, String oauth, boolean stable)
207 throws IOException {
208 return open(url, url, currentReferer, cookiesValues, postParams,
209 getParams, oauth, stable);
210 }
211
212 /**
213 * Open the given {@link URL} and update the cookies.
214 *
215 * @param url
216 * the {@link URL} to open
217 * @param originalUrl
218 * the original {@link URL} before any redirection occurs, which
219 * is also used for the cache ID if needed (so we can retrieve
220 * the content with this URL if needed)
221 * @param currentReferer
222 * the current referer, for websites that needs this info
223 * @param cookiesValues
224 * the cookies
225 * @param postParams
226 * the POST parameters
227 * @param getParams
228 * the GET parameters (priority over POST)
229 * @param oauth
230 * OAuth authorisation (aka, "bearer XXXXXXX")
231 * @param stable
232 * a stable file (that doesn't change too often) -- parameter
233 * used to check if the file is too old to keep or not in the
234 * cache
235 *
236 * @return the {@link InputStream} of the opened page
237 *
238 * @throws IOException
239 * in case of I/O error
240 */
241 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
242 Map<String, String> cookiesValues, Map<String, String> postParams,
243 Map<String, String> getParams, String oauth, boolean stable)
244 throws IOException {
245
246 tracer.trace("Request: " + url);
247
248 if (cache != null) {
249 InputStream in = cache.load(originalUrl, false, stable);
250 if (in != null) {
251 tracer.trace("Use the cache: " + url);
252 tracer.trace("Original URL : " + originalUrl);
253 return in;
254 }
255 }
256
257 tracer.trace("Download: " + url);
258
259 URLConnection conn = openConnectionWithCookies(url, currentReferer,
260 cookiesValues);
261
262 // Priority: GET over POST
263 Map<String, String> params = getParams;
264 if (getParams == null) {
265 params = postParams;
266 }
267
268 StringBuilder requestData = null;
269 if ((params != null || oauth != null)
270 && conn instanceof HttpURLConnection) {
271 if (params != null) {
272 requestData = new StringBuilder();
273 for (Map.Entry<String, String> param : params.entrySet()) {
274 if (requestData.length() != 0)
275 requestData.append('&');
276 requestData.append(URLEncoder.encode(param.getKey(),
277 "UTF-8"));
278 requestData.append('=');
279 requestData.append(URLEncoder.encode(
280 String.valueOf(param.getValue()), "UTF-8"));
281 }
282
283 if (getParams == null && postParams != null) {
284 ((HttpURLConnection) conn).setRequestMethod("POST");
285 }
286
287 conn.setRequestProperty("Content-Type",
288 "application/x-www-form-urlencoded");
289 conn.setRequestProperty("Content-Length",
290 Integer.toString(requestData.length()));
291 }
292
293 if (oauth != null) {
294 conn.setRequestProperty("Authorization", oauth);
295 }
296
297 if (requestData != null) {
298 conn.setDoOutput(true);
299 OutputStreamWriter writer = new OutputStreamWriter(
300 conn.getOutputStream());
301 try {
302 writer.write(requestData.toString());
303 writer.flush();
304 } finally {
305 writer.close();
306 }
307 }
308 }
309
310 // Manual redirection, much better for POST data
311 if (conn instanceof HttpURLConnection) {
312 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
313 }
314
315 conn.connect();
316
317 // Check if redirect
318 // BEWARE! POST data cannot be redirected (some webservers complain) for
319 // HTTP codes 302 and 303
320 if (conn instanceof HttpURLConnection) {
321 int repCode = 0;
322 try {
323 // Can fail in some circumstances
324 repCode = ((HttpURLConnection) conn).getResponseCode();
325 } catch (IOException e) {
326 }
327
328 if (repCode / 100 == 3) {
329 String newUrl = conn.getHeaderField("Location");
330 return open(new URL(newUrl), originalUrl, currentReferer,
331 cookiesValues, //
332 (repCode == 302 || repCode == 303) ? null : postParams, //
333 getParams, oauth, stable);
334 }
335 }
336
337 try {
338 InputStream in = conn.getInputStream();
339 if ("gzip".equals(conn.getContentEncoding())) {
340 in = new GZIPInputStream(in);
341 }
342
343 if (in == null) {
344 throw new IOException("No InputStream!");
345 }
346
347 if (cache != null) {
348 String size = conn.getContentLength() < 0 ? "unknown size"
349 : StringUtils.formatNumber(conn.getContentLength())
350 + "bytes";
351 tracer.trace("Save to cache (" + size + "): " + originalUrl);
352 try {
353 try {
354 long bytes = cache.save(in, originalUrl);
355 tracer.trace("Saved to cache: "
356 + StringUtils.formatNumber(bytes) + "bytes");
357 } finally {
358 in.close();
359 }
360 in = cache.load(originalUrl, true, true);
361 } catch (IOException e) {
362 tracer.error(new IOException(
363 "Cannot save URL to cache, will ignore cache: "
364 + url, e));
365 }
366 }
367
368 return in;
369 } catch (IOException e) {
370 throw new IOException(String.format(
371 "Cannot find %s (current URL: %s)", originalUrl, url), e);
372 }
373 }
374
375 /**
376 * Open a connection on the given {@link URL}, and manage the cookies that
377 * come with it.
378 *
379 * @param url
380 * the {@link URL} to open
381 *
382 * @return the connection
383 *
384 * @throws IOException
385 * in case of I/O error
386 */
387 private URLConnection openConnectionWithCookies(URL url,
388 URL currentReferer, Map<String, String> cookiesValues)
389 throws IOException {
390 URLConnection conn = url.openConnection();
391
392 String cookies = generateCookies(cookiesValues);
393 if (cookies != null && !cookies.isEmpty()) {
394 conn.setRequestProperty("Cookie", cookies);
395 }
396
397 conn.setRequestProperty("User-Agent", UA);
398 conn.setRequestProperty("Accept-Encoding", "gzip");
399 conn.setRequestProperty("Accept", "*/*");
400 conn.setRequestProperty("Charset", "utf-8");
401
402 if (currentReferer != null) {
403 conn.setRequestProperty("Referer", currentReferer.toString());
404 conn.setRequestProperty("Host", currentReferer.getHost());
405 }
406
407 return conn;
408 }
409
410 /**
411 * Generate the cookie {@link String} from the local {@link CookieStore} so
412 * it is ready to be passed.
413 *
414 * @return the cookie
415 */
416 private String generateCookies(Map<String, String> cookiesValues) {
417 StringBuilder builder = new StringBuilder();
418 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
419 if (builder.length() > 0) {
420 builder.append(';');
421 }
422
423 builder.append(cookie.toString());
424 }
425
426 if (cookiesValues != null) {
427 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
428 if (builder.length() > 0) {
429 builder.append(';');
430 }
431 builder.append(set.getKey());
432 builder.append('=');
433 builder.append(set.getValue());
434 }
435 }
436
437 return builder.toString();
438 }
439 }