Improve cache + jdoc, traces
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.FileNotFoundException;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import java.io.OutputStreamWriter;
7 import java.net.CookieHandler;
8 import java.net.CookieManager;
9 import java.net.CookiePolicy;
10 import java.net.CookieStore;
11 import java.net.HttpCookie;
12 import java.net.HttpURLConnection;
13 import java.net.URL;
14 import java.net.URLConnection;
15 import java.net.URLEncoder;
16 import java.util.Map;
17 import java.util.zip.GZIPInputStream;
18
19 /**
20 * This class will help you download content from Internet Sites ({@link URL}
21 * based).
22 * <p>
23 * It allows you to control some options often required on web sites that do not
24 * want to simply serve HTML, but actively makes your life difficult with stupid
25 * checks.
26 *
27 * @author niki
28 */
29 public class Downloader {
30 private String UA;
31 private CookieManager cookies;
32 private TraceHandler tracer = new TraceHandler();
33 private Cache cache;
34
35 /**
36 * Create a new {@link Downloader}.
37 *
38 * @param UA
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!)
43 */
44 public Downloader(String UA) {
45 this(UA, null);
46 }
47
48 /**
49 * Create a new {@link Downloader}.
50 *
51 * @param UA
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!)
56 * @param cache
57 * the {@link Cache} to use for all access (can be NULL)
58 */
59 public Downloader(String UA, Cache cache) {
60 this.UA = UA;
61
62 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
64
65 setCache(cache);
66 }
67
68 /**
69 * The traces handler for this {@link Cache}.
70 *
71 * @return the traces handler
72 */
73 public TraceHandler getTraceHandler() {
74 return tracer;
75 }
76
77 /**
78 * The traces handler for this {@link Cache}.
79 *
80 * @param tracer
81 * the new traces handler
82 */
83 public void setTraceHandler(TraceHandler tracer) {
84 if (tracer == null) {
85 tracer = new TraceHandler(false, false, false);
86 }
87
88 this.tracer = tracer;
89 }
90
91 /**
92 * The {@link Cache} to use for all access (can be NULL).
93 *
94 * @return the cache
95 */
96 public Cache getCache() {
97 return cache;
98 }
99
100 /**
101 * The {@link Cache} to use for all access (can be NULL).
102 *
103 * @param cache
104 * the new cache
105 */
106 public void setCache(Cache cache) {
107 this.cache = cache;
108 }
109
110 /**
111 * Clear all the cookies currently in the jar.
112 * <p>
113 * As long as you don't, the cookies are kept.
114 */
115 public void clearCookies() {
116 cookies.getCookieStore().removeAll();
117 }
118
119 /**
120 * Open the given {@link URL} and update the cookies.
121 *
122 * @param url
123 * the {@link URL} to open
124 * @return the {@link InputStream} of the opened page
125 *
126 * @throws IOException
127 * in case of I/O error
128 **/
129 public InputStream open(URL url) throws IOException {
130 return open(url, false);
131 }
132
133 /**
134 * Open the given {@link URL} and update the cookies.
135 *
136 * @param url
137 * the {@link URL} to open
138 * @param stable
139 * stable a stable file (that doesn't change too often) --
140 * parameter used to check if the file is too old to keep or not
141 * in the cache (default is false)
142 *
143 * @return the {@link InputStream} of the opened page
144 *
145 * @throws IOException
146 * in case of I/O error
147 **/
148 public InputStream open(URL url, boolean stable) throws IOException {
149 return open(url, url, url, null, null, null, null, stable);
150 }
151
152 /**
153 * Open the given {@link URL} and update the cookies.
154 *
155 * @param url
156 * the {@link URL} to open
157 * @param currentReferer
158 * the current referer, for websites that needs this info
159 * @param cookiesValues
160 * the cookies
161 * @param postParams
162 * the POST parameters
163 * @param getParams
164 * the GET parameters (priority over POST)
165 * @param oauth
166 * OAuth authorization (aka, "bearer XXXXXXX")
167 *
168 * @return the {@link InputStream} of the opened page
169 *
170 * @throws IOException
171 * in case of I/O error
172 */
173 public InputStream open(URL url, URL currentReferer,
174 Map<String, String> cookiesValues, Map<String, String> postParams,
175 Map<String, String> getParams, String oauth) throws IOException {
176 return open(url, currentReferer, cookiesValues, postParams, getParams,
177 oauth, false);
178 }
179
180 /**
181 * Open the given {@link URL} and update the cookies.
182 *
183 * @param url
184 * the {@link URL} to open
185 * @param currentReferer
186 * the current referer, for websites that needs this info
187 * @param cookiesValues
188 * the cookies
189 * @param postParams
190 * the POST parameters
191 * @param getParams
192 * the GET parameters (priority over POST)
193 * @param oauth
194 * OAuth authorization (aka, "bearer XXXXXXX")
195 * @param stable
196 * stable a stable file (that doesn't change too often) --
197 * parameter used to check if the file is too old to keep or not
198 * in the cache (default is false)
199 *
200 * @return the {@link InputStream} of the opened page
201 *
202 * @throws IOException
203 * in case of I/O error
204 */
205 public InputStream open(URL url, URL currentReferer,
206 Map<String, String> cookiesValues, Map<String, String> postParams,
207 Map<String, String> getParams, String oauth, boolean stable)
208 throws IOException {
209 return open(url, url, currentReferer, cookiesValues, postParams,
210 getParams, oauth, stable);
211 }
212
213 /**
214 * Open the given {@link URL} and update the cookies.
215 *
216 * @param url
217 * the {@link URL} to open
218 * @param originalUrl
219 * the original {@link URL} before any redirection occurs, which
220 * is also used for the cache ID if needed (so we can retrieve
221 * the content with this URL if needed)
222 * @param currentReferer
223 * the current referer, for websites that needs this info
224 * @param cookiesValues
225 * the cookies
226 * @param postParams
227 * the POST parameters
228 * @param getParams
229 * the GET parameters (priority over POST)
230 * @param oauth
231 * OAuth authorisation (aka, "bearer XXXXXXX")
232 * @param stable
233 * a stable file (that doesn't change too often) -- parameter
234 * used to check if the file is too old to keep or not in the
235 * cache
236 *
237 * @return the {@link InputStream} of the opened page
238 *
239 * @throws IOException
240 * in case of I/O error
241 */
242 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
243 Map<String, String> cookiesValues, Map<String, String> postParams,
244 Map<String, String> getParams, String oauth, boolean stable)
245 throws IOException {
246
247 tracer.trace("Request: " + url);
248
249 if (cache != null) {
250 InputStream in = cache.load(originalUrl, false, stable);
251 if (in != null) {
252 tracer.trace("Use the cache: " + url);
253 tracer.trace("Original URL : " + originalUrl);
254 return in;
255 }
256 }
257
258 tracer.trace("Download: " + url);
259
260 URLConnection conn = openConnectionWithCookies(url, currentReferer,
261 cookiesValues);
262
263 // Priority: GET over POST
264 Map<String, String> params = getParams;
265 if (getParams == null) {
266 params = postParams;
267 }
268
269 StringBuilder requestData = null;
270 if ((params != null || oauth != null)
271 && conn instanceof HttpURLConnection) {
272 if (params != null) {
273 requestData = new StringBuilder();
274 for (Map.Entry<String, String> param : params.entrySet()) {
275 if (requestData.length() != 0)
276 requestData.append('&');
277 requestData.append(URLEncoder.encode(param.getKey(),
278 "UTF-8"));
279 requestData.append('=');
280 requestData.append(URLEncoder.encode(
281 String.valueOf(param.getValue()), "UTF-8"));
282 }
283
284 if (getParams == null && postParams != null) {
285 ((HttpURLConnection) conn).setRequestMethod("POST");
286 }
287
288 conn.setRequestProperty("Content-Type",
289 "application/x-www-form-urlencoded");
290 conn.setRequestProperty("Content-Length",
291 Integer.toString(requestData.length()));
292 }
293
294 if (oauth != null) {
295 conn.setRequestProperty("Authorization", oauth);
296 }
297
298 if (requestData != null) {
299 conn.setDoOutput(true);
300 OutputStreamWriter writer = new OutputStreamWriter(
301 conn.getOutputStream());
302 try {
303 writer.write(requestData.toString());
304 writer.flush();
305 } finally {
306 writer.close();
307 }
308 }
309 }
310
311 // Manual redirection, much better for POST data
312 if (conn instanceof HttpURLConnection) {
313 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
314 }
315
316 conn.connect();
317
318 // Check if redirect
319 // BEWARE! POST data cannot be redirected (some webservers complain) for
320 // HTTP codes 302 and 303
321 if (conn instanceof HttpURLConnection) {
322 int repCode = 0;
323 try {
324 // Can fail in some circumstances
325 repCode = ((HttpURLConnection) conn).getResponseCode();
326 } catch (IOException e) {
327 }
328
329 if (repCode / 100 == 3) {
330 String newUrl = conn.getHeaderField("Location");
331 return open(new URL(newUrl), originalUrl, currentReferer,
332 cookiesValues, //
333 (repCode == 302 || repCode == 303) ? null : postParams, //
334 getParams, oauth, stable);
335 }
336 }
337
338 try {
339 InputStream in = conn.getInputStream();
340 if ("gzip".equals(conn.getContentEncoding())) {
341 in = new GZIPInputStream(in);
342 }
343
344 if (in == null) {
345 throw new IOException("No InputStream!");
346 }
347
348 if (cache != null) {
349 String size = conn.getContentLengthLong() < 0 ? "unknown size"
350 : StringUtils.formatNumber(conn.getContentLengthLong())
351 + "bytes";
352 tracer.trace("Save to cache (" + size + "): " + originalUrl);
353 try {
354 try {
355 long bytes = cache.save(in, originalUrl);
356 tracer.trace("Saved to cache: "
357 + StringUtils.formatNumber(bytes) + "bytes");
358 } finally {
359 in.close();
360 }
361 in = cache.load(originalUrl, true, true);
362 } catch (IOException e) {
363 tracer.error(new IOException(
364 "Cannot save URL to cache, will ignore cache: "
365 + url, e));
366 }
367 }
368
369 return in;
370 } catch (IOException e) {
371 throw new IOException(String.format(
372 "Cannot find %s (current URL: %s)", originalUrl, url), e);
373 }
374 }
375
376 /**
377 * Open a connection on the given {@link URL}, and manage the cookies that
378 * come with it.
379 *
380 * @param url
381 * the {@link URL} to open
382 *
383 * @return the connection
384 *
385 * @throws IOException
386 * in case of I/O error
387 */
388 private URLConnection openConnectionWithCookies(URL url,
389 URL currentReferer, Map<String, String> cookiesValues)
390 throws IOException {
391 URLConnection conn = url.openConnection();
392
393 String cookies = generateCookies(cookiesValues);
394 if (cookies != null && !cookies.isEmpty()) {
395 conn.setRequestProperty("Cookie", cookies);
396 }
397
398 conn.setRequestProperty("User-Agent", UA);
399 conn.setRequestProperty("Accept-Encoding", "gzip");
400 conn.setRequestProperty("Accept", "*/*");
401 conn.setRequestProperty("Charset", "utf-8");
402
403 if (currentReferer != null) {
404 conn.setRequestProperty("Referer", currentReferer.toString());
405 conn.setRequestProperty("Host", currentReferer.getHost());
406 }
407
408 return conn;
409 }
410
411 /**
412 * Generate the cookie {@link String} from the local {@link CookieStore} so
413 * it is ready to be passed.
414 *
415 * @return the cookie
416 */
417 private String generateCookies(Map<String, String> cookiesValues) {
418 StringBuilder builder = new StringBuilder();
419 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
420 if (builder.length() > 0) {
421 builder.append(';');
422 }
423
424 builder.append(cookie.toString());
425 }
426
427 if (cookiesValues != null) {
428 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
429 if (builder.length() > 0) {
430 builder.append(';');
431 }
432 builder.append(set.getKey());
433 builder.append('=');
434 builder.append(set.getValue());
435 }
436 }
437
438 return builder.toString();
439 }
440 }