Downloader: allow switching the cache
[nikiroo-utils.git] / src / be / nikiroo / utils / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStreamWriter;
6 import java.net.CookieHandler;
7 import java.net.CookieManager;
8 import java.net.CookiePolicy;
9 import java.net.CookieStore;
10 import java.net.HttpCookie;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.net.URLEncoder;
15 import java.util.Map;
16 import java.util.zip.GZIPInputStream;
17
18 /**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28 public class Downloader {
29 private String UA;
30 private CookieManager cookies;
31 private TraceHandler tracer = new TraceHandler();
32 private Cache cache;
33
34 /**
35 * Create a new {@link Downloader}.
36 *
37 * @param UA
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
41 * only (!)
42 */
43 public Downloader(String UA) {
44 this(UA, null);
45 }
46
47 /**
48 * Create a new {@link Downloader}.
49 *
50 * @param UA
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
54 * only (!)
55 * @param cache
56 * the {@link Cache} to use for all access (can be NULL)
57 */
58 public Downloader(String UA, Cache cache) {
59 this.UA = UA;
60
61 cookies = new CookieManager();
62 cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
64
65 this.cache = cache;
66 }
67
68 /**
69 * The traces handler for this {@link Cache}.
70 *
71 * @return the traces handler
72 */
73 public TraceHandler getTraceHandler() {
74 return tracer;
75 }
76
77 /**
78 * The traces handler for this {@link Cache}.
79 *
80 * @param tracer
81 * the new traces handler
82 */
83 public void setTraceHandler(TraceHandler tracer) {
84 if (tracer == null) {
85 tracer = new TraceHandler(false, false, false);
86 }
87
88 this.tracer = tracer;
89 }
90
91 /**
92 * The {@link Cache} to use for all access (can be NULL).
93 *
94 * @return the cache
95 */
96 public Cache getCache() {
97 return cache;
98 }
99
100 /**
101 * The {@link Cache} to use for all access (can be NULL).
102 *
103 * @param cache
104 * the new cache
105 */
106 public void setCache(Cache cache) {
107 this.cache = cache;
108 }
109
110 /**
111 * Clear all the cookies currently in the jar.
112 * <p>
113 * As long as you don't, the cookies are kept.
114 */
115 public void clearCookies() {
116 cookies.getCookieStore().removeAll();
117 }
118
119 /**
120 * Open the given {@link URL} and update the cookies.
121 *
122 * @param url
123 * the {@link URL} to open
124 * @return the {@link InputStream} of the opened page
125 *
126 * @throws IOException
127 * in case of I/O error
128 **/
129 public InputStream open(URL url) throws IOException {
130 return open(url, false);
131 }
132
133 /**
134 * Open the given {@link URL} and update the cookies.
135 *
136 * @param url
137 * the {@link URL} to open
138 * @param stable
139 * stable a stable file (that doesn't change too often) --
140 * parameter used to check if the file is too old to keep or not
141 * in the cache (default is false)
142 *
143 * @return the {@link InputStream} of the opened page
144 *
145 * @throws IOException
146 * in case of I/O error
147 **/
148 public InputStream open(URL url, boolean stable) throws IOException {
149 return open(url, url, url, null, null, null, null, stable);
150 }
151
152 /**
153 * Open the given {@link URL} and update the cookies.
154 *
155 * @param url
156 * the {@link URL} to open
157 * @param currentReferer
158 * the current referer, for websites that needs this info
159 * @param cookiesValues
160 * the cookies
161 * @param postParams
162 * the POST parameters
163 * @param getParams
164 * the GET parameters (priority over POST)
165 * @param oauth
166 * OAuth authorization (aka, "bearer XXXXXXX")
167 *
168 * @return the {@link InputStream} of the opened page
169 *
170 * @throws IOException
171 * in case of I/O error
172 */
173 public InputStream open(URL url, URL currentReferer,
174 Map<String, String> cookiesValues, Map<String, String> postParams,
175 Map<String, String> getParams, String oauth) throws IOException {
176 return open(url, currentReferer, cookiesValues, postParams, getParams,
177 oauth, false);
178 }
179
180 /**
181 * Open the given {@link URL} and update the cookies.
182 *
183 * @param url
184 * the {@link URL} to open
185 * @param currentReferer
186 * the current referer, for websites that needs this info
187 * @param cookiesValues
188 * the cookies
189 * @param postParams
190 * the POST parameters
191 * @param getParams
192 * the GET parameters (priority over POST)
193 * @param oauth
194 * OAuth authorization (aka, "bearer XXXXXXX")
195 * @param stable
196 * stable a stable file (that doesn't change too often) --
197 * parameter used to check if the file is too old to keep or not
198 * in the cache (default is false)
199 *
200 * @return the {@link InputStream} of the opened page
201 *
202 * @throws IOException
203 * in case of I/O error
204 */
205 public InputStream open(URL url, URL currentReferer,
206 Map<String, String> cookiesValues, Map<String, String> postParams,
207 Map<String, String> getParams, String oauth, boolean stable)
208 throws IOException {
209 return open(url, url, currentReferer, cookiesValues, postParams,
210 getParams, oauth, stable);
211 }
212
213 /**
214 * Open the given {@link URL} and update the cookies.
215 *
216 * @param url
217 * the {@link URL} to open
218 * @param originalUrl
219 * the original {@link URL} before any redirection occurs
220 * @param postParams
221 * the POST parameters
222 * @param getParams
223 * the GET parameters (priority over POST)
224 * @param oauth
225 * OAuth authorisation (aka, "bearer XXXXXXX")
226 * @param stable
227 * a stable file (that doesn't change too often) -- parameter
228 * used to check if the file is too old to keep or not in the
229 * cache
230 *
231 * @return the {@link InputStream} of the opened page
232 *
233 * @throws IOException
234 * in case of I/O error
235 */
236 private InputStream open(URL url, final URL originalUrl,
237 URL currentReferer, Map<String, String> cookiesValues,
238 Map<String, String> postParams, Map<String, String> getParams,
239 String oauth, boolean stable) throws IOException {
240
241 tracer.trace("Request: " + url);
242
243 if (cache != null) {
244 InputStream in = cache.load(url, false, stable);
245 if (in != null) {
246 tracer.trace("Take from cache: " + url);
247 return in;
248 }
249 }
250
251 tracer.trace("Download: " + url);
252
253 URLConnection conn = openConnectionWithCookies(url, currentReferer,
254 cookiesValues);
255
256 // Priority: GET over POST
257 Map<String, String> params = getParams;
258 if (getParams == null) {
259 params = postParams;
260 }
261
262 if ((params != null || oauth != null)
263 && conn instanceof HttpURLConnection) {
264 StringBuilder requestData = null;
265 if (params != null) {
266 requestData = new StringBuilder();
267 for (Map.Entry<String, String> param : params.entrySet()) {
268 if (requestData.length() != 0)
269 requestData.append('&');
270 requestData.append(URLEncoder.encode(param.getKey(),
271 "UTF-8"));
272 requestData.append('=');
273 requestData.append(URLEncoder.encode(
274 String.valueOf(param.getValue()), "UTF-8"));
275 }
276
277 conn.setDoOutput(true);
278
279 if (getParams == null && postParams != null) {
280 ((HttpURLConnection) conn).setRequestMethod("POST");
281 }
282
283 conn.setRequestProperty("Content-Type",
284 "application/x-www-form-urlencoded");
285 conn.setRequestProperty("charset", "utf-8");
286 }
287
288 if (oauth != null) {
289 conn.setRequestProperty("Authorization", oauth);
290 }
291
292 if (requestData != null) {
293 OutputStreamWriter writer = null;
294 try {
295 writer = new OutputStreamWriter(conn.getOutputStream());
296 writer.write(requestData.toString());
297 writer.flush();
298 } finally {
299 if (writer != null) {
300 writer.close();
301 }
302 }
303 }
304 }
305
306 conn.connect();
307
308 // Check if redirect
309 if (conn instanceof HttpURLConnection) {
310 int repCode = 0;
311 try {
312 // Can fail in some circumstances
313 repCode = ((HttpURLConnection) conn).getResponseCode();
314 } catch (IOException e) {
315 }
316
317 if (repCode / 100 == 3) {
318 String newUrl = conn.getHeaderField("Location");
319 return open(new URL(newUrl), originalUrl, currentReferer,
320 cookiesValues, postParams, getParams, oauth, stable);
321 }
322 }
323
324 InputStream in = conn.getInputStream();
325 if ("gzip".equals(conn.getContentEncoding())) {
326 in = new GZIPInputStream(in);
327 }
328
329 if (in != null && cache != null) {
330 tracer.trace("Save to cache: " + url);
331 try {
332 cache.save(in, url);
333 } catch (IOException e) {
334 tracer.error(new IOException(
335 "Cannot save URL to cache, will ignore cache: " + url,
336 e));
337 }
338 }
339
340 return in;
341 }
342
343 /**
344 * Open a connection on the given {@link URL}, and manage the cookies that
345 * come with it.
346 *
347 * @param url
348 * the {@link URL} to open
349 *
350 * @return the connection
351 *
352 * @throws IOException
353 * in case of I/O error
354 */
355 private URLConnection openConnectionWithCookies(URL url,
356 URL currentReferer, Map<String, String> cookiesValues)
357 throws IOException {
358 URLConnection conn = url.openConnection();
359
360 conn.setRequestProperty("User-Agent", UA);
361 conn.setRequestProperty("Cookie", generateCookies(cookiesValues));
362 conn.setRequestProperty("Accept-Encoding", "gzip");
363 if (currentReferer != null) {
364 conn.setRequestProperty("Referer", currentReferer.toString());
365 conn.setRequestProperty("Host", currentReferer.getHost());
366 }
367
368 return conn;
369 }
370
371 /**
372 * Generate the cookie {@link String} from the local {@link CookieStore} so
373 * it is ready to be passed.
374 *
375 * @return the cookie
376 */
377 private String generateCookies(Map<String, String> cookiesValues) {
378 StringBuilder builder = new StringBuilder();
379 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
380 if (builder.length() > 0) {
381 builder.append(';');
382 }
383
384 // TODO: check if format is ok
385 builder.append(cookie.toString());
386 }
387
388 if (cookiesValues != null) {
389 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
390 if (builder.length() > 0) {
391 builder.append(';');
392 }
393 builder.append(set.getKey());
394 builder.append('=');
395 builder.append(set.getValue());
396 }
397 }
398
399 return builder.toString();
400 }
401 }