fix downloader spent input
[nikiroo-utils.git] / src / be / nikiroo / utils / Downloader.java
CommitLineData
8816d2f7
NR
1package be.nikiroo.utils;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.io.OutputStreamWriter;
6import java.net.CookieHandler;
7import java.net.CookieManager;
8import java.net.CookiePolicy;
9import java.net.CookieStore;
10import java.net.HttpCookie;
11import java.net.HttpURLConnection;
12import java.net.URL;
13import java.net.URLConnection;
14import java.net.URLEncoder;
15import java.util.Map;
16import java.util.zip.GZIPInputStream;
17
18/**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28public class Downloader {
29 private String UA;
30 private CookieManager cookies;
530d4062 31 private TraceHandler tracer = new TraceHandler();
f6e8d60d 32 private Cache cache;
8816d2f7
NR
33
34 /**
35 * Create a new {@link Downloader}.
36 *
37 * @param UA
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
41 * only (!)
42 */
43 public Downloader(String UA) {
f6e8d60d
NR
44 this(UA, null);
45 }
46
47 /**
48 * Create a new {@link Downloader}.
49 *
50 * @param UA
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
54 * only (!)
55 * @param cache
56 * the {@link Cache} to use for all access (can be NULL)
57 */
58 public Downloader(String UA, Cache cache) {
8816d2f7
NR
59 this.UA = UA;
60
61 cookies = new CookieManager();
62 cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
f6e8d60d
NR
64
65 this.cache = cache;
8816d2f7
NR
66 }
67
530d4062
NR
68 /**
69 * The traces handler for this {@link Cache}.
70 *
71 * @return the traces handler
72 */
73 public TraceHandler getTraceHandler() {
74 return tracer;
75 }
76
77 /**
78 * The traces handler for this {@link Cache}.
79 *
80 * @param tracer
81 * the new traces handler
82 */
83 public void setTraceHandler(TraceHandler tracer) {
80500544
NR
84 if (tracer == null) {
85 tracer = new TraceHandler(false, false, false);
86 }
87
530d4062
NR
88 this.tracer = tracer;
89 }
90
3052163b
NR
91 /**
92 * The {@link Cache} to use for all access (can be NULL).
93 *
94 * @return the cache
95 */
96 public Cache getCache() {
97 return cache;
98 }
99
100 /**
101 * The {@link Cache} to use for all access (can be NULL).
102 *
103 * @param cache
104 * the new cache
105 */
106 public void setCache(Cache cache) {
107 this.cache = cache;
108 }
109
8816d2f7
NR
110 /**
111 * Clear all the cookies currently in the jar.
112 * <p>
113 * As long as you don't, the cookies are kept.
114 */
115 public void clearCookies() {
116 cookies.getCookieStore().removeAll();
117 }
118
119 /**
120 * Open the given {@link URL} and update the cookies.
121 *
122 * @param url
123 * the {@link URL} to open
124 * @return the {@link InputStream} of the opened page
125 *
126 * @throws IOException
127 * in case of I/O error
128 **/
129 public InputStream open(URL url) throws IOException {
f6e8d60d
NR
130 return open(url, false);
131 }
132
133 /**
134 * Open the given {@link URL} and update the cookies.
135 *
136 * @param url
137 * the {@link URL} to open
138 * @param stable
139 * stable a stable file (that doesn't change too often) --
140 * parameter used to check if the file is too old to keep or not
141 * in the cache (default is false)
142 *
143 * @return the {@link InputStream} of the opened page
144 *
145 * @throws IOException
146 * in case of I/O error
147 **/
148 public InputStream open(URL url, boolean stable) throws IOException {
149 return open(url, url, url, null, null, null, null, stable);
8816d2f7
NR
150 }
151
152 /**
153 * Open the given {@link URL} and update the cookies.
154 *
155 * @param url
156 * the {@link URL} to open
530d4062
NR
157 * @param currentReferer
158 * the current referer, for websites that needs this info
159 * @param cookiesValues
160 * the cookies
8816d2f7
NR
161 * @param postParams
162 * the POST parameters
163 * @param getParams
164 * the GET parameters (priority over POST)
165 * @param oauth
166 * OAuth authorization (aka, "bearer XXXXXXX")
167 *
168 * @return the {@link InputStream} of the opened page
169 *
170 * @throws IOException
171 * in case of I/O error
172 */
173 public InputStream open(URL url, URL currentReferer,
174 Map<String, String> cookiesValues, Map<String, String> postParams,
175 Map<String, String> getParams, String oauth) throws IOException {
f6e8d60d
NR
176 return open(url, currentReferer, cookiesValues, postParams, getParams,
177 oauth, false);
178 }
179
180 /**
181 * Open the given {@link URL} and update the cookies.
182 *
183 * @param url
184 * the {@link URL} to open
185 * @param currentReferer
186 * the current referer, for websites that needs this info
187 * @param cookiesValues
188 * the cookies
189 * @param postParams
190 * the POST parameters
191 * @param getParams
192 * the GET parameters (priority over POST)
193 * @param oauth
194 * OAuth authorization (aka, "bearer XXXXXXX")
195 * @param stable
196 * stable a stable file (that doesn't change too often) --
197 * parameter used to check if the file is too old to keep or not
198 * in the cache (default is false)
199 *
200 * @return the {@link InputStream} of the opened page
201 *
202 * @throws IOException
203 * in case of I/O error
204 */
205 public InputStream open(URL url, URL currentReferer,
206 Map<String, String> cookiesValues, Map<String, String> postParams,
207 Map<String, String> getParams, String oauth, boolean stable)
208 throws IOException {
8816d2f7 209 return open(url, url, currentReferer, cookiesValues, postParams,
f6e8d60d 210 getParams, oauth, stable);
8816d2f7
NR
211 }
212
8816d2f7
NR
213 /**
214 * Open the given {@link URL} and update the cookies.
215 *
216 * @param url
217 * the {@link URL} to open
218 * @param originalUrl
ae7d1a83
NR
219 * the original {@link URL} before any redirection occurs, which
220 * is also used for the cache ID if needed (so we can retrieve
221 * the content with this URL if needed)
222 * @param currentReferer
223 * the current referer, for websites that needs this info
224 * @param cookiesValues
225 * the cookies
8816d2f7
NR
226 * @param postParams
227 * the POST parameters
228 * @param getParams
229 * the GET parameters (priority over POST)
230 * @param oauth
231 * OAuth authorisation (aka, "bearer XXXXXXX")
f6e8d60d
NR
232 * @param stable
233 * a stable file (that doesn't change too often) -- parameter
234 * used to check if the file is too old to keep or not in the
235 * cache
236 *
8816d2f7
NR
237 * @return the {@link InputStream} of the opened page
238 *
239 * @throws IOException
240 * in case of I/O error
241 */
ae7d1a83
NR
242 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
243 Map<String, String> cookiesValues, Map<String, String> postParams,
244 Map<String, String> getParams, String oauth, boolean stable)
245 throws IOException {
f6e8d60d
NR
246
247 tracer.trace("Request: " + url);
248
249 if (cache != null) {
ae7d1a83 250 InputStream in = cache.load(originalUrl, false, stable);
f6e8d60d 251 if (in != null) {
223aa0d4
NR
252 tracer.trace("Use the cache: " + url);
253 tracer.trace("Original URL : " + originalUrl);
f6e8d60d
NR
254 return in;
255 }
256 }
8816d2f7 257
530d4062 258 tracer.trace("Download: " + url);
8816d2f7
NR
259
260 URLConnection conn = openConnectionWithCookies(url, currentReferer,
261 cookiesValues);
262
263 // Priority: GET over POST
264 Map<String, String> params = getParams;
265 if (getParams == null) {
266 params = postParams;
267 }
268
269 if ((params != null || oauth != null)
270 && conn instanceof HttpURLConnection) {
271 StringBuilder requestData = null;
272 if (params != null) {
273 requestData = new StringBuilder();
274 for (Map.Entry<String, String> param : params.entrySet()) {
275 if (requestData.length() != 0)
276 requestData.append('&');
277 requestData.append(URLEncoder.encode(param.getKey(),
278 "UTF-8"));
279 requestData.append('=');
280 requestData.append(URLEncoder.encode(
281 String.valueOf(param.getValue()), "UTF-8"));
282 }
283
284 conn.setDoOutput(true);
285
286 if (getParams == null && postParams != null) {
287 ((HttpURLConnection) conn).setRequestMethod("POST");
288 }
289
290 conn.setRequestProperty("Content-Type",
291 "application/x-www-form-urlencoded");
292 conn.setRequestProperty("charset", "utf-8");
293 }
294
295 if (oauth != null) {
296 conn.setRequestProperty("Authorization", oauth);
297 }
298
299 if (requestData != null) {
0988831f
NR
300 OutputStreamWriter writer = null;
301 try {
302 writer = new OutputStreamWriter(conn.getOutputStream());
303 writer.write(requestData.toString());
304 writer.flush();
305 } finally {
306 if (writer != null) {
307 writer.close();
308 }
309 }
8816d2f7
NR
310 }
311 }
312
313 conn.connect();
314
315 // Check if redirect
6149689f
NR
316 if (conn instanceof HttpURLConnection) {
317 int repCode = 0;
318 try {
319 // Can fail in some circumstances
320 repCode = ((HttpURLConnection) conn).getResponseCode();
321 } catch (IOException e) {
322 }
323
324 if (repCode / 100 == 3) {
325 String newUrl = conn.getHeaderField("Location");
326 return open(new URL(newUrl), originalUrl, currentReferer,
f6e8d60d 327 cookiesValues, postParams, getParams, oauth, stable);
6149689f 328 }
8816d2f7
NR
329 }
330
331 InputStream in = conn.getInputStream();
332 if ("gzip".equals(conn.getContentEncoding())) {
333 in = new GZIPInputStream(in);
334 }
335
f6e8d60d 336 if (in != null && cache != null) {
76c54eff 337 tracer.trace("Save to cache: " + originalUrl);
f6e8d60d 338 try {
76c54eff
NR
339 cache.save(in, originalUrl);
340 in.close();
341 in = cache.load(originalUrl, true, false);
f6e8d60d
NR
342 } catch (IOException e) {
343 tracer.error(new IOException(
344 "Cannot save URL to cache, will ignore cache: " + url,
345 e));
346 }
347 }
348
8816d2f7
NR
349 return in;
350 }
351
352 /**
353 * Open a connection on the given {@link URL}, and manage the cookies that
354 * come with it.
355 *
356 * @param url
357 * the {@link URL} to open
358 *
359 * @return the connection
360 *
361 * @throws IOException
362 * in case of I/O error
363 */
364 private URLConnection openConnectionWithCookies(URL url,
365 URL currentReferer, Map<String, String> cookiesValues)
366 throws IOException {
367 URLConnection conn = url.openConnection();
368
369 conn.setRequestProperty("User-Agent", UA);
370 conn.setRequestProperty("Cookie", generateCookies(cookiesValues));
371 conn.setRequestProperty("Accept-Encoding", "gzip");
372 if (currentReferer != null) {
373 conn.setRequestProperty("Referer", currentReferer.toString());
374 conn.setRequestProperty("Host", currentReferer.getHost());
375 }
376
377 return conn;
378 }
379
380 /**
381 * Generate the cookie {@link String} from the local {@link CookieStore} so
382 * it is ready to be passed.
383 *
384 * @return the cookie
385 */
386 private String generateCookies(Map<String, String> cookiesValues) {
387 StringBuilder builder = new StringBuilder();
388 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
389 if (builder.length() > 0) {
390 builder.append(';');
391 }
392
8816d2f7
NR
393 builder.append(cookie.toString());
394 }
395
396 if (cookiesValues != null) {
397 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
398 if (builder.length() > 0) {
399 builder.append(';');
400 }
401 builder.append(set.getKey());
402 builder.append('=');
403 builder.append(set.getValue());
404 }
405 }
406
407 return builder.toString();
408 }
409}