Downloader: fix POST redirect bug
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
1 package be.nikiroo.utils;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.OutputStreamWriter;
6 import java.net.CookieHandler;
7 import java.net.CookieManager;
8 import java.net.CookiePolicy;
9 import java.net.CookieStore;
10 import java.net.HttpCookie;
11 import java.net.HttpURLConnection;
12 import java.net.URL;
13 import java.net.URLConnection;
14 import java.net.URLEncoder;
15 import java.util.Map;
16 import java.util.zip.GZIPInputStream;
17
18 /**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28 public class Downloader {
29 private String UA;
30 private CookieManager cookies;
31 private TraceHandler tracer = new TraceHandler();
32 private Cache cache;
33
34 /**
35 * Create a new {@link Downloader}.
36 *
37 * @param UA
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
41 * only (!)
42 */
43 public Downloader(String UA) {
44 this(UA, null);
45 }
46
47 /**
48 * Create a new {@link Downloader}.
49 *
50 * @param UA
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
54 * only (!)
55 * @param cache
56 * the {@link Cache} to use for all access (can be NULL)
57 */
58 public Downloader(String UA, Cache cache) {
59 this.UA = UA;
60
61 cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL);
62 CookieHandler.setDefault(cookies);
63
64 setCache(cache);
65 }
66
67 /**
68 * The traces handler for this {@link Cache}.
69 *
70 * @return the traces handler
71 */
72 public TraceHandler getTraceHandler() {
73 return tracer;
74 }
75
76 /**
77 * The traces handler for this {@link Cache}.
78 *
79 * @param tracer
80 * the new traces handler
81 */
82 public void setTraceHandler(TraceHandler tracer) {
83 if (tracer == null) {
84 tracer = new TraceHandler(false, false, false);
85 }
86
87 this.tracer = tracer;
88 }
89
90 /**
91 * The {@link Cache} to use for all access (can be NULL).
92 *
93 * @return the cache
94 */
95 public Cache getCache() {
96 return cache;
97 }
98
99 /**
100 * The {@link Cache} to use for all access (can be NULL).
101 *
102 * @param cache
103 * the new cache
104 */
105 public void setCache(Cache cache) {
106 this.cache = cache;
107 }
108
109 /**
110 * Clear all the cookies currently in the jar.
111 * <p>
112 * As long as you don't, the cookies are kept.
113 */
114 public void clearCookies() {
115 cookies.getCookieStore().removeAll();
116 }
117
118 /**
119 * Open the given {@link URL} and update the cookies.
120 *
121 * @param url
122 * the {@link URL} to open
123 * @return the {@link InputStream} of the opened page
124 *
125 * @throws IOException
126 * in case of I/O error
127 **/
128 public InputStream open(URL url) throws IOException {
129 return open(url, false);
130 }
131
132 /**
133 * Open the given {@link URL} and update the cookies.
134 *
135 * @param url
136 * the {@link URL} to open
137 * @param stable
138 * stable a stable file (that doesn't change too often) --
139 * parameter used to check if the file is too old to keep or not
140 * in the cache (default is false)
141 *
142 * @return the {@link InputStream} of the opened page
143 *
144 * @throws IOException
145 * in case of I/O error
146 **/
147 public InputStream open(URL url, boolean stable) throws IOException {
148 return open(url, url, url, null, null, null, null, stable);
149 }
150
151 /**
152 * Open the given {@link URL} and update the cookies.
153 *
154 * @param url
155 * the {@link URL} to open
156 * @param currentReferer
157 * the current referer, for websites that needs this info
158 * @param cookiesValues
159 * the cookies
160 * @param postParams
161 * the POST parameters
162 * @param getParams
163 * the GET parameters (priority over POST)
164 * @param oauth
165 * OAuth authorization (aka, "bearer XXXXXXX")
166 *
167 * @return the {@link InputStream} of the opened page
168 *
169 * @throws IOException
170 * in case of I/O error
171 */
172 public InputStream open(URL url, URL currentReferer,
173 Map<String, String> cookiesValues, Map<String, String> postParams,
174 Map<String, String> getParams, String oauth) throws IOException {
175 return open(url, currentReferer, cookiesValues, postParams, getParams,
176 oauth, false);
177 }
178
179 /**
180 * Open the given {@link URL} and update the cookies.
181 *
182 * @param url
183 * the {@link URL} to open
184 * @param currentReferer
185 * the current referer, for websites that needs this info
186 * @param cookiesValues
187 * the cookies
188 * @param postParams
189 * the POST parameters
190 * @param getParams
191 * the GET parameters (priority over POST)
192 * @param oauth
193 * OAuth authorization (aka, "bearer XXXXXXX")
194 * @param stable
195 * stable a stable file (that doesn't change too often) --
196 * parameter used to check if the file is too old to keep or not
197 * in the cache (default is false)
198 *
199 * @return the {@link InputStream} of the opened page
200 *
201 * @throws IOException
202 * in case of I/O error
203 */
204 public InputStream open(URL url, URL currentReferer,
205 Map<String, String> cookiesValues, Map<String, String> postParams,
206 Map<String, String> getParams, String oauth, boolean stable)
207 throws IOException {
208 return open(url, url, currentReferer, cookiesValues, postParams,
209 getParams, oauth, stable);
210 }
211
212 /**
213 * Open the given {@link URL} and update the cookies.
214 *
215 * @param url
216 * the {@link URL} to open
217 * @param originalUrl
218 * the original {@link URL} before any redirection occurs, which
219 * is also used for the cache ID if needed (so we can retrieve
220 * the content with this URL if needed)
221 * @param currentReferer
222 * the current referer, for websites that needs this info
223 * @param cookiesValues
224 * the cookies
225 * @param postParams
226 * the POST parameters
227 * @param getParams
228 * the GET parameters (priority over POST)
229 * @param oauth
230 * OAuth authorisation (aka, "bearer XXXXXXX")
231 * @param stable
232 * a stable file (that doesn't change too often) -- parameter
233 * used to check if the file is too old to keep or not in the
234 * cache
235 *
236 * @return the {@link InputStream} of the opened page
237 *
238 * @throws IOException
239 * in case of I/O error
240 */
241 public InputStream open(URL url, final URL originalUrl, URL currentReferer,
242 Map<String, String> cookiesValues, Map<String, String> postParams,
243 Map<String, String> getParams, String oauth, boolean stable)
244 throws IOException {
245
246 tracer.trace("Request: " + url);
247
248 if (cache != null) {
249 InputStream in = cache.load(originalUrl, false, stable);
250 if (in != null) {
251 tracer.trace("Use the cache: " + url);
252 tracer.trace("Original URL : " + originalUrl);
253 return in;
254 }
255 }
256
257 tracer.trace("Download: " + url);
258
259 URLConnection conn = openConnectionWithCookies(url, currentReferer,
260 cookiesValues);
261
262 // Priority: GET over POST
263 Map<String, String> params = getParams;
264 if (getParams == null) {
265 params = postParams;
266 }
267
268 StringBuilder requestData = null;
269 if ((params != null || oauth != null)
270 && conn instanceof HttpURLConnection) {
271 if (params != null) {
272 requestData = new StringBuilder();
273 for (Map.Entry<String, String> param : params.entrySet()) {
274 if (requestData.length() != 0)
275 requestData.append('&');
276 requestData.append(URLEncoder.encode(param.getKey(),
277 "UTF-8"));
278 requestData.append('=');
279 requestData.append(URLEncoder.encode(
280 String.valueOf(param.getValue()), "UTF-8"));
281 }
282
283 if (getParams == null && postParams != null) {
284 ((HttpURLConnection) conn).setRequestMethod("POST");
285 }
286
287 conn.setRequestProperty("Content-Type",
288 "application/x-www-form-urlencoded");
289 conn.setRequestProperty("Content-Length",
290 Integer.toString(requestData.length()));
291 conn.setRequestProperty("charset", "utf-8");
292 }
293
294 if (oauth != null) {
295 conn.setRequestProperty("Authorization", oauth);
296 }
297
298 if (requestData != null) {
299 conn.setDoOutput(true);
300 OutputStreamWriter writer = new OutputStreamWriter(
301 conn.getOutputStream());
302 try {
303 writer.write(requestData.toString());
304 writer.flush();
305 } finally {
306 writer.close();
307 }
308 }
309 }
310
311 // Manual redirection, much better for POST data
312 if (conn instanceof HttpURLConnection) {
313 ((HttpURLConnection) conn).setInstanceFollowRedirects(false);
314 }
315
316 conn.connect();
317
318 // Check if redirect
319 // BEWARE! POST data cannot be redirected, so it is ignored here
320 if (conn instanceof HttpURLConnection) {
321 int repCode = 0;
322 try {
323 // Can fail in some circumstances
324 repCode = ((HttpURLConnection) conn).getResponseCode();
325 } catch (IOException e) {
326 }
327
328 if (repCode / 100 == 3) {
329 String newUrl = conn.getHeaderField("Location");
330 return open(new URL(newUrl), originalUrl, currentReferer,
331 cookiesValues, null, getParams, oauth, stable);
332 }
333 }
334
335 InputStream in = conn.getInputStream();
336 if ("gzip".equals(conn.getContentEncoding())) {
337 in = new GZIPInputStream(in);
338 }
339
340 if (in != null && cache != null) {
341 tracer.trace("Save to cache: " + originalUrl);
342 try {
343 try {
344 cache.save(in, originalUrl);
345 } finally {
346 in.close();
347 }
348 in = cache.load(originalUrl, true, false);
349 } catch (IOException e) {
350 tracer.error(new IOException(
351 "Cannot save URL to cache, will ignore cache: " + url,
352 e));
353 }
354 }
355
356 return in;
357 }
358
359 /**
360 * Open a connection on the given {@link URL}, and manage the cookies that
361 * come with it.
362 *
363 * @param url
364 * the {@link URL} to open
365 *
366 * @return the connection
367 *
368 * @throws IOException
369 * in case of I/O error
370 */
371 private URLConnection openConnectionWithCookies(URL url,
372 URL currentReferer, Map<String, String> cookiesValues)
373 throws IOException {
374 URLConnection conn = url.openConnection();
375
376 String cookies = generateCookies(cookiesValues);
377 if (cookies != null && !cookies.isEmpty()) {
378 conn.setRequestProperty("Cookie", cookies);
379 }
380
381 conn.setRequestProperty("User-Agent", UA);
382 conn.setRequestProperty("Accept-Encoding", "gzip");
383 conn.setRequestProperty("Accept", "*/*");
384
385 if (currentReferer != null) {
386 conn.setRequestProperty("Referer", currentReferer.toString());
387 conn.setRequestProperty("Host", currentReferer.getHost());
388 }
389
390 return conn;
391 }
392
393 /**
394 * Generate the cookie {@link String} from the local {@link CookieStore} so
395 * it is ready to be passed.
396 *
397 * @return the cookie
398 */
399 private String generateCookies(Map<String, String> cookiesValues) {
400 StringBuilder builder = new StringBuilder();
401 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
402 if (builder.length() > 0) {
403 builder.append(';');
404 }
405
406 builder.append(cookie.toString());
407 }
408
409 if (cookiesValues != null) {
410 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
411 if (builder.length() > 0) {
412 builder.append(';');
413 }
414 builder.append(set.getKey());
415 builder.append('=');
416 builder.append(set.getValue());
417 }
418 }
419
420 return builder.toString();
421 }
422 }