Improve Downloader and Cache:
[fanfix.git] / src / be / nikiroo / utils / Downloader.java
CommitLineData
8816d2f7
NR
1package be.nikiroo.utils;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.io.OutputStreamWriter;
6import java.net.CookieHandler;
7import java.net.CookieManager;
8import java.net.CookiePolicy;
9import java.net.CookieStore;
10import java.net.HttpCookie;
11import java.net.HttpURLConnection;
12import java.net.URL;
13import java.net.URLConnection;
14import java.net.URLEncoder;
15import java.util.Map;
16import java.util.zip.GZIPInputStream;
17
18/**
19 * This class will help you download content from Internet Sites ({@link URL}
20 * based).
21 * <p>
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
24 * checks.
25 *
26 * @author niki
27 */
28public class Downloader {
29 private String UA;
30 private CookieManager cookies;
530d4062 31 private TraceHandler tracer = new TraceHandler();
f6e8d60d 32 private Cache cache;
8816d2f7
NR
33
34 /**
35 * Create a new {@link Downloader}.
36 *
37 * @param UA
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
41 * only (!)
42 */
43 public Downloader(String UA) {
f6e8d60d
NR
44 this(UA, null);
45 }
46
47 /**
48 * Create a new {@link Downloader}.
49 *
50 * @param UA
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
54 * only (!)
55 * @param cache
56 * the {@link Cache} to use for all access (can be NULL)
57 */
58 public Downloader(String UA, Cache cache) {
8816d2f7
NR
59 this.UA = UA;
60
61 cookies = new CookieManager();
62 cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
63 CookieHandler.setDefault(cookies);
f6e8d60d
NR
64
65 this.cache = cache;
8816d2f7
NR
66 }
67
530d4062
NR
68 /**
69 * The traces handler for this {@link Cache}.
70 *
71 * @return the traces handler
72 */
73 public TraceHandler getTraceHandler() {
74 return tracer;
75 }
76
77 /**
78 * The traces handler for this {@link Cache}.
79 *
80 * @param tracer
81 * the new traces handler
82 */
83 public void setTraceHandler(TraceHandler tracer) {
80500544
NR
84 if (tracer == null) {
85 tracer = new TraceHandler(false, false, false);
86 }
87
530d4062
NR
88 this.tracer = tracer;
89 }
90
8816d2f7
NR
91 /**
92 * Clear all the cookies currently in the jar.
93 * <p>
94 * As long as you don't, the cookies are kept.
95 */
96 public void clearCookies() {
97 cookies.getCookieStore().removeAll();
98 }
99
100 /**
101 * Open the given {@link URL} and update the cookies.
102 *
103 * @param url
104 * the {@link URL} to open
105 * @return the {@link InputStream} of the opened page
106 *
107 * @throws IOException
108 * in case of I/O error
109 **/
110 public InputStream open(URL url) throws IOException {
f6e8d60d
NR
111 return open(url, false);
112 }
113
114 /**
115 * Open the given {@link URL} and update the cookies.
116 *
117 * @param url
118 * the {@link URL} to open
119 * @param stable
120 * stable a stable file (that doesn't change too often) --
121 * parameter used to check if the file is too old to keep or not
122 * in the cache (default is false)
123 *
124 * @return the {@link InputStream} of the opened page
125 *
126 * @throws IOException
127 * in case of I/O error
128 **/
129 public InputStream open(URL url, boolean stable) throws IOException {
130 return open(url, url, url, null, null, null, null, stable);
8816d2f7
NR
131 }
132
133 /**
134 * Open the given {@link URL} and update the cookies.
135 *
136 * @param url
137 * the {@link URL} to open
530d4062
NR
138 * @param currentReferer
139 * the current referer, for websites that needs this info
140 * @param cookiesValues
141 * the cookies
8816d2f7
NR
142 * @param postParams
143 * the POST parameters
144 * @param getParams
145 * the GET parameters (priority over POST)
146 * @param oauth
147 * OAuth authorization (aka, "bearer XXXXXXX")
148 *
149 * @return the {@link InputStream} of the opened page
150 *
151 * @throws IOException
152 * in case of I/O error
153 */
154 public InputStream open(URL url, URL currentReferer,
155 Map<String, String> cookiesValues, Map<String, String> postParams,
156 Map<String, String> getParams, String oauth) throws IOException {
f6e8d60d
NR
157 return open(url, currentReferer, cookiesValues, postParams, getParams,
158 oauth, false);
159 }
160
161 /**
162 * Open the given {@link URL} and update the cookies.
163 *
164 * @param url
165 * the {@link URL} to open
166 * @param currentReferer
167 * the current referer, for websites that needs this info
168 * @param cookiesValues
169 * the cookies
170 * @param postParams
171 * the POST parameters
172 * @param getParams
173 * the GET parameters (priority over POST)
174 * @param oauth
175 * OAuth authorization (aka, "bearer XXXXXXX")
176 * @param stable
177 * stable a stable file (that doesn't change too often) --
178 * parameter used to check if the file is too old to keep or not
179 * in the cache (default is false)
180 *
181 * @return the {@link InputStream} of the opened page
182 *
183 * @throws IOException
184 * in case of I/O error
185 */
186 public InputStream open(URL url, URL currentReferer,
187 Map<String, String> cookiesValues, Map<String, String> postParams,
188 Map<String, String> getParams, String oauth, boolean stable)
189 throws IOException {
8816d2f7 190 return open(url, url, currentReferer, cookiesValues, postParams,
f6e8d60d 191 getParams, oauth, stable);
8816d2f7
NR
192 }
193
8816d2f7
NR
194 /**
195 * Open the given {@link URL} and update the cookies.
196 *
197 * @param url
198 * the {@link URL} to open
199 * @param originalUrl
200 * the original {@link URL} before any redirection occurs
201 * @param postParams
202 * the POST parameters
203 * @param getParams
204 * the GET parameters (priority over POST)
205 * @param oauth
206 * OAuth authorisation (aka, "bearer XXXXXXX")
f6e8d60d
NR
207 * @param stable
208 * a stable file (that doesn't change too often) -- parameter
209 * used to check if the file is too old to keep or not in the
210 * cache
211 *
8816d2f7
NR
212 * @return the {@link InputStream} of the opened page
213 *
214 * @throws IOException
215 * in case of I/O error
216 */
217 private InputStream open(URL url, final URL originalUrl,
218 URL currentReferer, Map<String, String> cookiesValues,
219 Map<String, String> postParams, Map<String, String> getParams,
f6e8d60d
NR
220 String oauth, boolean stable) throws IOException {
221
222 tracer.trace("Request: " + url);
223
224 if (cache != null) {
225 InputStream in = cache.load(url, false, stable);
226 if (in != null) {
227 tracer.trace("Take from cache: " + url);
228 return in;
229 }
230 }
8816d2f7 231
530d4062 232 tracer.trace("Download: " + url);
8816d2f7
NR
233
234 URLConnection conn = openConnectionWithCookies(url, currentReferer,
235 cookiesValues);
236
237 // Priority: GET over POST
238 Map<String, String> params = getParams;
239 if (getParams == null) {
240 params = postParams;
241 }
242
243 if ((params != null || oauth != null)
244 && conn instanceof HttpURLConnection) {
245 StringBuilder requestData = null;
246 if (params != null) {
247 requestData = new StringBuilder();
248 for (Map.Entry<String, String> param : params.entrySet()) {
249 if (requestData.length() != 0)
250 requestData.append('&');
251 requestData.append(URLEncoder.encode(param.getKey(),
252 "UTF-8"));
253 requestData.append('=');
254 requestData.append(URLEncoder.encode(
255 String.valueOf(param.getValue()), "UTF-8"));
256 }
257
258 conn.setDoOutput(true);
259
260 if (getParams == null && postParams != null) {
261 ((HttpURLConnection) conn).setRequestMethod("POST");
262 }
263
264 conn.setRequestProperty("Content-Type",
265 "application/x-www-form-urlencoded");
266 conn.setRequestProperty("charset", "utf-8");
267 }
268
269 if (oauth != null) {
270 conn.setRequestProperty("Authorization", oauth);
271 }
272
273 if (requestData != null) {
0988831f
NR
274 OutputStreamWriter writer = null;
275 try {
276 writer = new OutputStreamWriter(conn.getOutputStream());
277 writer.write(requestData.toString());
278 writer.flush();
279 } finally {
280 if (writer != null) {
281 writer.close();
282 }
283 }
8816d2f7
NR
284 }
285 }
286
287 conn.connect();
288
289 // Check if redirect
6149689f
NR
290 if (conn instanceof HttpURLConnection) {
291 int repCode = 0;
292 try {
293 // Can fail in some circumstances
294 repCode = ((HttpURLConnection) conn).getResponseCode();
295 } catch (IOException e) {
296 }
297
298 if (repCode / 100 == 3) {
299 String newUrl = conn.getHeaderField("Location");
300 return open(new URL(newUrl), originalUrl, currentReferer,
f6e8d60d 301 cookiesValues, postParams, getParams, oauth, stable);
6149689f 302 }
8816d2f7
NR
303 }
304
305 InputStream in = conn.getInputStream();
306 if ("gzip".equals(conn.getContentEncoding())) {
307 in = new GZIPInputStream(in);
308 }
309
f6e8d60d
NR
310 if (in != null && cache != null) {
311 tracer.trace("Save to cache: " + url);
312 try {
313 cache.save(in, url);
314 } catch (IOException e) {
315 tracer.error(new IOException(
316 "Cannot save URL to cache, will ignore cache: " + url,
317 e));
318 }
319 }
320
8816d2f7
NR
321 return in;
322 }
323
324 /**
325 * Open a connection on the given {@link URL}, and manage the cookies that
326 * come with it.
327 *
328 * @param url
329 * the {@link URL} to open
330 *
331 * @return the connection
332 *
333 * @throws IOException
334 * in case of I/O error
335 */
336 private URLConnection openConnectionWithCookies(URL url,
337 URL currentReferer, Map<String, String> cookiesValues)
338 throws IOException {
339 URLConnection conn = url.openConnection();
340
341 conn.setRequestProperty("User-Agent", UA);
342 conn.setRequestProperty("Cookie", generateCookies(cookiesValues));
343 conn.setRequestProperty("Accept-Encoding", "gzip");
344 if (currentReferer != null) {
345 conn.setRequestProperty("Referer", currentReferer.toString());
346 conn.setRequestProperty("Host", currentReferer.getHost());
347 }
348
349 return conn;
350 }
351
352 /**
353 * Generate the cookie {@link String} from the local {@link CookieStore} so
354 * it is ready to be passed.
355 *
356 * @return the cookie
357 */
358 private String generateCookies(Map<String, String> cookiesValues) {
359 StringBuilder builder = new StringBuilder();
360 for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
361 if (builder.length() > 0) {
362 builder.append(';');
363 }
364
365 // TODO: check if format is ok
366 builder.append(cookie.toString());
367 }
368
369 if (cookiesValues != null) {
370 for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
371 if (builder.length() > 0) {
372 builder.append(';');
373 }
374 builder.append(set.getKey());
375 builder.append('=');
376 builder.append(set.getValue());
377 }
378 }
379
380 return builder.toString();
381 }
382}