Commit | Line | Data |
---|---|---|
8816d2f7 NR |
1 | package be.nikiroo.utils; |
2 | ||
3 | import java.io.IOException; | |
4 | import java.io.InputStream; | |
5 | import java.io.OutputStreamWriter; | |
6 | import java.net.CookieHandler; | |
7 | import java.net.CookieManager; | |
8 | import java.net.CookiePolicy; | |
9 | import java.net.CookieStore; | |
10 | import java.net.HttpCookie; | |
11 | import java.net.HttpURLConnection; | |
12 | import java.net.URL; | |
13 | import java.net.URLConnection; | |
14 | import java.net.URLEncoder; | |
15 | import java.util.Map; | |
16 | import java.util.zip.GZIPInputStream; | |
17 | ||
18 | /** | |
19 | * This class will help you download content from Internet Sites ({@link URL} | |
20 | * based). | |
21 | * <p> | |
22 | * It allows you to control some options often required by web sites that do not | |
23 | * want to simply serve HTML, but actively make your life difficult with stupid | |
24 | * checks. | |
25 | * | |
26 | * @author niki | |
27 | */ | |
28 | public class Downloader { | |
29 | private String UA; | |
30 | private CookieManager cookies; | |
530d4062 | 31 | private TraceHandler tracer = new TraceHandler(); |
f6e8d60d | 32 | private Cache cache; |
8816d2f7 NR |
33 | |
/**
 * Build a {@link Downloader} that does not use any {@link Cache}.
 * <p>
 * This simply delegates to {@link #Downloader(String, Cache)} with a NULL
 * cache.
 *
 * @param UA
 *            the User-Agent sent with the requests -- note that some
 *            websites require one, some actively blacklist real UAs like
 *            the one from wget, and some whitelist a couple of browsers
 *            only (!)
 */
public Downloader(String UA) {
	this(UA, (Cache) null);
}
46 | ||
47 | /** | |
48 | * Create a new {@link Downloader}. | |
49 | * | |
50 | * @param UA | |
51 | * the User-Agent to use to download the resources -- note that | |
52 | * some websites require one, some actively blacklist real UAs | |
53 | * like the one from wget, some whitelist a couple of browsers | |
54 | * only (!) | |
55 | * @param cache | |
56 | * the {@link Cache} to use for all access (can be NULL) | |
57 | */ | |
58 | public Downloader(String UA, Cache cache) { | |
8816d2f7 NR |
59 | this.UA = UA; |
60 | ||
61 | cookies = new CookieManager(); | |
62 | cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL); | |
63 | CookieHandler.setDefault(cookies); | |
f6e8d60d NR |
64 | |
65 | this.cache = cache; | |
8816d2f7 NR |
66 | } |
67 | ||
530d4062 NR |
68 | /** |
69 | * The traces handler for this {@link Cache}. | |
70 | * | |
71 | * @return the traces handler | |
72 | */ | |
73 | public TraceHandler getTraceHandler() { | |
74 | return tracer; | |
75 | } | |
76 | ||
77 | /** | |
78 | * The traces handler for this {@link Cache}. | |
79 | * | |
80 | * @param tracer | |
81 | * the new traces handler | |
82 | */ | |
83 | public void setTraceHandler(TraceHandler tracer) { | |
80500544 NR |
84 | if (tracer == null) { |
85 | tracer = new TraceHandler(false, false, false); | |
86 | } | |
87 | ||
530d4062 NR |
88 | this.tracer = tracer; |
89 | } | |
90 | ||
3052163b NR |
91 | /** |
92 | * The {@link Cache} to use for all access (can be NULL). | |
93 | * | |
94 | * @return the cache | |
95 | */ | |
96 | public Cache getCache() { | |
97 | return cache; | |
98 | } | |
99 | ||
100 | /** | |
101 | * The {@link Cache} to use for all access (can be NULL). | |
102 | * | |
103 | * @param cache | |
104 | * the new cache | |
105 | */ | |
106 | public void setCache(Cache cache) { | |
107 | this.cache = cache; | |
108 | } | |
109 | ||
8816d2f7 NR |
110 | /** |
111 | * Clear all the cookies currently in the jar. | |
112 | * <p> | |
113 | * As long as you don't, the cookies are kept. | |
114 | */ | |
115 | public void clearCookies() { | |
116 | cookies.getCookieStore().removeAll(); | |
117 | } | |
118 | ||
119 | /** | |
120 | * Open the given {@link URL} and update the cookies. | |
121 | * | |
122 | * @param url | |
123 | * the {@link URL} to open | |
124 | * @return the {@link InputStream} of the opened page | |
125 | * | |
126 | * @throws IOException | |
127 | * in case of I/O error | |
128 | **/ | |
129 | public InputStream open(URL url) throws IOException { | |
f6e8d60d NR |
130 | return open(url, false); |
131 | } | |
132 | ||
133 | /** | |
134 | * Open the given {@link URL} and update the cookies. | |
135 | * | |
136 | * @param url | |
137 | * the {@link URL} to open | |
138 | * @param stable | |
139 | * stable a stable file (that doesn't change too often) -- | |
140 | * parameter used to check if the file is too old to keep or not | |
141 | * in the cache (default is false) | |
142 | * | |
143 | * @return the {@link InputStream} of the opened page | |
144 | * | |
145 | * @throws IOException | |
146 | * in case of I/O error | |
147 | **/ | |
148 | public InputStream open(URL url, boolean stable) throws IOException { | |
149 | return open(url, url, url, null, null, null, null, stable); | |
8816d2f7 NR |
150 | } |
151 | ||
152 | /** | |
153 | * Open the given {@link URL} and update the cookies. | |
154 | * | |
155 | * @param url | |
156 | * the {@link URL} to open | |
530d4062 NR |
157 | * @param currentReferer |
158 | * the current referer, for websites that needs this info | |
159 | * @param cookiesValues | |
160 | * the cookies | |
8816d2f7 NR |
161 | * @param postParams |
162 | * the POST parameters | |
163 | * @param getParams | |
164 | * the GET parameters (priority over POST) | |
165 | * @param oauth | |
166 | * OAuth authorization (aka, "bearer XXXXXXX") | |
167 | * | |
168 | * @return the {@link InputStream} of the opened page | |
169 | * | |
170 | * @throws IOException | |
171 | * in case of I/O error | |
172 | */ | |
173 | public InputStream open(URL url, URL currentReferer, | |
174 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
175 | Map<String, String> getParams, String oauth) throws IOException { | |
f6e8d60d NR |
176 | return open(url, currentReferer, cookiesValues, postParams, getParams, |
177 | oauth, false); | |
178 | } | |
179 | ||
180 | /** | |
181 | * Open the given {@link URL} and update the cookies. | |
182 | * | |
183 | * @param url | |
184 | * the {@link URL} to open | |
185 | * @param currentReferer | |
186 | * the current referer, for websites that needs this info | |
187 | * @param cookiesValues | |
188 | * the cookies | |
189 | * @param postParams | |
190 | * the POST parameters | |
191 | * @param getParams | |
192 | * the GET parameters (priority over POST) | |
193 | * @param oauth | |
194 | * OAuth authorization (aka, "bearer XXXXXXX") | |
195 | * @param stable | |
196 | * stable a stable file (that doesn't change too often) -- | |
197 | * parameter used to check if the file is too old to keep or not | |
198 | * in the cache (default is false) | |
199 | * | |
200 | * @return the {@link InputStream} of the opened page | |
201 | * | |
202 | * @throws IOException | |
203 | * in case of I/O error | |
204 | */ | |
205 | public InputStream open(URL url, URL currentReferer, | |
206 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
207 | Map<String, String> getParams, String oauth, boolean stable) | |
208 | throws IOException { | |
8816d2f7 | 209 | return open(url, url, currentReferer, cookiesValues, postParams, |
f6e8d60d | 210 | getParams, oauth, stable); |
8816d2f7 NR |
211 | } |
212 | ||
8816d2f7 NR |
213 | /** |
214 | * Open the given {@link URL} and update the cookies. | |
215 | * | |
216 | * @param url | |
217 | * the {@link URL} to open | |
218 | * @param originalUrl | |
ae7d1a83 NR |
219 | * the original {@link URL} before any redirection occurs, which |
220 | * is also used for the cache ID if needed (so we can retrieve | |
221 | * the content with this URL if needed) | |
222 | * @param currentReferer | |
223 | * the current referer, for websites that needs this info | |
224 | * @param cookiesValues | |
225 | * the cookies | |
8816d2f7 NR |
226 | * @param postParams |
227 | * the POST parameters | |
228 | * @param getParams | |
229 | * the GET parameters (priority over POST) | |
230 | * @param oauth | |
231 | * OAuth authorisation (aka, "bearer XXXXXXX") | |
f6e8d60d NR |
232 | * @param stable |
233 | * a stable file (that doesn't change too often) -- parameter | |
234 | * used to check if the file is too old to keep or not in the | |
235 | * cache | |
236 | * | |
8816d2f7 NR |
237 | * @return the {@link InputStream} of the opened page |
238 | * | |
239 | * @throws IOException | |
240 | * in case of I/O error | |
241 | */ | |
ae7d1a83 NR |
242 | public InputStream open(URL url, final URL originalUrl, URL currentReferer, |
243 | Map<String, String> cookiesValues, Map<String, String> postParams, | |
244 | Map<String, String> getParams, String oauth, boolean stable) | |
245 | throws IOException { | |
f6e8d60d NR |
246 | |
247 | tracer.trace("Request: " + url); | |
248 | ||
249 | if (cache != null) { | |
ae7d1a83 | 250 | InputStream in = cache.load(originalUrl, false, stable); |
f6e8d60d | 251 | if (in != null) { |
223aa0d4 NR |
252 | tracer.trace("Use the cache: " + url); |
253 | tracer.trace("Original URL : " + originalUrl); | |
f6e8d60d NR |
254 | return in; |
255 | } | |
256 | } | |
8816d2f7 | 257 | |
530d4062 | 258 | tracer.trace("Download: " + url); |
8816d2f7 NR |
259 | |
260 | URLConnection conn = openConnectionWithCookies(url, currentReferer, | |
261 | cookiesValues); | |
262 | ||
263 | // Priority: GET over POST | |
264 | Map<String, String> params = getParams; | |
265 | if (getParams == null) { | |
266 | params = postParams; | |
267 | } | |
268 | ||
269 | if ((params != null || oauth != null) | |
270 | && conn instanceof HttpURLConnection) { | |
271 | StringBuilder requestData = null; | |
272 | if (params != null) { | |
273 | requestData = new StringBuilder(); | |
274 | for (Map.Entry<String, String> param : params.entrySet()) { | |
275 | if (requestData.length() != 0) | |
276 | requestData.append('&'); | |
277 | requestData.append(URLEncoder.encode(param.getKey(), | |
278 | "UTF-8")); | |
279 | requestData.append('='); | |
280 | requestData.append(URLEncoder.encode( | |
281 | String.valueOf(param.getValue()), "UTF-8")); | |
282 | } | |
283 | ||
284 | conn.setDoOutput(true); | |
285 | ||
286 | if (getParams == null && postParams != null) { | |
287 | ((HttpURLConnection) conn).setRequestMethod("POST"); | |
288 | } | |
289 | ||
290 | conn.setRequestProperty("Content-Type", | |
291 | "application/x-www-form-urlencoded"); | |
292 | conn.setRequestProperty("charset", "utf-8"); | |
293 | } | |
294 | ||
295 | if (oauth != null) { | |
296 | conn.setRequestProperty("Authorization", oauth); | |
297 | } | |
298 | ||
299 | if (requestData != null) { | |
0988831f NR |
300 | OutputStreamWriter writer = null; |
301 | try { | |
302 | writer = new OutputStreamWriter(conn.getOutputStream()); | |
303 | writer.write(requestData.toString()); | |
304 | writer.flush(); | |
305 | } finally { | |
306 | if (writer != null) { | |
307 | writer.close(); | |
308 | } | |
309 | } | |
8816d2f7 NR |
310 | } |
311 | } | |
312 | ||
313 | conn.connect(); | |
314 | ||
315 | // Check if redirect | |
6149689f NR |
316 | if (conn instanceof HttpURLConnection) { |
317 | int repCode = 0; | |
318 | try { | |
319 | // Can fail in some circumstances | |
320 | repCode = ((HttpURLConnection) conn).getResponseCode(); | |
321 | } catch (IOException e) { | |
322 | } | |
323 | ||
324 | if (repCode / 100 == 3) { | |
325 | String newUrl = conn.getHeaderField("Location"); | |
326 | return open(new URL(newUrl), originalUrl, currentReferer, | |
f6e8d60d | 327 | cookiesValues, postParams, getParams, oauth, stable); |
6149689f | 328 | } |
8816d2f7 NR |
329 | } |
330 | ||
331 | InputStream in = conn.getInputStream(); | |
332 | if ("gzip".equals(conn.getContentEncoding())) { | |
333 | in = new GZIPInputStream(in); | |
334 | } | |
335 | ||
f6e8d60d | 336 | if (in != null && cache != null) { |
76c54eff | 337 | tracer.trace("Save to cache: " + originalUrl); |
f6e8d60d | 338 | try { |
eb6dcdbf NR |
339 | try { |
340 | cache.save(in, originalUrl); | |
341 | } finally { | |
342 | in.close(); | |
343 | } | |
76c54eff | 344 | in = cache.load(originalUrl, true, false); |
f6e8d60d NR |
345 | } catch (IOException e) { |
346 | tracer.error(new IOException( | |
347 | "Cannot save URL to cache, will ignore cache: " + url, | |
348 | e)); | |
349 | } | |
350 | } | |
351 | ||
8816d2f7 NR |
352 | return in; |
353 | } | |
354 | ||
355 | /** | |
356 | * Open a connection on the given {@link URL}, and manage the cookies that | |
357 | * come with it. | |
358 | * | |
359 | * @param url | |
360 | * the {@link URL} to open | |
361 | * | |
362 | * @return the connection | |
363 | * | |
364 | * @throws IOException | |
365 | * in case of I/O error | |
366 | */ | |
367 | private URLConnection openConnectionWithCookies(URL url, | |
368 | URL currentReferer, Map<String, String> cookiesValues) | |
369 | throws IOException { | |
370 | URLConnection conn = url.openConnection(); | |
371 | ||
372 | conn.setRequestProperty("User-Agent", UA); | |
373 | conn.setRequestProperty("Cookie", generateCookies(cookiesValues)); | |
374 | conn.setRequestProperty("Accept-Encoding", "gzip"); | |
375 | if (currentReferer != null) { | |
376 | conn.setRequestProperty("Referer", currentReferer.toString()); | |
377 | conn.setRequestProperty("Host", currentReferer.getHost()); | |
378 | } | |
379 | ||
380 | return conn; | |
381 | } | |
382 | ||
383 | /** | |
384 | * Generate the cookie {@link String} from the local {@link CookieStore} so | |
385 | * it is ready to be passed. | |
386 | * | |
387 | * @return the cookie | |
388 | */ | |
389 | private String generateCookies(Map<String, String> cookiesValues) { | |
390 | StringBuilder builder = new StringBuilder(); | |
391 | for (HttpCookie cookie : cookies.getCookieStore().getCookies()) { | |
392 | if (builder.length() > 0) { | |
393 | builder.append(';'); | |
394 | } | |
395 | ||
8816d2f7 NR |
396 | builder.append(cookie.toString()); |
397 | } | |
398 | ||
399 | if (cookiesValues != null) { | |
400 | for (Map.Entry<String, String> set : cookiesValues.entrySet()) { | |
401 | if (builder.length() > 0) { | |
402 | builder.append(';'); | |
403 | } | |
404 | builder.append(set.getKey()); | |
405 | builder.append('='); | |
406 | builder.append(set.getValue()); | |
407 | } | |
408 | } | |
409 | ||
410 | return builder.toString(); | |
411 | } | |
412 | } |