1 package be
.nikiroo
.utils
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
5 import java
.io
.OutputStreamWriter
;
6 import java
.net
.CookieHandler
;
7 import java
.net
.CookieManager
;
8 import java
.net
.CookiePolicy
;
9 import java
.net
.CookieStore
;
10 import java
.net
.HttpCookie
;
11 import java
.net
.HttpURLConnection
;
13 import java
.net
.URLConnection
;
14 import java
.net
.URLEncoder
;
16 import java
.util
.zip
.GZIPInputStream
;
19 * This class will help you download content from Internet Sites ({@link URL}
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively make your life difficult with stupid
28 public class Downloader
{
30 private CookieManager cookies
;
31 private TraceHandler tracer
= new TraceHandler();
33 private boolean offline
;
36 * Create a new {@link Downloader}.
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
42 * only (!) -- can be NULL
44 public Downloader(String UA
) {
49 * Create a new {@link Downloader}.
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
55 * only (!) -- can be NULL
57 * the {@link Cache} to use for all access (can be NULL)
59 public Downloader(String UA
, Cache cache
) {
62 cookies
= new CookieManager(null, CookiePolicy
.ACCEPT_ALL
);
63 CookieHandler
.setDefault(cookies
);
69 * This {@link Downloader} is forbidden to try and connect to the network.
71 * If TRUE, it will only check the cache if any.
75 * @return TRUE if offline
77 public boolean isOffline() {
82 * This {@link Downloader} is forbidden to try and connect to the network.
84 * If TRUE, it will only check the cache if any.
88 * @param offline TRUE for offline, FALSE for online
90 public void setOffline(boolean offline
) {
91 this.offline
= offline
;
95 * The traces handler for this {@link Downloader}.
97 * @return the traces handler
99 public TraceHandler
getTraceHandler() {
104 * The traces handler for this {@link Downloader}.
107 * the new traces handler
109 public void setTraceHandler(TraceHandler tracer
) {
110 if (tracer
== null) {
111 tracer
= new TraceHandler(false, false, false);
114 this.tracer
= tracer
;
118 * The {@link Cache} to use for all access (can be NULL).
122 public Cache
getCache() {
127 * The {@link Cache} to use for all access (can be NULL).
132 public void setCache(Cache cache
) {
137 * Clear all the cookies currently in the jar.
139 * As long as you don't, the cookies are kept.
141 public void clearCookies() {
142 cookies
.getCookieStore().removeAll();
146 * Open the given {@link URL} and update the cookies.
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
152 * @throws IOException
153 * in case of I/O error
155 public InputStream
open(URL url
) throws IOException
{
156 return open(url
, false);
160 * Open the given {@link URL} and update the cookies.
163 * the {@link URL} to open
165 * a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
169 * @return the {@link InputStream} of the opened page
171 * @throws IOException
172 * in case of I/O error
174 public InputStream
open(URL url
, boolean stable
) throws IOException
{
175 return open(url
, url
, url
, null, null, null, null, stable
);
179 * Open the given {@link URL} and update the cookies.
182 * the {@link URL} to open
183 * @param currentReferer
184 * the current referer, for websites that need this info
185 * @param cookiesValues
188 * the POST parameters
190 * the GET parameters (priority over POST)
192 * OAuth authorization (aka, "bearer XXXXXXX")
194 * @return the {@link InputStream} of the opened page
196 * @throws IOException
197 * in case of I/O error (including offline mode + not in cache)
199 public InputStream
open(URL url
, URL currentReferer
,
200 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
201 Map
<String
, String
> getParams
, String oauth
) throws IOException
{
202 return open(url
, currentReferer
, cookiesValues
, postParams
, getParams
,
207 * Open the given {@link URL} and update the cookies.
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that need this info
213 * @param cookiesValues
216 * the POST parameters
218 * the GET parameters (priority over POST)
220 * OAuth authorization (aka, "bearer XXXXXXX")
222 * a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
226 * @return the {@link InputStream} of the opened page
228 * @throws IOException
229 * in case of I/O error (including offline mode + not in cache)
231 public InputStream
open(URL url
, URL currentReferer
,
232 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
233 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
235 return open(url
, url
, currentReferer
, cookiesValues
, postParams
,
236 getParams
, oauth
, stable
);
240 * Open the given {@link URL} and update the cookies.
243 * the {@link URL} to open
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that need this info
250 * @param cookiesValues
253 * the POST parameters
255 * the GET parameters (priority over POST)
257 * OAuth authorisation (aka, "bearer XXXXXXX")
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
263 * @return the {@link InputStream} of the opened page
265 * @throws IOException
266 * in case of I/O error (including offline mode + not in cache)
268 public InputStream
open(URL url
, final URL originalUrl
, URL currentReferer
,
269 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
270 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
273 tracer
.trace("Request: " + url
);
276 InputStream in
= cache
.load(originalUrl
, false, stable
);
278 tracer
.trace("Use the cache: " + url
);
279 tracer
.trace("Original URL : " + originalUrl
);
284 String protocol
= originalUrl
== null ?
null : originalUrl
286 if (isOffline() && !"file".equalsIgnoreCase(protocol
)) {
287 tracer
.error("Downloader OFFLINE, cannot proceed to URL: " + url
);
288 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url
);
291 tracer
.trace("Download: " + url
);
293 URLConnection conn
= openConnectionWithCookies(url
, currentReferer
,
296 // Priority: GET over POST
297 Map
<String
, String
> params
= getParams
;
298 if (getParams
== null) {
302 StringBuilder requestData
= null;
303 if ((params
!= null || oauth
!= null)
304 && conn
instanceof HttpURLConnection
) {
305 if (params
!= null) {
306 requestData
= new StringBuilder();
307 for (Map
.Entry
<String
, String
> param
: params
.entrySet()) {
308 if (requestData
.length() != 0)
309 requestData
.append('&');
310 requestData
.append(URLEncoder
.encode(param
.getKey(),
312 requestData
.append('=');
313 requestData
.append(URLEncoder
.encode(
314 String
.valueOf(param
.getValue()), "UTF-8"));
317 if (getParams
== null && postParams
!= null) {
318 ((HttpURLConnection
) conn
).setRequestMethod("POST");
321 conn
.setRequestProperty("Content-Type",
322 "application/x-www-form-urlencoded");
323 conn
.setRequestProperty("Content-Length",
324 Integer
.toString(requestData
.length()));
328 conn
.setRequestProperty("Authorization", oauth
);
331 if (requestData
!= null) {
332 conn
.setDoOutput(true);
333 OutputStreamWriter writer
= new OutputStreamWriter(
334 conn
.getOutputStream());
336 writer
.write(requestData
.toString());
344 // Manual redirection, much better for POST data
345 if (conn
instanceof HttpURLConnection
) {
346 ((HttpURLConnection
) conn
).setInstanceFollowRedirects(false);
352 // BEWARE! POST data cannot be redirected (some webservers complain) for
353 // HTTP codes 302 and 303
354 if (conn
instanceof HttpURLConnection
) {
357 // Can fail in some circumstances
358 repCode
= ((HttpURLConnection
) conn
).getResponseCode();
359 } catch (IOException e
) {
362 if (repCode
/ 100 == 3) {
363 String newUrl
= conn
.getHeaderField("Location");
364 return open(new URL(newUrl
), originalUrl
, currentReferer
,
366 (repCode
== 302 || repCode
== 303) ?
null : postParams
, //
367 getParams
, oauth
, stable
);
372 InputStream in
= conn
.getInputStream();
373 if ("gzip".equals(conn
.getContentEncoding())) {
374 in
= new GZIPInputStream(in
);
378 throw new IOException("No InputStream!");
382 String size
= conn
.getContentLength() < 0 ?
"unknown size"
383 : StringUtils
.formatNumber(conn
.getContentLength())
385 tracer
.trace("Save to cache (" + size
+ "): " + originalUrl
);
388 long bytes
= cache
.save(in
, originalUrl
);
389 tracer
.trace("Saved to cache: "
390 + StringUtils
.formatNumber(bytes
) + "bytes");
394 in
= cache
.load(originalUrl
, true, true);
395 } catch (IOException e
) {
396 tracer
.error(new IOException(
397 "Cannot save URL to cache, will ignore cache: "
403 throw new IOException(
404 "Cannot retrieve the file after storing it in the cache (??)");
408 } catch (IOException e
) {
409 throw new IOException(String
.format(
410 "Cannot find %s (current URL: %s)", originalUrl
, url
), e
);
415 * Open a connection on the given {@link URL}, and manage the cookies that
419 * the {@link URL} to open
421 * @return the connection
423 * @throws IOException
424 * in case of I/O error
426 private URLConnection
openConnectionWithCookies(URL url
,
427 URL currentReferer
, Map
<String
, String
> cookiesValues
)
429 URLConnection conn
= url
.openConnection();
431 String cookies
= generateCookies(cookiesValues
);
432 if (cookies
!= null && !cookies
.isEmpty()) {
433 conn
.setRequestProperty("Cookie", cookies
);
437 conn
.setRequestProperty("User-Agent", UA
);
439 conn
.setRequestProperty("Accept-Encoding", "gzip");
440 conn
.setRequestProperty("Accept", "*/*");
441 conn
.setRequestProperty("Charset", "utf-8");
443 if (currentReferer
!= null) {
444 conn
.setRequestProperty("Referer", currentReferer
.toString());
445 conn
.setRequestProperty("Host", currentReferer
.getHost());
452 * Generate the cookie {@link String} from the local {@link CookieStore} so
453 * it is ready to be passed.
457 private String
generateCookies(Map
<String
, String
> cookiesValues
) {
458 StringBuilder builder
= new StringBuilder();
459 for (HttpCookie cookie
: cookies
.getCookieStore().getCookies()) {
460 if (builder
.length() > 0) {
464 builder
.append(cookie
.toString());
467 if (cookiesValues
!= null) {
468 for (Map
.Entry
<String
, String
> set
: cookiesValues
.entrySet()) {
469 if (builder
.length() > 0) {
472 builder
.append(set
.getKey());
474 builder
.append(set
.getValue());
478 return builder
.toString();