1 package be
.nikiroo
.utils
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
5 import java
.io
.OutputStreamWriter
;
6 import java
.net
.CookieHandler
;
7 import java
.net
.CookieManager
;
8 import java
.net
.CookiePolicy
;
9 import java
.net
.CookieStore
;
10 import java
.net
.HttpCookie
;
11 import java
.net
.HttpURLConnection
;
13 import java
.net
.URLConnection
;
14 import java
.net
.URLEncoder
;
16 import java
.util
.zip
.GZIPInputStream
;
19 * This class will help you download content from Internet Sites ({@link URL}
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
28 public class Downloader
{
30 private CookieManager cookies
;
31 private TraceHandler tracer
= new TraceHandler();
33 private boolean offline
;
36 * Create a new {@link Downloader}.
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
44 public Downloader(String UA
) {
49 * Create a new {@link Downloader}.
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
57 * the {@link Cache} to use for all access (can be NULL)
59 public Downloader(String UA
, Cache cache
) {
62 cookies
= new CookieManager(null, CookiePolicy
.ACCEPT_ALL
);
63 CookieHandler
.setDefault(cookies
);
69 * This {@link Downloader} is forbidden to try and connect to the network.
71 * If TRUE, it will only check the cache if any.
75 * @return TRUE if offline
77 public boolean isOffline() {
82 * This {@link Downloader} is forbidden to try and connect to the network.
84 * If TRUE, it will only check the cache if any.
88 * @param offline TRUE for offline, FALSE for online
90 public void setOffline(boolean offline
) {
91 this.offline
= offline
;
95 * The traces handler for this {@link Cache}.
97 * @return the traces handler
99 public TraceHandler
getTraceHandler() {
104 * The traces handler for this {@link Cache}.
107 * the new traces handler
109 public void setTraceHandler(TraceHandler tracer
) {
110 if (tracer
== null) {
111 tracer
= new TraceHandler(false, false, false);
114 this.tracer
= tracer
;
118 * The {@link Cache} to use for all access (can be NULL).
122 public Cache
getCache() {
127 * The {@link Cache} to use for all access (can be NULL).
132 public void setCache(Cache cache
) {
137 * Clear all the cookies currently in the jar.
139 * As long as you don't, the cookies are kept.
141 public void clearCookies() {
142 cookies
.getCookieStore().removeAll();
146 * Open the given {@link URL} and update the cookies.
149 * the {@link URL} to open
150 * @return the {@link InputStream} of the opened page
152 * @throws IOException
153 * in case of I/O error
155 public InputStream
open(URL url
) throws IOException
{
156 return open(url
, false);
160 * Open the given {@link URL} and update the cookies.
163 * the {@link URL} to open
165 * stable a stable file (that doesn't change too often) --
166 * parameter used to check if the file is too old to keep or not
167 * in the cache (default is false)
169 * @return the {@link InputStream} of the opened page
171 * @throws IOException
172 * in case of I/O error
174 public InputStream
open(URL url
, boolean stable
) throws IOException
{
175 return open(url
, url
, url
, null, null, null, null, stable
);
179 * Open the given {@link URL} and update the cookies.
182 * the {@link URL} to open
183 * @param currentReferer
184 * the current referer, for websites that needs this info
185 * @param cookiesValues
188 * the POST parameters
190 * the GET parameters (priority over POST)
192 * OAuth authorization (aka, "bearer XXXXXXX")
194 * @return the {@link InputStream} of the opened page
196 * @throws IOException
197 * in case of I/O error (including offline mode + not in cache)
199 public InputStream
open(URL url
, URL currentReferer
,
200 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
201 Map
<String
, String
> getParams
, String oauth
) throws IOException
{
202 return open(url
, currentReferer
, cookiesValues
, postParams
, getParams
,
207 * Open the given {@link URL} and update the cookies.
210 * the {@link URL} to open
211 * @param currentReferer
212 * the current referer, for websites that needs this info
213 * @param cookiesValues
216 * the POST parameters
218 * the GET parameters (priority over POST)
220 * OAuth authorization (aka, "bearer XXXXXXX")
222 * stable a stable file (that doesn't change too often) --
223 * parameter used to check if the file is too old to keep or not
224 * in the cache (default is false)
226 * @return the {@link InputStream} of the opened page
228 * @throws IOException
229 * in case of I/O error (including offline mode + not in cache)
231 public InputStream
open(URL url
, URL currentReferer
,
232 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
233 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
235 return open(url
, url
, currentReferer
, cookiesValues
, postParams
,
236 getParams
, oauth
, stable
);
240 * Open the given {@link URL} and update the cookies.
243 * the {@link URL} to open
245 * the original {@link URL} before any redirection occurs, which
246 * is also used for the cache ID if needed (so we can retrieve
247 * the content with this URL if needed)
248 * @param currentReferer
249 * the current referer, for websites that needs this info
250 * @param cookiesValues
253 * the POST parameters
255 * the GET parameters (priority over POST)
257 * OAuth authorisation (aka, "bearer XXXXXXX")
259 * a stable file (that doesn't change too often) -- parameter
260 * used to check if the file is too old to keep or not in the
263 * @return the {@link InputStream} of the opened page
265 * @throws IOException
266 * in case of I/O error (including offline mode + not in cache)
268 public InputStream
open(URL url
, final URL originalUrl
, URL currentReferer
,
269 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
270 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
273 tracer
.trace("Request: " + url
);
276 InputStream in
= cache
.load(originalUrl
, false, stable
);
278 tracer
.trace("Use the cache: " + url
);
279 tracer
.trace("Original URL : " + originalUrl
);
285 tracer
.error("Downloader OFFLINE, cannot proceed to URL: " + url
);
286 throw new IOException("Downloader is currently OFFLINE, cannot download: " + url
);
289 tracer
.trace("Download: " + url
);
291 URLConnection conn
= openConnectionWithCookies(url
, currentReferer
,
294 // Priority: GET over POST
295 Map
<String
, String
> params
= getParams
;
296 if (getParams
== null) {
300 StringBuilder requestData
= null;
301 if ((params
!= null || oauth
!= null)
302 && conn
instanceof HttpURLConnection
) {
303 if (params
!= null) {
304 requestData
= new StringBuilder();
305 for (Map
.Entry
<String
, String
> param
: params
.entrySet()) {
306 if (requestData
.length() != 0)
307 requestData
.append('&');
308 requestData
.append(URLEncoder
.encode(param
.getKey(),
310 requestData
.append('=');
311 requestData
.append(URLEncoder
.encode(
312 String
.valueOf(param
.getValue()), "UTF-8"));
315 if (getParams
== null && postParams
!= null) {
316 ((HttpURLConnection
) conn
).setRequestMethod("POST");
319 conn
.setRequestProperty("Content-Type",
320 "application/x-www-form-urlencoded");
321 conn
.setRequestProperty("Content-Length",
322 Integer
.toString(requestData
.length()));
326 conn
.setRequestProperty("Authorization", oauth
);
329 if (requestData
!= null) {
330 conn
.setDoOutput(true);
331 OutputStreamWriter writer
= new OutputStreamWriter(
332 conn
.getOutputStream());
334 writer
.write(requestData
.toString());
342 // Manual redirection, much better for POST data
343 if (conn
instanceof HttpURLConnection
) {
344 ((HttpURLConnection
) conn
).setInstanceFollowRedirects(false);
350 // BEWARE! POST data cannot be redirected (some webservers complain) for
351 // HTTP codes 302 and 303
352 if (conn
instanceof HttpURLConnection
) {
355 // Can fail in some circumstances
356 repCode
= ((HttpURLConnection
) conn
).getResponseCode();
357 } catch (IOException e
) {
360 if (repCode
/ 100 == 3) {
361 String newUrl
= conn
.getHeaderField("Location");
362 return open(new URL(newUrl
), originalUrl
, currentReferer
,
364 (repCode
== 302 || repCode
== 303) ?
null : postParams
, //
365 getParams
, oauth
, stable
);
370 InputStream in
= conn
.getInputStream();
371 if ("gzip".equals(conn
.getContentEncoding())) {
372 in
= new GZIPInputStream(in
);
376 throw new IOException("No InputStream!");
380 String size
= conn
.getContentLength() < 0 ?
"unknown size"
381 : StringUtils
.formatNumber(conn
.getContentLength())
383 tracer
.trace("Save to cache (" + size
+ "): " + originalUrl
);
386 long bytes
= cache
.save(in
, originalUrl
);
387 tracer
.trace("Saved to cache: "
388 + StringUtils
.formatNumber(bytes
) + "bytes");
392 in
= cache
.load(originalUrl
, true, true);
393 } catch (IOException e
) {
394 tracer
.error(new IOException(
395 "Cannot save URL to cache, will ignore cache: "
401 } catch (IOException e
) {
402 throw new IOException(String
.format(
403 "Cannot find %s (current URL: %s)", originalUrl
, url
), e
);
408 * Open a connection on the given {@link URL}, and manage the cookies that
412 * the {@link URL} to open
414 * @return the connection
416 * @throws IOException
417 * in case of I/O error
419 private URLConnection
openConnectionWithCookies(URL url
,
420 URL currentReferer
, Map
<String
, String
> cookiesValues
)
422 URLConnection conn
= url
.openConnection();
424 String cookies
= generateCookies(cookiesValues
);
425 if (cookies
!= null && !cookies
.isEmpty()) {
426 conn
.setRequestProperty("Cookie", cookies
);
429 conn
.setRequestProperty("User-Agent", UA
);
430 conn
.setRequestProperty("Accept-Encoding", "gzip");
431 conn
.setRequestProperty("Accept", "*/*");
432 conn
.setRequestProperty("Charset", "utf-8");
434 if (currentReferer
!= null) {
435 conn
.setRequestProperty("Referer", currentReferer
.toString());
436 conn
.setRequestProperty("Host", currentReferer
.getHost());
443 * Generate the cookie {@link String} from the local {@link CookieStore} so
444 * it is ready to be passed.
448 private String
generateCookies(Map
<String
, String
> cookiesValues
) {
449 StringBuilder builder
= new StringBuilder();
450 for (HttpCookie cookie
: cookies
.getCookieStore().getCookies()) {
451 if (builder
.length() > 0) {
455 builder
.append(cookie
.toString());
458 if (cookiesValues
!= null) {
459 for (Map
.Entry
<String
, String
> set
: cookiesValues
.entrySet()) {
460 if (builder
.length() > 0) {
463 builder
.append(set
.getKey());
465 builder
.append(set
.getValue());
469 return builder
.toString();