1 package be
.nikiroo
.utils
;
3 import java
.io
.FileNotFoundException
;
4 import java
.io
.IOException
;
5 import java
.io
.InputStream
;
6 import java
.io
.OutputStreamWriter
;
7 import java
.net
.CookieHandler
;
8 import java
.net
.CookieManager
;
9 import java
.net
.CookiePolicy
;
10 import java
.net
.CookieStore
;
11 import java
.net
.HttpCookie
;
12 import java
.net
.HttpURLConnection
;
14 import java
.net
.URLConnection
;
15 import java
.net
.URLEncoder
;
17 import java
.util
.zip
.GZIPInputStream
;
20 * This class will help you download content from Internet Sites ({@link URL}
23 * It allows you to control some options often required on web sites that do not
24 * want to simply serve HTML, but actively makes your life difficult with stupid
29 public class Downloader
{
31 private CookieManager cookies
;
32 private TraceHandler tracer
= new TraceHandler();
36 * Create a new {@link Downloader}.
39 * the User-Agent to use to download the resources -- note that
40 * some websites require one, some actively blacklist real UAs
41 * like the one from wget, some whitelist a couple of browsers
44 public Downloader(String UA
) {
49 * Create a new {@link Downloader}.
52 * the User-Agent to use to download the resources -- note that
53 * some websites require one, some actively blacklist real UAs
54 * like the one from wget, some whitelist a couple of browsers
57 * the {@link Cache} to use for all access (can be NULL)
59 public Downloader(String UA
, Cache cache
) {
62 cookies
= new CookieManager(null, CookiePolicy
.ACCEPT_ALL
);
63 CookieHandler
.setDefault(cookies
);
69 * The traces handler for this {@link Cache}.
71 * @return the traces handler
73 public TraceHandler
getTraceHandler() {
78 * The traces handler for this {@link Cache}.
81 * the new traces handler
83 public void setTraceHandler(TraceHandler tracer
) {
85 tracer
= new TraceHandler(false, false, false);
92 * The {@link Cache} to use for all access (can be NULL).
96 public Cache
getCache() {
101 * The {@link Cache} to use for all access (can be NULL).
106 public void setCache(Cache cache
) {
111 * Clear all the cookies currently in the jar.
113 * As long as you don't, the cookies are kept.
115 public void clearCookies() {
116 cookies
.getCookieStore().removeAll();
120 * Open the given {@link URL} and update the cookies.
123 * the {@link URL} to open
124 * @return the {@link InputStream} of the opened page
126 * @throws IOException
127 * in case of I/O error
129 public InputStream
open(URL url
) throws IOException
{
130 return open(url
, false);
134 * Open the given {@link URL} and update the cookies.
137 * the {@link URL} to open
139 * stable a stable file (that doesn't change too often) --
140 * parameter used to check if the file is too old to keep or not
141 * in the cache (default is false)
143 * @return the {@link InputStream} of the opened page
145 * @throws IOException
146 * in case of I/O error
148 public InputStream
open(URL url
, boolean stable
) throws IOException
{
149 return open(url
, url
, url
, null, null, null, null, stable
);
153 * Open the given {@link URL} and update the cookies.
156 * the {@link URL} to open
157 * @param currentReferer
158 * the current referer, for websites that needs this info
159 * @param cookiesValues
162 * the POST parameters
164 * the GET parameters (priority over POST)
166 * OAuth authorization (aka, "bearer XXXXXXX")
168 * @return the {@link InputStream} of the opened page
170 * @throws IOException
171 * in case of I/O error
173 public InputStream
open(URL url
, URL currentReferer
,
174 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
175 Map
<String
, String
> getParams
, String oauth
) throws IOException
{
176 return open(url
, currentReferer
, cookiesValues
, postParams
, getParams
,
181 * Open the given {@link URL} and update the cookies.
184 * the {@link URL} to open
185 * @param currentReferer
186 * the current referer, for websites that needs this info
187 * @param cookiesValues
190 * the POST parameters
192 * the GET parameters (priority over POST)
194 * OAuth authorization (aka, "bearer XXXXXXX")
196 * stable a stable file (that doesn't change too often) --
197 * parameter used to check if the file is too old to keep or not
198 * in the cache (default is false)
200 * @return the {@link InputStream} of the opened page
202 * @throws IOException
203 * in case of I/O error
205 public InputStream
open(URL url
, URL currentReferer
,
206 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
207 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
209 return open(url
, url
, currentReferer
, cookiesValues
, postParams
,
210 getParams
, oauth
, stable
);
214 * Open the given {@link URL} and update the cookies.
217 * the {@link URL} to open
219 * the original {@link URL} before any redirection occurs, which
220 * is also used for the cache ID if needed (so we can retrieve
221 * the content with this URL if needed)
222 * @param currentReferer
223 * the current referer, for websites that needs this info
224 * @param cookiesValues
227 * the POST parameters
229 * the GET parameters (priority over POST)
231 * OAuth authorisation (aka, "bearer XXXXXXX")
233 * a stable file (that doesn't change too often) -- parameter
234 * used to check if the file is too old to keep or not in the
237 * @return the {@link InputStream} of the opened page
239 * @throws IOException
240 * in case of I/O error
242 public InputStream
open(URL url
, final URL originalUrl
, URL currentReferer
,
243 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
244 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
247 tracer
.trace("Request: " + url
);
250 InputStream in
= cache
.load(originalUrl
, false, stable
);
252 tracer
.trace("Use the cache: " + url
);
253 tracer
.trace("Original URL : " + originalUrl
);
258 tracer
.trace("Download: " + url
);
260 URLConnection conn
= openConnectionWithCookies(url
, currentReferer
,
263 // Priority: GET over POST
264 Map
<String
, String
> params
= getParams
;
265 if (getParams
== null) {
269 StringBuilder requestData
= null;
270 if ((params
!= null || oauth
!= null)
271 && conn
instanceof HttpURLConnection
) {
272 if (params
!= null) {
273 requestData
= new StringBuilder();
274 for (Map
.Entry
<String
, String
> param
: params
.entrySet()) {
275 if (requestData
.length() != 0)
276 requestData
.append('&');
277 requestData
.append(URLEncoder
.encode(param
.getKey(),
279 requestData
.append('=');
280 requestData
.append(URLEncoder
.encode(
281 String
.valueOf(param
.getValue()), "UTF-8"));
284 if (getParams
== null && postParams
!= null) {
285 ((HttpURLConnection
) conn
).setRequestMethod("POST");
288 conn
.setRequestProperty("Content-Type",
289 "application/x-www-form-urlencoded");
290 conn
.setRequestProperty("Content-Length",
291 Integer
.toString(requestData
.length()));
295 conn
.setRequestProperty("Authorization", oauth
);
298 if (requestData
!= null) {
299 conn
.setDoOutput(true);
300 OutputStreamWriter writer
= new OutputStreamWriter(
301 conn
.getOutputStream());
303 writer
.write(requestData
.toString());
311 // Manual redirection, much better for POST data
312 if (conn
instanceof HttpURLConnection
) {
313 ((HttpURLConnection
) conn
).setInstanceFollowRedirects(false);
319 // BEWARE! POST data cannot be redirected (some webservers complain) for
320 // HTTP codes 302 and 303
321 if (conn
instanceof HttpURLConnection
) {
324 // Can fail in some circumstances
325 repCode
= ((HttpURLConnection
) conn
).getResponseCode();
326 } catch (IOException e
) {
329 if (repCode
/ 100 == 3) {
330 String newUrl
= conn
.getHeaderField("Location");
331 return open(new URL(newUrl
), originalUrl
, currentReferer
,
333 (repCode
== 302 || repCode
== 303) ?
null : postParams
, //
334 getParams
, oauth
, stable
);
339 InputStream in
= conn
.getInputStream();
340 if ("gzip".equals(conn
.getContentEncoding())) {
341 in
= new GZIPInputStream(in
);
345 throw new IOException("No InputStream!");
349 String size
= conn
.getContentLengthLong() < 0 ?
"unknown size"
350 : StringUtils
.formatNumber(conn
.getContentLengthLong())
352 tracer
.trace("Save to cache (" + size
+ "): " + originalUrl
);
355 long bytes
= cache
.save(in
, originalUrl
);
356 tracer
.trace("Saved to cache: "
357 + StringUtils
.formatNumber(bytes
) + "bytes");
361 in
= cache
.load(originalUrl
, true, true);
362 } catch (IOException e
) {
363 tracer
.error(new IOException(
364 "Cannot save URL to cache, will ignore cache: "
370 } catch (IOException e
) {
371 throw new IOException(String
.format(
372 "Cannot find %s (current URL: %s)", originalUrl
, url
), e
);
377 * Open a connection on the given {@link URL}, and manage the cookies that
381 * the {@link URL} to open
383 * @return the connection
385 * @throws IOException
386 * in case of I/O error
388 private URLConnection
openConnectionWithCookies(URL url
,
389 URL currentReferer
, Map
<String
, String
> cookiesValues
)
391 URLConnection conn
= url
.openConnection();
393 String cookies
= generateCookies(cookiesValues
);
394 if (cookies
!= null && !cookies
.isEmpty()) {
395 conn
.setRequestProperty("Cookie", cookies
);
398 conn
.setRequestProperty("User-Agent", UA
);
399 conn
.setRequestProperty("Accept-Encoding", "gzip");
400 conn
.setRequestProperty("Accept", "*/*");
401 conn
.setRequestProperty("Charset", "utf-8");
403 if (currentReferer
!= null) {
404 conn
.setRequestProperty("Referer", currentReferer
.toString());
405 conn
.setRequestProperty("Host", currentReferer
.getHost());
412 * Generate the cookie {@link String} from the local {@link CookieStore} so
413 * it is ready to be passed.
417 private String
generateCookies(Map
<String
, String
> cookiesValues
) {
418 StringBuilder builder
= new StringBuilder();
419 for (HttpCookie cookie
: cookies
.getCookieStore().getCookies()) {
420 if (builder
.length() > 0) {
424 builder
.append(cookie
.toString());
427 if (cookiesValues
!= null) {
428 for (Map
.Entry
<String
, String
> set
: cookiesValues
.entrySet()) {
429 if (builder
.length() > 0) {
432 builder
.append(set
.getKey());
434 builder
.append(set
.getValue());
438 return builder
.toString();