1 package be
.nikiroo
.utils
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
5 import java
.io
.OutputStreamWriter
;
6 import java
.net
.CookieHandler
;
7 import java
.net
.CookieManager
;
8 import java
.net
.CookiePolicy
;
9 import java
.net
.CookieStore
;
10 import java
.net
.HttpCookie
;
11 import java
.net
.HttpURLConnection
;
13 import java
.net
.URLConnection
;
14 import java
.net
.URLEncoder
;
16 import java
.util
.zip
.GZIPInputStream
;
19 * This class will help you download content from Internet Sites ({@link URL}
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
28 public class Downloader
{
/**
 * Cookie jar of this {@link Downloader}; the constructor creates it with
 * {@link CookiePolicy#ACCEPT_ALL} and registers it through
 * {@link CookieHandler#setDefault(CookieHandler)}.
 */
private CookieManager cookies;
/** Receives the trace and error messages emitted by this class. */
private TraceHandler tracer = new TraceHandler();
35 * Create a new {@link Downloader}.
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
43 public Downloader(String UA
) {
48 * Create a new {@link Downloader}.
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
56 * the {@link Cache} to use for all access (can be NULL)
58 public Downloader(String UA
, Cache cache
) {
61 cookies
= new CookieManager(null, CookiePolicy
.ACCEPT_ALL
);
62 CookieHandler
.setDefault(cookies
);
 * The traces handler for this {@link Downloader}.
70 * @return the traces handler
72 public TraceHandler
getTraceHandler() {
 * The traces handler for this {@link Downloader}.
80 * the new traces handler
82 public void setTraceHandler(TraceHandler tracer
) {
84 tracer
= new TraceHandler(false, false, false);
91 * The {@link Cache} to use for all access (can be NULL).
95 public Cache
getCache() {
100 * The {@link Cache} to use for all access (can be NULL).
105 public void setCache(Cache cache
) {
110 * Clear all the cookies currently in the jar.
112 * As long as you don't, the cookies are kept.
114 public void clearCookies() {
115 cookies
.getCookieStore().removeAll();
119 * Open the given {@link URL} and update the cookies.
122 * the {@link URL} to open
123 * @return the {@link InputStream} of the opened page
125 * @throws IOException
126 * in case of I/O error
128 public InputStream
open(URL url
) throws IOException
{
129 return open(url
, false);
133 * Open the given {@link URL} and update the cookies.
136 * the {@link URL} to open
138 * stable a stable file (that doesn't change too often) --
139 * parameter used to check if the file is too old to keep or not
140 * in the cache (default is false)
142 * @return the {@link InputStream} of the opened page
144 * @throws IOException
145 * in case of I/O error
147 public InputStream
open(URL url
, boolean stable
) throws IOException
{
148 return open(url
, url
, url
, null, null, null, null, stable
);
152 * Open the given {@link URL} and update the cookies.
155 * the {@link URL} to open
156 * @param currentReferer
157 * the current referer, for websites that needs this info
158 * @param cookiesValues
161 * the POST parameters
163 * the GET parameters (priority over POST)
165 * OAuth authorization (aka, "bearer XXXXXXX")
167 * @return the {@link InputStream} of the opened page
169 * @throws IOException
170 * in case of I/O error
172 public InputStream
open(URL url
, URL currentReferer
,
173 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
174 Map
<String
, String
> getParams
, String oauth
) throws IOException
{
175 return open(url
, currentReferer
, cookiesValues
, postParams
, getParams
,
180 * Open the given {@link URL} and update the cookies.
183 * the {@link URL} to open
184 * @param currentReferer
185 * the current referer, for websites that needs this info
186 * @param cookiesValues
189 * the POST parameters
191 * the GET parameters (priority over POST)
193 * OAuth authorization (aka, "bearer XXXXXXX")
195 * stable a stable file (that doesn't change too often) --
196 * parameter used to check if the file is too old to keep or not
197 * in the cache (default is false)
199 * @return the {@link InputStream} of the opened page
201 * @throws IOException
202 * in case of I/O error
204 public InputStream
open(URL url
, URL currentReferer
,
205 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
206 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
208 return open(url
, url
, currentReferer
, cookiesValues
, postParams
,
209 getParams
, oauth
, stable
);
213 * Open the given {@link URL} and update the cookies.
216 * the {@link URL} to open
218 * the original {@link URL} before any redirection occurs, which
219 * is also used for the cache ID if needed (so we can retrieve
220 * the content with this URL if needed)
221 * @param currentReferer
222 * the current referer, for websites that needs this info
223 * @param cookiesValues
226 * the POST parameters
228 * the GET parameters (priority over POST)
230 * OAuth authorisation (aka, "bearer XXXXXXX")
232 * a stable file (that doesn't change too often) -- parameter
233 * used to check if the file is too old to keep or not in the
236 * @return the {@link InputStream} of the opened page
238 * @throws IOException
239 * in case of I/O error
241 public InputStream
open(URL url
, final URL originalUrl
, URL currentReferer
,
242 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
243 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
246 tracer
.trace("Request: " + url
);
249 InputStream in
= cache
.load(originalUrl
, false, stable
);
251 tracer
.trace("Use the cache: " + url
);
252 tracer
.trace("Original URL : " + originalUrl
);
257 tracer
.trace("Download: " + url
);
259 URLConnection conn
= openConnectionWithCookies(url
, currentReferer
,
262 // Priority: GET over POST
263 Map
<String
, String
> params
= getParams
;
264 if (getParams
== null) {
268 StringBuilder requestData
= null;
269 if ((params
!= null || oauth
!= null)
270 && conn
instanceof HttpURLConnection
) {
271 if (params
!= null) {
272 requestData
= new StringBuilder();
273 for (Map
.Entry
<String
, String
> param
: params
.entrySet()) {
274 if (requestData
.length() != 0)
275 requestData
.append('&');
276 requestData
.append(URLEncoder
.encode(param
.getKey(),
278 requestData
.append('=');
279 requestData
.append(URLEncoder
.encode(
280 String
.valueOf(param
.getValue()), "UTF-8"));
283 if (getParams
== null && postParams
!= null) {
284 ((HttpURLConnection
) conn
).setRequestMethod("POST");
287 conn
.setRequestProperty("Content-Type",
288 "application/x-www-form-urlencoded");
289 conn
.setRequestProperty("Content-Length",
290 Integer
.toString(requestData
.length()));
294 conn
.setRequestProperty("Authorization", oauth
);
297 if (requestData
!= null) {
298 conn
.setDoOutput(true);
299 OutputStreamWriter writer
= new OutputStreamWriter(
300 conn
.getOutputStream());
302 writer
.write(requestData
.toString());
310 // Manual redirection, much better for POST data
311 if (conn
instanceof HttpURLConnection
) {
312 ((HttpURLConnection
) conn
).setInstanceFollowRedirects(false);
318 // BEWARE! POST data cannot be redirected (some webservers complain) for
319 // HTTP codes 302 and 303
320 if (conn
instanceof HttpURLConnection
) {
323 // Can fail in some circumstances
324 repCode
= ((HttpURLConnection
) conn
).getResponseCode();
325 } catch (IOException e
) {
328 if (repCode
/ 100 == 3) {
329 String newUrl
= conn
.getHeaderField("Location");
330 return open(new URL(newUrl
), originalUrl
, currentReferer
,
332 (repCode
== 302 || repCode
== 303) ?
null : postParams
, //
333 getParams
, oauth
, stable
);
338 InputStream in
= conn
.getInputStream();
339 if ("gzip".equals(conn
.getContentEncoding())) {
340 in
= new GZIPInputStream(in
);
344 throw new IOException("No InputStream!");
348 String size
= conn
.getContentLength() < 0 ?
"unknown size"
349 : StringUtils
.formatNumber(conn
.getContentLength())
351 tracer
.trace("Save to cache (" + size
+ "): " + originalUrl
);
354 long bytes
= cache
.save(in
, originalUrl
);
355 tracer
.trace("Saved to cache: "
356 + StringUtils
.formatNumber(bytes
) + "bytes");
360 in
= cache
.load(originalUrl
, true, true);
361 } catch (IOException e
) {
362 tracer
.error(new IOException(
363 "Cannot save URL to cache, will ignore cache: "
369 } catch (IOException e
) {
370 throw new IOException(String
.format(
371 "Cannot find %s (current URL: %s)", originalUrl
, url
), e
);
376 * Open a connection on the given {@link URL}, and manage the cookies that
380 * the {@link URL} to open
382 * @return the connection
384 * @throws IOException
385 * in case of I/O error
387 private URLConnection
openConnectionWithCookies(URL url
,
388 URL currentReferer
, Map
<String
, String
> cookiesValues
)
390 URLConnection conn
= url
.openConnection();
392 String cookies
= generateCookies(cookiesValues
);
393 if (cookies
!= null && !cookies
.isEmpty()) {
394 conn
.setRequestProperty("Cookie", cookies
);
397 conn
.setRequestProperty("User-Agent", UA
);
398 conn
.setRequestProperty("Accept-Encoding", "gzip");
399 conn
.setRequestProperty("Accept", "*/*");
400 conn
.setRequestProperty("Charset", "utf-8");
402 if (currentReferer
!= null) {
403 conn
.setRequestProperty("Referer", currentReferer
.toString());
404 conn
.setRequestProperty("Host", currentReferer
.getHost());
411 * Generate the cookie {@link String} from the local {@link CookieStore} so
412 * it is ready to be passed.
416 private String
generateCookies(Map
<String
, String
> cookiesValues
) {
417 StringBuilder builder
= new StringBuilder();
418 for (HttpCookie cookie
: cookies
.getCookieStore().getCookies()) {
419 if (builder
.length() > 0) {
423 builder
.append(cookie
.toString());
426 if (cookiesValues
!= null) {
427 for (Map
.Entry
<String
, String
> set
: cookiesValues
.entrySet()) {
428 if (builder
.length() > 0) {
431 builder
.append(set
.getKey());
433 builder
.append(set
.getValue());
437 return builder
.toString();