package be.nikiroo.utils;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.CookieStore;
import java.net.HttpCookie;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.Map;
import java.util.zip.GZIPInputStream;
19 * This class will help you download content from Internet Sites ({@link URL}
22 * It allows you to control some options often required on web sites that do not
23 * want to simply serve HTML, but actively makes your life difficult with stupid
28 public class Downloader
{
30 private CookieManager cookies
;
31 private TraceHandler tracer
= new TraceHandler();
35 * Create a new {@link Downloader}.
38 * the User-Agent to use to download the resources -- note that
39 * some websites require one, some actively blacklist real UAs
40 * like the one from wget, some whitelist a couple of browsers
43 public Downloader(String UA
) {
48 * Create a new {@link Downloader}.
51 * the User-Agent to use to download the resources -- note that
52 * some websites require one, some actively blacklist real UAs
53 * like the one from wget, some whitelist a couple of browsers
56 * the {@link Cache} to use for all access (can be NULL)
58 public Downloader(String UA
, Cache cache
) {
61 cookies
= new CookieManager(null, CookiePolicy
.ACCEPT_ALL
);
62 CookieHandler
.setDefault(cookies
);
68 * The traces handler for this {@link Cache}.
70 * @return the traces handler
72 public TraceHandler
getTraceHandler() {
77 * The traces handler for this {@link Cache}.
80 * the new traces handler
82 public void setTraceHandler(TraceHandler tracer
) {
84 tracer
= new TraceHandler(false, false, false);
91 * The {@link Cache} to use for all access (can be NULL).
95 public Cache
getCache() {
100 * The {@link Cache} to use for all access (can be NULL).
105 public void setCache(Cache cache
) {
110 * Clear all the cookies currently in the jar.
112 * As long as you don't, the cookies are kept.
114 public void clearCookies() {
115 cookies
.getCookieStore().removeAll();
119 * Open the given {@link URL} and update the cookies.
122 * the {@link URL} to open
123 * @return the {@link InputStream} of the opened page
125 * @throws IOException
126 * in case of I/O error
128 public InputStream
open(URL url
) throws IOException
{
129 return open(url
, false);
133 * Open the given {@link URL} and update the cookies.
136 * the {@link URL} to open
138 * stable a stable file (that doesn't change too often) --
139 * parameter used to check if the file is too old to keep or not
140 * in the cache (default is false)
142 * @return the {@link InputStream} of the opened page
144 * @throws IOException
145 * in case of I/O error
147 public InputStream
open(URL url
, boolean stable
) throws IOException
{
148 return open(url
, url
, url
, null, null, null, null, stable
);
152 * Open the given {@link URL} and update the cookies.
155 * the {@link URL} to open
156 * @param currentReferer
157 * the current referer, for websites that needs this info
158 * @param cookiesValues
161 * the POST parameters
163 * the GET parameters (priority over POST)
165 * OAuth authorization (aka, "bearer XXXXXXX")
167 * @return the {@link InputStream} of the opened page
169 * @throws IOException
170 * in case of I/O error
172 public InputStream
open(URL url
, URL currentReferer
,
173 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
174 Map
<String
, String
> getParams
, String oauth
) throws IOException
{
175 return open(url
, currentReferer
, cookiesValues
, postParams
, getParams
,
180 * Open the given {@link URL} and update the cookies.
183 * the {@link URL} to open
184 * @param currentReferer
185 * the current referer, for websites that needs this info
186 * @param cookiesValues
189 * the POST parameters
191 * the GET parameters (priority over POST)
193 * OAuth authorization (aka, "bearer XXXXXXX")
195 * stable a stable file (that doesn't change too often) --
196 * parameter used to check if the file is too old to keep or not
197 * in the cache (default is false)
199 * @return the {@link InputStream} of the opened page
201 * @throws IOException
202 * in case of I/O error
204 public InputStream
open(URL url
, URL currentReferer
,
205 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
206 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
208 return open(url
, url
, currentReferer
, cookiesValues
, postParams
,
209 getParams
, oauth
, stable
);
213 * Open the given {@link URL} and update the cookies.
216 * the {@link URL} to open
218 * the original {@link URL} before any redirection occurs, which
219 * is also used for the cache ID if needed (so we can retrieve
220 * the content with this URL if needed)
221 * @param currentReferer
222 * the current referer, for websites that needs this info
223 * @param cookiesValues
226 * the POST parameters
228 * the GET parameters (priority over POST)
230 * OAuth authorisation (aka, "bearer XXXXXXX")
232 * a stable file (that doesn't change too often) -- parameter
233 * used to check if the file is too old to keep or not in the
236 * @return the {@link InputStream} of the opened page
238 * @throws IOException
239 * in case of I/O error
241 public InputStream
open(URL url
, final URL originalUrl
, URL currentReferer
,
242 Map
<String
, String
> cookiesValues
, Map
<String
, String
> postParams
,
243 Map
<String
, String
> getParams
, String oauth
, boolean stable
)
246 tracer
.trace("Request: " + url
);
249 InputStream in
= cache
.load(originalUrl
, false, stable
);
251 tracer
.trace("Use the cache: " + url
);
252 tracer
.trace("Original URL : " + originalUrl
);
257 tracer
.trace("Download: " + url
);
259 URLConnection conn
= openConnectionWithCookies(url
, currentReferer
,
262 // Priority: GET over POST
263 Map
<String
, String
> params
= getParams
;
264 if (getParams
== null) {
268 StringBuilder requestData
= null;
269 if ((params
!= null || oauth
!= null)
270 && conn
instanceof HttpURLConnection
) {
271 if (params
!= null) {
272 requestData
= new StringBuilder();
273 for (Map
.Entry
<String
, String
> param
: params
.entrySet()) {
274 if (requestData
.length() != 0)
275 requestData
.append('&');
276 requestData
.append(URLEncoder
.encode(param
.getKey(),
278 requestData
.append('=');
279 requestData
.append(URLEncoder
.encode(
280 String
.valueOf(param
.getValue()), "UTF-8"));
283 if (getParams
== null && postParams
!= null) {
284 ((HttpURLConnection
) conn
).setRequestMethod("POST");
287 conn
.setRequestProperty("Content-Type",
288 "application/x-www-form-urlencoded");
289 conn
.setRequestProperty("Content-Length",
290 Integer
.toString(requestData
.length()));
291 conn
.setRequestProperty("charset", "utf-8");
295 conn
.setRequestProperty("Authorization", oauth
);
298 if (requestData
!= null) {
299 conn
.setDoOutput(true);
300 OutputStreamWriter writer
= new OutputStreamWriter(
301 conn
.getOutputStream());
303 writer
.write(requestData
.toString());
311 // Manual redirection, much better for POST data
312 if (conn
instanceof HttpURLConnection
) {
313 ((HttpURLConnection
) conn
).setInstanceFollowRedirects(false);
319 // BEWARE! POST data cannot be redirected, so it is ignored here
320 if (conn
instanceof HttpURLConnection
) {
323 // Can fail in some circumstances
324 repCode
= ((HttpURLConnection
) conn
).getResponseCode();
325 } catch (IOException e
) {
328 if (repCode
/ 100 == 3) {
329 String newUrl
= conn
.getHeaderField("Location");
330 return open(new URL(newUrl
), originalUrl
, currentReferer
,
331 cookiesValues
, null, getParams
, oauth
, stable
);
335 InputStream in
= conn
.getInputStream();
336 if ("gzip".equals(conn
.getContentEncoding())) {
337 in
= new GZIPInputStream(in
);
340 if (in
!= null && cache
!= null) {
341 tracer
.trace("Save to cache: " + originalUrl
);
344 cache
.save(in
, originalUrl
);
348 in
= cache
.load(originalUrl
, true, false);
349 } catch (IOException e
) {
350 tracer
.error(new IOException(
351 "Cannot save URL to cache, will ignore cache: " + url
,
360 * Open a connection on the given {@link URL}, and manage the cookies that
364 * the {@link URL} to open
366 * @return the connection
368 * @throws IOException
369 * in case of I/O error
371 private URLConnection
openConnectionWithCookies(URL url
,
372 URL currentReferer
, Map
<String
, String
> cookiesValues
)
374 URLConnection conn
= url
.openConnection();
376 String cookies
= generateCookies(cookiesValues
);
377 if (cookies
!= null && !cookies
.isEmpty()) {
378 conn
.setRequestProperty("Cookie", cookies
);
381 conn
.setRequestProperty("User-Agent", UA
);
382 conn
.setRequestProperty("Accept-Encoding", "gzip");
383 conn
.setRequestProperty("Accept", "*/*");
385 if (currentReferer
!= null) {
386 conn
.setRequestProperty("Referer", currentReferer
.toString());
387 conn
.setRequestProperty("Host", currentReferer
.getHost());
394 * Generate the cookie {@link String} from the local {@link CookieStore} so
395 * it is ready to be passed.
399 private String
generateCookies(Map
<String
, String
> cookiesValues
) {
400 StringBuilder builder
= new StringBuilder();
401 for (HttpCookie cookie
: cookies
.getCookieStore().getCookies()) {
402 if (builder
.length() > 0) {
406 builder
.append(cookie
.toString());
409 if (cookiesValues
!= null) {
410 for (Map
.Entry
<String
, String
> set
: cookiesValues
.entrySet()) {
411 if (builder
.length() > 0) {
414 builder
.append(set
.getKey());
416 builder
.append(set
.getValue());
420 return builder
.toString();