X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Futils%2FDownloader.java;h=4191d0aea0da85511e2741c50c2a6c83806f612f;hb=844d50dbf3ceb3480b0effc9085752de503856aa;hp=a8a591a363958412a73eb6ef042d38ba33ae1cf9;hpb=530d4062471346d6ececf76d74a0358c91323998;p=nikiroo-utils.git diff --git a/src/be/nikiroo/utils/Downloader.java b/src/be/nikiroo/utils/Downloader.java index a8a591a..4191d0a 100644 --- a/src/be/nikiroo/utils/Downloader.java +++ b/src/be/nikiroo/utils/Downloader.java @@ -29,6 +29,8 @@ public class Downloader { private String UA; private CookieManager cookies; private TraceHandler tracer = new TraceHandler(); + private Cache cache; + private boolean offline; /** * Create a new {@link Downloader}. @@ -37,14 +39,56 @@ public class Downloader { * the User-Agent to use to download the resources -- note that * some websites require one, some actively blacklist real UAs * like the one from wget, some whitelist a couple of browsers - * only (!) + * only (!) -- can be NULL */ public Downloader(String UA) { + this(UA, null); + } + + /** + * Create a new {@link Downloader}. + * + * @param UA + * the User-Agent to use to download the resources -- note that + * some websites require one, some actively blacklist real UAs + * like the one from wget, some whitelist a couple of browsers + * only (!) -- can be NULL + * @param cache + * the {@link Cache} to use for all access (can be NULL) + */ + public Downloader(String UA, Cache cache) { this.UA = UA; - cookies = new CookieManager(); - cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL); + cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL); CookieHandler.setDefault(cookies); + + setCache(cache); + } + + /** + * This {@link Downloader} is forbidden to try and connect to the network. + *

+ * If TRUE, it will only check the cache if any. + *

+ * Default is FALSE. + * + * @return TRUE if offline + */ + public boolean isOffline() { + return offline; + } + + /** + * This {@link Downloader} is forbidden to try and connect to the network. + *

+ * If TRUE, it will only check the cache if any. + *

+ * Default is FALSE. + * + * @param offline TRUE for offline, FALSE for online + */ + public void setOffline(boolean offline) { + this.offline = offline; } /** @@ -63,9 +107,32 @@ public class Downloader { * the new traces handler */ public void setTraceHandler(TraceHandler tracer) { + if (tracer == null) { + tracer = new TraceHandler(false, false, false); + } + this.tracer = tracer; } + /** + * The {@link Cache} to use for all access (can be NULL). + * + * @return the cache + */ + public Cache getCache() { + return cache; + } + + /** + * The {@link Cache} to use for all access (can be NULL). + * + * @param cache + * the new cache + */ + public void setCache(Cache cache) { + this.cache = cache; + } + /** * Clear all the cookies currently in the jar. *

@@ -86,7 +153,26 @@ public class Downloader { * in case of I/O error **/ public InputStream open(URL url) throws IOException { - return open(url, url, url, null, null, null, null); + return open(url, false); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param stable + * stable a stable file (that doesn't change too often) -- + * parameter used to check if the file is too old to keep or not + * in the cache (default is false) + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error + **/ + public InputStream open(URL url, boolean stable) throws IOException { + return open(url, url, url, null, null, null, null, stable); } /** @@ -108,13 +194,46 @@ public class Downloader { * @return the {@link InputStream} of the opened page * * @throws IOException - * in case of I/O error + * in case of I/O error (including offline mode + not in cache) */ public InputStream open(URL url, URL currentReferer, Map cookiesValues, Map postParams, Map getParams, String oauth) throws IOException { + return open(url, currentReferer, cookiesValues, postParams, getParams, + oauth, false); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param currentReferer + * the current referer, for websites that needs this info + * @param cookiesValues + * the cookies + * @param postParams + * the POST parameters + * @param getParams + * the GET parameters (priority over POST) + * @param oauth + * OAuth authorization (aka, "bearer XXXXXXX") + * @param stable + * stable a stable file (that doesn't change too often) -- + * parameter used to check if the file is too old to keep or not + * in the cache (default is false) + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error (including offline mode + not in cache) + */ + public InputStream open(URL url, URL currentReferer, + Map cookiesValues, Map postParams, + Map getParams, String oauth, boolean stable) + throws IOException { return open(url, url, currentReferer, cookiesValues, postParams, - getParams, oauth); + getParams, oauth, stable); } /** @@ -123,23 +242,52 @@ public class Downloader { * @param url * the {@link URL} to open * @param originalUrl - * the original {@link URL} before any redirection occurs + * the original {@link URL} before any redirection occurs, which + * is also used for the cache ID if needed (so we can retrieve + * the content with this URL if needed) + * @param currentReferer + * the current referer, for websites that needs this info + * @param cookiesValues + * the cookies * @param postParams * the POST parameters * @param getParams * the GET parameters (priority over POST) * @param oauth * OAuth authorisation (aka, "bearer XXXXXXX") + * @param stable + * a stable file (that doesn't change too often) -- parameter + * used to check if the file is too old to keep or not in the + * cache + * * @return the {@link InputStream} of the opened page * * @throws IOException - * in case of I/O error + * in case of I/O error (including offline mode + not in cache) */ - private InputStream open(URL url, final URL originalUrl, - URL currentReferer, Map cookiesValues, - Map postParams, Map getParams, - String oauth) throws IOException { + public InputStream open(URL url, final URL originalUrl, URL currentReferer, + Map cookiesValues, Map postParams, + Map getParams, String oauth, boolean stable) + throws IOException { + + tracer.trace("Request: " + url); + if (cache != null) { + InputStream in = cache.load(originalUrl, false, stable); + if (in != null) { + tracer.trace("Use the cache: " + url); + tracer.trace("Original URL : " + originalUrl); + return in; + } + } + + String protocol = originalUrl == null ? null : originalUrl + .getProtocol(); + if (isOffline() && !"file".equalsIgnoreCase(protocol)) { + tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url); + throw new IOException("Downloader is currently OFFLINE, cannot download: " + url); + } + tracer.trace("Download: " + url); URLConnection conn = openConnectionWithCookies(url, currentReferer, @@ -151,9 +299,9 @@ public class Downloader { params = postParams; } + StringBuilder requestData = null; if ((params != null || oauth != null) && conn instanceof HttpURLConnection) { - StringBuilder requestData = null; if (params != null) { requestData = new StringBuilder(); for (Map.Entry param : params.entrySet()) { @@ -166,15 +314,14 @@ public class Downloader { String.valueOf(param.getValue()), "UTF-8")); } - conn.setDoOutput(true); - if (getParams == null && postParams != null) { ((HttpURLConnection) conn).setRequestMethod("POST"); } conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); - conn.setRequestProperty("charset", "utf-8"); + conn.setRequestProperty("Content-Length", + Integer.toString(requestData.length())); } if (oauth != null) { @@ -182,31 +329,86 @@ public class Downloader { } if (requestData != null) { + conn.setDoOutput(true); OutputStreamWriter writer = new OutputStreamWriter( conn.getOutputStream()); - - writer.write(requestData.toString()); - writer.flush(); - writer.close(); + try { + writer.write(requestData.toString()); + writer.flush(); + } finally { + writer.close(); + } } } + // Manual redirection, much better for POST data + if (conn instanceof HttpURLConnection) { + ((HttpURLConnection) conn).setInstanceFollowRedirects(false); + } + conn.connect(); // Check if redirect - if (conn instanceof HttpURLConnection - && ((HttpURLConnection) conn).getResponseCode() / 100 == 3) { - String newUrl = conn.getHeaderField("Location"); - return open(new URL(newUrl), originalUrl, currentReferer, - cookiesValues, postParams, getParams, oauth); - } + // BEWARE! POST data cannot be redirected (some webservers complain) for + // HTTP codes 302 and 303 + if (conn instanceof HttpURLConnection) { + int repCode = 0; + try { + // Can fail in some circumstances + repCode = ((HttpURLConnection) conn).getResponseCode(); + } catch (IOException e) { + } - InputStream in = conn.getInputStream(); - if ("gzip".equals(conn.getContentEncoding())) { - in = new GZIPInputStream(in); + if (repCode / 100 == 3) { + String newUrl = conn.getHeaderField("Location"); + return open(new URL(newUrl), originalUrl, currentReferer, + cookiesValues, // + (repCode == 302 || repCode == 303) ? null : postParams, // + getParams, oauth, stable); + } } - return in; + try { + InputStream in = conn.getInputStream(); + if ("gzip".equals(conn.getContentEncoding())) { + in = new GZIPInputStream(in); + } + + if (in == null) { + throw new IOException("No InputStream!"); + } + + if (cache != null) { + String size = conn.getContentLength() < 0 ? "unknown size" + : StringUtils.formatNumber(conn.getContentLength()) + + "bytes"; + tracer.trace("Save to cache (" + size + "): " + originalUrl); + try { + try { + long bytes = cache.save(in, originalUrl); + tracer.trace("Saved to cache: " + + StringUtils.formatNumber(bytes) + "bytes"); + } finally { + in.close(); + } + in = cache.load(originalUrl, true, true); + } catch (IOException e) { + tracer.error(new IOException( + "Cannot save URL to cache, will ignore cache: " + + url, e)); + } + } + + if (in == null) { + throw new IOException( + "Cannot retrieve the file after storing it in the cache (??)"); + } + + return in; + } catch (IOException e) { + throw new IOException(String.format( + "Cannot find %s (current URL: %s)", originalUrl, url), e); + } } /** @@ -226,9 +428,18 @@ public class Downloader { throws IOException { URLConnection conn = url.openConnection(); - conn.setRequestProperty("User-Agent", UA); - conn.setRequestProperty("Cookie", generateCookies(cookiesValues)); + String cookies = generateCookies(cookiesValues); + if (cookies != null && !cookies.isEmpty()) { + conn.setRequestProperty("Cookie", cookies); + } + + if (UA != null) { + conn.setRequestProperty("User-Agent", UA); + } conn.setRequestProperty("Accept-Encoding", "gzip"); + conn.setRequestProperty("Accept", "*/*"); + conn.setRequestProperty("Charset", "utf-8"); + if (currentReferer != null) { conn.setRequestProperty("Referer", currentReferer.toString()); conn.setRequestProperty("Host", currentReferer.getHost()); @@ -250,7 +461,6 @@ public class Downloader { builder.append(';'); } - // TODO: check if format is ok builder.append(cookie.toString()); }