X-Git-Url: http://git.nikiroo.be/?p=fanfix.git;a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Futils%2FDownloader.java;fp=src%2Fbe%2Fnikiroo%2Futils%2FDownloader.java;h=0487933295ec8c4902665d2bffcd81e57debda2c;hp=0000000000000000000000000000000000000000;hb=d46b7b96f94e88a776bcd2dfd756549ffb300cc9;hpb=c9994f27667bc421bcd448d39e55774fddf5c431 diff --git a/src/be/nikiroo/utils/Downloader.java b/src/be/nikiroo/utils/Downloader.java new file mode 100644 index 0000000..0487933 --- /dev/null +++ b/src/be/nikiroo/utils/Downloader.java @@ -0,0 +1,478 @@ +package be.nikiroo.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.net.CookieHandler; +import java.net.CookieManager; +import java.net.CookiePolicy; +import java.net.CookieStore; +import java.net.HttpCookie; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLEncoder; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * This class will help you download content from Internet Sites ({@link URL} + * based). + *

+ * It allows you to control some options often required on web sites that do not + * want to simply serve HTML, but actively makes your life difficult with stupid + * checks. + * + * @author niki + */ +public class Downloader { + private String UA; + private CookieManager cookies; + private TraceHandler tracer = new TraceHandler(); + private Cache cache; + private boolean offline; + + /** + * Create a new {@link Downloader}. + * + * @param UA + * the User-Agent to use to download the resources -- note that + * some websites require one, some actively blacklist real UAs + * like the one from wget, some whitelist a couple of browsers + * only (!) + */ + public Downloader(String UA) { + this(UA, null); + } + + /** + * Create a new {@link Downloader}. + * + * @param UA + * the User-Agent to use to download the resources -- note that + * some websites require one, some actively blacklist real UAs + * like the one from wget, some whitelist a couple of browsers + * only (!) + * @param cache + * the {@link Cache} to use for all access (can be NULL) + */ + public Downloader(String UA, Cache cache) { + this.UA = UA; + + cookies = new CookieManager(null, CookiePolicy.ACCEPT_ALL); + CookieHandler.setDefault(cookies); + + setCache(cache); + } + + /** + * This {@link Downloader} is forbidden to try and connect to the network. + *

+ * If TRUE, it will only check the cache if any. + *

+ * Default is FALSE. + * + * @return TRUE if offline + */ + public boolean isOffline() { + return offline; + } + + /** + * This {@link Downloader} is forbidden to try and connect to the network. + *

+ * If TRUE, it will only check the cache if any. + *

+ * Default is FALSE. + * + * @param offline TRUE for offline, FALSE for online + */ + public void setOffline(boolean offline) { + this.offline = offline; + } + + /** + * The traces handler for this {@link Cache}. + * + * @return the traces handler + */ + public TraceHandler getTraceHandler() { + return tracer; + } + + /** + * The traces handler for this {@link Cache}. + * + * @param tracer + * the new traces handler + */ + public void setTraceHandler(TraceHandler tracer) { + if (tracer == null) { + tracer = new TraceHandler(false, false, false); + } + + this.tracer = tracer; + } + + /** + * The {@link Cache} to use for all access (can be NULL). + * + * @return the cache + */ + public Cache getCache() { + return cache; + } + + /** + * The {@link Cache} to use for all access (can be NULL). + * + * @param cache + * the new cache + */ + public void setCache(Cache cache) { + this.cache = cache; + } + + /** + * Clear all the cookies currently in the jar. + *

+ * As long as you don't, the cookies are kept. + */ + public void clearCookies() { + cookies.getCookieStore().removeAll(); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error + **/ + public InputStream open(URL url) throws IOException { + return open(url, false); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param stable + * stable a stable file (that doesn't change too often) -- + * parameter used to check if the file is too old to keep or not + * in the cache (default is false) + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error + **/ + public InputStream open(URL url, boolean stable) throws IOException { + return open(url, url, url, null, null, null, null, stable); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param currentReferer + * the current referer, for websites that needs this info + * @param cookiesValues + * the cookies + * @param postParams + * the POST parameters + * @param getParams + * the GET parameters (priority over POST) + * @param oauth + * OAuth authorization (aka, "bearer XXXXXXX") + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error (including offline mode + not in cache) + */ + public InputStream open(URL url, URL currentReferer, + Map cookiesValues, Map postParams, + Map getParams, String oauth) throws IOException { + return open(url, currentReferer, cookiesValues, postParams, getParams, + oauth, false); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param currentReferer + * the current referer, for websites that needs this info + * @param cookiesValues + * the cookies + * @param postParams + * the POST parameters + * @param getParams + * the GET parameters (priority over POST) + * @param oauth + * OAuth authorization (aka, "bearer XXXXXXX") + * @param stable + * stable a stable file (that doesn't change too often) -- + * parameter used to check if the file is too old to keep or not + * in the cache (default is false) + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error (including offline mode + not in cache) + */ + public InputStream open(URL url, URL currentReferer, + Map cookiesValues, Map postParams, + Map getParams, String oauth, boolean stable) + throws IOException { + return open(url, url, currentReferer, cookiesValues, postParams, + getParams, oauth, stable); + } + + /** + * Open the given {@link URL} and update the cookies. + * + * @param url + * the {@link URL} to open + * @param originalUrl + * the original {@link URL} before any redirection occurs, which + * is also used for the cache ID if needed (so we can retrieve + * the content with this URL if needed) + * @param currentReferer + * the current referer, for websites that needs this info + * @param cookiesValues + * the cookies + * @param postParams + * the POST parameters + * @param getParams + * the GET parameters (priority over POST) + * @param oauth + * OAuth authorisation (aka, "bearer XXXXXXX") + * @param stable + * a stable file (that doesn't change too often) -- parameter + * used to check if the file is too old to keep or not in the + * cache + * + * @return the {@link InputStream} of the opened page + * + * @throws IOException + * in case of I/O error (including offline mode + not in cache) + */ + public InputStream open(URL url, final URL originalUrl, URL currentReferer, + Map cookiesValues, Map postParams, + Map getParams, String oauth, boolean stable) + throws IOException { + + tracer.trace("Request: " + url); + + if (cache != null) { + InputStream in = cache.load(originalUrl, false, stable); + if (in != null) { + tracer.trace("Use the cache: " + url); + tracer.trace("Original URL : " + originalUrl); + return in; + } + } + + String protocol = originalUrl == null ? null : originalUrl + .getProtocol(); + if (isOffline() && !"file".equalsIgnoreCase(protocol)) { + tracer.error("Downloader OFFLINE, cannot proceed to URL: " + url); + throw new IOException("Downloader is currently OFFLINE, cannot download: " + url); + } + + tracer.trace("Download: " + url); + + URLConnection conn = openConnectionWithCookies(url, currentReferer, + cookiesValues); + + // Priority: GET over POST + Map params = getParams; + if (getParams == null) { + params = postParams; + } + + StringBuilder requestData = null; + if ((params != null || oauth != null) + && conn instanceof HttpURLConnection) { + if (params != null) { + requestData = new StringBuilder(); + for (Map.Entry param : params.entrySet()) { + if (requestData.length() != 0) + requestData.append('&'); + requestData.append(URLEncoder.encode(param.getKey(), + "UTF-8")); + requestData.append('='); + requestData.append(URLEncoder.encode( + String.valueOf(param.getValue()), "UTF-8")); + } + + if (getParams == null && postParams != null) { + ((HttpURLConnection) conn).setRequestMethod("POST"); + } + + conn.setRequestProperty("Content-Type", + "application/x-www-form-urlencoded"); + conn.setRequestProperty("Content-Length", + Integer.toString(requestData.length())); + } + + if (oauth != null) { + conn.setRequestProperty("Authorization", oauth); + } + + if (requestData != null) { + conn.setDoOutput(true); + OutputStreamWriter writer = new OutputStreamWriter( + conn.getOutputStream()); + try { + writer.write(requestData.toString()); + writer.flush(); + } finally { + writer.close(); + } + } + } + + // Manual redirection, much better for POST data + if (conn instanceof HttpURLConnection) { + ((HttpURLConnection) conn).setInstanceFollowRedirects(false); + } + + conn.connect(); + + // Check if redirect + // BEWARE! POST data cannot be redirected (some webservers complain) for + // HTTP codes 302 and 303 + if (conn instanceof HttpURLConnection) { + int repCode = 0; + try { + // Can fail in some circumstances + repCode = ((HttpURLConnection) conn).getResponseCode(); + } catch (IOException e) { + } + + if (repCode / 100 == 3) { + String newUrl = conn.getHeaderField("Location"); + return open(new URL(newUrl), originalUrl, currentReferer, + cookiesValues, // + (repCode == 302 || repCode == 303) ? null : postParams, // + getParams, oauth, stable); + } + } + + try { + InputStream in = conn.getInputStream(); + if ("gzip".equals(conn.getContentEncoding())) { + in = new GZIPInputStream(in); + } + + if (in == null) { + throw new IOException("No InputStream!"); + } + + if (cache != null) { + String size = conn.getContentLength() < 0 ? "unknown size" + : StringUtils.formatNumber(conn.getContentLength()) + + "bytes"; + tracer.trace("Save to cache (" + size + "): " + originalUrl); + try { + try { + long bytes = cache.save(in, originalUrl); + tracer.trace("Saved to cache: " + + StringUtils.formatNumber(bytes) + "bytes"); + } finally { + in.close(); + } + in = cache.load(originalUrl, true, true); + } catch (IOException e) { + tracer.error(new IOException( + "Cannot save URL to cache, will ignore cache: " + + url, e)); + } + } + + if (in == null) { + throw new IOException( + "Cannot retrieve the file after storing it in the cache (??)"); + } + + return in; + } catch (IOException e) { + throw new IOException(String.format( + "Cannot find %s (current URL: %s)", originalUrl, url), e); + } + } + + /** + * Open a connection on the given {@link URL}, and manage the cookies that + * come with it. + * + * @param url + * the {@link URL} to open + * + * @return the connection + * + * @throws IOException + * in case of I/O error + */ + private URLConnection openConnectionWithCookies(URL url, + URL currentReferer, Map cookiesValues) + throws IOException { + URLConnection conn = url.openConnection(); + + String cookies = generateCookies(cookiesValues); + if (cookies != null && !cookies.isEmpty()) { + conn.setRequestProperty("Cookie", cookies); + } + + conn.setRequestProperty("User-Agent", UA); + conn.setRequestProperty("Accept-Encoding", "gzip"); + conn.setRequestProperty("Accept", "*/*"); + conn.setRequestProperty("Charset", "utf-8"); + + if (currentReferer != null) { + conn.setRequestProperty("Referer", currentReferer.toString()); + conn.setRequestProperty("Host", currentReferer.getHost()); + } + + return conn; + } + + /** + * Generate the cookie {@link String} from the local {@link CookieStore} so + * it is ready to be passed. + * + * @return the cookie + */ + private String generateCookies(Map cookiesValues) { + StringBuilder builder = new StringBuilder(); + for (HttpCookie cookie : cookies.getCookieStore().getCookies()) { + if (builder.length() > 0) { + builder.append(';'); + } + + builder.append(cookie.toString()); + } + + if (cookiesValues != null) { + for (Map.Entry set : cookiesValues.entrySet()) { + if (builder.length() > 0) { + builder.append(';'); + } + builder.append(set.getKey()); + builder.append('='); + builder.append(set.getValue()); + } + } + + return builder.toString(); + } +}