From 8816d2f781492532ecdbdee8241f53017b44daba Mon Sep 17 00:00:00 2001 From: Niki Roo Date: Sun, 6 Aug 2017 15:39:00 +0200 Subject: [PATCH] New: Downloader, Cache --- configure.sh | 2 +- src/be/nikiroo/utils/Cache.java | 352 +++++++++++++++++++++++++++ src/be/nikiroo/utils/Downloader.java | 264 ++++++++++++++++++++ 3 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 src/be/nikiroo/utils/Cache.java create mode 100644 src/be/nikiroo/utils/Downloader.java diff --git a/configure.sh b/configure.sh index a27a91e..707c129 100755 --- a/configure.sh +++ b/configure.sh @@ -45,7 +45,7 @@ fi; echo "MAIN = be/nikiroo/utils/test/Test" > Makefile -echo "MORE = be/nikiroo/utils/MarkableFileInputStream be/nikiroo/utils/ui/UIUtils be/nikiroo/utils/ui/WrapLayout be/nikiroo/utils/ui/ProgressBar" >> Makefile +echo "MORE = be/nikiroo/utils/MarkableFileInputStream be/nikiroo/utils/ui/UIUtils be/nikiroo/utils/ui/WrapLayout be/nikiroo/utils/ui/ProgressBar be/nikiroo/utils/Downloader be/nikiroo/utils/Cache" >> Makefile echo "TEST = be/nikiroo/utils/test/Test" >> Makefile echo "TEST_PARAMS = $cols $ok $ko" >> Makefile echo "NAME = nikiroo-utils" >> Makefile diff --git a/src/be/nikiroo/utils/Cache.java b/src/be/nikiroo/utils/Cache.java new file mode 100644 index 0000000..60cd74e --- /dev/null +++ b/src/be/nikiroo/utils/Cache.java @@ -0,0 +1,352 @@ +package be.nikiroo.utils; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Date; + +/** + * A generic cache system, with special support for {@link URL}s. + *

+ * This cache also manages timeout information.
+ *
+ * @author niki
+ */
+public class Cache {
+	private File dir;
+	private long tooOldChanging;
+	private long tooOldStable;
+
+	/**
+	 * Create a new {@link Cache} object.
+	 *
+	 * @param dir
+	 *            the directory to use as cache (created if needed)
+	 * @param hoursChanging
+	 *            the number of hours after which a cached file that is thought
+	 *            to change ~often is considered too old (or -1 for
+	 *            "never too old")
+	 * @param hoursStable
+	 *            the number of hours after which a cached file that is thought
+	 *            to change rarely is considered too old (or -1 for
+	 *            "never too old")
+	 *
+	 * @throws IOException
+	 *             if the cache directory is NULL or cannot be created
+	 */
+	public Cache(File dir, int hoursChanging, int hoursStable)
+			throws IOException {
+		this.dir = dir;
+		this.tooOldChanging = 1000L * 60 * 60 * hoursChanging; // long: no int overflow
+		this.tooOldStable = 1000L * 60 * 60 * hoursStable;
+
+		if (dir != null && !dir.exists()) {
+			dir.mkdirs();
+		}
+
+		if (dir == null || !dir.isDirectory()) {
+			throw new IOException("Cannot create the cache directory: "
+					+ (dir == null ? "null" : dir.getAbsolutePath()));
+		}
+	}
+
+	/**
+	 * Check the resource to see if it is in the cache.
+	 *
+	 * @param url
+	 *            the resource to check
+	 * @param allowTooOld
+	 *            allow files even if they are considered too old
+	 * @param stable
+	 *            a stable file (that doesn't change too often) -- parameter
+	 *            used to check if the file is too old to keep or not
+	 *
+	 * @return TRUE if it is in the cache (and is not too old, unless
+	 *         allowTooOld is set)
+	 */
+	public boolean check(URL url, boolean allowTooOld, boolean stable) {
+		File file = getCached(url);
+		if (file.exists()) {
+			if (allowTooOld || !isOld(file, stable)) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	/**
+	 * Clean the cache (delete the cached items).
+	 *
+	 * @param onlyOld
+	 *            TRUE to only delete the files that are too old for a stable
+	 *            resource, FALSE to delete all the cached files
+	 *
+	 * @return the number of deleted files
+	 */
+	public int clean(boolean onlyOld) {
+		return clean(onlyOld, dir);
+	}
+
+	/**
+	 * Trace information (info/error) generated by this class.
+	 * <p>

+	 * You can override it if you don't want the default sysout/syserr.
+	 *
+	 * @param message
+	 *            the message
+	 * @param error
+	 *            TRUE for error messages, FALSE for information messages
+	 */
+	protected void trace(String message, boolean error) {
+		if (error) {
+			System.err.println(message);
+		} else {
+			System.out.println(message);
+		}
+	}
+
+	/**
+	 * Clean the cache (delete the cached items) in the given cache directory.
+	 *
+	 * @param onlyOld
+	 *            TRUE to only delete the files that are too old for stable
+	 *            resources, FALSE to delete all the files
+	 * @param cacheDir
+	 *            the cache directory to clean (sub-directories included)
+	 *
+	 * @return the number of deleted files
+	 */
+	private int clean(boolean onlyOld, File cacheDir) {
+		int num = 0;
+		// listFiles() returns NULL if cacheDir cannot be listed as a directory
+		File[] files = cacheDir.listFiles();
+		for (File file : files == null ? new File[0] : files) {
+			if (file.isDirectory()) {
+				num += clean(onlyOld, file);
+			} else if (!onlyOld || isOld(file, true)) {
+				if (file.delete()) {
+					num++;
+				} else {
+					trace("Cannot delete temporary file: "
+							+ file.getAbsolutePath(), true);
+				}
+			}
+		}
+
+		return num;
+	}
+
+	/**
+	 * Open a resource from the cache if it exists.
+	 *
+	 * @param uniqueID
+	 *            the unique ID
+	 * @param allowTooOld
+	 *            allow files even if they are considered too old
+	 * @param stable
+	 *            a stable file (that doesn't change too often) -- parameter
+	 *            used to check if the file is too old to keep or not
+	 *
+	 * @return the opened resource if it exists and is not too old (or if
+	 *         allowTooOld is set), NULL if not; NULL is also returned if the
+	 *         file cannot be opened (no exception is thrown by this
+	 *         method)
+	 */
+	public InputStream load(String uniqueID, boolean allowTooOld, boolean stable) {
+		return load(getCached(uniqueID), allowTooOld, stable);
+	}
+
+	/**
+	 * Open a resource from the cache if it exists.
+	 *
+	 * @param url
+	 *            the resource to open
+	 * @param allowTooOld
+	 *            allow files even if they are considered too old
+	 * @param stable
+	 *            a stable file (that doesn't change too often) -- parameter
+	 *            used to check if the file is too old to keep or not
+	 *
+	 * @return the opened resource if found, NULL if not
+	 *
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	public InputStream load(URL url, boolean allowTooOld, boolean stable)
+			throws IOException {
+		return load(getCached(url), allowTooOld, stable);
+	}
+
+	/**
+	 * Open a resource from the cache if it exists.
+	 *
+	 * @param cached
+	 *            the cached {@link File} to open
+	 * @param allowTooOld
+	 *            allow files even if they are considered too old
+	 * @param stable
+	 *            a stable file (that doesn't change too often) -- parameter
+	 *            used to check if the file is too old to keep or not
+	 *
+	 * @return the opened resource if it exists and is not too old (or if
+	 *         allowTooOld is set), NULL if not; NULL is also returned if the
+	 *         file cannot be opened, as {@link FileNotFoundException} is
+	 *         swallowed
+	 */
+	private InputStream load(File cached, boolean allowTooOld, boolean stable) {
+		if (cached.exists() && (allowTooOld || !isOld(cached, stable))) {
+			try {
+				return new MarkableFileInputStream(new FileInputStream(cached));
+			} catch (FileNotFoundException e) {
+				return null;
+			}
+		}
+
+		return null;
+	}
+
+	/**
+	 * Save the given resource to the cache.
+	 *
+	 * @param in
+	 *            the input data
+	 * @param uniqueID
+	 *            a unique ID used to locate the cached resource
+	 *
+	 * @return the resulting {@link File}
+	 *
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	public File save(InputStream in, String uniqueID) throws IOException {
+		File cached = getCached(uniqueID);
+		cached.getParentFile().mkdirs();
+		return save(in, cached);
+	}
+
+	/**
+	 * Save the given resource to the cache.
+	 *
+	 * @param in
+	 *            the input data
+	 * @param url
+	 *            the {@link URL} used to locate the cached resource
+	 * @return the resulting {@link File}
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	public File save(InputStream in, URL url) throws IOException {
+		File cached = getCached(url);
+		return save(in, cached);
+	}
+
+	/**
+	 * Save the given resource to the cache.
+	 *
+	 * @param in
+	 *            the input data
+	 * @param cached
+	 *            the cached {@link File} to save to
+	 * @return the given cached {@link File}
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	private File save(InputStream in, File cached) throws IOException {
+		IOUtils.write(in, cached);
+		return cached;
+	}
+
+	/**
+	 * Check if the {@link File} is too old according to {@link #tooOldStable}
+	 * (stable files) or {@link #tooOldChanging} (other files).
+	 *
+	 * @param file
+	 *            the file to check
+	 * @param stable
+	 *            TRUE to denote stable files, that are not supposed to change
+	 *            too often
+	 * @return TRUE if it is too old or has a timestamp in the future (always
+	 *         FALSE if the applicable limit is negative, i.e., never too old)
+	 */
+	private boolean isOld(File file, boolean stable) {
+		long max = tooOldChanging;
+		if (stable) {
+			max = tooOldStable;
+		}
+
+		if (max < 0) {
+			return false;
+		}
+
+		long time = new Date().getTime() - file.lastModified();
+		if (time < 0) {
+			trace("Timestamp in the future for file: " + file.getAbsolutePath(),
+					true);
+		}
+
+		return time < 0 || time > max;
+	}
+
+	/**
+	 * Return the associated cache {@link File} from this {@link URL}.
+	 * The needed sub-directories are created as a side effect.
+	 *
+	 * @param url
+	 *            the {@link URL}
+	 * @return the cached {@link File} version of this {@link URL}
+	 */
+	private File getCached(URL url) {
+		File subdir;
+
+		String name = url.getHost();
+		if (name == null || name.isEmpty()) {
+			// File (a path without parent is cached at the cache root)
+			String parent = new File(url.getFile()).getParent();
+			parent = parent == null ? "" : parent.replace("..", "__");
+			subdir = new File(dir, allowedChars(new File(parent).getPath()));
+			name = allowedChars(url.getFile());
+		} else {
+			// URL (a missing query gives the name "_null")
+			File subsubDir = new File(dir, allowedChars(url.getHost()));
+			subdir = new File(subsubDir, "_" + allowedChars(url.getPath()));
+			name = allowedChars("_" + url.getQuery());
+		}
+
+		File cacheFile = new File(subdir, name);
+		subdir.mkdirs();
+
+		return cacheFile;
+	}
+
+	/**
+	 * Get the basic cache resource file corresponding to this unique ID.
+	 * <p>

+	 * Note that you may need to add a sub-directory in some cases.
+	 *
+	 * @param uniqueID
+	 *            the id (sanitised before use)
+	 *
+	 * @return the {@link File} for this ID (never NULL, whether it exists or not)
+	 */
+	private File getCached(String uniqueID) {
+		File file = new File(dir, allowedChars(uniqueID));
+		File subdir = new File(file.getParentFile(), "_");
+		return new File(subdir, file.getName());
+	}
+
+	/**
+	 * Replace the chars '/', ':' and '\' (not allowed in a {@link File}) by "_".
+	 *
+	 * @param raw
+	 *            the raw {@link String} (must not be NULL)
+	 *
+	 * @return the sanitised {@link String}
+	 */
+	private String allowedChars(String raw) {
+		return raw.replace('/', '_').replace(':', '_').replace("\\", "_");
+	}
+}
\ No newline at end of file
diff --git a/src/be/nikiroo/utils/Downloader.java b/src/be/nikiroo/utils/Downloader.java
new file mode 100644
index 0000000..67fd652
--- /dev/null
+++ b/src/be/nikiroo/utils/Downloader.java
@@ -0,0 +1,264 @@
+package be.nikiroo.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
+import java.net.CookieStore;
+import java.net.HttpCookie;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLEncoder;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This class will help you download content from Internet Sites ({@link URL}
+ * based).
+ * <p>

+ * It allows you to control some options often required on web sites that do not
+ * want to simply serve HTML, but actively make your life difficult with stupid
+ * checks.
+ *
+ * @author niki
+ */
+public class Downloader {
+	private String UA;
+	private CookieManager cookies;
+
+	/**
+	 * Create a new {@link Downloader}, whose cookie jar (which accepts all the
+	 * cookies) is installed as the default {@link CookieHandler}.
+	 * @param UA
+	 *            the User-Agent to use to download the resources -- note that
+	 *            some websites require one, some actively blacklist real UAs
+	 *            like the one from wget, some whitelist a couple of browsers
+	 *            only (!)
+	 */
+	public Downloader(String UA) {
+		this.UA = UA;
+
+		cookies = new CookieManager();
+		cookies.setCookiePolicy(CookiePolicy.ACCEPT_ALL);
+		CookieHandler.setDefault(cookies);
+	}
+
+	/**
+	 * Clear all the cookies currently in the jar.
+	 * <p>

+	 * As long as you don't, the cookies are kept.
+	 */
+	public void clearCookies() {
+		cookies.getCookieStore().removeAll();
+	}
+
+	/**
+	 * Open the given {@link URL} and update the cookies (the {@link URL} is
+	 * sent as its own referer, without extra cookies or parameters).
+	 * @param url
+	 *            the {@link URL} to open
+	 * @return the {@link InputStream} of the opened page
+	 *
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	public InputStream open(URL url) throws IOException {
+		return open(url, url, url, null, null, null, null);
+	}
+
+	/**
+	 * Open the given {@link URL} and update the cookies.
+	 *
+	 * @param url
+	 *            the {@link URL} to open
+	 * @param currentReferer the referer to send (can be NULL: no referer)
+	 * @param cookiesValues extra cookies (name, value) to send (can be NULL)
+	 * @param postParams
+	 *            the POST parameters
+	 * @param getParams
+	 *            the GET parameters (priority over POST)
+	 * @param oauth
+	 *            OAuth authorization (aka, "bearer XXXXXXX")
+	 * @return the {@link InputStream} of the opened page
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	public InputStream open(URL url, URL currentReferer,
+			Map<String, String> cookiesValues, Map<String, String> postParams,
+			Map<String, String> getParams, String oauth) throws IOException {
+		return open(url, url, currentReferer, cookiesValues, postParams,
+				getParams, oauth);
+	}
+
+	/**
+	 * Trace information (info/error) generated by this class.
+	 * <p>

+	 * You can override it if you don't want the default sysout/syserr.
+	 *
+	 * @param message
+	 *            the message
+	 * @param error
+	 *            TRUE for error messages, FALSE for information messages
+	 */
+	protected void trace(String message, boolean error) {
+		if (error) {
+			System.err.println(message);
+		} else {
+			System.out.println(message);
+		}
+	}
+
+	/**
+	 * Open the given {@link URL} and update the cookies (follows redirects).
+	 * @param url
+	 *            the {@link URL} to open
+	 * @param originalUrl
+	 *            the original {@link URL} before any redirection occurs
+	 * @param currentReferer the referer to send (can be NULL: no referer)
+	 * @param cookiesValues extra cookies (name, value) to send (can be NULL)
+	 * @param postParams
+	 *            the POST parameters
+	 * @param getParams
+	 *            the GET parameters (priority over POST)
+	 * @param oauth
+	 *            OAuth authorisation (aka, "bearer XXXXXXX")
+	 * @return the {@link InputStream} of the opened page
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	private InputStream open(URL url, final URL originalUrl,
+			URL currentReferer, Map<String, String> cookiesValues,
+			Map<String, String> postParams, Map<String, String> getParams,
+			String oauth) throws IOException {
+
+		trace("Download: " + url, false);
+
+		URLConnection conn = openConnectionWithCookies(url, currentReferer,
+				cookiesValues);
+
+		// Priority: GET over POST (both are written to the request body)
+		Map<String, String> params = getParams;
+		if (getParams == null) {
+			params = postParams;
+		}
+
+		if ((params != null || oauth != null)
+				&& conn instanceof HttpURLConnection) {
+			StringBuilder requestData = null;
+			if (params != null) {
+				requestData = new StringBuilder();
+				for (Map.Entry<String, String> param : params.entrySet()) {
+					if (requestData.length() != 0)
+						requestData.append('&');
+					requestData.append(URLEncoder.encode(param.getKey(),
+							"UTF-8"));
+					requestData.append('=');
+					requestData.append(URLEncoder.encode(
+							String.valueOf(param.getValue()), "UTF-8"));
+				}
+
+				conn.setDoOutput(true);
+
+				if (getParams == null && postParams != null) {
+					((HttpURLConnection) conn).setRequestMethod("POST");
+				}
+
+				conn.setRequestProperty("Content-Type",
+						"application/x-www-form-urlencoded");
+				conn.setRequestProperty("charset", "utf-8");
+			}
+
+			if (oauth != null) {
+				conn.setRequestProperty("Authorization", oauth);
+			}
+
+			if (requestData != null) {
+				OutputStreamWriter writer = new OutputStreamWriter(
+						conn.getOutputStream(), "UTF-8");
+
+				writer.write(requestData.toString());
+				writer.flush();
+				writer.close();
+			}
+		}
+
+		conn.connect();
+
+		// Check if redirect (the Location can be relative to the current URL)
+		if (conn instanceof HttpURLConnection
+				&& ((HttpURLConnection) conn).getResponseCode() / 100 == 3) {
+			String newUrl = conn.getHeaderField("Location");
+			return open(new URL(url, newUrl), originalUrl, currentReferer,
+					cookiesValues, postParams, getParams, oauth);
+		}
+
+		InputStream in = conn.getInputStream();
+		if ("gzip".equals(conn.getContentEncoding())) {
+			in = new GZIPInputStream(in);
+		}
+
+		return in;
+	}
+
+	/**
+	 * Open a connection on the given {@link URL}, and manage the cookies that
+	 * come with it.
+	 *
+	 * @param url
+	 *            the {@link URL} to open
+	 * @param currentReferer the referer (and "Host") to send, NULL for none
+	 * @param cookiesValues extra cookies (name, value) to send (can be NULL)
+	 * @return the connection
+	 * @throws IOException
+	 *             in case of I/O error
+	 */
+	private URLConnection openConnectionWithCookies(URL url,
+			URL currentReferer, Map<String, String> cookiesValues)
+			throws IOException {
+		URLConnection conn = url.openConnection();
+
+		conn.setRequestProperty("User-Agent", UA);
+		conn.setRequestProperty("Cookie", generateCookies(cookiesValues));
+		conn.setRequestProperty("Accept-Encoding", "gzip");
+		if (currentReferer != null) {
+			conn.setRequestProperty("Referer", currentReferer.toString());
+			conn.setRequestProperty("Host", currentReferer.getHost());
+		}
+
+		return conn;
+	}
+
+	/**
+	 * Generate the cookie {@link String} from the local {@link CookieStore} so
+	 * it is ready to be passed.
+	 * @param cookiesValues extra cookies (name, value) to add (can be NULL)
+	 * @return the cookie (all the cookies, separated by ';')
+	 */
+	private String generateCookies(Map<String, String> cookiesValues) {
+		StringBuilder builder = new StringBuilder();
+		for (HttpCookie cookie : cookies.getCookieStore().getCookies()) {
+			if (builder.length() > 0) {
+				builder.append(';');
+			}
+
+			// TODO: check if format is ok
+			builder.append(cookie.toString());
+		}
+
+		if (cookiesValues != null) {
+			for (Map.Entry<String, String> set : cookiesValues.entrySet()) {
+				if (builder.length() > 0) {
+					builder.append(';');
+				}
+				builder.append(set.getKey());
+				builder.append('=');
+				builder.append(set.getValue());
+			}
+		}
+
+		return builder.toString();
+	}
+}
-- 
2.27.0