From b34d1f357b076c1697e381b70cb6ff1bb0278b91 Mon Sep 17 00:00:00 2001 From: Niki Roo Date: Fri, 23 Mar 2018 23:22:09 +0100 Subject: [PATCH] Separate story details components --- src/be/nikiroo/gofetch/data/Story.java | 31 ++++- .../nikiroo/gofetch/support/BasicSupport.java | 116 +++++++++++++++--- src/be/nikiroo/gofetch/support/LWN.java | 21 +++- src/be/nikiroo/gofetch/support/LeMonde.java | 16 ++- src/be/nikiroo/gofetch/support/Pipedot.java | 4 +- src/be/nikiroo/gofetch/support/Slashdot.java | 29 ++++- .../nikiroo/gofetch/support/TheRegister.java | 40 +++--- src/be/nikiroo/gofetch/support/TooLinux.java | 14 ++- 8 files changed, 208 insertions(+), 63 deletions(-) diff --git a/src/be/nikiroo/gofetch/data/Story.java b/src/be/nikiroo/gofetch/data/Story.java index a2ad7d3..5944bec 100644 --- a/src/be/nikiroo/gofetch/data/Story.java +++ b/src/be/nikiroo/gofetch/data/Story.java @@ -15,6 +15,9 @@ public class Story { private Type type; private String id; private String title; + private String author; + private String date; + private String category; private String details; private String urlInternal; private String urlExternal; @@ -32,8 +35,15 @@ public class Story { * the news ID * @param title * the news title + * @param author + * the author name for the details + * @param date + * the post date for the details + * @param category + * the category for the details * @param details - * some details to add to the title + * some details to add to the title (author, date and category + * will be added in the getter if available) * @param urlInternal * the {@link URL} to get this news on the associated news site * @param urlExternal @@ -41,11 +51,15 @@ public class Story { * @param content * the story content */ - public Story(Type type, String id, String title, String details, - String urlInternal, String urlExternal, String content) { + public Story(Type type, String id, String title, String author, + String date, String category, String details, String urlInternal, + 
String urlExternal, String content) { this.type = type; this.id = id; this.title = title; + this.author = author; + this.date = date; + this.category = category; this.details = details; this.urlInternal = urlInternal; this.urlExternal = urlExternal; @@ -77,6 +91,17 @@ public class Story { * @return the details */ public String getDetails() { + String details = ""; + + if (category != null && !category.trim().isEmpty()) + details += "[" + category + "] "; + if (date != null && !date.trim().isEmpty()) + details += date + " "; + if (author != null && !author.trim().isEmpty()) + details += "(" + this.author + ") "; + if (this.details != null && !this.details.trim().isEmpty()) + details += "\n" + this.details; + return details; } diff --git a/src/be/nikiroo/gofetch/support/BasicSupport.java b/src/be/nikiroo/gofetch/support/BasicSupport.java index 4067979..6d930f6 100644 --- a/src/be/nikiroo/gofetch/support/BasicSupport.java +++ b/src/be/nikiroo/gofetch/support/BasicSupport.java @@ -1,7 +1,10 @@ package be.nikiroo.gofetch.support; import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Date; import java.util.List; import org.jsoup.helper.StringUtil; @@ -15,11 +18,33 @@ import org.jsoup.select.NodeVisitor; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.Downloader; +/** + * Base class for website support. + * + * @author niki + */ public abstract class BasicSupport { + /** The downloader to use for all websites. */ protected static Downloader downloader = new Downloader("gofetcher"); + /** + * The support type (each website we support has a single type). 
+ * + * @author niki + */ public enum Type { - SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER, TOOLINUX, + /** EN: Any, but mostly IT/Sci */ + SLASHDOT, + /** EN: Clone of Slashdot, mostly abandoned */ + PIPEDOT, + /** EN: Linux */ + LWN, + /** FR: Any */ + LEMONDE, + /** EN: IT */ + REGISTER, + /** FR: Linux */ + TOO_LINUX, } /** @@ -43,7 +68,8 @@ public abstract class BasicSupport { * * @param text * the text to process - * @return + * + * @return the resulting text */ public String processText(String text); @@ -130,21 +156,49 @@ public abstract class BasicSupport { */ abstract public void fetch(Story story) throws IOException; + /** + * The website textual description, to add in the dispatcher page. + *

+ * Should be short. + * + * @return the description + */ abstract public String getDescription(); + /** + * The gopher "selector" to use for output. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @return the selector + */ public String getSelector() { return getSelector(type); } + /** + * The support type. + * + * @return the type + */ public Type getType() { return type; } + /** + * The support type. + * + * @param type + * the new type + */ protected void setType(Type type) { this.type = type; } /** + * The {@link String} to append to the selector (the selector will be + * constructed as "this string" then "/type/"). + * * @param preselector * the preselector to set */ @@ -181,7 +235,7 @@ public abstract class BasicSupport { case REGISTER: support = new TheRegister(); break; - case TOOLINUX: + case TOO_LINUX: support = new TooLinux(); break; } @@ -194,6 +248,17 @@ public abstract class BasicSupport { return support; } + /** + * The gopher "selector" to use for output for this type, using the + * preselector. + *

+ * A kind of "URL path", like "/news/" or "/misc/news/" or... + * + * @param type + * the type to get the selector of + * + * @return the selector + */ static public String getSelector(Type type) { return preselector + "/" + type + "/"; } @@ -256,7 +321,6 @@ public abstract class BasicSupport { final StringBuilder currentLine = new StringBuilder(); final List quoted = new ArrayList(); final List ignoredNodes = new ArrayList(); - final List footnotes = new ArrayList(); if (element != null) { new NodeTraversor(new NodeVisitor() { @@ -314,11 +378,6 @@ public abstract class BasicSupport { if (block && currentLine.length() > 0) { currentLine.append("\n"); } - - if (!element.absUrl("href").trim().isEmpty()) { - footnotes.add(element.absUrl("href")); - currentLine.append("[" + footnotes.size() + "]"); - } } else if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String line = StringUtil.normaliseWhitespace(textNode @@ -355,16 +414,37 @@ public abstract class BasicSupport { lines.set(i, lines.get(i).replace(" ", " ").trim()); } - if (footnotes.size() > 0) { - lines.add(""); - lines.add(""); - lines.add(""); - lines.add(""); - for (int i = 0; i < footnotes.size(); i++) { - lines.add("[" + (i + 1) + "] " + footnotes.get(i)); - } + return lines; + } + + /** + * Reformat the date if possible. 
+ * + * @param date + * the input date + * + * @return the reformatted date, or the same value if it was not parsable + */ + static protected String date(String date) { + SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd"); + + long epoch = 0; + try { + epoch = Long.parseLong(date); + } catch (Exception e) { + epoch = 0; + } - return lines; + if (epoch > 0) { + return out.format(new Date(1000 * epoch)); + } + + try { + Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX") + .parse(date.trim()); + return out.format(dat); + } catch (ParseException e) { + return date; + } } } diff --git a/src/be/nikiroo/gofetch/support/LWN.java b/src/be/nikiroo/gofetch/support/LWN.java index 27b539c..37a5a8f 100644 --- a/src/be/nikiroo/gofetch/support/LWN.java +++ b/src/be/nikiroo/gofetch/support/LWN.java @@ -59,8 +59,16 @@ public class LWN extends BasicSupport { } body = body.trim(); + int pos; + + String categ = ""; + pos = details.indexOf("]"); + if (pos >= 0) { + categ = details.substring(1, pos + 1).trim(); + } + String author = ""; - int pos = details.indexOf(" by "); + pos = details.indexOf(" by "); if (pos >= 0) { author = details.substring(pos + " by ".length()).trim(); } @@ -69,8 +77,15 @@ public class LWN extends BasicSupport { pos = details.indexOf(" Posted "); if (pos >= 0) { date = details.substring(pos + " Posted ".length()).trim(); + pos = details.indexOf(" by "); + if (pos >= 0) { + author = details.substring(0, pos).trim(); + } } + // We extracted everything from details so...
+ details = ""; + String id = ""; String intUrl = ""; String extUrl = ""; @@ -84,8 +99,8 @@ public class LWN extends BasicSupport { id = intUrl.replaceAll("[^0-9]", ""); } - list.add(new Story(getType(), id, title, details, intUrl, extUrl, - body)); + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, body)); } return list; diff --git a/src/be/nikiroo/gofetch/support/LeMonde.java b/src/be/nikiroo/gofetch/support/LeMonde.java index d11ba79..4ec2c30 100644 --- a/src/be/nikiroo/gofetch/support/LeMonde.java +++ b/src/be/nikiroo/gofetch/support/LeMonde.java @@ -15,6 +15,11 @@ import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; +/** + * Support http://www.lemonde.fr/. + * + * @author niki + */ public class LeMonde extends BasicSupport { @Override public String getDescription() { @@ -39,24 +44,25 @@ public class LeMonde extends BasicSupport { && contentElements.size() > 0) { String id = times.get(0).attr("datetime").replace(":", "_") .replace("+", "_"); - String title = "[" + topic + "] " - + titleElements.get(0).text(); + String title = titleElements.get(0).text(); + String date = date(titleElements.get(0).text()); String content = contentElements.get(0).text(); String intUrl = ""; String extUrl = ""; + String author = ""; String details = ""; Elements detailsElements = article .getElementsByClass("signature"); if (detailsElements.size() > 0) { - details = detailsElements.get(0).text(); + author = detailsElements.get(0).text(); } Elements links = titleElements.get(0).getElementsByTag("a"); if (links.size() > 0) { intUrl = links.get(0).absUrl("href"); - list.add(new Story(getType(), id, title, details, - intUrl, extUrl, content)); + list.add(new Story(getType(), id, title, author, date, + topic, details, intUrl, extUrl, content)); } } } diff --git a/src/be/nikiroo/gofetch/support/Pipedot.java b/src/be/nikiroo/gofetch/support/Pipedot.java index 17388b2..edbb804 100644 --- 
a/src/be/nikiroo/gofetch/support/Pipedot.java +++ b/src/be/nikiroo/gofetch/support/Pipedot.java @@ -82,8 +82,8 @@ public class Pipedot extends BasicSupport { } } - list.add(new Story(getType(), id, title.text(), details, intUrl, - extUrl, body)); + list.add(new Story(getType(), id, title.text(), "", "", "", + details, intUrl, extUrl, body)); } return list; diff --git a/src/be/nikiroo/gofetch/support/Slashdot.java b/src/be/nikiroo/gofetch/support/Slashdot.java index 43d35f4..4746cc2 100644 --- a/src/be/nikiroo/gofetch/support/Slashdot.java +++ b/src/be/nikiroo/gofetch/support/Slashdot.java @@ -14,6 +14,7 @@ import org.jsoup.select.Elements; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; +import be.nikiroo.utils.StringUtils; /** * Support https://slashdot.org/. @@ -63,14 +64,38 @@ public class Slashdot extends BasicSupport { details = detailsElements.get(0).text(); } + // details: + // "Posted by AUTHOR on DATE from the further-crackdown dept." + String author = ""; + int pos = details.indexOf(" on "); + if (details.startsWith("Posted by ") && pos >= 0) { + author = details.substring("Posted by ".length(), pos).trim(); + } + pos = details.indexOf(" from the "); + if (pos >= 0) { + details = details.substring(pos).trim(); + } + String body = ""; Element bodyElement = doc.getElementById("text-" + id); if (bodyElement != null) { body = bodyElement.text(); } - list.add(new Story(getType(), id, title.text(), details, intUrl, - extUrl, body)); + String categ = ""; + Element categElement = doc.getElementsByClass("topic").first(); + if (categElement != null) { + categ = StringUtils.unhtml(categElement.text()).trim(); + } + + String date = ""; + Element dateElement = doc.getElementsByTag("time").first(); + if (dateElement != null) { + date = StringUtils.unhtml(dateElement.text()).trim(); + } + + list.add(new Story(getType(), id, title.text(), author, date, + categ, details, intUrl, extUrl, body)); } return list; diff --git 
a/src/be/nikiroo/gofetch/support/TheRegister.java b/src/be/nikiroo/gofetch/support/TheRegister.java index 3d7496a..5903eaa 100644 --- a/src/be/nikiroo/gofetch/support/TheRegister.java +++ b/src/be/nikiroo/gofetch/support/TheRegister.java @@ -3,9 +3,7 @@ package be.nikiroo.gofetch.support; import java.io.IOException; import java.io.InputStream; import java.net.URL; -import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Date; import java.util.List; import org.jsoup.helper.DataUtil; @@ -18,6 +16,12 @@ import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.StringUtils; +/** + * Support https://www.theregister.co.uk/. + * + * @author niki + */ public class TheRegister extends BasicSupport { @Override public String getDescription() { @@ -46,17 +50,18 @@ public class TheRegister extends BasicSupport { String date = ""; String details = ""; String body = ""; + String categ = ""; + String author = ""; // nope - String topic = ""; - Element topicElement = article.previousElementSibling(); - if (topicElement != null) { - topic = "[" + topicElement.text().trim() + "] "; + Element categElement = article.previousElementSibling(); + if (categElement != null) { + categ = categElement.text().trim(); } + Element titleElement = article.getElementsByTag("h4").first(); if (titleElement != null) { title = StringUtils.unhtml(titleElement.text()).trim(); } - title = topic + title; Element dateElement = article.getElementsByClass("time_stamp") .first(); @@ -80,8 +85,8 @@ public class TheRegister extends BasicSupport { details += StringUtils.unhtml(detailsElement.text()).trim(); } - list.add(new Story(getType(), id, title, details, intUrl, extUrl, - body)); + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, body)); } return list; @@ -206,21 +211,4 @@ public class TheRegister extends BasicSupport { } } } - - // Return display date from epoch String, or "" if error - private 
static String date(String epochString) { - long epoch = 0; - try { - epoch = Long.parseLong(epochString); - } catch (Exception e) { - epoch = 0; - } - - if (epoch > 0) { - return new SimpleDateFormat("dd MMM YYYY").format(new Date( - 1000 * epoch)); - } - - return ""; - } } diff --git a/src/be/nikiroo/gofetch/support/TooLinux.java b/src/be/nikiroo/gofetch/support/TooLinux.java index c875783..8061331 100644 --- a/src/be/nikiroo/gofetch/support/TooLinux.java +++ b/src/be/nikiroo/gofetch/support/TooLinux.java @@ -16,10 +16,15 @@ import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; import be.nikiroo.utils.StringUtils; +/** + * Support https://www.toolinux.com/. + * + * @author niki + */ public class TooLinux extends BasicSupport { @Override public String getDescription() { - return "TooLinux: Premier quotidien francophone d'actualité généraliste sur Linux, les logiciels libres et l'interopérabilité, depuis mars 2000."; + return "TooLinux: Actualité généraliste sur Linux et les logiciels libres"; } @Override @@ -38,6 +43,8 @@ public class TooLinux extends BasicSupport { String date = ""; String details = ""; String body = ""; + String author = ""; // nope + String categ = ""; // nope Element urlElement = article.getElementsByTag("a").first(); if (urlElement != null) { @@ -64,13 +71,12 @@ public class TooLinux extends BasicSupport { Element detailsElement = article.getElementsByClass("introduction") .first(); - details = "(" + date + ") "; if (detailsElement != null) { details += StringUtils.unhtml(detailsElement.text()).trim(); } - list.add(new Story(getType(), id, title, details, intUrl, extUrl, - body)); + list.add(new Story(getType(), id, title, author, date, categ, + details, intUrl, extUrl, body)); } return list; -- 2.27.0