# Gofetch
+## Version 1.0.0
+
+- Add Le Monde support
+- Fix some small textual issues
+
## Version 0.2.0
- Add Linux Weekly News support
// Appends text to builder using StringJustifier.full(). As of this change the
// text is first split on "\n" and each piece is passed to StringJustifier.full()
// separately, with LINE_SIZE - space.length() as the second argument; the "-"
// lines below are the previous version, which passed the whole text at once.
// Note: every emitted line is prefixed with "i" and terminated by "\r\n";
// space only reduces that second argument and is not itself written out.
private static void appendJustified(StringBuilder builder, String text,
String space) {
- for (String line : StringJustifier.full(text,
- LINE_SIZE - space.length())) {
- builder.append("i").append(line).append("\r\n");
+ for (String line : text.split("\n")) {
+ for (String subline : StringJustifier.full(line,
+ LINE_SIZE - space.length())) {
+ builder.append("i").append(subline).append("\r\n");
+ }
}
}
// Appends text to builder using StringJustifier.left(), called once per
// "\n"-separated line with LINE_SIZE - space.length() as the second argument.
// Each emitted line is "i" + space + prepend + subline + "\r\n", where prepend
// is prependFirst for the very first emitted subline only and prependOthers
// for every one after it (it is not reset when the next "\n"-separated line
// starts). The "-" lines are the previous version, which did not split on
// "\n" before calling StringJustifier.left().
private static void appendLeft(StringBuilder builder, String text,
String prependFirst, String prependOthers, String space) {
String prepend = prependFirst;
- for (String line : StringJustifier.left(text,
- LINE_SIZE - space.length())) {
- builder.append("i").append(space).append(prepend).append(line)
- .append("\r\n");
- prepend = prependOthers;
+ for (String line : text.split("\n")) {
+ for (String subline : StringJustifier.left(line,
+ LINE_SIZE - space.length())) {
+ builder.append("i").append(space).append(prepend)
+ .append(subline).append("\r\n");
+ prepend = prependOthers;
+ }
}
}
}
} else {
builder.append(" <h1>" + story.getTitle() + "</h1>\n");
}
- builder.append(" <div class='details'>(" + story.getDetails()
- + ")</div>\n");
+
+ builder.append(" <div class='details'>");
+ if (story.getDetails() != null && !story.getDetails().isEmpty()) {
+ builder.append("(").append(story.getDetails()).append(")");
+ }
+ builder.append("</div>\n");
builder.append(" <br/>\n");
if (!resume) {
if (resume) {
builder.append(" " + story.getContent() + "\n");
} else {
- builder.append(" " + story.getFullContent() + "\n");
+ builder.append(" "
+ + story.getFullContent().replace("\n", "<br/>")
+ .replace("[ ", "<h2>").replace(" ]", "</h2>")
+ + "\n");
}
builder.append(" </div>\n");
public abstract class BasicSupport {
public enum Type {
- SLASHDOT, PIPEDOT, LWN,
+ SLASHDOT, PIPEDOT, LWN, LEMONDE,
}
public interface QuoteProcessor {
public String processText(String text);
public boolean ignoreNode(Node node);
+
+ /**
+ * Manually process this node if so desired.
+ *
+ * @param node
+ * the node to optionally process
+ *
+ * @return NULL if not processed, a {@link String} (may be empty) if we
+ * must not process it any further
+ */
+ public String manualProcessing(Node node);
}
static private String preselector;
private Type type;
+ /**
+ * List all the recent items, but only assure the ID and internal URL to
+ * fetch it later on (until it has been fetched, the rest of the
+ * {@link Story} is not confirmed).
+ *
+ * @return the list of new stories
+ *
+ * @throws IOException
+ * in case of I/O
+ */
abstract public List<Story> list() throws IOException;
/**
case LWN:
support = new LWN();
break;
+ case LEMONDE:
+ support = new LeMonde();
+ break;
}
if (support != null) {
new NodeTraversor(new NodeVisitor() {
@Override
public void head(Node node, int depth) {
- if (quoteProcessor.ignoreNode(node)
- || ignoredNodes.contains(node.parentNode())) {
+ String manual = null;
+ boolean ignore = quoteProcessor.ignoreNode(node)
+ || ignoredNodes.contains(node.parentNode());
+ if (!ignore) {
+ manual = quoteProcessor.manualProcessing(node);
+ if (manual != null) {
+ currentLine.append(manual);
+ ignore = true;
+ }
+ }
+
+ if (ignore) {
ignoredNodes.add(node);
return;
}
URL url = new URL("https://lwn.net/");
InputStream in = open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements stories = doc.getElementsByClass("pure-u-1");
- for (Element story : stories) {
- Elements titles = story.getElementsByClass("Headline");
- Elements listings = story.getElementsByClass("BlurbListing");
+ Elements articles = doc.getElementsByClass("pure-u-1");
+ for (Element article : articles) {
+ Elements titles = article.getElementsByClass("Headline");
+ Elements listings = article.getElementsByClass("BlurbListing");
if (titles.size() == 0) {
continue;
}
String id = "";
String intUrl = "";
String extUrl = "";
- for (Element idElem : story.getElementsByTag("a")) {
+ for (Element idElem : article.getElementsByTag("a")) {
// Last link is the story link
intUrl = idElem.absUrl("href");
pos = intUrl.indexOf("#Comments");
return false;
}
+
+ @Override
+ public String manualProcessing(Node node) {
+ return null; // null = this processor never handles a node manually
+ }
});
}
}
--- /dev/null
+package be.nikiroo.gofetch.support;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.jsoup.helper.DataUtil;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import be.nikiroo.gofetch.data.Comment;
+import be.nikiroo.gofetch.data.Story;
+
+/**
+ * Support for the Le Monde news site (lemonde.fr).
+ * <p>
+ * {@link #list()} scans the first page of a fixed set of topics and
+ * {@link #fetch(Story)} downloads the body of a single article.
+ */
+public class LeMonde extends BasicSupport {
+    @Override
+    public String getDescription() {
+        return "Le Monde: Actualités et Infos en France et dans le monde";
+    }
+
+    /**
+     * List the stories found on the first page of each supported topic.
+     * <p>
+     * An <tt>article</tt> element is only considered when it contains a
+     * <tt>time</tt> tag, an <tt>h3</tt> title and an element of class
+     * <tt>txt3</tt>; it is only added when the title contains a link. The
+     * story ID is the <tt>datetime</tt> attribute of the first <tt>time</tt>
+     * tag, with ":" replaced by "_".
+     *
+     * @return the list of stories found (never NULL, may be empty)
+     *
+     * @throws IOException
+     *             in case of I/O error
+     */
+    @Override
+    public List<Story> list() throws IOException {
+        List<Story> list = new ArrayList<Story>();
+
+        for (String topic : new String[] { "international", "politique",
+                "societe", "sciences" }) {
+            URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
+            Document doc = loadLeMondePage(url);
+            Elements articles = doc.getElementsByTag("article");
+            for (Element article : articles) {
+                Elements times = article.getElementsByTag("time");
+                Elements titleElements = article.getElementsByTag("h3");
+                Elements contentElements = article.getElementsByClass("txt3");
+                if (times.size() > 0 && titleElements.size() > 0
+                        && contentElements.size() > 0) {
+                    String id = times.get(0).attr("datetime").replace(":", "_");
+                    String title = "[" + topic + "] "
+                            + titleElements.get(0).text();
+                    String content = contentElements.get(0).text();
+                    String intUrl = "";
+                    String extUrl = "";
+                    String details = "";
+
+                    Elements detailsElements = article
+                            .getElementsByClass("signature");
+                    if (detailsElements.size() > 0) {
+                        details = detailsElements.get(0).text();
+                    }
+
+                    // Without a link there is nothing to fetch later on, so
+                    // such an article is not listed at all
+                    Elements links = titleElements.get(0).getElementsByTag("a");
+                    if (links.size() > 0) {
+                        intUrl = links.get(0).absUrl("href");
+                        list.add(new Story(getType(), id, title, details,
+                                intUrl, extUrl, content));
+                    }
+                }
+            }
+        }
+
+        return list;
+    }
+
+    /**
+     * Download the article behind the internal URL of the given story and
+     * store its text as the full content of the story.
+     * <p>
+     * The full content starts with the current content of the story; when
+     * the page has an element with ID <tt>articleBody</tt>, its lines are
+     * appended to it. Elements of class <tt>lire</tt> are skipped and
+     * elements of class <tt>intertitre</tt> are emitted as
+     * <tt>[ text ]</tt> on a line of their own. The comments are always set
+     * to an empty list.
+     *
+     * @param story
+     *            the story to complete
+     *
+     * @throws IOException
+     *             in case of I/O error
+     */
+    @Override
+    public void fetch(Story story) throws IOException {
+        String fullContent = story.getContent();
+        List<Comment> comments = new ArrayList<Comment>();
+
+        // Note: no comments on this site as far as I can see (or maybe with
+        // some javascript, I need to check...)
+
+        URL url = new URL(story.getUrlInternal());
+        Document doc = loadLeMondePage(url);
+        Element article = doc.getElementById("articleBody");
+        if (article != null) {
+            StringBuilder text = new StringBuilder();
+            // append() (not the constructor) so that the result is the same
+            // as the String concatenation it replaces
+            text.append(fullContent);
+            for (String line : toLines(article, new QuoteProcessor() {
+                @Override
+                public String processText(String text) {
+                    return text;
+                }
+
+                @Override
+                public boolean ignoreNode(Node node) {
+                    return node instanceof Element
+                            && ((Element) node).hasClass("lire");
+                }
+
+                @Override
+                public boolean detectQuote(Node node) {
+                    return false;
+                }
+
+                @Override
+                public String manualProcessing(Node node) {
+                    if (node instanceof Element) {
+                        Element element = (Element) node;
+                        if (element.hasClass("intertitre")) {
+                            return "\n[ " + element.text() + " ]\n";
+                        }
+                    }
+                    return null;
+                }
+            })) {
+                text.append(line).append("\n");
+            }
+
+            // Content is too tight with a single break per line:
+            fullContent = text.toString().replace("\n", "\n\n") //
+                    .replace("\n\n\n\n", "\n\n") //
+                    .replace("\n\n\n\n", "\n\n") //
+                    .trim();
+        }
+
+        story.setFullContent(fullContent);
+        story.setComments(comments);
+    }
+
+    /**
+     * Open the given URL, parse it as an UTF-8 document and always close the
+     * underlying stream, even if the parsing fails.
+     *
+     * @param url
+     *            the page to load
+     *
+     * @return the parsed document
+     *
+     * @throws IOException
+     *             in case of I/O error
+     */
+    private Document loadLeMondePage(URL url) throws IOException {
+        InputStream in = open(url);
+        try {
+            return DataUtil.load(in, "UTF-8", url.toString());
+        } finally {
+            if (in != null) {
+                in.close();
+            }
+        }
+    }
+}
URL url = new URL("https://pipedot.org/");
InputStream in = open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements stories = doc.getElementsByClass("story");
- for (Element story : stories) {
- Elements titles = story.getElementsByTag("h1");
+ Elements articles = doc.getElementsByClass("story");
+ for (Element article : articles) {
+ Elements titles = article.getElementsByTag("h1");
if (titles.size() == 0) {
continue;
}
Element title = titles.get(0);
String id = "";
- for (Element idElem : story.getElementsByTag("a")) {
+ for (Element idElem : article.getElementsByTag("a")) {
if (idElem.attr("href").startsWith("/pipe/")) {
id = idElem.attr("href").substring("/pipe/".length());
break;
String intUrl = null;
String extUrl = null;
- Elements links = story.getElementsByTag("a");
+ Elements links = article.getElementsByTag("a");
if (links.size() > 0) {
intUrl = links.get(0).absUrl("href");
}
}
String details = "";
- Elements detailsElements = story.getElementsByTag("div");
+ Elements detailsElements = article.getElementsByTag("div");
if (detailsElements.size() > 0) {
details = detailsElements.get(0).text();
}
String body = "";
- for (Element elem : story.children()) {
+ for (Element elem : article.children()) {
String tag = elem.tag().toString();
if (!tag.equals("header") && !tag.equals("footer")) {
body = elem.text();
public boolean ignoreNode(Node node) {
return false;
}
+
+ @Override
+ public String manualProcessing(Node node) {
+ return null; // null = this processor never handles a node manually
+ }
});
}
}
URL url = new URL("https://slashdot.org/");
InputStream in = open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements stories = doc.getElementsByTag("header");
- for (Element story : stories) {
- Elements titles = story.getElementsByClass("story-title");
+ Elements articles = doc.getElementsByTag("header");
+ for (Element article : articles) {
+ Elements titles = article.getElementsByClass("story-title");
if (titles.size() == 0) {
continue;
}
}
String details = "";
- Elements detailsElements = story.getElementsByClass("details");
+ Elements detailsElements = article.getElementsByClass("details");
if (detailsElements.size() > 0) {
details = detailsElements.get(0).text();
}
public boolean ignoreNode(Node node) {
return false;
}
+
+ @Override
+ public String manualProcessing(Node node) {
+ return null; // null = this processor never handles a node manually
+ }
});
}
}