* the usual automatic processing if not NULL
*/
public String manualProcessing(Node node);
+
+ /**
+ * Tell whether this {@link Node} is a subtitle and, if so, which text
+ * should be used for it. A subtitle should be treated as such
+ * (highlighted) in the output.
+ *
+ * @param node
+ * the node to check
+ *
+ * @return NULL if the node is not a subtitle, otherwise the subtitle text
+ * to use
+ */
+ public String isSubtitle(Node node);
}
/**
public String manualProcessing(Node node) {
// This implementation never supplies manually processed text: always NULL
return null;
}
+
+ @Override
+ public String isSubtitle(Node node) {
+ // This implementation never reports a node as a subtitle: always NULL
+ return null;
+ }
}
static private String preselector;
String manual = null;
boolean ignore = elementProcessor.ignoreNode(node)
|| ignoredNodes.contains(node.parentNode());
+ // Manual processing
if (!ignore) {
manual = elementProcessor.manualProcessing(node);
if (manual != null) {
}
}
+ // Subtitle check
+ if (!ignore) {
+ String subtitle = elementProcessor.isSubtitle(node);
+ if (subtitle != null) {
+ subtitle = subtitle.trim();
+ currentLine.append("\n[ " + subtitle + " ]\n");
+ ignore = true;
+ }
+ }
+
if (ignore) {
ignoredNodes.add(node);
return;
try {
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Element article = doc.getElementsByTag("article").first();
+ if (article != null) {
+ article = article.getElementsByAttributeValue("itemprop",
+ "articleBody").first();
+ }
if (article != null) {
for (String line : toLines(article,
new BasicElementProcessor() {
- // TODO: ignore headlines/pub
+ @Override
+ public boolean ignoreNode(Node node) {
+ return node.attr("class").contains("chapo");
+ }
+
+ /**
+  * Treat HTML heading elements (h1 to h6) as subtitles.
+  *
+  * @param node
+  *            the node to check
+  *
+  * @return the text of the heading if the node is an h1..h6 element,
+  *         NULL otherwise
+  */
+ @Override
+ public String isSubtitle(Node node) {
+     if (node instanceof Element) {
+         Element element = (Element) node;
+         String tag = element.tagName();
+         // Require 'h' followed by a digit 1..6: a plain
+         // "starts with h, length 2" test would also match <hr>
+         // and emit an empty subtitle for every horizontal rule
+         if (tag.length() == 2 && tag.charAt(0) == 'h'
+                 && tag.charAt(1) >= '1' && tag.charAt(1) <= '6') {
+             return element.text();
+         }
+     }
+     return null;
+ }
})) {
fullContent += line + "\n";
}
}
@Override
- public String manualProcessing(Node node) {
+ public String isSubtitle(Node node) {
if (node instanceof Element) {
Element element = (Element) node;
if (element.hasClass("intertitre")) {
- return "\n[ " + element.text() + " ]\n";
+ return element.text();
}
}
return null;