Fix subtitles and too much content in EreNumerique
author Niki Roo <niki@nikiroo.be>
Sun, 25 Mar 2018 12:37:48 +0000 (14:37 +0200)
committer Niki Roo <niki@nikiroo.be>
Sun, 25 Mar 2018 12:37:48 +0000 (14:37 +0200)
src/be/nikiroo/gofetch/support/BasicSupport.java
src/be/nikiroo/gofetch/support/EreNumerique.java
src/be/nikiroo/gofetch/support/LeMonde.java

index 8fc259a19daa84d387edb1a14b3b9d2adf7583b2..b15fac7e5e2598d0d67c3bcf493c6dae03a0a8a1 100644 (file)
@@ -100,6 +100,17 @@ public abstract class BasicSupport {
                 *         the usual automatic processing if not NULL
                 */
                public String manualProcessing(Node node);
+
+               /**
+                * Check if this {@link Node} is a subtitle; if so, the returned text
+                * is output highlighted and the node itself is otherwise ignored.
+                * 
+                * @param node
+                *            the node to check
+                * 
+                * @return NULL if it is not a subtitle, the subtitle to use if it is
+                */
+               public String isSubtitle(Node node);
        }
 
        /**
@@ -128,6 +139,11 @@ public abstract class BasicSupport {
                public String manualProcessing(Node node) {
                        return null;
                }
+
+               @Override
+               public String isSubtitle(Node node) {
+                       return null; // by default, no node is a subtitle
+               }
        }
 
        static private String preselector;
@@ -334,6 +350,7 @@ public abstract class BasicSupport {
                                        String manual = null;
                                        boolean ignore = elementProcessor.ignoreNode(node)
                                                        || ignoredNodes.contains(node.parentNode());
+                                       // Manual processing
                                        if (!ignore) {
                                                manual = elementProcessor.manualProcessing(node);
                                                if (manual != null) {
@@ -342,6 +359,16 @@ public abstract class BasicSupport {
                                                }
                                        }
 
+                                       // Subtitle check
+                                       if (!ignore) {
+                                               String subtitle = elementProcessor.isSubtitle(node);
+                                               if (subtitle != null) {
+                                                       subtitle = subtitle.trim();
+                                                       currentLine.append("\n[ " + subtitle + " ]\n");
+                                                       ignore = true;
+                                               }
+                                       }
+
                                        if (ignore) {
                                                ignoredNodes.add(node);
                                                return;
index bef677d9f8a75a3ec3c8faa9e802e05004719e22..b6a7598027c9b632cb52fb50f22677e4a1a314b4 100644 (file)
@@ -100,10 +100,29 @@ public class EreNumerique extends BasicSupport {
                try {
                        Document doc = DataUtil.load(in, "UTF-8", url.toString());
                        Element article = doc.getElementsByTag("article").first();
+                       if (article != null) {
+                               article = article.getElementsByAttributeValue("itemprop",
+                                               "articleBody").first();
+                       }
                        if (article != null) {
                                for (String line : toLines(article,
                                                new BasicElementProcessor() {
-                                                       // TODO: ignore headlines/pub
+                                                       @Override
+                                                       public boolean ignoreNode(Node node) {
+                                                               return node.attr("class").contains("chapo"); // skip "chapo"-classed nodes
+                                                       }
+
+                                                       @Override
+                                                       public String isSubtitle(Node node) {
+                                                               if (node instanceof Element) {
+                                                                       Element element = (Element) node;
+                                                                       // h1-h6 only: a bare "h" + 1 char test also matches <hr>
+                                                                       if (element.tagName().matches("h[1-6]")) {
+                                                                               return element.text();
+                                                                       }
+                                                               }
+                                                               return null;
+                                                       }
                                                })) {
                                        fullContent += line + "\n";
                                }
index 4ec2c30f39a6fcb0b344ca4dd387d629f35c8dc0..235f7ee2ce2985738401cb45b6d6e8a1b25ec750 100644 (file)
@@ -98,11 +98,11 @@ public class LeMonde extends BasicSupport {
                                }
 
                                @Override
-                               public String manualProcessing(Node node) {
+                               public String isSubtitle(Node node) {
                                        if (node instanceof Element) {
                                                Element element = (Element) node;
                                                if (element.hasClass("intertitre")) {
-                                                       return "\n[ " + element.text() + " ]\n";
+                                                       return element.text();
                                                }
                                        }
                                        return null;