src/be/nikiroo/gofetch/support/EreNumerique.java

   1 package be.nikiroo.gofetch.support;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.URL;
   6 import java.util.ArrayList;
   7 import java.util.List;
   8
   9 import org.jsoup.helper.DataUtil;
  10 import org.jsoup.nodes.Document;
  11 import org.jsoup.nodes.Element;
  12 import org.jsoup.nodes.Node;
  13 import org.jsoup.select.Elements;
  14
  15 import be.nikiroo.gofetch.data.Comment;
  16 import be.nikiroo.gofetch.data.Story;
  17 import be.nikiroo.utils.StringUtils;
  18
  19 /**
  20  * Support <a
  21  * href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
  22  *
  23  * @author niki
  24  */
  25 public class EreNumerique extends BasicSupport {
  26         @Override
  27         public String getDescription() {
  28                 return "Ère Numérique.FR: faites le bon choix !";
  29         }
  30
  31         @Override
  32         public List<Story> list() throws IOException {
  33                 List<Story> list = new ArrayList<Story>();
  34
  35                 for (String categ : new String[] { "informatique" }) {
  36                         URL url = new URL("https://www.erenumerique.fr/" + categ);
  37                         InputStream in = downloader.open(url);
  38                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
  39                         Elements articles = doc.getElementsByClass("item-details");
  40                         for (Element article : articles) {
  41                                 String id = "";
  42                                 String intUrl = "";
  43                                 String extUrl = ""; // nope
  44                                 String title = "";
  45                                 String date = "";
  46                                 String author = "";
  47                                 String details = "";
  48                                 String body = "";
  49
  50                                 // MUST NOT fail:
  51                                 Element dateElement = article //
  52                                                 .getElementsByTag("time").first();
  53                                 if (dateElement == null) {
  54                                         continue;
  55                                 }
  56
  57                                 Element urlElement = article.getElementsByTag("a").first();
  58                                 if (urlElement != null) {
  59                                         intUrl = urlElement.absUrl("href");
  60                                 }
  61
  62                                 id = dateElement.attr("datetime").replace(":", "_")
  63                                                 .replace("+", "_");
  64                                 date = date(dateElement.attr("datetime"));
  65
  66                                 Element titleElement = article.getElementsByTag("h2").first();
  67                                 if (titleElement != null) {
  68                                         title = StringUtils.unhtml(titleElement.text()).trim();
  69                                 }
  70
  71                                 Element authorElement = article.getElementsByClass(
  72                                                 "td-post-author-name").first();
  73                                 if (authorElement != null) {
  74                                         authorElement = authorElement.getElementsByTag("a").first();
  75                                 }
  76                                 if (authorElement != null) {
  77                                         author = StringUtils.unhtml(authorElement.text()).trim();
  78                                 }
  79
  80                                 Element contentElement = article.getElementsByClass(
  81                                                 "td-excerpt").first();
  82                                 if (contentElement != null) {
  83                                         body = StringUtils.unhtml(contentElement.text()).trim();
  84                                 }
  85
  86                                 list.add(new Story(getType(), id, title, author, date, categ,
  87                                                 details, intUrl, extUrl, body));
  88                         }
  89                 }
  90
  91                 return list;
  92         }
  93
  94         @Override
  95         public void fetch(Story story) throws IOException {
  96                 String fullContent = story.getContent();
  97
  98                 URL url = new URL(story.getUrlInternal());
  99                 InputStream in = downloader.open(url);
 100                 try {
 101                         Document doc = DataUtil.load(in, "UTF-8", url.toString());
 102                         Element article = doc.getElementsByTag("article").first();
 103                         if (article != null) {
 104                                 for (String line : toLines(article,
 105                                                 new BasicElementProcessor() {
 106                                                         // TODO: ignore headlines/pub
 107                                                 })) {
 108                                         fullContent += line + "\n";
 109                                 }
 110
 111                                 // Content is too tight with a single break per line:
 112                                 fullContent = fullContent.replace("\n", "\n\n") //
 113                                                 .replace("\n\n\n\n", "\n\n") //
 114                                                 .replace("\n\n\n\n", "\n\n") //
 115                                                 .trim();
 116                         }
 117
 118                         // Get comments URL then parse it, if possible
 119                         Element posts = doc.getElementsByClass("comment-list").first();
 120
 121                         story.setFullContent(fullContent);
 122                         story.setComments(getComments(posts));
 123                 } finally {
 124                         if (in != null) {
 125                                 in.close();
 126                         }
 127                 }
 128         }
 129
 130         private List<Comment> getComments(Element posts) {
 131                 List<Comment> comments = new ArrayList<Comment>();
 132                 if (posts != null) {
 133                         for (Element post : posts.children()) {
 134                                 if (!post.hasClass("comment")) {
 135                                         continue;
 136                                 }
 137
 138                                 String id = "";
 139                                 String author = "";
 140                                 String title = "";
 141                                 String date = "";
 142                                 List<String> content = new ArrayList<String>();
 143
 144                                 Element authorE = post.getElementsByTag("footer").first();
 145                                 if (authorE != null) {
 146                                         authorE = authorE.getElementsByTag("cite").first();
 147                                 }
 148                                 if (authorE != null) {
 149                                         author = StringUtils.unhtml(authorE.text()).trim();
 150                                 }
 151
 152                                 Element idE = post.getElementsByTag("a").first();
 153                                 if (idE != null) {
 154                                         id = idE.attr("id");
 155                                         Element dateE = idE.getElementsByTag("span").first();
 156                                         if (dateE != null) {
 157                                                 date = date(dateE.attr("data-epoch"));
 158                                         }
 159                                 }
 160
 161                                 Element contentE = post.getElementsByClass("comment-content")
 162                                                 .first();
 163                                 if (contentE != null) {
 164                                         for (String line : toLines(contentE,
 165                                                         new BasicElementProcessor() {
 166                                                                 @Override
 167                                                                 public boolean ignoreNode(Node node) {
 168                                                                         // TODO: ignore headlines/pub
 169                                                                         if (node instanceof Element) {
 170                                                                                 Element el = (Element) node;
 171                                                                                 if ("h4".equals(el.tagName())) {
 172                                                                                         return true;
 173                                                                                 }
 174                                                                         }
 175
 176                                                                         return false;
 177                                                                 }
 178                                                         })) {
 179                                                 content.add(line);
 180                                         }
 181                                 }
 182
 183                                 // Since we have no title but still an author, let's switch:
 184                                 title = author;
 185                                 author = "";
 186                                 Comment comment = new Comment(id, author, title, date, content);
 187                                 comments.add(comment);
 188
 189                                 Element children = post.getElementsByClass("children").first();
 190                                 comment.addAll(getComments(children));
 191                         }
 192                 }
 193
 194                 return comments;
 195         }
 196 }