[gofetch.git] / src / be / nikiroo / gofetch / support / EreNumerique.java

package be.nikiroo.gofetch.support;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import be.nikiroo.gofetch.data.Comment;
import be.nikiroo.gofetch.data.Story;
import be.nikiroo.utils.StringUtils;

/**
 * Support <a
 * href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
 * 
 * @author niki
 */
public class EreNumerique extends BasicSupport {
	@Override
	public String getDescription() {
		return "Ère Numérique.FR: faites le bon choix !";
	}

	@Override
	public List<Story> list() throws IOException {
		List<Story> list = new ArrayList<Story>();

		for (String categ : new String[] { "informatique" }) {
			URL url = new URL("https://www.erenumerique.fr/" + categ);
			InputStream in = downloader.open(url);
			Document doc = DataUtil.load(in, "UTF-8", url.toString());
			Elements articles = doc.getElementsByClass("item-details");
			for (Element article : articles) {
				String id = "";
				String intUrl = "";
				String extUrl = ""; // nope
				String title = "";
				String date = "";
				String author = "";
				String details = "";
				String body = "";

				// MUST NOT fail:
				Element dateElement = article //
						.getElementsByTag("time").first();
				if (dateElement == null) {
					continue;
				}

				Element urlElement = article.getElementsByTag("a").first();
				if (urlElement != null) {
					intUrl = urlElement.absUrl("href");
				}

				id = dateElement.attr("datetime").replace(":", "_")
						.replace("+", "_");
				date = date(dateElement.attr("datetime"));

				Element titleElement = article.getElementsByTag("h2").first();
				if (titleElement != null) {
					title = StringUtils.unhtml(titleElement.text()).trim();
				}

				Element authorElement = article.getElementsByClass(
						"td-post-author-name").first();
				if (authorElement != null) {
					authorElement = authorElement.getElementsByTag("a").first();
				}
				if (authorElement != null) {
					author = StringUtils.unhtml(authorElement.text()).trim();
				}

				Element contentElement = article.getElementsByClass(
						"td-excerpt").first();
				if (contentElement != null) {
					body = StringUtils.unhtml(contentElement.text()).trim();
				}

				list.add(new Story(getType(), id, title, author, date, categ,
						details, intUrl, extUrl, body));
			}
		}

		return list;
	}

	@Override
	public void fetch(Story story) throws IOException {
		String fullContent = story.getContent();

		URL url = new URL(story.getUrlInternal());
		InputStream in = downloader.open(url);
		try {
			Document doc = DataUtil.load(in, "UTF-8", url.toString());
			Element article = doc.getElementsByTag("article").first();
			if (article != null) {
				article = article.getElementsByAttributeValue("itemprop",
						"articleBody").first();
			}
			if (article != null) {
				for (String line : toLines(article,
						new BasicElementProcessor() {
							@Override
							public boolean ignoreNode(Node node) {
								return node.attr("class").contains("chapo");
							}

							@Override
							public String isSubtitle(Node node) {
								if (node instanceof Element) {
									Element element = (Element) node;
									if (element.tagName().startsWith("h")
											&& element.tagName().length() == 2) {
										return element.text();
									}
								}
								return null;
							}
						})) {
					fullContent += line + "\n";
				}

				// Content is too tight with a single break per line:
				fullContent = fullContent.replace("\n", "\n\n") //
						.replace("\n\n\n\n", "\n\n") //
						.replace("\n\n\n\n", "\n\n") //
						.trim();
			}

			// Get comments URL then parse it, if possible
			Element posts = doc.getElementsByClass("comment-list").first();

			story.setFullContent(fullContent);
			story.setComments(getComments(posts));
		} finally {
			if (in != null) {
				in.close();
			}
		}
	}

	private List<Comment> getComments(Element posts) {
		List<Comment> comments = new ArrayList<Comment>();
		if (posts != null) {
			for (Element post : posts.children()) {
				if (!post.hasClass("comment")) {
					continue;
				}

				String id = "";
				String author = "";
				String title = "";
				String date = "";
				List<String> content = new ArrayList<String>();

				Element authorE = post.getElementsByTag("footer").first();
				if (authorE != null) {
					authorE = authorE.getElementsByTag("cite").first();
				}
				if (authorE != null) {
					author = StringUtils.unhtml(authorE.text()).trim();
				}

				Element idE = post.getElementsByTag("a").first();
				if (idE != null) {
					id = idE.attr("id");
					Element dateE = idE.getElementsByTag("span").first();
					if (dateE != null) {
						date = date(dateE.attr("data-epoch"));
					}
				}

				Element contentE = post.getElementsByClass("comment-content")
						.first();
				if (contentE != null) {
					for (String line : toLines(contentE,
							new BasicElementProcessor() {
								@Override
								public boolean ignoreNode(Node node) {
									// TODO: ignore headlines/pub
									if (node instanceof Element) {
										Element el = (Element) node;
										if ("h4".equals(el.tagName())) {
											return true;
										}
									}

									return false;
								}
							})) {
						content.add(line);
					}
				}

				// Since we have no title but still an author, let's switch:
				title = author;
				author = "";
				Comment comment = new Comment(id, author, title, date, content);
				comments.add(comment);

				Element children = post.getElementsByClass("children").first();
				comment.addAll(getComments(children));
			}
		}

		return comments;
	}
}
Commit	Line	Data
31755801 NR	1	package be.nikiroo.gofetch.support;
	2
	3	import java.io.IOException;
	4	import java.io.InputStream;
	5	import java.net.URL;
	6	import java.util.ArrayList;
	7	import java.util.List;
	8
	9	import org.jsoup.helper.DataUtil;
	10	import org.jsoup.nodes.Document;
	11	import org.jsoup.nodes.Element;
	12	import org.jsoup.nodes.Node;
	13	import org.jsoup.select.Elements;
	14
	15	import be.nikiroo.gofetch.data.Comment;
	16	import be.nikiroo.gofetch.data.Story;
	17	import be.nikiroo.utils.StringUtils;
	18
	19	/**
	20	* Support <a
	21	* href="https://www.erenumerique.fr/">https://www.erenumerique.fr/</a>.
	22	*
	23	* @author niki
	24	*/
	25	public class EreNumerique extends BasicSupport {
	26	@Override
	27	public String getDescription() {
	28	return "Ère Numérique.FR: faites le bon choix !";
	29	}
	30
	31	@Override
	32	public List<Story> list() throws IOException {
	33	List<Story> list = new ArrayList<Story>();
	34
	35	for (String categ : new String[] { "informatique" }) {
	36	URL url = new URL("https://www.erenumerique.fr/" + categ);
	37	InputStream in = downloader.open(url);
	38	Document doc = DataUtil.load(in, "UTF-8", url.toString());
	39	Elements articles = doc.getElementsByClass("item-details");
	40	for (Element article : articles) {
	41	String id = "";
	42	String intUrl = "";
	43	String extUrl = ""; // nope
	44	String title = "";
	45	String date = "";
	46	String author = "";
	47	String details = "";
	48	String body = "";
	49
	50	// MUST NOT fail:
	51	Element dateElement = article //
	52	.getElementsByTag("time").first();
	53	if (dateElement == null) {
	54	continue;
	55	}
	56
	57	Element urlElement = article.getElementsByTag("a").first();
	58	if (urlElement != null) {
	59	intUrl = urlElement.absUrl("href");
	60	}
	61
	62	id = dateElement.attr("datetime").replace(":", "_")
	63	.replace("+", "_");
	64	date = date(dateElement.attr("datetime"));
65
66	Element titleElement = article.getElementsByTag("h2").first();
67	if (titleElement != null) {
68	title = StringUtils.unhtml(titleElement.text()).trim();
69	}
70
71	Element authorElement = article.getElementsByClass(
72	"td-post-author-name").first();
73	if (authorElement != null) {
74	authorElement = authorElement.getElementsByTag("a").first();
75	}
76	if (authorElement != null) {
77	author = StringUtils.unhtml(authorElement.text()).trim();
78	}
79
80	Element contentElement = article.getElementsByClass(
81	"td-excerpt").first();
82	if (contentElement != null) {
83	body = StringUtils.unhtml(contentElement.text()).trim();
84	}
85
86	list.add(new Story(getType(), id, title, author, date, categ,
87	details, intUrl, extUrl, body));
88	}
89	}
90
91	return list;
92	}
93
94	@Override
95	public void fetch(Story story) throws IOException {
96	String fullContent = story.getContent();
97
98	URL url = new URL(story.getUrlInternal());
99	InputStream in = downloader.open(url);
100	try {
101	Document doc = DataUtil.load(in, "UTF-8", url.toString());
102	Element article = doc.getElementsByTag("article").first();
b9afb12e NR	103	if (article != null) {
	104	article = article.getElementsByAttributeValue("itemprop",
	105	"articleBody").first();
	106	}
31755801 NR	107	if (article != null) {
	108	for (String line : toLines(article,
	109	new BasicElementProcessor() {
b9afb12e NR	110	@Override
	111	public boolean ignoreNode(Node node) {
	112	return node.attr("class").contains("chapo");
	113	}
	114
	115	@Override
	116	public String isSubtitle(Node node) {
	117	if (node instanceof Element) {
	118	Element element = (Element) node;
	119	if (element.tagName().startsWith("h")
	120	&& element.tagName().length() == 2) {
	121	return element.text();
	122	}
	123	}
	124	return null;
	125	}
31755801 NR	126	})) {
	127	fullContent += line + "\n";
	128	}
	129
	130	// Content is too tight with a single break per line:
	131	fullContent = fullContent.replace("\n", "\n\n") //
	132	.replace("\n\n\n\n", "\n\n") //
	133	.replace("\n\n\n\n", "\n\n") //
	134	.trim();
	135	}
	136
	137	// Get comments URL then parse it, if possible
	138	Element posts = doc.getElementsByClass("comment-list").first();
	139
	140	story.setFullContent(fullContent);
	141	story.setComments(getComments(posts));
	142	} finally {
	143	if (in != null) {
	144	in.close();
	145	}
	146	}
	147	}
	148
	149	private List<Comment> getComments(Element posts) {
	150	List<Comment> comments = new ArrayList<Comment>();
	151	if (posts != null) {
	152	for (Element post : posts.children()) {
	153	if (!post.hasClass("comment")) {
	154	continue;
	155	}
	156
	157	String id = "";
	158	String author = "";
	159	String title = "";
	160	String date = "";
	161	List<String> content = new ArrayList<String>();
	162
	163	Element authorE = post.getElementsByTag("footer").first();
	164	if (authorE != null) {
	165	authorE = authorE.getElementsByTag("cite").first();
	166	}
	167	if (authorE != null) {
	168	author = StringUtils.unhtml(authorE.text()).trim();
	169	}
	170
	171	Element idE = post.getElementsByTag("a").first();
	172	if (idE != null) {
	173	id = idE.attr("id");
	174	Element dateE = idE.getElementsByTag("span").first();
	175	if (dateE != null) {
	176	date = date(dateE.attr("data-epoch"));
	177	}
	178	}
	179
	180	Element contentE = post.getElementsByClass("comment-content")
	181	.first();
	182	if (contentE != null) {
	183	for (String line : toLines(contentE,
	184	new BasicElementProcessor() {
	185	@Override
	186	public boolean ignoreNode(Node node) {
	187	// TODO: ignore headlines/pub
	188	if (node instanceof Element) {
	189	Element el = (Element) node;
190	if ("h4".equals(el.tagName())) {
191	return true;
192	}
193	}
194
195	return false;
196	}
197	})) {
198	content.add(line);
199	}
200	}
201
202	// Since we have no title but still an author, let's switch:
203	title = author;
204	author = "";
205	Comment comment = new Comment(id, author, title, date, content);
206	comments.add(comment);
207
208	Element children = post.getElementsByClass("children").first();
209	comment.addAll(getComments(children));
210	}
211	}
212
213	return comments;
214	}
215	}