for (String topic : new String[] { "international", "politique",
"societe", "sciences" }) {
URL url = new URL("http://www.lemonde.fr/" + topic + "/1.html");
- InputStream in = open(url);
+ InputStream in = downloader.open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Elements articles = doc.getElementsByTag("article");
for (Element article : articles) {
Elements contentElements = article.getElementsByClass("txt3");
if (times.size() > 0 && titleElements.size() > 0
&& contentElements.size() > 0) {
- String id = times.get(0).attr("datetime").replace(":", "_");
+ String id = times.get(0).attr("datetime").replace(":", "_")
+ .replace("+", "_");
String title = "[" + topic + "] "
+ titleElements.get(0).text();
String content = contentElements.get(0).text();
// some javascript, I need to check...)
URL url = new URL(story.getUrlInternal());
- InputStream in = open(url);
+ InputStream in = downloader.open(url);
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Element article = doc.getElementById("articleBody");
if (article != null) {
- for (String line : toLines(article, new QuoteProcessor() {
- @Override
- public String processText(String text) {
- return text;
- }
-
+ for (String line : toLines(article, new BasicElementProcessor() {
@Override
public boolean ignoreNode(Node node) {
if (node instanceof Element) {
return false;
}
- @Override
- public boolean detectQuote(Node node) {
- return false;
- }
-
@Override
public String manualProcessing(Node node) {
if (node instanceof Element) {