// Open the LWN front page and load the stream into a Document; the enclosing
// method header is outside this excerpt.
// NOTE(review): "in" is not closed anywhere in the visible lines - confirm it is released elsewhere.
List<Story> list = new ArrayList<Story>();
URL url = new URL("https://lwn.net/");
- InputStream in = open(url);
+ InputStream in = downloader.open(url); // the stream is now obtained through the downloader object
Document doc = DataUtil.load(in, "UTF-8", url.toString()); // url.toString() is presumably the base URI used to resolve links - confirm
- Elements stories = doc.getElementsByClass("pure-u-1");
- for (Element story : stories) {
- Elements titles = story.getElementsByClass("Headline");
- Elements listings = story.getElementsByClass("BlurbListing");
+ Elements articles = doc.getElementsByClass("pure-u-1"); // every element with CSS class "pure-u-1" is a candidate article
+ for (Element article : articles) {
+ Elements titles = article.getElementsByClass("Headline");
+ Elements listings = article.getElementsByClass("BlurbListing");
if (titles.size() == 0) { // candidates without any "Headline" element are skipped
continue;
}
}
// Split the free-form details string into category, author and date, then
// build the Story from the separate fields.
body = body.trim();
+ int pos; // scratch index reused by each indexOf() lookup below
+
+ String categ = "";
+ pos = details.indexOf("]");
+ if (pos >= 0) {
// NOTE(review): substring(1, pos) throws StringIndexOutOfBoundsException when pos == 0,
// and it silently drops the first character whatever it is - presumably a leading '[', confirm.
+ categ = details.substring(1, pos).trim(); // text between index 1 and the first ']'
+ }
+
String author = "";
- int pos = details.indexOf(" by ");
+ pos = details.indexOf(" by ");
if (pos >= 0) {
author = details.substring(pos + " by ".length()).trim(); // everything after the first " by "
}
pos = details.indexOf(" Posted ");
if (pos >= 0) {
date = details.substring(pos + " Posted ".length()).trim(); // everything after the first " Posted "
+ pos = date.indexOf(" by ");
+ if (pos >= 0) {
+ date = date.substring(0, pos).trim(); // cut the date at its first " by " so the author part is not included
+ }
}
+ // Category, author and date were extracted from details above, so the Story receives an empty details string.
+ details = "";
+
String id = "";
String intUrl = "";
String extUrl = "";
- for (Element idElem : story.getElementsByTag("a")) {
+ for (Element idElem : article.getElementsByTag("a")) {
// Each iteration overwrites intUrl and id, so the values of the last <a> element are the ones kept (the story link).
intUrl = idElem.absUrl("href");
pos = intUrl.indexOf("#Comments"); // NOTE(review): this value is not read in the visible lines - confirm whether it is still needed
id = intUrl.replaceAll("[^0-9]", ""); // id is the URL with every non-digit character removed
}
- list.add(new Story(getType(), id, title, details, intUrl, extUrl,
- body));
+ list.add(new Story(getType(), id, title, author, date, categ, // author, date and categ are now passed as separate arguments
+ details, intUrl, extUrl, body));
}
return list;
// Stories whose title starts with "[$]" (the paid-for ones) are not downloaded;
// only the other stories have their internal URL fetched.
// NOTE(review): "in" is not closed anywhere in the visible lines - confirm it is released elsewhere.
if (!story.getTitle().startsWith("[$]")) {
URL url = new URL(story.getUrlInternal());
- InputStream in = open(url);
+ InputStream in = downloader.open(url); // the stream is now obtained through the downloader object
Document doc = DataUtil.load(in, "UTF-8", url.toString());
Elements fullContentElements = doc
.getElementsByClass("ArticleText"); // elements with CSS class "ArticleText"
}
/**
 * Build a {@link Comment} from one comment element.
 * <p>
 * Part of the body (the handling of the " by " position in the author text)
 * is elided from this excerpt.
 *
 * @param commentElement
 *            the element to read the id, title, poster and body from
 *
 * @return the new {@link Comment}
 */
private Comment getComment(Element commentElement) {
- String title = firstOrEmpty(commentElement, "CommentTitle");
- String author = firstOrEmpty(commentElement, "CommentPoster");
+ String title = firstOrEmpty(commentElement, "CommentTitle").text(); // firstOrEmpty() now returns an object exposing text(); its new definition is not shown here
+ String author = firstOrEmpty(commentElement, "CommentPoster").text();
String date = "";
int pos = author.lastIndexOf(" by ");
}
}
- String content = "";
+ Element content = null; // stays null when no "CommentBody" element is found
Elements commentBodyElements = commentElement
.getElementsByClass("CommentBody");
if (commentBodyElements.size() > 0) {
- for (Node contentNode : commentBodyElements.get(0).childNodes()) {
- if (contentNode instanceof Element) {
- Element contentElement = (Element) contentNode;
- if (!contentElement.hasClass("CommentPoster")) {
- content = content.trim() + " "
- + contentElement.text().trim();
- }
- } else {
- content = content.trim() + " "
- + contentNode.outerHtml().trim();
- }
-
- }
- content = content.trim();
+ content = commentBodyElements.get(0); // keep the first "CommentBody" element itself; converting it to text is left to toLines()
}
// NOTE(review): content may still be null here - confirm that toLines() accepts a null element.
Comment comment = new Comment(commentElement.id(), author, title, date,
- content);
+ toLines(content));
return comment;
}
- /**
- * Get the first element of the given class, or an empty {@link String} if
- * none found.
- *
- * @param element
- * the element to look in
- * @param className
- * the class to look for
- *
- * @return the value or an empty {@link String}
- */
- private String firstOrEmpty(Element element, String className) {
- Elements subElements = element.getElementsByClass(className);
- if (subElements.size() > 0) {
- return subElements.get(0).text();
- }
/**
 * Convert the given element into lines by delegating to the two-argument
 * toLines overload (defined outside this excerpt) with an anonymous
 * BasicElementProcessor that strips leading '>' markers from text, reports
 * blockquote tags and "QuotedText" elements as quotes, and reports
 * "CommentPoster" elements as nodes to ignore.
 *
 * @param element
 *            the element to convert
 *
 * @return the lines produced by the delegate
 */
+ private List<String> toLines(Element element) {
+ return toLines(element, new BasicElementProcessor() {
+ @Override
+ public String processText(String text) {
+ while (text.startsWith(">")) { // strip every leading '>' marker, trimming the remainder each time
+ text = text.substring(1).trim();
+ }
- return "";
- }
+ return text;
+ }
- /**
- * Get the first element of the given tag, or an empty {@link String} if
- * none found.
- *
- * @param element
- * the element to look in
- * @param tagName
- * the tag to look for
- *
- * @return the value or an empty {@link String}
- */
- private String firstOrEmptyTag(Element element, String tagName) {
- Elements subElements = element.getElementsByTag(tagName);
- if (subElements.size() > 0) {
- return subElements.get(0).text();
- }
+ @Override
+ public boolean detectQuote(Node node) {
+ if (node instanceof Element) {
+ Element elementNode = (Element) node;
+ if (elementNode.tagName().equals("blockquote") // blockquote tags and elements with class "QuotedText" count as quotes
+ || elementNode.hasClass("QuotedText")) {
+ return true;
+ }
+ }
- return "";
+ return false; // non-Element nodes and every other element are not quotes
+ }
+
+ @Override
+ public boolean ignoreNode(Node node) {
+ if (node instanceof Element) {
+ Element elementNode = (Element) node;
+ if (elementNode.hasClass("CommentPoster")) { // only elements with class "CommentPoster" are reported as ignorable
+ return true;
+ }
+ }
+
+ return false;
+ }
+ });
}
}