X-Git-Url: http://git.nikiroo.be/?a=blobdiff_plain;f=src%2Fbe%2Fnikiroo%2Fgofetch%2Fsupport%2FLWN.java;h=eac12e5c239c37aa921d2945333d1b7fbc4c4e9a;hb=64a785f647e030efab3977e0a811e975f05a798f;hp=2fea78a864855529e79d3022e92da3476027d97a;hpb=bb0d9eb242c303df073dc80d39e24b8b10c1dddb;p=gofetch.git diff --git a/src/be/nikiroo/gofetch/support/LWN.java b/src/be/nikiroo/gofetch/support/LWN.java index 2fea78a..eac12e5 100644 --- a/src/be/nikiroo/gofetch/support/LWN.java +++ b/src/be/nikiroo/gofetch/support/LWN.java @@ -1,16 +1,16 @@ package be.nikiroo.gofetch.support; import java.io.IOException; -import java.io.InputStream; import java.net.URL; +import java.util.AbstractMap; import java.util.ArrayList; import java.util.List; +import java.util.Map.Entry; -import org.jsoup.helper.DataUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import org.jsoup.select.Elements; +import org.jsoup.nodes.TextNode; import be.nikiroo.gofetch.data.Comment; import be.nikiroo.gofetch.data.Story; @@ -27,193 +27,282 @@ public class LWN extends BasicSupport { } @Override - public List list() throws IOException { - List list = new ArrayList(); + public void fetch(Story story) throws IOException { + // Do not try the paid-for stories... + if (!story.getTitle().startsWith("[$]")) { + super.fetch(story); + } else { + String fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; + story.setFullContent(fullContent); + story.setComments(new ArrayList()); + } + } - URL url = new URL("https://lwn.net/"); - InputStream in = open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements stories = doc.getElementsByClass("pure-u-1"); - for (Element story : stories) { - Elements titles = story.getElementsByClass("Headline"); - Elements listings = story.getElementsByClass("BlurbListing"); - if (titles.size() == 0) { - continue; - } - if (listings.size() == 0) { - continue; - } + @Override + protected List> getUrls() throws IOException { + List> urls = new ArrayList>(); + urls.add(new AbstractMap.SimpleEntry(new URL( + "https://lwn.net/"), "")); + return urls; + } - Element listing = listings.get(0); - if (listing.children().size() < 2) { - continue; - } + @Override + protected List getArticles(Document doc) { + return doc.getElementsByClass("pure-u-1"); + } - String title = titles.get(0).text(); - String details = listing.children().get(0).text(); - String body = ""; - // All but the first and two last children - for (int i = 1; i < listing.children().size() - 2; i++) { - Element e = listing.children().get(i); - body = body.trim() + " " + e.text().trim(); - } - body = body.trim(); + @Override + protected String getArticleId(Document doc, Element article) { + String id = getArticleIntUrl(doc, article).replaceAll("[^0-9]", ""); + while (id.length() < 10) { + id = "0" + id; + } + + return id; + } + + @Override + protected String getArticleTitle(Document doc, Element article) { + Element title = article.getElementsByClass("Headline").first(); + if (title != null) { + return title.text(); + } + + return ""; + } + + @Override + protected String getArticleAuthor(Document doc, Element article) { + String author = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf(" by "); + if (pos >= 0) { + author = details.substring(pos + " by ".length()).trim(); + } + + return author; + } - String author = ""; - int pos = details.indexOf(" by "); + @Override + protected String getArticleDate(Document doc, Element article) { + String date = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf(" Posted "); + if (pos >= 0) { + date = details.substring(pos + " Posted ".length()).trim(); + pos = date.indexOf(" by "); if (pos >= 0) { - author = details.substring(pos + " by ".length()).trim(); + date = date.substring(0, pos).trim(); } + } - String date = ""; - pos = details.indexOf(" Posted "); + return date; + } + + @Override + protected String getArticleCategory(Document doc, Element article, + String currentCategory) { + String categ = ""; + String details = getArticleDetailsReal(article); + int pos = details.indexOf("]"); + if (pos >= 0) { + categ = details.substring(1, pos).trim(); + } + + return categ; + } + + @Override + protected String getArticleDetails(Document doc, Element article) { + return ""; // We actually extract all the values + } + + @Override + protected String getArticleIntUrl(Document doc, Element article) { + String intUrl = ""; + for (Element idElem : article.getElementsByTag("a")) { + // Last link is the story link + intUrl = idElem.absUrl("href"); + int pos = intUrl.indexOf("#Comments"); if (pos >= 0) { - date = details.substring(pos + " Posted ".length()).trim(); + intUrl = intUrl.substring(0, pos - 1); } + } - String id = ""; - String intUrl = ""; - String extUrl = ""; - for (Element idElem : story.getElementsByTag("a")) { - // Last link is the story link - intUrl = idElem.absUrl("href"); - pos = intUrl.indexOf("#Comments"); - if (pos >= 0) { - intUrl = intUrl.substring(0, pos - 1); - } - id = intUrl.replaceAll("[^0-9]", ""); + return intUrl; + } + + @Override + protected String getArticleExtUrl(Document doc, Element article) { + return ""; + } + + @Override + protected String getArticleContent(Document doc, Element article) { + Element listing = article.getElementsByClass("BlurbListing").first(); + if (listing != null && listing.children().size() >= 2) { + String content = ""; + + // All but the first and two last children + for (int i = 1; i < listing.children().size() - 2; i++) { + Element e = listing.children().get(i); + content = content.trim() + " " + e.text().trim(); } - list.add(new Story(getType(), id, title, details, intUrl, extUrl, - body)); + return content; } - return list; + return ""; } @Override - public void fetch(Story story) throws IOException { - List comments = new ArrayList(); - String fullContent = story.getContent(); + protected Element getFullArticle(Document doc) { + return doc.getElementsByClass("ArticleText").first(); + } - // Do not try the paid-for stories... - if (!story.getTitle().startsWith("[$]")) { - URL url = new URL(story.getUrlInternal()); - InputStream in = open(url); - Document doc = DataUtil.load(in, "UTF-8", url.toString()); - Elements fullContentElements = doc - .getElementsByClass("ArticleText"); - if (fullContentElements.size() > 0) { - // comments.addAll(getComments(listing.get(0))); - fullContent = fullContentElements.get(0).text(); + @Override + protected List getFullArticleCommentPosts(Document doc, URL intUrl) { + return doc.getElementsByClass("lwn-u-1"); + } + + @Override + protected ElementProcessor getElementProcessorFullArticle() { + return new BasicElementProcessor() { + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element el = (Element) node; + if ("Log in".equals(el.text().trim())) { + return true; + } + } else if (node instanceof TextNode) { + TextNode text = (TextNode) node; + String t = text.text().trim(); + if (t.equals("(") || t.equals("to post comments)")) { + return true; + } + } + + return false; } + }; + } - Elements listing = doc.getElementsByClass("lwn-u-1"); - if (listing.size() > 0) { - comments.addAll(getComments(listing.get(0))); + @Override + protected List getCommentCommentPosts(Document doc, + Element container) { + List commentElements = new ArrayList(); + if (container != null) { + for (Element possibleCommentElement : container.children()) { + if (possibleCommentElement.hasClass("CommentBox")) { + commentElements.add(possibleCommentElement); + } else if (possibleCommentElement.hasClass("Comment")) { + commentElements.add(possibleCommentElement); + } } - } else { - fullContent = "[$] Sorry, this article is currently available to LWN suscribers only [https://lwn.net/subscribe/]."; } - story.setFullContent(fullContent); - story.setComments(comments); + return commentElements; } - private List getComments(Element listing) { - List comments = new ArrayList(); - for (Element commentElement : listing.children()) { - if (commentElement.hasClass("CommentBox")) { - Comment comment = getComment(commentElement); - if (!comment.isEmpty()) { - comments.add(comment); - } - } else if (commentElement.hasClass("Comment")) { - if (comments.size() > 0) { - comments.get(comments.size() - 1).addAll( - getComments(commentElement)); + @Override + protected String getCommentId(Element post) { + return post.id(); + } + + @Override + protected String getCommentAuthor(Element post) { + Element detailsE = post.getElementsByClass("CommentPoster").first(); + if (detailsE != null) { + String details = detailsE.text(); + + int pos = details.lastIndexOf(" by "); + if (pos >= 0) { + details = details.substring(pos + " by ".length()).trim(); + + if (details.startsWith("Posted ")) { + return details.substring("Posted ".length()).trim(); } } } - return comments; + + return ""; } - private Comment getComment(Element commentElement) { - String title = firstOrEmpty(commentElement, "CommentTitle"); - String author = firstOrEmpty(commentElement, "CommentPoster"); + @Override + protected String getCommentTitle(Element post) { + Element title = post.getElementsByClass("CommentTitle").first(); + if (title != null) { + return title.text(); + } - String date = ""; - int pos = author.lastIndexOf(" by "); - if (pos >= 0) { - date = author.substring(0, pos).trim(); - author = author.substring(pos + " by ".length()).trim(); + return ""; + } + + @Override + protected String getCommentDate(Element post) { + Element detailsE = post.getElementsByClass("CommentPoster").first(); + if (detailsE != null) { + String details = detailsE.text(); - if (author.startsWith("Posted ")) { - author = author.substring("Posted ".length()).trim(); + int pos = details.lastIndexOf(" by "); + if (pos >= 0) { + return details.substring(0, pos).trim(); } } - String content = ""; - Elements commentBodyElements = commentElement - .getElementsByClass("CommentBody"); - if (commentBodyElements.size() > 0) { - for (Node contentNode : commentBodyElements.get(0).childNodes()) { - if (contentNode instanceof Element) { - Element contentElement = (Element) contentNode; - if (!contentElement.hasClass("CommentPoster")) { - content = content.trim() + " " - + contentElement.text().trim(); + return ""; + } + + @Override + protected Element getCommentContentElement(Element post) { + return post.getElementsByClass("CommentBody").first(); + } + + @Override + protected ElementProcessor getElementProcessorComment() { + return new BasicElementProcessor() { + @Override + public String processText(String text) { + while (text.startsWith(">")) { // comments + text = text.substring(1).trim(); + } + + return text; + } + + @Override + public boolean detectQuote(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.tagName().equals("blockquote") + || elementNode.hasClass("QuotedText")) { + return true; } - } else { - content = content.trim() + " " - + contentNode.outerHtml().trim(); } + return false; } - content = content.trim(); - } - Comment comment = new Comment(commentElement.id(), author, title, date, - content); - - return comment; - } - - /** - * Get the first element of the given class, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param className - * the class to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmpty(Element element, String className) { - Elements subElements = element.getElementsByClass(className); - if (subElements.size() > 0) { - return subElements.get(0).text(); - } + @Override + public boolean ignoreNode(Node node) { + if (node instanceof Element) { + Element elementNode = (Element) node; + if (elementNode.hasClass("CommentPoster")) { + return true; + } + } - return ""; + return false; + } + }; } - /** - * Get the first element of the given tag, or an empty {@link String} if - * none found. - * - * @param element - * the element to look in - * @param tagName - * the tag to look for - * - * @return the value or an empty {@link String} - */ - private String firstOrEmptyTag(Element element, String tagName) { - Elements subElements = element.getElementsByTag(tagName); - if (subElements.size() > 0) { - return subElements.get(0).text(); + private String getArticleDetailsReal(Element article) { + Element listing = article.getElementsByClass("BlurbListing").first(); + // Valid articles have 2+ listings + if (listing != null && listing.children().size() >= 2) { + return listing.children().get(0).text(); } return "";