- URL url = new URL("https://lwn.net/");
- InputStream in = open(url);
- Document doc = DataUtil.load(in, "UTF-8", url.toString());
- Elements stories = doc.getElementsByClass("pure-u-1");
- for (Element story : stories) {
- Elements titles = story.getElementsByClass("Headline");
- Elements listings = story.getElementsByClass("BlurbListing");
- if (titles.size() == 0) {
- continue;
- }
- if (listings.size() == 0) {
- continue;
- }
-
- Element listing = listings.get(0);
- if (listing.children().size() < 2) {
- continue;
- }
-
-
- String title = titles.get(0).text();
- String details = listing.children().get(0).text();
- String body = listing.children().get(1).text();
-
- String author = "";
- int pos = details.indexOf(" by ");
+ /**
+  * Returns a one-element list holding the URL {@code https://lwn.net/}
+  * paired with an empty string.
+  * <p>
+  * NOTE(review): the meaning of the string half of each entry is defined by
+  * the superclass contract, which is not visible here.
+  *
+  * @return a new mutable list with the single URL/string pair
+  *
+  * @throws IOException
+  *             if the URL literal cannot be parsed
+  */
+ @Override
+ protected List<Entry<URL, String>> getUrls() throws IOException {
+     URL frontPage = new URL("https://lwn.net/");
+     Entry<URL, String> source = new AbstractMap.SimpleEntry<URL, String>(
+             frontPage, "");
+
+     List<Entry<URL, String>> sources = new ArrayList<Entry<URL, String>>();
+     sources.add(source);
+
+     return sources;
+ }
+
+ /**
+  * Selects the candidate article elements of a page: every element of the
+  * document that carries the CSS class {@code pure-u-1}.
+  *
+  * @param doc
+  *            the parsed page
+  *
+  * @return the matching elements, as returned by the document lookup
+  */
+ @Override
+ protected List<Element> getArticles(Document doc) {
+     List<Element> candidates = doc.getElementsByClass("pure-u-1");
+     return candidates;
+ }
+
+ /**
+  * Derives the article ID from the value returned by
+  * {@code getArticleIntUrl(doc, article)} by removing every character that
+  * is not one of the digits 0 to 9.
+  *
+  * @param doc
+  *            the parsed page
+  * @param article
+  *            the article element
+  *
+  * @return the digits of the internal URL, in their original order (empty
+  *         if it contains none)
+  */
+ @Override
+ protected String getArticleId(Document doc, Element article) {
+     String intUrl = getArticleIntUrl(doc, article);
+     // Keep only the digits.
+     return intUrl.replaceAll("[^0-9]", "");
+ }
+
+ /**
+  * Reads the article title from the first element with the CSS class
+  * {@code Headline} found in the article.
+  *
+  * @param doc
+  *            the parsed page (not used here)
+  * @param article
+  *            the article element
+  *
+  * @return the text of the headline element, or an empty string when the
+  *         article has no such element
+  */
+ @Override
+ protected String getArticleTitle(Document doc, Element article) {
+     Element headline = article.getElementsByClass("Headline").first();
+     return headline == null ? "" : headline.text();
+ }
+
+ /**
+  * Extracts the author from the article details string: everything that
+  * follows the first occurrence of {@code " by "}, with surrounding
+  * whitespace trimmed.
+  *
+  * @param doc
+  *            the parsed page (not used here)
+  * @param article
+  *            the article element
+  *
+  * @return the author, or an empty string when the details do not contain
+  *         the {@code " by "} marker
+  */
+ @Override
+ protected String getArticleAuthor(Document doc, Element article) {
+     String details = getArticleDetailsReal(article);
+
+     int marker = details.indexOf(" by ");
+     if (marker < 0) {
+         // No author marker in the details line.
+         return "";
+     }
+
+     return details.substring(marker + " by ".length()).trim();
+ }
+
+ @Override
+ protected String getArticleDate(Document doc, Element article) {
+ String date = "";
+ String details = getArticleDetailsReal(article);
+ int pos = details.indexOf(" Posted ");
+ if (pos >= 0) {
+ date = details.substring(pos + " Posted ".length()).trim();
+ pos = date.indexOf(" by ");