1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.text
.ParseException
;
7 import java
.text
.SimpleDateFormat
;
8 import java
.util
.ArrayList
;
9 import java
.util
.Arrays
;
10 import java
.util
.Date
;
11 import java
.util
.List
;
12 import java
.util
.Map
.Entry
;
14 import org
.jsoup
.helper
.DataUtil
;
15 import org
.jsoup
.helper
.StringUtil
;
16 import org
.jsoup
.nodes
.Document
;
17 import org
.jsoup
.nodes
.Element
;
18 import org
.jsoup
.nodes
.Node
;
19 import org
.jsoup
.nodes
.TextNode
;
20 import org
.jsoup
.select
.NodeTraversor
;
21 import org
.jsoup
.select
.NodeVisitor
;
23 import be
.nikiroo
.gofetch
.data
.Comment
;
24 import be
.nikiroo
.gofetch
.data
.Story
;
25 import be
.nikiroo
.utils
.Downloader
;
26 import be
.nikiroo
.utils
.StringUtils
;
29 * Base class for website support.
33 public abstract class BasicSupport
{
35 * The downloader to use for all websites via {@link BasicSupport#open(URL)}
37 static private Downloader downloader
= new Downloader("gofetcher");
39 static private String preselector
;
44 * The website textual description, to add in the dispatcher page.
48 * @return the description
50 abstract public String
getDescription();
53 * The gopher "selector" to use for output.
55 * A kind of "URL path", like "/news/" or "/misc/news/" or...
57 * @return the selector
59 public String
getSelector() {
60 return getSelector(getType());
68 public Type
getType() {
73 * List all the recent items, but only assure the ID and internal URL to
74 * fetch it later on (until it has been fetched, the rest of the
75 * {@link Story} is not confirmed).
77 * @return the list of new stories
82 public List
<Story
> list() throws IOException
{
83 List
<Story
> list
= new ArrayList
<Story
>();
85 for (Entry
<URL
, String
> entry
: getUrls()) {
86 URL url
= entry
.getKey();
87 String defaultCateg
= entry
.getValue();
88 if (defaultCateg
== null) {
92 InputStream in
= open(url
);
93 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
94 List
<Element
> articles
= getArticles(doc
);
95 for (Element article
: articles
) {
96 String id
= getArticleId(doc
, article
).trim();
97 String title
= getArticleTitle(doc
, article
).trim();
98 String author
= getArticleAuthor(doc
, article
).trim();
99 String date
= getArticleDate(doc
, article
).trim();
100 String categ
= getArticleCategory(doc
, article
, defaultCateg
)
102 String details
= getArticleDetails(doc
, article
).trim();
103 String intUrl
= getArticleIntUrl(doc
, article
).trim();
104 String extUrl
= getArticleExtUrl(doc
, article
).trim();
105 String content
= getArticleContent(doc
, article
).trim();
107 if (id
.isEmpty() && date
.isEmpty()) {
112 while (id
.length() < 10) {
116 id
= date
.replace(":", "_").replace("+", "_");
121 list
.add(new Story(getType(), id
, title
, author
, date
, categ
,
122 details
, intUrl
, extUrl
, content
));
130 * The {@link URL}s to process for this website.
132 * @return the list of {@link URL}s
134 * @throws IOException
135 * in case of I/O error
137 abstract protected List
<Entry
<URL
, String
>> getUrls() throws IOException
;
140 * The article {@link Element}s of this document.
143 * the main document for the current category
145 * @return the articles
147 abstract protected List
<Element
> getArticles(Document doc
);
150 * The ID of the article (defaults to the date element if empty).
153 * the main document for the current category
155 * the article to look into
159 abstract protected String
getArticleId(Document doc
, Element article
);
162 * The article title to display.
165 * the main document for the current category
167 * the article to look into
171 abstract protected String
getArticleTitle(Document doc
, Element article
);
174 * The optional article author.
177 * the main document for the current category
179 * the article to look into
183 abstract protected String
getArticleAuthor(Document doc
, Element article
);
186 * The optional article date.
189 * the main document for the current category
191 * the article to look into
195 abstract protected String
getArticleDate(Document doc
, Element article
);
198 * the optional article category.
201 * the main document for the current category
203 * the article to look into
204 * @param currentCategory
205 * the currently listed category if any (can be NULL)
207 * @return the category
209 abstract protected String
getArticleCategory(Document doc
, Element article
,
210 String currentCategory
);
213 * the optional details of the article (can replace the date, author and
214 * category, for instance).
217 * the main document for the current category
219 * the article to look into
221 * @return the details
223 abstract protected String
getArticleDetails(Document doc
, Element article
);
226 * The (required) {@link URL} that points to the news page on the supported
230 * the main document for the current category
232 * the article to look into
234 * @return the internal {@link URL}
236 abstract protected String
getArticleIntUrl(Document doc
, Element article
);
239 * the optional {@link URL} that points to an external website for more
243 * the main document for the current category
245 * the article to look into
247 * @return the external {@link URL}
249 abstract protected String
getArticleExtUrl(Document doc
, Element article
);
252 * The optional article short-content (not the full content, that will be
253 * fetched by {@link BasicSupport#fetch(Story)}).
256 * the main document for the current category
258 * the article to look into
260 * @return the short content
262 abstract protected String
getArticleContent(Document doc
, Element article
);
265 * Fetch the full article content as well as all the comments associated with
266 * this {@link Story}, if any (can be empty, but not NULL).
269 * the story to fetch the comments of
271 * @throws IOException
272 * in case of I/O error
274 public void fetch(Story story
) throws IOException
{
275 String fullContent
= "";
277 URL url
= new URL(story
.getUrlInternal());
278 InputStream in
= open(url
);
280 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
281 Element article
= getFullArticle(doc
);
282 if (article
!= null) {
283 StringBuilder builder
= new StringBuilder();
284 ElementProcessor eProc
= getElementProcessorFullArticle();
286 for (String line
: toLines(article
, eProc
)) {
287 builder
.append(line
+ "\n");
290 builder
.append(article
.text());
293 // Content is too tight with a single break per line:
294 fullContent
= builder
.toString().replace("\n", "\n\n") //
295 .replace("\n\n\n\n", "\n\n") //
296 .replace("\n\n\n\n", "\n\n") //
300 if (fullContent
.isEmpty()) {
301 fullContent
= story
.getContent();
304 story
.setFullContent(fullContent
);
305 story
.setComments(getComments(doc
,
306 getFullArticleCommentPosts(doc
, url
)));
315 * Return the full article if available.
318 * the (full article) document to work on
320 * @return the article or NULL
322 abstract protected Element
getFullArticle(Document doc
);
325 * Return the list of comment {@link Element}s from this optional container
326 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
329 * the (full article) document to work on
331 * the internal {@link URL} this article was taken from (the
332 * {@link URL} from the supported website)
334 * @return the list of comment posts
336 abstract protected List
<Element
> getFullArticleCommentPosts(Document doc
,
340 * The {@link ElementProcessor} to use to convert the main article element
341 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
343 * See {@link BasicElementProcessor} for a working, basic implementation.
345 * Can be NULL to simply use {@link Element#text()}.
347 * @return the processor, or NULL
349 abstract protected ElementProcessor
getElementProcessorFullArticle();
352 * Open a network resource.
354 * You need to close the returned {@link InputStream} when done.
359 * @return the content
361 * @throws IOException
362 * in case of I/O error
364 protected InputStream
open(URL url
) throws IOException
{
365 return downloader
.open(url
);
369 * Convert the comment elements into {@link Comment}s
372 * the document we work on
374 * the comment elements
376 * @return the converted {@link Comment}s
378 private List
<Comment
> getComments(Document doc
, List
<Element
> posts
) {
379 List
<Comment
> comments
= new ArrayList
<Comment
>();
381 for (Element post
: posts
) {
382 String id
= getCommentId(post
).trim();
383 String author
= getCommentAuthor(post
).trim();
384 String title
= getCommentTitle(post
).trim();
385 String date
= getCommentDate(post
).trim();
387 List
<String
> content
= new ArrayList
<String
>();
395 Element contentE
= getCommentContentElement(post
);
396 if (contentE
!= null) {
397 ElementProcessor eProc
= getElementProcessorComment();
399 for (String line
: toLines(contentE
, eProc
)) {
403 content
= Arrays
.asList(contentE
.text().split("\n"));
407 Comment comment
= new Comment(id
, author
, title
, date
, content
);
408 comment
.addAll(getComments(doc
,
409 getCommentCommentPosts(doc
, post
)));
411 if (!comment
.isEmpty()) {
412 comments
.add(comment
);
421 * Return the list of subcomment {@link Element}s from this comment element
422 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
425 * the (full article) document to work on
427 * the container (a comment {@link Element})
429 * @return the list of comment posts
431 abstract protected List
<Element
> getCommentCommentPosts(Document doc
,
435 * Compute the ID of the given comment element.
438 * the comment element
442 abstract protected String
getCommentId(Element post
);
445 * Compute the author of the given comment element.
448 * the comment element
452 abstract protected String
getCommentAuthor(Element post
);
455 * Compute the title of the given comment element.
458 * the comment element
462 abstract protected String
getCommentTitle(Element post
);
465 * Compute the date of the given comment element.
468 * the comment element
472 abstract protected String
getCommentDate(Element post
);
475 * Get the main of the given comment element, which can be NULL.
478 * the comment element
480 * @return the element
482 abstract protected Element
getCommentContentElement(Element post
);
485 * The {@link ElementProcessor} to use to convert the main comment element
486 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
488 * See {@link BasicElementProcessor} for a working, basic implementation.
490 * Can be NULL to simply use {@link Element#text()}.
492 * @return the processor
494 abstract protected ElementProcessor
getElementProcessorComment();
502 protected void setType(Type type
) {
507 * The {@link String} to append to the selector (the selector will be
508 * constructed as "this string" then "/type/".
511 * the preselector to set
513 static public void setPreselector(String preselector
) {
514 BasicSupport
.preselector
= preselector
;
518 * Return a {@link BasicSupport} that is compatible with the given
519 * {@link Type} if it exists (or NULL if not).
524 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
526 static public BasicSupport
getSupport(Type type
) {
527 BasicSupport support
= null;
532 support
= new Slashdot();
535 support
= new Pipedot();
541 support
= new LeMonde();
544 support
= new TheRegister();
547 support
= new TooLinux();
550 support
= new EreNumerique();
553 support
= new Phoronix();
557 if (support
!= null) {
558 support
.setType(type
);
566 * The gopher "selector" to use for output for this type, using the
569 * A kind of "URL path", like "/news/" or "/misc/news/" or...
572 * the type to get the selector of
574 * @return the selector
576 static public String
getSelector(Type type
) {
577 return preselector
+ "/" + type
+ "/";
581 * Process the given element into text (each line is a text paragraph and
582 * can be prepended with ">" signs to indicate a quote or sub-quote or
586 * the element to process
587 * @param elementProcessor
588 * the element processor, must not be NULL
590 * @return text lines, each line is a paragraph
592 static protected List
<String
> toLines(Element element
,
593 final ElementProcessor elementProcessor
) {
594 final List
<String
> lines
= new ArrayList
<String
>();
595 final StringBuilder currentLine
= new StringBuilder();
596 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
597 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
598 final List
<String
> footnotes
= new ArrayList
<String
>();
600 if (element
!= null) {
601 new NodeTraversor(new NodeVisitor() {
603 public void head(Node node
, int depth
) {
604 String manual
= null;
605 boolean ignore
= elementProcessor
.ignoreNode(node
)
606 || ignoredNodes
.contains(node
.parentNode());
609 manual
= elementProcessor
.manualProcessing(node
);
610 if (manual
!= null) {
611 currentLine
.append(manual
);
618 String subtitle
= elementProcessor
.isSubtitle(node
);
619 if (subtitle
!= null) {
620 subtitle
= subtitle
.trim();
621 currentLine
.append("\n[ " + subtitle
+ " ]\n");
628 if (node
instanceof Element
) {
629 Element el
= (Element
) node
;
630 if ("pre".equals(el
.tagName())) {
631 currentLine
.append(StringUtils
632 .unhtml(el
.text()).trim());
639 ignoredNodes
.add(node
);
644 for (int i
= 0; i
< quoted
.size(); i
++) {
649 boolean enterQuote
= elementProcessor
.detectQuote(node
);
650 boolean leaveQuote
= quoted
.contains(depth
);
657 quoted
.remove(Integer
.valueOf(depth
));
660 if (enterQuote
|| leaveQuote
) {
661 if (currentLine
.length() > 0) {
662 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
663 currentLine
.setLength(currentLine
.length() - 1);
665 for (String l
: currentLine
.toString().split("\n")) {
669 currentLine
.setLength(0);
672 if (node
instanceof Element
) {
673 Element element
= (Element
) node
;
674 boolean block
= element
.isBlock()
675 || element
.tagName().equalsIgnoreCase("br");
676 if (block
&& currentLine
.length() > 0) {
677 currentLine
.append("\n");
680 if (!element
.absUrl("href").trim().isEmpty()) {
681 footnotes
.add(element
.absUrl("href"));
682 currentLine
.append("[" + footnotes
.size() + "]");
684 } else if (node
instanceof TextNode
) {
685 TextNode textNode
= (TextNode
) node
;
686 String line
= StringUtil
.normaliseWhitespace(textNode
689 currentLine
.append(elementProcessor
.processText(line
));
690 currentLine
.append(" ");
695 public void tail(Node node
, int depth
) {
697 }).traverse(element
);
700 if (currentLine
.length() > 0) {
702 for (int i
= 0; i
< quoted
.size(); i
++) {
706 if (currentLine
.length() > 0) {
707 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
708 currentLine
.setLength(currentLine
.length() - 1);
710 for (String l
: currentLine
.toString().split("\n")) {
716 // Fix spaces and nbsp, remove multiple following blank lines
717 List
<String
> linesCopy
= new ArrayList
<String
>(lines
.size());
719 for (int i
= 0; i
< lines
.size(); i
++) {
720 String line
= lines
.get(i
).replace(" ", " ") // nbsp -> space
721 .replace(" ", " ").trim();
722 if (line
.isEmpty()) {
733 // Footnotes insertion
734 if (footnotes
.size() > 0) {
739 for (int i
= 0; i
< footnotes
.size(); i
++) {
740 linesCopy
.add("[" + (i
+ 1) + "] " + footnotes
.get(i
));
748 * Reformat the date if possible.
753 * @return the reformated date, or the same value if it was not parsable
755 static private String
date(String date
) {
756 SimpleDateFormat out
= new SimpleDateFormat("yyyy/MM/dd");
760 epoch
= Long
.parseLong(date
.trim());
761 } catch (Exception e
) {
766 return out
.format(new Date(1000 * epoch
));
770 Date dat
= new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
772 return out
.format(dat
);
773 } catch (ParseException e
) {