1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.text
.ParseException
;
7 import java
.text
.SimpleDateFormat
;
8 import java
.util
.ArrayList
;
9 import java
.util
.Arrays
;
10 import java
.util
.Date
;
11 import java
.util
.HashMap
;
12 import java
.util
.List
;
14 import java
.util
.Map
.Entry
;
16 import org
.jsoup
.helper
.DataUtil
;
17 import org
.jsoup
.helper
.StringUtil
;
18 import org
.jsoup
.nodes
.Document
;
19 import org
.jsoup
.nodes
.Element
;
20 import org
.jsoup
.nodes
.Node
;
21 import org
.jsoup
.nodes
.TextNode
;
22 import org
.jsoup
.select
.NodeTraversor
;
23 import org
.jsoup
.select
.NodeVisitor
;
25 import be
.nikiroo
.gofetch
.data
.Comment
;
26 import be
.nikiroo
.gofetch
.data
.Story
;
27 import be
.nikiroo
.utils
.Downloader
;
28 import be
.nikiroo
.utils
.StringUtils
;
31 * Base class for website support.
35 public abstract class BasicSupport
{
37 * The downloader to use for all web sites via
38 * {@link BasicSupport#open(URL)}
40 static private Downloader downloader
= new Downloader("gofetcher");
42 static private String preselector
;
45 * The optional cookies to use to get the site data.
47 private Map
<String
, String
> cookies
= new HashMap
<String
, String
>();
52 * Login on the web site (this method does nothing by default, but can be
53 * overridden if needed).
56 * in case of I/O error
59 public void login() throws IOException
{
63 * The website textual description, to add in the dispatcher page.
67 * @return the description
69 abstract public String
getDescription();
72 * The gopher "selector" to use for output.
74 * A kind of "URL path", like "/news/" or "/misc/news/" or...
76 * @return the selector
78 public String
getSelector() {
79 return getSelector(getType());
// Accessor for the Type of this support; getSelector() passes the returned
// value to getSelector(Type) to build the output selector.
87 public Type
getType() {
92 * List all the recent items, but only assure the ID and internal URL to
93 * fetch it later on (until it has been fetched, the rest of the
94 * {@link Story} is not confirmed).
96 * @return the list of new stories
// Builds the list of stories: every (URL, default category) entry returned by
// getUrls() is opened and parsed, and the fields extracted from its article
// elements are passed to the Story constructor.
101 public List
<Story
> list() throws IOException
{
102 List
<Story
> list
= new ArrayList
<Story
>();
// Each entry pairs a page URL (key) with a default category (value).
105 for (Entry
<URL
, String
> entry
: getUrls()) {
106 URL url
= entry
.getKey();
107 String defaultCateg
= entry
.getValue();
// The default category may be NULL.
108 if (defaultCateg
== null) {
// Download the page, then parse it with jsoup: "UTF-8" is given as charset
// and the URL as base URI.
112 InputStream in
= open(url
);
113 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
114 List
<Element
> articles
= getArticles(doc
);
// The values returned by the getArticle*() hooks are trimmed directly, so an
// implementation returning NULL would trigger a NullPointerException here.
115 for (Element article
: articles
) {
116 String id
= getArticleId(doc
, article
).trim();
117 String title
= getArticleTitle(doc
, article
).trim();
118 String author
= getArticleAuthor(doc
, article
).trim();
119 String date
= getArticleDate(doc
, article
).trim();
// The category hook also receives the default category of the current URL.
120 String categ
= getArticleCategory(doc
, article
, defaultCateg
)
122 String details
= getArticleDetails(doc
, article
).trim();
123 String intUrl
= getArticleIntUrl(doc
, article
).trim();
124 String extUrl
= getArticleExtUrl(doc
, article
).trim();
125 String content
= getArticleContent(doc
, article
).trim();
// Case where the article offers neither an ID nor a date.
127 if (id
.isEmpty() && date
.isEmpty()) {
// Loops until the ID is at least 10 characters long.
132 while (id
.length() < 10) {
// ID derived from the date: ':' and '+' become '_', '/' becomes '-'.
// NOTE(review): presumably only applies when the ID is empty -- confirm.
136 id
= date
.replace(":", "_").replace("+", "_").replace("/", "-");
// The Story is tagged with the type of this support.
141 list
.add(new Story(getType(), id
, title
, author
, date
, categ
,
142 details
, intUrl
, extUrl
, content
));
150 * The {@link URL}s to process for this website.
152 * @return the list of {@link URL}s
154 * @throws IOException
155 * in case of I/O error
157 abstract protected List
<Entry
<URL
, String
>> getUrls() throws IOException
;
160 * The article {@link Element}s of this document.
163 * the main document for the current category
165 * @return the articles
167 abstract protected List
<Element
> getArticles(Document doc
);
170 * The ID of the article (defaults to the date element if empty).
173 * the main document for the current category
175 * the article to look into
179 abstract protected String
getArticleId(Document doc
, Element article
);
182 * The article title to display.
185 * the main document for the current category
187 * the article to look into
191 abstract protected String
getArticleTitle(Document doc
, Element article
);
194 * The optional article author.
197 * the main document for the current category
199 * the article to look into
203 abstract protected String
getArticleAuthor(Document doc
, Element article
);
206 * The optional article date.
209 * the main document for the current category
211 * the article to look into
215 abstract protected String
getArticleDate(Document doc
, Element article
);
218 * the optional article category.
221 * the main document for the current category
223 * the article to look into
224 * @param currentCategory
225 * the currently listed category if any (can be NULL)
227 * @return the category
229 abstract protected String
getArticleCategory(Document doc
, Element article
,
230 String currentCategory
);
233 * the optional details of the article (can replace the date, author and
234 * category, for instance).
237 * the main document for the current category
239 * the article to look into
241 * @return the details
243 abstract protected String
getArticleDetails(Document doc
, Element article
);
246 * The (required) {@link URL} that points to the news page on the supported
250 * the main document for the current category
252 * the article to look into
254 * @return the internal {@link URL}
256 abstract protected String
getArticleIntUrl(Document doc
, Element article
);
259 * the optional {@link URL} that points to an external website for more
263 * the main document for the current category
265 * the article to look into
267 * @return the external {@link URL}
269 abstract protected String
getArticleExtUrl(Document doc
, Element article
);
272 * The optional article short-content (not the full content, that will be
273 * fetched by {@link BasicSupport#fetch(Story)}).
276 * the main document for the current category
278 * the article to look into
280 * @return the short content
282 abstract protected String
getArticleContent(Document doc
, Element article
);
285 * Fetch the full article content as well as all the comments associated to
286 * this {@link Story}, if any (can be empty, but not NULL).
289 * the story to fetch the comments of
291 * @throws IOException
292 * in case of I/O error
// Downloads the internal page of the given story and stores on the Story
// both its full text and its comments.
294 public void fetch(Story story
) throws IOException
{
// Remains empty unless an article element is found below.
295 String fullContent
= "";
// The internal URL of the story must be well-formed, or new URL(...) throws.
297 URL url
= new URL(story
.getUrlInternal());
298 InputStream in
= open(url
);
// Parse with jsoup: "UTF-8" is given as charset and the URL as base URI.
300 Document doc
= DataUtil
.load(in
, "UTF-8", url
.toString());
// getFullArticle(...) may return NULL, hence the check.
301 Element article
= getFullArticle(doc
);
302 if (article
!= null) {
303 fullContent
= getArticleText(article
);
// No text available: fall back on the content already held by the story.
306 if (fullContent
.isEmpty()) {
307 fullContent
= story
.getContent();
310 story
.setFullContent(fullContent
);
// The comment elements supplied by the subclass are converted by getComments(...).
311 story
.setComments(getComments(doc
,
312 getFullArticleCommentPosts(doc
, url
)));
321 * Return the text from this {@link Element}, using the
322 * {@link BasicSupport#getElementProcessorFullArticle()} processor logic.
325 * the element to extract the text from
// Returns the text of the given element, built from toLines(...) with the
// full-article processor and/or from Element#text().
// NOTE(review): presumably toLines(...) is used when the processor is not NULL
// and Element#text() otherwise -- confirm.
329 protected String
getArticleText(Element article
) {
330 StringBuilder builder
= new StringBuilder();
331 ElementProcessor eProc
= getElementProcessorFullArticle();
// Every paragraph returned by toLines(...) is followed by one "\n".
333 for (String line
: toLines(article
, eProc
)) {
334 builder
.append(line
+ "\n");
// Plain jsoup text of the element.
337 builder
.append(article
.text());
340 // Content is too tight with a single break per line:
// Each break is doubled, then runs of four breaks are folded back into two
// (the folding is applied twice).
341 return builder
.toString().replace("\n", "\n\n") //
342 .replace("\n\n\n\n", "\n\n") //
343 .replace("\n\n\n\n", "\n\n") //
348 * Return the full article if available (this is the article to retrieve
349 * from the newly downloaded page at {@link Story#getUrlInternal()}).
352 * the (full article) document to work on
354 * @return the article or NULL
356 abstract protected Element
getFullArticle(Document doc
);
359 * Return the list of comment {@link Element}s from this optional container
360 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
363 * the (full article) document to work on
365 * the internal {@link URL} this article was taken from (the
366 * {@link URL} from the supported website)
368 * @return the list of comment posts
// Abstract hook: fetch(Story) calls it with the parsed article document and
// the internal URL of the story, and hands the returned elements to
// getComments(...).
370 abstract protected List
<Element
> getFullArticleCommentPosts(Document doc
,
374 * The {@link ElementProcessor} to use to convert the main article element
375 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
377 * See {@link BasicElementProcessor} for a working, basic implementation.
379 * Can be NULL to simply use {@link Element#text()}.
381 * @return the processor, or NULL
383 abstract protected ElementProcessor
getElementProcessorFullArticle();
386 * Open a network resource.
388 * You need to close the returned {@link InputStream} when done.
393 * @return the content
395 * @throws IOException
396 * in case of I/O error
398 protected InputStream
open(URL url
) throws IOException
{
399 return downloader
.open(url
, url
, cookies
, null, null, null);
403 * Convert the comment elements into {@link Comment}s
406 * the document we work on
408 * the comment elements
410 * @return the converted {@link Comment}s
// Converts the given comment elements into Comment objects, recursing into
// the sub-comments of each post through getCommentCommentPosts(...).
412 private List
<Comment
> getComments(Document doc
, List
<Element
> posts
) {
413 List
<Comment
> comments
= new ArrayList
<Comment
>();
// The values returned by the getComment*() hooks are trimmed directly, so an
// implementation returning NULL would trigger a NullPointerException here.
415 for (Element post
: posts
) {
416 String id
= getCommentId(post
).trim();
417 String author
= getCommentAuthor(post
).trim();
418 String title
= getCommentTitle(post
).trim();
419 String date
= getCommentDate(post
).trim();
421 List
<String
> content
= new ArrayList
<String
>();
// The content element may be NULL; it is only processed when present.
429 Element contentE
= getCommentContentElement(post
);
430 if (contentE
!= null) {
431 ElementProcessor eProc
= getElementProcessorComment();
433 for (String line
: toLines(contentE
, eProc
)) {
// Plain-text variant: Element#text() split on line breaks.
// NOTE(review): presumably the fallback used when the processor is NULL -- confirm.
437 content
= Arrays
.asList(contentE
.text().split("\n"));
441 Comment comment
= new Comment(id
, author
, title
, date
, content
);
// Attach the converted sub-comments of this post (recursive call).
442 comment
.addAll(getComments(doc
,
443 getCommentCommentPosts(doc
, post
)));
// Only comments for which isEmpty() is false are added to the result.
445 if (!comment
.isEmpty()) {
446 comments
.add(comment
);
455 * Return the list of subcomment {@link Element}s from this comment element
456 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
459 * the (full article) document to work on
461 * the container (a comment {@link Element})
463 * @return the list of comment posts
// Abstract hook: getComments(...) calls it with the document and the current
// post to obtain the sub-comment elements of that post, which it then
// converts recursively.
465 abstract protected List
<Element
> getCommentCommentPosts(Document doc
,
469 * Compute the ID of the given comment element.
472 * the comment element
476 abstract protected String
getCommentId(Element post
);
479 * Compute the author of the given comment element.
482 * the comment element
486 abstract protected String
getCommentAuthor(Element post
);
489 * Compute the title of the given comment element.
492 * the comment element
496 abstract protected String
getCommentTitle(Element post
);
499 * Compute the date of the given comment element.
502 * the comment element
506 abstract protected String
getCommentDate(Element post
);
509 * Get the main of the given comment element, which can be NULL.
512 * the comment element
514 * @return the element
516 abstract protected Element
getCommentContentElement(Element post
);
519 * The {@link ElementProcessor} to use to convert the main comment element
520 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
522 * See {@link BasicElementProcessor} for a working, basic implementation.
524 * Can be NULL to simply use {@link Element#text()}.
526 * @return the processor
528 abstract protected ElementProcessor
getElementProcessorComment();
// Setter counterpart of getType(); getSupport(Type) calls it on the support
// it has just created, passing the requested type.
536 protected void setType(Type type
) {
541 * Add a cookie for all site connections.
548 protected void addCookie(String name
, String value
) {
549 cookies
.put(name
, value
);
553 * The {@link String} to append to the selector (the selector will be
554 * constructed as "this string" then "/type/".
557 * the preselector to set
559 static public void setPreselector(String preselector
) {
560 BasicSupport
.preselector
= preselector
;
564 * Return a {@link BasicSupport} that is compatible with the given
565 * {@link Type} if it exists (or NULL if not).
570 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
// Factory: creates the support implementation matching the given type.
// NOTE(review): the choice between the implementations below is presumably a
// switch on the type -- confirm.
572 static public BasicSupport
getSupport(Type type
) {
// Starts as NULL and is only replaced by one of the assignments below.
573 BasicSupport support
= null;
578 support
= new Slashdot();
581 support
= new Pipedot();
587 support
= new LeMonde();
590 support
= new TheRegister();
593 support
= new TooLinux();
596 support
= new EreNumerique();
599 support
= new Phoronix();
602 support
= new SeptSurSept();
605 support
= new Reddit();
// Record the requested type on the newly created instance.
609 if (support
!= null) {
610 support
.setType(type
);
618 * The gopher "selector" to use for output for this type, using the
621 * A kind of "URL path", like "/news/" or "/misc/news/" or...
624 * the type to get the selector of
626 * @return the selector
628 static public String
getSelector(Type type
) {
629 return preselector
+ "/" + type
+ "/";
633 * Process the given element into text (each line is a text paragraph and
634 * can be prepended with ">" signs to indicate a quote or sub-quote or
638 * the element to process
639 * @param elementProcessor
640 * the element processor, must not be NULL
642 * @return text lines, each line is a paragraph
// Walks the given element with a jsoup NodeTraversor and accumulates its text
// into lines; hrefs met on the way are collected as footnotes and listed at
// the end as "[n] url".
644 static protected List
<String
> toLines(Element element
,
645 final ElementProcessor elementProcessor
) {
646 final List
<String
> lines
= new ArrayList
<String
>();
// Buffer for the text being assembled; it may hold several "\n"-separated parts.
647 final StringBuilder currentLine
= new StringBuilder();
// Depth values, looked up and removed by depth in head(...).
648 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
// A node whose parent is in this list is treated as ignored too.
649 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
// Absolute URLs, in order of appearance; the position + 1 is the footnote number.
650 final List
<String
> footnotes
= new ArrayList
<String
>();
652 if (element
!= null) {
653 new NodeTraversor(new NodeVisitor() {
655 public void head(Node node
, int depth
) {
656 String manual
= null;
// Ignored either on request of the processor or because the parent is ignored.
657 boolean ignore
= elementProcessor
.ignoreNode(node
)
658 || ignoredNodes
.contains(node
.parentNode());
// The processor can supply the text for this node itself (non-NULL result).
661 manual
= elementProcessor
.manualProcessing(node
);
662 if (manual
!= null) {
663 currentLine
.append(manual
);
// A non-NULL subtitle is written on its own line as "[ subtitle ]".
670 String subtitle
= elementProcessor
.isSubtitle(node
);
671 if (subtitle
!= null) {
672 subtitle
= subtitle
.trim();
673 currentLine
.append("\n[ " + subtitle
+ " ]\n");
// The text of a "pre" element is appended after StringUtils.unhtml(...) and trim().
680 if (node
instanceof Element
) {
681 Element el
= (Element
) node
;
682 if ("pre".equals(el
.tagName())) {
683 currentLine
.append(StringUtils
684 .unhtml(el
.text()).trim());
691 ignoredNodes
.add(node
);
696 for (int i
= 0; i
< quoted
.size(); i
++) {
// enterQuote: the processor detects a quote on this node;
// leaveQuote: this depth was recorded in "quoted" earlier.
701 boolean enterQuote
= elementProcessor
.detectQuote(node
);
702 boolean leaveQuote
= quoted
.contains(depth
);
// Integer.valueOf(...) selects remove(Object): the depth value is removed,
// not the element stored at that index.
709 quoted
.remove(Integer
.valueOf(depth
));
712 if (enterQuote
|| leaveQuote
) {
// Drop one trailing "\n" before splitting the buffer into parts.
713 if (currentLine
.length() > 0) {
714 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
715 currentLine
.setLength(currentLine
.length() - 1);
717 for (String l
: currentLine
.toString().split("\n")) {
// Empty the buffer.
721 currentLine
.setLength(0);
724 if (node
instanceof Element
) {
725 Element element
= (Element
) node
;
// Block elements and <br> add a line break, unless the buffer is still empty.
726 boolean block
= element
.isBlock()
727 || element
.tagName().equalsIgnoreCase("br");
728 if (block
&& currentLine
.length() > 0) {
729 currentLine
.append("\n");
// An element with a non-empty absolute href becomes a footnote; the text gets
// a "[n]" marker where n is the new size of the footnote list.
732 if (!element
.absUrl("href").trim().isEmpty()) {
733 footnotes
.add(element
.absUrl("href"));
734 currentLine
.append("[" + footnotes
.size() + "]");
736 } else if (node
instanceof TextNode
) {
737 TextNode textNode
= (TextNode
) node
;
738 String line
= StringUtil
.normaliseWhitespace(textNode
// Text goes through the processor, then a single space is appended.
741 currentLine
.append(elementProcessor
.processText(line
));
742 currentLine
.append(" ");
747 public void tail(Node node
, int depth
) {
749 }).traverse(element
);
// After the traversal: handle what is still in the buffer.
752 if (currentLine
.length() > 0) {
754 for (int i
= 0; i
< quoted
.size(); i
++) {
758 if (currentLine
.length() > 0) {
759 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
760 currentLine
.setLength(currentLine
.length() - 1);
762 for (String l
: currentLine
.toString().split("\n")) {
768 // Fix spaces and nbsp, remove multiple following blank lines
769 List
<String
> linesCopy
= new ArrayList
<String
>(lines
.size());
771 for (int i
= 0; i
< lines
.size(); i
++) {
772 String line
= lines
.get(i
).replace(" ", " ") // nbsp -> space
773 .replace(" ", " ").trim();
774 if (line
.isEmpty()) {
785 // Footnotes insertion
786 if (footnotes
.size() > 0) {
// Listed as "[n] url", numbered from 1 to match the inline "[n]" markers.
791 for (int i
= 0; i
< footnotes
.size(); i
++) {
792 linesCopy
.add("[" + (i
+ 1) + "] " + footnotes
.get(i
));
800 * Reformat the date if possible.
805 * @return the reformatted date, or the same value if it was not parsable
807 static private String
date(String date
) {
808 SimpleDateFormat out
= new SimpleDateFormat("yyyy/MM/dd");
812 epoch
= Long
.parseLong(date
.trim());
813 } catch (Exception e
) {
818 return out
.format(new Date(1000 * epoch
));
822 Date dat
= new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
824 return out
.format(dat
);
825 } catch (Exception e
) {