Reddit test: add expected files
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.URL;
6 import java.text.ParseException;
7 import java.text.SimpleDateFormat;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.Date;
11 import java.util.HashMap;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.Map.Entry;
15
16 import org.jsoup.helper.DataUtil;
17 import org.jsoup.helper.StringUtil;
18 import org.jsoup.nodes.Document;
19 import org.jsoup.nodes.Element;
20 import org.jsoup.nodes.Node;
21 import org.jsoup.nodes.TextNode;
22 import org.jsoup.select.NodeTraversor;
23 import org.jsoup.select.NodeVisitor;
24
25 import be.nikiroo.gofetch.data.Comment;
26 import be.nikiroo.gofetch.data.Story;
27 import be.nikiroo.utils.Downloader;
28 import be.nikiroo.utils.StringUtils;
29
30 /**
31 * Base class for website support.
32 *
33 * @author niki
34 */
35 public abstract class BasicSupport {
36 /**
37 * The downloader to use for all web sites via
38 * {@link BasicSupport#open(URL)}
39 */
40 static private Downloader downloader = new Downloader("gofetcher");
41
42 static private String preselector;
43
44 /**
45 * The optional cookies to use to get the site data.
46 */
47 private Map<String, String> cookies = new HashMap<String, String>();
48
49 private Type type;
50
51 /**
52 * Login on the web site (this method does nothing by default, but can be
53 * overridden if needed).
54 *
55 * @throws IOException
56 * in case of I/O error
57 *
58 */
59 public void login() throws IOException {
60 }
61
62 /**
63 * The website textual description, to add in the dispatcher page.
64 * <p>
65 * Should be short.
66 *
67 * @return the description
68 */
69 abstract public String getDescription();
70
71 /**
72 * The gopher "selector" to use for output.
73 * <p>
74 * A kind of "URL path", like "/news/" or "/misc/news/" or...
75 *
76 * @return the selector
77 */
78 public String getSelector() {
79 return getSelector(getType());
80 }
81
82 /**
83 * The support type.
84 *
85 * @return the type
86 */
87 public Type getType() {
88 return type;
89 }
90
91 /**
92 * List all the recent items, but only assure the ID and internal URL to
93 * fetch it later on (until it has been fetched, the rest of the
94 * {@link Story} is not confirmed).
95 *
96 * @return the list of new stories
97 *
98 * @throws IOException
99 * in case of I/O
100 */
101 public List<Story> list() throws IOException {
102 List<Story> list = new ArrayList<Story>();
103
104 login();
105 for (Entry<URL, String> entry : getUrls()) {
106 URL url = entry.getKey();
107 String defaultCateg = entry.getValue();
108 if (defaultCateg == null) {
109 defaultCateg = "";
110 }
111
112 InputStream in = open(url);
113 Document doc = DataUtil.load(in, "UTF-8", url.toString());
114 List<Element> articles = getArticles(doc);
115 for (Element article : articles) {
116 String id = getArticleId(doc, article).trim();
117 String title = getArticleTitle(doc, article).trim();
118 String author = getArticleAuthor(doc, article).trim();
119 String date = getArticleDate(doc, article).trim();
120 String categ = getArticleCategory(doc, article, defaultCateg)
121 .trim();
122 String details = getArticleDetails(doc, article).trim();
123 String intUrl = getArticleIntUrl(doc, article).trim();
124 String extUrl = getArticleExtUrl(doc, article).trim();
125 String content = getArticleContent(doc, article).trim();
126
127 if (id.isEmpty() && date.isEmpty()) {
128 continue;
129 }
130
131 if (!id.isEmpty()) {
132 while (id.length() < 10) {
133 id = "0" + id;
134 }
135 } else {
136 id = date.replace(":", "_").replace("+", "_").replace("/", "-");
137 }
138
139 date = date(date);
140
141 list.add(new Story(getType(), id, title, author, date, categ,
142 details, intUrl, extUrl, content));
143 }
144 }
145
146 return list;
147 }
148
149 /**
150 * The {@link URL}s to process for this website.
151 *
152 * @return the list of {@link URL}s
153 *
154 * @throws IOException
155 * in case of I/O error
156 */
157 abstract protected List<Entry<URL, String>> getUrls() throws IOException;
158
159 /**
160 * The article {@link Element}s of this document.
161 *
162 * @param doc
163 * the main document for the current category
164 *
165 * @return the articles
166 */
167 abstract protected List<Element> getArticles(Document doc);
168
169 /**
170 * The ID of the article (defaults to the date element if empty).
171 *
172 * @param doc
173 * the main document for the current category
174 * @param article
175 * the article to look into
176 *
177 * @return the ID
178 */
179 abstract protected String getArticleId(Document doc, Element article);
180
181 /**
182 * The article title to display.
183 *
184 * @param doc
185 * the main document for the current category
186 * @param article
187 * the article to look into
188 *
189 * @return the title
190 */
191 abstract protected String getArticleTitle(Document doc, Element article);
192
193 /**
194 * The optional article author.
195 *
196 * @param doc
197 * the main document for the current category
198 * @param article
199 * the article to look into
200 *
201 * @return the author
202 */
203 abstract protected String getArticleAuthor(Document doc, Element article);
204
205 /**
206 * The optional article date.
207 *
208 * @param doc
209 * the main document for the current category
210 * @param article
211 * the article to look into
212 *
213 * @return the date
214 */
215 abstract protected String getArticleDate(Document doc, Element article);
216
217 /**
218 * the optional article category.
219 *
220 * @param doc
221 * the main document for the current category
222 * @param article
223 * the article to look into
224 * @param currentCategory
225 * the currently listed category if any (can be NULL)
226 *
227 * @return the category
228 */
229 abstract protected String getArticleCategory(Document doc, Element article,
230 String currentCategory);
231
232 /**
233 * the optional details of the article (can replace the date, author and
234 * category, for instance).
235 *
236 * @param doc
237 * the main document for the current category
238 * @param article
239 * the article to look into
240 *
241 * @return the details
242 */
243 abstract protected String getArticleDetails(Document doc, Element article);
244
245 /**
246 * The (required) {@link URL} that points to the news page on the supported
247 * website.
248 *
249 * @param doc
250 * the main document for the current category
251 * @param article
252 * the article to look into
253 *
254 * @return the internal {@link URL}
255 */
256 abstract protected String getArticleIntUrl(Document doc, Element article);
257
258 /**
259 * the optional {@link URL} that points to an external website for more
260 * information.
261 *
262 * @param doc
263 * the main document for the current category
264 * @param article
265 * the article to look into
266 *
267 * @return the external {@link URL}
268 */
269 abstract protected String getArticleExtUrl(Document doc, Element article);
270
271 /**
272 * The optional article short-content (not the full content, that will be
273 * fetched by {@link BasicSupport#fetch(Story)}).
274 *
275 * @param doc
276 * the main document for the current category
277 * @param article
278 * the article to look into
279 *
280 * @return the short content
281 */
282 abstract protected String getArticleContent(Document doc, Element article);
283
284 /**
285 * Fetch the full article content as well as all the comments associated to
286 * this {@link Story}, if any (can be empty, but not NULL).
287 *
288 * @param story
289 * the story to fetch the comments of
290 *
291 * @throws IOException
292 * in case of I/O error
293 */
294 public void fetch(Story story) throws IOException {
295 String fullContent = "";
296
297 URL url = new URL(story.getUrlInternal());
298 InputStream in = open(url);
299 try {
300 Document doc = DataUtil.load(in, "UTF-8", url.toString());
301 Element article = getFullArticle(doc);
302 if (article != null) {
303 fullContent = getArticleText(article);
304 }
305
306 if (fullContent.isEmpty()) {
307 fullContent = story.getContent();
308 }
309
310 story.setFullContent(fullContent);
311 story.setComments(getComments(doc,
312 getFullArticleCommentPosts(doc, url)));
313 } finally {
314 if (in != null) {
315 in.close();
316 }
317 }
318 }
319
320 /**
321 * Return the text from this {@link Element}, using the
322 * {@link BasicSupport#getElementProcessorFullArticle()} processor logic.
323 *
324 * @param article
325 * the element to extract the text from
326 *
327 * @return the text
328 */
329 protected String getArticleText(Element article) {
330 StringBuilder builder = new StringBuilder();
331 ElementProcessor eProc = getElementProcessorFullArticle();
332 if (eProc != null) {
333 for (String line : toLines(article, eProc)) {
334 builder.append(line + "\n");
335 }
336 } else {
337 builder.append(article.text());
338 }
339
340 // Content is too tight with a single break per line:
341 return builder.toString().replace("\n", "\n\n") //
342 .replace("\n\n\n\n", "\n\n") //
343 .replace("\n\n\n\n", "\n\n") //
344 .trim();
345 }
346
347 /**
348 * Return the full article if available (this is the article to retrieve
349 * from the newly downloaded page at {@link Story#getUrlInternal()}).
350 *
351 * @param doc
352 * the (full article) document to work on
353 *
354 * @return the article or NULL
355 */
356 abstract protected Element getFullArticle(Document doc);
357
358 /**
359 * Return the list of comment {@link Element}s from this optional container
360 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
361 *
362 * @param doc
363 * the (full article) document to work on
364 * @param intUrl
365 * the internal {@link URL} this article wa taken from (the
366 * {@link URL} from the supported website)
367 *
368 * @return the list of comment posts
369 */
370 abstract protected List<Element> getFullArticleCommentPosts(Document doc,
371 URL intUrl);
372
373 /**
374 * The {@link ElementProcessor} to use to convert the main article element
375 * (see {@link BasicSupport#getFullArticle(Document)}) into text.
376 * <p>
377 * See {@link BasicElementProcessor} for a working, basic implementation.
378 * <p>
379 * Can be NULL to simply use {@link Element#text()}.
380 *
381 * @return the processor, or NULL
382 */
383 abstract protected ElementProcessor getElementProcessorFullArticle();
384
385 /**
386 * Open a network resource.
387 * <p>
388 * You need to close the returned {@link InputStream} when done.
389 *
390 * @param url
391 * the source to open
392 *
393 * @return the content
394 *
395 * @throws IOException
396 * in case of I/O error
397 */
398 protected InputStream open(URL url) throws IOException {
399 return downloader.open(url, url, cookies, null, null, null);
400 }
401
402 /**
403 * Convert the comment elements into {@link Comment}s
404 *
405 * @param doc
406 * the document we work on
407 * @param posts
408 * the comment elements
409 *
410 * @return the converted {@link Comment}s
411 */
412 private List<Comment> getComments(Document doc, List<Element> posts) {
413 List<Comment> comments = new ArrayList<Comment>();
414 if (posts != null) {
415 for (Element post : posts) {
416 String id = getCommentId(post).trim();
417 String author = getCommentAuthor(post).trim();
418 String title = getCommentTitle(post).trim();
419 String date = getCommentDate(post).trim();
420
421 List<String> content = new ArrayList<String>();
422
423 if (id.isEmpty()) {
424 id = date;
425 }
426
427 date = date(date);
428
429 Element contentE = getCommentContentElement(post);
430 if (contentE != null) {
431 ElementProcessor eProc = getElementProcessorComment();
432 if (eProc != null) {
433 for (String line : toLines(contentE, eProc)) {
434 content.add(line);
435 }
436 } else {
437 content = Arrays.asList(contentE.text().split("\n"));
438 }
439 }
440
441 Comment comment = new Comment(id, author, title, date, content);
442 comment.addAll(getComments(doc,
443 getCommentCommentPosts(doc, post)));
444
445 if (!comment.isEmpty()) {
446 comments.add(comment);
447 }
448 }
449 }
450
451 return comments;
452 }
453
454 /**
455 * Return the list of subcomment {@link Element}s from this comment element
456 * -- must <b>NOT</b> return the "container" as a comment {@link Element}.
457 *
458 * @param doc
459 * the (full article) document to work on
460 * @param container
461 * the container (a comment {@link Element})
462 *
463 * @return the list of comment posts
464 */
465 abstract protected List<Element> getCommentCommentPosts(Document doc,
466 Element container);
467
468 /**
469 * Compute the ID of the given comment element.
470 *
471 * @param post
472 * the comment element
473 *
474 * @return the ID
475 */
476 abstract protected String getCommentId(Element post);
477
478 /**
479 * Compute the author of the given comment element.
480 *
481 * @param post
482 * the comment element
483 *
484 * @return the author
485 */
486 abstract protected String getCommentAuthor(Element post);
487
488 /**
489 * Compute the title of the given comment element.
490 *
491 * @param post
492 * the comment element
493 *
494 * @return the title
495 */
496 abstract protected String getCommentTitle(Element post);
497
498 /**
499 * Compute the date of the given comment element.
500 *
501 * @param post
502 * the comment element
503 *
504 * @return the date
505 */
506 abstract protected String getCommentDate(Element post);
507
508 /**
509 * Get the main of the given comment element, which can be NULL.
510 *
511 * @param post
512 * the comment element
513 *
514 * @return the element
515 */
516 abstract protected Element getCommentContentElement(Element post);
517
518 /**
519 * The {@link ElementProcessor} to use to convert the main comment element
520 * (see {@link BasicSupport#getCommentContentElement(Element)}) into text.
521 * <p>
522 * See {@link BasicElementProcessor} for a working, basic implementation.
523 * <p>
524 * Can be NULL to simply use {@link Element#text()}.
525 *
526 * @return the processor
527 */
528 abstract protected ElementProcessor getElementProcessorComment();
529
530 /**
531 * The support type.
532 *
533 * @param type
534 * the new type
535 */
536 protected void setType(Type type) {
537 this.type = type;
538 }
539
540 /**
541 * Add a cookie for all site connections.
542 *
543 * @param name
544 * the cookie name
545 * @param value
546 * the value
547 */
548 protected void addCookie(String name, String value) {
549 cookies.put(name, value);
550 }
551
552 /**
553 * The {@link String} to append to the selector (the selector will be
554 * constructed as "this string" then "/type/".
555 *
556 * @param preselector
557 * the preselector to set
558 */
559 static public void setPreselector(String preselector) {
560 BasicSupport.preselector = preselector;
561 }
562
563 /**
564 * Return a {@link BasicSupport} that is compatible with the given
565 * {@link Type} if it exists (or NULL if not).
566 *
567 * @param type
568 * the type
569 *
570 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
571 */
572 static public BasicSupport getSupport(Type type) {
573 BasicSupport support = null;
574
575 if (type != null) {
576 switch (type) {
577 case SLASHDOT:
578 support = new Slashdot();
579 break;
580 case PIPEDOT:
581 support = new Pipedot();
582 break;
583 case LWN:
584 support = new LWN();
585 break;
586 case LEMONDE:
587 support = new LeMonde();
588 break;
589 case REGISTER:
590 support = new TheRegister();
591 break;
592 case TOO_LINUX:
593 support = new TooLinux();
594 break;
595 case ERE_NUMERIQUE:
596 support = new EreNumerique();
597 break;
598 case PHORONIX:
599 support = new Phoronix();
600 break;
601 case SEPT_SUR_SEPT:
602 support = new SeptSurSept();
603 break;
604 case REDDIT:
605 support = new Reddit();
606 break;
607 }
608
609 if (support != null) {
610 support.setType(type);
611 }
612 }
613
614 return support;
615 }
616
617 /**
618 * The gopher "selector" to use for output for this type, using the
619 * preselector.
620 * <p>
621 * A kind of "URL path", like "/news/" or "/misc/news/" or...
622 *
623 * @param type
624 * the type to get the selector of
625 *
626 * @return the selector
627 */
628 static public String getSelector(Type type) {
629 return preselector + "/" + type + "/";
630 }
631
632 /**
633 * Process the given element into text (each line is a text paragraph and
634 * can be prepended with ">" signs to indicate a quote or sub-quote or
635 * sub-sub-quote...).
636 *
637 * @param element
638 * the element to process
639 * @param elementProcessor
640 * the element processor, must not be NULL
641 *
642 * @return text lines, each line is a paragraph
643 */
644 static protected List<String> toLines(Element element,
645 final ElementProcessor elementProcessor) {
646 final List<String> lines = new ArrayList<String>();
647 final StringBuilder currentLine = new StringBuilder();
648 final List<Integer> quoted = new ArrayList<Integer>();
649 final List<Node> ignoredNodes = new ArrayList<Node>();
650 final List<String> footnotes = new ArrayList<String>();
651
652 if (element != null) {
653 new NodeTraversor(new NodeVisitor() {
654 @Override
655 public void head(Node node, int depth) {
656 String manual = null;
657 boolean ignore = elementProcessor.ignoreNode(node)
658 || ignoredNodes.contains(node.parentNode());
659 // Manual processing
660 if (!ignore) {
661 manual = elementProcessor.manualProcessing(node);
662 if (manual != null) {
663 currentLine.append(manual);
664 ignore = true;
665 }
666 }
667
668 // Subtitle check
669 if (!ignore) {
670 String subtitle = elementProcessor.isSubtitle(node);
671 if (subtitle != null) {
672 subtitle = subtitle.trim();
673 currentLine.append("\n[ " + subtitle + " ]\n");
674 ignore = true;
675 }
676 }
677
678 // <pre> check
679 if (!ignore) {
680 if (node instanceof Element) {
681 Element el = (Element) node;
682 if ("pre".equals(el.tagName())) {
683 currentLine.append(StringUtils
684 .unhtml(el.text()).trim());
685 ignore = true;
686 }
687 }
688 }
689
690 if (ignore) {
691 ignoredNodes.add(node);
692 return;
693 }
694
695 String prep = "";
696 for (int i = 0; i < quoted.size(); i++) {
697 prep += ">";
698 }
699 prep += " ";
700
701 boolean enterQuote = elementProcessor.detectQuote(node);
702 boolean leaveQuote = quoted.contains(depth);
703
704 if (enterQuote) {
705 quoted.add(depth);
706 }
707
708 if (leaveQuote) {
709 quoted.remove(Integer.valueOf(depth));
710 }
711
712 if (enterQuote || leaveQuote) {
713 if (currentLine.length() > 0) {
714 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
715 currentLine.setLength(currentLine.length() - 1);
716 }
717 for (String l : currentLine.toString().split("\n")) {
718 lines.add(prep + l);
719 }
720 }
721 currentLine.setLength(0);
722 }
723
724 if (node instanceof Element) {
725 Element element = (Element) node;
726 boolean block = element.isBlock()
727 || element.tagName().equalsIgnoreCase("br");
728 if (block && currentLine.length() > 0) {
729 currentLine.append("\n");
730 }
731
732 if (!element.absUrl("href").trim().isEmpty()) {
733 footnotes.add(element.absUrl("href"));
734 currentLine.append("[" + footnotes.size() + "]");
735 }
736 } else if (node instanceof TextNode) {
737 TextNode textNode = (TextNode) node;
738 String line = StringUtil.normaliseWhitespace(textNode
739 .getWholeText());
740
741 currentLine.append(elementProcessor.processText(line));
742 currentLine.append(" ");
743 }
744 }
745
746 @Override
747 public void tail(Node node, int depth) {
748 }
749 }).traverse(element);
750 }
751
752 if (currentLine.length() > 0) {
753 String prep = "";
754 for (int i = 0; i < quoted.size(); i++) {
755 prep += ">";
756 }
757 prep += " ";
758 if (currentLine.length() > 0) {
759 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
760 currentLine.setLength(currentLine.length() - 1);
761 }
762 for (String l : currentLine.toString().split("\n")) {
763 lines.add(prep + l);
764 }
765 }
766 }
767
768 // Fix spaces and nbsp, remove multiple following blank lines
769 List<String> linesCopy = new ArrayList<String>(lines.size());
770 long blanks = 0;
771 for (int i = 0; i < lines.size(); i++) {
772 String line = lines.get(i).replace(" ", " ") // nbsp -> space
773 .replace(" ", " ").trim();
774 if (line.isEmpty()) {
775 blanks++;
776 } else {
777 blanks = 0;
778 }
779
780 if (blanks < 2) {
781 linesCopy.add(line);
782 }
783 }
784
785 // Footnotes insertion
786 if (footnotes.size() > 0) {
787 linesCopy.add("");
788 linesCopy.add("");
789 linesCopy.add("");
790 linesCopy.add("");
791 for (int i = 0; i < footnotes.size(); i++) {
792 linesCopy.add("[" + (i + 1) + "] " + footnotes.get(i));
793 }
794 }
795
796 return linesCopy;
797 }
798
799 /**
800 * Reformat the date if possible.
801 *
802 * @param date
803 * the input date
804 *
805 * @return the reformated date, or the same value if it was not parsable
806 */
807 static private String date(String date) {
808 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
809
810 long epoch = 0;
811 try {
812 epoch = Long.parseLong(date.trim());
813 } catch (Exception e) {
814 epoch = 0;
815 }
816
817 if (epoch > 0) {
818 return out.format(new Date(1000 * epoch));
819 }
820
821 try {
822 Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
823 .parse(date.trim());
824 return out.format(dat);
825 } catch (Exception e) {
826 return date;
827 }
828 }
829 }