1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.text
.ParseException
;
5 import java
.text
.SimpleDateFormat
;
6 import java
.util
.ArrayList
;
10 import org
.jsoup
.helper
.StringUtil
;
11 import org
.jsoup
.nodes
.Element
;
12 import org
.jsoup
.nodes
.Node
;
13 import org
.jsoup
.nodes
.TextNode
;
14 import org
.jsoup
.select
.Elements
;
15 import org
.jsoup
.select
.NodeTraversor
;
16 import org
.jsoup
.select
.NodeVisitor
;
18 import be
.nikiroo
.gofetch
.data
.Story
;
19 import be
.nikiroo
.utils
.Downloader
;
22 * Base class for website support.
26 public abstract class BasicSupport
{
27 /** The downloader to use for all websites. */
28 protected static Downloader downloader
= new Downloader("gofetcher");
31 * The support type (each website we support has a single type).
36 /** EN: Any, but mostly IT/Sci */
38 /** EN: Clone of Slashdot, mostly abandoned */
53 * Used to process an element into lines.
57 public interface ElementProcessor
{
59 * Detect if this node is a quote and should be trated as such.
63 * @return TRUE if it is
65 public boolean detectQuote(Node node
);
68 * Process text content (will be called on each text element, allowing
69 * you to modify it if needed).
74 * @return the resulting text
76 public String
processText(String text
);
83 * @return TRUE if it has to be ignored
85 public boolean ignoreNode(Node node
);
88 * Manually process this node (and return the manual processing value)
91 * If the node is manually processed, it and its children will not be
92 * automatically processed.
95 * the node to optionally process
97 * @return NULL if not processed (will thus be automatically processed
98 * as usual), a {@link String} (may be empty) if we process it
99 * manually -- the given {@link String} will be used instead of
100 * the usual automatic processing if not NULL
102 public String
manualProcessing(Node node
);
105 * This {@link Node} is a subtitle and should be treated as such
111 * @return NULL if it is not a subtitle, the subtitle to use if it is
113 public String
isSubtitle(Node node
);
117 * A default {@link ElementProcessor} (will not detect or process anything
122 public class BasicElementProcessor
implements ElementProcessor
{
124 public boolean detectQuote(Node node
) {
129 public String
processText(String text
) {
134 public boolean ignoreNode(Node node
) {
139 public String
manualProcessing(Node node
) {
144 public String
isSubtitle(Node node
) {
149 static private String preselector
;
154 * List all the recent items, but only assure the ID and internal URL to
155 * fetch it later on (until it has been fetched, the rest of the
156 * {@link Story} is not confirmed).
158 * @return the list of new stories
160 * @throws IOException
163 abstract public List
<Story
> list() throws IOException
;
166 * Fetch the full article content as well as all the comments associated to
167 * this {@link Story}, if any (can be empty, but not NULL).
170 * the story to fetch the comments of
172 * @throws IOException
173 * in case of I/O error
175 abstract public void fetch(Story story
) throws IOException
;
178 * The website textual description, to add in the dispatcher page.
182 * @return the description
184 abstract public String
getDescription();
187 * The gopher "selector" to use for output.
189 * A kind of "URL path", like "/news/" or "/misc/news/" or...
191 * @return the selector
193 public String
getSelector() {
194 return getSelector(type
);
202 public Type
getType() {
212 protected void setType(Type type
) {
217 * The {@link String} to append to the selector (the selector will be
218 * constructed as "this string" then "/type/").
221 * the preselector to set
223 static public void setPreselector(String preselector
) {
224 BasicSupport
.preselector
= preselector
;
228 * Return a {@link BasicSupport} that is compatible with the given
229 * {@link Type} if it exists (or NULL if not).
234 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
236 static public BasicSupport
getSupport(Type type
) {
237 BasicSupport support
= null;
242 support
= new Slashdot();
245 support
= new Pipedot();
251 support
= new LeMonde();
254 support
= new TheRegister();
257 support
= new TooLinux();
260 support
= new EreNumerique();
264 if (support
!= null) {
265 support
.setType(type
);
273 * The gopher "selector" to use for output for this type, using the
276 * A kind of "URL path", like "/news/" or "/misc/news/" or...
279 * the type to get the selector of
281 * @return the selector
283 static public String
getSelector(Type type
) {
284 return preselector
+ "/" + type
+ "/";
288 * Get the first {@link Element} of the given class, or an empty span
289 * {@link Element} if none found.
292 * the element to look in
294 * the class to look for
296 * @return the value or an empty span {@link Element}
298 static protected Element
firstOrEmpty(Element element
, String className
) {
299 Elements subElements
= element
.getElementsByClass(className
);
300 if (subElements
.size() > 0) {
301 return subElements
.get(0);
304 return new Element("span");
308 * Get the first {@link Element} of the given tag, or an empty span
309 * {@link Element} if none found.
312 * the element to look in
314 * the tag to look for
316 * @return the value or an empty span {@link Element}
318 static protected Element
firstOrEmptyTag(Element element
, String tagName
) {
319 Elements subElements
= element
.getElementsByTag(tagName
);
320 if (subElements
.size() > 0) {
321 return subElements
.get(0);
324 return new Element("span");
328 * Process the given element into text (each line is a text paragraph and
329 * can be prepended with ">" signs to indicate a quote or sub-quote or
333 * the element to process
334 * @param elementProcessor
335 * the element processor, must not be NULL
337 * @return text lines, each line is a paragraph
339 static protected List
<String
> toLines(Element element
,
340 final ElementProcessor elementProcessor
) {
341 final List
<String
> lines
= new ArrayList
<String
>();
342 final StringBuilder currentLine
= new StringBuilder();
343 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
344 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
346 if (element
!= null) {
347 new NodeTraversor(new NodeVisitor() {
349 public void head(Node node
, int depth
) {
350 String manual
= null;
351 boolean ignore
= elementProcessor
.ignoreNode(node
)
352 || ignoredNodes
.contains(node
.parentNode());
355 manual
= elementProcessor
.manualProcessing(node
);
356 if (manual
!= null) {
357 currentLine
.append(manual
);
364 String subtitle
= elementProcessor
.isSubtitle(node
);
365 if (subtitle
!= null) {
366 subtitle
= subtitle
.trim();
367 currentLine
.append("\n[ " + subtitle
+ " ]\n");
373 ignoredNodes
.add(node
);
378 for (int i
= 0; i
< quoted
.size(); i
++) {
383 boolean enterQuote
= elementProcessor
.detectQuote(node
);
384 boolean leaveQuote
= quoted
.contains(depth
);
391 quoted
.remove(Integer
.valueOf(depth
));
394 if (enterQuote
|| leaveQuote
) {
395 if (currentLine
.length() > 0) {
396 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
397 currentLine
.setLength(currentLine
.length() - 1);
399 for (String l
: currentLine
.toString().split("\n")) {
403 currentLine
.setLength(0);
406 if (node
instanceof Element
) {
407 Element element
= (Element
) node
;
408 boolean block
= element
.isBlock()
409 || element
.tagName().equalsIgnoreCase("br");
410 if (block
&& currentLine
.length() > 0) {
411 currentLine
.append("\n");
413 } else if (node
instanceof TextNode
) {
414 TextNode textNode
= (TextNode
) node
;
415 String line
= StringUtil
.normaliseWhitespace(textNode
418 currentLine
.append(elementProcessor
.processText(line
));
419 currentLine
.append(" ");
424 public void tail(Node node
, int depth
) {
426 }).traverse(element
);
429 if (currentLine
.length() > 0) {
431 for (int i
= 0; i
< quoted
.size(); i
++) {
435 if (currentLine
.length() > 0) {
436 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
437 currentLine
.setLength(currentLine
.length() - 1);
439 for (String l
: currentLine
.toString().split("\n")) {
445 for (int i
= 0; i
< lines
.size(); i
++) {
446 lines
.set(i
, lines
.get(i
).replace(" ", " ").trim());
453 * Reformat the date if possible.
458 * @return the reformatted date, or the same value if it was not parsable
460 static protected String
date(String date
) {
461 SimpleDateFormat out
= new SimpleDateFormat("yyyy/MM/dd");
465 epoch
= Long
.parseLong(date
.trim());
466 } catch (Exception e
) {
471 return out
.format(new Date(1000 * epoch
));
475 Date dat
= new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
477 return out
.format(dat
);
478 } catch (ParseException e
) {