6d930f6f4320bbeae13a14d098f846fa289fd96b
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.text.SimpleDateFormat;
6 import java.util.ArrayList;
7 import java.util.Date;
8 import java.util.List;
9
10 import org.jsoup.helper.StringUtil;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.nodes.TextNode;
14 import org.jsoup.select.Elements;
15 import org.jsoup.select.NodeTraversor;
16 import org.jsoup.select.NodeVisitor;
17
18 import be.nikiroo.gofetch.data.Story;
19 import be.nikiroo.utils.Downloader;
20
21 /**
22 * Base class for website support.
23 *
24 * @author niki
25 */
26 public abstract class BasicSupport {
27 /** The downloader to use for all websites. */
28 protected static Downloader downloader = new Downloader("gofetcher");
29
/**
 * The kind of website a support handles (every supported website has
 * exactly one constant here).
 *
 * @author niki
 */
public enum Type {
    /** EN: any topic, but mostly IT/Sci */
    SLASHDOT,

    /** EN: clone of Slashdot, mostly abandoned */
    PIPEDOT,

    /** EN: Linux */
    LWN,

    /** FR: any topic */
    LEMONDE,

    /** EN: IT */
    REGISTER,

    /** FR: Linux */
    TOO_LINUX
}
49
50 /**
51 * Used to process an element into lines.
52 *
53 * @author niki
54 */
55 public interface ElementProcessor {
56 /**
57 * Detect if this node is a quote and should be trated as such.
58 *
59 * @param node
60 * the node to check
61 * @return TRUE if it is
62 */
63 public boolean detectQuote(Node node);
64
65 /**
66 * Process text content (will be called on each text element, allowing
67 * you to modify it if needed).
68 *
69 * @param text
70 * the text to process
71 *
72 * @return the resulting text
73 */
74 public String processText(String text);
75
76 /**
77 * Ignore this node.
78 *
79 * @param node
80 * the node to ignore
81 * @return TRUE if it has to be ignored
82 */
83 public boolean ignoreNode(Node node);
84
85 /**
86 * Manually process this node (and return the manual processing value)
87 * if so desired.
88 * <p>
89 * If the node is manually processed, it and its children will not be
90 * automatically processed.
91 *
92 * @param node
93 * the node to optionally process
94 *
95 * @return NULL if not processed (will thus be automatically processed
96 * as usual), a {@link String} (may be empty) if we process it
97 * manually -- the given {@link String} will be used instead of
98 * the usual automatic processing if not NULL
99 */
100 public String manualProcessing(Node node);
101 }
102
103 /**
104 * A default {@link ElementProcessor} (will not detect or process anything
105 * manually).
106 *
107 * @author niki
108 */
109 public class BasicElementProcessor implements ElementProcessor {
110 @Override
111 public boolean detectQuote(Node node) {
112 return false;
113 }
114
115 @Override
116 public String processText(String text) {
117 return text;
118 }
119
120 @Override
121 public boolean ignoreNode(Node node) {
122 return false;
123 }
124
125 @Override
126 public String manualProcessing(Node node) {
127 return null;
128 }
129 }
130
131 static private String preselector;
132
133 private Type type;
134
135 /**
136 * List all the recent items, but only assure the ID and internal URL to
137 * fetch it later on (until it has been fetched, the rest of the
138 * {@link Story} is not confirmed).
139 *
140 * @return the list of new stories
141 *
142 * @throws IOException
143 * in case of I/O
144 */
145 abstract public List<Story> list() throws IOException;
146
147 /**
148 * Fetch the full article content as well as all the comments associated to
149 * this {@link Story}, if any (can be empty, but not NULL).
150 *
151 * @param story
152 * the story to fetch the comments of
153 *
154 * @throws IOException
155 * in case of I/O error
156 */
157 abstract public void fetch(Story story) throws IOException;
158
159 /**
160 * The website textual description, to add in the dispatcher page.
161 * <p>
162 * Should be short.
163 *
164 * @return the description
165 */
166 abstract public String getDescription();
167
168 /**
169 * The gopher "selector" to use for output.
170 * <p>
171 * A kind of "URL path", like "/news/" or "/misc/news/" or...
172 *
173 * @return the selector
174 */
175 public String getSelector() {
176 return getSelector(type);
177 }
178
179 /**
180 * The support type.
181 *
182 * @return the type
183 */
184 public Type getType() {
185 return type;
186 }
187
188 /**
189 * The support type.
190 *
191 * @param type
192 * the new type
193 */
194 protected void setType(Type type) {
195 this.type = type;
196 }
197
198 /**
199 * The {@link String} to append to the selector (the selector will be
200 * constructed as "this string" then "/type/".
201 *
202 * @param preselector
203 * the preselector to set
204 */
205 static public void setPreselector(String preselector) {
206 BasicSupport.preselector = preselector;
207 }
208
209 /**
210 * Return a {@link BasicSupport} that is compatible with the given
211 * {@link Type} if it exists (or NULL if not).
212 *
213 * @param type
214 * the type
215 *
216 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
217 */
218 static public BasicSupport getSupport(Type type) {
219 BasicSupport support = null;
220
221 if (type != null) {
222 switch (type) {
223 case SLASHDOT:
224 support = new Slashdot();
225 break;
226 case PIPEDOT:
227 support = new Pipedot();
228 break;
229 case LWN:
230 support = new LWN();
231 break;
232 case LEMONDE:
233 support = new LeMonde();
234 break;
235 case REGISTER:
236 support = new TheRegister();
237 break;
238 case TOO_LINUX:
239 support = new TooLinux();
240 break;
241 }
242
243 if (support != null) {
244 support.setType(type);
245 }
246 }
247
248 return support;
249 }
250
251 /**
252 * The gopher "selector" to use for output for this type, using the
253 * preselector.
254 * <p>
255 * A kind of "URL path", like "/news/" or "/misc/news/" or...
256 *
257 * @param type
258 * the type to get the selector of
259 *
260 * @return the selector
261 */
262 static public String getSelector(Type type) {
263 return preselector + "/" + type + "/";
264 }
265
266 /**
267 * Get the first {@link Element} of the given class, or an empty span
268 * {@link Element} if none found.
269 *
270 * @param element
271 * the element to look in
272 * @param className
273 * the class to look for
274 *
275 * @return the value or an empty span {@link Element}
276 */
277 static protected Element firstOrEmpty(Element element, String className) {
278 Elements subElements = element.getElementsByClass(className);
279 if (subElements.size() > 0) {
280 return subElements.get(0);
281 }
282
283 return new Element("span");
284 }
285
286 /**
287 * Get the first {@link Element} of the given tag, or an empty span
288 * {@link Element} if none found.
289 *
290 * @param element
291 * the element to look in
292 * @param tagName
293 * the tag to look for
294 *
295 * @return the value or an empty span {@link Element}
296 */
297 static protected Element firstOrEmptyTag(Element element, String tagName) {
298 Elements subElements = element.getElementsByTag(tagName);
299 if (subElements.size() > 0) {
300 return subElements.get(0);
301 }
302
303 return new Element("span");
304 }
305
306 /**
307 * Process the given element into text (each line is a text paragraph and
308 * can be prepended with ">" signs to indicate a quote or sub-quote or
309 * sub-sub-quote...).
310 *
311 * @param element
312 * the element to process
313 * @param elementProcessor
314 * the element processor, must not be NULL
315 *
316 * @return text lines, each line is a paragraph
317 */
318 static protected List<String> toLines(Element element,
319 final ElementProcessor elementProcessor) {
320 final List<String> lines = new ArrayList<String>();
321 final StringBuilder currentLine = new StringBuilder();
322 final List<Integer> quoted = new ArrayList<Integer>();
323 final List<Node> ignoredNodes = new ArrayList<Node>();
324
325 if (element != null) {
326 new NodeTraversor(new NodeVisitor() {
327 @Override
328 public void head(Node node, int depth) {
329 String manual = null;
330 boolean ignore = elementProcessor.ignoreNode(node)
331 || ignoredNodes.contains(node.parentNode());
332 if (!ignore) {
333 manual = elementProcessor.manualProcessing(node);
334 if (manual != null) {
335 currentLine.append(manual);
336 ignore = true;
337 }
338 }
339
340 if (ignore) {
341 ignoredNodes.add(node);
342 return;
343 }
344
345 String prep = "";
346 for (int i = 0; i < quoted.size(); i++) {
347 prep += ">";
348 }
349 prep += " ";
350
351 boolean enterQuote = elementProcessor.detectQuote(node);
352 boolean leaveQuote = quoted.contains(depth);
353
354 if (enterQuote) {
355 quoted.add(depth);
356 }
357
358 if (leaveQuote) {
359 quoted.remove(Integer.valueOf(depth));
360 }
361
362 if (enterQuote || leaveQuote) {
363 if (currentLine.length() > 0) {
364 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
365 currentLine.setLength(currentLine.length() - 1);
366 }
367 for (String l : currentLine.toString().split("\n")) {
368 lines.add(prep + l);
369 }
370 }
371 currentLine.setLength(0);
372 }
373
374 if (node instanceof Element) {
375 Element element = (Element) node;
376 boolean block = element.isBlock()
377 || element.tagName().equalsIgnoreCase("br");
378 if (block && currentLine.length() > 0) {
379 currentLine.append("\n");
380 }
381 } else if (node instanceof TextNode) {
382 TextNode textNode = (TextNode) node;
383 String line = StringUtil.normaliseWhitespace(textNode
384 .getWholeText());
385
386 currentLine.append(elementProcessor.processText(line));
387 currentLine.append(" ");
388 }
389 }
390
391 @Override
392 public void tail(Node node, int depth) {
393 }
394 }).traverse(element);
395 }
396
397 if (currentLine.length() > 0) {
398 String prep = "";
399 for (int i = 0; i < quoted.size(); i++) {
400 prep += ">";
401 }
402 prep += " ";
403 if (currentLine.length() > 0) {
404 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
405 currentLine.setLength(currentLine.length() - 1);
406 }
407 for (String l : currentLine.toString().split("\n")) {
408 lines.add(prep + l);
409 }
410 }
411 }
412
413 for (int i = 0; i < lines.size(); i++) {
414 lines.set(i, lines.get(i).replace(" ", " ").trim());
415 }
416
417 return lines;
418 }
419
420 /**
421 * Reformat the date if possible.
422 *
423 * @param date
424 * the input date
425 *
426 * @return the reformated date, or the same value if it was not parsable
427 */
428 static protected String date(String date) {
429 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
430
431 long epoch = 0;
432 try {
433 epoch = Long.parseLong(date);
434 } catch (Exception e) {
435 epoch = 0;
436 }
437
438 if (epoch > 0) {
439 return out.format(new Date(1000 * epoch));
440 }
441
442 try {
443 Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
444 .parse(date.trim());
445 return out.format(dat);
446 } catch (ParseException e) {
447 return date;
448 }
449 }
450 }