Fix subtitles and too much content in EreNumerique
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.text.ParseException;
5 import java.text.SimpleDateFormat;
6 import java.util.ArrayList;
7 import java.util.Date;
8 import java.util.List;
9
10 import org.jsoup.helper.StringUtil;
11 import org.jsoup.nodes.Element;
12 import org.jsoup.nodes.Node;
13 import org.jsoup.nodes.TextNode;
14 import org.jsoup.select.Elements;
15 import org.jsoup.select.NodeTraversor;
16 import org.jsoup.select.NodeVisitor;
17
18 import be.nikiroo.gofetch.data.Story;
19 import be.nikiroo.utils.Downloader;
20
21 /**
22 * Base class for website support.
23 *
24 * @author niki
25 */
26 public abstract class BasicSupport {
27 /** The downloader to use for all websites. */
28 protected static Downloader downloader = new Downloader("gofetcher");
29
/**
 * The kind of website handled by a support (there is exactly one type per
 * supported website).
 * <p>
 * Each constant is documented with the language and the main topic of the
 * website.
 *
 * @author niki
 */
public enum Type {
    /** English: any topic, but mostly IT and science. */
    SLASHDOT,
    /** English: a clone of Slashdot, mostly abandoned. */
    PIPEDOT,
    /** English: Linux. */
    LWN,
    /** French: any topic. */
    LEMONDE,
    /** English: IT. */
    REGISTER,
    /** French: Linux. */
    TOO_LINUX,
    /** French: IT. */
    ERE_NUMERIQUE
}
51
52 /**
53 * Used to process an element into lines.
54 *
55 * @author niki
56 */
57 public interface ElementProcessor {
58 /**
59 * Detect if this node is a quote and should be trated as such.
60 *
61 * @param node
62 * the node to check
63 * @return TRUE if it is
64 */
65 public boolean detectQuote(Node node);
66
67 /**
68 * Process text content (will be called on each text element, allowing
69 * you to modify it if needed).
70 *
71 * @param text
72 * the text to process
73 *
74 * @return the resulting text
75 */
76 public String processText(String text);
77
78 /**
79 * Ignore this node.
80 *
81 * @param node
82 * the node to ignore
83 * @return TRUE if it has to be ignored
84 */
85 public boolean ignoreNode(Node node);
86
87 /**
88 * Manually process this node (and return the manual processing value)
89 * if so desired.
90 * <p>
91 * If the node is manually processed, it and its children will not be
92 * automatically processed.
93 *
94 * @param node
95 * the node to optionally process
96 *
97 * @return NULL if not processed (will thus be automatically processed
98 * as usual), a {@link String} (may be empty) if we process it
99 * manually -- the given {@link String} will be used instead of
100 * the usual automatic processing if not NULL
101 */
102 public String manualProcessing(Node node);
103
104 /**
105 * This {@link Node} is a subtitle and should be treated as such
106 * (highlighted).
107 *
108 * @param node
109 * the node to check
110 *
111 * @return NULL if it is not a subtitle, the subtitle to use if it is
112 */
113 public String isSubtitle(Node node);
114 }
115
116 /**
117 * A default {@link ElementProcessor} (will not detect or process anything
118 * manually).
119 *
120 * @author niki
121 */
122 public class BasicElementProcessor implements ElementProcessor {
123 @Override
124 public boolean detectQuote(Node node) {
125 return false;
126 }
127
128 @Override
129 public String processText(String text) {
130 return text;
131 }
132
133 @Override
134 public boolean ignoreNode(Node node) {
135 return false;
136 }
137
138 @Override
139 public String manualProcessing(Node node) {
140 return null;
141 }
142
143 @Override
144 public String isSubtitle(Node node) {
145 return null;
146 }
147 }
148
149 static private String preselector;
150
151 private Type type;
152
153 /**
154 * List all the recent items, but only assure the ID and internal URL to
155 * fetch it later on (until it has been fetched, the rest of the
156 * {@link Story} is not confirmed).
157 *
158 * @return the list of new stories
159 *
160 * @throws IOException
161 * in case of I/O
162 */
163 abstract public List<Story> list() throws IOException;
164
165 /**
166 * Fetch the full article content as well as all the comments associated to
167 * this {@link Story}, if any (can be empty, but not NULL).
168 *
169 * @param story
170 * the story to fetch the comments of
171 *
172 * @throws IOException
173 * in case of I/O error
174 */
175 abstract public void fetch(Story story) throws IOException;
176
177 /**
178 * The website textual description, to add in the dispatcher page.
179 * <p>
180 * Should be short.
181 *
182 * @return the description
183 */
184 abstract public String getDescription();
185
186 /**
187 * The gopher "selector" to use for output.
188 * <p>
189 * A kind of "URL path", like "/news/" or "/misc/news/" or...
190 *
191 * @return the selector
192 */
193 public String getSelector() {
194 return getSelector(type);
195 }
196
197 /**
198 * The support type.
199 *
200 * @return the type
201 */
202 public Type getType() {
203 return type;
204 }
205
206 /**
207 * The support type.
208 *
209 * @param type
210 * the new type
211 */
212 protected void setType(Type type) {
213 this.type = type;
214 }
215
216 /**
217 * The {@link String} to append to the selector (the selector will be
218 * constructed as "this string" then "/type/".
219 *
220 * @param preselector
221 * the preselector to set
222 */
223 static public void setPreselector(String preselector) {
224 BasicSupport.preselector = preselector;
225 }
226
227 /**
228 * Return a {@link BasicSupport} that is compatible with the given
229 * {@link Type} if it exists (or NULL if not).
230 *
231 * @param type
232 * the type
233 *
234 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
235 */
236 static public BasicSupport getSupport(Type type) {
237 BasicSupport support = null;
238
239 if (type != null) {
240 switch (type) {
241 case SLASHDOT:
242 support = new Slashdot();
243 break;
244 case PIPEDOT:
245 support = new Pipedot();
246 break;
247 case LWN:
248 support = new LWN();
249 break;
250 case LEMONDE:
251 support = new LeMonde();
252 break;
253 case REGISTER:
254 support = new TheRegister();
255 break;
256 case TOO_LINUX:
257 support = new TooLinux();
258 break;
259 case ERE_NUMERIQUE:
260 support = new EreNumerique();
261 break;
262 }
263
264 if (support != null) {
265 support.setType(type);
266 }
267 }
268
269 return support;
270 }
271
272 /**
273 * The gopher "selector" to use for output for this type, using the
274 * preselector.
275 * <p>
276 * A kind of "URL path", like "/news/" or "/misc/news/" or...
277 *
278 * @param type
279 * the type to get the selector of
280 *
281 * @return the selector
282 */
283 static public String getSelector(Type type) {
284 return preselector + "/" + type + "/";
285 }
286
287 /**
288 * Get the first {@link Element} of the given class, or an empty span
289 * {@link Element} if none found.
290 *
291 * @param element
292 * the element to look in
293 * @param className
294 * the class to look for
295 *
296 * @return the value or an empty span {@link Element}
297 */
298 static protected Element firstOrEmpty(Element element, String className) {
299 Elements subElements = element.getElementsByClass(className);
300 if (subElements.size() > 0) {
301 return subElements.get(0);
302 }
303
304 return new Element("span");
305 }
306
307 /**
308 * Get the first {@link Element} of the given tag, or an empty span
309 * {@link Element} if none found.
310 *
311 * @param element
312 * the element to look in
313 * @param tagName
314 * the tag to look for
315 *
316 * @return the value or an empty span {@link Element}
317 */
318 static protected Element firstOrEmptyTag(Element element, String tagName) {
319 Elements subElements = element.getElementsByTag(tagName);
320 if (subElements.size() > 0) {
321 return subElements.get(0);
322 }
323
324 return new Element("span");
325 }
326
327 /**
328 * Process the given element into text (each line is a text paragraph and
329 * can be prepended with ">" signs to indicate a quote or sub-quote or
330 * sub-sub-quote...).
331 *
332 * @param element
333 * the element to process
334 * @param elementProcessor
335 * the element processor, must not be NULL
336 *
337 * @return text lines, each line is a paragraph
338 */
339 static protected List<String> toLines(Element element,
340 final ElementProcessor elementProcessor) {
341 final List<String> lines = new ArrayList<String>();
342 final StringBuilder currentLine = new StringBuilder();
343 final List<Integer> quoted = new ArrayList<Integer>();
344 final List<Node> ignoredNodes = new ArrayList<Node>();
345
346 if (element != null) {
347 new NodeTraversor(new NodeVisitor() {
348 @Override
349 public void head(Node node, int depth) {
350 String manual = null;
351 boolean ignore = elementProcessor.ignoreNode(node)
352 || ignoredNodes.contains(node.parentNode());
353 // Manual processing
354 if (!ignore) {
355 manual = elementProcessor.manualProcessing(node);
356 if (manual != null) {
357 currentLine.append(manual);
358 ignore = true;
359 }
360 }
361
362 // Subtitle check
363 if (!ignore) {
364 String subtitle = elementProcessor.isSubtitle(node);
365 if (subtitle != null) {
366 subtitle = subtitle.trim();
367 currentLine.append("\n[ " + subtitle + " ]\n");
368 ignore = true;
369 }
370 }
371
372 if (ignore) {
373 ignoredNodes.add(node);
374 return;
375 }
376
377 String prep = "";
378 for (int i = 0; i < quoted.size(); i++) {
379 prep += ">";
380 }
381 prep += " ";
382
383 boolean enterQuote = elementProcessor.detectQuote(node);
384 boolean leaveQuote = quoted.contains(depth);
385
386 if (enterQuote) {
387 quoted.add(depth);
388 }
389
390 if (leaveQuote) {
391 quoted.remove(Integer.valueOf(depth));
392 }
393
394 if (enterQuote || leaveQuote) {
395 if (currentLine.length() > 0) {
396 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
397 currentLine.setLength(currentLine.length() - 1);
398 }
399 for (String l : currentLine.toString().split("\n")) {
400 lines.add(prep + l);
401 }
402 }
403 currentLine.setLength(0);
404 }
405
406 if (node instanceof Element) {
407 Element element = (Element) node;
408 boolean block = element.isBlock()
409 || element.tagName().equalsIgnoreCase("br");
410 if (block && currentLine.length() > 0) {
411 currentLine.append("\n");
412 }
413 } else if (node instanceof TextNode) {
414 TextNode textNode = (TextNode) node;
415 String line = StringUtil.normaliseWhitespace(textNode
416 .getWholeText());
417
418 currentLine.append(elementProcessor.processText(line));
419 currentLine.append(" ");
420 }
421 }
422
423 @Override
424 public void tail(Node node, int depth) {
425 }
426 }).traverse(element);
427 }
428
429 if (currentLine.length() > 0) {
430 String prep = "";
431 for (int i = 0; i < quoted.size(); i++) {
432 prep += ">";
433 }
434 prep += " ";
435 if (currentLine.length() > 0) {
436 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
437 currentLine.setLength(currentLine.length() - 1);
438 }
439 for (String l : currentLine.toString().split("\n")) {
440 lines.add(prep + l);
441 }
442 }
443 }
444
445 for (int i = 0; i < lines.size(); i++) {
446 lines.set(i, lines.get(i).replace(" ", " ").trim());
447 }
448
449 return lines;
450 }
451
452 /**
453 * Reformat the date if possible.
454 *
455 * @param date
456 * the input date
457 *
458 * @return the reformated date, or the same value if it was not parsable
459 */
460 static protected String date(String date) {
461 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
462
463 long epoch = 0;
464 try {
465 epoch = Long.parseLong(date.trim());
466 } catch (Exception e) {
467 epoch = 0;
468 }
469
470 if (epoch > 0) {
471 return out.format(new Date(1000 * epoch));
472 }
473
474 try {
475 Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
476 .parse(date.trim());
477 return out.format(dat);
478 } catch (ParseException e) {
479 return date;
480 }
481 }
482 }