Fix subtitles and too much content in EreNumerique
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
CommitLineData
73785268
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
b34d1f35
NR
4import java.text.ParseException;
5import java.text.SimpleDateFormat;
27008a87 6import java.util.ArrayList;
b34d1f35 7import java.util.Date;
73785268 8import java.util.List;
73785268 9
27008a87
NR
10import org.jsoup.helper.StringUtil;
11import org.jsoup.nodes.Element;
12import org.jsoup.nodes.Node;
13import org.jsoup.nodes.TextNode;
14import org.jsoup.select.Elements;
15import org.jsoup.select.NodeTraversor;
16import org.jsoup.select.NodeVisitor;
17
73785268 18import be.nikiroo.gofetch.data.Story;
136ab801 19import be.nikiroo.utils.Downloader;
73785268 20
b34d1f35
NR
21/**
22 * Base class for website support.
23 *
24 * @author niki
25 */
73785268 26public abstract class BasicSupport {
b34d1f35 27 /** The downloader to use for all websites. */
136ab801
NR
28 protected static Downloader downloader = new Downloader("gofetcher");
29
b34d1f35
NR
/**
 * The kind of support: every supported website is identified by exactly one
 * of these constants.
 *
 * @author niki
 */
public enum Type {
    /**
     * EN: Any, but mostly IT/Sci
     */
    SLASHDOT,

    /**
     * EN: Clone of Slashdot, mostly abandoned
     */
    PIPEDOT,

    /**
     * EN: Linux
     */
    LWN,

    /**
     * FR: Any
     */
    LEMONDE,

    /**
     * EN: IT
     */
    REGISTER,

    /**
     * FR: Linux
     */
    TOO_LINUX,

    /**
     * FR: IT
     */
    ERE_NUMERIQUE
}
51
20217360
NR
52 /**
53 * Used to process an element into lines.
54 *
55 * @author niki
56 */
57 public interface ElementProcessor {
58 /**
59 * Detect if this node is a quote and should be trated as such.
60 *
61 * @param node
62 * the node to check
63 * @return TRUE if it is
64 */
27008a87
NR
65 public boolean detectQuote(Node node);
66
20217360
NR
67 /**
68 * Process text content (will be called on each text element, allowing
69 * you to modify it if needed).
70 *
71 * @param text
72 * the text to process
b34d1f35
NR
73 *
74 * @return the resulting text
20217360 75 */
27008a87
NR
76 public String processText(String text);
77
20217360
NR
78 /**
79 * Ignore this node.
80 *
81 * @param node
82 * the node to ignore
83 * @return TRUE if it has to be ignored
84 */
27008a87 85 public boolean ignoreNode(Node node);
100a8395
NR
86
87 /**
20217360
NR
88 * Manually process this node (and return the manual processing value)
89 * if so desired.
90 * <p>
91 * If the node is manually processed, it and its children will not be
92 * automatically processed.
100a8395
NR
93 *
94 * @param node
95 * the node to optionally process
96 *
20217360
NR
97 * @return NULL if not processed (will thus be automatically processed
98 * as usual), a {@link String} (may be empty) if we process it
99 * manually -- the given {@link String} will be used instead of
100 * the usual automatic processing if not NULL
100a8395
NR
101 */
102 public String manualProcessing(Node node);
b9afb12e
NR
103
104 /**
105 * This {@link Node} is a subtitle and should be treated as such
106 * (highlighted).
107 *
108 * @param node
109 * the node to check
110 *
111 * @return NULL if it is not a subtitle, the subtitle to use if it is
112 */
113 public String isSubtitle(Node node);
27008a87
NR
114 }
115
20217360
NR
116 /**
117 * A default {@link ElementProcessor} (will not detect or process anything
118 * manually).
119 *
120 * @author niki
121 */
122 public class BasicElementProcessor implements ElementProcessor {
123 @Override
124 public boolean detectQuote(Node node) {
125 return false;
126 }
127
128 @Override
129 public String processText(String text) {
130 return text;
131 }
132
133 @Override
134 public boolean ignoreNode(Node node) {
135 return false;
136 }
137
138 @Override
139 public String manualProcessing(Node node) {
140 return null;
141 }
b9afb12e
NR
142
143 @Override
144 public String isSubtitle(Node node) {
145 return null;
146 }
20217360
NR
147 }
148
73785268
NR
149 static private String preselector;
150
151 private Type type;
152
100a8395
NR
153 /**
154 * List all the recent items, but only assure the ID and internal URL to
155 * fetch it later on (until it has been fetched, the rest of the
156 * {@link Story} is not confirmed).
157 *
158 * @return the list of new stories
159 *
160 * @throws IOException
161 * in case of I/O
162 */
73785268
NR
163 abstract public List<Story> list() throws IOException;
164
5c056aad
NR
165 /**
166 * Fetch the full article content as well as all the comments associated to
167 * this {@link Story}, if any (can be empty, but not NULL).
168 *
169 * @param story
170 * the story to fetch the comments of
171 *
172 * @throws IOException
173 * in case of I/O error
174 */
175 abstract public void fetch(Story story) throws IOException;
73785268 176
b34d1f35
NR
177 /**
178 * The website textual description, to add in the dispatcher page.
179 * <p>
180 * Should be short.
181 *
182 * @return the description
183 */
73785268 184 abstract public String getDescription();
2d95a873 185
b34d1f35
NR
186 /**
187 * The gopher "selector" to use for output.
188 * <p>
189 * A kind of "URL path", like "/news/" or "/misc/news/" or...
190 *
191 * @return the selector
192 */
73785268
NR
193 public String getSelector() {
194 return getSelector(type);
195 }
196
b34d1f35
NR
197 /**
198 * The support type.
199 *
200 * @return the type
201 */
73785268
NR
202 public Type getType() {
203 return type;
204 }
205
b34d1f35
NR
206 /**
207 * The support type.
208 *
209 * @param type
210 * the new type
211 */
73785268
NR
212 protected void setType(Type type) {
213 this.type = type;
214 }
215
216 /**
b34d1f35
NR
217 * The {@link String} to append to the selector (the selector will be
218 * constructed as "this string" then "/type/".
219 *
73785268
NR
220 * @param preselector
221 * the preselector to set
222 */
223 static public void setPreselector(String preselector) {
224 BasicSupport.preselector = preselector;
225 }
226
20217360
NR
227 /**
228 * Return a {@link BasicSupport} that is compatible with the given
229 * {@link Type} if it exists (or NULL if not).
230 *
231 * @param type
232 * the type
233 *
234 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
235 */
73785268
NR
236 static public BasicSupport getSupport(Type type) {
237 BasicSupport support = null;
238
239 if (type != null) {
240 switch (type) {
241 case SLASHDOT:
242 support = new Slashdot();
243 break;
2d95a873
NR
244 case PIPEDOT:
245 support = new Pipedot();
246 break;
eaaeae39
NR
247 case LWN:
248 support = new LWN();
249 break;
100a8395
NR
250 case LEMONDE:
251 support = new LeMonde();
252 break;
d28c4aac
NR
253 case REGISTER:
254 support = new TheRegister();
255 break;
b34d1f35 256 case TOO_LINUX:
cd555a1e
NR
257 support = new TooLinux();
258 break;
31755801
NR
259 case ERE_NUMERIQUE:
260 support = new EreNumerique();
261 break;
73785268
NR
262 }
263
264 if (support != null) {
265 support.setType(type);
266 }
267 }
268
269 return support;
270 }
271
b34d1f35
NR
272 /**
273 * The gopher "selector" to use for output for this type, using the
274 * preselector.
275 * <p>
276 * A kind of "URL path", like "/news/" or "/misc/news/" or...
277 *
278 * @param type
279 * the type to get the selector of
280 *
281 * @return the selector
282 */
73785268
NR
283 static public String getSelector(Type type) {
284 return preselector + "/" + type + "/";
285 }
286
27008a87
NR
287 /**
288 * Get the first {@link Element} of the given class, or an empty span
289 * {@link Element} if none found.
290 *
291 * @param element
292 * the element to look in
293 * @param className
294 * the class to look for
295 *
296 * @return the value or an empty span {@link Element}
297 */
298 static protected Element firstOrEmpty(Element element, String className) {
299 Elements subElements = element.getElementsByClass(className);
300 if (subElements.size() > 0) {
301 return subElements.get(0);
302 }
303
304 return new Element("span");
305 }
306
307 /**
308 * Get the first {@link Element} of the given tag, or an empty span
309 * {@link Element} if none found.
310 *
311 * @param element
312 * the element to look in
313 * @param tagName
314 * the tag to look for
315 *
316 * @return the value or an empty span {@link Element}
317 */
318 static protected Element firstOrEmptyTag(Element element, String tagName) {
319 Elements subElements = element.getElementsByTag(tagName);
320 if (subElements.size() > 0) {
321 return subElements.get(0);
322 }
323
324 return new Element("span");
325 }
326
20217360
NR
327 /**
328 * Process the given element into text (each line is a text paragraph and
329 * can be prepended with ">" signs to indicate a quote or sub-quote or
330 * sub-sub-quote...).
331 *
332 * @param element
333 * the element to process
334 * @param elementProcessor
335 * the element processor, must not be NULL
336 *
337 * @return text lines, each line is a paragraph
338 */
27008a87 339 static protected List<String> toLines(Element element,
20217360 340 final ElementProcessor elementProcessor) {
27008a87
NR
341 final List<String> lines = new ArrayList<String>();
342 final StringBuilder currentLine = new StringBuilder();
343 final List<Integer> quoted = new ArrayList<Integer>();
344 final List<Node> ignoredNodes = new ArrayList<Node>();
345
346 if (element != null) {
347 new NodeTraversor(new NodeVisitor() {
348 @Override
349 public void head(Node node, int depth) {
100a8395 350 String manual = null;
20217360 351 boolean ignore = elementProcessor.ignoreNode(node)
100a8395 352 || ignoredNodes.contains(node.parentNode());
b9afb12e 353 // Manual processing
100a8395 354 if (!ignore) {
20217360 355 manual = elementProcessor.manualProcessing(node);
100a8395
NR
356 if (manual != null) {
357 currentLine.append(manual);
358 ignore = true;
359 }
360 }
361
b9afb12e
NR
362 // Subtitle check
363 if (!ignore) {
364 String subtitle = elementProcessor.isSubtitle(node);
365 if (subtitle != null) {
366 subtitle = subtitle.trim();
367 currentLine.append("\n[ " + subtitle + " ]\n");
368 ignore = true;
369 }
370 }
371
100a8395 372 if (ignore) {
27008a87
NR
373 ignoredNodes.add(node);
374 return;
375 }
376
377 String prep = "";
378 for (int i = 0; i < quoted.size(); i++) {
379 prep += ">";
380 }
381 prep += " ";
382
20217360 383 boolean enterQuote = elementProcessor.detectQuote(node);
27008a87
NR
384 boolean leaveQuote = quoted.contains(depth);
385
386 if (enterQuote) {
387 quoted.add(depth);
388 }
389
390 if (leaveQuote) {
391 quoted.remove(Integer.valueOf(depth));
392 }
393
394 if (enterQuote || leaveQuote) {
395 if (currentLine.length() > 0) {
396 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
397 currentLine.setLength(currentLine.length() - 1);
398 }
399 for (String l : currentLine.toString().split("\n")) {
400 lines.add(prep + l);
401 }
402 }
403 currentLine.setLength(0);
404 }
405
406 if (node instanceof Element) {
407 Element element = (Element) node;
408 boolean block = element.isBlock()
409 || element.tagName().equalsIgnoreCase("br");
410 if (block && currentLine.length() > 0) {
411 currentLine.append("\n");
412 }
413 } else if (node instanceof TextNode) {
414 TextNode textNode = (TextNode) node;
415 String line = StringUtil.normaliseWhitespace(textNode
416 .getWholeText());
417
20217360 418 currentLine.append(elementProcessor.processText(line));
27008a87
NR
419 currentLine.append(" ");
420 }
421 }
422
423 @Override
424 public void tail(Node node, int depth) {
425 }
426 }).traverse(element);
427 }
428
429 if (currentLine.length() > 0) {
430 String prep = "";
431 for (int i = 0; i < quoted.size(); i++) {
432 prep += ">";
433 }
434 prep += " ";
435 if (currentLine.length() > 0) {
436 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
437 currentLine.setLength(currentLine.length() - 1);
438 }
439 for (String l : currentLine.toString().split("\n")) {
440 lines.add(prep + l);
441 }
442 }
443 }
444
445 for (int i = 0; i < lines.size(); i++) {
446 lines.set(i, lines.get(i).replace(" ", " ").trim());
447 }
448
b34d1f35
NR
449 return lines;
450 }
451
452 /**
453 * Reformat the date if possible.
454 *
455 * @param date
456 * the input date
457 *
458 * @return the reformated date, or the same value if it was not parsable
459 */
460 static protected String date(String date) {
461 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
462
463 long epoch = 0;
464 try {
c9cffa91 465 epoch = Long.parseLong(date.trim());
b34d1f35
NR
466 } catch (Exception e) {
467 epoch = 0;
880740c4
NR
468 }
469
b34d1f35
NR
470 if (epoch > 0) {
471 return out.format(new Date(1000 * epoch));
472 }
473
474 try {
475 Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
476 .parse(date.trim());
477 return out.format(dat);
478 } catch (ParseException e) {
479 return date;
480 }
27008a87 481 }
73785268 482}