New website supported: Ère Numérique FR
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
CommitLineData
73785268
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
b34d1f35
NR
4import java.text.ParseException;
5import java.text.SimpleDateFormat;
27008a87 6import java.util.ArrayList;
b34d1f35 7import java.util.Date;
73785268 8import java.util.List;
73785268 9
27008a87
NR
10import org.jsoup.helper.StringUtil;
11import org.jsoup.nodes.Element;
12import org.jsoup.nodes.Node;
13import org.jsoup.nodes.TextNode;
14import org.jsoup.select.Elements;
15import org.jsoup.select.NodeTraversor;
16import org.jsoup.select.NodeVisitor;
17
73785268 18import be.nikiroo.gofetch.data.Story;
136ab801 19import be.nikiroo.utils.Downloader;
73785268 20
b34d1f35
NR
21/**
22 * Base class for website support.
23 *
24 * @author niki
25 */
73785268 26public abstract class BasicSupport {
b34d1f35 27 /** The downloader to use for all websites. */
136ab801
NR
28 protected static Downloader downloader = new Downloader("gofetcher");
29
b34d1f35
NR
/**
 * The kind of website handled by a support class: every supported website
 * is identified by exactly one constant of this enumeration.
 *
 * @author niki
 */
public enum Type {
    /**
     * English: any subject, but mostly IT and science.
     */
    SLASHDOT,
    /**
     * English: a clone of Slashdot, mostly abandoned.
     */
    PIPEDOT,
    /**
     * English: Linux.
     */
    LWN,
    /**
     * French: any subject.
     */
    LEMONDE,
    /**
     * English: IT.
     */
    REGISTER,
    /**
     * French: Linux.
     */
    TOO_LINUX,
    /**
     * French: IT.
     */
    ERE_NUMERIQUE,
}
51
20217360
NR
52 /**
53 * Used to process an element into lines.
54 *
55 * @author niki
56 */
57 public interface ElementProcessor {
58 /**
59 * Detect if this node is a quote and should be trated as such.
60 *
61 * @param node
62 * the node to check
63 * @return TRUE if it is
64 */
27008a87
NR
65 public boolean detectQuote(Node node);
66
20217360
NR
67 /**
68 * Process text content (will be called on each text element, allowing
69 * you to modify it if needed).
70 *
71 * @param text
72 * the text to process
b34d1f35
NR
73 *
74 * @return the resulting text
20217360 75 */
27008a87
NR
76 public String processText(String text);
77
20217360
NR
78 /**
79 * Ignore this node.
80 *
81 * @param node
82 * the node to ignore
83 * @return TRUE if it has to be ignored
84 */
27008a87 85 public boolean ignoreNode(Node node);
100a8395
NR
86
87 /**
20217360
NR
88 * Manually process this node (and return the manual processing value)
89 * if so desired.
90 * <p>
91 * If the node is manually processed, it and its children will not be
92 * automatically processed.
100a8395
NR
93 *
94 * @param node
95 * the node to optionally process
96 *
20217360
NR
97 * @return NULL if not processed (will thus be automatically processed
98 * as usual), a {@link String} (may be empty) if we process it
99 * manually -- the given {@link String} will be used instead of
100 * the usual automatic processing if not NULL
100a8395
NR
101 */
102 public String manualProcessing(Node node);
27008a87
NR
103 }
104
20217360
NR
105 /**
106 * A default {@link ElementProcessor} (will not detect or process anything
107 * manually).
108 *
109 * @author niki
110 */
111 public class BasicElementProcessor implements ElementProcessor {
112 @Override
113 public boolean detectQuote(Node node) {
114 return false;
115 }
116
117 @Override
118 public String processText(String text) {
119 return text;
120 }
121
122 @Override
123 public boolean ignoreNode(Node node) {
124 return false;
125 }
126
127 @Override
128 public String manualProcessing(Node node) {
129 return null;
130 }
131 }
132
73785268
NR
133 static private String preselector;
134
135 private Type type;
136
100a8395
NR
137 /**
138 * List all the recent items, but only assure the ID and internal URL to
139 * fetch it later on (until it has been fetched, the rest of the
140 * {@link Story} is not confirmed).
141 *
142 * @return the list of new stories
143 *
144 * @throws IOException
145 * in case of I/O
146 */
73785268
NR
147 abstract public List<Story> list() throws IOException;
148
5c056aad
NR
149 /**
150 * Fetch the full article content as well as all the comments associated to
151 * this {@link Story}, if any (can be empty, but not NULL).
152 *
153 * @param story
154 * the story to fetch the comments of
155 *
156 * @throws IOException
157 * in case of I/O error
158 */
159 abstract public void fetch(Story story) throws IOException;
73785268 160
b34d1f35
NR
161 /**
162 * The website textual description, to add in the dispatcher page.
163 * <p>
164 * Should be short.
165 *
166 * @return the description
167 */
73785268 168 abstract public String getDescription();
2d95a873 169
b34d1f35
NR
170 /**
171 * The gopher "selector" to use for output.
172 * <p>
173 * A kind of "URL path", like "/news/" or "/misc/news/" or...
174 *
175 * @return the selector
176 */
73785268
NR
177 public String getSelector() {
178 return getSelector(type);
179 }
180
b34d1f35
NR
181 /**
182 * The support type.
183 *
184 * @return the type
185 */
73785268
NR
186 public Type getType() {
187 return type;
188 }
189
b34d1f35
NR
190 /**
191 * The support type.
192 *
193 * @param type
194 * the new type
195 */
73785268
NR
196 protected void setType(Type type) {
197 this.type = type;
198 }
199
200 /**
b34d1f35
NR
201 * The {@link String} to append to the selector (the selector will be
202 * constructed as "this string" then "/type/".
203 *
73785268
NR
204 * @param preselector
205 * the preselector to set
206 */
207 static public void setPreselector(String preselector) {
208 BasicSupport.preselector = preselector;
209 }
210
20217360
NR
211 /**
212 * Return a {@link BasicSupport} that is compatible with the given
213 * {@link Type} if it exists (or NULL if not).
214 *
215 * @param type
216 * the type
217 *
218 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
219 */
73785268
NR
220 static public BasicSupport getSupport(Type type) {
221 BasicSupport support = null;
222
223 if (type != null) {
224 switch (type) {
225 case SLASHDOT:
226 support = new Slashdot();
227 break;
2d95a873
NR
228 case PIPEDOT:
229 support = new Pipedot();
230 break;
eaaeae39
NR
231 case LWN:
232 support = new LWN();
233 break;
100a8395
NR
234 case LEMONDE:
235 support = new LeMonde();
236 break;
d28c4aac
NR
237 case REGISTER:
238 support = new TheRegister();
239 break;
b34d1f35 240 case TOO_LINUX:
cd555a1e
NR
241 support = new TooLinux();
242 break;
31755801
NR
243 case ERE_NUMERIQUE:
244 support = new EreNumerique();
245 break;
73785268
NR
246 }
247
248 if (support != null) {
249 support.setType(type);
250 }
251 }
252
253 return support;
254 }
255
b34d1f35
NR
256 /**
257 * The gopher "selector" to use for output for this type, using the
258 * preselector.
259 * <p>
260 * A kind of "URL path", like "/news/" or "/misc/news/" or...
261 *
262 * @param type
263 * the type to get the selector of
264 *
265 * @return the selector
266 */
73785268
NR
267 static public String getSelector(Type type) {
268 return preselector + "/" + type + "/";
269 }
270
27008a87
NR
271 /**
272 * Get the first {@link Element} of the given class, or an empty span
273 * {@link Element} if none found.
274 *
275 * @param element
276 * the element to look in
277 * @param className
278 * the class to look for
279 *
280 * @return the value or an empty span {@link Element}
281 */
282 static protected Element firstOrEmpty(Element element, String className) {
283 Elements subElements = element.getElementsByClass(className);
284 if (subElements.size() > 0) {
285 return subElements.get(0);
286 }
287
288 return new Element("span");
289 }
290
291 /**
292 * Get the first {@link Element} of the given tag, or an empty span
293 * {@link Element} if none found.
294 *
295 * @param element
296 * the element to look in
297 * @param tagName
298 * the tag to look for
299 *
300 * @return the value or an empty span {@link Element}
301 */
302 static protected Element firstOrEmptyTag(Element element, String tagName) {
303 Elements subElements = element.getElementsByTag(tagName);
304 if (subElements.size() > 0) {
305 return subElements.get(0);
306 }
307
308 return new Element("span");
309 }
310
20217360
NR
311 /**
312 * Process the given element into text (each line is a text paragraph and
313 * can be prepended with ">" signs to indicate a quote or sub-quote or
314 * sub-sub-quote...).
315 *
316 * @param element
317 * the element to process
318 * @param elementProcessor
319 * the element processor, must not be NULL
320 *
321 * @return text lines, each line is a paragraph
322 */
27008a87 323 static protected List<String> toLines(Element element,
20217360 324 final ElementProcessor elementProcessor) {
27008a87
NR
325 final List<String> lines = new ArrayList<String>();
326 final StringBuilder currentLine = new StringBuilder();
327 final List<Integer> quoted = new ArrayList<Integer>();
328 final List<Node> ignoredNodes = new ArrayList<Node>();
329
330 if (element != null) {
331 new NodeTraversor(new NodeVisitor() {
332 @Override
333 public void head(Node node, int depth) {
100a8395 334 String manual = null;
20217360 335 boolean ignore = elementProcessor.ignoreNode(node)
100a8395
NR
336 || ignoredNodes.contains(node.parentNode());
337 if (!ignore) {
20217360 338 manual = elementProcessor.manualProcessing(node);
100a8395
NR
339 if (manual != null) {
340 currentLine.append(manual);
341 ignore = true;
342 }
343 }
344
345 if (ignore) {
27008a87
NR
346 ignoredNodes.add(node);
347 return;
348 }
349
350 String prep = "";
351 for (int i = 0; i < quoted.size(); i++) {
352 prep += ">";
353 }
354 prep += " ";
355
20217360 356 boolean enterQuote = elementProcessor.detectQuote(node);
27008a87
NR
357 boolean leaveQuote = quoted.contains(depth);
358
359 if (enterQuote) {
360 quoted.add(depth);
361 }
362
363 if (leaveQuote) {
364 quoted.remove(Integer.valueOf(depth));
365 }
366
367 if (enterQuote || leaveQuote) {
368 if (currentLine.length() > 0) {
369 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
370 currentLine.setLength(currentLine.length() - 1);
371 }
372 for (String l : currentLine.toString().split("\n")) {
373 lines.add(prep + l);
374 }
375 }
376 currentLine.setLength(0);
377 }
378
379 if (node instanceof Element) {
380 Element element = (Element) node;
381 boolean block = element.isBlock()
382 || element.tagName().equalsIgnoreCase("br");
383 if (block && currentLine.length() > 0) {
384 currentLine.append("\n");
385 }
386 } else if (node instanceof TextNode) {
387 TextNode textNode = (TextNode) node;
388 String line = StringUtil.normaliseWhitespace(textNode
389 .getWholeText());
390
20217360 391 currentLine.append(elementProcessor.processText(line));
27008a87
NR
392 currentLine.append(" ");
393 }
394 }
395
396 @Override
397 public void tail(Node node, int depth) {
398 }
399 }).traverse(element);
400 }
401
402 if (currentLine.length() > 0) {
403 String prep = "";
404 for (int i = 0; i < quoted.size(); i++) {
405 prep += ">";
406 }
407 prep += " ";
408 if (currentLine.length() > 0) {
409 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
410 currentLine.setLength(currentLine.length() - 1);
411 }
412 for (String l : currentLine.toString().split("\n")) {
413 lines.add(prep + l);
414 }
415 }
416 }
417
418 for (int i = 0; i < lines.size(); i++) {
419 lines.set(i, lines.get(i).replace(" ", " ").trim());
420 }
421
b34d1f35
NR
422 return lines;
423 }
424
425 /**
426 * Reformat the date if possible.
427 *
428 * @param date
429 * the input date
430 *
431 * @return the reformated date, or the same value if it was not parsable
432 */
433 static protected String date(String date) {
434 SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd");
435
436 long epoch = 0;
437 try {
438 epoch = Long.parseLong(date);
439 } catch (Exception e) {
440 epoch = 0;
880740c4
NR
441 }
442
b34d1f35
NR
443 if (epoch > 0) {
444 return out.format(new Date(1000 * epoch));
445 }
446
447 try {
448 Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX")
449 .parse(date.trim());
450 return out.format(dat);
451 } catch (ParseException e) {
452 return date;
453 }
27008a87 454 }
73785268 455}