Commit | Line | Data |
---|---|---|
73785268 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
b34d1f35 NR |
4 | import java.text.ParseException; |
5 | import java.text.SimpleDateFormat; | |
27008a87 | 6 | import java.util.ArrayList; |
b34d1f35 | 7 | import java.util.Date; |
73785268 | 8 | import java.util.List; |
73785268 | 9 | |
27008a87 NR |
10 | import org.jsoup.helper.StringUtil; |
11 | import org.jsoup.nodes.Element; | |
12 | import org.jsoup.nodes.Node; | |
13 | import org.jsoup.nodes.TextNode; | |
14 | import org.jsoup.select.Elements; | |
15 | import org.jsoup.select.NodeTraversor; | |
16 | import org.jsoup.select.NodeVisitor; | |
17 | ||
73785268 | 18 | import be.nikiroo.gofetch.data.Story; |
136ab801 | 19 | import be.nikiroo.utils.Downloader; |
73785268 | 20 | |
b34d1f35 NR |
21 | /** |
22 | * Base class for website support. | |
23 | * | |
24 | * @author niki | |
25 | */ | |
73785268 | 26 | public abstract class BasicSupport { |
b34d1f35 | 27 | /** The downloader to use for all websites. */ |
136ab801 NR |
28 | protected static Downloader downloader = new Downloader("gofetcher"); |
29 | ||
b34d1f35 NR |
/**
 * The support type (each website we support has a single type).
 * <p>
 * {@link BasicSupport#getSupport(Type)} maps each value to its
 * implementation, and the name of the value is used as the last part of
 * the selector built by {@link BasicSupport#getSelector(Type)}.
 *
 * @author niki
 */
public enum Type {
    /** EN: Any, but mostly IT/Sci */
    SLASHDOT,
    /** EN: Clone of Slashdot, mostly abandoned */
    PIPEDOT,
    /** EN: Linux */
    LWN,
    /** FR: Any */
    LEMONDE,
    /** EN: IT */
    REGISTER,
    /** FR: Linux */
    TOO_LINUX,
    /** FR: IT */
    ERE_NUMERIQUE,
}
51 | ||
20217360 NR |
52 | /** |
53 | * Used to process an element into lines. | |
54 | * | |
55 | * @author niki | |
56 | */ | |
57 | public interface ElementProcessor { | |
58 | /** | |
59 | * Detect if this node is a quote and should be trated as such. | |
60 | * | |
61 | * @param node | |
62 | * the node to check | |
63 | * @return TRUE if it is | |
64 | */ | |
27008a87 NR |
65 | public boolean detectQuote(Node node); |
66 | ||
20217360 NR |
67 | /** |
68 | * Process text content (will be called on each text element, allowing | |
69 | * you to modify it if needed). | |
70 | * | |
71 | * @param text | |
72 | * the text to process | |
b34d1f35 NR |
73 | * |
74 | * @return the resulting text | |
20217360 | 75 | */ |
27008a87 NR |
76 | public String processText(String text); |
77 | ||
20217360 NR |
78 | /** |
79 | * Ignore this node. | |
80 | * | |
81 | * @param node | |
82 | * the node to ignore | |
83 | * @return TRUE if it has to be ignored | |
84 | */ | |
27008a87 | 85 | public boolean ignoreNode(Node node); |
100a8395 NR |
86 | |
87 | /** | |
20217360 NR |
88 | * Manually process this node (and return the manual processing value) |
89 | * if so desired. | |
90 | * <p> | |
91 | * If the node is manually processed, it and its children will not be | |
92 | * automatically processed. | |
100a8395 NR |
93 | * |
94 | * @param node | |
95 | * the node to optionally process | |
96 | * | |
20217360 NR |
97 | * @return NULL if not processed (will thus be automatically processed |
98 | * as usual), a {@link String} (may be empty) if we process it | |
99 | * manually -- the given {@link String} will be used instead of | |
100 | * the usual automatic processing if not NULL | |
100a8395 NR |
101 | */ |
102 | public String manualProcessing(Node node); | |
b9afb12e NR |
103 | |
104 | /** | |
105 | * This {@link Node} is a subtitle and should be treated as such | |
106 | * (highlighted). | |
107 | * | |
108 | * @param node | |
109 | * the node to check | |
110 | * | |
111 | * @return NULL if it is not a subtitle, the subtitle to use if it is | |
112 | */ | |
113 | public String isSubtitle(Node node); | |
27008a87 NR |
114 | } |
115 | ||
20217360 NR |
116 | /** |
117 | * A default {@link ElementProcessor} (will not detect or process anything | |
118 | * manually). | |
119 | * | |
120 | * @author niki | |
121 | */ | |
122 | public class BasicElementProcessor implements ElementProcessor { | |
123 | @Override | |
124 | public boolean detectQuote(Node node) { | |
125 | return false; | |
126 | } | |
127 | ||
128 | @Override | |
129 | public String processText(String text) { | |
130 | return text; | |
131 | } | |
132 | ||
133 | @Override | |
134 | public boolean ignoreNode(Node node) { | |
135 | return false; | |
136 | } | |
137 | ||
138 | @Override | |
139 | public String manualProcessing(Node node) { | |
140 | return null; | |
141 | } | |
b9afb12e NR |
142 | |
143 | @Override | |
144 | public String isSubtitle(Node node) { | |
145 | return null; | |
146 | } | |
20217360 NR |
147 | } |
148 | ||
73785268 NR |
149 | static private String preselector; |
150 | ||
151 | private Type type; | |
152 | ||
100a8395 NR |
/**
 * List all the recent items, but only guarantee the ID and the internal URL
 * needed to fetch each of them later on (until a {@link Story} has been
 * fetched, the rest of its content is not confirmed).
 *
 * @return the list of new stories
 *
 * @throws IOException
 *             in case of I/O error
 */
abstract public List<Story> list() throws IOException;
164 | ||
5c056aad NR |
/**
 * Fetch the full article content as well as all the comments associated to
 * this {@link Story}, if any (can be empty, but not NULL).
 *
 * @param story
 *            the story to fetch the content and comments of
 *
 * @throws IOException
 *             in case of I/O error
 */
abstract public void fetch(Story story) throws IOException;
73785268 | 176 | |
b34d1f35 NR |
/**
 * The textual description of the website, to add in the dispatcher page.
 * <p>
 * Should be short.
 *
 * @return the description
 */
abstract public String getDescription();
2d95a873 | 185 | |
b34d1f35 NR |
/**
 * The gopher "selector" to use for output.
 * <p>
 * A kind of "URL path", like "/news/" or "/misc/news/" or...
 * <p>
 * This is the selector of the current type of this support (see
 * {@link BasicSupport#getSelector(Type)}).
 *
 * @return the selector
 */
public String getSelector() {
    return getSelector(type);
}
196 | ||
b34d1f35 NR |
/**
 * The support type.
 *
 * @return the type (as set by {@link BasicSupport#setType(Type)})
 */
public Type getType() {
    return type;
}
205 | ||
b34d1f35 NR |
/**
 * Set the support type.
 *
 * @param type
 *            the new type
 */
protected void setType(Type type) {
    this.type = type;
}
215 | ||
/**
 * The {@link String} to prepend to the selector (the selector will be
 * constructed as "this string" then "/type/").
 * <p>
 * The value is static: it is shared by all the supports.
 *
 * @param preselector
 *            the preselector to set
 */
static public void setPreselector(String preselector) {
    BasicSupport.preselector = preselector;
}
226 | ||
20217360 NR |
227 | /** |
228 | * Return a {@link BasicSupport} that is compatible with the given | |
229 | * {@link Type} if it exists (or NULL if not). | |
230 | * | |
231 | * @param type | |
232 | * the type | |
233 | * | |
234 | * @return a compatible {@link BasicSupport} if it exists (or NULL if not) | |
235 | */ | |
73785268 NR |
236 | static public BasicSupport getSupport(Type type) { |
237 | BasicSupport support = null; | |
238 | ||
239 | if (type != null) { | |
240 | switch (type) { | |
241 | case SLASHDOT: | |
242 | support = new Slashdot(); | |
243 | break; | |
2d95a873 NR |
244 | case PIPEDOT: |
245 | support = new Pipedot(); | |
246 | break; | |
eaaeae39 NR |
247 | case LWN: |
248 | support = new LWN(); | |
249 | break; | |
100a8395 NR |
250 | case LEMONDE: |
251 | support = new LeMonde(); | |
252 | break; | |
d28c4aac NR |
253 | case REGISTER: |
254 | support = new TheRegister(); | |
255 | break; | |
b34d1f35 | 256 | case TOO_LINUX: |
cd555a1e NR |
257 | support = new TooLinux(); |
258 | break; | |
31755801 NR |
259 | case ERE_NUMERIQUE: |
260 | support = new EreNumerique(); | |
261 | break; | |
73785268 NR |
262 | } |
263 | ||
264 | if (support != null) { | |
265 | support.setType(type); | |
266 | } | |
267 | } | |
268 | ||
269 | return support; | |
270 | } | |
271 | ||
b34d1f35 NR |
272 | /** |
273 | * The gopher "selector" to use for output for this type, using the | |
274 | * preselector. | |
275 | * <p> | |
276 | * A kind of "URL path", like "/news/" or "/misc/news/" or... | |
277 | * | |
278 | * @param type | |
279 | * the type to get the selector of | |
280 | * | |
281 | * @return the selector | |
282 | */ | |
73785268 NR |
283 | static public String getSelector(Type type) { |
284 | return preselector + "/" + type + "/"; | |
285 | } | |
286 | ||
27008a87 NR |
287 | /** |
288 | * Get the first {@link Element} of the given class, or an empty span | |
289 | * {@link Element} if none found. | |
290 | * | |
291 | * @param element | |
292 | * the element to look in | |
293 | * @param className | |
294 | * the class to look for | |
295 | * | |
296 | * @return the value or an empty span {@link Element} | |
297 | */ | |
298 | static protected Element firstOrEmpty(Element element, String className) { | |
299 | Elements subElements = element.getElementsByClass(className); | |
300 | if (subElements.size() > 0) { | |
301 | return subElements.get(0); | |
302 | } | |
303 | ||
304 | return new Element("span"); | |
305 | } | |
306 | ||
307 | /** | |
308 | * Get the first {@link Element} of the given tag, or an empty span | |
309 | * {@link Element} if none found. | |
310 | * | |
311 | * @param element | |
312 | * the element to look in | |
313 | * @param tagName | |
314 | * the tag to look for | |
315 | * | |
316 | * @return the value or an empty span {@link Element} | |
317 | */ | |
318 | static protected Element firstOrEmptyTag(Element element, String tagName) { | |
319 | Elements subElements = element.getElementsByTag(tagName); | |
320 | if (subElements.size() > 0) { | |
321 | return subElements.get(0); | |
322 | } | |
323 | ||
324 | return new Element("span"); | |
325 | } | |
326 | ||
20217360 NR |
327 | /** |
328 | * Process the given element into text (each line is a text paragraph and | |
329 | * can be prepended with ">" signs to indicate a quote or sub-quote or | |
330 | * sub-sub-quote...). | |
331 | * | |
332 | * @param element | |
333 | * the element to process | |
334 | * @param elementProcessor | |
335 | * the element processor, must not be NULL | |
336 | * | |
337 | * @return text lines, each line is a paragraph | |
338 | */ | |
27008a87 | 339 | static protected List<String> toLines(Element element, |
20217360 | 340 | final ElementProcessor elementProcessor) { |
27008a87 NR |
341 | final List<String> lines = new ArrayList<String>(); |
342 | final StringBuilder currentLine = new StringBuilder(); | |
343 | final List<Integer> quoted = new ArrayList<Integer>(); | |
344 | final List<Node> ignoredNodes = new ArrayList<Node>(); | |
345 | ||
346 | if (element != null) { | |
347 | new NodeTraversor(new NodeVisitor() { | |
348 | @Override | |
349 | public void head(Node node, int depth) { | |
100a8395 | 350 | String manual = null; |
20217360 | 351 | boolean ignore = elementProcessor.ignoreNode(node) |
100a8395 | 352 | || ignoredNodes.contains(node.parentNode()); |
b9afb12e | 353 | // Manual processing |
100a8395 | 354 | if (!ignore) { |
20217360 | 355 | manual = elementProcessor.manualProcessing(node); |
100a8395 NR |
356 | if (manual != null) { |
357 | currentLine.append(manual); | |
358 | ignore = true; | |
359 | } | |
360 | } | |
361 | ||
b9afb12e NR |
362 | // Subtitle check |
363 | if (!ignore) { | |
364 | String subtitle = elementProcessor.isSubtitle(node); | |
365 | if (subtitle != null) { | |
366 | subtitle = subtitle.trim(); | |
367 | currentLine.append("\n[ " + subtitle + " ]\n"); | |
368 | ignore = true; | |
369 | } | |
370 | } | |
371 | ||
100a8395 | 372 | if (ignore) { |
27008a87 NR |
373 | ignoredNodes.add(node); |
374 | return; | |
375 | } | |
376 | ||
377 | String prep = ""; | |
378 | for (int i = 0; i < quoted.size(); i++) { | |
379 | prep += ">"; | |
380 | } | |
381 | prep += " "; | |
382 | ||
20217360 | 383 | boolean enterQuote = elementProcessor.detectQuote(node); |
27008a87 NR |
384 | boolean leaveQuote = quoted.contains(depth); |
385 | ||
386 | if (enterQuote) { | |
387 | quoted.add(depth); | |
388 | } | |
389 | ||
390 | if (leaveQuote) { | |
391 | quoted.remove(Integer.valueOf(depth)); | |
392 | } | |
393 | ||
394 | if (enterQuote || leaveQuote) { | |
395 | if (currentLine.length() > 0) { | |
396 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
397 | currentLine.setLength(currentLine.length() - 1); | |
398 | } | |
399 | for (String l : currentLine.toString().split("\n")) { | |
400 | lines.add(prep + l); | |
401 | } | |
402 | } | |
403 | currentLine.setLength(0); | |
404 | } | |
405 | ||
406 | if (node instanceof Element) { | |
407 | Element element = (Element) node; | |
408 | boolean block = element.isBlock() | |
409 | || element.tagName().equalsIgnoreCase("br"); | |
410 | if (block && currentLine.length() > 0) { | |
411 | currentLine.append("\n"); | |
412 | } | |
413 | } else if (node instanceof TextNode) { | |
414 | TextNode textNode = (TextNode) node; | |
415 | String line = StringUtil.normaliseWhitespace(textNode | |
416 | .getWholeText()); | |
417 | ||
20217360 | 418 | currentLine.append(elementProcessor.processText(line)); |
27008a87 NR |
419 | currentLine.append(" "); |
420 | } | |
421 | } | |
422 | ||
423 | @Override | |
424 | public void tail(Node node, int depth) { | |
425 | } | |
426 | }).traverse(element); | |
427 | } | |
428 | ||
429 | if (currentLine.length() > 0) { | |
430 | String prep = ""; | |
431 | for (int i = 0; i < quoted.size(); i++) { | |
432 | prep += ">"; | |
433 | } | |
434 | prep += " "; | |
435 | if (currentLine.length() > 0) { | |
436 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
437 | currentLine.setLength(currentLine.length() - 1); | |
438 | } | |
439 | for (String l : currentLine.toString().split("\n")) { | |
440 | lines.add(prep + l); | |
441 | } | |
442 | } | |
443 | } | |
444 | ||
445 | for (int i = 0; i < lines.size(); i++) { | |
446 | lines.set(i, lines.get(i).replace(" ", " ").trim()); | |
447 | } | |
448 | ||
b34d1f35 NR |
449 | return lines; |
450 | } | |
451 | ||
452 | /** | |
453 | * Reformat the date if possible. | |
454 | * | |
455 | * @param date | |
456 | * the input date | |
457 | * | |
458 | * @return the reformated date, or the same value if it was not parsable | |
459 | */ | |
460 | static protected String date(String date) { | |
461 | SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd"); | |
462 | ||
463 | long epoch = 0; | |
464 | try { | |
c9cffa91 | 465 | epoch = Long.parseLong(date.trim()); |
b34d1f35 NR |
466 | } catch (Exception e) { |
467 | epoch = 0; | |
880740c4 NR |
468 | } |
469 | ||
b34d1f35 NR |
470 | if (epoch > 0) { |
471 | return out.format(new Date(1000 * epoch)); | |
472 | } | |
473 | ||
474 | try { | |
475 | Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX") | |
476 | .parse(date.trim()); | |
477 | return out.format(dat); | |
478 | } catch (ParseException e) { | |
479 | return date; | |
480 | } | |
27008a87 | 481 | } |
73785268 | 482 | } |