Commit | Line | Data |
---|---|---|
73785268 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
b34d1f35 NR |
4 | import java.text.ParseException; |
5 | import java.text.SimpleDateFormat; | |
27008a87 | 6 | import java.util.ArrayList; |
b34d1f35 | 7 | import java.util.Date; |
73785268 | 8 | import java.util.List; |
73785268 | 9 | |
27008a87 NR |
10 | import org.jsoup.helper.StringUtil; |
11 | import org.jsoup.nodes.Element; | |
12 | import org.jsoup.nodes.Node; | |
13 | import org.jsoup.nodes.TextNode; | |
14 | import org.jsoup.select.Elements; | |
15 | import org.jsoup.select.NodeTraversor; | |
16 | import org.jsoup.select.NodeVisitor; | |
17 | ||
73785268 | 18 | import be.nikiroo.gofetch.data.Story; |
136ab801 | 19 | import be.nikiroo.utils.Downloader; |
73785268 | 20 | |
b34d1f35 NR |
21 | /** |
22 | * Base class for website support. | |
23 | * | |
24 | * @author niki | |
25 | */ | |
73785268 | 26 | public abstract class BasicSupport { |
b34d1f35 | 27 | /** The downloader to use for all websites. */ |
136ab801 NR |
28 | protected static Downloader downloader = new Downloader("gofetcher"); |
29 | ||
/**
 * The support type: each supported website has exactly one type.
 *
 * @author niki
 */
public enum Type {
    /**
     * EN: any topic, but mostly IT/Sci.
     */
    SLASHDOT,

    /**
     * EN: clone of Slashdot, mostly abandoned.
     */
    PIPEDOT,

    /**
     * EN: Linux.
     */
    LWN,

    /**
     * FR: any topic.
     */
    LEMONDE,

    /**
     * EN: IT.
     */
    REGISTER,

    /**
     * FR: Linux.
     */
    TOO_LINUX,

    /**
     * FR: IT.
     */
    ERE_NUMERIQUE
}
51 | ||
20217360 NR |
52 | /** |
53 | * Used to process an element into lines. | |
54 | * | |
55 | * @author niki | |
56 | */ | |
57 | public interface ElementProcessor { | |
58 | /** | |
59 | * Detect if this node is a quote and should be trated as such. | |
60 | * | |
61 | * @param node | |
62 | * the node to check | |
63 | * @return TRUE if it is | |
64 | */ | |
27008a87 NR |
65 | public boolean detectQuote(Node node); |
66 | ||
20217360 NR |
67 | /** |
68 | * Process text content (will be called on each text element, allowing | |
69 | * you to modify it if needed). | |
70 | * | |
71 | * @param text | |
72 | * the text to process | |
b34d1f35 NR |
73 | * |
74 | * @return the resulting text | |
20217360 | 75 | */ |
27008a87 NR |
76 | public String processText(String text); |
77 | ||
20217360 NR |
78 | /** |
79 | * Ignore this node. | |
80 | * | |
81 | * @param node | |
82 | * the node to ignore | |
83 | * @return TRUE if it has to be ignored | |
84 | */ | |
27008a87 | 85 | public boolean ignoreNode(Node node); |
100a8395 NR |
86 | |
87 | /** | |
20217360 NR |
88 | * Manually process this node (and return the manual processing value) |
89 | * if so desired. | |
90 | * <p> | |
91 | * If the node is manually processed, it and its children will not be | |
92 | * automatically processed. | |
100a8395 NR |
93 | * |
94 | * @param node | |
95 | * the node to optionally process | |
96 | * | |
20217360 NR |
97 | * @return NULL if not processed (will thus be automatically processed |
98 | * as usual), a {@link String} (may be empty) if we process it | |
99 | * manually -- the given {@link String} will be used instead of | |
100 | * the usual automatic processing if not NULL | |
100a8395 NR |
101 | */ |
102 | public String manualProcessing(Node node); | |
27008a87 NR |
103 | } |
104 | ||
20217360 NR |
105 | /** |
106 | * A default {@link ElementProcessor} (will not detect or process anything | |
107 | * manually). | |
108 | * | |
109 | * @author niki | |
110 | */ | |
111 | public class BasicElementProcessor implements ElementProcessor { | |
112 | @Override | |
113 | public boolean detectQuote(Node node) { | |
114 | return false; | |
115 | } | |
116 | ||
117 | @Override | |
118 | public String processText(String text) { | |
119 | return text; | |
120 | } | |
121 | ||
122 | @Override | |
123 | public boolean ignoreNode(Node node) { | |
124 | return false; | |
125 | } | |
126 | ||
127 | @Override | |
128 | public String manualProcessing(Node node) { | |
129 | return null; | |
130 | } | |
131 | } | |
132 | ||
73785268 NR |
133 | static private String preselector; |
134 | ||
135 | private Type type; | |
136 | ||
100a8395 NR |
137 | /** |
138 | * List all the recent items, but only assure the ID and internal URL to | |
139 | * fetch it later on (until it has been fetched, the rest of the | |
140 | * {@link Story} is not confirmed). | |
141 | * | |
142 | * @return the list of new stories | |
143 | * | |
144 | * @throws IOException | |
145 | * in case of I/O | |
146 | */ | |
73785268 NR |
147 | abstract public List<Story> list() throws IOException; |
148 | ||
5c056aad NR |
149 | /** |
150 | * Fetch the full article content as well as all the comments associated to | |
151 | * this {@link Story}, if any (can be empty, but not NULL). | |
152 | * | |
153 | * @param story | |
154 | * the story to fetch the comments of | |
155 | * | |
156 | * @throws IOException | |
157 | * in case of I/O error | |
158 | */ | |
159 | abstract public void fetch(Story story) throws IOException; | |
73785268 | 160 | |
b34d1f35 NR |
161 | /** |
162 | * The website textual description, to add in the dispatcher page. | |
163 | * <p> | |
164 | * Should be short. | |
165 | * | |
166 | * @return the description | |
167 | */ | |
73785268 | 168 | abstract public String getDescription(); |
2d95a873 | 169 | |
b34d1f35 NR |
170 | /** |
171 | * The gopher "selector" to use for output. | |
172 | * <p> | |
173 | * A kind of "URL path", like "/news/" or "/misc/news/" or... | |
174 | * | |
175 | * @return the selector | |
176 | */ | |
73785268 NR |
177 | public String getSelector() { |
178 | return getSelector(type); | |
179 | } | |
180 | ||
b34d1f35 NR |
181 | /** |
182 | * The support type. | |
183 | * | |
184 | * @return the type | |
185 | */ | |
73785268 NR |
186 | public Type getType() { |
187 | return type; | |
188 | } | |
189 | ||
b34d1f35 NR |
190 | /** |
191 | * The support type. | |
192 | * | |
193 | * @param type | |
194 | * the new type | |
195 | */ | |
73785268 NR |
196 | protected void setType(Type type) { |
197 | this.type = type; | |
198 | } | |
199 | ||
200 | /** | |
b34d1f35 NR |
201 | * The {@link String} to append to the selector (the selector will be |
202 | * constructed as "this string" then "/type/". | |
203 | * | |
73785268 NR |
204 | * @param preselector |
205 | * the preselector to set | |
206 | */ | |
207 | static public void setPreselector(String preselector) { | |
208 | BasicSupport.preselector = preselector; | |
209 | } | |
210 | ||
20217360 NR |
211 | /** |
212 | * Return a {@link BasicSupport} that is compatible with the given | |
213 | * {@link Type} if it exists (or NULL if not). | |
214 | * | |
215 | * @param type | |
216 | * the type | |
217 | * | |
218 | * @return a compatible {@link BasicSupport} if it exists (or NULL if not) | |
219 | */ | |
73785268 NR |
220 | static public BasicSupport getSupport(Type type) { |
221 | BasicSupport support = null; | |
222 | ||
223 | if (type != null) { | |
224 | switch (type) { | |
225 | case SLASHDOT: | |
226 | support = new Slashdot(); | |
227 | break; | |
2d95a873 NR |
228 | case PIPEDOT: |
229 | support = new Pipedot(); | |
230 | break; | |
eaaeae39 NR |
231 | case LWN: |
232 | support = new LWN(); | |
233 | break; | |
100a8395 NR |
234 | case LEMONDE: |
235 | support = new LeMonde(); | |
236 | break; | |
d28c4aac NR |
237 | case REGISTER: |
238 | support = new TheRegister(); | |
239 | break; | |
b34d1f35 | 240 | case TOO_LINUX: |
cd555a1e NR |
241 | support = new TooLinux(); |
242 | break; | |
31755801 NR |
243 | case ERE_NUMERIQUE: |
244 | support = new EreNumerique(); | |
245 | break; | |
73785268 NR |
246 | } |
247 | ||
248 | if (support != null) { | |
249 | support.setType(type); | |
250 | } | |
251 | } | |
252 | ||
253 | return support; | |
254 | } | |
255 | ||
b34d1f35 NR |
256 | /** |
257 | * The gopher "selector" to use for output for this type, using the | |
258 | * preselector. | |
259 | * <p> | |
260 | * A kind of "URL path", like "/news/" or "/misc/news/" or... | |
261 | * | |
262 | * @param type | |
263 | * the type to get the selector of | |
264 | * | |
265 | * @return the selector | |
266 | */ | |
73785268 NR |
267 | static public String getSelector(Type type) { |
268 | return preselector + "/" + type + "/"; | |
269 | } | |
270 | ||
27008a87 NR |
271 | /** |
272 | * Get the first {@link Element} of the given class, or an empty span | |
273 | * {@link Element} if none found. | |
274 | * | |
275 | * @param element | |
276 | * the element to look in | |
277 | * @param className | |
278 | * the class to look for | |
279 | * | |
280 | * @return the value or an empty span {@link Element} | |
281 | */ | |
282 | static protected Element firstOrEmpty(Element element, String className) { | |
283 | Elements subElements = element.getElementsByClass(className); | |
284 | if (subElements.size() > 0) { | |
285 | return subElements.get(0); | |
286 | } | |
287 | ||
288 | return new Element("span"); | |
289 | } | |
290 | ||
291 | /** | |
292 | * Get the first {@link Element} of the given tag, or an empty span | |
293 | * {@link Element} if none found. | |
294 | * | |
295 | * @param element | |
296 | * the element to look in | |
297 | * @param tagName | |
298 | * the tag to look for | |
299 | * | |
300 | * @return the value or an empty span {@link Element} | |
301 | */ | |
302 | static protected Element firstOrEmptyTag(Element element, String tagName) { | |
303 | Elements subElements = element.getElementsByTag(tagName); | |
304 | if (subElements.size() > 0) { | |
305 | return subElements.get(0); | |
306 | } | |
307 | ||
308 | return new Element("span"); | |
309 | } | |
310 | ||
20217360 NR |
311 | /** |
312 | * Process the given element into text (each line is a text paragraph and | |
313 | * can be prepended with ">" signs to indicate a quote or sub-quote or | |
314 | * sub-sub-quote...). | |
315 | * | |
316 | * @param element | |
317 | * the element to process | |
318 | * @param elementProcessor | |
319 | * the element processor, must not be NULL | |
320 | * | |
321 | * @return text lines, each line is a paragraph | |
322 | */ | |
27008a87 | 323 | static protected List<String> toLines(Element element, |
20217360 | 324 | final ElementProcessor elementProcessor) { |
27008a87 NR |
325 | final List<String> lines = new ArrayList<String>(); |
326 | final StringBuilder currentLine = new StringBuilder(); | |
327 | final List<Integer> quoted = new ArrayList<Integer>(); | |
328 | final List<Node> ignoredNodes = new ArrayList<Node>(); | |
329 | ||
330 | if (element != null) { | |
331 | new NodeTraversor(new NodeVisitor() { | |
332 | @Override | |
333 | public void head(Node node, int depth) { | |
100a8395 | 334 | String manual = null; |
20217360 | 335 | boolean ignore = elementProcessor.ignoreNode(node) |
100a8395 NR |
336 | || ignoredNodes.contains(node.parentNode()); |
337 | if (!ignore) { | |
20217360 | 338 | manual = elementProcessor.manualProcessing(node); |
100a8395 NR |
339 | if (manual != null) { |
340 | currentLine.append(manual); | |
341 | ignore = true; | |
342 | } | |
343 | } | |
344 | ||
345 | if (ignore) { | |
27008a87 NR |
346 | ignoredNodes.add(node); |
347 | return; | |
348 | } | |
349 | ||
350 | String prep = ""; | |
351 | for (int i = 0; i < quoted.size(); i++) { | |
352 | prep += ">"; | |
353 | } | |
354 | prep += " "; | |
355 | ||
20217360 | 356 | boolean enterQuote = elementProcessor.detectQuote(node); |
27008a87 NR |
357 | boolean leaveQuote = quoted.contains(depth); |
358 | ||
359 | if (enterQuote) { | |
360 | quoted.add(depth); | |
361 | } | |
362 | ||
363 | if (leaveQuote) { | |
364 | quoted.remove(Integer.valueOf(depth)); | |
365 | } | |
366 | ||
367 | if (enterQuote || leaveQuote) { | |
368 | if (currentLine.length() > 0) { | |
369 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
370 | currentLine.setLength(currentLine.length() - 1); | |
371 | } | |
372 | for (String l : currentLine.toString().split("\n")) { | |
373 | lines.add(prep + l); | |
374 | } | |
375 | } | |
376 | currentLine.setLength(0); | |
377 | } | |
378 | ||
379 | if (node instanceof Element) { | |
380 | Element element = (Element) node; | |
381 | boolean block = element.isBlock() | |
382 | || element.tagName().equalsIgnoreCase("br"); | |
383 | if (block && currentLine.length() > 0) { | |
384 | currentLine.append("\n"); | |
385 | } | |
386 | } else if (node instanceof TextNode) { | |
387 | TextNode textNode = (TextNode) node; | |
388 | String line = StringUtil.normaliseWhitespace(textNode | |
389 | .getWholeText()); | |
390 | ||
20217360 | 391 | currentLine.append(elementProcessor.processText(line)); |
27008a87 NR |
392 | currentLine.append(" "); |
393 | } | |
394 | } | |
395 | ||
396 | @Override | |
397 | public void tail(Node node, int depth) { | |
398 | } | |
399 | }).traverse(element); | |
400 | } | |
401 | ||
402 | if (currentLine.length() > 0) { | |
403 | String prep = ""; | |
404 | for (int i = 0; i < quoted.size(); i++) { | |
405 | prep += ">"; | |
406 | } | |
407 | prep += " "; | |
408 | if (currentLine.length() > 0) { | |
409 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
410 | currentLine.setLength(currentLine.length() - 1); | |
411 | } | |
412 | for (String l : currentLine.toString().split("\n")) { | |
413 | lines.add(prep + l); | |
414 | } | |
415 | } | |
416 | } | |
417 | ||
418 | for (int i = 0; i < lines.size(); i++) { | |
419 | lines.set(i, lines.get(i).replace(" ", " ").trim()); | |
420 | } | |
421 | ||
b34d1f35 NR |
422 | return lines; |
423 | } | |
424 | ||
425 | /** | |
426 | * Reformat the date if possible. | |
427 | * | |
428 | * @param date | |
429 | * the input date | |
430 | * | |
431 | * @return the reformated date, or the same value if it was not parsable | |
432 | */ | |
433 | static protected String date(String date) { | |
434 | SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd"); | |
435 | ||
436 | long epoch = 0; | |
437 | try { | |
438 | epoch = Long.parseLong(date); | |
439 | } catch (Exception e) { | |
440 | epoch = 0; | |
880740c4 NR |
441 | } |
442 | ||
b34d1f35 NR |
443 | if (epoch > 0) { |
444 | return out.format(new Date(1000 * epoch)); | |
445 | } | |
446 | ||
447 | try { | |
448 | Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX") | |
449 | .parse(date.trim()); | |
450 | return out.format(dat); | |
451 | } catch (ParseException e) { | |
452 | return date; | |
453 | } | |
27008a87 | 454 | } |
73785268 | 455 | } |