Commit | Line | Data |
---|---|---|
73785268 NR |
1 | package be.nikiroo.gofetch.support; |
2 | ||
3 | import java.io.IOException; | |
b34d1f35 NR |
4 | import java.text.ParseException; |
5 | import java.text.SimpleDateFormat; | |
27008a87 | 6 | import java.util.ArrayList; |
b34d1f35 | 7 | import java.util.Date; |
73785268 | 8 | import java.util.List; |
73785268 | 9 | |
27008a87 NR |
10 | import org.jsoup.helper.StringUtil; |
11 | import org.jsoup.nodes.Element; | |
12 | import org.jsoup.nodes.Node; | |
13 | import org.jsoup.nodes.TextNode; | |
14 | import org.jsoup.select.Elements; | |
15 | import org.jsoup.select.NodeTraversor; | |
16 | import org.jsoup.select.NodeVisitor; | |
17 | ||
73785268 | 18 | import be.nikiroo.gofetch.data.Story; |
136ab801 | 19 | import be.nikiroo.utils.Downloader; |
73785268 | 20 | |
b34d1f35 NR |
21 | /** |
22 | * Base class for website support. | |
23 | * | |
24 | * @author niki | |
25 | */ | |
73785268 | 26 | public abstract class BasicSupport { |
b34d1f35 | 27 | /** The downloader to use for all websites. */ |
136ab801 NR |
28 | protected static Downloader downloader = new Downloader("gofetcher"); |
29 | ||
b34d1f35 NR |
/**
 * The kind of support available: each supported website has exactly one
 * type.
 *
 * @author niki
 */
public enum Type {
    /** English: any subject, but mostly IT/science. */
    SLASHDOT,
    /** English: a clone of Slashdot, mostly abandoned. */
    PIPEDOT,
    /** English: Linux. */
    LWN,
    /** French: any subject. */
    LEMONDE,
    /** English: IT. */
    REGISTER,
    /** French: Linux. */
    TOO_LINUX
}
49 | ||
20217360 NR |
50 | /** |
51 | * Used to process an element into lines. | |
52 | * | |
53 | * @author niki | |
54 | */ | |
55 | public interface ElementProcessor { | |
56 | /** | |
57 | * Detect if this node is a quote and should be trated as such. | |
58 | * | |
59 | * @param node | |
60 | * the node to check | |
61 | * @return TRUE if it is | |
62 | */ | |
27008a87 NR |
63 | public boolean detectQuote(Node node); |
64 | ||
20217360 NR |
65 | /** |
66 | * Process text content (will be called on each text element, allowing | |
67 | * you to modify it if needed). | |
68 | * | |
69 | * @param text | |
70 | * the text to process | |
b34d1f35 NR |
71 | * |
72 | * @return the resulting text | |
20217360 | 73 | */ |
27008a87 NR |
74 | public String processText(String text); |
75 | ||
20217360 NR |
76 | /** |
77 | * Ignore this node. | |
78 | * | |
79 | * @param node | |
80 | * the node to ignore | |
81 | * @return TRUE if it has to be ignored | |
82 | */ | |
27008a87 | 83 | public boolean ignoreNode(Node node); |
100a8395 NR |
84 | |
85 | /** | |
20217360 NR |
86 | * Manually process this node (and return the manual processing value) |
87 | * if so desired. | |
88 | * <p> | |
89 | * If the node is manually processed, it and its children will not be | |
90 | * automatically processed. | |
100a8395 NR |
91 | * |
92 | * @param node | |
93 | * the node to optionally process | |
94 | * | |
20217360 NR |
95 | * @return NULL if not processed (will thus be automatically processed |
96 | * as usual), a {@link String} (may be empty) if we process it | |
97 | * manually -- the given {@link String} will be used instead of | |
98 | * the usual automatic processing if not NULL | |
100a8395 NR |
99 | */ |
100 | public String manualProcessing(Node node); | |
27008a87 NR |
101 | } |
102 | ||
20217360 NR |
103 | /** |
104 | * A default {@link ElementProcessor} (will not detect or process anything | |
105 | * manually). | |
106 | * | |
107 | * @author niki | |
108 | */ | |
109 | public class BasicElementProcessor implements ElementProcessor { | |
110 | @Override | |
111 | public boolean detectQuote(Node node) { | |
112 | return false; | |
113 | } | |
114 | ||
115 | @Override | |
116 | public String processText(String text) { | |
117 | return text; | |
118 | } | |
119 | ||
120 | @Override | |
121 | public boolean ignoreNode(Node node) { | |
122 | return false; | |
123 | } | |
124 | ||
125 | @Override | |
126 | public String manualProcessing(Node node) { | |
127 | return null; | |
128 | } | |
129 | } | |
130 | ||
73785268 NR |
131 | static private String preselector; |
132 | ||
133 | private Type type; | |
134 | ||
100a8395 NR |
135 | /** |
136 | * List all the recent items, but only assure the ID and internal URL to | |
137 | * fetch it later on (until it has been fetched, the rest of the | |
138 | * {@link Story} is not confirmed). | |
139 | * | |
140 | * @return the list of new stories | |
141 | * | |
142 | * @throws IOException | |
143 | * in case of I/O | |
144 | */ | |
73785268 NR |
145 | abstract public List<Story> list() throws IOException; |
146 | ||
5c056aad NR |
147 | /** |
148 | * Fetch the full article content as well as all the comments associated to | |
149 | * this {@link Story}, if any (can be empty, but not NULL). | |
150 | * | |
151 | * @param story | |
152 | * the story to fetch the comments of | |
153 | * | |
154 | * @throws IOException | |
155 | * in case of I/O error | |
156 | */ | |
157 | abstract public void fetch(Story story) throws IOException; | |
73785268 | 158 | |
b34d1f35 NR |
159 | /** |
160 | * The website textual description, to add in the dispatcher page. | |
161 | * <p> | |
162 | * Should be short. | |
163 | * | |
164 | * @return the description | |
165 | */ | |
73785268 | 166 | abstract public String getDescription(); |
2d95a873 | 167 | |
b34d1f35 NR |
168 | /** |
169 | * The gopher "selector" to use for output. | |
170 | * <p> | |
171 | * A kind of "URL path", like "/news/" or "/misc/news/" or... | |
172 | * | |
173 | * @return the selector | |
174 | */ | |
73785268 NR |
175 | public String getSelector() { |
176 | return getSelector(type); | |
177 | } | |
178 | ||
b34d1f35 NR |
179 | /** |
180 | * The support type. | |
181 | * | |
182 | * @return the type | |
183 | */ | |
73785268 NR |
184 | public Type getType() { |
185 | return type; | |
186 | } | |
187 | ||
b34d1f35 NR |
188 | /** |
189 | * The support type. | |
190 | * | |
191 | * @param type | |
192 | * the new type | |
193 | */ | |
73785268 NR |
194 | protected void setType(Type type) { |
195 | this.type = type; | |
196 | } | |
197 | ||
198 | /** | |
b34d1f35 NR |
199 | * The {@link String} to append to the selector (the selector will be |
200 | * constructed as "this string" then "/type/". | |
201 | * | |
73785268 NR |
202 | * @param preselector |
203 | * the preselector to set | |
204 | */ | |
205 | static public void setPreselector(String preselector) { | |
206 | BasicSupport.preselector = preselector; | |
207 | } | |
208 | ||
20217360 NR |
209 | /** |
210 | * Return a {@link BasicSupport} that is compatible with the given | |
211 | * {@link Type} if it exists (or NULL if not). | |
212 | * | |
213 | * @param type | |
214 | * the type | |
215 | * | |
216 | * @return a compatible {@link BasicSupport} if it exists (or NULL if not) | |
217 | */ | |
73785268 NR |
218 | static public BasicSupport getSupport(Type type) { |
219 | BasicSupport support = null; | |
220 | ||
221 | if (type != null) { | |
222 | switch (type) { | |
223 | case SLASHDOT: | |
224 | support = new Slashdot(); | |
225 | break; | |
2d95a873 NR |
226 | case PIPEDOT: |
227 | support = new Pipedot(); | |
228 | break; | |
eaaeae39 NR |
229 | case LWN: |
230 | support = new LWN(); | |
231 | break; | |
100a8395 NR |
232 | case LEMONDE: |
233 | support = new LeMonde(); | |
234 | break; | |
d28c4aac NR |
235 | case REGISTER: |
236 | support = new TheRegister(); | |
237 | break; | |
b34d1f35 | 238 | case TOO_LINUX: |
cd555a1e NR |
239 | support = new TooLinux(); |
240 | break; | |
73785268 NR |
241 | } |
242 | ||
243 | if (support != null) { | |
244 | support.setType(type); | |
245 | } | |
246 | } | |
247 | ||
248 | return support; | |
249 | } | |
250 | ||
b34d1f35 NR |
251 | /** |
252 | * The gopher "selector" to use for output for this type, using the | |
253 | * preselector. | |
254 | * <p> | |
255 | * A kind of "URL path", like "/news/" or "/misc/news/" or... | |
256 | * | |
257 | * @param type | |
258 | * the type to get the selector of | |
259 | * | |
260 | * @return the selector | |
261 | */ | |
73785268 NR |
262 | static public String getSelector(Type type) { |
263 | return preselector + "/" + type + "/"; | |
264 | } | |
265 | ||
27008a87 NR |
266 | /** |
267 | * Get the first {@link Element} of the given class, or an empty span | |
268 | * {@link Element} if none found. | |
269 | * | |
270 | * @param element | |
271 | * the element to look in | |
272 | * @param className | |
273 | * the class to look for | |
274 | * | |
275 | * @return the value or an empty span {@link Element} | |
276 | */ | |
277 | static protected Element firstOrEmpty(Element element, String className) { | |
278 | Elements subElements = element.getElementsByClass(className); | |
279 | if (subElements.size() > 0) { | |
280 | return subElements.get(0); | |
281 | } | |
282 | ||
283 | return new Element("span"); | |
284 | } | |
285 | ||
286 | /** | |
287 | * Get the first {@link Element} of the given tag, or an empty span | |
288 | * {@link Element} if none found. | |
289 | * | |
290 | * @param element | |
291 | * the element to look in | |
292 | * @param tagName | |
293 | * the tag to look for | |
294 | * | |
295 | * @return the value or an empty span {@link Element} | |
296 | */ | |
297 | static protected Element firstOrEmptyTag(Element element, String tagName) { | |
298 | Elements subElements = element.getElementsByTag(tagName); | |
299 | if (subElements.size() > 0) { | |
300 | return subElements.get(0); | |
301 | } | |
302 | ||
303 | return new Element("span"); | |
304 | } | |
305 | ||
20217360 NR |
306 | /** |
307 | * Process the given element into text (each line is a text paragraph and | |
308 | * can be prepended with ">" signs to indicate a quote or sub-quote or | |
309 | * sub-sub-quote...). | |
310 | * | |
311 | * @param element | |
312 | * the element to process | |
313 | * @param elementProcessor | |
314 | * the element processor, must not be NULL | |
315 | * | |
316 | * @return text lines, each line is a paragraph | |
317 | */ | |
27008a87 | 318 | static protected List<String> toLines(Element element, |
20217360 | 319 | final ElementProcessor elementProcessor) { |
27008a87 NR |
320 | final List<String> lines = new ArrayList<String>(); |
321 | final StringBuilder currentLine = new StringBuilder(); | |
322 | final List<Integer> quoted = new ArrayList<Integer>(); | |
323 | final List<Node> ignoredNodes = new ArrayList<Node>(); | |
324 | ||
325 | if (element != null) { | |
326 | new NodeTraversor(new NodeVisitor() { | |
327 | @Override | |
328 | public void head(Node node, int depth) { | |
100a8395 | 329 | String manual = null; |
20217360 | 330 | boolean ignore = elementProcessor.ignoreNode(node) |
100a8395 NR |
331 | || ignoredNodes.contains(node.parentNode()); |
332 | if (!ignore) { | |
20217360 | 333 | manual = elementProcessor.manualProcessing(node); |
100a8395 NR |
334 | if (manual != null) { |
335 | currentLine.append(manual); | |
336 | ignore = true; | |
337 | } | |
338 | } | |
339 | ||
340 | if (ignore) { | |
27008a87 NR |
341 | ignoredNodes.add(node); |
342 | return; | |
343 | } | |
344 | ||
345 | String prep = ""; | |
346 | for (int i = 0; i < quoted.size(); i++) { | |
347 | prep += ">"; | |
348 | } | |
349 | prep += " "; | |
350 | ||
20217360 | 351 | boolean enterQuote = elementProcessor.detectQuote(node); |
27008a87 NR |
352 | boolean leaveQuote = quoted.contains(depth); |
353 | ||
354 | if (enterQuote) { | |
355 | quoted.add(depth); | |
356 | } | |
357 | ||
358 | if (leaveQuote) { | |
359 | quoted.remove(Integer.valueOf(depth)); | |
360 | } | |
361 | ||
362 | if (enterQuote || leaveQuote) { | |
363 | if (currentLine.length() > 0) { | |
364 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
365 | currentLine.setLength(currentLine.length() - 1); | |
366 | } | |
367 | for (String l : currentLine.toString().split("\n")) { | |
368 | lines.add(prep + l); | |
369 | } | |
370 | } | |
371 | currentLine.setLength(0); | |
372 | } | |
373 | ||
374 | if (node instanceof Element) { | |
375 | Element element = (Element) node; | |
376 | boolean block = element.isBlock() | |
377 | || element.tagName().equalsIgnoreCase("br"); | |
378 | if (block && currentLine.length() > 0) { | |
379 | currentLine.append("\n"); | |
380 | } | |
381 | } else if (node instanceof TextNode) { | |
382 | TextNode textNode = (TextNode) node; | |
383 | String line = StringUtil.normaliseWhitespace(textNode | |
384 | .getWholeText()); | |
385 | ||
20217360 | 386 | currentLine.append(elementProcessor.processText(line)); |
27008a87 NR |
387 | currentLine.append(" "); |
388 | } | |
389 | } | |
390 | ||
391 | @Override | |
392 | public void tail(Node node, int depth) { | |
393 | } | |
394 | }).traverse(element); | |
395 | } | |
396 | ||
397 | if (currentLine.length() > 0) { | |
398 | String prep = ""; | |
399 | for (int i = 0; i < quoted.size(); i++) { | |
400 | prep += ">"; | |
401 | } | |
402 | prep += " "; | |
403 | if (currentLine.length() > 0) { | |
404 | if (currentLine.charAt(currentLine.length() - 1) == '\n') { | |
405 | currentLine.setLength(currentLine.length() - 1); | |
406 | } | |
407 | for (String l : currentLine.toString().split("\n")) { | |
408 | lines.add(prep + l); | |
409 | } | |
410 | } | |
411 | } | |
412 | ||
413 | for (int i = 0; i < lines.size(); i++) { | |
414 | lines.set(i, lines.get(i).replace(" ", " ").trim()); | |
415 | } | |
416 | ||
b34d1f35 NR |
417 | return lines; |
418 | } | |
419 | ||
420 | /** | |
421 | * Reformat the date if possible. | |
422 | * | |
423 | * @param date | |
424 | * the input date | |
425 | * | |
426 | * @return the reformated date, or the same value if it was not parsable | |
427 | */ | |
428 | static protected String date(String date) { | |
429 | SimpleDateFormat out = new SimpleDateFormat("yyyy/MM/dd"); | |
430 | ||
431 | long epoch = 0; | |
432 | try { | |
433 | epoch = Long.parseLong(date); | |
434 | } catch (Exception e) { | |
435 | epoch = 0; | |
880740c4 NR |
436 | } |
437 | ||
b34d1f35 NR |
438 | if (epoch > 0) { |
439 | return out.format(new Date(1000 * epoch)); | |
440 | } | |
441 | ||
442 | try { | |
443 | Date dat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssXXX") | |
444 | .parse(date.trim()); | |
445 | return out.format(dat); | |
446 | } catch (ParseException e) { | |
447 | return date; | |
448 | } | |
27008a87 | 449 | } |
73785268 | 450 | } |