4067979d4dc1b3f1657189be91d85ec8cec345c0
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.List;
6
7 import org.jsoup.helper.StringUtil;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.nodes.Node;
10 import org.jsoup.nodes.TextNode;
11 import org.jsoup.select.Elements;
12 import org.jsoup.select.NodeTraversor;
13 import org.jsoup.select.NodeVisitor;
14
15 import be.nikiroo.gofetch.data.Story;
16 import be.nikiroo.utils.Downloader;
17
18 public abstract class BasicSupport {
19 protected static Downloader downloader = new Downloader("gofetcher");
20
21 public enum Type {
22 SLASHDOT, PIPEDOT, LWN, LEMONDE, REGISTER, TOOLINUX,
23 }
24
25 /**
26 * Used to process an element into lines.
27 *
28 * @author niki
29 */
30 public interface ElementProcessor {
31 /**
32 * Detect if this node is a quote and should be trated as such.
33 *
34 * @param node
35 * the node to check
36 * @return TRUE if it is
37 */
38 public boolean detectQuote(Node node);
39
40 /**
41 * Process text content (will be called on each text element, allowing
42 * you to modify it if needed).
43 *
44 * @param text
45 * the text to process
46 * @return
47 */
48 public String processText(String text);
49
50 /**
51 * Ignore this node.
52 *
53 * @param node
54 * the node to ignore
55 * @return TRUE if it has to be ignored
56 */
57 public boolean ignoreNode(Node node);
58
59 /**
60 * Manually process this node (and return the manual processing value)
61 * if so desired.
62 * <p>
63 * If the node is manually processed, it and its children will not be
64 * automatically processed.
65 *
66 * @param node
67 * the node to optionally process
68 *
69 * @return NULL if not processed (will thus be automatically processed
70 * as usual), a {@link String} (may be empty) if we process it
71 * manually -- the given {@link String} will be used instead of
72 * the usual automatic processing if not NULL
73 */
74 public String manualProcessing(Node node);
75 }
76
77 /**
78 * A default {@link ElementProcessor} (will not detect or process anything
79 * manually).
80 *
81 * @author niki
82 */
83 public class BasicElementProcessor implements ElementProcessor {
84 @Override
85 public boolean detectQuote(Node node) {
86 return false;
87 }
88
89 @Override
90 public String processText(String text) {
91 return text;
92 }
93
94 @Override
95 public boolean ignoreNode(Node node) {
96 return false;
97 }
98
99 @Override
100 public String manualProcessing(Node node) {
101 return null;
102 }
103 }
104
105 static private String preselector;
106
107 private Type type;
108
109 /**
110 * List all the recent items, but only assure the ID and internal URL to
111 * fetch it later on (until it has been fetched, the rest of the
112 * {@link Story} is not confirmed).
113 *
114 * @return the list of new stories
115 *
116 * @throws IOException
117 * in case of I/O
118 */
119 abstract public List<Story> list() throws IOException;
120
121 /**
122 * Fetch the full article content as well as all the comments associated to
123 * this {@link Story}, if any (can be empty, but not NULL).
124 *
125 * @param story
126 * the story to fetch the comments of
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 abstract public void fetch(Story story) throws IOException;
132
133 abstract public String getDescription();
134
135 public String getSelector() {
136 return getSelector(type);
137 }
138
139 public Type getType() {
140 return type;
141 }
142
143 protected void setType(Type type) {
144 this.type = type;
145 }
146
147 /**
148 * @param preselector
149 * the preselector to set
150 */
151 static public void setPreselector(String preselector) {
152 BasicSupport.preselector = preselector;
153 }
154
155 /**
156 * Return a {@link BasicSupport} that is compatible with the given
157 * {@link Type} if it exists (or NULL if not).
158 *
159 * @param type
160 * the type
161 *
162 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
163 */
164 static public BasicSupport getSupport(Type type) {
165 BasicSupport support = null;
166
167 if (type != null) {
168 switch (type) {
169 case SLASHDOT:
170 support = new Slashdot();
171 break;
172 case PIPEDOT:
173 support = new Pipedot();
174 break;
175 case LWN:
176 support = new LWN();
177 break;
178 case LEMONDE:
179 support = new LeMonde();
180 break;
181 case REGISTER:
182 support = new TheRegister();
183 break;
184 case TOOLINUX:
185 support = new TooLinux();
186 break;
187 }
188
189 if (support != null) {
190 support.setType(type);
191 }
192 }
193
194 return support;
195 }
196
197 static public String getSelector(Type type) {
198 return preselector + "/" + type + "/";
199 }
200
201 /**
202 * Get the first {@link Element} of the given class, or an empty span
203 * {@link Element} if none found.
204 *
205 * @param element
206 * the element to look in
207 * @param className
208 * the class to look for
209 *
210 * @return the value or an empty span {@link Element}
211 */
212 static protected Element firstOrEmpty(Element element, String className) {
213 Elements subElements = element.getElementsByClass(className);
214 if (subElements.size() > 0) {
215 return subElements.get(0);
216 }
217
218 return new Element("span");
219 }
220
221 /**
222 * Get the first {@link Element} of the given tag, or an empty span
223 * {@link Element} if none found.
224 *
225 * @param element
226 * the element to look in
227 * @param tagName
228 * the tag to look for
229 *
230 * @return the value or an empty span {@link Element}
231 */
232 static protected Element firstOrEmptyTag(Element element, String tagName) {
233 Elements subElements = element.getElementsByTag(tagName);
234 if (subElements.size() > 0) {
235 return subElements.get(0);
236 }
237
238 return new Element("span");
239 }
240
241 /**
242 * Process the given element into text (each line is a text paragraph and
243 * can be prepended with ">" signs to indicate a quote or sub-quote or
244 * sub-sub-quote...).
245 *
246 * @param element
247 * the element to process
248 * @param elementProcessor
249 * the element processor, must not be NULL
250 *
251 * @return text lines, each line is a paragraph
252 */
253 static protected List<String> toLines(Element element,
254 final ElementProcessor elementProcessor) {
255 final List<String> lines = new ArrayList<String>();
256 final StringBuilder currentLine = new StringBuilder();
257 final List<Integer> quoted = new ArrayList<Integer>();
258 final List<Node> ignoredNodes = new ArrayList<Node>();
259 final List<String> footnotes = new ArrayList<String>();
260
261 if (element != null) {
262 new NodeTraversor(new NodeVisitor() {
263 @Override
264 public void head(Node node, int depth) {
265 String manual = null;
266 boolean ignore = elementProcessor.ignoreNode(node)
267 || ignoredNodes.contains(node.parentNode());
268 if (!ignore) {
269 manual = elementProcessor.manualProcessing(node);
270 if (manual != null) {
271 currentLine.append(manual);
272 ignore = true;
273 }
274 }
275
276 if (ignore) {
277 ignoredNodes.add(node);
278 return;
279 }
280
281 String prep = "";
282 for (int i = 0; i < quoted.size(); i++) {
283 prep += ">";
284 }
285 prep += " ";
286
287 boolean enterQuote = elementProcessor.detectQuote(node);
288 boolean leaveQuote = quoted.contains(depth);
289
290 if (enterQuote) {
291 quoted.add(depth);
292 }
293
294 if (leaveQuote) {
295 quoted.remove(Integer.valueOf(depth));
296 }
297
298 if (enterQuote || leaveQuote) {
299 if (currentLine.length() > 0) {
300 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
301 currentLine.setLength(currentLine.length() - 1);
302 }
303 for (String l : currentLine.toString().split("\n")) {
304 lines.add(prep + l);
305 }
306 }
307 currentLine.setLength(0);
308 }
309
310 if (node instanceof Element) {
311 Element element = (Element) node;
312 boolean block = element.isBlock()
313 || element.tagName().equalsIgnoreCase("br");
314 if (block && currentLine.length() > 0) {
315 currentLine.append("\n");
316 }
317
318 if (!element.absUrl("href").trim().isEmpty()) {
319 footnotes.add(element.absUrl("href"));
320 currentLine.append("[" + footnotes.size() + "]");
321 }
322 } else if (node instanceof TextNode) {
323 TextNode textNode = (TextNode) node;
324 String line = StringUtil.normaliseWhitespace(textNode
325 .getWholeText());
326
327 currentLine.append(elementProcessor.processText(line));
328 currentLine.append(" ");
329 }
330 }
331
332 @Override
333 public void tail(Node node, int depth) {
334 }
335 }).traverse(element);
336 }
337
338 if (currentLine.length() > 0) {
339 String prep = "";
340 for (int i = 0; i < quoted.size(); i++) {
341 prep += ">";
342 }
343 prep += " ";
344 if (currentLine.length() > 0) {
345 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
346 currentLine.setLength(currentLine.length() - 1);
347 }
348 for (String l : currentLine.toString().split("\n")) {
349 lines.add(prep + l);
350 }
351 }
352 }
353
354 for (int i = 0; i < lines.size(); i++) {
355 lines.set(i, lines.get(i).replace(" ", " ").trim());
356 }
357
358 if (footnotes.size() > 0) {
359 lines.add("");
360 lines.add("");
361 lines.add("");
362 lines.add("");
363 for (int i = 0; i < footnotes.size(); i++) {
364 lines.add("[" + (i + 1) + "] " + footnotes.get(i));
365 }
366 }
367
368 return lines;
369 }
370 }