New support: TooLinux
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.List;
6
7 import org.jsoup.helper.StringUtil;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.nodes.Node;
10 import org.jsoup.nodes.TextNode;
11 import org.jsoup.select.Elements;
12 import org.jsoup.select.NodeTraversor;
13 import org.jsoup.select.NodeVisitor;
14
15 import be.nikiroo.gofetch.data.Story;
16 import be.nikiroo.utils.Downloader;
17
18 public abstract class BasicSupport {
19 protected static Downloader downloader = new Downloader("gofetcher");
20
/**
 * The kinds of support known to this class.
 * <p>
 * The declaration order is kept as is, since ordinals depend on it.
 */
public enum Type {
    SLASHDOT,
    PIPEDOT,
    LWN,
    LEMONDE,
    REGISTER,
    TOOLINUX
}
24
25 /**
26 * Used to process an element into lines.
27 *
28 * @author niki
29 */
30 public interface ElementProcessor {
31 /**
32 * Detect if this node is a quote and should be trated as such.
33 *
34 * @param node
35 * the node to check
36 * @return TRUE if it is
37 */
38 public boolean detectQuote(Node node);
39
40 /**
41 * Process text content (will be called on each text element, allowing
42 * you to modify it if needed).
43 *
44 * @param text
45 * the text to process
46 * @return
47 */
48 public String processText(String text);
49
50 /**
51 * Ignore this node.
52 *
53 * @param node
54 * the node to ignore
55 * @return TRUE if it has to be ignored
56 */
57 public boolean ignoreNode(Node node);
58
59 /**
60 * Manually process this node (and return the manual processing value)
61 * if so desired.
62 * <p>
63 * If the node is manually processed, it and its children will not be
64 * automatically processed.
65 *
66 * @param node
67 * the node to optionally process
68 *
69 * @return NULL if not processed (will thus be automatically processed
70 * as usual), a {@link String} (may be empty) if we process it
71 * manually -- the given {@link String} will be used instead of
72 * the usual automatic processing if not NULL
73 */
74 public String manualProcessing(Node node);
75 }
76
77 /**
78 * A default {@link ElementProcessor} (will not detect or process anything
79 * manually).
80 *
81 * @author niki
82 */
83 public class BasicElementProcessor implements ElementProcessor {
84 @Override
85 public boolean detectQuote(Node node) {
86 return false;
87 }
88
89 @Override
90 public String processText(String text) {
91 return text;
92 }
93
94 @Override
95 public boolean ignoreNode(Node node) {
96 return false;
97 }
98
99 @Override
100 public String manualProcessing(Node node) {
101 return null;
102 }
103 }
104
105 static private String preselector;
106
107 private Type type;
108
109 /**
110 * List all the recent items, but only assure the ID and internal URL to
111 * fetch it later on (until it has been fetched, the rest of the
112 * {@link Story} is not confirmed).
113 *
114 * @return the list of new stories
115 *
116 * @throws IOException
117 * in case of I/O
118 */
119 abstract public List<Story> list() throws IOException;
120
121 /**
122 * Fetch the full article content as well as all the comments associated to
123 * this {@link Story}, if any (can be empty, but not NULL).
124 *
125 * @param story
126 * the story to fetch the comments of
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 abstract public void fetch(Story story) throws IOException;
132
133 abstract public String getDescription();
134
135 public String getSelector() {
136 return getSelector(type);
137 }
138
139 public Type getType() {
140 return type;
141 }
142
143 protected void setType(Type type) {
144 this.type = type;
145 }
146
147 /**
148 * @param preselector
149 * the preselector to set
150 */
151 static public void setPreselector(String preselector) {
152 BasicSupport.preselector = preselector;
153 }
154
155 /**
156 * Return a {@link BasicSupport} that is compatible with the given
157 * {@link Type} if it exists (or NULL if not).
158 *
159 * @param type
160 * the type
161 *
162 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
163 */
164 static public BasicSupport getSupport(Type type) {
165 BasicSupport support = null;
166
167 if (type != null) {
168 switch (type) {
169 case SLASHDOT:
170 support = new Slashdot();
171 break;
172 case PIPEDOT:
173 support = new Pipedot();
174 break;
175 case LWN:
176 support = new LWN();
177 break;
178 case LEMONDE:
179 support = new LeMonde();
180 break;
181 case REGISTER:
182 support = new TheRegister();
183 break;
184 case TOOLINUX:
185 support = new TooLinux();
186 break;
187 }
188
189 if (support != null) {
190 support.setType(type);
191 }
192 }
193
194 return support;
195 }
196
197 static public String getSelector(Type type) {
198 return preselector + "/" + type + "/";
199 }
200
201 /**
202 * Get the first {@link Element} of the given class, or an empty span
203 * {@link Element} if none found.
204 *
205 * @param element
206 * the element to look in
207 * @param className
208 * the class to look for
209 *
210 * @return the value or an empty span {@link Element}
211 */
212 static protected Element firstOrEmpty(Element element, String className) {
213 Elements subElements = element.getElementsByClass(className);
214 if (subElements.size() > 0) {
215 return subElements.get(0);
216 }
217
218 return new Element("span");
219 }
220
221 /**
222 * Get the first {@link Element} of the given tag, or an empty span
223 * {@link Element} if none found.
224 *
225 * @param element
226 * the element to look in
227 * @param tagName
228 * the tag to look for
229 *
230 * @return the value or an empty span {@link Element}
231 */
232 static protected Element firstOrEmptyTag(Element element, String tagName) {
233 Elements subElements = element.getElementsByTag(tagName);
234 if (subElements.size() > 0) {
235 return subElements.get(0);
236 }
237
238 return new Element("span");
239 }
240
241 /**
242 * Process the given element into text (each line is a text paragraph and
243 * can be prepended with ">" signs to indicate a quote or sub-quote or
244 * sub-sub-quote...).
245 *
246 * @param element
247 * the element to process
248 * @param elementProcessor
249 * the element processor, must not be NULL
250 *
251 * @return text lines, each line is a paragraph
252 */
253 static protected List<String> toLines(Element element,
254 final ElementProcessor elementProcessor) {
255 final List<String> lines = new ArrayList<String>();
256 final StringBuilder currentLine = new StringBuilder();
257 final List<Integer> quoted = new ArrayList<Integer>();
258 final List<Node> ignoredNodes = new ArrayList<Node>();
259
260 if (element != null) {
261 new NodeTraversor(new NodeVisitor() {
262 @Override
263 public void head(Node node, int depth) {
264 String manual = null;
265 boolean ignore = elementProcessor.ignoreNode(node)
266 || ignoredNodes.contains(node.parentNode());
267 if (!ignore) {
268 manual = elementProcessor.manualProcessing(node);
269 if (manual != null) {
270 currentLine.append(manual);
271 ignore = true;
272 }
273 }
274
275 if (ignore) {
276 ignoredNodes.add(node);
277 return;
278 }
279
280 String prep = "";
281 for (int i = 0; i < quoted.size(); i++) {
282 prep += ">";
283 }
284 prep += " ";
285
286 boolean enterQuote = elementProcessor.detectQuote(node);
287 boolean leaveQuote = quoted.contains(depth);
288
289 if (enterQuote) {
290 quoted.add(depth);
291 }
292
293 if (leaveQuote) {
294 quoted.remove(Integer.valueOf(depth));
295 }
296
297 if (enterQuote || leaveQuote) {
298 if (currentLine.length() > 0) {
299 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
300 currentLine.setLength(currentLine.length() - 1);
301 }
302 for (String l : currentLine.toString().split("\n")) {
303 lines.add(prep + l);
304 }
305 }
306 currentLine.setLength(0);
307 }
308
309 if (node instanceof Element) {
310 Element element = (Element) node;
311 boolean block = element.isBlock()
312 || element.tagName().equalsIgnoreCase("br");
313 if (block && currentLine.length() > 0) {
314 currentLine.append("\n");
315 }
316 } else if (node instanceof TextNode) {
317 TextNode textNode = (TextNode) node;
318 String line = StringUtil.normaliseWhitespace(textNode
319 .getWholeText());
320
321 currentLine.append(elementProcessor.processText(line));
322 currentLine.append(" ");
323 }
324 }
325
326 @Override
327 public void tail(Node node, int depth) {
328 }
329 }).traverse(element);
330 }
331
332 if (currentLine.length() > 0) {
333 String prep = "";
334 for (int i = 0; i < quoted.size(); i++) {
335 prep += ">";
336 }
337 prep += " ";
338 if (currentLine.length() > 0) {
339 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
340 currentLine.setLength(currentLine.length() - 1);
341 }
342 for (String l : currentLine.toString().split("\n")) {
343 lines.add(prep + l);
344 }
345 }
346 }
347
348 for (int i = 0; i < lines.size(); i++) {
349 lines.set(i, lines.get(i).replace(" ", " ").trim());
350 }
351
352 return lines;
353 }
354 }