Add new supported site: The Register
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.List;
6
7 import org.jsoup.helper.StringUtil;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.nodes.Node;
10 import org.jsoup.nodes.TextNode;
11 import org.jsoup.select.Elements;
12 import org.jsoup.select.NodeTraversor;
13 import org.jsoup.select.NodeVisitor;
14
15 import be.nikiroo.gofetch.data.Story;
16 import be.nikiroo.utils.Downloader;
17
18 public abstract class BasicSupport {
19 protected static Downloader downloader = new Downloader("gofetcher");
20
/**
 * The kinds of supported sites; each constant is mapped to a concrete
 * support implementation by the factory method of the enclosing class.
 * <p>
 * The declaration order is kept as is, since ordinals depend on it.
 */
public enum Type {
    /** Slashdot. */
    SLASHDOT,
    /** Pipedot. */
    PIPEDOT,
    /** LWN. */
    LWN,
    /** Le Monde. */
    LEMONDE,
    /** The Register. */
    REGISTER,
}
24
25 /**
26 * Used to process an element into lines.
27 *
28 * @author niki
29 */
30 public interface ElementProcessor {
31 /**
32 * Detect if this node is a quote and should be trated as such.
33 *
34 * @param node
35 * the node to check
36 * @return TRUE if it is
37 */
38 public boolean detectQuote(Node node);
39
40 /**
41 * Process text content (will be called on each text element, allowing
42 * you to modify it if needed).
43 *
44 * @param text
45 * the text to process
46 * @return
47 */
48 public String processText(String text);
49
50 /**
51 * Ignore this node.
52 *
53 * @param node
54 * the node to ignore
55 * @return TRUE if it has to be ignored
56 */
57 public boolean ignoreNode(Node node);
58
59 /**
60 * Manually process this node (and return the manual processing value)
61 * if so desired.
62 * <p>
63 * If the node is manually processed, it and its children will not be
64 * automatically processed.
65 *
66 * @param node
67 * the node to optionally process
68 *
69 * @return NULL if not processed (will thus be automatically processed
70 * as usual), a {@link String} (may be empty) if we process it
71 * manually -- the given {@link String} will be used instead of
72 * the usual automatic processing if not NULL
73 */
74 public String manualProcessing(Node node);
75 }
76
77 /**
78 * A default {@link ElementProcessor} (will not detect or process anything
79 * manually).
80 *
81 * @author niki
82 */
83 public class BasicElementProcessor implements ElementProcessor {
84 @Override
85 public boolean detectQuote(Node node) {
86 return false;
87 }
88
89 @Override
90 public String processText(String text) {
91 return text;
92 }
93
94 @Override
95 public boolean ignoreNode(Node node) {
96 return false;
97 }
98
99 @Override
100 public String manualProcessing(Node node) {
101 return null;
102 }
103 }
104
105 static private String preselector;
106
107 private Type type;
108
109 /**
110 * List all the recent items, but only assure the ID and internal URL to
111 * fetch it later on (until it has been fetched, the rest of the
112 * {@link Story} is not confirmed).
113 *
114 * @return the list of new stories
115 *
116 * @throws IOException
117 * in case of I/O
118 */
119 abstract public List<Story> list() throws IOException;
120
121 /**
122 * Fetch the full article content as well as all the comments associated to
123 * this {@link Story}, if any (can be empty, but not NULL).
124 *
125 * @param story
126 * the story to fetch the comments of
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 abstract public void fetch(Story story) throws IOException;
132
133 abstract public String getDescription();
134
135 public String getSelector() {
136 return getSelector(type);
137 }
138
139 public Type getType() {
140 return type;
141 }
142
143 protected void setType(Type type) {
144 this.type = type;
145 }
146
147 /**
148 * @param preselector
149 * the preselector to set
150 */
151 static public void setPreselector(String preselector) {
152 BasicSupport.preselector = preselector;
153 }
154
155 /**
156 * Return a {@link BasicSupport} that is compatible with the given
157 * {@link Type} if it exists (or NULL if not).
158 *
159 * @param type
160 * the type
161 *
162 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
163 */
164 static public BasicSupport getSupport(Type type) {
165 BasicSupport support = null;
166
167 if (type != null) {
168 switch (type) {
169 case SLASHDOT:
170 support = new Slashdot();
171 break;
172 case PIPEDOT:
173 support = new Pipedot();
174 break;
175 case LWN:
176 support = new LWN();
177 break;
178 case LEMONDE:
179 support = new LeMonde();
180 break;
181 case REGISTER:
182 support = new TheRegister();
183 break;
184 }
185
186 if (support != null) {
187 support.setType(type);
188 }
189 }
190
191 return support;
192 }
193
194 static public String getSelector(Type type) {
195 return preselector + "/" + type + "/";
196 }
197
198 /**
199 * Get the first {@link Element} of the given class, or an empty span
200 * {@link Element} if none found.
201 *
202 * @param element
203 * the element to look in
204 * @param className
205 * the class to look for
206 *
207 * @return the value or an empty span {@link Element}
208 */
209 static protected Element firstOrEmpty(Element element, String className) {
210 Elements subElements = element.getElementsByClass(className);
211 if (subElements.size() > 0) {
212 return subElements.get(0);
213 }
214
215 return new Element("span");
216 }
217
218 /**
219 * Get the first {@link Element} of the given tag, or an empty span
220 * {@link Element} if none found.
221 *
222 * @param element
223 * the element to look in
224 * @param tagName
225 * the tag to look for
226 *
227 * @return the value or an empty span {@link Element}
228 */
229 static protected Element firstOrEmptyTag(Element element, String tagName) {
230 Elements subElements = element.getElementsByTag(tagName);
231 if (subElements.size() > 0) {
232 return subElements.get(0);
233 }
234
235 return new Element("span");
236 }
237
238 /**
239 * Process the given element into text (each line is a text paragraph and
240 * can be prepended with ">" signs to indicate a quote or sub-quote or
241 * sub-sub-quote...).
242 *
243 * @param element
244 * the element to process
245 * @param elementProcessor
246 * the element processor, must not be NULL
247 *
248 * @return text lines, each line is a paragraph
249 */
250 static protected List<String> toLines(Element element,
251 final ElementProcessor elementProcessor) {
252 final List<String> lines = new ArrayList<String>();
253 final StringBuilder currentLine = new StringBuilder();
254 final List<Integer> quoted = new ArrayList<Integer>();
255 final List<Node> ignoredNodes = new ArrayList<Node>();
256
257 if (element != null) {
258 new NodeTraversor(new NodeVisitor() {
259 @Override
260 public void head(Node node, int depth) {
261 String manual = null;
262 boolean ignore = elementProcessor.ignoreNode(node)
263 || ignoredNodes.contains(node.parentNode());
264 if (!ignore) {
265 manual = elementProcessor.manualProcessing(node);
266 if (manual != null) {
267 currentLine.append(manual);
268 ignore = true;
269 }
270 }
271
272 if (ignore) {
273 ignoredNodes.add(node);
274 return;
275 }
276
277 String prep = "";
278 for (int i = 0; i < quoted.size(); i++) {
279 prep += ">";
280 }
281 prep += " ";
282
283 boolean enterQuote = elementProcessor.detectQuote(node);
284 boolean leaveQuote = quoted.contains(depth);
285
286 if (enterQuote) {
287 quoted.add(depth);
288 }
289
290 if (leaveQuote) {
291 quoted.remove(Integer.valueOf(depth));
292 }
293
294 if (enterQuote || leaveQuote) {
295 if (currentLine.length() > 0) {
296 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
297 currentLine.setLength(currentLine.length() - 1);
298 }
299 for (String l : currentLine.toString().split("\n")) {
300 lines.add(prep + l);
301 }
302 }
303 currentLine.setLength(0);
304 }
305
306 if (node instanceof Element) {
307 Element element = (Element) node;
308 boolean block = element.isBlock()
309 || element.tagName().equalsIgnoreCase("br");
310 if (block && currentLine.length() > 0) {
311 currentLine.append("\n");
312 }
313 } else if (node instanceof TextNode) {
314 TextNode textNode = (TextNode) node;
315 String line = StringUtil.normaliseWhitespace(textNode
316 .getWholeText());
317
318 currentLine.append(elementProcessor.processText(line));
319 currentLine.append(" ");
320 }
321 }
322
323 @Override
324 public void tail(Node node, int depth) {
325 }
326 }).traverse(element);
327 }
328
329 if (currentLine.length() > 0) {
330 String prep = "";
331 for (int i = 0; i < quoted.size(); i++) {
332 prep += ">";
333 }
334 prep += " ";
335 if (currentLine.length() > 0) {
336 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
337 currentLine.setLength(currentLine.length() - 1);
338 }
339 for (String l : currentLine.toString().split("\n")) {
340 lines.add(prep + l);
341 }
342 }
343 }
344
345 for (int i = 0; i < lines.size(); i++) {
346 lines.set(i, lines.get(i).replace(" ", " ").trim());
347 }
348
349 return lines;
350 }
351 }