615c72d6b0cbd81c4f59d8472878ea07ef4b730a
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
1 package be.nikiroo.gofetch.support;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.List;
6
7 import org.jsoup.helper.StringUtil;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.nodes.Node;
10 import org.jsoup.nodes.TextNode;
11 import org.jsoup.select.Elements;
12 import org.jsoup.select.NodeTraversor;
13 import org.jsoup.select.NodeVisitor;
14
15 import be.nikiroo.gofetch.data.Story;
16 import be.nikiroo.utils.Downloader;
17
18 public abstract class BasicSupport {
19 protected static Downloader downloader = new Downloader("gofetcher");
20
21 public enum Type {
22 SLASHDOT, PIPEDOT, LWN, LEMONDE,
23 }
24
25 /**
26 * Used to process an element into lines.
27 *
28 * @author niki
29 */
30 public interface ElementProcessor {
31 /**
32 * Detect if this node is a quote and should be trated as such.
33 *
34 * @param node
35 * the node to check
36 * @return TRUE if it is
37 */
38 public boolean detectQuote(Node node);
39
40 /**
41 * Process text content (will be called on each text element, allowing
42 * you to modify it if needed).
43 *
44 * @param text
45 * the text to process
46 * @return
47 */
48 public String processText(String text);
49
50 /**
51 * Ignore this node.
52 *
53 * @param node
54 * the node to ignore
55 * @return TRUE if it has to be ignored
56 */
57 public boolean ignoreNode(Node node);
58
59 /**
60 * Manually process this node (and return the manual processing value)
61 * if so desired.
62 * <p>
63 * If the node is manually processed, it and its children will not be
64 * automatically processed.
65 *
66 * @param node
67 * the node to optionally process
68 *
69 * @return NULL if not processed (will thus be automatically processed
70 * as usual), a {@link String} (may be empty) if we process it
71 * manually -- the given {@link String} will be used instead of
72 * the usual automatic processing if not NULL
73 */
74 public String manualProcessing(Node node);
75 }
76
77 /**
78 * A default {@link ElementProcessor} (will not detect or process anything
79 * manually).
80 *
81 * @author niki
82 */
83 public class BasicElementProcessor implements ElementProcessor {
84 @Override
85 public boolean detectQuote(Node node) {
86 return false;
87 }
88
89 @Override
90 public String processText(String text) {
91 return text;
92 }
93
94 @Override
95 public boolean ignoreNode(Node node) {
96 return false;
97 }
98
99 @Override
100 public String manualProcessing(Node node) {
101 return null;
102 }
103 }
104
105 static private String preselector;
106
107 private Type type;
108
109 /**
110 * List all the recent items, but only assure the ID and internal URL to
111 * fetch it later on (until it has been fetched, the rest of the
112 * {@link Story} is not confirmed).
113 *
114 * @return the list of new stories
115 *
116 * @throws IOException
117 * in case of I/O
118 */
119 abstract public List<Story> list() throws IOException;
120
121 /**
122 * Fetch the full article content as well as all the comments associated to
123 * this {@link Story}, if any (can be empty, but not NULL).
124 *
125 * @param story
126 * the story to fetch the comments of
127 *
128 * @throws IOException
129 * in case of I/O error
130 */
131 abstract public void fetch(Story story) throws IOException;
132
133 abstract public String getDescription();
134
135 public String getSelector() {
136 return getSelector(type);
137 }
138
139 public Type getType() {
140 return type;
141 }
142
143 protected void setType(Type type) {
144 this.type = type;
145 }
146
147 /**
148 * @param preselector
149 * the preselector to set
150 */
151 static public void setPreselector(String preselector) {
152 BasicSupport.preselector = preselector;
153 }
154
155 /**
156 * Return a {@link BasicSupport} that is compatible with the given
157 * {@link Type} if it exists (or NULL if not).
158 *
159 * @param type
160 * the type
161 *
162 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
163 */
164 static public BasicSupport getSupport(Type type) {
165 BasicSupport support = null;
166
167 if (type != null) {
168 switch (type) {
169 case SLASHDOT:
170 support = new Slashdot();
171 break;
172 case PIPEDOT:
173 support = new Pipedot();
174 break;
175 case LWN:
176 support = new LWN();
177 break;
178 case LEMONDE:
179 support = new LeMonde();
180 break;
181 }
182
183 if (support != null) {
184 support.setType(type);
185 }
186 }
187
188 return support;
189 }
190
191 static public String getSelector(Type type) {
192 return preselector + "/" + type + "/";
193 }
194
195 /**
196 * Get the first {@link Element} of the given class, or an empty span
197 * {@link Element} if none found.
198 *
199 * @param element
200 * the element to look in
201 * @param className
202 * the class to look for
203 *
204 * @return the value or an empty span {@link Element}
205 */
206 static protected Element firstOrEmpty(Element element, String className) {
207 Elements subElements = element.getElementsByClass(className);
208 if (subElements.size() > 0) {
209 return subElements.get(0);
210 }
211
212 return new Element("span");
213 }
214
215 /**
216 * Get the first {@link Element} of the given tag, or an empty span
217 * {@link Element} if none found.
218 *
219 * @param element
220 * the element to look in
221 * @param tagName
222 * the tag to look for
223 *
224 * @return the value or an empty span {@link Element}
225 */
226 static protected Element firstOrEmptyTag(Element element, String tagName) {
227 Elements subElements = element.getElementsByTag(tagName);
228 if (subElements.size() > 0) {
229 return subElements.get(0);
230 }
231
232 return new Element("span");
233 }
234
235 /**
236 * Process the given element into text (each line is a text paragraph and
237 * can be prepended with ">" signs to indicate a quote or sub-quote or
238 * sub-sub-quote...).
239 *
240 * @param element
241 * the element to process
242 * @param elementProcessor
243 * the element processor, must not be NULL
244 *
245 * @return text lines, each line is a paragraph
246 */
247 static protected List<String> toLines(Element element,
248 final ElementProcessor elementProcessor) {
249 final List<String> lines = new ArrayList<String>();
250 final StringBuilder currentLine = new StringBuilder();
251 final List<Integer> quoted = new ArrayList<Integer>();
252 final List<Node> ignoredNodes = new ArrayList<Node>();
253
254 if (element != null) {
255 new NodeTraversor(new NodeVisitor() {
256 @Override
257 public void head(Node node, int depth) {
258 String manual = null;
259 boolean ignore = elementProcessor.ignoreNode(node)
260 || ignoredNodes.contains(node.parentNode());
261 if (!ignore) {
262 manual = elementProcessor.manualProcessing(node);
263 if (manual != null) {
264 currentLine.append(manual);
265 ignore = true;
266 }
267 }
268
269 if (ignore) {
270 ignoredNodes.add(node);
271 return;
272 }
273
274 String prep = "";
275 for (int i = 0; i < quoted.size(); i++) {
276 prep += ">";
277 }
278 prep += " ";
279
280 boolean enterQuote = elementProcessor.detectQuote(node);
281 boolean leaveQuote = quoted.contains(depth);
282
283 if (enterQuote) {
284 quoted.add(depth);
285 }
286
287 if (leaveQuote) {
288 quoted.remove(Integer.valueOf(depth));
289 }
290
291 if (enterQuote || leaveQuote) {
292 if (currentLine.length() > 0) {
293 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
294 currentLine.setLength(currentLine.length() - 1);
295 }
296 for (String l : currentLine.toString().split("\n")) {
297 lines.add(prep + l);
298 }
299 }
300 currentLine.setLength(0);
301 }
302
303 if (node instanceof Element) {
304 Element element = (Element) node;
305 boolean block = element.isBlock()
306 || element.tagName().equalsIgnoreCase("br");
307 if (block && currentLine.length() > 0) {
308 currentLine.append("\n");
309 }
310 } else if (node instanceof TextNode) {
311 TextNode textNode = (TextNode) node;
312 String line = StringUtil.normaliseWhitespace(textNode
313 .getWholeText());
314
315 currentLine.append(elementProcessor.processText(line));
316 currentLine.append(" ");
317 }
318 }
319
320 @Override
321 public void tail(Node node, int depth) {
322 }
323 }).traverse(element);
324 }
325
326 if (currentLine.length() > 0) {
327 String prep = "";
328 for (int i = 0; i < quoted.size(); i++) {
329 prep += ">";
330 }
331 prep += " ";
332 if (currentLine.length() > 0) {
333 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
334 currentLine.setLength(currentLine.length() - 1);
335 }
336 for (String l : currentLine.toString().split("\n")) {
337 lines.add(prep + l);
338 }
339 }
340 }
341
342 for (int i = 0; i < lines.size(); i++) {
343 lines.set(i, lines.get(i).replace(" ", " ").trim());
344 }
345
346 return lines;
347 }
348 }