More jDoc, a new BasicElementProcessor
[gofetch.git] / src / be / nikiroo / gofetch / support / BasicSupport.java
CommitLineData
73785268
NR
1package be.nikiroo.gofetch.support;
2
3import java.io.IOException;
4import java.io.InputStream;
5import java.net.URL;
6import java.net.URLConnection;
27008a87 7import java.util.ArrayList;
73785268
NR
8import java.util.List;
9import java.util.zip.GZIPInputStream;
10
27008a87
NR
11import org.jsoup.helper.StringUtil;
12import org.jsoup.nodes.Element;
13import org.jsoup.nodes.Node;
14import org.jsoup.nodes.TextNode;
15import org.jsoup.select.Elements;
16import org.jsoup.select.NodeTraversor;
17import org.jsoup.select.NodeVisitor;
18
73785268
NR
19import be.nikiroo.gofetch.data.Story;
20
21public abstract class BasicSupport {
/**
 * The kinds of news source this class knows about.
 * <p>
 * Each value is mapped to a concrete implementation by
 * {@link BasicSupport#getSupport(Type)} and its name is part of the selector
 * built by {@link BasicSupport#getSelector(Type)}.
 */
public enum Type {
    /** Served by the {@code Slashdot} support class. */
    SLASHDOT,
    /** Served by the {@code Pipedot} support class. */
    PIPEDOT,
    /** Served by the {@code LWN} support class. */
    LWN,
    /** Served by the {@code LeMonde} support class. */
    LEMONDE
}
25
20217360
NR
26 /**
27 * Used to process an element into lines.
28 *
29 * @author niki
30 */
31 public interface ElementProcessor {
32 /**
33 * Detect if this node is a quote and should be trated as such.
34 *
35 * @param node
36 * the node to check
37 * @return TRUE if it is
38 */
27008a87
NR
39 public boolean detectQuote(Node node);
40
20217360
NR
41 /**
42 * Process text content (will be called on each text element, allowing
43 * you to modify it if needed).
44 *
45 * @param text
46 * the text to process
47 * @return
48 */
27008a87
NR
49 public String processText(String text);
50
20217360
NR
51 /**
52 * Ignore this node.
53 *
54 * @param node
55 * the node to ignore
56 * @return TRUE if it has to be ignored
57 */
27008a87 58 public boolean ignoreNode(Node node);
100a8395
NR
59
60 /**
20217360
NR
61 * Manually process this node (and return the manual processing value)
62 * if so desired.
63 * <p>
64 * If the node is manually processed, it and its children will not be
65 * automatically processed.
100a8395
NR
66 *
67 * @param node
68 * the node to optionally process
69 *
20217360
NR
70 * @return NULL if not processed (will thus be automatically processed
71 * as usual), a {@link String} (may be empty) if we process it
72 * manually -- the given {@link String} will be used instead of
73 * the usual automatic processing if not NULL
100a8395
NR
74 */
75 public String manualProcessing(Node node);
27008a87
NR
76 }
77
20217360
NR
78 /**
79 * A default {@link ElementProcessor} (will not detect or process anything
80 * manually).
81 *
82 * @author niki
83 */
84 public class BasicElementProcessor implements ElementProcessor {
85 @Override
86 public boolean detectQuote(Node node) {
87 return false;
88 }
89
90 @Override
91 public String processText(String text) {
92 return text;
93 }
94
95 @Override
96 public boolean ignoreNode(Node node) {
97 return false;
98 }
99
100 @Override
101 public String manualProcessing(Node node) {
102 return null;
103 }
104 }
105
73785268
NR
106 static private String preselector;
107
108 private Type type;
109
100a8395
NR
110 /**
111 * List all the recent items, but only assure the ID and internal URL to
112 * fetch it later on (until it has been fetched, the rest of the
113 * {@link Story} is not confirmed).
114 *
115 * @return the list of new stories
116 *
117 * @throws IOException
118 * in case of I/O
119 */
73785268
NR
120 abstract public List<Story> list() throws IOException;
121
5c056aad
NR
122 /**
123 * Fetch the full article content as well as all the comments associated to
124 * this {@link Story}, if any (can be empty, but not NULL).
125 *
126 * @param story
127 * the story to fetch the comments of
128 *
129 * @throws IOException
130 * in case of I/O error
131 */
132 abstract public void fetch(Story story) throws IOException;
73785268
NR
133
134 abstract public String getDescription();
2d95a873 135
73785268
NR
136 public String getSelector() {
137 return getSelector(type);
138 }
139
140 public Type getType() {
141 return type;
142 }
143
144 protected void setType(Type type) {
145 this.type = type;
146 }
147
148 /**
149 * @param preselector
150 * the preselector to set
151 */
152 static public void setPreselector(String preselector) {
153 BasicSupport.preselector = preselector;
154 }
155
20217360
NR
156 /**
157 * Return a {@link BasicSupport} that is compatible with the given
158 * {@link Type} if it exists (or NULL if not).
159 *
160 * @param type
161 * the type
162 *
163 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
164 */
73785268
NR
165 static public BasicSupport getSupport(Type type) {
166 BasicSupport support = null;
167
168 if (type != null) {
169 switch (type) {
170 case SLASHDOT:
171 support = new Slashdot();
172 break;
2d95a873
NR
173 case PIPEDOT:
174 support = new Pipedot();
175 break;
eaaeae39
NR
176 case LWN:
177 support = new LWN();
178 break;
100a8395
NR
179 case LEMONDE:
180 support = new LeMonde();
181 break;
73785268
NR
182 }
183
184 if (support != null) {
185 support.setType(type);
186 }
187 }
188
189 return support;
190 }
191
192 static public String getSelector(Type type) {
193 return preselector + "/" + type + "/";
194 }
195
// TODO: check Downloader.java?
/**
 * Open the given {@link URL} and return a stream on its content,
 * transparently decompressing it when the server declares a "gzip" content
 * encoding.
 * <p>
 * The caller is responsible for closing the returned stream.
 *
 * @param url
 *            the URL to open
 *
 * @return the (uncompressed) content stream
 *
 * @throws IOException
 *             in case of I/O error
 */
static protected InputStream open(URL url) throws IOException {
    URLConnection conn = url.openConnection();
    conn.connect();
    InputStream in = conn.getInputStream();

    // Content codings are case-insensitive, so "GZIP" must match too
    if ("gzip".equalsIgnoreCase(conn.getContentEncoding())) {
        try {
            in = new GZIPInputStream(in);
        } catch (IOException e) {
            // The GZIP header could not be read: do not leak the
            // underlying connection stream
            try {
                in.close();
            } catch (IOException ignored) {
                // the original error is the one worth reporting
            }
            throw e;
        }
    }

    return in;
}
27008a87
NR
207
208 /**
209 * Get the first {@link Element} of the given class, or an empty span
210 * {@link Element} if none found.
211 *
212 * @param element
213 * the element to look in
214 * @param className
215 * the class to look for
216 *
217 * @return the value or an empty span {@link Element}
218 */
219 static protected Element firstOrEmpty(Element element, String className) {
220 Elements subElements = element.getElementsByClass(className);
221 if (subElements.size() > 0) {
222 return subElements.get(0);
223 }
224
225 return new Element("span");
226 }
227
228 /**
229 * Get the first {@link Element} of the given tag, or an empty span
230 * {@link Element} if none found.
231 *
232 * @param element
233 * the element to look in
234 * @param tagName
235 * the tag to look for
236 *
237 * @return the value or an empty span {@link Element}
238 */
239 static protected Element firstOrEmptyTag(Element element, String tagName) {
240 Elements subElements = element.getElementsByTag(tagName);
241 if (subElements.size() > 0) {
242 return subElements.get(0);
243 }
244
245 return new Element("span");
246 }
247
20217360
NR
248 /**
249 * Process the given element into text (each line is a text paragraph and
250 * can be prepended with ">" signs to indicate a quote or sub-quote or
251 * sub-sub-quote...).
252 *
253 * @param element
254 * the element to process
255 * @param elementProcessor
256 * the element processor, must not be NULL
257 *
258 * @return text lines, each line is a paragraph
259 */
27008a87 260 static protected List<String> toLines(Element element,
20217360 261 final ElementProcessor elementProcessor) {
27008a87
NR
262 final List<String> lines = new ArrayList<String>();
263 final StringBuilder currentLine = new StringBuilder();
264 final List<Integer> quoted = new ArrayList<Integer>();
265 final List<Node> ignoredNodes = new ArrayList<Node>();
266
267 if (element != null) {
268 new NodeTraversor(new NodeVisitor() {
269 @Override
270 public void head(Node node, int depth) {
100a8395 271 String manual = null;
20217360 272 boolean ignore = elementProcessor.ignoreNode(node)
100a8395
NR
273 || ignoredNodes.contains(node.parentNode());
274 if (!ignore) {
20217360 275 manual = elementProcessor.manualProcessing(node);
100a8395
NR
276 if (manual != null) {
277 currentLine.append(manual);
278 ignore = true;
279 }
280 }
281
282 if (ignore) {
27008a87
NR
283 ignoredNodes.add(node);
284 return;
285 }
286
287 String prep = "";
288 for (int i = 0; i < quoted.size(); i++) {
289 prep += ">";
290 }
291 prep += " ";
292
20217360 293 boolean enterQuote = elementProcessor.detectQuote(node);
27008a87
NR
294 boolean leaveQuote = quoted.contains(depth);
295
296 if (enterQuote) {
297 quoted.add(depth);
298 }
299
300 if (leaveQuote) {
301 quoted.remove(Integer.valueOf(depth));
302 }
303
304 if (enterQuote || leaveQuote) {
305 if (currentLine.length() > 0) {
306 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
307 currentLine.setLength(currentLine.length() - 1);
308 }
309 for (String l : currentLine.toString().split("\n")) {
310 lines.add(prep + l);
311 }
312 }
313 currentLine.setLength(0);
314 }
315
316 if (node instanceof Element) {
317 Element element = (Element) node;
318 boolean block = element.isBlock()
319 || element.tagName().equalsIgnoreCase("br");
320 if (block && currentLine.length() > 0) {
321 currentLine.append("\n");
322 }
323 } else if (node instanceof TextNode) {
324 TextNode textNode = (TextNode) node;
325 String line = StringUtil.normaliseWhitespace(textNode
326 .getWholeText());
327
20217360 328 currentLine.append(elementProcessor.processText(line));
27008a87
NR
329 currentLine.append(" ");
330 }
331 }
332
333 @Override
334 public void tail(Node node, int depth) {
335 }
336 }).traverse(element);
337 }
338
339 if (currentLine.length() > 0) {
340 String prep = "";
341 for (int i = 0; i < quoted.size(); i++) {
342 prep += ">";
343 }
344 prep += " ";
345 if (currentLine.length() > 0) {
346 if (currentLine.charAt(currentLine.length() - 1) == '\n') {
347 currentLine.setLength(currentLine.length() - 1);
348 }
349 for (String l : currentLine.toString().split("\n")) {
350 lines.add(prep + l);
351 }
352 }
353 }
354
355 for (int i = 0; i < lines.size(); i++) {
356 lines.set(i, lines.get(i).replace(" ", " ").trim());
357 }
358
359 return lines;
360 }
73785268 361}