1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.net
.URLConnection
;
7 import java
.util
.ArrayList
;
9 import java
.util
.zip
.GZIPInputStream
;
11 import org
.jsoup
.helper
.StringUtil
;
12 import org
.jsoup
.nodes
.Element
;
13 import org
.jsoup
.nodes
.Node
;
14 import org
.jsoup
.nodes
.TextNode
;
15 import org
.jsoup
.select
.Elements
;
16 import org
.jsoup
.select
.NodeTraversor
;
17 import org
.jsoup
.select
.NodeVisitor
;
19 import be
.nikiroo
.gofetch
.data
.Story
;
21 public abstract class BasicSupport
{
23 SLASHDOT
, PIPEDOT
, LWN
, LEMONDE
,
27 * Used to process an element into lines.
31 public interface ElementProcessor
{
33 * Detect if this node is a quote and should be treated as such.
37 * @return TRUE if it is
39 public boolean detectQuote(Node node
);
42 * Process text content (will be called on each text element, allowing
43 * you to modify it if needed).
49 public String
processText(String text
);
56 * @return TRUE if it has to be ignored
58 public boolean ignoreNode(Node node
);
61 * Manually process this node (and return the manual processing value)
64 * If the node is manually processed, it and its children will not be
65 * automatically processed.
68 * the node to optionally process
70 * @return NULL if not processed (will thus be automatically processed
71 * as usual), a {@link String} (may be empty) if we process it
72 * manually -- the given {@link String} will be used instead of
73 * the usual automatic processing if not NULL
75 public String
manualProcessing(Node node
);
79 * A default {@link ElementProcessor} (will not detect or process anything
84 public class BasicElementProcessor
implements ElementProcessor
{
86 public boolean detectQuote(Node node
) {
91 public String
processText(String text
) {
96 public boolean ignoreNode(Node node
) {
101 public String
manualProcessing(Node node
) {
106 static private String preselector
;
111 * List all the recent items, but only ensure the ID and internal URL to
112 * fetch it later on (until it has been fetched, the rest of the
113 * {@link Story} is not confirmed).
115 * @return the list of new stories
117 * @throws IOException
120 abstract public List
<Story
> list() throws IOException
;
123 * Fetch the full article content as well as all the comments associated to
124 * this {@link Story}, if any (can be empty, but not NULL).
127 * the story to fetch the comments of
129 * @throws IOException
130 * in case of I/O error
132 abstract public void fetch(Story story
) throws IOException
;
134 abstract public String
getDescription();
136 public String
getSelector() {
137 return getSelector(type
);
140 public Type
getType() {
144 protected void setType(Type type
) {
150 * the preselector to set
152 static public void setPreselector(String preselector
) {
153 BasicSupport
.preselector
= preselector
;
157 * Return a {@link BasicSupport} that is compatible with the given
158 * {@link Type} if it exists (or NULL if not).
163 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
165 static public BasicSupport
getSupport(Type type
) {
166 BasicSupport support
= null;
171 support
= new Slashdot();
174 support
= new Pipedot();
180 support
= new LeMonde();
184 if (support
!= null) {
185 support
.setType(type
);
192 static public String
getSelector(Type type
) {
193 return preselector
+ "/" + type
+ "/";
196 // TODO: check Downloader.java?
197 static protected InputStream
open(URL url
) throws IOException
{
198 URLConnection conn
= url
.openConnection();
200 InputStream in
= conn
.getInputStream();
201 if ("gzip".equals(conn
.getContentEncoding())) {
202 in
= new GZIPInputStream(in
);
209 * Get the first {@link Element} of the given class, or an empty span
210 * {@link Element} if none found.
213 * the element to look in
215 * the class to look for
217 * @return the value or an empty span {@link Element}
219 static protected Element
firstOrEmpty(Element element
, String className
) {
220 Elements subElements
= element
.getElementsByClass(className
);
221 if (subElements
.size() > 0) {
222 return subElements
.get(0);
225 return new Element("span");
229 * Get the first {@link Element} of the given tag, or an empty span
230 * {@link Element} if none found.
233 * the element to look in
235 * the tag to look for
237 * @return the value or an empty span {@link Element}
239 static protected Element
firstOrEmptyTag(Element element
, String tagName
) {
240 Elements subElements
= element
.getElementsByTag(tagName
);
241 if (subElements
.size() > 0) {
242 return subElements
.get(0);
245 return new Element("span");
249 * Process the given element into text (each line is a text paragraph and
250 * can be prepended with ">" signs to indicate a quote or sub-quote or
254 * the element to process
255 * @param elementProcessor
256 * the element processor, must not be NULL
258 * @return text lines, each line is a paragraph
260 static protected List
<String
> toLines(Element element
,
261 final ElementProcessor elementProcessor
) {
262 final List
<String
> lines
= new ArrayList
<String
>();
263 final StringBuilder currentLine
= new StringBuilder();
264 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
265 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
267 if (element
!= null) {
268 new NodeTraversor(new NodeVisitor() {
270 public void head(Node node
, int depth
) {
271 String manual
= null;
272 boolean ignore
= elementProcessor
.ignoreNode(node
)
273 || ignoredNodes
.contains(node
.parentNode());
275 manual
= elementProcessor
.manualProcessing(node
);
276 if (manual
!= null) {
277 currentLine
.append(manual
);
283 ignoredNodes
.add(node
);
288 for (int i
= 0; i
< quoted
.size(); i
++) {
293 boolean enterQuote
= elementProcessor
.detectQuote(node
);
294 boolean leaveQuote
= quoted
.contains(depth
);
301 quoted
.remove(Integer
.valueOf(depth
));
304 if (enterQuote
|| leaveQuote
) {
305 if (currentLine
.length() > 0) {
306 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
307 currentLine
.setLength(currentLine
.length() - 1);
309 for (String l
: currentLine
.toString().split("\n")) {
313 currentLine
.setLength(0);
316 if (node
instanceof Element
) {
317 Element element
= (Element
) node
;
318 boolean block
= element
.isBlock()
319 || element
.tagName().equalsIgnoreCase("br");
320 if (block
&& currentLine
.length() > 0) {
321 currentLine
.append("\n");
323 } else if (node
instanceof TextNode
) {
324 TextNode textNode
= (TextNode
) node
;
325 String line
= StringUtil
.normaliseWhitespace(textNode
328 currentLine
.append(elementProcessor
.processText(line
));
329 currentLine
.append(" ");
334 public void tail(Node node
, int depth
) {
336 }).traverse(element
);
339 if (currentLine
.length() > 0) {
341 for (int i
= 0; i
< quoted
.size(); i
++) {
345 if (currentLine
.length() > 0) {
346 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
347 currentLine
.setLength(currentLine
.length() - 1);
349 for (String l
: currentLine
.toString().split("\n")) {
355 for (int i
= 0; i
< lines
.size(); i
++) {
356 lines
.set(i
, lines
.get(i
).replace(" ", " ").trim());