1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.util
.ArrayList
;
7 import org
.jsoup
.helper
.StringUtil
;
8 import org
.jsoup
.nodes
.Element
;
9 import org
.jsoup
.nodes
.Node
;
10 import org
.jsoup
.nodes
.TextNode
;
11 import org
.jsoup
.select
.Elements
;
12 import org
.jsoup
.select
.NodeTraversor
;
13 import org
.jsoup
.select
.NodeVisitor
;
15 import be
.nikiroo
.gofetch
.data
.Story
;
16 import be
.nikiroo
.utils
.Downloader
;
18 public abstract class BasicSupport
{
// Single Downloader instance held in a static field, visible to subclasses.
// NOTE(review): the "gofetcher" argument is presumably a user-agent or
// client name -- confirm against the Downloader constructor.
19 protected static Downloader downloader
= new Downloader("gofetcher");
// NOTE(review): list of constants whose declaration header is not visible in
// this view; presumably the values of the Type enum that getSupport(Type)
// and getSelector(Type) receive -- confirm against the full file. The list
// ends on a trailing comma, so more constants may follow.
22 SLASHDOT
, PIPEDOT
, LWN
, LEMONDE
, REGISTER
, TOOLINUX
,
26 * Used to process an element into lines.
30 public interface ElementProcessor
{
32 * Detect if this node is a quote and should be trated as such.
36 * @return TRUE if it is
38 public boolean detectQuote(Node node
);
41 * Process text content (will be called on each text element, allowing
42 * you to modify it if needed).
48 public String
processText(String text
);
55 * @return TRUE if it has to be ignored
57 public boolean ignoreNode(Node node
);
60 * Manually process this node (and return the manual processing value)
63 * If the node is manually processed, it and its children will not be
64 * automatically processed.
67 * the node to optionally process
69 * @return NULL if not processed (will thus be automatically processed
70 * as usual), a {@link String} (may be empty) if we process it
71 * manually -- the given {@link String} will be used instead of
72 * the usual automatic processing if not NULL
74 public String
manualProcessing(Node node
);
// Default implementation of the ElementProcessor callbacks.
// NOTE(review): none of the four method bodies is visible in this view, so
// the values they return are unconfirmed; the description below only states
// that nothing is detected or processed.
78 * A default {@link ElementProcessor} (will not detect or process anything
83 public class BasicElementProcessor
implements ElementProcessor
{
// ElementProcessor callback: quote detection.
85 public boolean detectQuote(Node node
) {
// ElementProcessor callback: text processing.
90 public String
processText(String text
) {
// ElementProcessor callback: node filtering.
95 public boolean ignoreNode(Node node
) {
// ElementProcessor callback: manual processing of a node.
100 public String
manualProcessing(Node node
) {
// Prefix placed in front of every selector built by getSelector(Type);
// assigned through setPreselector(String).
105 static private String preselector
;
110 * List all the recent items, but only assure the ID and internal URL to
111 * fetch it later on (until it has been fetched, the rest of the
112 * {@link Story} is not confirmed).
114 * @return the list of new stories
116 * @throws IOException
119 abstract public List
<Story
> list() throws IOException
;
122 * Fetch the full article content as well as all the comments associated to
123 * this {@link Story}, if any (can be empty, but not NULL).
126 * the story to fetch the comments of
128 * @throws IOException
129 * in case of I/O error
131 abstract public void fetch(Story story
) throws IOException
;
133 abstract public String
getDescription();
135 public String
getSelector() {
136 return getSelector(type
);
// Public accessor returning a Type.
// NOTE(review): the body is not visible in this view; presumably it returns
// the type field that getSelector() reads -- confirm.
139 public Type
getType() {
// Protected mutator taking a Type; getSupport(Type) calls it on the support
// it has just created.
// NOTE(review): the body is not visible in this view; presumably it stores
// the value in the type field -- confirm.
143 protected void setType(Type type
) {
149 * the preselector to set
151 static public void setPreselector(String preselector
) {
152 BasicSupport
.preselector
= preselector
;
// Factory method: creates the BasicSupport matching the given Type.
// NOTE(review): the branching that selects exactly one of the assignments
// below is not visible in this view (presumably a switch on the type), and
// no assignment is visible for some of the known types -- confirm against
// the full file.
156 * Return a {@link BasicSupport} that is compatible with the given
157 * {@link Type} if it exists (or NULL if not).
162 * @return a compatible {@link BasicSupport} if it exists (or NULL if not)
164 static public BasicSupport
getSupport(Type type
) {
// Starts as null, which is also the documented result when nothing matches.
165 BasicSupport support
= null;
170 support
= new Slashdot();
173 support
= new Pipedot();
179 support
= new LeMonde();
182 support
= new TheRegister();
185 support
= new TooLinux();
// Only a support that was actually created gets the requested type recorded.
189 if (support
!= null) {
190 support
.setType(type
);
197 static public String
getSelector(Type type
) {
198 return preselector
+ "/" + type
+ "/";
202 * Get the first {@link Element} of the given class, or an empty span
203 * {@link Element} if none found.
206 * the element to look in
208 * the class to look for
210 * @return the value or an empty span {@link Element}
212 static protected Element
firstOrEmpty(Element element
, String className
) {
213 Elements subElements
= element
.getElementsByClass(className
);
214 if (subElements
.size() > 0) {
215 return subElements
.get(0);
218 return new Element("span");
222 * Get the first {@link Element} of the given tag, or an empty span
223 * {@link Element} if none found.
226 * the element to look in
228 * the tag to look for
230 * @return the value or an empty span {@link Element}
232 static protected Element
firstOrEmptyTag(Element element
, String tagName
) {
233 Elements subElements
= element
.getElementsByTag(tagName
);
234 if (subElements
.size() > 0) {
235 return subElements
.get(0);
238 return new Element("span");
242 * Process the given element into text (each line is a text paragraph and
243 * can be prepended with ">" signs to indicate a quote or sub-quote or
247 * the element to process
248 * @param elementProcessor
249 * the element processor, must not be NULL
251 * @return text lines, each line is a paragraph
253 static protected List
<String
> toLines(Element element
,
254 final ElementProcessor elementProcessor
) {
255 final List
<String
> lines
= new ArrayList
<String
>();
256 final StringBuilder currentLine
= new StringBuilder();
257 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
258 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
259 final List
<String
> footnotes
= new ArrayList
<String
>();
261 if (element
!= null) {
262 new NodeTraversor(new NodeVisitor() {
264 public void head(Node node
, int depth
) {
265 String manual
= null;
266 boolean ignore
= elementProcessor
.ignoreNode(node
)
267 || ignoredNodes
.contains(node
.parentNode());
269 manual
= elementProcessor
.manualProcessing(node
);
270 if (manual
!= null) {
271 currentLine
.append(manual
);
277 ignoredNodes
.add(node
);
282 for (int i
= 0; i
< quoted
.size(); i
++) {
287 boolean enterQuote
= elementProcessor
.detectQuote(node
);
288 boolean leaveQuote
= quoted
.contains(depth
);
295 quoted
.remove(Integer
.valueOf(depth
));
298 if (enterQuote
|| leaveQuote
) {
299 if (currentLine
.length() > 0) {
300 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
301 currentLine
.setLength(currentLine
.length() - 1);
303 for (String l
: currentLine
.toString().split("\n")) {
307 currentLine
.setLength(0);
310 if (node
instanceof Element
) {
311 Element element
= (Element
) node
;
312 boolean block
= element
.isBlock()
313 || element
.tagName().equalsIgnoreCase("br");
314 if (block
&& currentLine
.length() > 0) {
315 currentLine
.append("\n");
318 if (!element
.absUrl("href").trim().isEmpty()) {
319 footnotes
.add(element
.absUrl("href"));
320 currentLine
.append("[" + footnotes
.size() + "]");
322 } else if (node
instanceof TextNode
) {
323 TextNode textNode
= (TextNode
) node
;
324 String line
= StringUtil
.normaliseWhitespace(textNode
327 currentLine
.append(elementProcessor
.processText(line
));
328 currentLine
.append(" ");
333 public void tail(Node node
, int depth
) {
335 }).traverse(element
);
338 if (currentLine
.length() > 0) {
340 for (int i
= 0; i
< quoted
.size(); i
++) {
344 if (currentLine
.length() > 0) {
345 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
346 currentLine
.setLength(currentLine
.length() - 1);
348 for (String l
: currentLine
.toString().split("\n")) {
354 for (int i
= 0; i
< lines
.size(); i
++) {
355 lines
.set(i
, lines
.get(i
).replace(" ", " ").trim());
358 if (footnotes
.size() > 0) {
363 for (int i
= 0; i
< footnotes
.size(); i
++) {
364 lines
.add("[" + (i
+ 1) + "] " + footnotes
.get(i
));