1 package be
.nikiroo
.gofetch
.support
;
3 import java
.io
.IOException
;
4 import java
.io
.InputStream
;
6 import java
.net
.URLConnection
;
7 import java
.util
.ArrayList
;
9 import java
.util
.zip
.GZIPInputStream
;
11 import org
.jsoup
.helper
.StringUtil
;
12 import org
.jsoup
.nodes
.Element
;
13 import org
.jsoup
.nodes
.Node
;
14 import org
.jsoup
.nodes
.TextNode
;
15 import org
.jsoup
.select
.Elements
;
16 import org
.jsoup
.select
.NodeTraversor
;
17 import org
.jsoup
.select
.NodeVisitor
;
19 import be
.nikiroo
.gofetch
.data
.Story
;
21 public abstract class BasicSupport
{
23 SLASHDOT
, PIPEDOT
, LWN
, LEMONDE
,
26 public interface QuoteProcessor
{
27 public boolean detectQuote(Node node
);
29 public String
processText(String text
);
31 public boolean ignoreNode(Node node
);
34 * Manually process this node if so desired.
37 * the node to optionally process
39 * @return NULL if not processed, a {@link String} (may be empty) if we
40 * must not process it any further
42 public String
manualProcessing(Node node
);
45 static private String preselector
;
50 * List all the recent items, but only assure the ID and internal URL to
51 * fetch it later on (until it has been fetched, the rest of the
52 * {@link Story} is not confirmed).
54 * @return the list of new stories
59 abstract public List
<Story
> list() throws IOException
;
62 * Fetch the full article content as well as all the comments associated to
63 * this {@link Story}, if any (can be empty, but not NULL).
66 * the story to fetch the comments of
69 * in case of I/O error
71 abstract public void fetch(Story story
) throws IOException
;
73 abstract public String
getDescription();
75 public String
getSelector() {
76 return getSelector(type
);
79 public Type
getType() {
83 protected void setType(Type type
) {
89 * the preselector to set
91 static public void setPreselector(String preselector
) {
92 BasicSupport
.preselector
= preselector
;
95 static public BasicSupport
getSupport(Type type
) {
96 BasicSupport support
= null;
101 support
= new Slashdot();
104 support
= new Pipedot();
110 support
= new LeMonde();
114 if (support
!= null) {
115 support
.setType(type
);
122 static public String
getSelector(Type type
) {
123 return preselector
+ "/" + type
+ "/";
126 // TODO: check Downloader.java?
127 static protected InputStream
open(URL url
) throws IOException
{
128 URLConnection conn
= url
.openConnection();
130 InputStream in
= conn
.getInputStream();
131 if ("gzip".equals(conn
.getContentEncoding())) {
132 in
= new GZIPInputStream(in
);
139 * Get the first {@link Element} of the given class, or an empty span
140 * {@link Element} if none found.
143 * the element to look in
145 * the class to look for
147 * @return the value or an empty span {@link Element}
149 static protected Element
firstOrEmpty(Element element
, String className
) {
150 Elements subElements
= element
.getElementsByClass(className
);
151 if (subElements
.size() > 0) {
152 return subElements
.get(0);
155 return new Element("span");
159 * Get the first {@link Element} of the given tag, or an empty span
160 * {@link Element} if none found.
163 * the element to look in
165 * the tag to look for
167 * @return the value or an empty span {@link Element}
169 static protected Element
firstOrEmptyTag(Element element
, String tagName
) {
170 Elements subElements
= element
.getElementsByTag(tagName
);
171 if (subElements
.size() > 0) {
172 return subElements
.get(0);
175 return new Element("span");
178 static protected List
<String
> toLines(Element element
,
179 final QuoteProcessor quoteProcessor
) {
180 final List
<String
> lines
= new ArrayList
<String
>();
181 final StringBuilder currentLine
= new StringBuilder();
182 final List
<Integer
> quoted
= new ArrayList
<Integer
>();
183 final List
<Node
> ignoredNodes
= new ArrayList
<Node
>();
185 if (element
!= null) {
186 new NodeTraversor(new NodeVisitor() {
188 public void head(Node node
, int depth
) {
189 String manual
= null;
190 boolean ignore
= quoteProcessor
.ignoreNode(node
)
191 || ignoredNodes
.contains(node
.parentNode());
193 manual
= quoteProcessor
.manualProcessing(node
);
194 if (manual
!= null) {
195 currentLine
.append(manual
);
201 ignoredNodes
.add(node
);
206 for (int i
= 0; i
< quoted
.size(); i
++) {
211 boolean enterQuote
= quoteProcessor
.detectQuote(node
);
212 boolean leaveQuote
= quoted
.contains(depth
);
219 quoted
.remove(Integer
.valueOf(depth
));
222 if (enterQuote
|| leaveQuote
) {
223 if (currentLine
.length() > 0) {
224 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
225 currentLine
.setLength(currentLine
.length() - 1);
227 for (String l
: currentLine
.toString().split("\n")) {
231 currentLine
.setLength(0);
234 if (node
instanceof Element
) {
235 Element element
= (Element
) node
;
236 boolean block
= element
.isBlock()
237 || element
.tagName().equalsIgnoreCase("br");
238 if (block
&& currentLine
.length() > 0) {
239 currentLine
.append("\n");
241 } else if (node
instanceof TextNode
) {
242 TextNode textNode
= (TextNode
) node
;
243 String line
= StringUtil
.normaliseWhitespace(textNode
246 currentLine
.append(quoteProcessor
.processText(line
));
247 currentLine
.append(" ");
252 public void tail(Node node
, int depth
) {
254 }).traverse(element
);
257 if (currentLine
.length() > 0) {
259 for (int i
= 0; i
< quoted
.size(); i
++) {
263 if (currentLine
.length() > 0) {
264 if (currentLine
.charAt(currentLine
.length() - 1) == '\n') {
265 currentLine
.setLength(currentLine
.length() - 1);
267 for (String l
: currentLine
.toString().split("\n")) {
273 for (int i
= 0; i
< lines
.size(); i
++) {
274 lines
.set(i
, lines
.get(i
).replace(" ", " ").trim());