├── .gitignore
├── README
├── pom.xml
└── src
└── main
└── java
└── com
└── basistech
└── readability
├── AbstractPageReader.java
├── FilePageReader.java
├── HtmlPage.java
├── HttpPageReader.java
├── NekoJsoupParser.java
├── OffsetRange.java
├── PageCharsetDetector.java
├── PageInfo.java
├── PageLinkInfo.java
├── PageReadException.java
├── PageReader.java
├── Patterns.java
├── Readability.java
├── ReadabilityDriver.java
├── TikaCharsetDetector.java
└── XmlDataMap.java
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | target
4 | .settings
5 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Home for java readability.
2 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.basistech
5 | java-readability
6 | jar
7 | Java version of Readability
8 | 1-SNAPSHOT
9 |
10 | org.sonatype.oss
11 | oss-parent
12 | 5
13 |
14 | A port of the Arclabs readability code to java.
15 |
16 |
17 | The Apache Software License, Version 2.0
18 | http://www.apache.org/licenses/LICENSE-2.0.txt
19 | repo
20 |
21 |
22 |
23 | scm:git:git@github.com:basis-technology-corp/Java-readability.git
24 | git@github.com:basis-technology-corp/Java-readability.git
25 |
26 |
27 |
28 | bimargulies
29 | Benson Margulies
30 | bimargulies@gmail.com
31 |
32 |
33 |
34 | install
35 |
36 |
37 |
38 | org.apache.maven.plugins
39 | maven-gpg-plugin
40 | 1.1
41 |
42 |
43 | org.apache.maven.plugins
44 | maven-deploy-plugin
45 | 2.5
46 |
47 |
48 | org.apache.maven.plugins
49 | maven-release-plugin
50 | 2.1
51 |
52 | true
53 | release,github_release
54 | clean install
55 | deploy
56 | true
57 |
58 |
59 |
60 | org.apache.maven.plugins
61 | maven-compiler-plugin
62 |
63 | 1.6
64 | 1.6
65 | 256M
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 | release
75 |
76 |
77 |
78 | github_release
79 |
80 |
81 |
82 |
83 |
84 | org.slf4j
85 | slf4j-api
86 | 1.6.1
87 | jar
88 | compile
89 |
90 |
91 | commons-io
92 | commons-io
93 | 2.0.1
94 | jar
95 | compile
96 |
97 |
98 | org.jsoup
99 | jsoup
100 | 1.4.1
101 | jar
102 | compile
103 |
104 |
105 | net.sourceforge.nekohtml
106 | nekohtml
107 | 1.9.16
108 | jar
109 | compile
110 |
111 |
112 | org.apache.httpcomponents
113 | httpclient
114 | 4.0.3
115 |
116 |
117 | org.apache.tika
118 | tika-parsers
119 | 0.8
120 |
121 |
122 |
123 | xerces
124 | xercesImpl
125 | 2.9.1
126 | jar
127 | compile
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/AbstractPageReader.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 | package com.basistech.readability;
19 |
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.nio.charset.Charset;
23 |
24 | import org.apache.commons.io.IOUtils;
25 | import org.slf4j.Logger;
26 | import org.slf4j.LoggerFactory;
27 |
28 | public class AbstractPageReader {
29 | static final Logger LOG = LoggerFactory.getLogger(HttpPageReader.class);
30 | static final Charset UTF8 = Charset.forName("utf-8");
31 |
32 | private PageCharsetDetector charsetDetector;
33 | private Charset charset;
34 | private boolean serverReturnedEncoding;
35 | private boolean respectServerEncoding;
36 | private String detectedEncoding;
37 |
38 | protected String readContent(InputStream response, String forceEncoding) throws IOException {
39 | byte[] bytes = IOUtils.toByteArray(response);
40 | charset = null;
41 | String hint = null;
42 | if (forceEncoding != null) {
43 | serverReturnedEncoding = true;
44 | try {
45 | charset = Charset.forName(forceEncoding);
46 | hint = charset.name();
47 | } catch (Exception e) {
48 | //
49 | }
50 | }
51 | if (charsetDetector != null && !respectServerEncoding || charset == null) {
52 | String charsetName = charsetDetector.detect(bytes, hint);
53 | if (charsetName != null) {
54 | try {
55 | charset = Charset.forName(charsetName);
56 | detectedEncoding = charset.name();
57 | } catch (Exception e) {
58 | LOG.warn("Detected character set " + charsetName + " not supported");
59 | }
60 | }
61 | }
62 | if (charset == null) {
63 | LOG.warn("Defaulting to utf-8");
64 | charset = UTF8;
65 | }
66 | return new String(bytes, charset);
67 | }
68 |
69 | public PageCharsetDetector getCharsetDetector() {
70 | return charsetDetector;
71 | }
72 |
73 | public void setCharsetDetector(PageCharsetDetector charsetDetector) {
74 | this.charsetDetector = charsetDetector;
75 | }
76 |
77 | public Charset getCharset() {
78 | return charset;
79 | }
80 |
81 | public boolean isServerReturnedEncoding() {
82 | return serverReturnedEncoding;
83 | }
84 |
85 | public void setRespectServerEncoding(boolean respectServerEncoding) {
86 | this.respectServerEncoding = respectServerEncoding;
87 | }
88 |
89 | public boolean isRespectServerEncoding() {
90 | return respectServerEncoding;
91 | }
92 |
93 | public String getDetectedEncoding() {
94 | return detectedEncoding;
95 | }
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/FilePageReader.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | import java.io.File;
22 | import java.io.FileInputStream;
23 | import java.io.IOException;
24 |
25 | import org.apache.tika.io.IOUtils;
26 | import org.slf4j.Logger;
27 | import org.slf4j.LoggerFactory;
28 |
29 | /**
30 | *
31 | */
32 | public class FilePageReader extends AbstractPageReader implements PageReader {
33 | private static final Logger LOG = LoggerFactory.getLogger(FilePageReader.class);
34 |
35 | private File baseDirectory;
36 |
37 | /** {@inheritDoc} */
38 | @Override
39 | public String readPage(String url) throws PageReadException {
40 | int lastSlash = url.replace("\\", "/").lastIndexOf('/');
41 | File testFile = new File(baseDirectory, url.substring(lastSlash + 1));
42 | LOG.info("Reading " + testFile + " for " + url);
43 | FileInputStream fis = null;
44 | try {
45 | try {
46 | fis = new FileInputStream(testFile);
47 | return readContent(fis, null);
48 | } catch (IOException e) {
49 | throw new PageReadException("Failed to read " + url, e);
50 | }
51 | } finally {
52 | if (fis != null) {
53 | IOUtils.closeQuietly(fis);
54 | }
55 | }
56 | }
57 |
58 | public void setBaseDirectory(File baseDirectory) {
59 | this.baseDirectory = baseDirectory;
60 | }
61 |
62 | public File getBaseDirectory() {
63 | return baseDirectory;
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/HtmlPage.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 | package com.basistech.readability;
19 |
20 | import java.util.HashMap;
21 | import java.util.Map;
22 |
23 | import org.jsoup.nodes.Document;
24 | import org.jsoup.nodes.Element;
25 |
26 | public class HtmlPage extends XmlDataMap {
27 | public static final String KEY = "htmlPage";
28 | static Map elementActionMap;
29 | static {
30 | elementActionMap = new HashMap();
31 | elementActionMap.put("img", ElementAction.Alt);
32 | elementActionMap.put("applet", ElementAction.Alt);
33 | elementActionMap.put("area", ElementAction.Alt);
34 | elementActionMap.put("input", ElementAction.Alt);
35 | elementActionMap.put("script", ElementAction.Banned);
36 | elementActionMap.put("iframe", ElementAction.Banned);
37 | elementActionMap.put("style", ElementAction.Banned);
38 | elementActionMap.put("br", ElementAction.Whitespace);
39 | elementActionMap.put("p", ElementAction.Sentence);
40 | elementActionMap.put("hr", ElementAction.Sentence);
41 | elementActionMap.put("ul", ElementAction.Sentence);
42 | elementActionMap.put("h1", ElementAction.Sentence);
43 | elementActionMap.put("h2", ElementAction.Sentence);
44 | elementActionMap.put("h3", ElementAction.Sentence);
45 | elementActionMap.put("h4", ElementAction.Sentence);
46 | elementActionMap.put("h5", ElementAction.Sentence);
47 | elementActionMap.put("h6", ElementAction.Sentence);
48 | elementActionMap.put("pre", ElementAction.Sentence);
49 | elementActionMap.put("blockquote", ElementAction.Sentence);
50 | elementActionMap.put("title", ElementAction.Sentence);
51 | elementActionMap.put("div", ElementAction.Sentence);
52 | // hmm, span tags with CSS with certain properties? Hopeless.
53 | elementActionMap.put("center", ElementAction.Whitespace);
54 | elementActionMap.put("form", ElementAction.Sentence);
55 | elementActionMap.put("table", ElementAction.Sentence);
56 | elementActionMap.put("td", ElementAction.Sentence);
57 | elementActionMap.put("th", ElementAction.Sentence);
58 | elementActionMap.put("li", ElementAction.Sentence);
59 | elementActionMap.put("dir", ElementAction.Sentence);
60 | elementActionMap.put("menu", ElementAction.Sentence);
61 | elementActionMap.put("ol", ElementAction.Sentence);
62 | }
63 |
64 | // the data as formatted for RLP -- just the PC-DATA.
65 | private String pcData;
66 | private String mimeType;
67 |
68 | public HtmlPage() {
69 | super();
70 | }
71 |
72 | public void process(Document document) {
73 | Element body = document.body();
74 | if (body != null) { // page might have no body.
75 | process(body);
76 | pcData = pcDataBuffer.toString();
77 | }
78 | }
79 |
80 | public String getPcData() {
81 | return pcData;
82 | }
83 |
84 | @Override
85 | protected ElementAction classifyElement(Element element) {
86 | if (element.hasAttr("basisInline")) {
87 | return null;
88 | }
89 | return elementActionMap.get(element.tagName());
90 | }
91 |
92 | public String getMimeType() {
93 | return mimeType;
94 | }
95 |
96 | public void setMimeType(String mimeType) {
97 | this.mimeType = mimeType;
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/HttpPageReader.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 |
24 | import org.apache.http.HttpResponse;
25 | import org.apache.http.HttpStatus;
26 | import org.apache.http.client.methods.HttpGet;
27 | import org.apache.http.impl.client.DefaultHttpClient;
28 | import org.apache.http.params.BasicHttpParams;
29 | import org.apache.http.params.HttpConnectionParams;
30 | import org.apache.http.params.HttpParams;
31 | import org.apache.http.protocol.BasicHttpContext;
32 | import org.apache.http.protocol.HttpContext;
33 | import org.apache.http.util.EntityUtils;
34 | import org.slf4j.Logger;
35 | import org.slf4j.LoggerFactory;
36 |
37 | /**
38 | *
39 | */
40 | public class HttpPageReader extends AbstractPageReader implements PageReader {
41 | static final Logger LOG = LoggerFactory.getLogger(HttpPageReader.class);
42 |
43 | /** {@inheritDoc}*/
44 | @Override
45 | public String readPage(String url) throws PageReadException {
46 | LOG.info("Reading " + url);
47 | HttpParams httpParameters = new BasicHttpParams();
48 | // Set the timeout in milliseconds until a connection is established.
49 | int timeoutConnection = 3000;
50 | HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection);
51 | // Set the default socket timeout (SO_TIMEOUT)
52 | // in milliseconds which is the timeout for waiting for data.
53 | int timeoutSocket = 10000;
54 | HttpConnectionParams.setSoTimeout(httpParameters, timeoutSocket);
55 | DefaultHttpClient httpclient = new DefaultHttpClient(httpParameters);
56 | HttpContext localContext = new BasicHttpContext();
57 | HttpGet get = new HttpGet(url);
58 | InputStream response = null;
59 | HttpResponse httpResponse = null;
60 | try {
61 | try {
62 | httpResponse = httpclient.execute(get, localContext);
63 | int resp = httpResponse.getStatusLine().getStatusCode();
64 | if (HttpStatus.SC_OK != resp) {
65 | LOG.error("Download failed of " + url + " status " + resp + " " + httpResponse.getStatusLine().getReasonPhrase());
66 | return null;
67 | }
68 | String respCharset = EntityUtils.getContentCharSet(httpResponse.getEntity());
69 | return readContent(httpResponse.getEntity().getContent(), respCharset);
70 | } finally {
71 | if (response != null) {
72 | response.close();
73 | }
74 | if (httpResponse != null && httpResponse.getEntity() != null) {
75 | httpResponse.getEntity().consumeContent();
76 | }
77 |
78 | }
79 | } catch (IOException e) {
80 | LOG.error("Download failed of " + url, e);
81 | throw new PageReadException("Failed to read " + url, e);
82 | }
83 | }
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/NekoJsoupParser.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 | import java.io.StringReader;
24 |
25 | import org.xml.sax.Attributes;
26 | import org.xml.sax.ErrorHandler;
27 | import org.xml.sax.InputSource;
28 | import org.xml.sax.SAXException;
29 | import org.xml.sax.SAXParseException;
30 | import org.xml.sax.helpers.DefaultHandler;
31 |
32 | import org.cyberneko.html.parsers.SAXParser;
33 | import org.jsoup.Jsoup;
34 | import org.jsoup.nodes.Document;
35 | import org.jsoup.nodes.Element;
36 | import org.slf4j.Logger;
37 | import org.slf4j.LoggerFactory;
38 |
39 | /**
40 | * Due to bugs in the Jsoup parser, we want a class that uses Neko to do the parse.
41 | * The same trick could be played with JSoup.
42 | */
43 | public class NekoJsoupParser {
44 | private static final Logger LOG = LoggerFactory.getLogger(NekoJsoupParser.class);
45 |
46 | public NekoJsoupParser() {
47 | //
48 | }
49 |
50 | private final class LocalErrorHandler implements ErrorHandler {
51 | @Override
52 | public void error(SAXParseException e) throws SAXException {
53 | LOG.error("Parse error", e);
54 | throw e;
55 | }
56 |
57 | @Override
58 | public void fatalError(SAXParseException e) throws SAXException {
59 | LOG.error("Parse error", e);
60 | throw e;
61 | }
62 |
63 | @Override
64 | public void warning(SAXParseException e) throws SAXException {
65 | LOG.warn("Parse warning", e);
66 | }
67 | }
68 |
69 | private class Handler extends DefaultHandler {
70 | private Document document;
71 | private Element currentElement;
72 | private int depth;
73 | Handler(Document document) {
74 | this.document = document;
75 | }
76 | @Override
77 | public void characters(char[] data, int start, int length) throws SAXException {
78 | assert currentElement != null;
79 | currentElement.appendText(new String(data, start, length));
80 | }
81 | @Override
82 | public void endDocument() throws SAXException {
83 | assert depth == 0;
84 | }
85 | @Override
86 | public void endElement(String uri, String localName, String qname) throws SAXException {
87 | LOG.debug("end element " + qname);
88 | currentElement = currentElement.parent();
89 | depth--;
90 | }
91 | @Override
92 | public void ignorableWhitespace(char[] data, int start, int length) throws SAXException {
93 | characters(data, start, length);
94 | }
95 | @Override
96 | public void startDocument() throws SAXException {
97 | currentElement = document;
98 | }
99 | @Override
100 | public void startElement(String uri, String localName, String qname, Attributes attrs) throws SAXException {
101 | LOG.debug("start element " + qname + " " + depth);
102 | Element newElement;
103 | newElement = currentElement.appendElement(localName);
104 |
105 | for (int ax = 0; ax < attrs.getLength(); ax++) {
106 | String name = attrs.getQName(ax);
107 | String value = attrs.getValue(ax);
108 | newElement.attr(name, value);
109 | }
110 | currentElement = newElement;
111 | depth++;
112 | }
113 | }
114 |
115 | public Document parse(InputStream data, String baseUri) throws SAXException, IOException {
116 | InputSource source = new InputSource();
117 | source.setByteStream(data);
118 | SAXParser nekoParser = new SAXParser();
119 | Document document = new Document(baseUri);
120 | nekoParser.setContentHandler(new Handler(document));
121 | nekoParser.setErrorHandler(new LocalErrorHandler());
122 | nekoParser.parse(source);
123 | return document;
124 | }
125 |
126 | public Document parse(String data, String baseUri) throws SAXException, IOException {
127 | InputSource source = new InputSource();
128 | source.setCharacterStream(new StringReader(data));
129 | SAXParser nekoParser = new SAXParser();
130 | Document document = new Document(baseUri);
131 | nekoParser.setContentHandler(new Handler(document));
132 | nekoParser.setErrorHandler(new LocalErrorHandler());
133 | nekoParser.parse(source);
134 | return document;
135 | }
136 |
137 | public Document parse(String data) throws SAXException, IOException {
138 | return Jsoup.parse(data);
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/OffsetRange.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 | package com.basistech.readability;
19 |
20 | import org.jsoup.nodes.TextNode;
21 |
22 | /**
23 | * Object to relate a range of PC-data to a text node in an XOM tree.
24 | */
25 | public class OffsetRange {
26 | private int start;
27 | private int end;
28 | private TextNode text;
29 |
30 | OffsetRange(int start, int end, TextNode text) {
31 | this.start = start;
32 | this.end = end;
33 | this.text = text;
34 |
35 | assert this.text == null || this.text.text().length() == this.end - this.start;
36 | }
37 |
38 | public String toString() {
39 | return super.toString() + "[" + this.start + "-" + this.end + " " + this.text.text() + "]";
40 | }
41 |
42 | public TextNode getText() {
43 | return text;
44 | }
45 |
46 | public int getEnd() {
47 | return end;
48 | }
49 |
50 | public int getStart() {
51 | return start;
52 | }
53 |
54 | public void setStart(int start) {
55 | this.start = start;
56 | }
57 |
58 | public void setEnd(int end) {
59 | this.end = end;
60 | }
61 |
62 | public void setText(TextNode text) {
63 | this.text = text;
64 | }
65 |
66 | public boolean offsetInRange(int offset) {
67 | return offset >= start && offset < end;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/PageCharsetDetector.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
/**
 * Pluggable strategy for guessing the character set of a page from its bytes.
 */
public interface PageCharsetDetector {
    /**
     * Guesses the character set of the given content.
     * <p>
     * As called by {@code AbstractPageReader}, {@code hint} is the name of the
     * charset declared for the page or null when none is known, and a null
     * result is treated as "nothing detected".
     *
     * @param data raw bytes of the page
     * @param hint charset name suggested by the caller; may be null
     * @return name of the detected charset, or null
     */
    String detect(byte[] data, String hint);
}
27 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/PageInfo.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
/**
 * Plain mutable holder for a page's URL, title and content.
 * All three properties are null until set.
 */
public class PageInfo {
    private String url;
    private String title;
    private String content;

    /** @return the page URL, or null if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the page title, or null if not set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page content, or null if not set */
    public String getContent() {
        return this.content;
    }

    /** @param content the page content */
    public void setContent(String content) {
        this.content = content;
    }

}
49 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/PageLinkInfo.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | /**
22 | *
23 | */
24 | public class PageLinkInfo {
25 | private double score;
26 | private String linkText;
27 | private String href;
28 | public PageLinkInfo(double score, String linkText, String href) {
29 | this.score = score;
30 | this.linkText = linkText;
31 | this.href = href;
32 | }
33 | public void setScore(double score) {
34 | this.score = score;
35 | }
36 | public void incrementScore(double incr) {
37 | score = score + incr;
38 | }
39 | public void setLinkText(String linkText) {
40 | this.linkText = linkText;
41 | }
42 | public double getScore() {
43 | return score;
44 | }
45 | public String getLinkText() {
46 | return linkText;
47 | }
48 | public String getHref() {
49 | return href;
50 | }
51 | @Override
52 | public String toString() {
53 | return "PageLinkInfo [score=" + score + ", linkText=" + linkText + ", href=" + href + "]";
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/PageReadException.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 | package com.basistech.readability;
19 |
/**
 * Checked exception signalling that a page could not be read.
 */
public class PageReadException extends Exception {

    /** Creates an exception with neither message nor cause. */
    public PageReadException() {
        super();
    }

    /** @param e the underlying failure, recorded as the cause */
    public PageReadException(Exception e) {
        super(e);
    }

    /** @param message description of the failure */
    public PageReadException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause the underlying failure
     */
    public PageReadException(String message, Throwable cause) {
        super(message, cause);
    }
}
41 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/PageReader.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | /**
22 | * Interface to reading HTML pages.
23 | */
24 | public interface PageReader {
25 | /**
26 | * Read the content of a page. Return null and log if
27 | * there's some problem or another. This is responsible
28 | * for dealing with charset.
29 | * @param url
30 | * @return
31 | */
32 | String readPage(String url) throws PageReadException;
33 | /**
34 | * Provide a character set detector.
35 | * @param detector
36 | */
37 | void setCharsetDetector(PageCharsetDetector detector);
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/Patterns.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | import java.util.regex.Pattern;
22 |
/**
 * Regular expressions shared by the readability code, plus two small matching helpers.
 * Every pattern is compiled once. Patterns built through {@code ciPattern} are
 * case-insensitive; the ones built with {@code Pattern.compile} directly are not.
 */
final class Patterns {

    /** Text ending in a one- or two-digit number introduced by a "p..." word and/or '_' or '-', e.g. "page2" or "-3". */
    static final Pattern PAGE_NUMBER_LIKE = ciPattern("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$");
    /** A page-like word followed by '=' or '/' and a one- or two-digit number, e.g. "page=2" or "p/3". */
    static final Pattern PAGE_AND_NUMBER = ciPattern("p(a|g|ag)?(e|ing|ination)?(=|/)[0-9]{1,2}");
    /** The words "page" or "paging". */
    static final Pattern PAGE_OR_PAGING = ciPattern("(page|paging)");
    /** Words such as print, archive, comment, e-mail, share, login. */
    static final Pattern EXTRANEOUS = ciPattern("print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single");
    /**
     * Match: next, weiter, continue, '>', '>>' and the right guillemet (U+00BB), but not
     * '>|' or guillemet-'|', as those usually mean "last".
     * The guillemet is written as a unicode escape so the source does not depend on file encoding.
     */
    static final Pattern NEXT_LINK = ciPattern("(next|weiter|continue|>([^\\|]|$)|\u00BB([^\\|]|$))");
    /** The stems "page", "paging" or "paginat". */
    static final Pattern PAGINATION = ciPattern("pag(e|ing|inat)");
    /** The words "first" or "last". */
    static final Pattern FIRST_OR_LAST = ciPattern("(first|last)");
    /** Keywords such as comment, footer, sidebar, sponsor, widget. */
    static final Pattern NEGATIVE = ciPattern("(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget)");
    /** prev, earl, old, new, '<' and the left guillemet (U+00AB, written as a unicode escape). */
    static final Pattern PREV_LINK = ciPattern("(prev|earl|old|new|<|\u00AB)");
    /** Keywords such as article, body, content, main, post, story. */
    static final Pattern POSITIVE = ciPattern("(article|body|content|entry|hentry|main|page|pagination|post|text|blog|story)");
    /**
     * Two or more consecutive br tags, each optionally followed by whitespace.
     * <p>
     * History: the plain greedy form {@code (<br[^>]*>[ \n\r\t]*){2,}} caused a stack
     * overflow on some pages. Its stand-in ended with "\1+", which in a Java string
     * literal is the octal escape for U+0001 rather than a regex back-reference, so it
     * never matched ordinary markup. The possessive quantifier used here keeps the
     * meaning of the original expression while never backtracking into the repetition.
     */
    static final Pattern REPLACE_BRS = ciPattern("(?:<br[^>]*>[ \n\r\t]*){2,}+");

    /** Keywords such as comment, disqus, footer, header, menu, sidebar, popup, twitter. */
    static final Pattern UNLIKELY_CANDIDATES = ciPattern("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter");
    /**
     * The keywords and, article, body, column, main, shadow.
     * Adding "subheader" works better with espn "recap" pages, but it is unclear that
     * this is a good enough reason to change behavior.
     */
    static final Pattern OK_MAYBE_ITS_A_CANDIDATE = ciPattern("and|article|body|column|main|shadow");
    /** A period followed by a space or by the end of the text (case-sensitive). */
    static final Pattern ENDS_WITH_DOT = Pattern.compile("\\.( |$)");
    /** Any single digit. */
    static final Pattern DIGIT = Pattern.compile("\\d");
    /** A '|' or '-' with a space on each side. */
    static final Pattern BAR_DASH = Pattern.compile(" [\\|\\-] ");

    private Patterns() {
        // constants and static helpers only; never instantiated
    }

    /**
     * Tests whether the whole string matches the pattern.
     *
     * @param pattern the pattern to apply
     * @param string the text to test
     * @return true if the entire string matches
     */
    static boolean match(Pattern pattern, String string) {
        return pattern.matcher(string).matches();
    }

    /**
     * Tests whether the pattern occurs anywhere in the string.
     *
     * @param pattern the pattern to apply
     * @param string the text to search
     * @return true if at least one occurrence is found
     */
    static boolean exists(Pattern pattern, String string) {
        return pattern.matcher(string).find();
    }

    /**
     * Compiles a case-insensitive pattern.
     *
     * @param patternString the regular expression source
     * @return the compiled pattern
     */
    private static Pattern ciPattern(String patternString) {
        return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
    }

}
68 |
--------------------------------------------------------------------------------
/src/main/java/com/basistech/readability/Readability.java:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2010 Basis Technology Corp.
3 | *
4 | * Basis Technology Corp. licenses this file
5 | * to you under the Apache License, Version 2.0 (the
6 | * "License"); you may not use this file except in compliance
7 | * with the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing,
12 | * software distributed under the License is distributed on an
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | * KIND, either express or implied. See the License for the
15 | * specific language governing permissions and limitations
16 | * under the License.
17 | */
18 |
19 | package com.basistech.readability;
20 |
21 | import java.net.URI;
22 | import java.net.URISyntaxException;
23 | import java.util.ArrayList;
24 | import java.util.HashMap;
25 | import java.util.HashSet;
26 | import java.util.LinkedList;
27 | import java.util.List;
28 | import java.util.ListIterator;
29 | import java.util.Map;
30 | import java.util.Set;
31 | import java.util.regex.Matcher;
32 |
33 | import org.jsoup.Jsoup;
34 | import org.jsoup.nodes.Document;
35 | import org.jsoup.nodes.Element;
36 | import org.jsoup.nodes.Node;
37 | import org.jsoup.nodes.TextNode;
38 | import org.jsoup.select.Elements;
39 | import org.slf4j.Logger;
40 | import org.slf4j.LoggerFactory;
41 |
42 | /**
43 | * Java version of the arclab readability javascript program. This uses jsoup to handle the DOM tree and
44 | * provide us with the sorts of operations that the javascript code loves. Make one of these objects for each
45 | * page. Provide it with an object to fetch more next pages to support that stuff.
46 | */
47 | public class Readability {
private static final Logger LOG = LoggerFactory.getLogger(Readability.class);
/**
 * Lower-case tag names: a, blockquote, dl, div, img, ol, p, pre, table, ul.
 * Presumably consulted when deciding whether a div can be turned into a p; the
 * code that reads this set is outside this section -- confirm.
 */
private static final Set<String> DIV_TO_P_ELEMENTS;
static {
    // Typed explicitly (no diamond) because the build targets Java 1.6.
    final Set<String> tagNames = new HashSet<String>();
    tagNames.add("a");
    tagNames.add("blockquote");
    tagNames.add("dl");
    tagNames.add("div");
    tagNames.add("img");
    tagNames.add("ol");
    tagNames.add("p");
    tagNames.add("pre");
    tagNames.add("table");
    tagNames.add("ul");
    DIV_TO_P_ELEMENTS = tagNames;
}
// The jsoup document parsed from the content of the page currently being processed.
private Document document;
// Body element; prepDocument() creates one if this is still null and gives it the id "readabilityBody".
private Element body;
// Source of page content; processDocument() calls readPage(url) on it.
private PageReader pageReader;
// The url passed to the most recent processDocument() call.
private String givenUrl;
// Starts out as an empty set (see the constructor); its use is outside this section.
private Set parsedPages;
// Set when the page has no body element or contains frames.
private boolean impossible;
// Reset to null when processing starts on a first page.
private String title;
// The three flags below default to true; the code that reads them is outside this section.
private boolean stripUnlikelyCandidates = true;
private boolean classWeight = true;
private boolean cleanConditionally = true;
// Cleared at the start of processDocument(); if non-null after init() and readAllPages
// is set, the linked page is processed as well.
private String nextPageLink;
// Extracted text; text of follow-on pages is appended to it by processDocument().
private String articleText;
// When true, processDocument() follows nextPageLink.
private boolean readAllPages;
// True only while processDocument() is working on a follow-on page.
private boolean notFirstPage;
private NekoJsoupParser nekoParser = new NekoJsoupParser();
// for some testing and debugging purposes, obtain string reps of the XML we
// got from parsing.
private List xmlImages;
81 |
/**
 * Creates an instance whose set of parsed pages is initially empty.
 */
public Readability() {
    this.parsedPages = new HashSet();
}
85 |
86 | /**
87 | * Process the content of a page. This takes a String, since JSoup does not handle byte input. Caller has
88 | * to worry about charset detection and conversion.
89 | *
90 | * @param url the initial url
91 | */
92 | public void processDocument(String url) throws PageReadException {
93 | // TODO: reset the results.
94 | impossible = false;
95 | givenUrl = url;
96 | nextPageLink = null;
97 | if (!notFirstPage) {
98 | xmlImages = new ArrayList();
99 | title = null;
100 | }
101 |
102 | String content = pageReader.readPage(url);
103 |
104 | document = Jsoup.parse(content);
105 |
106 | if (document.getElementsByTag("body").size() == 0) {
107 | LOG.error("no body to parse " + url);
108 | impossible = true;
109 | throw new PageReadException("no body to parse");
110 | }
111 |
112 | init(); // this needs another name, it does all the work.
113 | if (readAllPages && nextPageLink != null) {
114 | try {
115 | String textSoFar = articleText;
116 | notFirstPage = true;
117 | processDocument(nextPageLink);
118 | if (articleText != null) {
119 | articleText = textSoFar + articleText;
120 | }
121 | } finally {
122 | notFirstPage = false;
123 | }
124 | }
125 | }
126 |
127 | private void removeScripts() {
128 | Elements scripts = document.getElementsByTag("script");
129 | for (int i = scripts.size() - 1; i >= 0; i--) {
130 | Element e = scripts.get(i);
131 | String src = e.attr("src");
132 | if ("".equals(src) || (src.indexOf("readability") == -1 && src.indexOf("typekit") == -1)) {
133 | e.remove();
134 | }
135 | }
136 | }
137 |
138 | //some pages have a combiantion to generate a space, but
139 | //readability seems to ignore it. convert then to a single
140 | private void handlePP() {
141 | String inner = document.body().html();
142 | inner.replaceAll("
", "");
143 | document.body().html(inner);
144 | }
145 |
146 | private void handleDoubleBr() {
147 | Elements doubleBrs = document.select("br + br");
148 | for (Element br : doubleBrs) {
149 | // we hope that there's a 'p' up there....
150 | Elements parents = br.parents();
151 | Element parent = null;
152 | for (Element aparent : parents) {
153 | if (aparent.tag().getName().equals("p")) {
154 | parent = aparent;
155 | break;
156 | }
157 | }
158 | if (parent == null) {
159 | parent = br.parent();
160 | parent.wrap("
");
161 | }
162 | // now it's safe to make the change.
163 | String inner = parent.html();
164 | inner = Patterns.REPLACE_BRS.matcher(inner).replaceAll("
");
165 | parent.html(inner);
166 | }
167 | }
168 |
169 | private void prepDocument() {
170 | /**
171 | * In some cases a body element can't be found (if the HTML is totally hosed for example) so we create
172 | * a new body node and append it to the document.
173 | */
174 | if (body == null) {
175 | body = document.appendElement("body");
176 | }
177 |
178 | body.attr("id", "readabilityBody");
179 |
180 | Elements frames = document.getElementsByTag("frame");
181 | if (frames.size() > 0) {
182 | LOG.error("Frames. Can't deal. Write code later to look at URLs and fetch");
183 | impossible = true;
184 | return;
185 | }
186 |
187 | Elements stylesheets = document.getElementsByTag("style");
188 | stylesheets.remove();
189 | stylesheets = document.select("link[rel='stylesheet']");
190 | stylesheets.remove();
191 |
192 | /* Turn all double br's into p's */
193 | /*
194 | * Note, this is pretty costly as far as processing goes. Maybe optimize later.
195 | */
196 | handlePP();
197 | handleDoubleBr();
198 | fontsToSpans();
199 | }
200 |
201 | private void fontsToSpans() {
202 | Elements allFonts = document.getElementsByTag("font");
203 | for (Element fontElement : allFonts) {
204 | changeElementTag(fontElement, "span");
205 | }
206 | }
207 |
208 | private String normalizeTrailingSlash(String url) {
209 | return url.replaceAll("/$", "");
210 | }
211 |
212 | private void init() {
213 | removeScripts();
214 | convertNoscriptToDiv();
215 | // there should never be more than one ... */
216 | Elements bodies = document.getElementsByTag("body");
217 | if (bodies.size() > 1) {
218 | LOG.warn("More than one