├── .gitignore ├── README ├── pom.xml └── src └── main └── java └── com └── basistech └── readability ├── AbstractPageReader.java ├── FilePageReader.java ├── HtmlPage.java ├── HttpPageReader.java ├── NekoJsoupParser.java ├── OffsetRange.java ├── PageCharsetDetector.java ├── PageInfo.java ├── PageLinkInfo.java ├── PageReadException.java ├── PageReader.java ├── Patterns.java ├── Readability.java ├── ReadabilityDriver.java ├── TikaCharsetDetector.java └── XmlDataMap.java /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | target 4 | .settings 5 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Home for java readability. 2 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.basistech 5 | java-readability 6 | jar 7 | Java version of Readability 8 | 1-SNAPSHOT 9 | 10 | org.sonatype.oss 11 | oss-parent 12 | 5 13 | 14 | A port of the Arclabs readability code to java. 
15 | 16 | 17 | The Apache Software License, Version 2.0 18 | http://www.apache.org/licenses/LICENSE-2.0.txt 19 | repo 20 | 21 | 22 | 23 | scm:git:git@github.com:basis-technology-corp/Java-readability.git 24 | git@github.com:basis-technology-corp/Java-readability.git 25 | 26 | 27 | 28 | bimargulies 29 | Benson Margulies 30 | bimargulies@gmail.com 31 | 32 | 33 | 34 | install 35 | 36 | 37 | 38 | org.apache.maven.plugins 39 | maven-gpg-plugin 40 | 1.1 41 | 42 | 43 | org.apache.maven.plugins 44 | maven-deploy-plugin 45 | 2.5 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-release-plugin 50 | 2.1 51 | 52 | true 53 | release,github_release 54 | clean install 55 | deploy 56 | true 57 | 58 | 59 | 60 | org.apache.maven.plugins 61 | maven-compiler-plugin 62 | 63 | 1.6 64 | 1.6 65 | 256M 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | release 75 | 76 | 77 | 78 | github_release 79 | 80 | 81 | 82 | 83 | 84 | org.slf4j 85 | slf4j-api 86 | 1.6.1 87 | jar 88 | compile 89 | 90 | 91 | commons-io 92 | commons-io 93 | 2.0.1 94 | jar 95 | compile 96 | 97 | 98 | org.jsoup 99 | jsoup 100 | 1.4.1 101 | jar 102 | compile 103 | 104 | 105 | net.sourceforge.nekohtml 106 | nekohtml 107 | 1.9.16 108 | jar 109 | compile 110 | 111 | 112 | org.apache.httpcomponents 113 | httpclient 114 | 4.0.3 115 | 116 | 117 | org.apache.tika 118 | tika-parsers 119 | 0.8 120 | 121 | 122 | 123 | xerces 124 | xercesImpl 125 | 2.9.1 126 | jar 127 | compile 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/AbstractPageReader.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. 
licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | package com.basistech.readability; 19 | 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.nio.charset.Charset; 23 | 24 | import org.apache.commons.io.IOUtils; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | public class AbstractPageReader { 29 | static final Logger LOG = LoggerFactory.getLogger(HttpPageReader.class); 30 | static final Charset UTF8 = Charset.forName("utf-8"); 31 | 32 | private PageCharsetDetector charsetDetector; 33 | private Charset charset; 34 | private boolean serverReturnedEncoding; 35 | private boolean respectServerEncoding; 36 | private String detectedEncoding; 37 | 38 | protected String readContent(InputStream response, String forceEncoding) throws IOException { 39 | byte[] bytes = IOUtils.toByteArray(response); 40 | charset = null; 41 | String hint = null; 42 | if (forceEncoding != null) { 43 | serverReturnedEncoding = true; 44 | try { 45 | charset = Charset.forName(forceEncoding); 46 | hint = charset.name(); 47 | } catch (Exception e) { 48 | // 49 | } 50 | } 51 | if (charsetDetector != null && !respectServerEncoding || charset == null) { 52 | String charsetName = charsetDetector.detect(bytes, hint); 53 | if (charsetName != null) { 54 | try { 55 | charset = Charset.forName(charsetName); 56 | detectedEncoding = charset.name(); 57 | } catch (Exception e) { 58 | 
LOG.warn("Detected character set " + charsetName + " not supported"); 59 | } 60 | } 61 | } 62 | if (charset == null) { 63 | LOG.warn("Defaulting to utf-8"); 64 | charset = UTF8; 65 | } 66 | return new String(bytes, charset); 67 | } 68 | 69 | public PageCharsetDetector getCharsetDetector() { 70 | return charsetDetector; 71 | } 72 | 73 | public void setCharsetDetector(PageCharsetDetector charsetDetector) { 74 | this.charsetDetector = charsetDetector; 75 | } 76 | 77 | public Charset getCharset() { 78 | return charset; 79 | } 80 | 81 | public boolean isServerReturnedEncoding() { 82 | return serverReturnedEncoding; 83 | } 84 | 85 | public void setRespectServerEncoding(boolean respectServerEncoding) { 86 | this.respectServerEncoding = respectServerEncoding; 87 | } 88 | 89 | public boolean isRespectServerEncoding() { 90 | return respectServerEncoding; 91 | } 92 | 93 | public String getDetectedEncoding() { 94 | return detectedEncoding; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/FilePageReader.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.io.File; 22 | import java.io.FileInputStream; 23 | import java.io.IOException; 24 | 25 | import org.apache.tika.io.IOUtils; 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | 29 | /** 30 | * 31 | */ 32 | public class FilePageReader extends AbstractPageReader implements PageReader { 33 | private static final Logger LOG = LoggerFactory.getLogger(FilePageReader.class); 34 | 35 | private File baseDirectory; 36 | 37 | /** {@inheritDoc} */ 38 | @Override 39 | public String readPage(String url) throws PageReadException { 40 | int lastSlash = url.replace("\\", "/").lastIndexOf('/'); 41 | File testFile = new File(baseDirectory, url.substring(lastSlash + 1)); 42 | LOG.info("Reading " + testFile + " for " + url); 43 | FileInputStream fis = null; 44 | try { 45 | try { 46 | fis = new FileInputStream(testFile); 47 | return readContent(fis, null); 48 | } catch (IOException e) { 49 | throw new PageReadException("Failed to read " + url, e); 50 | } 51 | } finally { 52 | if (fis != null) { 53 | IOUtils.closeQuietly(fis); 54 | } 55 | } 56 | } 57 | 58 | public void setBaseDirectory(File baseDirectory) { 59 | this.baseDirectory = baseDirectory; 60 | } 61 | 62 | public File getBaseDirectory() { 63 | return baseDirectory; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/HtmlPage.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | package com.basistech.readability; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | 23 | import org.jsoup.nodes.Document; 24 | import org.jsoup.nodes.Element; 25 | 26 | public class HtmlPage extends XmlDataMap { 27 | public static final String KEY = "htmlPage"; 28 | static Map elementActionMap; 29 | static { 30 | elementActionMap = new HashMap(); 31 | elementActionMap.put("img", ElementAction.Alt); 32 | elementActionMap.put("applet", ElementAction.Alt); 33 | elementActionMap.put("area", ElementAction.Alt); 34 | elementActionMap.put("input", ElementAction.Alt); 35 | elementActionMap.put("script", ElementAction.Banned); 36 | elementActionMap.put("iframe", ElementAction.Banned); 37 | elementActionMap.put("style", ElementAction.Banned); 38 | elementActionMap.put("br", ElementAction.Whitespace); 39 | elementActionMap.put("p", ElementAction.Sentence); 40 | elementActionMap.put("hr", ElementAction.Sentence); 41 | elementActionMap.put("ul", ElementAction.Sentence); 42 | elementActionMap.put("h1", ElementAction.Sentence); 43 | elementActionMap.put("h2", ElementAction.Sentence); 44 | elementActionMap.put("h3", ElementAction.Sentence); 45 | elementActionMap.put("h4", ElementAction.Sentence); 46 | elementActionMap.put("h5", ElementAction.Sentence); 47 | elementActionMap.put("h6", ElementAction.Sentence); 48 | elementActionMap.put("pre", ElementAction.Sentence); 49 | elementActionMap.put("blockquote", ElementAction.Sentence); 50 | elementActionMap.put("title", ElementAction.Sentence); 51 | 
elementActionMap.put("div", ElementAction.Sentence); 52 | // hmm, span tags with CSS with certain properties? Hopeless. 53 | elementActionMap.put("center", ElementAction.Whitespace); 54 | elementActionMap.put("form", ElementAction.Sentence); 55 | elementActionMap.put("table", ElementAction.Sentence); 56 | elementActionMap.put("td", ElementAction.Sentence); 57 | elementActionMap.put("th", ElementAction.Sentence); 58 | elementActionMap.put("li", ElementAction.Sentence); 59 | elementActionMap.put("dir", ElementAction.Sentence); 60 | elementActionMap.put("menu", ElementAction.Sentence); 61 | elementActionMap.put("ol", ElementAction.Sentence); 62 | } 63 | 64 | // the data as formatted for RLP -- just the PC-DATA. 65 | private String pcData; 66 | private String mimeType; 67 | 68 | public HtmlPage() { 69 | super(); 70 | } 71 | 72 | public void process(Document document) { 73 | Element body = document.body(); 74 | if (body != null) { // page might have no body. 75 | process(body); 76 | pcData = pcDataBuffer.toString(); 77 | } 78 | } 79 | 80 | public String getPcData() { 81 | return pcData; 82 | } 83 | 84 | @Override 85 | protected ElementAction classifyElement(Element element) { 86 | if (element.hasAttr("basisInline")) { 87 | return null; 88 | } 89 | return elementActionMap.get(element.tagName()); 90 | } 91 | 92 | public String getMimeType() { 93 | return mimeType; 94 | } 95 | 96 | public void setMimeType(String mimeType) { 97 | this.mimeType = mimeType; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/HttpPageReader.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. 
licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | 24 | import org.apache.http.HttpResponse; 25 | import org.apache.http.HttpStatus; 26 | import org.apache.http.client.methods.HttpGet; 27 | import org.apache.http.impl.client.DefaultHttpClient; 28 | import org.apache.http.params.BasicHttpParams; 29 | import org.apache.http.params.HttpConnectionParams; 30 | import org.apache.http.params.HttpParams; 31 | import org.apache.http.protocol.BasicHttpContext; 32 | import org.apache.http.protocol.HttpContext; 33 | import org.apache.http.util.EntityUtils; 34 | import org.slf4j.Logger; 35 | import org.slf4j.LoggerFactory; 36 | 37 | /** 38 | * 39 | */ 40 | public class HttpPageReader extends AbstractPageReader implements PageReader { 41 | static final Logger LOG = LoggerFactory.getLogger(HttpPageReader.class); 42 | 43 | /** {@inheritDoc}*/ 44 | @Override 45 | public String readPage(String url) throws PageReadException { 46 | LOG.info("Reading " + url); 47 | HttpParams httpParameters = new BasicHttpParams(); 48 | // Set the timeout in milliseconds until a connection is established. 
49 | int timeoutConnection = 3000; 50 | HttpConnectionParams.setConnectionTimeout(httpParameters, timeoutConnection); 51 | // Set the default socket timeout (SO_TIMEOUT) 52 | // in milliseconds which is the timeout for waiting for data. 53 | int timeoutSocket = 10000; 54 | HttpConnectionParams.setSoTimeout(httpParameters, timeoutSocket); 55 | DefaultHttpClient httpclient = new DefaultHttpClient(httpParameters); 56 | HttpContext localContext = new BasicHttpContext(); 57 | HttpGet get = new HttpGet(url); 58 | InputStream response = null; 59 | HttpResponse httpResponse = null; 60 | try { 61 | try { 62 | httpResponse = httpclient.execute(get, localContext); 63 | int resp = httpResponse.getStatusLine().getStatusCode(); 64 | if (HttpStatus.SC_OK != resp) { 65 | LOG.error("Download failed of " + url + " status " + resp + " " + httpResponse.getStatusLine().getReasonPhrase()); 66 | return null; 67 | } 68 | String respCharset = EntityUtils.getContentCharSet(httpResponse.getEntity()); 69 | return readContent(httpResponse.getEntity().getContent(), respCharset); 70 | } finally { 71 | if (response != null) { 72 | response.close(); 73 | } 74 | if (httpResponse != null && httpResponse.getEntity() != null) { 75 | httpResponse.getEntity().consumeContent(); 76 | } 77 | 78 | } 79 | } catch (IOException e) { 80 | LOG.error("Download failed of " + url, e); 81 | throw new PageReadException("Failed to read " + url, e); 82 | } 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/NekoJsoupParser.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. 
licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.StringReader; 24 | 25 | import org.xml.sax.Attributes; 26 | import org.xml.sax.ErrorHandler; 27 | import org.xml.sax.InputSource; 28 | import org.xml.sax.SAXException; 29 | import org.xml.sax.SAXParseException; 30 | import org.xml.sax.helpers.DefaultHandler; 31 | 32 | import org.cyberneko.html.parsers.SAXParser; 33 | import org.jsoup.Jsoup; 34 | import org.jsoup.nodes.Document; 35 | import org.jsoup.nodes.Element; 36 | import org.slf4j.Logger; 37 | import org.slf4j.LoggerFactory; 38 | 39 | /** 40 | * Due to bugs in the Jsoup parser, we want a class that uses Neko to do the parse. 41 | * The same trick could be played with JSoup. 
42 | */ 43 | public class NekoJsoupParser { 44 | private static final Logger LOG = LoggerFactory.getLogger(NekoJsoupParser.class); 45 | 46 | public NekoJsoupParser() { 47 | // 48 | } 49 | 50 | private final class LocalErrorHandler implements ErrorHandler { 51 | @Override 52 | public void error(SAXParseException e) throws SAXException { 53 | LOG.error("Parse error", e); 54 | throw e; 55 | } 56 | 57 | @Override 58 | public void fatalError(SAXParseException e) throws SAXException { 59 | LOG.error("Parse error", e); 60 | throw e; 61 | } 62 | 63 | @Override 64 | public void warning(SAXParseException e) throws SAXException { 65 | LOG.warn("Parse warning", e); 66 | } 67 | } 68 | 69 | private class Handler extends DefaultHandler { 70 | private Document document; 71 | private Element currentElement; 72 | private int depth; 73 | Handler(Document document) { 74 | this.document = document; 75 | } 76 | @Override 77 | public void characters(char[] data, int start, int length) throws SAXException { 78 | assert currentElement != null; 79 | currentElement.appendText(new String(data, start, length)); 80 | } 81 | @Override 82 | public void endDocument() throws SAXException { 83 | assert depth == 0; 84 | } 85 | @Override 86 | public void endElement(String uri, String localName, String qname) throws SAXException { 87 | LOG.debug("end element " + qname); 88 | currentElement = currentElement.parent(); 89 | depth--; 90 | } 91 | @Override 92 | public void ignorableWhitespace(char[] data, int start, int length) throws SAXException { 93 | characters(data, start, length); 94 | } 95 | @Override 96 | public void startDocument() throws SAXException { 97 | currentElement = document; 98 | } 99 | @Override 100 | public void startElement(String uri, String localName, String qname, Attributes attrs) throws SAXException { 101 | LOG.debug("start element " + qname + " " + depth); 102 | Element newElement; 103 | newElement = currentElement.appendElement(localName); 104 | 105 | for (int ax = 0; ax < 
attrs.getLength(); ax++) { 106 | String name = attrs.getQName(ax); 107 | String value = attrs.getValue(ax); 108 | newElement.attr(name, value); 109 | } 110 | currentElement = newElement; 111 | depth++; 112 | } 113 | } 114 | 115 | public Document parse(InputStream data, String baseUri) throws SAXException, IOException { 116 | InputSource source = new InputSource(); 117 | source.setByteStream(data); 118 | SAXParser nekoParser = new SAXParser(); 119 | Document document = new Document(baseUri); 120 | nekoParser.setContentHandler(new Handler(document)); 121 | nekoParser.setErrorHandler(new LocalErrorHandler()); 122 | nekoParser.parse(source); 123 | return document; 124 | } 125 | 126 | public Document parse(String data, String baseUri) throws SAXException, IOException { 127 | InputSource source = new InputSource(); 128 | source.setCharacterStream(new StringReader(data)); 129 | SAXParser nekoParser = new SAXParser(); 130 | Document document = new Document(baseUri); 131 | nekoParser.setContentHandler(new Handler(document)); 132 | nekoParser.setErrorHandler(new LocalErrorHandler()); 133 | nekoParser.parse(source); 134 | return document; 135 | } 136 | 137 | public Document parse(String data) throws SAXException, IOException { 138 | return Jsoup.parse(data); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/OffsetRange.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | package com.basistech.readability; 19 | 20 | import org.jsoup.nodes.TextNode; 21 | 22 | /** 23 | * Object to relate a range of PC-data to a text node in an XOM tree. 24 | */ 25 | public class OffsetRange { 26 | private int start; 27 | private int end; 28 | private TextNode text; 29 | 30 | OffsetRange(int start, int end, TextNode text) { 31 | this.start = start; 32 | this.end = end; 33 | this.text = text; 34 | 35 | assert this.text == null || this.text.text().length() == this.end - this.start; 36 | } 37 | 38 | public String toString() { 39 | return super.toString() + "[" + this.start + "-" + this.end + " " + this.text.text() + "]"; 40 | } 41 | 42 | public TextNode getText() { 43 | return text; 44 | } 45 | 46 | public int getEnd() { 47 | return end; 48 | } 49 | 50 | public int getStart() { 51 | return start; 52 | } 53 | 54 | public void setStart(int start) { 55 | this.start = start; 56 | } 57 | 58 | public void setEnd(int end) { 59 | this.end = end; 60 | } 61 | 62 | public void setText(TextNode text) { 63 | this.text = text; 64 | } 65 | 66 | public boolean offsetInRange(int offset) { 67 | return offset >= start && offset < end; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/PageCharsetDetector.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 
3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | /** 22 | * Generic API for character set detection. 23 | */ 24 | public interface PageCharsetDetector { 25 | String detect(byte[] data, String hint); 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/PageInfo.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | /** 22 | * 23 | */ 24 | public class PageInfo { 25 | private String url; 26 | private String content; 27 | private String title; 28 | 29 | public String getUrl() { 30 | return url; 31 | } 32 | public void setUrl(String url) { 33 | this.url = url; 34 | } 35 | public String getContent() { 36 | return content; 37 | } 38 | public void setContent(String content) { 39 | this.content = content; 40 | } 41 | public String getTitle() { 42 | return title; 43 | } 44 | public void setTitle(String title) { 45 | this.title = title; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/PageLinkInfo.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | /** 22 | * 23 | */ 24 | public class PageLinkInfo { 25 | private double score; 26 | private String linkText; 27 | private String href; 28 | public PageLinkInfo(double score, String linkText, String href) { 29 | this.score = score; 30 | this.linkText = linkText; 31 | this.href = href; 32 | } 33 | public void setScore(double score) { 34 | this.score = score; 35 | } 36 | public void incrementScore(double incr) { 37 | score = score + incr; 38 | } 39 | public void setLinkText(String linkText) { 40 | this.linkText = linkText; 41 | } 42 | public double getScore() { 43 | return score; 44 | } 45 | public String getLinkText() { 46 | return linkText; 47 | } 48 | public String getHref() { 49 | return href; 50 | } 51 | @Override 52 | public String toString() { 53 | return "PageLinkInfo [score=" + score + ", linkText=" + linkText + ", href=" + href + "]"; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/PageReadException.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | package com.basistech.readability; 19 | 20 | /** 21 | * 22 | */ 23 | public class PageReadException extends Exception { 24 | 25 | public PageReadException() { 26 | super(); 27 | } 28 | 29 | public PageReadException(String message) { 30 | super(message); 31 | } 32 | 33 | public PageReadException(String message, Throwable cause) { 34 | super(message, cause); 35 | } 36 | 37 | public PageReadException(Exception e) { 38 | super(e); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/PageReader.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | /** 22 | * Interface to reading HTML pages. 23 | */ 24 | public interface PageReader { 25 | /** 26 | * Read the content of a page. Return null and log if 27 | * there's some problem or another. This is responsible 28 | * for dealing with charset. 29 | * @param url 30 | * @return 31 | */ 32 | String readPage(String url) throws PageReadException; 33 | /** 34 | * Provide a character set detector. 
35 | * @param detector 36 | */ 37 | void setCharsetDetector(PageCharsetDetector detector); 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/Patterns.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.util.regex.Pattern; 22 | 23 | /** 24 | * 25 | */ 26 | final class Patterns { 27 | 28 | static final Pattern PAGE_NUMBER_LIKE = ciPattern("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$"); 29 | static final Pattern PAGE_AND_NUMBER = ciPattern("p(a|g|ag)?(e|ing|ination)?(=|/)[0-9]{1,2}"); 30 | static final Pattern PAGE_OR_PAGING = ciPattern("(page|paging)"); 31 | static final Pattern EXTRANEOUS = ciPattern("print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single"); 32 | static final Pattern NEXT_LINK = ciPattern("(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))"); 33 | // Match: next, continue, >, >>, » but not >|, »| as those usually mean last." 
34 | static final Pattern PAGINATION = ciPattern("pag(e|ing|inat)"); 35 | static final Pattern FIRST_OR_LAST = ciPattern("(first|last)"); 36 | static final Pattern NEGATIVE = ciPattern("(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget)"); 37 | static final Pattern PREV_LINK = ciPattern("(prev|earl|old|new|<|«)"); 38 | static final Pattern POSITIVE = ciPattern("(article|body|content|entry|hentry|main|page|pagination|post|text|blog|story)"); 39 | //static final Pattern REPLACE_BRS = ciPattern("(]*>[ \n\r\t]*){2,}"); 40 | //above causes a stack overflow crash on some pages, bottom behaves differently for some reason 41 | static final Pattern REPLACE_BRS = ciPattern("(]*>[ \n\r\t]*)\1+"); 42 | 43 | static final Pattern UNLIKELY_CANDIDATES = ciPattern("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter"); 44 | static final Pattern OK_MAYBE_ITS_A_CANDIDATE = ciPattern("and|article|body|column|main|shadow"); 45 | //below works better with espn "recap" pages, but unsure that's a good reason to change behavior. 
46 | //static final Pattern OK_MAYBE_ITS_A_CANDIDATE = ciPattern("and|article|body|column|main|shadow|subheader"); 47 | static final Pattern ENDS_WITH_DOT = Pattern.compile("\\.( |$)"); 48 | static final Pattern DIGIT = Pattern.compile("\\d"); 49 | static final Pattern BAR_DASH = Pattern.compile(" [\\|\\-] "); 50 | 51 | private Patterns() { 52 | // 53 | } 54 | 55 | static boolean match(Pattern pattern, String string) { 56 | return pattern.matcher(string).matches(); 57 | } 58 | 59 | static boolean exists(Pattern pattern, String string) { 60 | return pattern.matcher(string).find(); 61 | } 62 | 63 | private static Pattern ciPattern(String patternString) { 64 | return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/Readability.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.net.URI; 22 | import java.net.URISyntaxException; 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.HashSet; 26 | import java.util.LinkedList; 27 | import java.util.List; 28 | import java.util.ListIterator; 29 | import java.util.Map; 30 | import java.util.Set; 31 | import java.util.regex.Matcher; 32 | 33 | import org.jsoup.Jsoup; 34 | import org.jsoup.nodes.Document; 35 | import org.jsoup.nodes.Element; 36 | import org.jsoup.nodes.Node; 37 | import org.jsoup.nodes.TextNode; 38 | import org.jsoup.select.Elements; 39 | import org.slf4j.Logger; 40 | import org.slf4j.LoggerFactory; 41 | 42 | /** 43 | * Java version of the arclab readability javascript program. This uses jsoup to handle the DOM tree and 44 | * provide us with the sorts of operations that the javascript code loves. Make one of these objects for each 45 | * page. Provide it with an object that fetches subsequent pages to support pagination. 
46 | */ 47 | public class Readability { 48 | private static final Logger LOG = LoggerFactory.getLogger(Readability.class); 49 | private static final Set DIV_TO_P_ELEMENTS; 50 | static { 51 | DIV_TO_P_ELEMENTS = new HashSet(); 52 | DIV_TO_P_ELEMENTS.add("a"); 53 | DIV_TO_P_ELEMENTS.add("blockquote"); 54 | DIV_TO_P_ELEMENTS.add("dl"); 55 | DIV_TO_P_ELEMENTS.add("div"); 56 | DIV_TO_P_ELEMENTS.add("img"); 57 | DIV_TO_P_ELEMENTS.add("ol"); 58 | DIV_TO_P_ELEMENTS.add("p"); 59 | DIV_TO_P_ELEMENTS.add("pre"); 60 | DIV_TO_P_ELEMENTS.add("table"); 61 | DIV_TO_P_ELEMENTS.add("ul"); 62 | } 63 | private Document document; 64 | private Element body; 65 | private PageReader pageReader; 66 | private String givenUrl; 67 | private Set parsedPages; 68 | private boolean impossible; 69 | private String title; 70 | private boolean stripUnlikelyCandidates = true; 71 | private boolean classWeight = true; 72 | private boolean cleanConditionally = true; 73 | private String nextPageLink; 74 | private String articleText; 75 | private boolean readAllPages; 76 | private boolean notFirstPage; 77 | private NekoJsoupParser nekoParser = new NekoJsoupParser(); 78 | // for some testing and debugging purposes, obtain string reps of the XML we 79 | // got from parsing. 80 | private List xmlImages; 81 | 82 | public Readability() { 83 | parsedPages = new HashSet(); 84 | } 85 | 86 | /** 87 | * Process the content of a page. This takes a String, since JSoup does not handle byte input. Caller has 88 | * to worry about charset detection and conversion. 89 | * 90 | * @param url the initial url 91 | */ 92 | public void processDocument(String url) throws PageReadException { 93 | // TODO: reset the results. 
94 | impossible = false; 95 | givenUrl = url; 96 | nextPageLink = null; 97 | if (!notFirstPage) { 98 | xmlImages = new ArrayList(); 99 | title = null; 100 | } 101 | 102 | String content = pageReader.readPage(url); 103 | 104 | document = Jsoup.parse(content); 105 | 106 | if (document.getElementsByTag("body").size() == 0) { 107 | LOG.error("no body to parse " + url); 108 | impossible = true; 109 | throw new PageReadException("no body to parse"); 110 | } 111 | 112 | init(); // this needs another name, it does all the work. 113 | if (readAllPages && nextPageLink != null) { 114 | try { 115 | String textSoFar = articleText; 116 | notFirstPage = true; 117 | processDocument(nextPageLink); 118 | if (articleText != null) { 119 | articleText = textSoFar + articleText; 120 | } 121 | } finally { 122 | notFirstPage = false; 123 | } 124 | } 125 | } 126 | 127 | private void removeScripts() { 128 | Elements scripts = document.getElementsByTag("script"); 129 | for (int i = scripts.size() - 1; i >= 0; i--) { 130 | Element e = scripts.get(i); 131 | String src = e.attr("src"); 132 | if ("".equals(src) || (src.indexOf("readability") == -1 && src.indexOf("typekit") == -1)) { 133 | e.remove(); 134 | } 135 | } 136 | } 137 | 138 | //some pages have a

combination to generate a space, but 139 | //readability seems to ignore it. convert them to a single

140 | private void handlePP() { 141 | String inner = document.body().html(); 142 | inner.replaceAll("

", "

"); 143 | document.body().html(inner); 144 | } 145 | 146 | private void handleDoubleBr() { 147 | Elements doubleBrs = document.select("br + br"); 148 | for (Element br : doubleBrs) { 149 | // we hope that there's a 'p' up there.... 150 | Elements parents = br.parents(); 151 | Element parent = null; 152 | for (Element aparent : parents) { 153 | if (aparent.tag().getName().equals("p")) { 154 | parent = aparent; 155 | break; 156 | } 157 | } 158 | if (parent == null) { 159 | parent = br.parent(); 160 | parent.wrap("

"); 161 | } 162 | // now it's safe to make the change. 163 | String inner = parent.html(); 164 | inner = Patterns.REPLACE_BRS.matcher(inner).replaceAll("

"); 165 | parent.html(inner); 166 | } 167 | } 168 | 169 | private void prepDocument() { 170 | /** 171 | * In some cases a body element can't be found (if the HTML is totally hosed for example) so we create 172 | * a new body node and append it to the document. 173 | */ 174 | if (body == null) { 175 | body = document.appendElement("body"); 176 | } 177 | 178 | body.attr("id", "readabilityBody"); 179 | 180 | Elements frames = document.getElementsByTag("frame"); 181 | if (frames.size() > 0) { 182 | LOG.error("Frames. Can't deal. Write code later to look at URLs and fetch"); 183 | impossible = true; 184 | return; 185 | } 186 | 187 | Elements stylesheets = document.getElementsByTag("style"); 188 | stylesheets.remove(); 189 | stylesheets = document.select("link[rel='stylesheet']"); 190 | stylesheets.remove(); 191 | 192 | /* Turn all double br's into p's */ 193 | /* 194 | * Note, this is pretty costly as far as processing goes. Maybe optimize later. 195 | */ 196 | handlePP(); 197 | handleDoubleBr(); 198 | fontsToSpans(); 199 | } 200 | 201 | private void fontsToSpans() { 202 | Elements allFonts = document.getElementsByTag("font"); 203 | for (Element fontElement : allFonts) { 204 | changeElementTag(fontElement, "span"); 205 | } 206 | } 207 | 208 | private String normalizeTrailingSlash(String url) { 209 | return url.replaceAll("/$", ""); 210 | } 211 | 212 | private void init() { 213 | removeScripts(); 214 | convertNoscriptToDiv(); 215 | // there should never be more than one ... 
*/ 216 | Elements bodies = document.getElementsByTag("body"); 217 | if (bodies.size() > 1) { 218 | LOG.warn("More than one "); 219 | } 220 | body = null; 221 | body = bodies.get(0); 222 | /* 223 | * Make sure this document is added to the list of parsed pages first, so we don't double up on the 224 | * first page 225 | */ 226 | parsedPages.add(normalizeTrailingSlash(givenUrl)); 227 | //respect the readAllPages flag, very important if a stringPage 228 | if (readAllPages) 229 | nextPageLink = findNextPageLink(body); 230 | 231 | if (!notFirstPage) { 232 | title = getArticleTitle(); 233 | } 234 | prepDocument(); 235 | 236 | Element articleContent = grabArticle(null); 237 | if (articleContent == null && !notFirstPage) { 238 | // this happens when the content of the page is very short. 239 | // we don't believe in super-short next pages. 240 | articleText = body.text(); 241 | } else { 242 | xmlImages.add(articleContent.outerHtml()); 243 | articleText = getDisplayText(articleContent); 244 | } 245 | } 246 | 247 | private void convertNoscriptToDiv() { 248 | Elements noscript = document.getElementsByTag("noscript"); 249 | for (Element e : noscript) { 250 | changeElementTag(e, "div"); 251 | } 252 | 253 | } 254 | 255 | private void setContentScore(Element node, double score) { 256 | node.attr("data-readability.contentScore", Double.toString(score)); 257 | } 258 | 259 | private boolean isElementScored(Element node) { 260 | return node.hasAttr("data-readability.contentScore"); 261 | } 262 | 263 | private void incrementContentScore(Element node, double score) { 264 | node.attr("data-readability.contentScore", Double.toString(getContentScore(node) + score)); 265 | } 266 | 267 | private double getContentScore(Element node) { 268 | String scoreString = node.attr("data-readability.contentScore"); 269 | if ("".equals(scoreString)) { 270 | return 0; 271 | } else { 272 | return Double.parseDouble(scoreString); 273 | } 274 | } 275 | 276 | private void initializeNode(Element node) { 277 | 
// CHECKSTYLE:OFF 278 | node.attr("readability", "true"); 279 | String tagName = node.tagName(); 280 | if ("div".equals(tagName)) { 281 | incrementContentScore(node, 5); 282 | } else if ("pre".equals(tagName) || "td".equals(tagName) || "blockquote".equals(tagName)) { 283 | incrementContentScore(node, 3); 284 | } else if ("address".equals(tagName) || "ol".equals(tagName) || "ul".equals(tagName) 285 | || "dl".equals(tagName) || "dd".equals(tagName) || "dt".equals(tagName) 286 | || "li".equals(tagName) || "form".equals(tagName)) { 287 | incrementContentScore(node, -3); 288 | } else if (tagName.matches("h[1-6]") || "th".equals(tagName)) { 289 | incrementContentScore(node, -5); 290 | } 291 | incrementContentScore(node, getClassWeight(node)); 292 | // CHECKSTYLE:ON 293 | } 294 | 295 | /** 296 | * Get an elements class/id weight. Uses regular expressions to tell if this element looks good or bad. 297 | * 298 | * @param Element 299 | * @return number (Integer) 300 | **/ 301 | private double getClassWeight(Element e) { 302 | if (!classWeight) { 303 | return 0; 304 | } 305 | 306 | int weight = 0; 307 | 308 | /* Look for a special classname */ 309 | String className = e.className(); 310 | if (!"".equals(className)) { 311 | if (Patterns.exists(Patterns.NEGATIVE, className)) { 312 | weight -= 25; 313 | } 314 | if (Patterns.exists(Patterns.POSITIVE, className)) { 315 | weight += 25; 316 | } 317 | } 318 | 319 | /* Look for a special ID */ 320 | String id = e.id(); 321 | if (!"".equals(id)) { 322 | if (Patterns.exists(Patterns.NEGATIVE, id)) { 323 | weight -= 25; 324 | } 325 | if (Patterns.exists(Patterns.POSITIVE, id)) { 326 | weight += 25; 327 | } 328 | } 329 | return weight; 330 | } 331 | 332 | private Element changeElementTag(Element e, String newTag) { 333 | Element newElement = document.createElement(newTag); 334 | /* JSoup gives us the live child list, so we need to make a copy. 
*/ 335 | List copyOfChildNodeList = new ArrayList(); 336 | copyOfChildNodeList.addAll(e.childNodes()); 337 | for (Node n : copyOfChildNodeList) { 338 | n.remove(); 339 | newElement.appendChild(n); 340 | } 341 | e.replaceWith(newElement); 342 | return newElement; 343 | } 344 | 345 | // CHECKSTYLE:OFF 346 | private Element grabArticle(Element pageElement) { 347 | boolean isPaging = pageElement != null; 348 | 349 | if (pageElement == null) { 350 | pageElement = body; 351 | } 352 | 353 | String pageCacheHtml = pageElement.html(); 354 | Elements allElements = pageElement.getAllElements(); 355 | /* 356 | * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its 357 | * children would remove themselves. To get the same effect, we make a linked list and we remove 358 | * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be 359 | * doing something nearly as awful. 360 | */ 361 | LinkedList allElementsList = new LinkedList(); 362 | allElementsList.addAll(allElements); 363 | 364 | /** 365 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), 366 | * and turn divs into P tags where they have been used inappropriately (as in, where they contain no 367 | * other block level elements.) Note: Assignment from index for performance. See 368 | * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse 369 | * traversal? 
370 | **/ 371 | List nodesToScore = new ArrayList(); 372 | ListIterator elIterator = allElementsList.listIterator(); 373 | Set goodAsDead = new HashSet(); 374 | while (elIterator.hasNext()) { 375 | Element node = elIterator.next(); 376 | if (goodAsDead.contains(node)) { 377 | continue; 378 | } 379 | 380 | /* Remove unlikely candidates */ 381 | if (stripUnlikelyCandidates) { 382 | String unlikelyMatchString = node.className() + node.id(); 383 | if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString) 384 | && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString) 385 | && !"body".equals(node.tagName())) { 386 | LOG.debug("Removing unlikely candidate - " + unlikelyMatchString); 387 | List toRemoveAndBelow = node.getAllElements(); 388 | elIterator.remove(); 389 | /* 390 | * adding 'node' to that set is harmless and reduces the code complexity here. 391 | */ 392 | goodAsDead.addAll(toRemoveAndBelow); 393 | continue; 394 | } 395 | } 396 | 397 | if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) { 398 | nodesToScore.add(node); 399 | } 400 | 401 | /* 402 | * Turn all divs that don't have children block level elements into p's 403 | */ 404 | if ("div".equals(node.tagName())) { 405 | boolean hasBlock = false; 406 | for (Element divChild : node.getAllElements()) { 407 | if (divChild != node) { 408 | if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) { 409 | hasBlock = true; 410 | break; 411 | } 412 | } 413 | } 414 | if (!hasBlock) { 415 | Element newElement = changeElementTag(node, "p"); 416 | nodesToScore.remove(node); 417 | nodesToScore.add(newElement); 418 | } else { 419 | 420 | /* EXPERIMENTAL *//* 421 | * grab just child text and wrap each chunk in a p 422 | */ 423 | int limit = node.childNodes().size(); 424 | for (int i = 0; i < limit; i++) { 425 | Node childNode = node.childNodes().get(i); 426 | if (childNode instanceof TextNode) { 427 | Element p = document.createElement("p"); 428 | 
p.attr("basisInline", "true"); 429 | p.html(((TextNode)childNode).text()); 430 | childNode.replaceWith(p); 431 | } 432 | } 433 | } 434 | } 435 | } 436 | 437 | /** 438 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add 439 | * their score to their parent node. A score is determined by things like number of commas, class 440 | * names, etc. Maybe eventually link density. 441 | **/ 442 | List candidates = new ArrayList(); 443 | for (Element nodeToScore : nodesToScore) { 444 | Element parentNode = nodeToScore.parent(); 445 | if (null == parentNode) { // might be an orphan whose parent was 446 | // dropped previously. 447 | continue; 448 | } 449 | Element grandParentNode = parentNode.parent(); 450 | if (grandParentNode == null) { 451 | continue; // ditto 452 | } 453 | String innerText = nodeToScore.text(); 454 | 455 | /* 456 | * If this paragraph is less than 25 characters, don't even count it. 457 | */ 458 | if (innerText.length() < 25) { 459 | continue; 460 | } 461 | 462 | /* Initialize readability data for the parent. */ 463 | if ("".equals(parentNode.attr("readability"))) { 464 | initializeNode(parentNode); 465 | candidates.add(parentNode); 466 | } 467 | 468 | /* Initialize readability data for the grandparent. */ 469 | /* 470 | * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that 471 | * we're operating in an orphan. 472 | */ 473 | if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) { 474 | initializeNode(grandParentNode); 475 | candidates.add(grandParentNode); 476 | } 477 | 478 | double contentScore = 0; 479 | 480 | /* Add a point for the paragraph itself as a base. */ 481 | contentScore++; 482 | 483 | /* Add points for any commas within this paragraph */ 484 | contentScore += innerText.split(",").length; 485 | 486 | /* 487 | * For every 100 characters in this paragraph, add another point. Up to 3 points. 
488 | */ 489 | contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0); 490 | 491 | /* Add the score to the parent. The grandparent gets half. */ 492 | incrementContentScore(parentNode, contentScore); 493 | 494 | if (grandParentNode != null) { 495 | incrementContentScore(grandParentNode, contentScore / 2.0); 496 | } 497 | } 498 | 499 | /** 500 | * After we've calculated scores, loop through all of the possible candidate nodes we found and find 501 | * the one with the highest score. 502 | **/ 503 | Element topCandidate = null; 504 | for (Element candidate : candidates) { 505 | /** 506 | * Scale the final candidates score based on link density. Good content should have a relatively 507 | * small link density (5% or less) and be mostly unaffected by this operation. 508 | **/ 509 | double score = getContentScore(candidate); 510 | double newScore = score * (1.0 - getLinkDensity(candidate)); 511 | setContentScore(candidate, newScore); 512 | LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":" 513 | + candidate.id() + ") with score " + newScore); 514 | 515 | if (null == topCandidate || newScore > getContentScore(topCandidate)) { 516 | topCandidate = candidate; 517 | } 518 | } 519 | 520 | /** 521 | * If we still have no top candidate, just use the body as a last resort. We also have to copy the 522 | * body node so it is something we can modify. 523 | **/ 524 | if (topCandidate == null || topCandidate == body) { 525 | topCandidate = document.createElement("div"); 526 | // not efficient but not likely. 527 | topCandidate.html(pageElement.html()); 528 | pageElement.html(""); 529 | pageElement.appendChild(topCandidate); 530 | initializeNode(topCandidate); 531 | } 532 | 533 | /** 534 | * Now that we have the top candidate, look through its siblings for content that might also be 535 | * related. Things like preambles, content split by ads that we removed, etc. 
536 | **/ 537 | Element articleContent = document.createElement("div"); 538 | if (isPaging) { 539 | articleContent.attr("id", "readability-content"); 540 | } 541 | double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2); 542 | List siblingNodes = topCandidate.parent().children(); 543 | 544 | for (Element siblingNode : siblingNodes) { 545 | boolean scored = isElementScored(siblingNode); 546 | 547 | boolean append = false; 548 | 549 | LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className() 550 | + ":" + siblingNode.id() + ")"); 551 | if (scored) { 552 | LOG.debug("Sibling has score " + getContentScore(siblingNode)); 553 | } else { 554 | LOG.debug("Sibling has score unknown"); 555 | } 556 | 557 | if (siblingNode == topCandidate) { 558 | append = true; 559 | } 560 | 561 | double contentBonus = 0; 562 | /* 563 | * Give a bonus if sibling nodes and top candidates have the example same classname 564 | */ 565 | if (siblingNode.className().equals(topCandidate.className()) 566 | && !"".equals(topCandidate.className())) { 567 | contentBonus += getContentScore(topCandidate) * 0.2; 568 | } 569 | 570 | if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) { 571 | append = true; 572 | } 573 | 574 | if ("p".equals(siblingNode.tagName())) { 575 | double linkDensity = getLinkDensity(siblingNode); 576 | String nodeContent = siblingNode.text(); 577 | int nodeLength = nodeContent.length(); 578 | 579 | if (nodeLength > 80 && linkDensity < 0.25) { 580 | append = true; 581 | } else if (nodeLength < 80 && linkDensity == 0 582 | && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) { 583 | append = true; 584 | } 585 | } 586 | 587 | if (append) { 588 | LOG.debug("Appending node: [" + siblingNode.getClass() + "]"); 589 | 590 | Element nodeToAppend = null; 591 | if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) { 592 | /* 593 | * We have a node that isn't a common 
block level element, like a form or td tag. Turn it 594 | * into a div so it doesn't get filtered out later by accident. 595 | */ 596 | 597 | LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div."); 598 | nodeToAppend = changeElementTag(siblingNode, "div"); 599 | } else { 600 | nodeToAppend = siblingNode; 601 | } 602 | 603 | /* 604 | * To ensure a node does not interfere with readability styles, remove its classnames 605 | */ 606 | nodeToAppend.removeAttr("class"); 607 | 608 | /* 609 | * Append sibling and subtract from our list because it removes the node when you append to 610 | * another node 611 | */ 612 | articleContent.appendChild(nodeToAppend); 613 | } 614 | } 615 | 616 | document.body().empty(); 617 | document.body().appendChild(articleContent); 618 | 619 | /** 620 | * So we have all of the content that we need. Now we clean it up for presentation. 621 | **/ 622 | prepArticle(articleContent); 623 | 624 | /** 625 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If 626 | * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 627 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding 628 | * the -right- content. 
629 | **/ 630 | if (articleContent.text().length() < 250) { 631 | pageElement.html(pageCacheHtml); 632 | if (stripUnlikelyCandidates) { 633 | try { 634 | stripUnlikelyCandidates = false; 635 | return grabArticle(pageElement); 636 | } finally { 637 | stripUnlikelyCandidates = true; 638 | } 639 | } else if (classWeight) { 640 | try { 641 | classWeight = false; 642 | return grabArticle(pageElement); 643 | } finally { 644 | classWeight = true; 645 | } 646 | } else if (cleanConditionally) { 647 | try { 648 | cleanConditionally = false; 649 | return grabArticle(pageElement); 650 | } finally { 651 | cleanConditionally = true; 652 | } 653 | } else { 654 | return null; 655 | } 656 | } 657 | 658 | return articleContent; 659 | } 660 | 661 | private String getDisplayText(Element e) { 662 | HtmlPage htmlPage = new HtmlPage(); 663 | htmlPage.process(document); 664 | String thisText = htmlPage.getPcData(); 665 | LOG.debug("Text: " + thisText); 666 | return thisText; 667 | } 668 | 669 | /** 670 | * Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm based on content 671 | * length, classnames, link density, number of images & embeds, etc. 672 | * 673 | * @return void 674 | **/ 675 | private void cleanConditionally(Element e, String tag) { 676 | 677 | if (!cleanConditionally) { 678 | return; 679 | } 680 | 681 | Elements tagsList = e.getElementsByTag(tag); 682 | int curTagsLength = tagsList.size(); 683 | 684 | /** 685 | * Gather counts for other typical elements embedded within. Traverse backwards so we can remove nodes 686 | * at the same time without effecting the traversal. TODO: Consider taking into account original 687 | * contentScore here. 688 | **/ 689 | for (int i = curTagsLength - 1; i >= 0; i--) { 690 | Element ee = tagsList.get(i); 691 | if (ee.ownerDocument() == null) { 692 | continue; // it a child of something we've already killed, so it 693 | // has no document. 
694 | } 695 | double weight = getClassWeight(ee); 696 | double contentScore = getContentScore(ee); 697 | 698 | LOG.debug("Cleaning Conditionally [" + ee.getClass() + "] (" + ee.className() + ":" + ee.id() 699 | + ")" + contentScore); 700 | 701 | if (weight + contentScore < 0) { 702 | LOG.debug("Negative content score"); 703 | ee.remove(); 704 | } else if (getCharCount(ee, ',') < 10) { 705 | /** 706 | * If there are not very many commas, and the number of non-paragraph elements is more than 707 | * paragraphs or other ominous signs, remove the element. 708 | **/ 709 | int p = ee.getElementsByTag("p").size(); 710 | int img = ee.getElementsByTag("img").size(); 711 | int li = ee.getElementsByTag("li").size() - 100; 712 | int input = ee.getElementsByTag("input").size(); 713 | 714 | Elements embeds = ee.getElementsByTag("embed"); 715 | int embedCount = embeds.size(); 716 | // removed code that pays specific attention to youtube. 717 | double linkDensity = getLinkDensity(ee); 718 | int contentLength = ee.text().length(); 719 | boolean toRemove = false; 720 | 721 | if (img > p) { 722 | toRemove = true; 723 | } else if (li > p && !"ul".equals(tag) && !"ol".equals(tag)) { 724 | toRemove = true; 725 | } else if (input > Math.floor(p / 3)) { 726 | toRemove = true; 727 | } else if (contentLength < 25 && (img == 0 || img > 2)) { 728 | toRemove = true; 729 | } else if (weight < 25 && linkDensity > 0.2) { 730 | toRemove = true; 731 | } else if (weight >= 25 && linkDensity > 0.5) { 732 | toRemove = true; 733 | } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) { 734 | toRemove = true; 735 | } 736 | 737 | if (toRemove) { 738 | LOG.debug("failed keep tests."); 739 | ee.remove(); 740 | } 741 | } 742 | } 743 | } 744 | 745 | /** 746 | * Clean out spurious headers from an Element. Checks things like classnames and link density. 
747 | * 748 | * @param Element 749 | * @return void 750 | **/ 751 | private void cleanHeaders(Element e) { 752 | for (int headerIndex = 1; headerIndex < 3; headerIndex++) { 753 | Elements headers = e.getElementsByTag("h" + headerIndex); 754 | for (int i = headers.size() - 1; i >= 0; i--) { 755 | if (getClassWeight(headers.get(i)) < 0 || getLinkDensity(headers.get(i)) > 0.33) { 756 | headers.get(i).remove(); 757 | } 758 | } 759 | } 760 | } 761 | 762 | /** 763 | * Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous 764 | *

765 | * tags, etc. This takes an element in, but returns a string. 766 | * 767 | * @param Element 768 | * @return void 769 | **/ 770 | private void prepArticle(Element articleContent) { 771 | // we don't need to do this, we don't care 772 | cleanStyles(articleContent); 773 | // this replaces any break element or an nbsp with a plain break 774 | // element. 775 | // not needed. We will deal with breaks as we deal with breaks 776 | // killBreaks(articleContent); 777 | 778 | /* Clean out junk from the article content */ 779 | cleanConditionally(articleContent, "form"); 780 | clean(articleContent, "object"); 781 | clean(articleContent, "h1"); 782 | 783 | /** 784 | * If there is only one h2, they are probably using it as a header and not a subheader, so remove it 785 | * since we already have a header. 786 | ***/ 787 | if (articleContent.getElementsByTag("h2").size() == 1) { 788 | clean(articleContent, "h2"); 789 | } 790 | clean(articleContent, "iframe"); 791 | 792 | cleanHeaders(articleContent); 793 | 794 | /* 795 | * Do these last as the previous stuff may have removed junk that will affect these 796 | */ 797 | cleanConditionally(articleContent, "table"); 798 | cleanConditionally(articleContent, "ul"); 799 | //could have no children, will crash then 800 | if (articleContent.children().size() != 0) { 801 | cleanConditionally(articleContent.child(0), "div"); 802 | } 803 | 804 | /* Remove extra paragraphs */ 805 | Elements articleParagraphs = articleContent.getElementsByTag("p"); 806 | for (Element para : articleParagraphs) { 807 | int imgCount = para.getElementsByTag("img").size(); 808 | int embedCount = para.getElementsByTag("embed").size(); 809 | int objectCount = para.getElementsByTag("object").size(); 810 | 811 | if (imgCount == 0 && embedCount == 0 && objectCount == 0 && para.text().matches("\\s*")) { 812 | para.remove(); 813 | } 814 | } 815 | 816 | Elements parasWithPreceedingBreaks = articleContent.getElementsByTag("br + p"); 817 | for (Element pe : 
parasWithPreceedingBreaks) { 818 | Element brElement = pe.previousElementSibling(); 819 | brElement.remove(); 820 | } 821 | } 822 | 823 | private void cleanStyles(Element articleContent) { 824 | // we want to clear off the style attributes in case they influence 825 | // something else. 826 | for (Element e : articleContent.getAllElements()) { 827 | e.removeAttr("style"); 828 | } 829 | } 830 | 831 | /** 832 | * Clean a node of all elements of type "tag". 833 | * 834 | * @param Element 835 | * @param string tag to clean 836 | **/ 837 | private void clean(Element e, String tag) { 838 | Elements targetList = e.getElementsByTag(tag); 839 | targetList.remove(); 840 | } 841 | 842 | private double getLinkDensity(Element e) { 843 | Elements links = e.getElementsByTag("a"); 844 | double textLength = e.text().length(); 845 | double linkLength = 0; 846 | for (Element link : links) { 847 | linkLength += link.text().length(); 848 | } 849 | 850 | return linkLength / textLength; 851 | } 852 | 853 | private String getArticleTitle() { 854 | String curTitle = ""; 855 | String origTitle = ""; 856 | 857 | Elements titleElements = document.getElementsByTag("title"); 858 | if (titleElements.size() > 0) { 859 | if (titleElements.size() > 1) { 860 | LOG.warn("More than one title."); 861 | } 862 | curTitle = titleElements.get(0).text(); 863 | origTitle = curTitle; 864 | } 865 | 866 | if (Patterns.exists(Patterns.BAR_DASH, curTitle)) { 867 | curTitle = origTitle.replaceAll("(.*)[\\|\\-] .*", "$1"); 868 | if (curTitle.split(" ").length < 3) { 869 | curTitle = origTitle.replaceAll("[^\\|\\-]*[\\|\\-](.*)", "$1"); 870 | } 871 | } else if (curTitle.indexOf(": ") != -1) { 872 | curTitle = origTitle.replaceAll(".*:(.*)", "$1"); 873 | 874 | if (curTitle.split(" ").length < 3) { 875 | curTitle = origTitle.replaceAll("[^:]*[:](.*)", "$1"); 876 | } 877 | } else if (curTitle.length() > 150 || curTitle.length() < 15) { 878 | Elements hOnes = document.getElementsByTag("h1"); 879 | if (hOnes.size() == 1) 
{ 880 | curTitle = hOnes.get(0).text(); 881 | } 882 | } 883 | 884 | curTitle = curTitle.trim(); 885 | 886 | if (curTitle.split(" ").length <= 4) { 887 | curTitle = origTitle; 888 | } 889 | return curTitle; 890 | } 891 | 892 | private String findBaseUrl(String stringUrl) { 893 | try { 894 | URI base = findBaseUrl0(stringUrl); 895 | return base.toString(); 896 | } catch (URISyntaxException e) { 897 | LOG.debug("Failed to get base URI", e); 898 | return null; 899 | } 900 | } 901 | 902 | private URI findBaseUrl0(String stringUrl) throws URISyntaxException { 903 | //Compensate for Windows path names. 904 | stringUrl = stringUrl.replace("\\", "/"); 905 | int qindex = stringUrl.indexOf("?"); 906 | if (qindex != -1) { 907 | // stuff after the ? tends to make the Java URL parser burp. 908 | stringUrl = stringUrl.substring(0, qindex); 909 | } 910 | URI url = new URI(stringUrl); 911 | URI baseUrl = new URI(url.getScheme(), url.getAuthority(), url.getPath(), null, null); 912 | 913 | String path = baseUrl.getPath().substring(1); // toss the leading / 914 | String[] pieces = path.split("/"); 915 | List urlSlashes = new ArrayList(); 916 | // reverse 917 | for (String piece : pieces) { 918 | urlSlashes.add(piece); 919 | } 920 | List cleanedSegments = new ArrayList(); 921 | String possibleType = ""; 922 | boolean del; 923 | 924 | for (int i = 0; i < urlSlashes.size(); i++) { 925 | String segment = urlSlashes.get(i); 926 | 927 | // Split off and save anything that looks like a file type. 928 | if (segment.indexOf(".") != -1) { 929 | possibleType = segment.split("\\.")[1]; 930 | 931 | /* 932 | * If the type isn't alpha-only, it's probably not actually a file extension. 933 | */ 934 | if (!possibleType.matches("[^a-zA-Z]")) { 935 | segment = segment.split("\\.")[0]; 936 | } 937 | } 938 | 939 | /** 940 | * EW-CMS specific segment replacement. Ugly. 
Example: 941 | * http://www.ew.com/ew/article/0,,20313460_20369436,00.html 942 | **/ 943 | if (segment.indexOf(",00") != -1) { 944 | segment = segment.replaceFirst(",00", ""); 945 | } 946 | 947 | // If our first or second segment has anything looking like a page 948 | // number, remove it. 949 | /* Javascript code has some /i's here, we might need to fiddle */ 950 | Matcher pnMatcher = Patterns.PAGE_NUMBER_LIKE.matcher(segment); 951 | if (pnMatcher.matches() && ((i == 1) || (i == 0))) { 952 | segment = pnMatcher.replaceAll(""); 953 | } 954 | 955 | del = false; 956 | 957 | /* 958 | * If this is purely a number, and it's the first or second segment, it's probably a page number. 959 | * Remove it. 960 | */ 961 | if (i < 2 && segment.matches("^\\d{1,2}$")) { 962 | del = true; 963 | } 964 | 965 | /* If this is the first segment and it's just "index", remove it. */ 966 | if (i == 0 && segment.toLowerCase() == "index") 967 | del = true; 968 | 969 | /* 970 | * If our first or second segment is smaller than 3 characters, and the first segment was purely 971 | * alphas, remove it. 972 | */ 973 | /* /i again */ 974 | if (i < 2 && segment.length() < 3 && !urlSlashes.get(0).matches("[a-z]")) 975 | del = true; 976 | 977 | /* If it's not marked for deletion, push it to cleanedSegments. */ 978 | if (!del) { 979 | cleanedSegments.add(segment); 980 | } 981 | } 982 | 983 | String cleanedPath = ""; 984 | for (String s : cleanedSegments) { 985 | cleanedPath = cleanedPath + s; 986 | cleanedPath = cleanedPath + "/"; 987 | } 988 | URI cleaned = new URI(url.getScheme(), url.getAuthority(), "/" 989 | + cleanedPath.substring(0, cleanedPath 990 | .length() - 1), null, null); 991 | return cleaned; 992 | } 993 | 994 | /* 995 | * Officially parsing URL's from HTML pages is a mug's game. 996 | */ 997 | 998 | private String getUrlHost(String url) { 999 | // httpx://host/..... 
1000 | 1001 | int hostStart = url.indexOf("//"); 1002 | if (hostStart == -1) { 1003 | return ""; 1004 | } 1005 | int hostEnd = url.indexOf("/", hostStart + 2); 1006 | if (hostEnd == -1) { 1007 | return url.substring(hostStart + 2); 1008 | } else { 1009 | return url.substring(hostStart + 2, hostEnd); 1010 | } 1011 | 1012 | } 1013 | 1014 | private String findNextPageLink(Element body) { 1015 | Map possiblePages = new HashMap(); 1016 | Elements allLinks = body.getElementsByTag("a"); 1017 | String articleBaseUrl = findBaseUrl(givenUrl); 1018 | String baseHost = getUrlHost(articleBaseUrl); 1019 | 1020 | /** 1021 | * Loop through all links, looking for hints that they may be next-page links. Things like having 1022 | * "page" in their textContent, className or id, or being a child of a node with a page-y className or 1023 | * id. Also possible: levenshtein distance? longest common subsequence? After we do that, assign each 1024 | * page a score, and 1025 | **/ 1026 | for (Element link : allLinks) { 1027 | String linkHref = link.attr("abs:href").replaceAll("#.*$", "").replaceAll("/$", ""); 1028 | 1029 | /* If we've already seen this page, ignore it */ 1030 | if ("".equals(linkHref) || linkHref.equals(articleBaseUrl) || linkHref.equals(givenUrl) 1031 | || parsedPages.contains(linkHref)) { 1032 | continue; 1033 | } 1034 | 1035 | String linkHost = getUrlHost(linkHref); 1036 | 1037 | /* If it's on a different domain, skip it. */ 1038 | if (!linkHost.equals(baseHost)) { 1039 | continue; 1040 | } 1041 | 1042 | String linkText = link.text(); // like innerText 1043 | 1044 | /* If the linkText looks like it's not the next page, skip it. */ 1045 | if (Patterns.EXTRANEOUS.matcher(linkText).matches() || linkText.length() > 25) { 1046 | continue; 1047 | } 1048 | 1049 | /* 1050 | * If the leftovers of the URL after removing the base URL don't contain any digits, it's 1051 | * certainly not a next page link. 
1052 | */ 1053 | String linkHrefLeftover = linkHref.replaceFirst(articleBaseUrl, ""); 1054 | if (!Patterns.exists(Patterns.DIGIT, linkHrefLeftover)) { 1055 | continue; 1056 | } 1057 | 1058 | PageLinkInfo linkObj = possiblePages.get(linkHref); 1059 | if (linkObj == null) { 1060 | linkObj = new PageLinkInfo(0.0, linkText, linkHref); 1061 | possiblePages.put(linkHref, linkObj); 1062 | } else { 1063 | String newLinkText = linkObj.getLinkText() + " | " + linkText; 1064 | linkObj.setLinkText(newLinkText); 1065 | } 1066 | 1067 | /** 1068 | * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, 1069 | * but the odds are lower. Example: 1070 | * http://www.actionscript.org/resources/articles/745/1/JavaScript 1071 | * -and-VBScript-Injection-in-ActionScript-3/Page1.html 1072 | **/ 1073 | if (linkHref.indexOf(articleBaseUrl) != 0) { 1074 | linkObj.incrementScore(-25); 1075 | } 1076 | 1077 | String linkData = linkText + " " + link.className() + " " + link.id(); 1078 | if (Patterns.exists(Patterns.NEXT_LINK, linkData)) { 1079 | linkObj.incrementScore(50); 1080 | } 1081 | if (Patterns.exists(Patterns.PAGINATION, linkData)) { 1082 | linkObj.incrementScore(25); 1083 | } 1084 | if (Patterns.exists(Patterns.FIRST_OR_LAST, linkData)) { 1085 | // -65 is enough to negate any bonuses gotten from a > or » in 1086 | // the text, 1087 | /* 1088 | * If we already matched on "next", last is probably fine. If we didn't, then it's bad. 1089 | * Penalize. 
1090 | */ 1091 | if (!Patterns.exists(Patterns.NEXT_LINK, linkObj.getLinkText())) { 1092 | linkObj.incrementScore(-65); 1093 | } 1094 | } 1095 | 1096 | if (Patterns.exists(Patterns.NEGATIVE, linkData) 1097 | || Patterns.exists(Patterns.EXTRANEOUS, linkData)) { 1098 | linkObj.incrementScore(-50); 1099 | } 1100 | if (Patterns.exists(Patterns.PREV_LINK, linkData)) { 1101 | linkObj.incrementScore(-200); 1102 | } 1103 | 1104 | /* If a parentNode contains page or paging or paginat */ 1105 | Element parentNode = link.parent(); 1106 | boolean positiveNodeMatch = false; 1107 | boolean negativeNodeMatch = false; 1108 | while (parentNode != null) { 1109 | String parentNodeClassAndId = parentNode.className() + " " + parentNode.id(); 1110 | if (!positiveNodeMatch && Patterns.match(Patterns.PAGINATION, parentNodeClassAndId)) { 1111 | positiveNodeMatch = true; 1112 | linkObj.incrementScore(25); 1113 | } 1114 | if (!negativeNodeMatch && Patterns.match(Patterns.NEGATIVE, parentNodeClassAndId)) { 1115 | /* 1116 | * If this is just something like "footer", give it a negative. If it's something like 1117 | * "body-and-footer", leave it be. 1118 | */ 1119 | if (!Patterns.exists(Patterns.POSITIVE, parentNodeClassAndId)) { 1120 | linkObj.incrementScore(-25); 1121 | negativeNodeMatch = true; 1122 | } 1123 | } 1124 | parentNode = parentNode.parent(); 1125 | } 1126 | 1127 | /** 1128 | * If the URL looks like it has paging in it, add to the score. Things like /page/2/, /pagenum/2, 1129 | * ?p=3, ?page=11, ?pagination=34 1130 | **/ 1131 | if (Patterns.exists(Patterns.PAGE_AND_NUMBER, linkHref) 1132 | || Patterns.exists(Patterns.PAGE_OR_PAGING, linkHref)) { 1133 | linkObj.incrementScore(+25); 1134 | } 1135 | 1136 | /* If the URL contains negative values, give a slight decrease. */ 1137 | if (Patterns.exists(Patterns.EXTRANEOUS, linkHref)) { 1138 | linkObj.incrementScore(-15); 1139 | } 1140 | 1141 | /** 1142 | * Minor punishment to anything that doesn't match our current URL. 
NOTE: I'm finding this to 1143 | * cause more harm than good where something is exactly 50 points. Dan, can you show me a 1144 | * counterexample where this is necessary? if (linkHref.indexOf(window.location.href) !== 0) { 1145 | * linkObj.score -= 1; } 1146 | **/ 1147 | 1148 | /** 1149 | * If the link text can be parsed as a number, give it a minor bonus, with a slight bias towards 1150 | * lower numbered pages. This is so that pages that might not have 'next' in their text can still 1151 | * get scored, and sorted properly by score. 1152 | **/ 1153 | boolean linkNumeric = false; 1154 | int linkTextAsNumber = 0; 1155 | 1156 | try { 1157 | linkTextAsNumber = Integer.parseInt(linkText); 1158 | linkNumeric = true; 1159 | } catch (NumberFormatException e) { 1160 | } 1161 | 1162 | if (linkNumeric) { 1163 | // Punish 1 since we're either already there, or it's probably 1164 | // before what we want anyways. 1165 | if (linkTextAsNumber == 1) { 1166 | linkObj.incrementScore(-10); 1167 | } else { 1168 | // Todo: Describe this better 1169 | linkObj.incrementScore(Math.max(0, 10 - linkTextAsNumber)); 1170 | } 1171 | } 1172 | } 1173 | 1174 | /** 1175 | * Loop through all of our possible pages from above and find our top candidate for the next page URL. 1176 | * Require at least a score of 50, which is a relatively high confidence that this page is the next 1177 | * link. 
1178 | **/ 1179 | PageLinkInfo topPage = null; 1180 | for (Map.Entry pageEntry : possiblePages.entrySet()) { 1181 | if (pageEntry.getValue().getScore() >= 50 1182 | && (topPage == null || topPage.getScore() < pageEntry.getValue().getScore())) { 1183 | topPage = pageEntry.getValue(); 1184 | } 1185 | } 1186 | 1187 | if (topPage != null) { 1188 | String nextHref = topPage.getHref().replaceFirst("/$", ""); 1189 | LOG.debug("Next page = " + nextHref); 1190 | parsedPages.add(nextHref); 1191 | return nextHref; 1192 | } else { 1193 | return null; 1194 | } 1195 | } 1196 | 1197 | /** 1198 | * Get the number of times a string s appears in the node e. 1199 | * 1200 | * @param Element 1201 | * @param string - what to split on. Default is "," 1202 | * @return number (integer) 1203 | **/ 1204 | int getCharCount(Element e, char s) { 1205 | return e.text().split(Character.toString(s)).length - 1; 1206 | } 1207 | 1208 | public void setPageReader(PageReader pageReader) { 1209 | this.pageReader = pageReader; 1210 | } 1211 | 1212 | public PageReader getPageReader() { 1213 | return pageReader; 1214 | } 1215 | 1216 | public boolean isImpossible() { 1217 | return impossible; 1218 | } 1219 | 1220 | public String getNextPageLink() { 1221 | return nextPageLink; 1222 | } 1223 | 1224 | public String getTitle() { 1225 | return title; 1226 | } 1227 | 1228 | public String getArticleText() { 1229 | return articleText; 1230 | } 1231 | 1232 | public void setReadAllPages(boolean readAllPages) { 1233 | this.readAllPages = readAllPages; 1234 | } 1235 | 1236 | public boolean isReadAllPages() { 1237 | return readAllPages; 1238 | } 1239 | 1240 | public List getXmlImages() { 1241 | return xmlImages; 1242 | } 1243 | 1244 | } 1245 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/ReadabilityDriver.java: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import java.io.File; 22 | import java.io.FileOutputStream; 23 | import java.io.FilenameFilter; 24 | import java.io.IOException; 25 | 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | 29 | /* 30 | * At the moment this class will take all html files from the flat directory: ./src/test/resources/htmlInput/ 31 | * and write them to: ./src/test/resources/textOutput/ 32 | * 33 | * In the future, a nice thing to do might be to abstract this so that it can process just about anything you 34 | * throw in there. It's a matter of using the appropriate PageReaders. Also, having the directories be hard-coded 35 | * might be a problem in the future. 
36 | */ 37 | 38 | public final class ReadabilityDriver { 39 | 40 | //the logger 41 | private static final Logger LOG = LoggerFactory.getLogger(ReadabilityDriver.class); 42 | 43 | //the paths 44 | private static final String INPUT_PATH = "./src/test/resources/htmlInput/"; 45 | private static final String OUTPUT_PATH = "target"; 46 | 47 | //private constructor 48 | private ReadabilityDriver() { } 49 | 50 | public static void main(String[] args) throws IOException { 51 | 52 | //input directory file 53 | File inputDir = new File(INPUT_PATH); 54 | 55 | //create the FilePageReader for Readability 56 | FilePageReader reader = new FilePageReader(); 57 | reader.setBaseDirectory(inputDir); 58 | 59 | //instantiate Readability and set reader 60 | Readability readability = new Readability(); 61 | readability.setPageReader(reader); 62 | readability.setReadAllPages(false); 63 | reader.setCharsetDetector(new TikaCharsetDetector()); 64 | 65 | //instantiate a file array 66 | File[] htmlFiles; 67 | 68 | //get all html files in directory 69 | if (inputDir.exists()) { 70 | htmlFiles = inputDir.listFiles(new FilenameFilter() { 71 | public boolean accept(File dir, String name) { 72 | return name.matches(".*\\.html$"); 73 | } 74 | }); 75 | } else { 76 | htmlFiles = new File[0]; 77 | } 78 | 79 | //iterate over the files and run Readability on them 80 | for (File page : htmlFiles) { 81 | 82 | //get the page path 83 | String path = page.getPath(); 84 | 85 | //process the page 86 | try { 87 | LOG.info("processing page: " + path); 88 | readability.processDocument(path); 89 | } catch (PageReadException e) { 90 | LOG.error("PageReadError while processing: " + path); 91 | e.printStackTrace(); 92 | continue; 93 | } 94 | 95 | //write the output, forcing a sentence break between title and body with \u2029. 
96 | String title = readability.getTitle().trim() + "\u2029"; 97 | String content = readability.getArticleText(); 98 | String returnText = OUTPUT_PATH + page.getName().replaceAll("html$", "txt"); 99 | FileOutputStream fos = new FileOutputStream(returnText); 100 | fos.write((title + System.getProperty("line.separator") + content).getBytes("UTF8")); 101 | fos.flush(); 102 | fos.close(); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/TikaCharsetDetector.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | 19 | package com.basistech.readability; 20 | 21 | import org.apache.tika.parser.txt.CharsetDetector; 22 | import org.apache.tika.parser.txt.CharsetMatch; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | /** 27 | * 28 | */ 29 | public class TikaCharsetDetector implements PageCharsetDetector { 30 | static final Logger LOG = LoggerFactory.getLogger(TikaCharsetDetector.class); 31 | @Override 32 | public String detect(byte[] data, String hint) { 33 | CharsetDetector detector = new CharsetDetector(); 34 | if (hint != null) { 35 | detector.setDeclaredEncoding(hint); 36 | } 37 | detector.setText(data); 38 | CharsetMatch match = detector.detect(); 39 | return match.getName(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/com/basistech/readability/XmlDataMap.java: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2010 Basis Technology Corp. 3 | * 4 | * Basis Technology Corp. licenses this file 5 | * to you under the Apache License, Version 2.0 (the 6 | * "License"); you may not use this file except in compliance 7 | * with the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, 12 | * software distributed under the License is distributed on an 13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | * KIND, either express or implied. See the License for the 15 | * specific language governing permissions and limitations 16 | * under the License. 
17 | */ 18 | package com.basistech.readability; 19 | 20 | import java.util.ArrayList; 21 | import java.util.LinkedList; 22 | import java.util.List; 23 | import java.util.ListIterator; 24 | 25 | import org.jsoup.nodes.Element; 26 | import org.jsoup.nodes.Node; 27 | import org.jsoup.nodes.TextNode; 28 | 29 | /** 30 | * Maintains map between PC-DATA offsets and Text nodes in an XML document. Provides some structure for the 31 | * process of pulling the data out. 32 | */ 33 | public abstract class XmlDataMap { 34 | protected char forceSentenceChar = '\u2029'; // paragraph 35 | 36 | /** 37 | * Classify elements in the tree. 38 | */ 39 | public enum ElementAction { 40 | /** 41 | * Ignore text under here. 42 | */ 43 | Banned, 44 | /** 45 | * Not currently used. 46 | */ 47 | Alt, 48 | /** 49 | * Insert whitespace. 50 | */ 51 | Whitespace, 52 | /** 53 | * Treat as sentence boundary. 54 | */ 55 | Sentence, 56 | /** 57 | * Remember where this was. 58 | */ 59 | Mark; 60 | } 61 | 62 | public static class Mark { 63 | private String tag; 64 | private int offset; 65 | 66 | /** 67 | * * @return Returns the tag. 68 | */ 69 | public String getTag() { 70 | return tag; 71 | } 72 | 73 | /** 74 | * @param tag The tag to set. 75 | */ 76 | public void setTag(String tag) { 77 | this.tag = tag; 78 | } 79 | 80 | /** 81 | * * @return Returns the offset. 82 | */ 83 | public int getOffset() { 84 | return offset; 85 | } 86 | 87 | /** 88 | * @param offset The offset to set. 
89 | */ 90 | public void setOffset(int offset) { 91 | this.offset = offset; 92 | } 93 | } 94 | 95 | protected StringBuffer pcDataBuffer; 96 | private List marks; 97 | // CHECKSTYLE:OFF 98 | private LinkedList offsetRanges; 99 | // CHECKSTYLE:ON 100 | private int pcDataOffset; 101 | private boolean justAppendedSpace; 102 | private boolean justAppendedPeriod; 103 | 104 | private ListIterator optimizedListPointer; 105 | 106 | private OffsetRange optimizedRangeListElement; 107 | 108 | protected XmlDataMap() { 109 | offsetRanges = new LinkedList(); 110 | pcDataOffset = 0; 111 | pcDataBuffer = new StringBuffer(); 112 | justAppendedSpace = false; 113 | justAppendedPeriod = false; 114 | marks = new ArrayList(); 115 | } 116 | 117 | public OffsetRange findOffsetRangeForOffset(int offset) { 118 | if (optimizedRangeListElement.offsetInRange(offset)) { 119 | return optimizedRangeListElement; 120 | } else if (offset > optimizedRangeListElement.getStart()) { 121 | while (optimizedRangeListElement.getStart() < offset && optimizedListPointer.hasNext()) { 122 | optimizedRangeListElement = optimizedListPointer.next(); 123 | if (optimizedRangeListElement.offsetInRange(offset)) { 124 | return optimizedRangeListElement; 125 | } 126 | } 127 | throw new RuntimeException("Offset " + offset + " beyond last range"); 128 | } else { 129 | // we don't expect to exercise this case. 130 | // has to be smaller, no? 131 | while (offset < optimizedRangeListElement.getStart() && optimizedListPointer.hasPrevious()) { 132 | optimizedRangeListElement = optimizedListPointer.previous(); 133 | if (optimizedRangeListElement.offsetInRange(offset)) { 134 | return optimizedRangeListElement; 135 | } 136 | } 137 | throw new RuntimeException("Offset " + offset + " before the first offset"); 138 | } 139 | } 140 | 141 | /** 142 | * Retrieve the offset ranges for the text nodes of the original tree. 143 | * 144 | * @return the ranges. 
145 | */ 146 | public List getOffsetRanges() { 147 | return offsetRanges; 148 | } 149 | 150 | public String getPcData() { 151 | return pcDataBuffer.toString(); 152 | } 153 | 154 | /** 155 | * Retrieve the accumulated pc data. 156 | * 157 | * @return 158 | */ 159 | public StringBuffer getPcDataBuffer() { 160 | return pcDataBuffer; 161 | } 162 | 163 | /** 164 | * If we need to split a range for annotation, we want to keep the map of offset ranges usable. Note that 165 | * the caller has to revalidate or maintain any indices it has grabbed for ranges after the one we are 166 | * splitting. Note that this does not insert the new text node into the parent contents, the caller does 167 | * that. 168 | * 169 | * @param range 170 | * @param splitPoint 171 | * @return 172 | */ 173 | public TextNode splitText(int rangePoint, int splitPoint) { 174 | assert splitPoint > 0; 175 | OffsetRange range = offsetRanges.get(rangePoint); 176 | assert splitPoint < range.getText().text().length(); 177 | TextNode newText = new TextNode(range.getText().text().substring(splitPoint), null); 178 | range.getText().text(range.getText().text().substring(0, splitPoint)); 179 | OffsetRange newRange = new OffsetRange(range.getStart() + splitPoint, range.getEnd(), newText); 180 | offsetRanges.add(rangePoint + 1, newRange); 181 | range.setEnd(splitPoint + range.getStart()); 182 | assert range.getText().text().length() == range.getEnd() - range.getStart(); 183 | return newText; 184 | } 185 | 186 | /** 187 | * A subclass may process metadata into the pc-data stream by calling this directly. 188 | * 189 | * @param textObject 190 | * @param text 191 | */ 192 | protected void append(TextNode textObject, String text) { 193 | // if an entire Text element is whitespace, chances are that it's

NL noise. We don't need it. 194 | boolean spaceText = text.matches("[\\s]*"); 195 | if (spaceText && justAppendedSpace) { 196 | return; 197 | } 198 | if (spaceText) { 199 | justAppendedSpace = true; 200 | } 201 | 202 | OffsetRange offsetRange = new OffsetRange(pcDataOffset, pcDataOffset + text.length(), textObject); 203 | pcDataBuffer.append(text); 204 | pcDataOffset += text.length(); 205 | justAppendedSpace = Character.isWhitespace(text.charAt(text.length() - 1)); 206 | justAppendedPeriod = eosPunctuation(lastNonWhitepaceCharacter(text)); 207 | offsetRanges.add(offsetRange); 208 | } 209 | 210 | protected char lastNonWhitepaceCharacter(String text) { 211 | for (int index = text.length() - 1; index >= 0; index--) { 212 | char c = text.charAt(index); 213 | if (!Character.isWhitespace(c)) { 214 | return c; 215 | } 216 | } 217 | return '\ufeff'; // it won't count as punctuation 218 | } 219 | 220 | //SK: allow quotes to be considered as EOS punctuation, so that we don't 221 | // add extra punctuation to sentences ending with quotes. This isn't 222 | // entirely unicode-friendly, and we may want to fix that someday. 223 | private static boolean eosPunctuation(char c) { 224 | String s = "!?.\u2029\"\u0027\u2018\u2019\u201c\u201d"; 225 | return s.indexOf(c) != -1; 226 | } 227 | 228 | protected void appendPeriod() { 229 | int startPcDataOffset = pcDataOffset; 230 | if (!justAppendedSpace && !justAppendedPeriod) { 231 | String appendMe = " . " + System.getProperty("line.separator"); 232 | pcDataBuffer.append(appendMe); 233 | pcDataOffset += appendMe.length(); 234 | justAppendedPeriod = true; 235 | justAppendedSpace = true; 236 | } else if (!justAppendedPeriod) { 237 | String appendMe = ". 
" + System.getProperty("line.separator"); 238 | pcDataBuffer.append(appendMe); 239 | pcDataOffset += appendMe.length(); 240 | justAppendedPeriod = true; 241 | justAppendedSpace = true; 242 | } else if (!justAppendedSpace) { 243 | String appendMe = " " + System.getProperty("line.separator"); 244 | pcDataBuffer.append(appendMe); 245 | pcDataOffset += appendMe.length(); 246 | justAppendedPeriod = true; 247 | justAppendedSpace = true; 248 | } 249 | // we make a range so that the code can tell the difference between 'spurious, added, period' 250 | // and 'bug that failed to make an offset range.' 251 | OffsetRange offsetRange = new OffsetRange(startPcDataOffset, pcDataOffset, null); 252 | offsetRanges.add(offsetRange); 253 | 254 | } 255 | 256 | protected void appendSpace() { 257 | if (!justAppendedSpace && !justAppendedPeriod) { 258 | justAppendedSpace = true; 259 | pcDataBuffer.append(' '); 260 | pcDataOffset++; 261 | } 262 | } 263 | 264 | protected abstract ElementAction classifyElement(Element element); 265 | 266 | public void process(Element rootElement) { 267 | recurse(rootElement); 268 | optimizedListPointer = offsetRanges.listIterator(); 269 | optimizedRangeListElement = offsetRanges.getFirst(); 270 | } 271 | 272 | private void recurse(Element element) { 273 | ElementAction action = classifyElement(element); 274 | if (action == ElementAction.Whitespace || action == ElementAction.Sentence) { 275 | appendSpace(); 276 | } 277 | for (Node childNode : element.childNodes()) { 278 | // n.b., cdata not possible if we are coming from TagSoup. If we also handle 279 | // real xhtml by directly parsing it, then we have another story on our hands. 280 | // though we could use canonical XML to get rid of them. 
281 | if (childNode instanceof TextNode && action != ElementAction.Banned) { 282 | TextNode textContent = (TextNode)childNode; 283 | String textString = textContent.text(); 284 | append(textContent, textString); 285 | } else if (childNode instanceof Element) { 286 | recurse((Element)childNode); 287 | } 288 | } 289 | if (action == ElementAction.Whitespace) { 290 | appendSpace(); 291 | } else if (action == ElementAction.Sentence) { 292 | appendPeriod(); 293 | } else if (action == ElementAction.Mark) { 294 | Mark mark = new Mark(); 295 | mark.setOffset(pcDataOffset); 296 | mark.setTag(element.tagName()); 297 | } 298 | } 299 | 300 | /** 301 | * * @return Returns the marks. 302 | */ 303 | public List getMarks() { 304 | return marks; 305 | } 306 | 307 | public char getForceSentenceChar() { 308 | return forceSentenceChar; 309 | } 310 | 311 | public void setForceSentenceChar(char forceSentenceChar) { 312 | this.forceSentenceChar = forceSentenceChar; 313 | } 314 | } 315 | --------------------------------------------------------------------------------