├── .gitignore ├── .htaccess ├── LICENSE ├── README.md ├── assets └── css │ ├── readability-min.css │ ├── readability.css │ ├── reset-min.css │ ├── reset.css │ ├── typography-min.css │ └── typography.css ├── common.inc.php ├── config.inc.php ├── index.php ├── lib └── Readability.inc.php └── template ├── footer.inc.html ├── header.inc.html ├── index.html └── reader.html /.gitignore: -------------------------------------------------------------------------------- 1 | cache/* -------------------------------------------------------------------------------- /.htaccess: -------------------------------------------------------------------------------- 1 | 2 | RewriteEngine On 3 | RewriteBase /readability/ 4 | 5 | RewriteCond %{REQUEST_FILENAME} !-f 6 | RewriteCond %{REQUEST_FILENAME} !-d 7 | RewriteRule . index.php [L] 8 | 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PHP Readability Library 2 | 3 | If you want to use an up-to-date version of this algorithm, check this newer project: 4 | 5 | https://github.com/andreskrey/readability.php 6 | 7 | 8 | ## Back the fun of reading 9 | 10 | The PHP port of [Readability.js by Arc90](http://code.google.com/p/arc90labs-readability/). 11 | 12 | 13 | ## Requirements 14 | 15 | * PHP Version >= 5 16 | * [PHP built with the DOM (Document Object Model) extension](http://www.php.net/manual/en/book.dom.php) 17 | 18 | 19 | ## Live demo 20 | 21 | http://graceco.de/readability/ 22 | 23 | 24 | ## Usage 25 | 26 | ``` 27 | require 'lib/Readability.inc.php'; 28 | 29 | $Readability = new Readability($html, $html_input_charset); // default charset is utf-8 30 | $ReadabilityData = $Readability->getContent(); // throws an exception when no suitable content is found 31 | 32 | // You can see more params by var_dump($ReadabilityData); 33 | echo "

".$ReadabilityData['title']."

"; 34 | echo $ReadabilityData['content']; 35 | ``` 36 | 37 | 38 | 39 | PS: For Node.js port, You can [check this](https://github.com/arrix/node-readability/). 40 | 41 | -------------------------------------------------------------------------------- /assets/css/readability-min.css: -------------------------------------------------------------------------------- 1 | body{background:#fbf0d9;font-size:16px;font-family:"minion-pro-1","minion-pro-2",Palatino,Georgia,"Times New Roman",serif;}* html hr,html>body hr{margin:1em 0;}a:link,a:visited,a:hover{color:#333;}#page{width:750px;margin:50px auto;}#logo{color:#5f4b32;font-size:4em;}#desp{color:#5f4b32;font-size:1.2em;margin:-2.5em 0 2em 1em;}#form{margin:2em 0;text-align:right;}#form input{font-size:1.5em;}#form #url{width:500px;}#form #submit{width:50px;border:none;background-color:transparent;cursor:pointer;}#footer{overflow:hidden;text-align:right;list-style:none;}#footer li{margin-right:-2px;float:right;}#footer li a{border-right:1px solid #555;padding:0 .5em;}.article{color:#5f4b32;font-size:16px;}.article a:link{color:#5f4b32;}.article h1,.article h2,.article h3,.article h4,.article h5,.article h6{margin:.5em 0 1.5em 0;font-weight:700;}.article h1{font-size:1.8em;}.article h1 a:link,.article h1 a:hover,.article h1 a:visited{text-decoration:none;}.article h2{font-size:1.5em;}.article h3{font-size:1.3em;}.article h4{font-size:1.1em;}.article h5{font-size:1em;}.article h6{font-size:.8em;}.article p{line-height:1.5em;font-size:20px;margin:0 0 1em 0;}.article code,.article pre,.article img{padding:5px;border:1px solid #999;background:#fff;-moz-border-radius:5px;-webkit-border-radius:5px;border-radius:5px;max-width:90%;margin:.3em 0;}.article code,.article pre{margin:1em 0;font-size:14px;font-family:monaco,"new courier",courier,sans-serif;}.article code *,.article pre *{border:0;}.article dl,.article ol,.article ul{line-height:1.5em;font-size:18px;margin:0 0 .8em 2em;} 
-------------------------------------------------------------------------------- /assets/css/readability.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 nobomb: */ 2 | /** 3 | * Readability 首页 4 | * 5 | * @author mingcheng 6 | * @date 2011-02-18 7 | * @link http://www.gracecode.com/ 8 | */ 9 | 10 | 11 | body { 12 | background: #fbf0d9; 13 | font-size: 16px; 14 | font-family: "minion-pro-1","minion-pro-2",Palatino,Georgia,"Times New Roman",serif; 15 | } 16 | 17 | * html hr, html > body hr { 18 | margin: 1em 0; 19 | } 20 | 21 | a:link, a:visited, a:hover { 22 | color: #333; 23 | } 24 | 25 | #page { 26 | width: 750px; 27 | margin: 50px auto; 28 | } 29 | 30 | #logo { 31 | color: #5f4b32; 32 | font-size: 4em; 33 | } 34 | 35 | #desp { 36 | color: #5f4b32; 37 | font-size: 1.2em; 38 | margin: -2.5em 0 2em 1em; 39 | } 40 | 41 | #form { 42 | margin: 2em 0; 43 | text-align: right; 44 | } 45 | 46 | #form input { 47 | font-size: 1.5em; 48 | } 49 | 50 | #form #url { 51 | width: 500px; 52 | } 53 | 54 | #form #submit { 55 | width: 50px; 56 | border: none; 57 | background-color: transparent; 58 | cursor: pointer; 59 | } 60 | 61 | #footer { 62 | overflow: hidden; 63 | text-align: right; 64 | list-style: none; 65 | } 66 | 67 | #footer li { 68 | margin-right: -2px; 69 | float: right; 70 | } 71 | 72 | #footer li a { 73 | border-right: 1px solid #555; 74 | padding: 0 .5em; 75 | } 76 | 77 | 78 | .article { 79 | color: #5f4b32; 80 | font-size: 16px; 81 | } 82 | 83 | .article a:link { 84 | color: #5f4b32; 85 | } 86 | 87 | .article h1 , .article h2 , .article h3 , .article h4 , 88 | .article h5 , .article h6 { 89 | margin: .5em 0 1.5em 0; 90 | font-weight: 700; 91 | } 92 | 93 | .article h1 { 94 | font-size: 1.8em; 95 | } 96 | 97 | .article h1 a:link, 98 | .article h1 a:hover, 99 | .article h1 a:visited { 100 | text-decoration: none; 101 | } 102 | 103 | 104 | .article h2 { 105 | font-size: 
1.5em; 106 | } 107 | .article h3 { 108 | font-size: 1.3em; 109 | } 110 | 111 | .article h4 { 112 | font-size: 1.1em; 113 | } 114 | 115 | .article h5 { 116 | font-size: 1em; 117 | } 118 | 119 | .article h6 { 120 | font-size: .8em; 121 | } 122 | 123 | .article p { 124 | line-height: 1.5em; 125 | font-size: 20px; 126 | margin: 0 0 1em 0; 127 | } 128 | 129 | .article code, 130 | .article pre, 131 | .article img { 132 | padding: 5px; 133 | border: 1px solid #999; 134 | background: #fff; 135 | -moz-border-radius: 5px; 136 | -webkit-border-radius: 5px; 137 | border-radius: 5px; 138 | max-width: 90%; 139 | margin: .3em 0; 140 | } 141 | 142 | 143 | .article code, 144 | .article pre { 145 | margin: 1em 0; 146 | font-size: 14px; 147 | font-family: monaco, "new courier", courier, sans-serif; 148 | } 149 | 150 | .article code *, .article pre * { 151 | border: 0; 152 | } 153 | 154 | .article dl, .article ol, .article ul { 155 | line-height: 1.5em; 156 | font-size: 18px; 157 | margin: 0 0 .8em 2em; 158 | } 159 | -------------------------------------------------------------------------------- /assets/css/reset-min.css: -------------------------------------------------------------------------------- 1 | html,body,div,span,object,iframe,h1,h2,h3,h4,h5,h6,p,blockquote,pre,a,abbr,acronym,address,code,del,dfn,em,img,q,dl,dt,dd,ol,ul,li,fieldset,form,label,legend,table,caption,tbody,tfoot,thead,tr,th,td{margin:0;padding:0;border:0;font-weight:inherit;font-style:inherit;font-family:inherit;vertical-align:baseline;}:focus{outline:none;}h1{font-size:200%;}h2{font-size:180%;}h3{font-size:160%;}h4{font-size:140%;}h5{font-size:120%;}h6,p{font-size:100%;}body{line-height:1.5;}table{border-collapse:separate;border-spacing:0;}caption,th,td{text-align:left;font-weight:normal;}table,td,th{vertical-align:middle;}blockquote:before,blockquote:after,q:before,q:after{content:"";}blockquote,q{quotes:"" "";}a 
img{border:none;outline:none;}ol{margin-left:2em;}sup{vertical-align:text-top;}sub{vertical-align:text-bottom;}html>body p code{white-space:normal;}* html legend{margin:-18px -8px 16px 0;padding:0;}* html hr,html>body hr{margin:-8px auto 11px;} -------------------------------------------------------------------------------- /assets/css/reset.css: -------------------------------------------------------------------------------- 1 | /**! 2 | * CSS ReSet 3 | * 4 | * @author i.feelinglucky@gmail.com 5 | * @link http://www.gracecode.com/ 6 | * @version $Id: reset.source.css 283 2009-11-15 08:08:31Z i.feelinglucky $ 7 | * @change 8 | * [+]new feature [*]improvement [!]change [x]bug fix 9 | * 10 | * [+] 2008-12-25 11 | * 增加 :focus 的 outline 属性,防止 Firefox 出现虚线框 12 | */ 13 | 14 | html, body, div, span, object, iframe, 15 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 16 | a, abbr, acronym, address, code, 17 | del, dfn, em, img, q, dl, dt, dd, ol, ul, li, 18 | fieldset, form, label, legend, 19 | table, caption, tbody, tfoot, thead, tr, th, td { 20 | margin: 0em; padding: 0em; border: 0em; 21 | font-weight: inherit; 22 | font-style: inherit; 23 | font-family: inherit; 24 | vertical-align: baseline; 25 | } 26 | 27 | :focus { 28 | outline: none; 29 | } 30 | 31 | h1 { 32 | font-size: 200%; 33 | } 34 | 35 | h2 { 36 | font-size: 180%; 37 | } 38 | 39 | h3 { 40 | font-size: 160%; 41 | } 42 | 43 | h4 { 44 | font-size: 140%; 45 | } 46 | 47 | h5 { 48 | font-size: 120%; 49 | } 50 | 51 | h6, p { 52 | font-size: 100%; 53 | } 54 | 55 | body { 56 | line-height: 1.5; 57 | } 58 | 59 | table { 60 | border-collapse: separate; 61 | border-spacing: 0; 62 | } 63 | 64 | caption, th, td { 65 | text-align: left; 66 | font-weight: normal; 67 | } 68 | 69 | table, td, th { 70 | vertical-align: middle; 71 | } 72 | 73 | blockquote:before, blockquote:after, q:before, q:after { 74 | content: ""; 75 | } 76 | 77 | blockquote, q { 78 | quotes: "" ""; 79 | } 80 | 81 | a img { 82 | border: none; 83 | outline: 
none; 84 | } 85 | 86 | ol { 87 | margin-left: 2em; 88 | } 89 | 90 | sup { 91 | vertical-align: text-top; 92 | } 93 | 94 | sub { 95 | vertical-align: text-bottom; 96 | } 97 | 98 | /* hack for ie */ 99 | html>body p code { 100 | white-space: normal; 101 | } 102 | 103 | * html legend { 104 | margin: -18px -8px 16px 0; 105 | padding: 0; 106 | } 107 | 108 | * html hr, html>body hr { 109 | margin: -8px auto 11px; 110 | } 111 | 112 | /* vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 */ 113 | -------------------------------------------------------------------------------- /assets/css/typography-min.css: -------------------------------------------------------------------------------- 1 | body{font-size:75%;color:#222;background:#fff;}.simsun{font-family:Tahoma,SimSun,Arial;}h1,h2,h3,h4,h5,h6{font-weight:bold;}h1{font-size:3em;line-height:1;margin-bottom:.5em;}h2{font-size:2em;margin-bottom:.75em;}h3{font-size:1.5em;line-height:1;margin-bottom:1em;}h4{font-size:1.2em;line-height:1.25;margin-bottom:1.25em;height:1.25em;}h5{font-size:1em;font-weight:bold;margin-bottom:1.5em;}h6{font-size:1em;font-weight:bold;}h1 img,h2 img,h3 img,h4 img,h5 img,h6 img{margin:0;}p{margin:0 0 1.5em;}blockquote{margin:1.5em;color:#666;font-style:italic;}strong{font-weight:bold;}em,dfn{font-style:italic;}dfn{font-weight:bold;}sup,sub{line-height:0;}address{margin:0 0 1.5em;font-style:italic;}pre,code{margin:1.5em 0;white-space:pre;}pre,code,tt{line-height:1.5;}li ul,li ol,ul,ol{margin:0 1.5em;}ul{list-style-type:disc;}ol{list-style-type:decimal;}dl{margin:0 0 1.5em 0;}dl dt{font-weight:bold;}dd{margin-left:1.5em;}table{width:100%;}th{font-weight:bold;}th,td{padding:.5em 1em;}tfoot{font-style:italic;} -------------------------------------------------------------------------------- /assets/css/typography.css: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=4 ts=4 sts=4 fdm=marker ff=unix fenc=utf8 */ 2 | /** 3 | * CSS 文字排版 4 | * 5 | * 
@change 6 | * 2008-10-15 7 | * 根据 Blue Print CSS 制作初始化版本 8 | * 9 | * @author i.feelinglucky@gmail.com 10 | * @since 2008-09-19 11 | * @link http://www.gracecode.com/ 12 | * @version $Id: grid.source.css 96 2008-10-12 16:49:11Z i.feelinglucky $ 13 | */ 14 | 15 | body { 16 | font-size: 75%; 17 | color: #222; 18 | background: #fff; 19 | /* 20 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 21 | */ 22 | } 23 | 24 | /** 25 | * Windows 平台的“微软雅黑” 26 | * 27 | * feelinglucky: 应慎用“雅黑字体 28 | * @link: http://www.3user.com/topic/index/14019 29 | * @change 30 | * 2008-10-28 取消雅黑字体配置 31 | */ 32 | /* 33 | .yahei { 34 | font-family: "Microsoft YaHei"; 35 | } 36 | */ 37 | 38 | /** 39 | * Windows 平台的“宋体” 40 | * 41 | */ 42 | .simsun { 43 | font-family: Tahoma,SimSun,Arial; 44 | } 45 | 46 | /** 47 | * 重置标题 48 | */ 49 | h1, h2, h3, h4, h5, h6 { 50 | font-weight: bold; 51 | } 52 | 53 | h1 { 54 | font-size: 3em; 55 | line-height: 1; 56 | margin-bottom: 0.5em; 57 | } 58 | 59 | h2 { 60 | font-size: 2em; 61 | margin-bottom: 0.75em; 62 | } 63 | 64 | h3 { 65 | font-size: 1.5em; 66 | line-height: 1; 67 | margin-bottom: 1em; 68 | } 69 | 70 | h4 { 71 | font-size: 1.2em; 72 | line-height: 1.25; 73 | margin-bottom: 1.25em; 74 | height: 1.25em; 75 | } 76 | 77 | h5 { 78 | font-size: 1em; 79 | font-weight: bold; 80 | margin-bottom: 1.5em; 81 | } 82 | 83 | h6 { 84 | font-size: 1em; 85 | font-weight: bold; 86 | } 87 | 88 | h1 img, h2 img, h3 img, h4 img, h5 img, h6 img { 89 | margin: 0; 90 | } 91 | 92 | /** 93 | * 段落样式 94 | */ 95 | p { 96 | margin: 0 0 1.5em; 97 | } 98 | 99 | blockquote { 100 | margin: 1.5em; 101 | color: #666; 102 | font-style: italic; 103 | } 104 | 105 | strong { 106 | font-weight: bold; 107 | } 108 | 109 | em, dfn { 110 | font-style: italic; 111 | } 112 | 113 | dfn { 114 | font-weight: bold; 115 | } 116 | 117 | sup, sub { 118 | line-height: 0; 119 | } 120 | 121 | abbr, acronym { 122 | 123 | } 124 | 125 | address { 126 | margin: 0 0 1.5em; 127 | font-style: italic; 128 
| } 129 | 130 | del { 131 | 132 | } 133 | 134 | pre, code { 135 | margin: 1.5em 0; 136 | white-space: pre; 137 | } 138 | 139 | pre, code, tt { 140 | line-height: 1.5; 141 | } 142 | 143 | 144 | /** 145 | * 列表 146 | */ 147 | li ul, li ol, ul, ol { 148 | margin: 0 1.5em; 149 | } 150 | 151 | ul { 152 | list-style-type: disc; 153 | } 154 | 155 | ol { 156 | list-style-type: decimal; 157 | } 158 | 159 | dl { 160 | margin: 0 0 1.5em 0; 161 | } 162 | 163 | dl dt { 164 | font-weight: bold; 165 | } 166 | 167 | dd { 168 | margin-left: 1.5em; 169 | } 170 | 171 | /** 172 | * 表格 173 | */ 174 | table { 175 | width:100%; 176 | } 177 | 178 | th { 179 | font-weight: bold; 180 | } 181 | 182 | th, td { 183 | padding: .5em 1em; 184 | } 185 | 186 | tfoot { 187 | font-style: italic; 188 | } 189 | -------------------------------------------------------------------------------- /common.inc.php: -------------------------------------------------------------------------------- 1 | 7 | * @date 2011-02-17 8 | * @link http://www.gracecode.com/ 9 | */ 10 | 11 | 12 | /** 13 | * 安全获取 GET/POST 的参数 14 | * 15 | * @param String $request_name 16 | * @param Mixed $default_value 17 | * @param String $method 'post', 'get', 'all' default is 'all' 18 | * @return String 19 | */ 20 | function getRequestParam($request_name, $default_value = null, $method = "all") 21 | { 22 | $magic_quotes = ini_get("magic_quotes_gpc") ? true : false; 23 | $method = strtolower($method); 24 | 25 | switch (strtolower($method)) { 26 | default: 27 | case "all": 28 | if (isset($_POST[$request_name])) { 29 | return $magic_quotes ? stripslashes($_POST[$request_name]) : $_POST[$request_name]; 30 | } else if (isset($_GET[$request_name])) { 31 | return $magic_quotes ? stripslashes($_GET[$request_name]) : $_GET[$request_name]; 32 | } else { 33 | return $default_value; 34 | } 35 | break; 36 | 37 | case "get": 38 | if (isset($_GET[$request_name])) { 39 | return $magic_quotes ? 
stripslashes($_GET[$request_name]) : $_GET[$request_name]; 40 | } else { 41 | return $default_value; 42 | } 43 | break; 44 | 45 | case "post": 46 | if (isset($_POST[$request_name])) { 47 | return $magic_quotes ? stripslashes($_POST[$request_name]) : $_POST[$request_name]; 48 | } else { 49 | return $default_value; 50 | } 51 | break; 52 | 53 | default: 54 | return $default_value; 55 | break; 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /config.inc.php: -------------------------------------------------------------------------------- 1 | 7 | * @date 2011-02-17 8 | * @link http://www.gracecode.com/ 9 | */ 10 | 11 | define("DIR_ROOT", dirname(__FILE__)); 12 | define("URL_ROOT", "http://lab.gracecode.com/readability/"); 13 | 14 | define("DIR_CACHE", DIR_ROOT . '/cache'); 15 | define("CACHE_TIME", 3600 * 24); 16 | 17 | define('USER_AGENT', "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0)"); 18 | -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | 7 | * @date 2011-02-17 8 | * @link http://www.gracecode.com/ 9 | */ 10 | 11 | require 'config.inc.php'; 12 | require 'common.inc.php'; 13 | require 'lib/Readability.inc.php'; 14 | 15 | $request_url = getRequestParam("url", ""); 16 | $output_type = strtolower(getRequestParam("type", "html")); 17 | 18 | // 如果 URL 参数不正确,则跳转到首页 19 | if (!preg_match('/^http:\/\//i', $request_url) || 20 | !filter_var($request_url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED)) { 21 | include 'template/index.html'; 22 | exit; 23 | } 24 | 25 | $request_url_hash = md5($request_url); 26 | $request_url_cache_file = sprintf(DIR_CACHE."/%s.url", $request_url_hash); 27 | 28 | // 缓存请求数据,避免重复请求 29 | if (file_exists($request_url_cache_file) && 30 | (time() - filemtime($request_url_cache_file) < CACHE_TIME)) { 31 | 32 | $source = 
file_get_contents($request_url_cache_file); 33 | } else { 34 | 35 | $handle = curl_init(); 36 | curl_setopt_array($handle, array( 37 | CURLOPT_USERAGENT => USER_AGENT, 38 | CURLOPT_FOLLOWLOCATION => true, 39 | CURLOPT_HEADER => false, 40 | CURLOPT_HTTPGET => true, 41 | CURLOPT_RETURNTRANSFER => true, 42 | CURLOPT_TIMEOUT => 30, 43 | CURLOPT_URL => $request_url 44 | )); 45 | 46 | $source = curl_exec($handle); 47 | curl_close($handle); 48 | 49 | // Write request data into cache file. 50 | @file_put_contents($request_url_cache_file, $source); 51 | } 52 | 53 | // 判断编码 54 | //if (!$charset = mb_detect_encoding($source)) { 55 | //} 56 | preg_match("/charset=([\w|\-]+);?/", $source, $match); 57 | $charset = isset($match[1]) ? $match[1] : 'utf-8'; 58 | 59 | /** 60 | * 获取 HTML 内容后,解析主体内容 61 | */ 62 | $Readability = new Readability($source, $charset); 63 | $Data = $Readability->getContent(); 64 | 65 | switch($output_type) { 66 | case 'json': 67 | header("Content-type: text/json;charset=utf-8"); 68 | $Data['url'] = $request_url; 69 | echo json_encode($Data); 70 | break; 71 | 72 | case 'html': default: 73 | header("Content-type: text/html;charset=utf-8"); 74 | 75 | $title = $Data['title']; 76 | $content = $Data['content']; 77 | 78 | include 'template/reader.html'; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /lib/Readability.inc.php: -------------------------------------------------------------------------------- 1 | 17 | * @link http://www.gracecode.com/ 18 | * 19 | * @author Tuxion 20 | * @link http://tuxion.nl/ 21 | */ 22 | 23 | define("READABILITY_VERSION", 0.21); 24 | 25 | class Readability { 26 | // 保存判定结果的标记位名称 27 | const ATTR_CONTENT_SCORE = "contentScore"; 28 | 29 | // DOM 解析类目前只支持 UTF-8 编码 30 | const DOM_DEFAULT_CHARSET = "utf-8"; 31 | 32 | // 当判定失败时显示的内容 33 | const MESSAGE_CAN_NOT_GET = "Readability was unable to parse this page for content."; 34 | 35 | // DOM 解析类(PHP5 已内置) 36 | protected $DOM = null; 37 | 38 | // 
需要解析的源代码 39 | protected $source = ""; 40 | 41 | // 章节的父元素列表 42 | private $parentNodes = array(); 43 | 44 | // 需要删除的标签 45 | // Note: added extra tags from https://github.com/ridcully 46 | private $junkTags = Array("style", "form", "iframe", "script", "button", "input", "textarea", 47 | "noscript", "select", "option", "object", "applet", "basefont", 48 | "bgsound", "blink", "canvas", "command", "menu", "nav", "datalist", 49 | "embed", "frame", "frameset", "keygen", "label", "marquee", "link"); 50 | 51 | // 需要删除的属性 52 | private $junkAttrs = Array("style", "class", "onclick", "onmouseover", "align", "border", "margin"); 53 | 54 | 55 | /** 56 | * 构造函数 57 | * @param $input_char 字符串的编码。默认 utf-8,可以省略 58 | */ 59 | function __construct($source, $input_char = "utf-8") { 60 | $this->source = $source; 61 | 62 | // DOM 解析类只能处理 UTF-8 格式的字符 63 | $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char); 64 | 65 | // 预处理 HTML 标签,剔除冗余的标签等 66 | $source = $this->preparSource($source); 67 | 68 | // 生成 DOM 解析类 69 | $this->DOM = new DOMDocument('1.0', $input_char); 70 | try { 71 | //libxml_use_internal_errors(true); 72 | // 会有些错误信息,不过不要紧 :^) 73 | if (!@$this->DOM->loadHTML(''.$source)) { 74 | throw new Exception("Parse HTML Error!"); 75 | } 76 | 77 | foreach ($this->DOM->childNodes as $item) { 78 | if ($item->nodeType == XML_PI_NODE) { 79 | $this->DOM->removeChild($item); // remove hack 80 | } 81 | } 82 | 83 | // insert proper 84 | $this->DOM->encoding = Readability::DOM_DEFAULT_CHARSET; 85 | } catch (Exception $e) { 86 | // ... 87 | } 88 | } 89 | 90 | 91 | /** 92 | * 预处理 HTML 标签,使其能够准确被 DOM 解析类处理 93 | * 94 | * @return String 95 | */ 96 | private function preparSource($string) { 97 | // 剔除多余的 HTML 编码标记,避免解析出错 98 | preg_match("/charset=([\w|\-]+);?/", $string, $match); 99 | if (isset($match[1])) { 100 | $string = preg_replace("/charset=([\w|\-]+);?/", "", $string, 1); 101 | } 102 | 103 | // Replace all doubled-up
tags with

tags, and remove fonts. 104 | $string = preg_replace("/[ \r\n\s]*/i", "

", $string); 105 | $string = preg_replace("/<\/?font[^>]*>/i", "", $string); 106 | 107 | // @see https://github.com/feelinglucky/php-readability/issues/7 108 | // - from http://stackoverflow.com/questions/7130867/remove-script-tag-from-html-content 109 | $string = preg_replace("#(.*?)#is", "", $string); 110 | 111 | return trim($string); 112 | } 113 | 114 | 115 | /** 116 | * 删除 DOM 元素中所有的 $TagName 标签 117 | * 118 | * @return DOMDocument 119 | */ 120 | private function removeJunkTag($RootNode, $TagName) { 121 | 122 | $Tags = $RootNode->getElementsByTagName($TagName); 123 | 124 | //Note: always index 0, because removing a tag removes it from the results as well. 125 | while($Tag = $Tags->item(0)){ 126 | $parentNode = $Tag->parentNode; 127 | $parentNode->removeChild($Tag); 128 | } 129 | 130 | return $RootNode; 131 | 132 | } 133 | 134 | /** 135 | * 删除元素中所有不需要的属性 136 | */ 137 | private function removeJunkAttr($RootNode, $Attr) { 138 | $Tags = $RootNode->getElementsByTagName("*"); 139 | 140 | $i = 0; 141 | while($Tag = $Tags->item($i++)) { 142 | $Tag->removeAttribute($Attr); 143 | } 144 | 145 | return $RootNode; 146 | } 147 | 148 | /** 149 | * 根据评分获取页面主要内容的盒模型 150 | * 判定算法来自:http://code.google.com/p/arc90labs-readability/ 151 | * 152 | * @return DOMNode 153 | */ 154 | private function getTopBox() { 155 | // 获得页面所有的章节 156 | $allParagraphs = $this->DOM->getElementsByTagName("p"); 157 | 158 | // Study all the paragraphs and find the chunk that has the best score. 159 | // A score is determined by things like: Number of

's, commas, special classes, etc. 160 | $i = 0; 161 | while($paragraph = $allParagraphs->item($i++)) { 162 | $parentNode = $paragraph->parentNode; 163 | $contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE)); 164 | $className = $parentNode->getAttribute("class"); 165 | $id = $parentNode->getAttribute("id"); 166 | 167 | // Look for a special classname 168 | if (preg_match("/(comment|meta|footer|footnote)/i", $className)) { 169 | $contentScore -= 50; 170 | } else if(preg_match( 171 | "/((^|\\s)(section|post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i", 172 | $className)) { 173 | $contentScore += 25; 174 | } 175 | 176 | // Look for a special ID 177 | if (preg_match("/(comment|meta|footer|footnote)/i", $id)) { 178 | $contentScore -= 50; 179 | } else if (preg_match( 180 | "/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i", 181 | $id)) { 182 | $contentScore += 25; 183 | } 184 | 185 | // Add a point for the paragraph found 186 | // Add points for any commas within this paragraph 187 | if (strlen($paragraph->nodeValue) > 10) { 188 | $contentScore += strlen($paragraph->nodeValue); 189 | } 190 | 191 | // 保存父元素的判定得分 192 | $parentNode->setAttribute(Readability::ATTR_CONTENT_SCORE, $contentScore); 193 | 194 | // 保存章节的父元素,以便下次快速获取 195 | array_push($this->parentNodes, $parentNode); 196 | } 197 | 198 | $topBox = null; 199 | 200 | // Assignment from index for performance. 201 | // See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 202 | for ($i = 0, $len = sizeof($this->parentNodes); $i < $len; $i++) { 203 | $parentNode = $this->parentNodes[$i]; 204 | $contentScore = intval($parentNode->getAttribute(Readability::ATTR_CONTENT_SCORE)); 205 | $orgContentScore = intval($topBox ? $topBox->getAttribute(Readability::ATTR_CONTENT_SCORE) : 0); 206 | 207 | // by raywill, 2016-9-2 208 | // for case:

xxx

yyy

209 | if ($parentNode && $topBox && $topBox->parentNode 210 | && $parentNode !== $topBox 211 | && $parentNode->parentNode === $topBox->parentNode 212 | && $this->scoreMatch($parentNode, $topBox)) { // trust same level 213 | 214 | $topScore = intval($topBox->getAttribute(Readability::ATTR_CONTENT_SCORE)); 215 | $topBox = $topBox->parentNode; 216 | $topBox->setAttribute(Readability::ATTR_CONTENT_SCORE, $topScore + $contentScore); 217 | } else if ($contentScore && $contentScore > $orgContentScore) { 218 | 219 | $topBox = $parentNode; 220 | } 221 | } 222 | 223 | // 此时,$topBox 应为已经判定后的页面内容主元素 224 | return $topBox; 225 | } 226 | 227 | protected function scoreMatch($n1, $n2) { 228 | $n1Score = intval($n1->getAttribute(Readability::ATTR_CONTENT_SCORE)); 229 | $n2Score = intval($n2->getAttribute(Readability::ATTR_CONTENT_SCORE)); 230 | return ($n1Score > 0 && $n2Score > 0); 231 | } 232 | 233 | /** 234 | * 获取 HTML 页面标题 235 | * 236 | * @return String 237 | */ 238 | public function getTitle() { 239 | $split_point = ' - '; 240 | $titleNodes = $this->DOM->getElementsByTagName("title"); 241 | 242 | if ($titleNodes->length 243 | && $titleNode = $titleNodes->item(0)) { 244 | // @see http://stackoverflow.com/questions/717328/how-to-explode-string-right-to-left 245 | $title = trim($titleNode->nodeValue); 246 | $result = array_map('strrev', explode($split_point, strrev($title))); 247 | return sizeof($result) > 1 ? array_pop($result) : $title; 248 | } 249 | 250 | return null; 251 | } 252 | 253 | 254 | /** 255 | * Get Leading Image Url 256 | * 257 | * @return String 258 | */ 259 | public function getLeadImageUrl($node) { 260 | $images = $node->getElementsByTagName("img"); 261 | 262 | if ($images->length){ 263 | $i = 0; 264 | while($leadImage = $images->item($i++)) { 265 | $imgsrc = $leadImage->getAttribute("src"); 266 | $imgdatasrc = $leadImage->getAttribute("data-src"); 267 | $imgsrclast = $imgsrc ? 
$imgsrc : $imgdatasrc; 268 | list($img['width'],$img['height'])=getimagesize($imgsrclast); 269 | if($img['width'] > 150 && $img['height'] >150){ 270 | return $imgsrclast; 271 | } 272 | 273 | } 274 | } 275 | 276 | return null; 277 | } 278 | 279 | 280 | /** 281 | * 获取页面的主要内容(Readability 以后的内容) 282 | * 283 | * @return Array 284 | */ 285 | public function getContent() { 286 | if (!$this->DOM) return false; 287 | 288 | // 获取页面标题 289 | $ContentTitle = $this->getTitle(); 290 | 291 | // 获取页面主内容 292 | $ContentBox = $this->getTopBox(); 293 | 294 | //Check if we found a suitable top-box. 295 | if($ContentBox === null) 296 | throw new RuntimeException(Readability::MESSAGE_CAN_NOT_GET); 297 | 298 | // 复制内容到新的 DOMDocument 299 | $Target = new DOMDocument; 300 | $Target->appendChild($Target->importNode($ContentBox, true)); 301 | 302 | // 删除不需要的标签 303 | foreach ($this->junkTags as $tag) { 304 | $Target = $this->removeJunkTag($Target, $tag); 305 | } 306 | 307 | // 删除不需要的属性 308 | foreach ($this->junkAttrs as $attr) { 309 | $Target = $this->removeJunkAttr($Target, $attr); 310 | } 311 | 312 | $content = mb_convert_encoding($Target->saveHTML(), Readability::DOM_DEFAULT_CHARSET, "HTML-ENTITIES"); 313 | 314 | // 多个数据,以数组的形式返回 315 | return Array( 316 | 'lead_image_url' => $this->getLeadImageUrl($Target), 317 | 'word_count' => mb_strlen(strip_tags($content), Readability::DOM_DEFAULT_CHARSET), 318 | 'title' => $ContentTitle ? 
$ContentTitle : null, 319 | 'content' => $content 320 | ); 321 | } 322 | 323 | function __destruct() { } 324 | } 325 | 326 | -------------------------------------------------------------------------------- /template/footer.inc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | 43 | -------------------------------------------------------------------------------- /template/header.inc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /template/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Readability - Gracecode.com 6 | 7 | 8 | 9 |
10 |

無標題文檔

11 | Fork me on GitHub 12 |
13 | 14 |
15 |

Readability

16 |

Back the fun of reading

17 | 26 | 27 |

28 | Make Readability! 30 | 31 |

32 | 67 |
68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /template/reader.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | <?php echo mb_convert_encoding($title, "HTML-ENTITIES", "UTF-8"); ?> - Gracecode.com 7 | 8 | 9 | 10 |
11 |

無標題文檔

12 | Fork me on GitHub 13 |
14 | 15 |
16 |

19 | 20 | 21 | 22 |
23 | 24 | 25 | 26 | 27 | --------------------------------------------------------------------------------