├── .gitignore ├── README.md ├── composer.json ├── custom └── skel.inc.php ├── lib ├── StringHelper.php └── UrlTable.php ├── spider.php └── tests ├── bootstrap.php ├── lib ├── HttpBaseTest.php ├── HttpClientTest.php ├── HttpConnTest.php ├── HttpProcesserTest.php ├── HttpRequestTest.php ├── HttpResponseTest.php ├── UrlParserTest.php └── UrlTableMySQLTest.php └── phpunit.xml /.gitignore: -------------------------------------------------------------------------------- 1 | # ignores 2 | custom/test.inc.php 3 | composer.lock 4 | vendor 5 | .idea 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PHP - spider 框架 2 | =================== 3 | 4 | 这是最近使用纯 `php` 代码开发的并行抓取(爬虫)框架,基于 [hightman\httpclient](https://github.com/hightman/httpclient) 组件。 5 | 6 | 您必须先装有 [composer](http://getcomposer.org),然后在项目里先运行以下命令下载组件: 7 | 8 | ~~~ 9 | composer install 10 | ~~~ 11 | 12 | 13 | 使用 pspider 14 | -------------- 15 | 16 | 这里头的 URL 表管理需要 MySQLi 扩展支持,表结构和自定义的内容参见自定义文件。 17 | 18 | 1. 复制 `custom/skel.inc.php` 为 `custom/your.inc.php` 19 | 2. 根据说明修改 custom/your.inc.php 20 | 3. 根据 custom/your.inc.php 里的注释创建 mysql 的 URL 表 21 | 4. 运行 spider.php -u http://... 即可开始循环抓取 22 | 5. 
UrlTable 的实现很简单仅作示例,具体可自行重做 23 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hightman/pspider", 3 | "description": "A pure PHP spider framework", 4 | "keywords": ["php", "spider", "crawler", "robot"], 5 | "homepage": "https://github.com/hightman/pspider/", 6 | "type": "project", 7 | "license": "MIT", 8 | "require": { 9 | "php": ">=5.4.0", 10 | "ext-mysqli": "*", 11 | "lib-pcre": "*", 12 | "hightman/httpclient": "*" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /custom/skel.inc.php: -------------------------------------------------------------------------------- 1 | 6 | * @link http://www.hightman.cn/ 7 | * @copyright Copyright © 2008-2013 Twomice Studio 8 | */ 9 | 10 | use hightman\http\Response; 11 | use hightman\http\Request; 12 | 13 | /// --- custom 并发抓取数量 14 | define('PSP_NUM_PARALLEL', 5); 15 | 16 | /// --- custom 同一 URL 连续抓取间隔 17 | define('PSP_CRAWL_PERIOD', 3600); 18 | 19 | /// --- Adding StringHelper on need 20 | /// require_once __DIR__ . 
'/../lib/StringHelper.php'; 21 | 22 | /** 23 | * MySQL settings; requires a _urls table with the following structure: 24 | CREATE TABLE `_urls` ( 25 | `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL', 26 | `url` text, 27 | `rank` smallint NOT NULL default '0' COMMENT 'process prior level', 28 | `status` smallint NOT NULL default '0' COMMENT 'last http response status', 29 | `select_time` int unsigned NOT NULL default '0' COMMENT 'last process time', 30 | `update_time` int unsigned NOT NULL default '0' COMMENT 'last update time', 31 | PRIMARY KEY (`id`) 32 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='url table for pspider'; 33 | */ 34 | class UrlTableCustom extends UrlTableMySQL 35 | { 36 | 37 | public function __construct() 38 | { 39 | /// --- custom setting BEGIN 40 | $host = 'localhost'; 41 | $user = 'root'; 42 | $pass = ''; 43 | $dbname = 'test'; 44 | /// --- custom setting END 45 | 46 | parent::__construct($host, $user, $pass, $dbname); 47 | $this->test(); 48 | } 49 | } 50 | 51 | /** 52 | * Custom parser 53 | */ 54 | class UrlParserCustom extends UrlParser 55 | { 56 | 57 | /** 58 | * Add the code that parses and processes the fetched content inside this method 59 | */ 60 | public function parse(Response $res, Request $req, $key) 61 | { 62 | parent::parse($res, $req, $key); 63 | if ($res->status === 200) { 64 | /// --- custom code BEGIN --- 65 | echo "PROCESSING: " . $req->getUrl() . 
"\n"; 66 | /// --- custom code END --- 67 | } 68 | } 69 | 70 | /** 71 | * Add new URL filter rules inside this method, mainly by calling the following methods: 72 | * followExternal() 73 | * allowDomain(), disallowDomain() 74 | * allow(), disallow(), disallowExt() 75 | * 76 | * Note: allow() accepts a third argument that specifies whether the links of pages matching the rule are followed 77 | */ 78 | public function defaultFilter() 79 | { 80 | parent::defaultFilter(); 81 | /// --- custom filter BEGIN --- 82 | $this->followExternal(false); 83 | $this->disallow('.php?q='); 84 | /// --- custom filter END --- 85 | } 86 | 87 | /** 88 | * Decide inside this method whether the links found in the content of this url are analysed 89 | * @param string $url 90 | * @return boolean 91 | */ 92 | protected function isFollowUrl($url) 93 | { 94 | return parent::isFollowUrl($url); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /lib/StringHelper.php: -------------------------------------------------------------------------------- 1 | 6 | * @link http://www.hightman.cn/ 7 | * @copyright Copyright © 2008-2013 Twomice Studio 8 | */ 9 | 10 | /** 11 | * String Helper (all are static function) 12 | * 13 | *
 14 |  * StringHelper::decodeHtml($html);
 15 |  * StringHelper::fixHtmlCharset($html, $charset = 'utf-8');
 16 |  * StringHelper::finds($buf, $config[, &$error]);
 17 |  * StringHelper::find($buf, $tag1, $tag2[, ...]);
 18 |  * StringHelper::contains($buf, $tokens);
 19 |  * 
20 | */ 21 | class StringHelper 22 | { 23 | 24 | /** 25 | * @param string $html 26 | * @return string 解码后的 html 27 | */ 28 | public static function decodeHtml($html) 29 | { 30 | if (strpos($html, '<') !== false) { 31 | $html = strip_tags($html); /* preg_replace('/<.+?>/u', '', $html); */ 32 | } 33 | return html_entity_decode(trim($html), ENT_QUOTES, 'utf-8'); 34 | } 35 | 36 | /** 37 | * @param string $charset 目标字符集,默认 utf-8 38 | * @return string 强制转换网页内容为目标字符集 39 | */ 40 | public static function fixHtmlCharset($html, $charset = 'utf-8') 41 | { 42 | if (preg_match('/charset=["\']?([0-9a-zA-Z_-]+)/', $html, $match) 43 | && (strncasecmp($charset, 'gb', 2) || strncasecmp($match[1], 'gb', 2)) 44 | && strcasecmp($charset, $match[1])) { 45 | if (!strcasecmp($match[1], 'gb2312')) { 46 | $match[1] = 'gbk'; 47 | } 48 | if (function_exists('iconv')) { 49 | return iconv($match[1], $charset . '//IGNORE', $html); 50 | } elseif (function_exists('mb_convert_encoding')) { 51 | return mb_convert_encoding($html, $charset, $match[1]); 52 | } 53 | } 54 | return $html; 55 | } 56 | 57 | /** 58 | * 根据标记快速查找字符串列表 59 | * @param string $buf 60 | * @param array $config 61 | * array( 62 | * array(key1, arg1, arg2, ...), 63 | * array(key2, arg1, arg2, ...), 64 | * ), 65 | * @return array 66 | * @see StringMatcher::find 67 | */ 68 | public static function finds($buf, $config, &$error = null) 69 | { 70 | $obj = new StringMatcher($buf); 71 | return $obj->finds($config, $error); 72 | } 73 | 74 | /** 75 | * 根据标记快速查找字符串 76 | * @param string $buf 77 | * @return string 返回最后两个标记之间的内容,找不到返回 null 78 | * @see StringMatcher::find 79 | */ 80 | public static function find($buf) 81 | { 82 | $args = func_get_args(); 83 | array_shift($args); 84 | $obj = new StringMatcher($buf); 85 | return call_user_func_array(array($obj, 'find'), $args); 86 | } 87 | 88 | /** 89 | * 判断字符串是否包含数组中的字符串 90 | * @param string $buf 源字符串 91 | * @param array $tokens 字符串标记列表 92 | * @return boolean 93 | */ 94 | public static function 
contains($buf, $tokens) 95 | { 96 | foreach ($tokens as $token) { 97 | if (strpos($buf, $token) !== false) { 98 | return true; 99 | } 100 | } 101 | return false; 102 | } 103 | } 104 | 105 | /** 106 | * StringMatcher to parse data 107 | */ 108 | class StringMatcher 109 | { 110 | private $_buf, $_pos; 111 | 112 | /** 113 | * @param string $buf 114 | */ 115 | public function __construct($buf) 116 | { 117 | $this->_buf = $buf; 118 | $this->_pos = 0; 119 | } 120 | 121 | /** 122 | * 批量查找 123 | * @param array $config 124 | * array( 125 | * array(key1, arg1, arg2, ...), 126 | * array(key2, arg1, arg2, ...), 127 | * ), 128 | * @param string $error optional reference 129 | * @return array 130 | */ 131 | public function finds($config, &$error = null) 132 | { 133 | $ret = array(); 134 | foreach ($config as $args) { 135 | $key = array_shift($args); 136 | $val = call_user_func_array(array($this, 'find'), $args); 137 | if ($val === null || $val === false) { 138 | $error = 'Cannot find `' . $key . '\': ' . implode(' ... ', $args); 139 | $pos = strrpos($error, '...'); 140 | $error = substr_replace($error, '???', $pos, 3); 141 | continue; 142 | //return false; 143 | } 144 | $ret[$key] = $val; 145 | } 146 | return $ret; 147 | } 148 | 149 | /** 150 | * 根据特征查找字符串,不定参数: 151 | * 起始1,起始2,起始3 ... 结束关键 152 | * 新增支持特殊串 153 | * "$$$...",表示后面的字符串必须在这个字符串之前,以免跨越太大 154 | * "^^^...",表示后面的字符串如果在这个串之前就用采用当前串的位置 155 | * @return string 成功返回区间内的字符串并将位置设在本字符串之末,若找不到返回 null 156 | */ 157 | public function find() 158 | { 159 | $args = func_get_args(); 160 | $cnt = count($args); 161 | if ($cnt < 2) { 162 | return trigger_error(__CLASS__ . '::find() expects at least 2 parameters, ' . $cnt . 
' given', E_USER_WARNING); 163 | } 164 | for ($end = $pre = false, $pos1 = $this->_pos, $i = 0; $i < ($cnt - 1); $i++) { 165 | if (substr($args[$i], 0, 3) === '$$$') { 166 | $end = strpos($this->_buf, substr($args[$i], 3), $pos1); 167 | } elseif (substr($args[$i], 0, 3) === '^^^') { 168 | $pre = strpos($this->_buf, substr($args[$i], 3), $pos1); 169 | } else { 170 | $pos1 = strpos($this->_buf, $args[$i], $pos1); 171 | if ($pos1 === false) { 172 | return null; 173 | } elseif ($end !== false && $pos1 > $end) { 174 | return ''; 175 | } 176 | if ($pre !== false) { 177 | if ($pos1 > $pre) { 178 | $pos1 = $pre; 179 | } 180 | $pre = false; 181 | } 182 | $pos1 += strlen($args[$i]); 183 | } 184 | } 185 | if (($pos2 = strpos($this->_buf, $args[$i], $pos1)) !== false) { 186 | if ($end !== false && $pos2 > $end) { 187 | return ''; 188 | } 189 | if ($pre !== false) { 190 | if ($pos2 > $pre) { 191 | $pos2 = $pre; 192 | } 193 | $pre = false; 194 | } 195 | $this->_pos = $pos2; 196 | return substr($this->_buf, $pos1, $pos2 - $pos1); 197 | } 198 | return null; 199 | } 200 | 201 | /** 202 | * 移动当前处理位置位置指针,类似 fseek 203 | * @param int $offset 204 | * @param int $whence 可选值:SEEK_SET/SEEK_CUR/SEEK_END 205 | */ 206 | public function seek($offset, $whence = SEEK_CUR) 207 | { 208 | $offset = intval($offset); 209 | switch ($whence) { 210 | case SEEK_SET: 211 | $this->_pos = $offset; 212 | break; 213 | case SEEK_END: 214 | $this->_pos = $offset + strlen($this->_buf); 215 | break; 216 | case SEEK_CUR: 217 | default: 218 | $this->_pos += $offset; 219 | break; 220 | } 221 | return $this->_pos; 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /lib/UrlTable.php: -------------------------------------------------------------------------------- 1 | 6 | * @link http://www.hightman.cn/ 7 | * @copyright Copyright © 2008-2013 Twomice Studio 8 | */ 9 | use hightman\http\ParseInterface; 10 | use hightman\http\Response; 11 | use hightman\http\Request; 12 
/**
 * URL list management interface.
 */
interface UrlTable
{
    /**
     * Default interval, in seconds, between two consecutive processings of the same URL.
     */
    const DURATION = 3600;

    /**
     * @return int total number of URLs in the list
     */
    public function getCount();

    /**
     * @param int $duration interval, in seconds, before the same URL is handed out again
     * @return string one URL waiting to be processed, null when there is none, false on error
     */
    public function getOne($duration = self::DURATION);

    /**
     * @param int $limit
     * @param int $duration
     * @return array at most $limit URLs, an empty array when there is none, false on error
     */
    public function getSome($limit = 5, $duration = self::DURATION);

    /**
     * @param string $url the URL to add
     * @param int $rank priority used when URLs are picked for processing
     * @return boolean true on success; false when it already exists or on any other failure
     */
    public function addUrl($url, $rank = 0);

    /**
     * @param string $url the URL to update
     * @param int $status status code obtained when the URL was processed
     * @return boolean true on success, false on failure
     */
    public function updateUrl($url, $status = 200);

    /**
     * @param string $url the URL to delete
     * @return boolean true on success, false on failure
     */
    public function delUrl($url);
}

/**
 * URL list management based on MySQLi, using the following structure:
 * CREATE TABLE `_urls` (
 *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
 *   `url` text NOT NULL,
 *   `rank` smallint(6) NOT NULL COMMENT 'process prior level',
 *   `status` smallint(6) NOT NULL COMMENT 'last http response status',
 *   `select_time` bigint(20) NOT NULL COMMENT 'last process time',
 *   `update_time` bigint(20) NOT NULL COMMENT 'last update time',
 *   PRIMARY KEY (`id`)
 * ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
 *
 * The `rank` column is always written with backticks in the generated SQL because
 * RANK is a reserved word in recent MySQL versions.
 */
class UrlTableMySQL extends mysqli implements UrlTable
{
    private $_table = '_urls';
    private $_addCache = array();

    /**
     * @param string $name database table name, defaults to _urls
     */
    public function setTableName($name)
    {
        $this->_table = $name;
    }

    /**
     * @return int number of rows in the URL table, 0 when the query fails
     */
    public function getCount()
    {
        $res = $this->query('SELECT COUNT(*) AS count FROM ' . $this->_table);
        if ($res !== false) {
            $row = $res->fetch_assoc();
            $res->free();
            // mysqli hands the count back as a string; the interface promises an int
            return intval($row['count']);
        }
        return 0;
    }

    /**
     * @param int $duration
     * @return string one URL, null when none is pending, false on error
     */
    public function getOne($duration = self::DURATION)
    {
        $urls = $this->getSome(1, $duration);
        if (!is_array($urls)) {
            return false;
        }
        return count($urls) > 0 ? $urls[0] : null;
    }

    /**
     * Pick the pending URLs with the highest score and stamp them with the current
     * time as select_time. The pick-and-stamp sequence runs under an exclusive lock
     * on a file in the system temp directory.
     * @param int $limit
     * @param int $duration
     * @return array list of URLs (possibly empty), false when the SELECT fails
     */
    public function getSome($limit = 5, $duration = self::DURATION)
    {
        $now = time();
        $sql = 'SELECT id, url, ((' . $now . ' - select_time) * (`rank` + 1) / (status + 1)) AS score FROM ' . $this->_table . ' ';
        $sql .= 'WHERE select_time < ' . ($now - intval($duration)) . ' '; // expired
        $sql .= 'OR (select_time > update_time AND select_time < ' . ($now - 300) . ') '; // failed
        $sql .= 'ORDER BY score DESC LIMIT ' . intval($limit);

        // best effort: when the lock file cannot be opened the query runs unlocked
        $fd = @fopen(sys_get_temp_dir() . DIRECTORY_SEPARATOR . __CLASS__ . '.lock', 'w');
        if ($fd !== false) {
            flock($fd, LOCK_EX);
        }
        if (($res = $this->query($sql)) === false) {
            $ret = false;
        } else {
            $ret = $ids = array();
            while ($row = $res->fetch_assoc()) {
                $ids[] = $this->real_escape_string($row['id']);
                $ret[] = $row['url'];
            }
            $res->free();
            if (count($ids) > 0) {
                $sql = 'UPDATE ' . $this->_table . ' SET select_time = ' . $now . ' ';
                $sql .= 'WHERE id IN (\'' . implode('\', \'', $ids) . '\')';
                $this->query($sql);
            }
        }
        if ($fd !== false) {
            // always close the handle, even if unlocking reports a failure
            flock($fd, LOCK_UN);
            fclose($fd);
        }
        return $ret;
    }

    /**
     * @param string $url
     * @param int $rank
     * @return boolean false when the URL was already submitted through this object
     *  (see inAddCache()) or when the INSERT fails
     */
    public function addUrl($url, $rank = 0)
    {
        $id = md5($url);
        if ($this->inAddCache($id)) {
            return false;
        }
        $url = $this->real_escape_string($url);
        $sql = 'INSERT INTO ' . $this->_table . ' (id, url, `rank`) ';
        $sql .= 'VALUES (\'' . $id . '\', \'' . $url . '\', ' . intval($rank) . ')';
        return $this->query($sql);
    }

    /**
     * Store the status and the current time as update_time for a URL.
     * @param string $url
     * @param int $status
     * @return boolean
     */
    public function updateUrl($url, $status = 200)
    {
        $now = time();
        $sql = 'UPDATE ' . $this->_table . ' SET status = ' . intval($status) . ', update_time = ' . $now . ' ';
        $sql .= 'WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql);
    }

    /**
     * @param string $url
     * @return boolean true only when exactly one row was deleted
     */
    public function delUrl($url)
    {
        $sql = 'DELETE FROM ' . $this->_table . ' WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql) && $this->affected_rows === 1;
    }

    /**
     * Same as mysqli::query(), preceded by a ping() of the connection.
     */
    public function query($query, $mode = MYSQLI_STORE_RESULT)
    {
        $this->ping();
        $res = parent::query($query, $mode);
        return $res;
    }

    /**
     * Check the connection and the table by adding and deleting a throw-away URL.
     * Raises E_USER_ERROR when the connection or the insert fails.
     * @return boolean
     */
    protected function test()
    {
        if ($this->connect_error) {
            return trigger_error($this->connect_error, E_USER_ERROR);
        }
        $url = 'http://' . uniqid() . '.com/';
        if (!$this->addUrl($url)) {
            return trigger_error($this->error, E_USER_ERROR);
        }
        $this->delUrl($url);
        return true;
    }

    /**
     * In-memory cache of the ids recently passed to addUrl(), to save INSERTs.
     * The id is recorded (or its time refreshed) on every call; once the cache
     * exceeds 20000 entries, the ones older than one hour are dropped.
     * @param string $id
     * @return boolean true when the id was already in the cache
     */
    private function inAddCache($id)
    {
        $now = time();
        if (isset($this->_addCache[$id])) {
            $this->_addCache[$id] = $now;
            return true;
        }
        $this->_addCache[$id] = $now;
        if (count($this->_addCache) > 20000) {
            $cache = array();
            $expire = $now - 3600;
            foreach ($this->_addCache as $key => $value) {
                if ($value > $expire) {
                    $cache[$key] = $value;
                }
            }
            $this->_addCache = $cache;
        }
        return false;
    }
}
201 | 202 | /** 203 | * 带 URL 提取功能的解析器基础类 204 | * 205 | * 设置是 URL 过滤排除规则: 206 | * 规则语法支持局部字符串匹配,或正则匹配(必须是 # 开头) 207 | * 1. 若是默认允许的外站域名,则检测 disallowDomain 匹配一条则直接排除 208 | * 2. 若是默认不允许的外站域名,则检测 allowDomain,匹配任何一条则通过继续检测 209 | * 3. 检测 disallow 规则,匹配其中一条则立即排除 210 | * 4. 检测 allow 规则,若为空则直接通过,否则必须至少满足其中一条 211 | * 5. 检测 disallowExt 规则,匹配不允许的扩展名则直接排除 212 | * 6. 
最终通过 ^-^
 */
class UrlParser implements ParseInterface
{
    private $_timeBegin, $_numAdd, $_numUpdate, $_numFilter;
    private $_followExternal;
    private $_disallowDomain, $_allowDomain, $_disallow, $_allow;
    private $_allowRank;
    private $_nofollow;
    // Extensions are looked up in lower case (see isDisallow()), so every key must be lower case.
    private $_disallowExt = array(
        '.tar' => true, '.gz' => true, '.tgz' => true, '.zip' => true, '.z' => true, '.7z' => true,
        '.rpm' => true, '.deb' => true, '.ps' => true, '.dvi' => true, '.pdf' => true, '.smi' => true,
        '.png' => true, '.jpg' => true, '.jpeg' => true, '.bmp' => true, '.tiff' => true, '.gif' => true,
        '.mov' => true, '.avi' => true, '.mpeg' => true, '.mpg' => true, '.mp3' => true, '.qt' => true,
        '.wav' => true, '.ram' => true, '.rm' => true, '.rmvb' => true, '.jar' => true, '.java' => true,
        '.class' => true, '.diff' => true, '.doc' => true, '.docx' => true, '.xls' => true, '.ppt' => true,
        '.mdb' => true, '.rtf' => true, '.exe' => true, '.pps' => true, '.so' => true, '.psd' => true,
        '.css' => true, '.js' => true, '.ico' => true, '.dll' => true, '.bz2' => true, '.rar' => true,
    );
    private $_ut;

    /**
     * @param UrlTable $ut
     */
    public function __construct(UrlTable $ut)
    {
        $this->_ut = $ut;
        $this->_timeBegin = time();
        $this->_numAdd = $this->_numUpdate = $this->_numFilter = 0;
        // apply default filters for extending
        $this->resetFilter();
        $this->defaultFilter();
    }

    public function __destruct()
    {
        $this->_ut = null;
    }

    /**
     * @return UrlTable
     */
    public function getUrlTable()
    {
        return $this->_ut;
    }

    /**
     * Hook for subclasses: apply the default URL filter rules here.
     */
    public function defaultFilter()
    {

    }

    /**
     * Reset all filter rules, except the extension rules.
     */
    public function resetFilter()
    {
        $this->_followExternal = false;
        $this->_disallowDomain = array();
        $this->_allowDomain = array();
        $this->_disallow = array();
        $this->_allow = array();
        $this->_allowRank = array();
        $this->_nofollow = array();
    }

    /**
     * @param boolean $on whether URLs of external sites are processed; only a strict true enables it
     */
    public function followExternal($on = true)
    {
        $this->_followExternal = ($on === true);
    }

    /**
     * @param string $rule disallowed domain rule, regular expressions supported
     */
    public function disallowDomain($rule)
    {
        $this->saveMatchRule($this->_disallowDomain, $rule);
    }

    /**
     * @param string $rule allowed domain rule, regular expressions supported
     */
    public function allowDomain($rule)
    {
        $this->saveMatchRule($this->_allowDomain, $rule);
    }

    /**
     * @param string $rule disallowed URL rule, regular expressions supported
     */
    public function disallow($rule)
    {
        $this->saveMatchRule($this->_disallow, $rule);
    }

    /**
     * @param string $rule allowed URL rule, regular expressions supported
     * @param int $rank rank given to the URLs matching this rule
     * @param boolean $follow whether the links inside pages matching this rule are followed
     */
    public function allow($rule, $rank = null, $follow = true)
    {
        $this->saveMatchRule($this->_allow, $rule);
        if ($rank !== null) {
            $this->_allowRank[$rule] = intval($rank);
        }
        if (!$follow) {
            $this->saveMatchRule($this->_nofollow, $rule);
        }
    }

    /**
     * @param string $name disallowed URL extension, must start with a dot
     */
    public function disallowExt($name)
    {
        $this->_disallowExt[strtolower($name)] = true;
    }

    /**
     * @param string $name URL extension to allow again, must start with a dot
     */
    public function allowExt($name)
    {
        if (substr($name, 0, 1) === '.') {
            $name = strtolower($name);
            if (isset($this->_disallowExt[$name])) {
                unset($this->_disallowExt[$name]);
            }
        }
    }

    /**
     * Print or return the statistics line (elapsed time and URL counters).
     * @param boolean $output true to echo the line, anything else to return it
     * @return string|null the line when $output is not true
     */
    public function stat($output = false)
    {
        // time
        $time = time() - $this->_timeBegin;
        $string = date('m-d H:i:s') . ' - Time cost: ';
        // >= so that exactly 3600s reads "1 hours" and exactly 60s reads "1 mins"
        if ($time >= 3600) {
            $string .= intval($time / 3600) . ' hours ';
            $time %= 3600;
        }
        if ($time >= 60) {
            $string .= intval($time / 60) . ' mins ';
            $time %= 60;
        }
        $string .= $time . ' secs, ';
        // stats
        $string .= sprintf('URLs total: %d, Add: %d, Update: %d, Filtered: %d', $this->_ut->getCount(), $this->_numAdd, $this->_numUpdate, $this->_numFilter);
        if ($output !== true) {
            return $string;
        }
        echo $string . "\n";
    }

    /**
     * Implementation of the parser callback: records the response status of the
     * URL, then extracts and registers the links of a 200 page, or the target of
     * a permanent redirection.
     * @param Response $res
     * @param Request $req
     * @param mixed $key
     */
    public function parse(Response $res, Request $req, $key)
    {
        // update url
        $rawUrl = $req->getRawUrl();
        if ($this->_ut->updateUrl($rawUrl, $res->status)) {
            $this->_numUpdate++;
        }
        // parse url from body
        if ($res->status === 200 && $this->isFollowUrl($rawUrl)) {
            // get baseUrl
            $baseUrl = $req->getUrl();
            // NOTE(review): this pattern was unreadable in the reviewed copy (no capture group although
            // $match[1] is used); rebuilt as a <base href=...> matcher - confirm against upstream.
            if (preg_match('/<base\s+href=[\'"]?(.*?)[\s\'">]/i', $res->body, $match)) {
                $baseUrl = $this->resetUrl($match[1], $baseUrl);
            }
            // href="xxx", href='xxx'
            if (preg_match_all('/href=([\'"])(.*?)\1/i', $res->body, $matches) > 0) {
                foreach ($matches[2] as $url) {
                    $this->processUrl($url, $baseUrl, $res->url);
                }
            }
            // href=xxx
            if (preg_match_all('/href=(?![\'"])(.*?)[\s>]/i', $res->body, $matches) > 0) {
                foreach ($matches[1] as $url) {
                    $this->processUrl($url, $baseUrl, $res->url);
                }
            }
        } elseif ($res->status === 301 || $res->status === 302) {
            $url = $this->resetUrl($res->getHeader('location'), $req->getUrl());
            $res->setHeader('location', $url); // overwrite formated url
            // save url for permanent redirection
            if ($res->status === 301) {
                $this->processUrl($url, $res->url);
            }
        }
    }

    /**
     * @param string $url
     * @param string $rawUrl URL of the page the link came from, used to tell whether $url is external
     * @param string &$rank receives the rank of the matching allow rule, if any
     * @return boolean true when the URL must be excluded according to the filter rules
     */
    public function isDisallow($url, $rawUrl = null, &$rank = null)
    {
        // get domain
        $schemeEnd = strpos($url, '://');
        if ($schemeEnd === false) {
            return true;
        }
        $hostStart = $schemeEnd + 3;
        $pathStart = strpos($url, '/', $hostStart);
        $domain = $pathStart === false ? substr($url, $hostStart) : substr($url, $hostStart, $pathStart - $hostStart);
        if ($domain === '' || $domain === false) {
            // no host at all: nothing that could be fetched
            return true;
        }
        // external domain
        if ($rawUrl !== null && strpos($rawUrl, $domain) === false) {
            // disallow domain
            if ($this->_followExternal && $this->isMatchRule($this->_disallowDomain, $domain)) {
                return true;
            }
            // allow domain
            if (!$this->_followExternal
                && (count($this->_allowDomain) === 0 || !$this->isMatchRule($this->_allowDomain, $domain))) {
                return true;
            }
        }
        // disallow
        if ($this->isMatchRule($this->_disallow, $url)) {
            return true;
        }
        // allow
        if (count($this->_allow) > 0 && !$this->isMatchRule($this->_allow, $url, $rank)) {
            return true;
        }
        // disallowExt: extension of the path part, i.e. between the host and the query string
        $queryStart = strpos($url, '?');
        if ($queryStart === false) {
            $queryStart = strlen($url);
        }
        if ($pathStart !== false && $pathStart < $queryStart) {
            $ext = strrchr(substr($url, $pathStart, $queryStart - $pathStart), '.');
            if ($ext !== false && isset($this->_disallowExt[strtolower($ext)])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turn a link into an absolute, normalized URL: resolves it against $baseUrl,
     * drops the #fragment, makes a bare host end with "/" and collapses "/./" and "/../".
     * @param string $url
     * @param string $baseUrl
     * @return string the normalized URL
     */
    public function resetUrl($url, $baseUrl = null)
    {
        // head: duplicated scheme
        if (!strncasecmp($url, 'http://http://', 14)) {
            $url = substr($url, 7);
        }
        // protocol-relative link (//host/path): reuse the scheme of the base URL
        if (substr($url, 0, 2) === '//') {
            $scheme = ($baseUrl !== null && !strncasecmp($baseUrl, 'https://', 8)) ? 'https:' : 'http:';
            $url = $scheme . $url;
        }
        if (strncasecmp($url, 'http://', 7) && strncasecmp($url, 'https://', 8)) {
            if ($baseUrl === null) {
                $url = 'http://' . $url;
            } elseif (substr($url, 0, 1) === '/') {
                // absolute path: keep scheme and host of the base URL
                // (an offset past the end of the string is an error for strpos/strrpos, hence the length checks)
                $pos = strlen($baseUrl) < 8 ? false : strpos($baseUrl, '/', 8);
                $url = ($pos === false ? $baseUrl : substr($baseUrl, 0, $pos)) . $url;
            } else {
                // relative path: keep the directory of the base URL
                $pos = strlen($baseUrl) < 8 ? false : strrpos($baseUrl, '/', 8);
                $url = ($pos === false ? $baseUrl . '/' : substr($baseUrl, 0, $pos + 1)) . $url;
            }
        }
        // drop the anchor first (it starts at the first #), so that the check below sees the real URL
        if (($pos = strpos($url, '#')) !== false) {
            $url = substr($url, 0, $pos);
        }
        // unified format: a bare host ends with /
        if (strlen($url) < 8 || strpos($url, '/', 8) === false) {
            $url .= '/';
        }
        // resolve redundant relative parts such as '../../'
        if (strpos($url, '/./') !== false || strpos($url, '/../') !== false) {
            $parts = array();
            // the first 8 bytes (scheme plus, for http://, the first host byte) are kept aside untouched
            $tmpa = explode('/', substr($url, 8));
            for ($i = 0; $i < count($tmpa); $i++) {
                if ($tmpa[$i] === '.' || ($tmpa[$i] === '' && isset($tmpa[$i + 1]))) {
                    continue;
                } elseif ($tmpa[$i] !== '..') {
                    array_push($parts, $tmpa[$i]);
                } elseif (count($parts) > 1) {
                    // never pop the host, which is the first part
                    array_pop($parts);
                }
            }
            $url = substr($url, 0, 8) . implode('/', $parts);
        }
        return $url;
    }

    /**
     * @param string $url
     * @return boolean whether the links found in the content of this URL are analysed
     */
    protected function isFollowUrl($url)
    {
        return !$this->isMatchRule($this->_nofollow, $url);
    }

    /**
     * Normalize a link, filter it and add it to the URL table.
     * @param string $url link as found in the page
     * @param string $baseUrl URL used to resolve relative links
     * @param string $rawUrl URL used for the external-site check, defaults to $baseUrl
     * @return mixed 'ADD' when added, 'FILTER' when excluded by the rules, 'SKIP' otherwise
     */
    protected function processUrl($url, $baseUrl, $rawUrl = null)
    {
        if (substr($url, 0, 1) === '#' || !strncasecmp($url, 'javascript:', 11) || !strncasecmp($url, 'mailto:', 7)) {
            return 'SKIP';
        }
        $url = $this->resetUrl($url, $baseUrl);
        $rank = 0;
        if ($this->isDisallow($url, $rawUrl === null ? $baseUrl : $rawUrl, $rank)) {
            $this->_numFilter++;
            return 'FILTER';
        }
        if ($this->_ut->addUrl($url, $rank)) {
            $this->_numAdd++;
            return 'ADD';
        }
        return 'SKIP';
    }

    /**
     * Store a rule in a rule list: null empties the list, regex rules are appended
     * with a "\xff" marker byte in front, plain rules are put at the head of the list.
     */
    private function saveMatchRule(&$array, $rule)
    {
        if ($rule === null) {
            $array = array();
        } elseif ($this->isRegexPattern($rule)) {
            array_push($array, "\xff" . $rule);
        } else {
            array_unshift($array, $rule);
        }
    }

    /**
     * @param array $rules list built by saveMatchRule()
     * @param string $input
     * @param int &$rank receives the rank registered for the matching rule, if any
     * @return boolean true when at least one rule matches $input
     */
    private function isMatchRule($rules, $input, &$rank = null)
    {
        foreach ($rules as $rule) {
            $rule = (string) $rule;
            if ($rule === '') {
                // an empty rule has no first byte to inspect and nothing to search for
                continue;
            }
            if (ord($rule[0]) !== 0xff) {
                $matched = stristr($input, $rule) !== false;
            } else {
                $rule = substr($rule, 1);
                $matched = preg_match($rule, $input) > 0;
            }
            if ($matched === true) {
                if (isset($this->_allowRank[$rule])) {
                    $rank = $this->_allowRank[$rule];
                }
                return true;
            }
        }
        return false;
    }

    /**
     * A rule counts as a regular expression when it starts with # and ends with #,
     * optionally followed by the modifiers i and/or u.
     */
    private function isRegexPattern($input)
    {
        if (strlen($input) > 2 && $input[0] === '#') {
            for ($i = strlen($input) - 1; $i > 1; $i--) {
                if ($input[$i] === $input[0]) {
                    return true;
                }
                if ($input[$i] !== 'i' && $input[$i] !== 'u') {
                    break;
                }
            }
        }
        return false;
    }
}
-------------------------------------------------------------------------------- /spider.php: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env php 2 | [options] 22 | pspider -h 23 | 24 | -c The name of custom file under 'custom/' 25 | -n The number of URLs to crawl in parallel 26 | -q Quit after processing the number of URLs 27 | -p Time interval to crawl the same URL 28 | -u The start URL (forced to crawl once) 29 | -h This help 30 | 31 | EOF; 32 | exit(0); 33 | } 34 | 35 | $file = __DIR__ . '/custom/' . $options['c'] . '.inc.php'; 36 | if (!file_exists($file)) { 37 | echo "ERROR: Custom file dose not exists '$file'.\n"; 38 | exit(-1); 39 | } 40 | 41 | // add library 42 | require_once __DIR__ . '/vendor/autoload.php'; 43 | require_once __DIR__ . 
'/lib/UrlTable.php';
// ^ Tail of a statement that starts before this excerpt (spider.php, just ahead of dump line 44);
//   kept verbatim so the preceding text still joins up.
include $file;

// The custom file must have declared both classes that are instantiated below.
if (!class_exists('UrlTableCustom') || !class_exists('UrlParserCustom')) {
    echo "ERROR: Invalid custom file '$file'.\n";
    echo "Must have defined class 'UrlParserCustom' inherited from 'UrlParser',\n";
    echo "and class 'UrlTableCustom' inherited from 'UrlTableMySQL'.\n";
    echo "Please see 'custom/skel.inc.php'.\n";
    exit(-1);
}

// Normalise the numeric options. Missing values fall back to the PSP_* constants
// (parallel fetch count, re-crawl period), or to 0 for 'q'.
// NOTE(review): $options is populated before this excerpt, presumably from the command line — confirm.
$options['n'] = isset($options['n']) ? intval($options['n']) : PSP_NUM_PARALLEL;
$options['p'] = isset($options['p']) ? intval($options['p']) : PSP_CRAWL_PERIOD;
$options['q'] = isset($options['q']) ? intval($options['q']) : 0;

// create objects: URL table -> parser -> HTTP client
$ut = new UrlTableCustom;
$up = new UrlParserCustom($ut);
$http = new \hightman\http\Client($up);

// start url: fetch the seed first when one was supplied
if (isset($options['u'])) {
    $http->get($up->resetUrl($options['u']));
}

// loop to handle: keep pulling batches from the URL table until it yields nothing
$num = 0;
while ($urls = $ut->getSome($options['n'], $options['p'])) {
    if (count($urls) === 0) {
        break;
    }
    $http->mget($urls);
    $num += count($urls);

    // 'q' > 0 caps the total number of URLs handled in this run.
    if ($options['q'] > 0 && $num >= $options['q']) {
        break;
    }

    // Print intermediate stats whenever the running total has just passed a multiple of 1000.
    if ($num > $options['n'] && ($num % 1000) < $options['n']) {
        $up->stat(true);
    }
}

// print stats
$up->stat(true);
echo "OK, finished!\n";

// --------------------------------------------------------------------------------
// /tests/bootstrap.php:
// --------------------------------------------------------------------------------
// NOTE(review): the contents of this file were lost when the dump was extracted
// (everything from its opening PHP tag onward was stripped). Nothing is reconstructed here.

// --------------------------------------------------------------------------------
// /tests/lib/HttpBaseTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): this file's separator, opening tag, class declaration, fixture property and
// setUp() signature were stripped in extraction; only "object = new HttpBase;" (line 20) survived.
// The header below assumes the standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Tests for HttpBase header handling; cookie tests are still placeholders.
 */
class HttpBaseTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpBase
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new HttpBase;
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /**
     * setHeader() replaces the stored value, lookups work with either key casing,
     * and passing null removes the header.
     */
    public function testSetHeader()
    {
        $base = $this->object;

        $base->setHeader('content-type', 'text/html');
        $this->assertEquals('text/html', $base->getHeader('Content-Type'));
        $base->setHeader('content-type', 'text/plain');
        $this->assertEquals('text/plain', $base->getHeader('Content-Type'));
        $this->assertArrayHasKey('content-type', $base->getHeader(null));

        $base->setHeader('content-type', null);
        $this->assertNull($base->getHeader('content-type'));
        $this->assertNull($base->getHeader('Content-Type'));
        $this->assertArrayNotHasKey('content-type', $base->getHeader(null));
    }

    /**
     * addHeader() accumulates values (a second value turns the header into an array),
     * and clearHeader() empties the whole set.
     */
    public function testAddHeader()
    {
        $base = $this->object;

        $this->assertFalse($base->hasHeader('content-type'));
        $base->addHeader('content-type', 'text/html');
        $this->assertTrue($base->hasHeader('content-type'));
        $this->assertEquals('text/html', $base->getHeader('Content-Type'));

        $base->addHeader('content-type', 'text/plain');
        $this->assertInternalType('array', $base->getHeader('Content-Type'));
        $base->setHeader('content-type', null);
        $this->assertNull($base->getHeader('content-type'));

        $this->assertInternalType('array', $base->getHeader(null));
        $base->clearHeader();
        $this->assertEmpty($base->getHeader(null));
    }

    /** @todo Implement testSetCookie(). */
    public function testSetCookie()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testClearCookie(). */
    public function testClearCookie()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetCookie(). */
    public function testGetCookie()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testApplyCookie(). */
    public function testApplyCookie()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testFetchCookieToSend(). */
    public function testFetchCookieToSend()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/HttpClientTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Placeholder test case for HttpClient; every test is still marked incomplete.
 */
class HttpClientTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpClient
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new HttpClient;
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement testDebug(). */
    public function testDebug()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testSetCookiePath(). */
    public function testSetCookiePath()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testSetParser(). */
    public function testSetParser()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testRunParser(). */
    public function testRunParser()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testClearHeader(). */
    public function testClearHeader()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGet(). */
    public function testGet()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testHead(). */
    public function testHead()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testProcess(). */
    public function testProcess()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testExec(). */
    public function testExec()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement test__destruct(). */
    public function test__destruct()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/HttpConnTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Placeholder test case for HttpConn; every test is still marked incomplete.
 */
class HttpConnTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpConn
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        // The fixture comes from the static factory, called with an empty connection string.
        $this->object = HttpConn::connect('');
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement testConnect(). */
    public function testConnect()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testFindBySock(). */
    public function testFindBySock()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetLastError(). */
    public function testGetLastError()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testClose(). */
    public function testClose()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testAddWriteData(). */
    public function testAddWriteData()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testHasDataToWrite(). */
    public function testHasDataToWrite()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testWrite(). */
    public function testWrite()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetLine(). */
    public function testGetLine()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testRead(). */
    public function testRead()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetSock(). */
    public function testGetSock()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetExArg(). */
    public function testGetExArg()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement test__destruct(). */
    public function test__destruct()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/HttpProcesserTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Placeholder test case for HttpProcesser; every test is still marked incomplete.
 */
class HttpProcesserTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpProcesser
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new HttpProcesser;
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement testGetConn(). */
    public function testGetConn()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testSend(). */
    public function testSend()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testRecv(). */
    public function testRecv()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testFinish(). */
    public function testFinish()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement test__destruct(). */
    public function test__destruct()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/HttpRequestTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Placeholder test case for HttpRequest; every test is still marked incomplete.
 */
class HttpRequestTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpRequest
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new HttpRequest;
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement testSetMaxRedirect(). */
    public function testSetMaxRedirect()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetMaxRedirect(). */
    public function testGetMaxRedirect()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testSetUrl(). */
    public function testSetUrl()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetRawUrl(). */
    public function testGetRawUrl()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetUrl(). */
    public function testGetUrl()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetUrlParams(). */
    public function testGetUrlParams()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetUrlParam(). */
    public function testGetUrlParam()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testSetMethod(). */
    public function testSetMethod()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetMethod(). */
    public function testGetMethod()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetBody(). */
    public function testGetBody()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testAddPostField(). */
    public function testAddPostField()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testAddPostFile(). */
    public function testAddPostFile()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement test__toString(). */
    public function test__toString()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/HttpResponseTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Placeholder test case for HttpResponse; every test is still marked incomplete.
 */
class HttpResponseTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var HttpResponse
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new HttpResponse('http://localhost/');
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement test__toString(). */
    public function test__toString()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testHasError(). */
    public function testHasError()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testReset(). */
    public function testReset()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/lib/UrlParserTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 were stripped in extraction; header reconstructed from the
// standard PHPUnit skeleton layout — confirm against the repository.

/**
 * Tests for UrlParser: URL filtering rules (isDisallow) and URL normalisation (resetUrl).
 */
class UrlParserTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var UrlParser
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new UrlParser(new UrlTableMySQL);
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /**
     * Walks one parser instance through successive rule changes and checks isDisallow()
     * after each stage. The stages are cumulative, so their order matters.
     */
    public function testIsDisallow()
    {
        $rawUrl = 'http://www.czxiu.com/cz/default.html';
        $parser = $this->object;

        // allow domain: only the listed domains (plain string or regex) and the referring host pass
        $parser->allowDomain('/baidu.com$/');
        $parser->allowDomain('sina.com');
        $this->assertTrue($parser->isDisallow('http://www.qq.com/test.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://www.baidu.com.cn/test.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://id.czxiu.com/zj/21.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('http://news.sina.com.cn/', $rawUrl));
        $this->assertFalse($parser->isDisallow('http://www.czxiu.com', $rawUrl));

        // disallow domain: once followExternal() is on, only the black-listed domains are rejected
        $parser->disallowDomain('weibo.com');
        $parser->disallowDomain('#^t\.#');
        $parser->followExternal();
        $this->assertFalse($parser->isDisallow('http://www.qq.com/test.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('http://www.baidu.com.cn/test.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('http://id.czxiu.com/zj/21.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://t.czxiu.com/zj/21.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://weibo.com/zj/21.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://tt.weibo.com/zj/21.html', $rawUrl));

        // disallow rule: plain substrings and regexes matched against the URL
        $parser->disallow('/21.html');
        $parser->disallow('#http://x\.#');
        $parser->disallow('#COM\.CN#i');
        $this->assertFalse($parser->isDisallow('http://tt.weibo2.com/zj/22.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://x.czxiu.com/zj/21.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/zj/211.html', $rawUrl));
        $this->assertTrue($parser->isDisallow('http://www.xxx.com.cn/test.html', $rawUrl));

        // allow rule: the same URL is accepted before allow('/zj') and rejected after it
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/ft/2.html', $rawUrl));
        $parser->allow('/zj');
        $this->assertTrue($parser->isDisallow('HTTP://x.czxiu.com/ft/2.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/zj/212.html', $rawUrl));
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/ZJ/212.jpg2', $rawUrl));
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/zj2/21.php?xyz=1', $rawUrl));

        // disallow ext (default): .tar is rejected until allowed; .html is accepted until disallowed
        $this->assertTrue($parser->isDisallow('HTTP://x.czxiu.com/ZJ/21.tar?down=yes&x=1', $rawUrl));
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/zj/212.html', $rawUrl));
        $parser->allowExt('.tar');
        $parser->disallowExt('.html');
        $this->assertFalse($parser->isDisallow('HTTP://x.czxiu.com/ZJ/21.tar?down=yes&x=1', $rawUrl));
        $this->assertTrue($parser->isDisallow('HTTP://x.czxiu.com/zj/212.html?ok=1', $rawUrl));
    }

    /**
     * resetUrl() must resolve each input against the fixed base URL to the expected absolute URL.
     *
     * @dataProvider provider
     */
    public function testResetUrl($input, $output)
    {
        $baseUrl = 'http://www.czxiu.com/cz/default.html';
        $resolved = $this->object->resetUrl($input, $baseUrl);
        $this->assertEquals($output, $resolved);
    }

    /**
     * Data sets for testResetUrl(): array(input link, expected absolute URL).
     */
    public function provider()
    {
        return array(
            array('', 'http://www.czxiu.com/cz/'),
            array('test.html', 'http://www.czxiu.com/cz/test.html'),
            array('./test.html', 'http://www.czxiu.com/cz/test.html'),
            array('../cz/test.html', 'http://www.czxiu.com/cz/test.html'),
            array('../..//test.html', 'http://www.czxiu.com/test.html'),
            array('diy/test.html', 'http://www.czxiu.com/cz/diy/test.html'),
            array('/diy/test.html', 'http://www.czxiu.com/diy/test.html'),
            array('#', 'http://www.czxiu.com/cz/'),
            array('https://diy.czxiu.com', 'https://diy.czxiu.com/'),
            array('http://diy.czxiu.com/index.php#', 'http://diy.czxiu.com/index.php'),
            array('http://diy.czxiu.com/cz/diy/../index.php#', 'http://diy.czxiu.com/cz/index.php'),
        );
    }
}
// --------------------------------------------------------------------------------
// /tests/lib/UrlTableMySQLTest.php:
// --------------------------------------------------------------------------------
// NOTE(review): lines 1-19 of this file (opening tag, class declaration, fixture property and
// setUp() signature) were stripped in extraction; only "object = new UrlTableMySQL;" (line 20)
// survived. The header below assumes the standard PHPUnit skeleton layout — confirm against
// the repository.

/**
 * Placeholder test case for UrlTableMySQL; every test is still marked incomplete.
 */
class UrlTableMySQLTest extends PHPUnit_Framework_TestCase
{
    /**
     * @var UrlTableMySQL
     */
    protected $object;

    /**
     * Sets up the fixture, for example, opens a network connection.
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->object = new UrlTableMySQL;
    }

    /**
     * Tears down the fixture, for example, closes a network connection.
     * This method is called after a test is executed.
     */
    protected function tearDown()
    {
    }

    /** @todo Implement testSetTableName(). */
    public function testSetTableName()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetCount(). */
    public function testGetCount()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetOne(). */
    public function testGetOne()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testGetSome(). */
    public function testGetSome()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testAddUrl(). */
    public function testAddUrl()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testUpdateUrl(). */
    public function testUpdateUrl()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }

    /** @todo Implement testQuery(). */
    public function testQuery()
    {
        $this->markTestIncomplete('This test has not been implemented yet.');
    }
}

// --------------------------------------------------------------------------------
// /tests/phpunit.xml:
// --------------------------------------------------------------------------------
// NOTE(review): this file's XML (13 lines in the dump) was stripped entirely during extraction;
// only the line-number gutter survived. Its content cannot be recovered from this text and is
// deliberately not reconstructed here.
// --------------------------------------------------------------------------------