├── .gitignore
├── README.md
├── composer.json
├── custom
└── skel.inc.php
├── lib
├── StringHelper.php
└── UrlTable.php
├── spider.php
└── tests
├── bootstrap.php
├── lib
├── HttpBaseTest.php
├── HttpClientTest.php
├── HttpConnTest.php
├── HttpProcesserTest.php
├── HttpRequestTest.php
├── HttpResponseTest.php
├── UrlParserTest.php
└── UrlTableMySQLTest.php
└── phpunit.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | # ignores
2 | custom/test.inc.php
3 | composer.lock
4 | vendor
5 | .idea
6 |
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PHP - spider 框架
2 | ===================
3 |
4 | 这是最近使用纯 `php` 代码开发的并行抓取(爬虫)框架,基于 [hightman\httpclient](https://github.com/hightman/httpclient) 组件。
5 |
6 | 您必须先装有 [composer](http://getcomposer.org),然后在项目里先运行以下命令下载组件:
7 |
8 | ~~~
9 | composer install
10 | ~~~
11 |
12 |
13 | 使用 pspider
14 | --------------
15 |
16 | 这里头的 URL 表管理需要 MySQLi 扩展支持,表结构和自定义的内容参见自定义文件。
17 |
18 | 1. 复制 `custom/skel.inc.php` 为 `custom/your.inc.php`
19 | 2. 根据说明修改 custom/your.inc.php
20 | 3. 根据 custom/your.inc.php 里的注释创建 mysql 的 URL 表
21 | 4. 运行 `php spider.php -c your -u http://...` 即可开始循环抓取(`-c` 指定 `custom/` 目录下的自定义文件名,不含 `.inc.php` 后缀)
22 | 5. UrlTable 的实现很简单仅作示例,具体可自行重做
23 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "hightman/pspider",
3 | "description": "A pure PHP spider framework",
4 | "keywords": ["php", "spider", "crawler", "robot"],
5 | "homepage": "https://github.com/hightman/pspider/",
6 | "type": "project",
7 | "license": "MIT",
8 | "require": {
9 | "php": ">=5.4.0",
10 | "ext-mysqli": "*",
11 | "lib-pcre": "*",
12 | "hightman/httpclient": "*"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/custom/skel.inc.php:
--------------------------------------------------------------------------------
1 |
6 | * @link http://www.hightman.cn/
7 | * @copyright Copyright © 2008-2013 Twomice Studio
8 | */
9 |
10 | use hightman\http\Response;
11 | use hightman\http\Request;
12 |
/// --- custom: number of URLs fetched in parallel (used when -n is not given)
const PSP_NUM_PARALLEL = 5;

/// --- custom: seconds to wait before the same URL is crawled again (used when -p is not given)
const PSP_CRAWL_PERIOD = 3600;
18 |
19 | /// --- Adding StringHelper on need
20 | /// require_once __DIR__ . '/../lib/StringHelper.php';
21 |
/**
 * MySQL connection settings for the URL table. The database must contain a
 * `_urls` table with the following structure:
 *
 * CREATE TABLE `_urls` (
 *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
 *   `url` text,
 *   `rank` smallint NOT NULL default '0' COMMENT 'process prior level',
 *   `status` smallint NOT NULL default '0' COMMENT 'last http response status',
 *   `select_time` int unsigned NOT NULL default '0' COMMENT 'last process time',
 *   `update_time` int unsigned NOT NULL default '0' COMMENT 'last update time',
 *   PRIMARY KEY (`id`)
 * ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
 */
class UrlTableCustom extends UrlTableMySQL
{
    /**
     * Opens the connection with the settings below, then runs the parent's
     * self-test (connection check plus a trial insert/delete).
     */
    public function __construct()
    {
        /// --- custom setting BEGIN
        $settings = array(
            'host' => 'localhost',
            'user' => 'root',
            'pass' => '',
            'dbname' => 'test',
        );
        /// --- custom setting END

        parent::__construct($settings['host'], $settings['user'], $settings['pass'], $settings['dbname']);
        $this->test();
    }
}
50 |
/**
 * Custom parser: the place to plug in project-specific page handling and URL filters.
 */
class UrlParserCustom extends UrlParser
{
    /**
     * Handles one fetched response. The parent implementation runs first
     * (URL bookkeeping and link extraction); add the content processing for
     * successfully fetched pages inside the marked section.
     *
     * @param Response $res
     * @param Request $req
     * @param mixed $key
     */
    public function parse(Response $res, Request $req, $key)
    {
        parent::parse($res, $req, $key);
        if ($res->status !== 200) {
            return;
        }
        /// --- custom code BEGIN ---
        $currentUrl = $req->getUrl();
        echo "PROCESSING: " . $currentUrl . "\n";
        /// --- custom code END ---
    }

    /**
     * Registers the URL filter rules. Typical calls:
     *   followExternal()
     *   allowDomain(), disallowDomain()
     *   allow(), disallow(), disallowExt()
     *
     * Note: the third argument of allow() controls whether links found on
     * pages matching that rule are followed.
     */
    public function defaultFilter()
    {
        parent::defaultFilter();
        /// --- custom filter BEGIN ---
        $this->followExternal(false);
        $this->disallow('.php?q=');
        /// --- custom filter END ---
    }

    /**
     * Decides whether the links inside the page at $url should be extracted.
     * Currently defers to the parent's decision.
     *
     * @param string $url
     * @return boolean
     */
    protected function isFollowUrl($url)
    {
        $follow = parent::isFollowUrl($url);
        return $follow;
    }
}
97 |
--------------------------------------------------------------------------------
/lib/StringHelper.php:
--------------------------------------------------------------------------------
1 |
6 | * @link http://www.hightman.cn/
7 | * @copyright Copyright © 2008-2013 Twomice Studio
8 | */
9 |
/**
 * String helper (all methods are static).
 *
 * StringHelper::decodeHtml($html);
 * StringHelper::fixHtmlCharset($html, $charset = 'utf-8');
 * StringHelper::finds($buf, $config[, &$error]);
 * StringHelper::find($buf, $tag1, $tag2[, ...]);
 * StringHelper::contains($buf, $tokens);
 */
class StringHelper
{

    /**
     * Converts an HTML fragment to plain text: tags are stripped (only when
     * the input contains "<"), surrounding whitespace is trimmed and HTML
     * entities, including both quote styles, are decoded as utf-8.
     *
     * @param string $html
     * @return string decoded text
     */
    public static function decodeHtml($html)
    {
        if (strpos($html, '<') !== false) {
            $html = strip_tags($html);
        }
        return html_entity_decode(trim($html), ENT_QUOTES, 'utf-8');
    }

    /**
     * Converts a page to the target charset, using the first "charset=..."
     * declaration found in the content as the source charset.
     *
     * Nothing is converted when no declaration is found, when the declared
     * charset equals the target, or when both belong to the "gb*" family.
     * If the conversion itself fails (for example an unknown charset name)
     * the original content is returned unchanged rather than false.
     *
     * @param string $html page content
     * @param string $charset target charset, utf-8 by default
     * @return string converted content, or the original content when no conversion was done
     */
    public static function fixHtmlCharset($html, $charset = 'utf-8')
    {
        if (!preg_match('/charset=["\']?([0-9a-zA-Z_-]+)/', $html, $match)) {
            return $html;
        }
        $source = $match[1];
        $bothGb = !strncasecmp($charset, 'gb', 2) && !strncasecmp($source, 'gb', 2);
        if ($bothGb || !strcasecmp($charset, $source)) {
            return $html;
        }
        // gbk is used in place of a declared gb2312
        if (!strcasecmp($source, 'gb2312')) {
            $source = 'gbk';
        }
        $converted = false;
        if (function_exists('iconv')) {
            // a failed conversion is detected through the return value below
            $converted = @iconv($source, $charset . '//IGNORE', $html);
        } elseif (function_exists('mb_convert_encoding')) {
            try {
                $converted = @mb_convert_encoding($html, $charset, $source);
            } catch (\ValueError $e) {
                // PHP 8 throws on unknown encoding names
                $converted = false;
            }
        }
        return is_string($converted) ? $converted : $html;
    }

    /**
     * Extracts several values from a buffer in one pass.
     *
     * @param string $buf
     * @param array $config list of entries, each one being
     *  array(key, marker1, marker2, ...)
     * @param string $error optional reference, receives a message for an entry that could not be found
     * @return array values indexed by key
     * @see StringMatcher::finds
     */
    public static function finds($buf, $config, &$error = null)
    {
        $matcher = new StringMatcher($buf);
        return $matcher->finds($config, $error);
    }

    /**
     * Extracts one value from a buffer; the markers are given as additional
     * arguments after $buf.
     *
     * @param string $buf
     * @return string the content between the last two markers, null when not found
     * @see StringMatcher::find
     */
    public static function find($buf)
    {
        $markers = func_get_args();
        array_shift($markers); // drop $buf, the rest are markers
        $matcher = new StringMatcher($buf);
        return call_user_func_array(array($matcher, 'find'), $markers);
    }

    /**
     * Tells whether the buffer contains at least one of the tokens.
     *
     * @param string $buf source string
     * @param array $tokens list of strings to look for
     * @return boolean
     */
    public static function contains($buf, $tokens)
    {
        foreach ($tokens as $token) {
            if (strpos($buf, $token) !== false) {
                return true;
            }
        }
        return false;
    }
}
104 |
/**
 * Sequential marker-based extractor: every successful find() moves an
 * internal cursor forward so repeated calls walk through the buffer.
 */
class StringMatcher
{
    private $_buf, $_pos;

    /**
     * @param string $buf the text to search in
     */
    public function __construct($buf)
    {
        $this->_buf = $buf;
        $this->_pos = 0;
    }

    /**
     * Runs find() for every configured entry.
     *
     * Entries that cannot be found are skipped; $error then receives a
     * message naming the key and its markers, with "???" placed where the
     * extracted value was expected.
     *
     * @param array $config list of entries, each one being
     *  array(key, marker1, marker2, ...)
     * @param string $error optional reference
     * @return array found values indexed by key
     */
    public function finds($config, &$error = null)
    {
        $ret = array();
        foreach ($config as $args) {
            $key = array_shift($args);
            $val = call_user_func_array(array($this, 'find'), $args);
            if ($val === null || $val === false) {
                $error = 'Cannot find `' . $key . '\': ' . $this->describeMarkers($args);
                continue;
            }
            $ret[$key] = $val;
        }
        return $ret;
    }

    /**
     * Finds a substring by its surrounding markers. Variable arguments:
     * start1, start2, ... , end. The start markers are searched one after
     * another from the current cursor; the result is the text between the
     * last start marker and the end marker.
     *
     * Special markers:
     *  "$$$...": the markers that follow must be found before the position
     *            of this string, which keeps a match from spanning too far;
     *            otherwise an empty string is returned.
     *  "^^^...": if the next marker is found after the position of this
     *            string, the position of this string is used instead.
     *
     * @return string the substring on success (the cursor is moved to the
     *  start of the end marker), null when a marker is missing or fewer than
     *  two arguments are given
     */
    public function find()
    {
        $args = func_get_args();
        $cnt = count($args);
        if ($cnt < 2) {
            trigger_error(__CLASS__ . '::find() expects at least 2 parameters, ' . $cnt . ' given', E_USER_WARNING);
            // trigger_error() returns true, which callers would mistake for a value
            return null;
        }
        $end = $pre = false;
        $pos1 = $this->_pos;
        $last = $cnt - 1;
        for ($i = 0; $i < $last; $i++) {
            $marker = $args[$i];
            $prefix = substr($marker, 0, 3);
            if ($prefix === '$$$') {
                $end = strpos($this->_buf, substr($marker, 3), $pos1);
            } elseif ($prefix === '^^^') {
                $pre = strpos($this->_buf, substr($marker, 3), $pos1);
            } else {
                $pos1 = strpos($this->_buf, $marker, $pos1);
                if ($pos1 === false) {
                    return null;
                }
                if ($end !== false && $pos1 > $end) {
                    return '';
                }
                if ($pre !== false) {
                    if ($pos1 > $pre) {
                        $pos1 = $pre;
                    }
                    $pre = false;
                }
                $pos1 += strlen($marker);
            }
        }
        $pos2 = strpos($this->_buf, $args[$last], $pos1);
        if ($pos2 === false) {
            return null;
        }
        if ($end !== false && $pos2 > $end) {
            return '';
        }
        if ($pre !== false && $pos2 > $pre) {
            $pos2 = $pre;
        }
        $this->_pos = $pos2;
        return substr($this->_buf, $pos1, $pos2 - $pos1);
    }

    /**
     * Moves the cursor, similar to fseek(). The resulting position is kept
     * inside the buffer (0 .. length) so later searches start at a valid offset.
     *
     * @param int $offset
     * @param int $whence one of SEEK_SET / SEEK_CUR / SEEK_END
     * @return int the new cursor position
     */
    public function seek($offset, $whence = SEEK_CUR)
    {
        $offset = intval($offset);
        $length = strlen($this->_buf);
        switch ($whence) {
            case SEEK_SET:
                $target = $offset;
                break;
            case SEEK_END:
                $target = $offset + $length;
                break;
            case SEEK_CUR:
            default:
                $target = $this->_pos + $offset;
                break;
        }
        $this->_pos = max(0, min($length, $target));
        return $this->_pos;
    }

    /**
     * Formats a marker list for error messages: markers are joined with
     * " ... " and the gap before the last one is shown as " ??? ".
     *
     * @param array $markers
     * @return string
     */
    private function describeMarkers(array $markers)
    {
        if (count($markers) < 2) {
            return implode('', $markers);
        }
        $tail = array_pop($markers);
        return implode(' ... ', $markers) . ' ??? ' . $tail;
    }
}
224 |
--------------------------------------------------------------------------------
/lib/UrlTable.php:
--------------------------------------------------------------------------------
1 |
6 | * @link http://www.hightman.cn/
7 | * @copyright Copyright © 2008-2013 Twomice Studio
8 | */
9 | use hightman\http\ParseInterface;
10 | use hightman\http\Response;
11 | use hightman\http\Request;
12 |
/**
 * Contract for a store that manages the list of URLs to crawl.
 */
interface UrlTable
{
    /**
     * Default interval, in seconds, between two processings of the same URL.
     */
    const DURATION = 3600;

    /**
     * @return int total number of URLs in the list
     */
    public function getCount();

    /**
     * Fetches a single URL that is due for processing.
     *
     * @param int $duration interval between two processings of the same URL
     * @return string a URL to process, null when there is none, false on error
     */
    public function getOne($duration = self::DURATION);

    /**
     * Fetches a batch of URLs that are due for processing.
     *
     * @param int $limit maximum number of URLs to return
     * @param int $duration interval between two processings of the same URL
     * @return array at most $limit URLs, an empty array when there is none, false on error
     */
    public function getSome($limit = 5, $duration = self::DURATION);

    /**
     * Adds a URL to the list.
     *
     * @param string $url the URL to add
     * @param int $rank priority with which the URL is picked for processing
     * @return boolean true on success; false when the URL already exists or on any other failure
     */
    public function addUrl($url, $rank = 0);

    /**
     * Records the outcome of processing a URL.
     *
     * @param string $url the URL to update
     * @param int $status status code obtained when the URL was processed
     * @return boolean true on success, false on failure
     */
    public function updateUrl($url, $status = 200);

    /**
     * Removes a URL from the list.
     *
     * @param string $url the URL to delete
     * @return boolean true on success, false on failure
     */
    public function delUrl($url);
}
61 |
/**
 * URL list stored in MySQL through MySQLi. Expected table structure:
 * CREATE TABLE `_urls` (
 *   `id` varchar(32) NOT NULL COMMENT 'md5 hash of URL',
 *   `url` text NOT NULL,
 *   `rank` smallint(6) NOT NULL COMMENT 'process prior level',
 *   `status` smallint(6) NOT NULL COMMENT 'last http response status',
 *   `select_time` bigint(20) NOT NULL COMMENT 'last process time',
 *   `update_time` bigint(20) NOT NULL COMMENT 'last update time',
 *   PRIMARY KEY (`id`)
 * ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='url table for pspider';
 *
 * The `rank` column is always quoted in the generated SQL because RANK is a
 * reserved word in MySQL 8.
 */
class UrlTableMySQL extends mysqli implements UrlTable
{
    private $_table = '_urls';
    private $_addCache = array();

    /**
     * @param string $name database table name, `_urls` by default
     */
    public function setTableName($name)
    {
        $this->_table = $name;
    }

    /**
     * @return int number of rows in the URL table, 0 when the query fails
     */
    public function getCount()
    {
        $res = $this->query('SELECT COUNT(*) AS count FROM ' . $this->_table);
        if ($res === false) {
            return 0;
        }
        $row = $res->fetch_assoc();
        $res->free();
        // mysqli returns column values as strings; the interface promises an int
        return isset($row['count']) ? (int) $row['count'] : 0;
    }

    /**
     * @param int $duration interval between two processings of the same URL
     * @return string one URL to process, null when there is none, false on error
     */
    public function getOne($duration = self::DURATION)
    {
        $urls = $this->getSome(1, $duration);
        if (!is_array($urls)) {
            return false;
        }
        return count($urls) > 0 ? $urls[0] : null;
    }

    /**
     * Picks the URLs that are due and stamps them with the current time.
     *
     * A URL is due when it was last selected more than $duration seconds ago
     * ("expired"), or when it was selected more than 300 seconds ago without
     * being updated since ("failed"). Candidates are ordered by a score that
     * grows with the waiting time and the rank and shrinks with the last status.
     * The select-then-stamp sequence runs under an exclusive lock on a file
     * in the temp directory.
     *
     * @param int $limit
     * @param int $duration
     * @return array list of URLs (possibly empty), false when the query fails
     */
    public function getSome($limit = 5, $duration = self::DURATION)
    {
        $now = time();
        $sql = 'SELECT id, url, ((' . $now . ' - select_time) * (`rank` + 1) / (status + 1)) AS score FROM ' . $this->_table . ' ';
        $sql .= 'WHERE select_time < ' . ($now - $duration) . ' '; // expired
        $sql .= 'OR (select_time > update_time AND select_time < ' . ($now - 300) . ') '; // failed
        $sql .= 'ORDER BY score DESC LIMIT ' . intval($limit);

        // best effort: the crawl goes on without the lock if the file cannot be opened
        $fd = @fopen(sys_get_temp_dir() . DIRECTORY_SEPARATOR . __CLASS__ . '.lock', 'w');
        if ($fd !== false) {
            flock($fd, LOCK_EX);
        }

        $res = $this->query($sql);
        if ($res === false) {
            $ret = false;
        } else {
            $ret = $ids = array();
            while ($row = $res->fetch_assoc()) {
                $ids[] = $row['id'];
                $ret[] = $row['url'];
            }
            $res->free();
            if (count($ids) > 0) {
                // ids are md5 hashes, safe to embed as-is
                $sql = 'UPDATE ' . $this->_table . ' SET select_time = ' . $now . ' ';
                $sql .= 'WHERE id IN (\'' . implode('\', \'', $ids) . '\')';
                $this->query($sql);
            }
        }

        if ($fd !== false) {
            flock($fd, LOCK_UN);
            fclose($fd); // always release the handle, even if unlocking failed
        }
        return $ret;
    }

    /**
     * Adds a URL unless it is already known.
     *
     * URLs added recently are rejected from an in-memory cache without
     * touching the database. For the others INSERT IGNORE is used, so a URL
     * that already exists in the table is reported through the affected row
     * count instead of a duplicate-key error (such an error is thrown as an
     * exception under the default mysqli report mode of PHP 8.1+).
     *
     * @param string $url
     * @param int $rank
     * @return boolean true when a row was inserted, false otherwise
     */
    public function addUrl($url, $rank = 0)
    {
        $id = md5($url);
        if ($this->inAddCache($id)) {
            return false;
        }
        $url = $this->real_escape_string($url);
        $sql = 'INSERT IGNORE INTO ' . $this->_table . ' (id, url, `rank`) ';
        $sql .= 'VALUES (\'' . $id . '\', \'' . $url . '\', ' . intval($rank) . ')';
        return $this->query($sql) && $this->affected_rows === 1;
    }

    /**
     * Stores the status of the last fetch and the current time as update time.
     *
     * @param string $url
     * @param int $status
     * @return boolean result of the UPDATE query
     */
    public function updateUrl($url, $status = 200)
    {
        $now = time();
        $sql = 'UPDATE ' . $this->_table . ' SET status = ' . intval($status) . ', update_time = ' . $now . ' ';
        $sql .= 'WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql);
    }

    /**
     * @param string $url
     * @return boolean true when exactly one row was deleted
     */
    public function delUrl($url)
    {
        $sql = 'DELETE FROM ' . $this->_table . ' WHERE id = \'' . md5($url) . '\'';
        return $this->query($sql) && $this->affected_rows === 1;
    }

    /**
     * Pings the server before running the query.
     *
     * The attribute below silences the PHP 8.1+ deprecation about the
     * missing return type; older PHP versions read that line as a comment.
     *
     * @param string $query
     * @param int $mode
     * @return mixed result of mysqli::query()
     */
    #[\ReturnTypeWillChange]
    public function query($query, $mode = MYSQLI_STORE_RESULT)
    {
        $this->ping();
        return parent::query($query, $mode);
    }

    /**
     * Self-test: raises a user error when the connection failed or when a
     * throw-away URL cannot be inserted; the URL is deleted again afterwards.
     *
     * @return boolean true when the table is usable
     */
    protected function test()
    {
        if ($this->connect_error) {
            return trigger_error($this->connect_error, E_USER_ERROR);
        }
        $url = 'http://' . uniqid() . '.com/';
        if (!$this->addUrl($url)) {
            return trigger_error($this->error, E_USER_ERROR);
        }
        $this->delUrl($url);
        return true;
    }

    /**
     * Checks the cache of recently added ids and records the id in it.
     * Once the cache exceeds 20000 entries, those not seen within the last
     * hour are dropped.
     *
     * @param string $id md5 hash of the URL
     * @return boolean true when the id was already cached
     */
    private function inAddCache($id)
    {
        $now = time();
        $known = isset($this->_addCache[$id]);
        $this->_addCache[$id] = $now;
        if ($known) {
            return true;
        }
        if (count($this->_addCache) > 20000) {
            $expire = $now - 3600;
            $fresh = array();
            foreach ($this->_addCache as $key => $seenAt) {
                if ($seenAt > $expire) {
                    $fresh[$key] = $seenAt;
                }
            }
            $this->_addCache = $fresh;
        }
        return false;
    }
}
201 |
/**
 * Base parser with URL extraction.
 *
 * URL filter rules are either plain substrings or regular expressions
 * (a regular expression must start with #). A URL is checked as follows:
 * 1. External domain while external sites are followed: rejected as soon as
 *    one disallowDomain rule matches.
 * 2. External domain while external sites are not followed: it must match
 *    one allowDomain rule to go on.
 * 3. Rejected as soon as one disallow rule matches.
 * 4. When allow rules exist, at least one of them must match.
 * 5. Rejected when its extension is in the disallowExt list.
 * 6. Otherwise accepted.
 */
class UrlParser implements ParseInterface
{
    private $_timeBegin, $_numAdd, $_numUpdate, $_numFilter;
    private $_followExternal;
    private $_disallowDomain, $_allowDomain, $_disallow, $_allow;
    private $_allowRank;
    private $_nofollow;
    // Keys must be lower-case: isDisallow() lower-cases the extension before the lookup.
    private $_disallowExt = array(
        '.tar' => true, '.gz' => true, '.tgz' => true, '.zip' => true, '.z' => true, '.7z' => true,
        '.rpm' => true, '.deb' => true, '.ps' => true, '.dvi' => true, '.pdf' => true, '.smi' => true,
        '.png' => true, '.jpg' => true, '.jpeg' => true, '.bmp' => true, '.tiff' => true, '.gif' => true,
        '.mov' => true, '.avi' => true, '.mpeg' => true, '.mpg' => true, '.mp3' => true, '.qt' => true,
        '.wav' => true, '.ram' => true, '.rm' => true, '.rmvb' => true, '.jar' => true, '.java' => true,
        '.class' => true, '.diff' => true, '.doc' => true, '.docx' => true, '.xls' => true, '.ppt' => true,
        '.mdb' => true, '.rtf' => true, '.exe' => true, '.pps' => true, '.so' => true, '.psd' => true,
        '.css' => true, '.js' => true, '.ico' => true, '.dll' => true, '.bz2' => true, '.rar' => true,
    );
    private $_ut;

    /**
     * @param UrlTable $ut the URL store that receives discovered URLs
     */
    public function __construct(UrlTable $ut)
    {
        $this->_ut = $ut;
        $this->_timeBegin = time();
        $this->_numAdd = $this->_numUpdate = $this->_numFilter = 0;
        // start from empty rules, then let subclasses install their defaults
        $this->resetFilter();
        $this->defaultFilter();
    }

    public function __destruct()
    {
        $this->_ut = null;
    }

    /**
     * @return UrlTable
     */
    public function getUrlTable()
    {
        return $this->_ut;
    }

    /**
     * Hook for subclasses: install the default URL filter rules here.
     */
    public function defaultFilter()
    {

    }

    /**
     * Clears every filter rule except the extension list.
     */
    public function resetFilter()
    {
        $this->_followExternal = false;
        $this->_disallowDomain = array();
        $this->_allowDomain = array();
        $this->_disallow = array();
        $this->_allow = array();
        $this->_allowRank = array();
        $this->_nofollow = array();
    }

    /**
     * @param boolean $on whether URLs of external sites are processed
     *  (off after resetFilter()); only a strict true switches it on
     */
    public function followExternal($on = true)
    {
        $this->_followExternal = ($on === true);
    }

    /**
     * @param string $rule rule for domains to reject, regular expressions supported
     */
    public function disallowDomain($rule)
    {
        $this->saveMatchRule($this->_disallowDomain, $rule);
    }

    /**
     * @param string $rule rule for domains to accept, regular expressions supported
     */
    public function allowDomain($rule)
    {
        $this->saveMatchRule($this->_allowDomain, $rule);
    }

    /**
     * @param string $rule rule for URLs to reject, regular expressions supported
     */
    public function disallow($rule)
    {
        $this->saveMatchRule($this->_disallow, $rule);
    }

    /**
     * @param string $rule rule for URLs to accept, regular expressions supported
     * @param int $rank rank given to URLs matching this rule
     * @param boolean $follow whether links on pages matching this rule are followed
     */
    public function allow($rule, $rank = null, $follow = true)
    {
        $this->saveMatchRule($this->_allow, $rule);
        if ($rank !== null) {
            $this->_allowRank[$rule] = intval($rank);
        }
        if (!$follow) {
            $this->saveMatchRule($this->_nofollow, $rule);
        }
    }

    /**
     * @param string $name URL extension to reject, must start with a dot
     */
    public function disallowExt($name)
    {
        $this->_disallowExt[strtolower($name)] = true;
    }

    /**
     * @param string $name URL extension to accept again, must start with a dot
     */
    public function allowExt($name)
    {
        if (substr($name, 0, 1) !== '.') {
            return;
        }
        unset($this->_disallowExt[strtolower($name)]);
    }

    /**
     * Builds the statistics line: elapsed time and URL counters.
     *
     * @param boolean $output true to print the line instead of returning it
     * @return string|null the line when $output is not true
     */
    public function stat($output = false)
    {
        // time
        $time = time() - $this->_timeBegin;
        $string = date('m-d H:i:s') . ' - Time cost: ';
        // >= so that exactly one hour (or one minute) is not reported in the smaller unit
        if ($time >= 3600) {
            $string .= intval($time / 3600) . ' hours ';
            $time %= 3600;
        }
        if ($time >= 60) {
            $string .= intval($time / 60) . ' mins ';
            $time %= 60;
        }
        $string .= $time . ' secs, ';
        // stats
        $string .= sprintf('URLs total: %d, Add: %d, Update: %d, Filtered: %d', $this->_ut->getCount(), $this->_numAdd, $this->_numUpdate, $this->_numFilter);
        if ($output !== true) {
            return $string;
        }
        echo $string . "\n";
    }

    /**
     * Implementation of ParseInterface: records the response status of the
     * URL, then collects new URLs from the page links (status 200) or from
     * the redirect target (status 301).
     *
     * @param Response $res
     * @param Request $req
     * @param mixed $key
     */
    public function parse(Response $res, Request $req, $key)
    {
        // update url
        $rawUrl = $req->getRawUrl();
        if ($this->_ut->updateUrl($rawUrl, $res->status)) {
            $this->_numUpdate++;
        }
        // parse url from body
        if ($res->status === 200 && $this->isFollowUrl($rawUrl)) {
            // relative links resolve against <base href="..."> when the page declares one
            $baseUrl = $req->getUrl();
            if (preg_match('/<base\s+href=[\'"]?(.*?)[\s\'">]/i', $res->body, $match)) {
                $baseUrl = $this->resetUrl($match[1], $baseUrl);
            }
            // href="xxx", href='xxx'
            if (preg_match_all('/href=([\'"])(.*?)\1/i', $res->body, $matches) > 0) {
                foreach ($matches[2] as $url) {
                    $this->processUrl($url, $baseUrl, $res->url);
                }
            }
            // href=xxx
            if (preg_match_all('/href=(?![\'"])(.*?)[\s>]/i', $res->body, $matches) > 0) {
                foreach ($matches[1] as $url) {
                    $this->processUrl($url, $baseUrl, $res->url);
                }
            }
        } elseif ($res->status === 301 || $res->status === 302) {
            $url = $this->resetUrl($res->getHeader('location'), $req->getUrl());
            $res->setHeader('location', $url); // overwrite formated url
            // save url for permanent redirection
            if ($res->status === 301) {
                $this->processUrl($url, $res->url);
            }
        }
    }

    /**
     * Applies the filter rules to a URL.
     *
     * @param string $url
     * @param string $rawUrl URL of the originating page, used to detect external sites
     * @param string &$rank receives the rank of the matching allow rule, if it has one
     * @return boolean true when the URL must be excluded
     */
    public function isDisallow($url, $rawUrl = null, &$rank = null)
    {
        // get domain
        if (($pos1 = strpos($url, '://')) === false) {
            return true;
        }
        $pos1 += 3;
        $pos2 = strpos($url, '/', $pos1);
        $domain = $pos2 === false ? substr($url, $pos1) : substr($url, $pos1, $pos2 - $pos1);
        // external domain: the originating URL does not contain this domain
        if ($rawUrl !== null && !@strstr($rawUrl, $domain)) {
            // disallow domain
            if ($this->_followExternal && $this->isMatchRule($this->_disallowDomain, $domain)) {
                return true;
            }
            // allow domain
            if (!$this->_followExternal
                && (count($this->_allowDomain) === 0 || !$this->isMatchRule($this->_allowDomain, $domain))) {
                return true;
            }
        }
        // disallow
        if ($this->isMatchRule($this->_disallow, $url)) {
            return true;
        }
        // allow
        if (count($this->_allow) > 0 && !$this->isMatchRule($this->_allow, $url, $rank)) {
            return true;
        }
        // disallowExt: extension of the path part, query string excluded
        if (($pos1 = strpos($url, '?')) === false) {
            $pos1 = strlen($url);
        }
        if (($pos2 = strpos($url, '/', 8)) !== false
            && ($ext = strrchr(substr($url, $pos2, $pos1 - $pos2), '.'))) {
            $ext = strtolower($ext);
            if (isset($this->_disallowExt[$ext])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turns a link into a normalized absolute URL.
     *
     * @param string $url
     * @param string $baseUrl URL against which relative links are resolved
     * @return string the normalized URL
     */
    public function resetUrl($url, $baseUrl = null)
    {
        // leading part: drop a doubled scheme, make relative links absolute
        if (!strncasecmp($url, 'http://http://', 14)) {
            $url = substr($url, 7);
        }
        if (strncasecmp($url, 'http://', 7) && strncasecmp($url, 'https://', 8)) {
            if ($baseUrl === null) {
                $url = 'http://' . $url;
            } else {
                if (substr($url, 0, 1) === '/') {
                    // site-relative: keep scheme and host of the base
                    $pos = @strpos($baseUrl, '/', 8);
                    $url = ($pos === false ? $baseUrl : substr($baseUrl, 0, $pos)) . $url;
                } else {
                    // document-relative: keep the base up to its last slash
                    $pos = @strrpos($baseUrl, '/', 8);
                    $url = ($pos === false ? $baseUrl . '/' : substr($baseUrl, 0, $pos + 1)) . $url;
                }
            }
        }
        // uniform format: a bare host ends with /, the #anchor is removed
        if (@strpos($url, '/', 8) === false) {
            $url .= '/';
        }
        if (($pos = strrpos($url, '#')) !== false) {
            $url = substr($url, 0, $pos);
        }
        // collapse '/./' and '/../' segments
        if (strpos($url, '/./') !== false || strpos($url, '/../') !== false) {
            $parts = array();
            $tmpa = explode('/', substr($url, 8));
            for ($i = 0; $i < count($tmpa); $i++) {
                if ($tmpa[$i] === '.' || ($tmpa[$i] === '' && isset($tmpa[$i + 1]))) {
                    continue;
                } elseif ($tmpa[$i] !== '..') {
                    array_push($parts, $tmpa[$i]);
                } elseif (count($parts) > 1) {
                    // never pop the first part, it holds the host
                    array_pop($parts);
                }
            }
            $url = substr($url, 0, 8) . implode('/', $parts);
        }
        return $url;
    }

    /**
     * @param string $url
     * @return boolean whether the links inside the content of this URL are processed
     */
    protected function isFollowUrl($url)
    {
        return !$this->isMatchRule($this->_nofollow, $url);
    }

    /**
     * Normalizes, filters and stores one extracted link.
     *
     * @param string $url the link as found in the page
     * @param string $baseUrl URL against which the link is resolved
     * @param string $rawUrl originating URL for the external-site check, $baseUrl when null
     * @return string 'SKIP' (ignored or not added), 'FILTER' (rejected by the rules) or 'ADD'
     */
    protected function processUrl($url, $baseUrl, $rawUrl = null)
    {
        if (substr($url, 0, 1) === '#' || !strncasecmp($url, 'javascript:', 11) || !strncasecmp($url, 'mailto:', 7)) {
            return 'SKIP';
        }
        $url = $this->resetUrl($url, $baseUrl);
        $rank = 0;
        if ($this->isDisallow($url, $rawUrl === null ? $baseUrl : $rawUrl, $rank)) {
            $this->_numFilter++;
            return 'FILTER';
        }
        if ($this->_ut->addUrl($url, $rank)) {
            $this->_numAdd++;
            return 'ADD';
        }
        return 'SKIP';
    }

    /**
     * Adds a rule to a rule list; null empties the list. Regular expressions
     * are appended with a leading 0xff byte as marker, plain substrings are
     * put in front so they are tried first.
     */
    private function saveMatchRule(&$array, $rule)
    {
        if ($rule === null) {
            $array = array();
        } elseif ($this->isRegexPattern($rule)) {
            array_push($array, "\xff" . $rule);
        } else {
            array_unshift($array, $rule);
        }
    }

    /**
     * Tests the input against a rule list: case-insensitive substring search
     * for plain rules, preg_match() for rules marked as regular expressions.
     * On a match, $rank receives the rank registered for that rule, if any.
     */
    private function isMatchRule($rules, $input, &$rank = null)
    {
        foreach ($rules as $rule) {
            if (ord($rule[0]) !== 0xff) {
                $matched = stristr($input, $rule) !== false;
            } else {
                $rule = substr($rule, 1); // strip the regex marker
                $matched = preg_match($rule, $input) > 0;
            }
            if ($matched === true) {
                if (isset($this->_allowRank[$rule])) {
                    $rank = $this->_allowRank[$rule];
                }
                return true;
            }
        }
        return false;
    }

    /**
     * A rule is a regular expression when it starts with # and has a
     * closing # followed only by the modifiers i and/or u.
     */
    private function isRegexPattern($input)
    {
        if (strlen($input) > 2 && $input[0] === '#') {
            for ($i = strlen($input) - 1; $i > 1; $i--) {
                if ($input[$i] === $input[0]) {
                    return true;
                }
                if ($input[$i] !== 'i' && $input[$i] !== 'u') {
                    break;
                }
            }
        }
        return false;
    }
}
591 |
--------------------------------------------------------------------------------
/spider.php:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env php
2 | [options]
22 | pspider -h
23 |
24 | -c The name of custom file under 'custom/'
25 | -n The number of URLs to crawl in parallel
26 | -q Quit after processing the number of URLs
27 | -p Time interval to crawl the same URL
28 | -u The start URL (forced to crawl once)
29 | -h This help
30 |
31 | EOF;
32 | exit(0);
33 | }
34 |
// locate the custom file selected with -c
$file = __DIR__ . '/custom/' . $options['c'] . '.inc.php';
if (!file_exists($file)) {
    echo "ERROR: Custom file dose not exists '$file'.\n";
    exit(-1);
}

// add library
require_once __DIR__ . '/vendor/autoload.php';
require_once __DIR__ . '/lib/UrlTable.php';
include $file;

if (!class_exists('UrlTableCustom') || !class_exists('UrlParserCustom')) {
    echo "ERROR: Invalid custom file '$file'.\n";
    echo "Must have defined class 'UrlParserCustom' inherited from 'UrlParser',\n";
    echo "and class 'UrlTableCustom' inherited from 'UrlTableMySQL'.\n";
    echo "Please see 'custom/skel.inc.php'.\n";
    exit(-1);
}

// numeric options, with the defaults defined by the custom file
$options['n'] = isset($options['n']) ? intval($options['n']) : PSP_NUM_PARALLEL;
$options['p'] = isset($options['p']) ? intval($options['p']) : PSP_CRAWL_PERIOD;
$options['q'] = isset($options['q']) ? intval($options['q']) : 0;
if ($options['n'] < 1) {
    // a batch size below 1 would fetch nothing and end the crawl silently
    echo "WARNING: Invalid value for -n, using default " . PSP_NUM_PARALLEL . ".\n";
    $options['n'] = PSP_NUM_PARALLEL;
}

// create objects
$ut = new UrlTableCustom;
$up = new UrlParserCustom($ut);
$http = new \hightman\http\Client($up);

// start url
if (isset($options['u'])) {
    $http->get($up->resetUrl($options['u']));
}

// loop to handle: ends when no URL is due (empty array) or on error (false)
$num = 0;
while ($urls = $ut->getSome($options['n'], $options['p'])) {
    $http->mget($urls);
    $num += count($urls);
    if ($options['q'] > 0 && $num >= $options['q']) {
        break;
    }
    // print intermediate stats roughly every 1000 URLs
    if ($num > $options['n'] && ($num % 1000) < $options['n']) {
        $up->stat(true);
    }
}

// print stats
$up->stat(true);
echo "OK, finished!\n";
--------------------------------------------------------------------------------
/tests/bootstrap.php:
--------------------------------------------------------------------------------
1 | object = new HttpBase;
21 | }
22 |
/**
 * Fixture teardown hook, called after each test has run
 * (the place to close a network connection, for example).
 * Intentionally empty.
 */
protected function tearDown()
{
}
31 |
/**
 * Expects setHeader() to store and overwrite a value that getHeader() finds
 * under a differently-cased name, and to remove the header when given null.
 */
public function testSetHeader()
{
    $http = $this->object;

    // set, then overwrite
    $http->setHeader('content-type', 'text/html');
    $this->assertEquals('text/html', $http->getHeader('Content-Type'));
    $http->setHeader('content-type', 'text/plain');
    $this->assertEquals('text/plain', $http->getHeader('Content-Type'));
    $this->assertArrayHasKey('content-type', $http->getHeader(null));

    // null removes the header
    $http->setHeader('content-type', null);
    $this->assertNull($http->getHeader('content-type'));
    $this->assertNull($http->getHeader('Content-Type'));
    $this->assertArrayNotHasKey('content-type', $http->getHeader(null));
}
47 |
/**
 * Expects addHeader() to create a header, to turn it into an array when a
 * second value is added, and clearHeader() to leave no header behind.
 */
public function testAddHeader()
{
    $http = $this->object;

    // first value
    $this->assertFalse($http->hasHeader('content-type'));
    $http->addHeader('content-type', 'text/html');
    $this->assertTrue($http->hasHeader('content-type'));
    $this->assertEquals('text/html', $http->getHeader('Content-Type'));

    // second value, then removal
    $http->addHeader('content-type', 'text/plain');
    $this->assertInternalType('array', $http->getHeader('Content-Type'));
    $http->setHeader('content-type', null);
    $this->assertNull($http->getHeader('content-type'));

    // whole header list
    $this->assertInternalType('array', $http->getHeader(null));
    $http->clearHeader();
    $this->assertEmpty($http->getHeader(null));
}
65 |
/**
 * Placeholder, reported as incomplete until the test is written.
 *
 * @todo Implement testSetCookie().
 */
public function testSetCookie()
{
    // Replace this call with real assertions once the test is implemented.
    $this->markTestIncomplete('This test has not been implemented yet.');
}
76 |
/**
 * Placeholder, reported as incomplete until the test is written.
 *
 * @todo Implement testClearCookie().
 */
public function testClearCookie()
{
    // Replace this call with real assertions once the test is implemented.
    $this->markTestIncomplete('This test has not been implemented yet.');
}
87 |
88 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
89 | * @todo Implement testGetCookie().
90 | */
91 | public function testGetCookie()
92 | {
93 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
94 | $this->markTestIncomplete(
95 | 'This test has not been implemented yet.'
96 | );
97 | }
98 |
99 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
100 | * @todo Implement testApplyCookie().
101 | */
102 | public function testApplyCookie()
103 | {
104 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
105 | $this->markTestIncomplete(
106 | 'This test has not been implemented yet.'
107 | );
108 | }
109 |
110 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
111 | * @todo Implement testFetchCookieToSend().
112 | */
113 | public function testFetchCookieToSend()
114 | {
115 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
116 | $this->markTestIncomplete(
117 | 'This test has not been implemented yet.'
118 | );
119 | }
120 | }
121 |
122 |
--------------------------------------------------------------------------------
/tests/lib/HttpClientTest.php:
--------------------------------------------------------------------------------
1 | object = new HttpClient;
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement testDebug().
34 | */
35 | public function testDebug()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testSetCookiePath().
45 | */
46 | public function testSetCookiePath()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testSetParser().
56 | */
57 | public function testSetParser()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 |
65 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
66 | * @todo Implement testRunParser().
67 | */
68 | public function testRunParser()
69 | {
70 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
71 | $this->markTestIncomplete(
72 | 'This test has not been implemented yet.'
73 | );
74 | }
75 |
76 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
77 | * @todo Implement testClearHeader().
78 | */
79 | public function testClearHeader()
80 | {
81 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
82 | $this->markTestIncomplete(
83 | 'This test has not been implemented yet.'
84 | );
85 | }
86 |
87 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
88 | * @todo Implement testGet().
89 | */
90 | public function testGet()
91 | {
92 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
93 | $this->markTestIncomplete(
94 | 'This test has not been implemented yet.'
95 | );
96 | }
97 |
98 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
99 | * @todo Implement testHead().
100 | */
101 | public function testHead()
102 | {
103 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
104 | $this->markTestIncomplete(
105 | 'This test has not been implemented yet.'
106 | );
107 | }
108 |
109 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
110 | * @todo Implement testProcess().
111 | */
112 | public function testProcess()
113 | {
114 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
115 | $this->markTestIncomplete(
116 | 'This test has not been implemented yet.'
117 | );
118 | }
119 |
120 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
121 | * @todo Implement testExec().
122 | */
123 | public function testExec()
124 | {
125 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
126 | $this->markTestIncomplete(
127 | 'This test has not been implemented yet.'
128 | );
129 | }
130 |
131 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
132 | * @todo Implement test__destruct().
133 | */
134 | public function test__destruct()
135 | {
136 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
137 | $this->markTestIncomplete(
138 | 'This test has not been implemented yet.'
139 | );
140 | }
141 | }
142 |
--------------------------------------------------------------------------------
/tests/lib/HttpConnTest.php:
--------------------------------------------------------------------------------
1 | object = HttpConn::connect('');
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement testConnect().
34 | */
35 | public function testConnect()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testFindBySock().
45 | */
46 | public function testFindBySock()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testGetLastError().
56 | */
57 | public function testGetLastError()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 |
65 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
66 | * @todo Implement testClose().
67 | */
68 | public function testClose()
69 | {
70 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
71 | $this->markTestIncomplete(
72 | 'This test has not been implemented yet.'
73 | );
74 | }
75 |
76 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
77 | * @todo Implement testAddWriteData().
78 | */
79 | public function testAddWriteData()
80 | {
81 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
82 | $this->markTestIncomplete(
83 | 'This test has not been implemented yet.'
84 | );
85 | }
86 |
87 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
88 | * @todo Implement testHasDataToWrite().
89 | */
90 | public function testHasDataToWrite()
91 | {
92 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
93 | $this->markTestIncomplete(
94 | 'This test has not been implemented yet.'
95 | );
96 | }
97 |
98 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
99 | * @todo Implement testWrite().
100 | */
101 | public function testWrite()
102 | {
103 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
104 | $this->markTestIncomplete(
105 | 'This test has not been implemented yet.'
106 | );
107 | }
108 |
109 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
110 | * @todo Implement testGetLine().
111 | */
112 | public function testGetLine()
113 | {
114 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
115 | $this->markTestIncomplete(
116 | 'This test has not been implemented yet.'
117 | );
118 | }
119 |
120 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
121 | * @todo Implement testRead().
122 | */
123 | public function testRead()
124 | {
125 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
126 | $this->markTestIncomplete(
127 | 'This test has not been implemented yet.'
128 | );
129 | }
130 |
131 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
132 | * @todo Implement testGetSock().
133 | */
134 | public function testGetSock()
135 | {
136 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
137 | $this->markTestIncomplete(
138 | 'This test has not been implemented yet.'
139 | );
140 | }
141 |
142 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
143 | * @todo Implement testGetExArg().
144 | */
145 | public function testGetExArg()
146 | {
147 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
148 | $this->markTestIncomplete(
149 | 'This test has not been implemented yet.'
150 | );
151 | }
152 |
153 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
154 | * @todo Implement test__destruct().
155 | */
156 | public function test__destruct()
157 | {
158 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
159 | $this->markTestIncomplete(
160 | 'This test has not been implemented yet.'
161 | );
162 | }
163 | }
164 |
--------------------------------------------------------------------------------
/tests/lib/HttpProcesserTest.php:
--------------------------------------------------------------------------------
1 | object = new HttpProcesser;
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement testGetConn().
34 | */
35 | public function testGetConn()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testSend().
45 | */
46 | public function testSend()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testRecv().
56 | */
57 | public function testRecv()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 |
65 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
66 | * @todo Implement testFinish().
67 | */
68 | public function testFinish()
69 | {
70 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
71 | $this->markTestIncomplete(
72 | 'This test has not been implemented yet.'
73 | );
74 | }
75 |
76 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
77 | * @todo Implement test__destruct().
78 | */
79 | public function test__destruct()
80 | {
81 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
82 | $this->markTestIncomplete(
83 | 'This test has not been implemented yet.'
84 | );
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/tests/lib/HttpRequestTest.php:
--------------------------------------------------------------------------------
1 | object = new HttpRequest;
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement testSetMaxRedirect().
34 | */
35 | public function testSetMaxRedirect()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testGetMaxRedirect().
45 | */
46 | public function testGetMaxRedirect()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testSetUrl().
56 | */
57 | public function testSetUrl()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 |
65 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
66 | * @todo Implement testGetRawUrl().
67 | */
68 | public function testGetRawUrl()
69 | {
70 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
71 | $this->markTestIncomplete(
72 | 'This test has not been implemented yet.'
73 | );
74 | }
75 |
76 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
77 | * @todo Implement testGetUrl().
78 | */
79 | public function testGetUrl()
80 | {
81 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
82 | $this->markTestIncomplete(
83 | 'This test has not been implemented yet.'
84 | );
85 | }
86 |
87 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
88 | * @todo Implement testGetUrlParams().
89 | */
90 | public function testGetUrlParams()
91 | {
92 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
93 | $this->markTestIncomplete(
94 | 'This test has not been implemented yet.'
95 | );
96 | }
97 |
98 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
99 | * @todo Implement testGetUrlParam().
100 | */
101 | public function testGetUrlParam()
102 | {
103 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
104 | $this->markTestIncomplete(
105 | 'This test has not been implemented yet.'
106 | );
107 | }
108 |
109 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
110 | * @todo Implement testSetMethod().
111 | */
112 | public function testSetMethod()
113 | {
114 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
115 | $this->markTestIncomplete(
116 | 'This test has not been implemented yet.'
117 | );
118 | }
119 |
120 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
121 | * @todo Implement testGetMethod().
122 | */
123 | public function testGetMethod()
124 | {
125 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
126 | $this->markTestIncomplete(
127 | 'This test has not been implemented yet.'
128 | );
129 | }
130 |
131 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
132 | * @todo Implement testGetBody().
133 | */
134 | public function testGetBody()
135 | {
136 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
137 | $this->markTestIncomplete(
138 | 'This test has not been implemented yet.'
139 | );
140 | }
141 |
142 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
143 | * @todo Implement testAddPostField().
144 | */
145 | public function testAddPostField()
146 | {
147 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
148 | $this->markTestIncomplete(
149 | 'This test has not been implemented yet.'
150 | );
151 | }
152 |
153 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
154 | * @todo Implement testAddPostFile().
155 | */
156 | public function testAddPostFile()
157 | {
158 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
159 | $this->markTestIncomplete(
160 | 'This test has not been implemented yet.'
161 | );
162 | }
163 |
164 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
165 | * @todo Implement test__toString().
166 | */
167 | public function test__toString()
168 | {
169 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
170 | $this->markTestIncomplete(
171 | 'This test has not been implemented yet.'
172 | );
173 | }
174 | }
175 |
--------------------------------------------------------------------------------
/tests/lib/HttpResponseTest.php:
--------------------------------------------------------------------------------
1 | object = new HttpResponse('http://localhost/');
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement test__toString().
34 | */
35 | public function test__toString()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testHasError().
45 | */
46 | public function testHasError()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testReset().
56 | */
57 | public function testReset()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/tests/lib/UrlParserTest.php:
--------------------------------------------------------------------------------
1 | object = new UrlParser(new UrlTableMySQL);
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | public function testIsDisallow()
33 | {
34 | $rawUrl = 'http://www.czxiu.com/cz/default.html';
35 | $obj = $this->object;
36 | // Walks isDisallow() through each rule type in turn; rules accumulate on the same parser object, so the sections below build on one another.
37 | // allow domain: patterns may be regexes or plain strings; only matching hosts and the host of $rawUrl are expected to pass
38 | $obj->allowDomain('/baidu.com$/');
39 | $obj->allowDomain('sina.com');
40 | $this->assertTrue($obj->isDisallow('http://www.qq.com/test.html', $rawUrl));
41 | $this->assertTrue($obj->isDisallow('http://www.baidu.com.cn/test.html', $rawUrl));
42 | $this->assertTrue($obj->isDisallow('http://id.czxiu.com/zj/21.html', $rawUrl));
43 | $this->assertFalse($obj->isDisallow('http://news.sina.com.cn/', $rawUrl));
44 | $this->assertFalse($obj->isDisallow('http://www.czxiu.com', $rawUrl));
45 |
46 | // disallow domain: after followExternal() other hosts are expected to pass unless they match a disallowDomain() pattern
47 | $obj->disallowDomain('weibo.com');
48 | $obj->disallowDomain('#^t\.#');
49 | $obj->followExternal();
50 | $this->assertFalse($obj->isDisallow('http://www.qq.com/test.html', $rawUrl));
51 | $this->assertFalse($obj->isDisallow('http://www.baidu.com.cn/test.html', $rawUrl));
52 | $this->assertFalse($obj->isDisallow('http://id.czxiu.com/zj/21.html', $rawUrl));
53 | $this->assertTrue($obj->isDisallow('http://t.czxiu.com/zj/21.html', $rawUrl));
54 | $this->assertTrue($obj->isDisallow('http://weibo.com/zj/21.html', $rawUrl));
55 | $this->assertTrue($obj->isDisallow('http://tt.weibo.com/zj/21.html', $rawUrl));
56 |
57 | // disallow rule: plain strings or regexes checked against the URL; the assertions imply regexes are case-sensitive unless they carry the i flag
58 | $obj->disallow('/21.html');
59 | $obj->disallow('#http://x\.#');
60 | $obj->disallow('#COM\.CN#i');
61 | $this->assertFalse($obj->isDisallow('http://tt.weibo2.com/zj/22.html', $rawUrl));
62 | $this->assertTrue($obj->isDisallow('http://x.czxiu.com/zj/21.html', $rawUrl));
63 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/zj/211.html', $rawUrl));
64 | $this->assertTrue($obj->isDisallow('http://www.xxx.com.cn/test.html', $rawUrl));
65 |
66 | // allow rule: before any allow() call a URL that hits no disallow rule passes; afterwards only URLs matching an allow rule are expected to pass
67 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/ft/2.html', $rawUrl));
68 | $obj->allow('/zj');
69 | $this->assertTrue($obj->isDisallow('HTTP://x.czxiu.com/ft/2.html', $rawUrl));
70 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/zj/212.html', $rawUrl));
71 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/ZJ/212.jpg2', $rawUrl));
72 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/zj2/21.php?xyz=1', $rawUrl));
73 |
74 | // disallow ext (default): .tar is expected to be blocked out of the box; allowExt()/disallowExt() change that, and a query string does not hide the extension
75 | $this->assertTrue($obj->isDisallow('HTTP://x.czxiu.com/ZJ/21.tar?down=yes&x=1', $rawUrl));
76 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/zj/212.html', $rawUrl));
77 | $obj->allowExt('.tar');
78 | $obj->disallowExt('.html');
79 | $this->assertFalse($obj->isDisallow('HTTP://x.czxiu.com/ZJ/21.tar?down=yes&x=1', $rawUrl));
80 | $this->assertTrue($obj->isDisallow('HTTP://x.czxiu.com/zj/212.html?ok=1', $rawUrl));
81 | }
82 |
83 | /** Checks that resetUrl() turns $input, resolved against a fixed base URL, into the expected $output.
84 | * @dataProvider provider
85 | */
86 | public function testResetUrl($input, $output)
87 | {
88 | $baseUrl = 'http://www.czxiu.com/cz/default.html'; // every dataset is resolved relative to this page
89 | $obj = $this->object;
90 | $this->assertEquals($output, $obj->resetUrl($input, $baseUrl));
91 | }
92 |
93 | public function provider()
94 | { // Datasets for testResetUrl(): each row is array($input, $output), where $output is the URL expected for the test's base page.
95 | return array(
96 | array('', 'http://www.czxiu.com/cz/'), // empty link -> directory of the base page
97 | array('test.html', 'http://www.czxiu.com/cz/test.html'),
98 | array('./test.html', 'http://www.czxiu.com/cz/test.html'),
99 | array('../cz/test.html', 'http://www.czxiu.com/cz/test.html'),
100 | array('../..//test.html', 'http://www.czxiu.com/test.html'), // surplus ../ and the doubled slash are expected to collapse
101 | array('diy/test.html', 'http://www.czxiu.com/cz/diy/test.html'),
102 | array('/diy/test.html', 'http://www.czxiu.com/diy/test.html'), // leading slash -> resolved from the site root
103 | array('#', 'http://www.czxiu.com/cz/'), // fragment-only link
104 | array('https://diy.czxiu.com', 'https://diy.czxiu.com/'), // absolute URL without a path gains a trailing slash
105 | array('http://diy.czxiu.com/index.php#', 'http://diy.czxiu.com/index.php'), // trailing # is dropped
106 | array('http://diy.czxiu.com/cz/diy/../index.php#', 'http://diy.czxiu.com/cz/index.php'),
107 | );
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/tests/lib/UrlTableMySQLTest.php:
--------------------------------------------------------------------------------
1 | object = new UrlTableMySQL;
21 | }
22 |
23 | /**
24 | * Tears down the fixture after each test; a typical use would be closing a network connection.
25 | * This method is called after a test is executed. The body below is empty, so no cleanup is performed.
26 | */
27 | protected function tearDown()
28 | {
29 | // Empty body: nothing is released here.
30 | }
31 |
32 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
33 | * @todo Implement testSetTableName().
34 | */
35 | public function testSetTableName()
36 | {
37 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
38 | $this->markTestIncomplete(
39 | 'This test has not been implemented yet.'
40 | );
41 | }
42 |
43 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
44 | * @todo Implement testGetCount().
45 | */
46 | public function testGetCount()
47 | {
48 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
49 | $this->markTestIncomplete(
50 | 'This test has not been implemented yet.'
51 | );
52 | }
53 |
54 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
55 | * @todo Implement testGetOne().
56 | */
57 | public function testGetOne()
58 | {
59 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
60 | $this->markTestIncomplete(
61 | 'This test has not been implemented yet.'
62 | );
63 | }
64 |
65 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
66 | * @todo Implement testGetSome().
67 | */
68 | public function testGetSome()
69 | {
70 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
71 | $this->markTestIncomplete(
72 | 'This test has not been implemented yet.'
73 | );
74 | }
75 |
76 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
77 | * @todo Implement testAddUrl().
78 | */
79 | public function testAddUrl()
80 | {
81 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
82 | $this->markTestIncomplete(
83 | 'This test has not been implemented yet.'
84 | );
85 | }
86 |
87 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
88 | * @todo Implement testUpdateUrl().
89 | */
90 | public function testUpdateUrl()
91 | {
92 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
93 | $this->markTestIncomplete(
94 | 'This test has not been implemented yet.'
95 | );
96 | }
97 |
98 | /** Placeholder test: makes no assertions yet and only reports itself as incomplete.
99 | * @todo Implement testQuery().
100 | */
101 | public function testQuery()
102 | {
103 | // Stub body: drop the markTestIncomplete() call below once real assertions are written.
104 | $this->markTestIncomplete(
105 | 'This test has not been implemented yet.'
106 | );
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/tests/phpunit.xml:
--------------------------------------------------------------------------------
1 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------