├── .gitignore ├── tests ├── phpinfo_test.php ├── Page_test.html ├── errcode_test.php ├── autoload.php ├── return_this_test.php ├── test_simple_html_dom.php ├── Crawler_test.php ├── fetcher_db_test.php ├── curl_test.php ├── test_phpfetcher_util_trie.php ├── Page_test.php └── simple_html_dom.php ├── Phpfetcher ├── Manager │ └── Abstract.php ├── Dom │ ├── Abstract.php │ ├── SimpleHtmlDom.php │ └── simple_html_dom.php ├── Error.php ├── Page │ ├── Abstract.php │ └── Default.php ├── Crawler │ ├── Abstract.php │ └── Default.php ├── Log.php └── Util │ └── Trie.php ├── phpfetcher.php ├── deploy.sh ├── demo ├── structure ├── iframe_example.php ├── single_page.php ├── multi_page.php ├── crawl_baidu_page.php ├── get_picture.php └── crawl_with_headers.php └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | *.swp 3 | -------------------------------------------------------------------------------- /tests/phpinfo_test.php: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /tests/Page_test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanfank/phpfetcher/HEAD/tests/Page_test.html -------------------------------------------------------------------------------- /tests/errcode_test.php: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /Phpfetcher/Manager/Abstract.php: -------------------------------------------------------------------------------- 1 | 11 | -------------------------------------------------------------------------------- /tests/autoload.php: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /tests/return_this_test.php: -------------------------------------------------------------------------------- 1 | getInstance()->foo = 2; 11 | echo $objFoo->foo; 12 | ?> 13 | -------------------------------------------------------------------------------- /phpfetcher.php: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /tests/test_simple_html_dom.php: -------------------------------------------------------------------------------- 1 | find('//h1', 0)->plaintext); 7 | $res = $html->find('//p'); 8 | for($i = 0; $i < count($res); ++$i) { 9 | $arrContent = 10 | } 11 | for($i = 0; $i < $res->length; ++$i) { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/Abstract.php: -------------------------------------------------------------------------------- 1 | 'Success', 16 | self::ERR_INVALID_FIELD => 'Invalid field in array', 17 | self::ERR_FIELD_NOT_SET => 'Accessing a non-set field', 18 | ); 19 | 20 | public static function getErrmsg($errcode) { 21 | return self::$_arrErrcode2Errmsg[$errcode] . 
"\n"; 22 | } 23 | } 24 | ?> 25 | -------------------------------------------------------------------------------- /tests/Crawler_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'blog.reetsee' => array( 12 | 'start_page' => 'http://blog.reetsee.com', 13 | 'link_rules' => array( 14 | '/blog\.reetsee\.com/', 15 | '/wordpress/', 16 | ), 17 | ), 18 | 'qq' => array( 19 | 'start_page' => 'http://news.qq.com', 20 | 'link_rules' => array( 21 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 22 | ), 23 | 'max_depth' => 4, 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /tests/fetcher_db_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'tencent' => array( 12 | 'start_page' => 'http://news.qq.com', 13 | 'link_rules' => array( 14 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 15 | ), 16 | 'max_depth' => 4, 17 | ), 18 | 'reetsee' => array( 19 | 'start_page' => 'http://blog.reetsee.com', 20 | 'link_rules' => array( 21 | '/blog\.reetsee\.com/', 22 | '/wordpress/', 23 | ), 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /demo/structure: -------------------------------------------------------------------------------- 1 | |-phpfetcher.php //使用时include这个文件即可 2 | |-Phpfetcher //此文件夹与phpfetcher.php要在同一目录下 3 | |-Error.php //出错处理相关 4 | |-Log.php //日志相关 5 | |-Manager //暂时无用 6 | |-Crawler //Crawler目录,存放爬虫相关的类代码 7 | |-Abstract.php //爬虫基类Phpfetcher_Crawler_Abstrct 8 | |-Default.php //默认提供的爬虫类Phpfetcher_Crawler_Default 9 | |-Page //Page目录,存放页面相关的类代码 10 | |-Abstract.php //页面基类Phpfetcher_Page_Abstract 11 | |-Default.php //默认提供的页面类Phpfetcher_Page_Default 12 | |-Dom //Dom目录,存放Dom相关的类代码 13 | |-Abstract.php //Dom基类Phpfetcher_Dom_Abstract 14 | |-SimpleHtmlDom.php //默认提供的Dom类Phpfetcher_Dom_SimpleHtmlDom 15 | |-simple_html_dom.php //Phpfetcher_Dom_SimpleHtmlDom实际使用了simple_html_dom中的代码 16 | -------------------------------------------------------------------------------- /tests/curl_test.php: -------------------------------------------------------------------------------- 1 | 0) { 17 | echo "available!\n"; 18 | } 19 | curl_close($objCurl); 20 | return; 21 | } 22 | 23 | $c = 'a'; 24 | for ($i = 0; $i < 26; $i++) { 25 | getAvailableDomain($strName . $c, $intMaxDepth); 26 | ++$c; 27 | } 28 | } 29 | 30 | getAvailableDomain('', 3); 31 | ?> 32 | -------------------------------------------------------------------------------- /demo/iframe_example.php: -------------------------------------------------------------------------------- 1 | getUrl() . "] +++\n"; 9 | 10 | //选取所有包含src属性的iframe元素 11 | $arrIframes = $page->sel('//iframe[@src]'); 12 | for ($i = 0; $i < count($arrIframes); ++$i) { 13 | $strSrc = $arrIframes[$i]->src; 14 | echo "found iframe url=[" . $strSrc . "]\n"; 15 | $this->addAdditionalUrls($strSrc); 16 | } 17 | echo "--- leave page: [" . $page->getUrl() . 
"] ---\n"; 18 | } 19 | 20 | }; 21 | 22 | $crawler = new mycrawler(); 23 | $arrJobs = array( 24 | '163' => array( 25 | 'start_page' => 'http://news.163.com', 26 | 'link_rules' => array(), 27 | 'max_depth' => 2, 28 | ) , 29 | ); 30 | 31 | $crawler->setFetchJobs($arrJobs)->run(); 32 | 33 | echo "Done!\n"; 34 | -------------------------------------------------------------------------------- /tests/test_phpfetcher_util_trie.php: -------------------------------------------------------------------------------- 1 | has("ftp"), true) . "\n"; 9 | echo "has http:" . var_export($trie->has("http"), true) . "\n"; 10 | echo "has https:" . var_export($trie->has("https"), true) . "\n"; 11 | echo "\n"; 12 | } 13 | 14 | $arrSchemes = array( 15 | "http", 16 | "https", 17 | "ftp", 18 | ); 19 | $trie = new Phpfetcher_Util_Trie($arrSchemes); 20 | print_trie($trie); 21 | 22 | echo "delete 'abc'\n"; 23 | $trie->delete("abc"); 24 | print_trie($trie); 25 | 26 | echo "delete 'ftp'\n"; 27 | $trie->delete("ftp"); 28 | print_trie($trie); 29 | 30 | echo "delete 'http'\n"; 31 | $trie->delete("http"); 32 | print_trie($trie); 33 | 34 | echo "insert 'ftp'\n"; 35 | $trie->insert("ftp"); 36 | print_trie($trie); 37 | 38 | echo "delete 'https'\n"; 39 | $trie->delete("https"); 40 | print_trie($trie); 41 | 42 | echo "insert 'http'\n"; 43 | $trie->insert("http"); 44 | print_trie($trie); 45 | -------------------------------------------------------------------------------- /tests/Page_test.php: -------------------------------------------------------------------------------- 1 | init(); 6 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 7 | $page->setConfField('url', 'http://news.qq.com/a/20140921/000030.htm'); 8 | $page->read(); 9 | //echo $page->getContent(); 10 | //$DOMElement_id_oneKey = $page->selId('oneKey'); 11 | //var_dump($DOMElement_id_oneKey); 12 | //echo "\n"; 13 | //var_dump($DOMElement_id_oneKey->parentNode); 14 | //echo "\n"; 15 | //var_dump($DOMElement_id_oneKey->childNodes); 16 | //echo $DOMElement_id_oneKey->nodeValue; 17 | //print_r($page->xpath('//meta[@http-equiv]')); 18 | //var_dump($page->xpath2('//meta[@http-equiv]')); 19 | $res = $page->xpath('//title'); 20 | //$res = $page->xpath('a'); 21 | var_dump($res->item(0)->nodeValue); 22 | //var_dump($res->item(1)->nodeValue); 23 | /* 24 | $arrLinks = array(); 25 | $res = $page->xpath('//a/@href'); 26 | for($i = 0; $i < $res->length;++$i) { 27 | //var_dump($res->item($i)); 28 | $arrLinks[] = $res->item($i)->nodeValue; 29 | } 30 | */ 31 | //$arrLinks = $page->getHyperLinks(); 32 | //print_r($arrLinks); 33 | 34 | 35 | 36 | ?> 37 | -------------------------------------------------------------------------------- /demo/single_page.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com/a/20140927/026557.htm', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | ) , 35 | ); 36 | 37 | 
//$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 38 | $crawler->setFetchJobs($arrJobs); 39 | $crawler->run(); 40 | -------------------------------------------------------------------------------- /demo/multi_page.php: -------------------------------------------------------------------------------- 1 | sel('//h1', 0)->plaintext); 11 | if (!empty($strFirstH1)) { 12 | echo $page->sel('//h1', 0)->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | '#news\.qq\.com/a/\d+/\d+\.htm$#', 30 | ), 31 | //爬虫从开始页面算起,最多爬取的深度,设置为2表示爬取深度为1 32 | //Crawler's max following depth, 1 stands for only crawl the start page 33 | 'max_depth' => 2, 34 | 35 | ) , 36 | ); 37 | 38 | $crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 39 | //$crawler->setFetchJobs($arrJobs); 40 | //$crawler->run(); 41 | -------------------------------------------------------------------------------- /demo/crawl_baidu_page.php: -------------------------------------------------------------------------------- 1 | sel('//h3/a'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | echo $res[$i]->getAttribute('href'); 15 | echo "\n"; 16 | echo "\n"; 17 | } 18 | } 19 | } 20 | 21 | $crawler = new mycrawler(); 22 | $arrJobs = array( 23 | //任务的名字随便起,这里把名字叫qqnews 24 | //the key is the name of a job, here names it qqnews 25 | 'qqnews' => array( 26 | 'start_page' => 'https://www.baidu.com/s?wd=facebook', //起始网页 27 | 'link_rules' => array( 28 | /* 29 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 30 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 31 | */ 32 | ), 33 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 34 | //Crawler's max following depth, 1 stands for only crawl the start page 35 | 'max_depth' => 1, 36 | 37 | ) , 38 | ); 39 | 40 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 41 | $crawler->setFetchJobs($arrJobs); 42 | $crawler->run(); 43 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Abstract.php: -------------------------------------------------------------------------------- 1 | 标签的内容,以数组形式返回 65 | abstract function getHyperLinks(); 66 | } 67 | ?> 68 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Abstract.php: -------------------------------------------------------------------------------- 1 | _arrExtraInfo[$field]; 33 | } 34 | return $arrOutput; 35 | } 36 | */ 37 | 38 | /* 39 | public function setExtraInfo($arrInput = array()) { 40 | if (!is_array($arrInput) || empty($arrInput)) { 41 | return FALSE; 42 | } 43 | foreach ($arrInput as $key => $value) { 44 | $this->_arrExtraInfo[$key] = $value; 45 | } 46 | return TRUE; 47 | } 48 | */ 49 | 50 | /* 51 | //修改一条已有的规则 52 | public function setFetchJobByName() { 53 | Phpfetcher_Error::notice('not implemented'); 54 | } 55 | */ 56 | 57 | //运行爬虫 58 | abstract function &run($arrInput = array()); 59 | } 60 | ?> 61 | -------------------------------------------------------------------------------- /demo/get_picture.php: 
-------------------------------------------------------------------------------- 1 | sel("//p"); 10 | for ($i = 0; $i < count($objContent); ++$i) { 11 | $objPic = $objContent[$i]->find("img"); 12 | for ($j = 0; $j < count($objPic); ++$j) { 13 | echo $objPic[$j]->getAttribute('src') . "\n"; 14 | echo $objPic[$j]->getAttribute('alt') . "\n"; 15 | echo $objContent[$i]->plaintext . "\n"; 16 | echo $objContent[$i]->outertext() . "\n"; 17 | } 18 | } 19 | 20 | ////打印处当前页面的title 21 | //$res = $page->sel('//title'); 22 | //for ($i = 0; $i < count($res); ++$i) { 23 | // echo $res[$i]->plaintext; 24 | // echo "\n"; 25 | //} 26 | } 27 | } 28 | 29 | $crawler = new mycrawler(); 30 | $arrJobs = array( 31 | //任务的名字随便起,这里把名字叫qqnews 32 | //the key is the name of a job, here names it qqnews 33 | 'qqnews' => array( 34 | 'start_page' => 'http://news.163.com/16/0325/21/BJ1I6PN40001124J.html', //起始网页 35 | 'link_rules' => array( 36 | /* 37 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 38 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 39 | */ 40 | ), 41 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 42 | //Crawler's max following depth, 1 stands for only crawl the start page 43 | 'max_depth' => 1, 44 | 45 | ) , 46 | ); 47 | 48 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 49 | $crawler->setFetchJobs($arrJobs); 50 | $crawler->run(); 51 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/SimpleHtmlDom.php: -------------------------------------------------------------------------------- 1 | _dom, 'clear')) { 17 | $this->_dom->clear(); 18 | } 19 | } 20 | 21 | public function getElementById($id) { 22 | $strMethodName = 'getElementById'; 23 | if (method_exists($this->_dom, $strMethodName)) { 24 | return $this->_dom->getElementById($id); 25 | } else { 26 | Phpfetcher_Log::warning("method $strMethodName not exists"); 27 | return FALSE; 28 | } 29 | } 30 | 31 | public function getElementsByTagName($tag) { 32 | $strMethodName = 'getElementsByTagName'; 33 | if (method_exists($this->_dom, $strMethodName)) { 34 | return $this->_dom->getElementsByTagName($tag); 35 | } else { 36 | Phpfetcher_Log::warning("method $strMethodName not exists"); 37 | return FALSE; 38 | } 39 | } 40 | 41 | public function loadHTML($content) { 42 | if (NULL === $this->_dom) { 43 | if (function_exists('str_get_html')) { 44 | $this->_dom = str_get_html($content); 45 | } 46 | } else { 47 | if (method_exists($this->_dom, 'load')) { 48 | $this->_dom->load($content); 49 | } 50 | } 51 | 52 | return $this; 53 | } 54 | 55 | /** 56 | * @deprecated 57 | */ 58 | public function sel($pattern = '', $idx = NULL, $node = NULL) { 59 | return $this->find($pattern, $idx); 60 | } 61 | 62 | public function find($pattern = '', $idx = NULL) { 63 | $strMethodName = 'find'; 64 | if (method_exists($this->_dom, $strMethodName)) { 65 | return $this->_dom->find($pattern, $idx); 66 | } else { 67 | Phpfetcher_Log::warning("method $strMethodName not exists"); 68 | return FALSE; 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Phpfetcher/Log.php: -------------------------------------------------------------------------------- 1 | $intTraceDepth) { 62 | $intDepth = $intTraceDepth; 63 | } 64 | $arrTargetTrace = $arrTrace[$intDepth]; 65 | unset($arrTrace); 66 | if (isset($arrTargetTrace['file'])) { 67 | $arrTargetTrace['file'] = basename($arrTargetTrace['file']); 68 | } 69 | 70 | $strPrepend = strval(@date("Y-m-d H:i:s")) . 
" {$arrTargetTrace['file']} {$arrTargetTrace['class']} {$arrTargetTrace['function']} {$arrTargetTrace['line']} " . $strPrepend . ' '; 71 | 72 | $strMsg = $strPrepend . $strMsg . $strAppend; 73 | 74 | echo $strMsg; 75 | } 76 | } 77 | ?> 78 | -------------------------------------------------------------------------------- /Phpfetcher/Util/Trie.php: -------------------------------------------------------------------------------- 1 | _arrTrieRoot = array( 14 | 'children' => array(), 15 | 'count' => 0, 16 | ); 17 | foreach ($arrStrings as $str) { 18 | $this->insert($str); 19 | } 20 | } 21 | 22 | public function insert($str) { 23 | try { 24 | $str = strval($str); 25 | $intLen = strlen($str); 26 | $arrCurNode = &$this->_arrTrieRoot; 27 | 28 | for ($i = 0; $i < $intLen; ++$i) { 29 | if (!isset($arrCurNode['children'][$str[$i]])) { 30 | $arrCurNode['children'][$str[$i]] = array( 31 | 'children' => array(), 32 | 'count' => 0, 33 | ); 34 | } 35 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 36 | } 37 | 38 | $arrCurNode['count'] += 1; 39 | unset($arrCurNode); 40 | 41 | } catch (Exception $e) { 42 | Phpfetcher_Log::fatal($e->getMessage()); 43 | return false; 44 | } 45 | 46 | return true; 47 | } 48 | 49 | public function delete($str) { 50 | $arrCurNode = &$this->_locateNode($str); 51 | if (!is_null($arrCurNode) && $arrCurNode['count'] > 0) { 52 | $arrCurNode['count'] -= 1; 53 | } 54 | unset($arrCurNode); 55 | return true; 56 | } 57 | 58 | public function has($str) { 59 | $arrTargetNode = &$this->_locateNode($str); 60 | $bolRes = false; 61 | if (!is_null($arrTargetNode) && $arrTargetNode['count'] > 0) { 62 | $bolRes = true; 63 | } 64 | unset($arrTargetNode); 65 | return $bolRes; 66 | } 67 | 68 | protected function &_locateNode($str) { 69 | $str = strval($str); 70 | $intLen = strlen($str); 71 | $arrCurNode = &$this->_arrTrieRoot; 72 | 73 | for ($i = 0; $i < $intLen; ++$i) { 74 | if (!isset($arrCurNode['children'][$str[$i]])) { 75 | return null; 76 | } 77 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 78 | } 79 | 80 | return $arrCurNode; 81 | } 82 | 83 | //public function startsWith($str) { 84 | // $str = strval($str); 85 | // //TODO 86 | //} 87 | }; 88 | -------------------------------------------------------------------------------- /demo/crawl_with_headers.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://jianli.58.com/resume/93489192884492', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | //某些页面做了防抓取策略,可以通过修改UA,或者添加必要的HTTP Header来防止屏蔽 35 | //Some pages may prevent crawlers from working, you may change UA or add 36 | // necessary HTTP Headers to prevent this. 
37 | 'page_conf' => array( 38 | 'http_header' => array( 39 | //如果本例子对于你来说运行不成功(发生了错误),那么请将下面的Header 40 | // 替换成与你浏览器请求Header一样的内容,但是不要添加Accept-Encoding 41 | // 这个Header 42 | //If this example can not run successfully, please replace the Headers 43 | // below with the ones exactly you see from your browser. Remember 44 | // not to add Accept-Encoding header. 45 | 'Host: jianli.m.58.com', 46 | 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 47 | 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | 'Cookie: 58home=tj; id58=c5/ns1enV2k5MFGqLUAXAg==; city=tj; 58tj_uuid=1cf71e54-dd15-4922-8228-b6bb809edbfd; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; myfeet_tooltip=end; als=0; Hm_lvt_2557cda77f2e9a8b94531c9501582142=1470585797; Hm_lpvt_2557cda77f2e9a8b94531c9501582142=1470585797; 4drh9g=test insert val', 49 | 'Connection: keep-alive', 50 | 'Cache-Control: max-age=0', 51 | 52 | //不要添加Accept-Encoding的Header 53 | //Do not add Accept-Encoding Header 54 | //'Accept-Encoding: gzip, deflate' 55 | ), 56 | ), 57 | ) , 58 | ); 59 | 60 | $crawler->setFetchJobs($arrJobs); 61 | $crawler->run(); 62 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Default.php: -------------------------------------------------------------------------------- 1 | CURLOPT_HEADER, 13 | 'exclude_body' => CURLOPT_NOBODY, 14 | 'is_post' => CURLOPT_POST, 15 | 'is_verbose' => CURLOPT_VERBOSE, 16 | 'return_transfer'=> CURLOPT_RETURNTRANSFER, 17 | 18 | /* int */ 19 | 'buffer_size' => CURLOPT_BUFFERSIZE, 20 | 'connect_timeout' => CURLOPT_CONNECTTIMEOUT, 21 | 'connect_timeout_ms' => CURLOPT_CONNECTTIMEOUT_MS, 22 | 'dns_cache_timeout' => CURLOPT_DNS_CACHE_TIMEOUT, 23 | 'max_redirs' => CURLOPT_MAXREDIRS, 24 | 'port' => CURLOPT_PORT, 25 | 'timeout' => CURLOPT_TIMEOUT, 26 | 'timeout_ms' => CURLOPT_TIMEOUT_MS, 27 | 28 | /* string */ 29 | 'cookie' => CURLOPT_COOKIE, 30 | 'cookie_file' => CURLOPT_COOKIEFILE, 31 | 'cookie_jar' => CURLOPT_COOKIEJAR, 32 | 'post_fields' => CURLOPT_POSTFIELDS, 33 | 'url' => CURLOPT_URL, 34 | 'user_agent' => CURLOPT_USERAGENT, 35 | 'user_pwd' => CURLOPT_USERPWD, 36 | 37 | /* array */ 38 | 'http_header' => CURLOPT_HTTPHEADER, 39 | 40 | /* stream resource */ 41 | 'file' => CURLOPT_FILE, 42 | 43 | /* function or a Closure */ 44 | 'write_function' => CURLOPT_WRITEFUNCTION, 45 | 46 | /* https */ 47 | 'ssl_verifypeer' => CURLOPT_SSL_VERIFYPEER, 48 | ); 49 | 50 | protected $_arrDefaultConf = array( 51 | 'connect_timeout' => 10, 52 | 'max_redirs' => 10, 53 | 'return_transfer' => 1, //need this 54 | 'timeout' => 15, 55 | 'url' => NULL, 56 | 'user_agent' => 'firefox', 57 | 'ssl_verifypeer' => false, 58 | ); 59 | 60 | protected $_arrConf = array(); 61 | protected $_arrExtraInfo = array(); 62 | protected $_bolCloseCurlHandle = FALSE; 63 | protected $_curlHandle = NULL; 64 | protected $_dom = NULL; 65 | //protected $_xml = NULL; 66 | 67 | public function __construct() { 68 | } 69 | public function __destruct() { 70 | if ($this->_bolCloseCurlHandle) { 71 | curl_close($this->_curlHandle); 72 | } 73 | } 74 | 75 | public static function formatRes($data, $errcode, $errmsg = NULL) { 76 | if ($errmsg === NULL) { 77 | $errmsg = Phpfetcher_Error::getErrmsg($errcode); 78 | } 79 | return array('errcode' => $errcode, 'errmsg' => $errmsg, 'res' => $data); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @desc get configurations. 
85 | */ 86 | public function getConf() { 87 | return $this->_arrConf; 88 | } 89 | 90 | /** 91 | * @author xuruiqi 92 | * @param $key: specified field 93 | * @return 94 | * bool : false when field doesn't exist 95 | * mixed : otherwise 96 | * @desc get a specified configuration. 97 | */ 98 | public function getConfField($key) { 99 | if (isset($this->_arrConf[$key])) { 100 | return self::formatRes($this->_arrConf[$key], Phpfetcher_Error::ERR_SUCCESS); 101 | } else { 102 | return self::formatRes(NULL, Phpfetcher_Error::ERR_FIELD_NOT_SET); 103 | } 104 | } 105 | 106 | public function getContent() { 107 | return $this->_strContent; 108 | } 109 | 110 | public function getExtraInfo($arrInput) { 111 | $arrOutput = array(); 112 | foreach ($arrInput as $req_key) { 113 | $arrOutput[$req_key] = $this->_arrExtraInfo[$req_key]; 114 | } 115 | return $arrOutput; 116 | } 117 | 118 | public function getHyperLinks() { 119 | $arrLinks = array(); 120 | $res = $this->sel('//a'); 121 | for ($i = 0; $i < count($res); ++$i) { 122 | $arrLinks[] = $res[$i]->href; 123 | } 124 | /* 125 | foreach ($res as $node) { 126 | $arrLinks[] = $node->href; 127 | } 128 | */ 129 | return $arrLinks; 130 | } 131 | 132 | /** 133 | * @author xuruiqi 134 | * @param 135 | * @return 136 | * string : current page's url 137 | * @desc get this page's URL. 138 | */ 139 | public function getUrl() { 140 | $arrRet = $this->getConfField('url'); 141 | return strval($arrRet['res']); 142 | } 143 | 144 | /** 145 | * @author xuruiqi 146 | * @param 147 | * array $conf : configurations 148 | * bool $clear_default : whether to clear default options not set in $conf 149 | * @return 150 | * @desc initialize this instance with specified or default configuration 151 | */ 152 | public function init($curl_handle = NULL, $conf = array()) { 153 | $this->_curlHandle = $curl_handle; 154 | if (empty($this->_curlHandle)) { 155 | $this->_curlHandle = curl_init(); 156 | $this->_bolCloseCurlHandle = TRUE; 157 | } 158 | $this->_arrConf = $this->_arrDefaultConf; 159 | 160 | $this->setConf($conf, TRUE); 161 | 162 | return $this; 163 | } 164 | 165 | /** 166 | * @author xuruiqi 167 | * @param 168 | * array $ids : elements' ids 169 | * @return 170 | * array : array of DOMElement, with keys equal each of ids 171 | * NULL : if $this->_dom equals NULL 172 | * @desc select spcified elements with their ids. 
173 | */ 174 | /* 175 | public function mselId($ids) { 176 | if ($this->_dom === NULL) { 177 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 178 | return NULL; 179 | } 180 | 181 | $arrOutput = array(); 182 | foreach ($ids as $id) { 183 | $arrOutput[$id] = $this->selId($id); 184 | } 185 | return $arrOutput; 186 | } 187 | */ 188 | 189 | /** 190 | * @author xuruiqi 191 | * @param 192 | * array $tags : elements' tags 193 | * @return 194 | * array : array of DOMNodeList, with keys equal each of tags 195 | * NULL : if $this->_dom equals NULL 196 | * @desc select spcified elements with their tags 197 | */ 198 | /* 199 | public function mselTagName($tags) { 200 | if ($this->_dom === NULL) { 201 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 202 | return NULL; 203 | } 204 | 205 | $arrOutput = array(); 206 | foreach ($tags as $tag) { 207 | $arrOutput[$tag] = $this->selId($tag); 208 | } 209 | return $arrOutput; 210 | } 211 | */ 212 | 213 | 214 | /** 215 | * @author xuruiqi 216 | * @param 217 | * array $conf : configurations 218 | * bool $clear_previous_conf : if TRUE, then before set $conf, reset current configuration to its default value 219 | * @return 220 | * array : previous conf 221 | * @desc set configurations. 222 | */ 223 | public function setConf($conf = array(), $clear_previous_conf = FALSE) { 224 | $previous_conf = $this->_arrConf; 225 | if ($clear_previous_conf === TRUE) { 226 | $this->_arrConf = $this->_arrDefaultConf; 227 | } 228 | foreach ($conf as $k => $v) { 229 | $this->_arrConf[$k] = $v; 230 | } 231 | 232 | $bolRes = TRUE; 233 | if ($clear_previous_conf === TRUE) { 234 | $bolRes = $this->_setConf($this->_arrConf); 235 | } else { 236 | $bolRes = $this->_setConf($conf); 237 | } 238 | 239 | if ($bolRes != TRUE) { 240 | $this->_arrConf = $previous_conf; 241 | $this->_setConf($this->_arrConf); 242 | return $bolRes; 243 | } 244 | 245 | return $previous_conf; 246 | } 247 | 248 | protected function _setConf($conf = array()) { 249 | $arrCurlOpts = array(); 250 | foreach ($conf as $k => $v) { 251 | if (isset(self::$_arrField2CurlOpt[$k])) { 252 | $arrCurlOpts[self::$_arrField2CurlOpt[$k]] = $v; 253 | } else { 254 | //currently only curl options can be set 255 | $arrCurlOpts[$k] = $v; 256 | } 257 | } 258 | return curl_setopt_array($this->_curlHandle, $arrCurlOpts); 259 | } 260 | 261 | public function setExtraInfo($arrInput) { 262 | foreach ($arrInput as $key => $val) { 263 | $this->_arrExtraInfo[$key] = $val; 264 | } 265 | } 266 | 267 | /** 268 | * @author xuruiqi 269 | * @param 270 | * string $id : specifed element id 271 | * @return 272 | * object : DOMElement or NULL is not found 273 | * NULL : if $this->_dom equals NULL 274 | * @desc select a spcified element via its id. 275 | */ 276 | public function selId($id) { 277 | if ($this->_dom === NULL) { 278 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 279 | return NULL; 280 | } 281 | 282 | return $this->_dom->getElementById($id); 283 | } 284 | 285 | /** 286 | * @author xuruiqi 287 | * @param 288 | * string $tag : specifed elements' tag name 289 | * @return 290 | * object : a traversable DOMNodeList object containing all the matched elements 291 | * NULL : if $this->_dom equals NULL 292 | * @desc select spcified elements via its tag name. 
293 | */ 294 | public function selTagName($tag) { 295 | if ($this->_dom === NULL) { 296 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 297 | return NULL; 298 | } 299 | 300 | return $this->_dom->getElementsByTagName($tag); 301 | } 302 | 303 | public function setConfField($field, $value) { 304 | $this->_arrConf[$field] = $value; 305 | return $this->_setConfField($field, $value); 306 | } 307 | 308 | protected function _setConfField($field, $value) { 309 | if (isset(self::$_arrField2CurlOpt[$field])) { 310 | return curl_setopt($this->_curlHandle, self::$_arrField2CurlOpt[$field], $value); 311 | } else { 312 | //currently only curl options can be set 313 | return curl_setopt($this->_curlHandle, $field, $value); 314 | } 315 | } 316 | 317 | /** 318 | * @author xuruiqi 319 | * @param 320 | * string $url : the URL 321 | * @return 322 | * string : previous URL 323 | * @desc set this page's URL. 324 | */ 325 | public function setUrl($url) { 326 | $previous_url = $this->_arrConf['url']; 327 | $this->setConfField('url', $url); 328 | return $previous_url; 329 | } 330 | 331 | /** 332 | * @author xuruiqi 333 | * @param 334 | * array $arrHeaderList : header list 335 | * e.g. 336 | * array( 337 | * ... 338 | * "Cookie: xxxxx", 339 | * ... 340 | * "Header_n: header_n_value", 341 | * ) 342 | * @return 343 | * this 344 | * @desc set header of the next fetch 345 | */ 346 | public function &setHeaders($arrHeaderList) { 347 | $this->setConf(array( 348 | "http_header" => $arrHeaderList 349 | )); 350 | return $this; 351 | } 352 | 353 | /** 354 | * @author xuruiqi 355 | * @param 356 | * @return 357 | * string : return page's content 358 | * bool : if failed return FALSE 359 | * @desc get page's content, and save it into member variable <_strContent> 360 | */ 361 | public function read() { 362 | $this->_strContent = curl_exec($this->_curlHandle); 363 | if ($this->_strContent != FALSE) { 364 | $matches = array(); 365 | preg_match('#charset="?([a-zA-Z0-9-\._]+)"?#', $this->_strContent, $matches); 366 | if (!empty($matches[1])) { 367 | //Phpfetcher_Log::notice("Convert content from {$matches[1]} to UTF-8\n"); 368 | $this->_strContent = mb_convert_encoding($this->_strContent, 'UTF-8', $matches[1]); 369 | } 370 | 371 | /* 372 | $this->_dom = new DOMDocument(); //DOMDocument's compatibility is bad 373 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 374 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 375 | $this->_dom = NULL; 376 | $this->_domxpath = NULL; 377 | } else { 378 | $this->_domxpath = new DOMXPath($this->_dom); 379 | } 380 | */ 381 | 382 | $this->_dom = new Phpfetcher_Dom_SimpleHtmlDom(); 383 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 384 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 385 | $this->_dom = NULL; 386 | } 387 | } 388 | return $this->_strContent; 389 | } 390 | 391 | /** 392 | * @author xuruiqi 393 | * @param 394 | * string $strPath : xpath's path 395 | * [DOMNode $contextnode : The optional contextnode can be specified for doing relative XPath queries. By default, the queries are relative to the root element.] 
396 | * 397 | * @return 398 | * DOMNodelist : DOMNodelist object 399 | * NULL : if $this->_dom equals NULL 400 | * false : if error occurs 401 | * @desc select corresponding content use xpath 402 | */ 403 | public function sel($strPath, $intIndex = NULL, $contextnode = NULL) { 404 | if ($this->_dom === NULL) { 405 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 406 | return NULL; 407 | } 408 | 409 | if ($contextnode !== NULL) { 410 | //$res = $this->_domxpath->query($strPath, $contextnode); 411 | Phpfetcher_Log::warning('param contextnode is no use because of this function\'s inability'); 412 | $res = $this->_dom->sel($strPath, $intIndex); 413 | } else { 414 | //$res = $this->_domxpath->query($strPath); 415 | $res = $this->_dom->sel($strPath, $intIndex); 416 | } 417 | 418 | return $res; 419 | } 420 | } 421 | ?> 422 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Default.php: -------------------------------------------------------------------------------- 1 | self::STR_TYPE, 24 | 'link_rules' => self::ARR_TYPE, 25 | 'max_depth' => self::INT_TYPE, 26 | 'max_pages' => self::INT_TYPE, 27 | ); 28 | 29 | /* 30 | protected static $arrJobDefaultFields = array( 31 | 'max_depth' => self::MAX_DEPTH, 32 | 'max_pages' => self::MAX_PAGE_NUM, 33 | ); 34 | */ 35 | 36 | protected $_arrFetchJobs = array(); 37 | protected $_arrHash = array(); 38 | protected $_arrAdditionalUrls = array(); 39 | protected $_objSchemeTrie = array(); //合法url scheme的字典树 40 | //protected $_objPage = NULL; //Phpfetcher_Page_Default; 41 | 42 | public function __construct($arrInitParam = array()) { 43 | if (!isset($arrInitParam['url_schemes'])) { 44 | $arrInitParam['url_schemes'] = array("http", "https", "ftp"); 45 | } 46 | 47 | $this->_objSchemeTrie = 48 | new Phpfetcher_Util_Trie($arrInitParam['url_schemes']); 49 | } 50 | 51 | /** 52 | * @author xuruiqi 53 | * @param 54 | * array $arrInput: 55 | * array <任务名1> : 56 | * string 'start_page', //爬虫的起始页面 57 | * array 'link_rules': //爬虫跟踪的超链接需要满足的正则表达式,依次检查规则,匹配其中任何一条即可 58 | * string 0, //正则表达式1 59 | * string 1, //正则表达式2 60 | * ... 61 | * string n-1, //正则表达式n 62 | * int 'max_depth' , //爬虫最大的跟踪深度,目前限制最大值不超过20 63 | * int 'max_pages' , //最多爬取的页面数,默认指定为-1,表示没有限制 64 | * array <任务名2> : 65 | * ... 66 | * ... 67 | * ... 68 | * array <任务名n-1>: 69 | * ... 70 | * ... 71 | * 72 | * @return 73 | * Object $this : returns the instance itself 74 | * @desc add by what rules the crawler should fetch the pages 75 | * if a job has already been in jobs queue, new rules will 76 | * cover the old ones. 77 | */ 78 | public function &addFetchJobs($arrInput = array()) { 79 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_ADD); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @param 85 | * array $arrInput : 86 | * mixed 0 : 87 | * 任务名 88 | * mixed 1 : 89 | * 任务名 90 | * ... ... 
91 | * @return 92 | * Object $this : returns the instance itself 93 | * @desc delete fetch rules according to job names 94 | */ 95 | public function &delFetchJobs($arrInput = array()) { 96 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_DEL); 97 | } 98 | 99 | public function getFetchJobByName($job_name) { 100 | return $this->_arrFetchJobs[$strJobName]; 101 | } 102 | 103 | public function getFetchJobs() { 104 | return $this->_arrFetchJobs; 105 | } 106 | 107 | /* 108 | public function handlePage() { 109 | //由用户继承本类并实现此方法 110 | } 111 | */ 112 | 113 | /** 114 | * @author xuruiqi 115 | * @param : 116 | * //$intOptType === MODIFY_JOBS_SET|MODIFY_JOBS_ADD, 117 | * $arrInput参见addFetchJobs的入参$arrInput 118 | * //$intOptType === MODIFY_JOBS_DEL, 119 | * $arrInput参见delFetchJobs的入参$arrInput 120 | * 121 | * @return 122 | * Object $this : returns the instance itself 123 | * @desc set fetch rules. 124 | */ 125 | protected function &_modifyFetchJobs($arrInput = array(), $intOptType) { 126 | $arrInvalidJobs = array(); 127 | if ($intOptType === self::MODIFY_JOBS_SET || $intOptType === self::MODIFY_JOBS_ADD) { 128 | if ($intOptType === self::MODIFY_JOBS_SET) { 129 | $this->_arrFetchJobs = array(); 130 | } 131 | foreach ($arrInput as $job_name => $job_rules) { 132 | $this->_correctJobParam($job_rules); 133 | if ($this->_isJobValid($job_rules)) { 134 | $this->_arrFetchJobs[$job_name] = $job_rules; 135 | } else { 136 | $arrInvalidJobs[] = $job_name; 137 | } 138 | } 139 | } else if ($intOptType === self::MODIFY_JOBS_DEL) { 140 | foreach ($arrInput as $job_name) { 141 | unset($this->_arrFetchJobs[$job_name]); 142 | } 143 | } else { 144 | Phpfetcher_Log::warning("Unknown options for fetch jobs [{$intOptType}]"); 145 | } 146 | 147 | 148 | if (!empty($arrInvalidJobs)) { 149 | Phpfetcher_Log::notice('Invalid jobs:' . implode(',', $arrInvalidJobs)); 150 | } 151 | return $this; 152 | } 153 | 154 | /** 155 | * @author xuruiqi 156 | * @param : 参见addFetchJobs的入参$arrInput 157 | * 158 | * @return 159 | * Object $this : returns the instance itself 160 | * @desc set fetch jobs. 161 | */ 162 | public function &setFetchJobs($arrInput = array()) { 163 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_SET); 164 | } 165 | 166 | /** 167 | * @author xuruiqi 168 | * @param 169 | * array $arrInput : //运行设定 170 | * string 'page_class_name' : //指定要使用的Page类型,必须是 171 | * //Phpfetcher_Page_Abstract的 172 | * //子类 173 | * [array 'page_conf'] : //Page调用setConf时的输入参数,可选 174 | * @return 175 | * obj $this 176 | * @desc 177 | */ 178 | public function &run($arrInput = array()) { 179 | if (empty($this->_arrFetchJobs)) { 180 | Phpfetcher_Log::warning("No fetch jobs."); 181 | return $this; 182 | } 183 | 184 | //构建Page对象 185 | $objPage = NULL; 186 | $strPageClassName = self::DEFAULT_PAGE_CLASS; 187 | if (!empty($arrInput['page_class_name'])) { 188 | $strPageClassName = strval($arrInput['page_class_name']); 189 | } 190 | try { 191 | if (!class_exists($strPageClassName, TRUE)) { 192 | throw new Exception("[$strPageClassName] class not exists!"); 193 | } 194 | 195 | $objPage = new $strPageClassName; 196 | if (!($objPage instanceof Phpfetcher_Page_Abstract)) { 197 | throw new Exception("[$strPageClassName] is not an instance of " . self::ABSTRACT_PAGE_CLASS); 198 | } 199 | } catch (Exception $e) { 200 | Phpfetcher_Log::fatal($e->getMessage()); 201 | return $this; 202 | } 203 | 204 | //初始化Page对象 205 | $arrPageConf = empty($arrInput['page_conf']) ? 
array() : $arrInput['page_conf']; 206 | $objPage->init(); 207 | if (!empty($arrPageConf)) { 208 | if(isset($arrPageConf['url'])) { 209 | unset($arrPageConf['url']); 210 | } 211 | $objPage->setConf($arrPageConf); 212 | } 213 | 214 | //遍历任务队列 215 | foreach ($this->_arrFetchJobs as $job_name => $job_rules) { 216 | if (!($this->_isJobValid($job_rules))) { 217 | Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]"); 218 | continue; 219 | } 220 | 221 | //检查是否需要设置curl配置 222 | if (!empty($job_rules['page_conf'])) { 223 | $objPage->setConf($job_rules['page_conf']); 224 | } 225 | 226 | $intDepth = 0; 227 | $intPageNum = 0; 228 | $arrIndice = array(0, 1); 229 | $arrJobs = array( 230 | 0 => array($job_rules['start_page']), 231 | 1 => array(), 232 | ); 233 | 234 | //开始爬取 235 | while (!empty($arrJobs[$arrIndice[0]]) 236 | && ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth']) 237 | && ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 238 | 239 | $intDepth += 1; 240 | $intPopIndex = $arrIndice[0]; 241 | $intPushIndex = $arrIndice[1]; 242 | $arrJobs[$intPushIndex] = array(); 243 | foreach ($arrJobs[$intPopIndex] as $url) { 244 | if (!($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 245 | break; 246 | } 247 | $objPage->setUrl($url); 248 | $objPage->read(); 249 | 250 | //获取所有的超链接 251 | $arrLinks = $objPage->getHyperLinks(); 252 | 253 | //解析当前URL的各个组成部分,以应对超链接中存在站内链接 254 | //的情况,如"/entry"等形式的URL 255 | $strCurUrl = $objPage->getUrl(); 256 | $arrUrlComponents = parse_url($strCurUrl); 257 | 258 | //匹配超链接 259 | foreach ($job_rules['link_rules'] as $link_rule) { 260 | foreach ($arrLinks as $link) { 261 | if (substr($link, 0, 2) == "//") { 262 | $link = substr($link, 1); 263 | } 264 | 265 | if (preg_match($link_rule, $link) === 1 266 | && !$this->getHash($link)) { 267 | 268 | //拼出实际的URL 269 | $real_link = $link; 270 | 271 | //不使用strpos,防止扫描整个字符串 272 | //这里只需要扫描前6个字符即可 273 | $colon_pos = false; 274 | for ($i = 0; $i <= 5; ++$i) { 275 | if ($link[$i] == ':') { 276 | $colon_pos = $i; 277 | break; 278 | } 279 | } 280 | 281 | if ($colon_pos === false 282 | || !$this->_objSchemeTrie->has( 283 | substr($link, 0, $colon_pos))) { 284 | //将站内地址转换为完整地址 285 | $real_link = $arrUrlComponents['scheme'] 286 | . "://" 287 | . $arrUrlComponents['host'] 288 | . (isset($arrUrlComponents['port']) 289 | && strlen($arrUrlComponents['port']) != 0 ? 290 | ":{$arrUrlComponents['port']}" : 291 | "") 292 | . ($link[0] == '/' ? 
293 | $link : "/$link"); 294 | } 295 | 296 | $this->setHash($link, true); 297 | $this->setHash($real_link, true); 298 | $arrJobs[$intPushIndex][] = $real_link; 299 | } 300 | } 301 | } 302 | 303 | //由用户实现handlePage函数 304 | $objPage->setExtraInfo(array('job_name' => $job_name )); 305 | $this->handlePage($objPage); 306 | $intPageNum += 1; 307 | } 308 | 309 | if (!empty($this->_arrAdditionalUrls)) { 310 | $arrJobs[$intPushIndex] = 311 | array_merge($arrJobs[$intPushIndex], 312 | $this->_arrAdditionalUrls); 313 | $this->_arrAdditionalUrls = array(); 314 | } 315 | 316 | self::_swap($arrIndice[0], $arrIndice[1]); 317 | } 318 | } 319 | return $this; 320 | } 321 | 322 | protected function _correctJobParam(&$job_rules) { 323 | /* 324 | foreach (self::$arrJobDefaultFields as $field => $value) { 325 | if (!isset($job_rules[$field]) || ($job_rules[''])) 326 | } 327 | */ 328 | if (!isset($job_rules['max_depth']) || (self::MAX_DEPTH !== -1 && self::MAX_DEPTH < $job_rules['max_depth'])) { 329 | $job_rules['max_depth'] = self::MAX_DEPTH; 330 | } 331 | 332 | if (!isset($job_rules['max_pages']) || (self::MAX_PAGE_NUM !== -1 && self::MAX_PAGE_NUM < $job_rules['max_pages'])) { 333 | $job_rules['max_pages'] = self::MAX_PAGE_NUM; 334 | } 335 | } 336 | 337 | /** 338 | * @author xuruiqi 339 | * @desc check if a rule is valid 340 | */ 341 | protected function _isJobValid($arrRule) { 342 | foreach (self::$arrJobFieldTypes as $field => $type) { 343 | if (!isset($arrRule[$field]) || ($type === self::ARR_TYPE && !is_array($arrRule[$field]))) { 344 | return FALSE; 345 | } 346 | } 347 | return TRUE; 348 | } 349 | 350 | protected static function _swap(&$a, &$b) { 351 | $tmp = $a; 352 | $a = $b; 353 | $b = $tmp; 354 | } 355 | 356 | public function getHash($strRawKey) { 357 | $strRawKey = strval($strRawKey); 358 | $strKey = md5($strRawKey); 359 | if (isset($this->_arrHash[$strKey])) { 360 | return $this->_arrHash[$strKey]; 361 | } 362 | return NULL; 363 | } 364 | 365 | public function setHash($strRawKey, $value) { 366 | $strRawKey = strval($strRawKey); 367 | $strKey = md5($strRawKey); 368 | $this->_arrHash[$strKey] = $value; 369 | } 370 | 371 | public function setHashIfNotExist($strRawKey, $value) { 372 | $strRawKey = strval($strRawKey); 373 | $strKey = md5($strRawKey); 374 | 375 | $bolExist = true; 376 | if (!isset($this->_arrHash[$strKey])) { 377 | $this->_arrHash[$strKey] = $value; 378 | $bolExist = false; 379 | } 380 | 381 | return $bolExist; 382 | } 383 | 384 | public function clearHash() { 385 | $this->_arrHash = array(); 386 | } 387 | 388 | public function addAdditionalUrls($url) { 389 | if (!is_array($url)) { 390 | $url = array($url); 391 | } 392 | 393 | $intAddedNum = 0; 394 | foreach ($url as $strUrl) { 395 | $strUrl = strval($strUrl); 396 | 397 | if ($this->setHashIfNotExist($strUrl, true) === false) { 398 | $this->_arrAdditionalUrls[] = $strUrl; 399 | ++$intAddedNum; 400 | } 401 | } 402 | 403 | return $intAddedNum; 404 | } 405 | }; 406 | ?> 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Phpfetcher - a simple web crawler framework 2 | 3 | ## 重要修改记录 - Important Improvements Log 4 | 2017-03-13 支持形如“//xxx.com/abc/def”的超链接 5 | Support hyperlinks like "//xxx.com/abc/def" 6 | 2016-09-08 支持HTTPS 7 | Support HTTPS websites 8 | 2016-08-08 支持对爬虫设置Header。 9 | Crawlers with Headers supported. 10 | 2016-03-26 PHP7测试通过。 11 | Have PHP7 tested. 
12 | 2015-10-26 可以爬取网站内链(如"/entry"的超链接)。 13 | Able to crawl website internal hyperlinks (say "/entry"). 14 | 15 | ## 中文说明(Scroll Down to See The English Description) 16 | A simple PHP crawler framework. 17 | For the origin of this framework, see: http://blog.reetsee.com/archives/366 18 | PHP must have the curl and mbstring extensions enabled. 19 | Supports PHP5 and PHP7. 20 | ### 1 Examples 21 | Run every example below from inside the `demo` directory; that is, if an example's file name is `hello_world.php`, the command you run should be `php hello_world.php`, not `php demo/hello_world.php`. 22 | #### 1.1 获取页面中`
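The demo scripts all follow the same pattern: subclass `Phpfetcher_Crawler_Default`, implement `handlePage()`, then set the fetch jobs and run. Below is a minimal sketch of that pattern, condensed from `demo/single_page.php`; the class name `mycrawler` and the relative `require_once` path are illustrative only.

```php
<?php
// Minimal usage sketch, condensed from demo/single_page.php.
// Run it from inside the demo/ directory, as the note above says,
// so the relative path to phpfetcher.php resolves.
require_once '../phpfetcher.php';

class mycrawler extends Phpfetcher_Crawler_Default {
    // Called once for every page the crawler fetches.
    public function handlePage($page) {
        // Print the text of each <title> element on the page.
        $res = $page->sel('//title');
        for ($i = 0; $i < count($res); ++$i) {
            echo $res[$i]->plaintext . "\n";
        }
    }
}

$crawler = new mycrawler();
$crawler->setFetchJobs(array(
    'qqnews' => array(
        'start_page' => 'http://news.qq.com/a/20140927/026557.htm', // page to start from
        'link_rules' => array(), // regexes for hyperlinks to follow; empty = follow none
        'max_depth'  => 1,       // 1 means only the start page is crawled
    ),
))->run();
```

`setFetchJobs()` returns the crawler itself, so it can be chained with `run()` or called on two separate lines, exactly as the demos show.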