├── .gitignore ├── tests ├── phpinfo_test.php ├── Page_test.html ├── errcode_test.php ├── autoload.php ├── return_this_test.php ├── test_simple_html_dom.php ├── Crawler_test.php ├── fetcher_db_test.php ├── curl_test.php ├── test_phpfetcher_util_trie.php ├── Page_test.php └── simple_html_dom.php ├── Phpfetcher ├── Manager │ └── Abstract.php ├── Dom │ ├── Abstract.php │ ├── SimpleHtmlDom.php │ └── simple_html_dom.php ├── Error.php ├── Page │ ├── Abstract.php │ └── Default.php ├── Crawler │ ├── Abstract.php │ └── Default.php ├── Log.php └── Util │ └── Trie.php ├── phpfetcher.php ├── deploy.sh ├── demo ├── structure ├── iframe_example.php ├── single_page.php ├── multi_page.php ├── crawl_baidu_page.php ├── get_picture.php └── crawl_with_headers.php └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | *.swp 3 | -------------------------------------------------------------------------------- /tests/phpinfo_test.php: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /tests/Page_test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanfank/phpfetcher/HEAD/tests/Page_test.html -------------------------------------------------------------------------------- /tests/errcode_test.php: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /Phpfetcher/Manager/Abstract.php: -------------------------------------------------------------------------------- 1 | 11 | -------------------------------------------------------------------------------- /tests/autoload.php: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /tests/return_this_test.php: -------------------------------------------------------------------------------- 1 | getInstance()->foo = 2; 11 | echo $objFoo->foo; 12 | ?> 13 | -------------------------------------------------------------------------------- /phpfetcher.php: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /tests/test_simple_html_dom.php: -------------------------------------------------------------------------------- 1 | find('//h1', 0)->plaintext); 7 | $res = $html->find('//p'); 8 | for($i = 0; $i < count($res); ++$i) { 9 | $arrContent = 10 | } 11 | for($i = 0; $i < $res->length; ++$i) { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/Abstract.php: -------------------------------------------------------------------------------- 1 | 'Success', 16 | self::ERR_INVALID_FIELD => 'Invalid field in array', 17 | self::ERR_FIELD_NOT_SET => 'Accessing a non-set field', 18 | ); 19 | 20 | public static function getErrmsg($errcode) { 21 | return self::$_arrErrcode2Errmsg[$errcode] . "\n"; 22 | } 23 | } 24 | ?> 25 | -------------------------------------------------------------------------------- /tests/Crawler_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'blog.reetsee' => array( 12 | 'start_page' => 'http://blog.reetsee.com', 13 | 'link_rules' => array( 14 | '/blog\.reetsee\.com/', 15 | '/wordpress/', 16 | ), 17 | ), 18 | 'qq' => array( 19 | 'start_page' => 'http://news.qq.com', 20 | 'link_rules' => array( 21 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 22 | ), 23 | 'max_depth' => 4, 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /tests/fetcher_db_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'tencent' => array( 12 | 'start_page' => 'http://news.qq.com', 13 | 'link_rules' => array( 14 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 15 | ), 16 | 'max_depth' => 4, 17 | ), 18 | 'reetsee' => array( 19 | 'start_page' => 'http://blog.reetsee.com', 20 | 'link_rules' => array( 21 | '/blog\.reetsee\.com/', 22 | '/wordpress/', 23 | ), 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /demo/structure: -------------------------------------------------------------------------------- 1 | |-phpfetcher.php //使用时include这个文件即可 2 | |-Phpfetcher //此文件夹与phpfetcher.php要在同一目录下 3 | |-Error.php //出错处理相关 4 | |-Log.php //日志相关 5 | |-Manager //暂时无用 6 | |-Crawler //Crawler目录,存放爬虫相关的类代码 7 | |-Abstract.php //爬虫基类Phpfetcher_Crawler_Abstrct 8 | |-Default.php //默认提供的爬虫类Phpfetcher_Crawler_Default 9 | |-Page //Page目录,存放页面相关的类代码 10 | |-Abstract.php //页面基类Phpfetcher_Page_Abstract 11 | |-Default.php //默认提供的页面类Phpfetcher_Page_Default 12 | |-Dom //Dom目录,存放Dom相关的类代码 13 | |-Abstract.php //Dom基类Phpfetcher_Dom_Abstract 14 | |-SimpleHtmlDom.php //默认提供的Dom类Phpfetcher_Dom_SimpleHtmlDom 15 | |-simple_html_dom.php //Phpfetcher_Dom_SimpleHtmlDom实际使用了simple_html_dom中的代码 16 | -------------------------------------------------------------------------------- /tests/curl_test.php: -------------------------------------------------------------------------------- 1 | 0) { 17 | echo "available!\n"; 18 | } 19 | curl_close($objCurl); 20 | return; 21 | } 22 | 23 | $c = 'a'; 24 | for ($i = 0; $i < 26; $i++) { 25 | getAvailableDomain($strName . $c, $intMaxDepth); 26 | ++$c; 27 | } 28 | } 29 | 30 | getAvailableDomain('', 3); 31 | ?> 32 | -------------------------------------------------------------------------------- /demo/iframe_example.php: -------------------------------------------------------------------------------- 1 | getUrl() . "] +++\n"; 9 | 10 | //选取所有包含src属性的iframe元素 11 | $arrIframes = $page->sel('//iframe[@src]'); 12 | for ($i = 0; $i < count($arrIframes); ++$i) { 13 | $strSrc = $arrIframes[$i]->src; 14 | echo "found iframe url=[" . $strSrc . "]\n"; 15 | $this->addAdditionalUrls($strSrc); 16 | } 17 | echo "--- leave page: [" . $page->getUrl() . "] ---\n"; 18 | } 19 | 20 | }; 21 | 22 | $crawler = new mycrawler(); 23 | $arrJobs = array( 24 | '163' => array( 25 | 'start_page' => 'http://news.163.com', 26 | 'link_rules' => array(), 27 | 'max_depth' => 2, 28 | ) , 29 | ); 30 | 31 | $crawler->setFetchJobs($arrJobs)->run(); 32 | 33 | echo "Done!\n"; 34 | -------------------------------------------------------------------------------- /tests/test_phpfetcher_util_trie.php: -------------------------------------------------------------------------------- 1 | has("ftp"), true) . "\n"; 9 | echo "has http:" . var_export($trie->has("http"), true) . "\n"; 10 | echo "has https:" . var_export($trie->has("https"), true) . "\n"; 11 | echo "\n"; 12 | } 13 | 14 | $arrSchemes = array( 15 | "http", 16 | "https", 17 | "ftp", 18 | ); 19 | $trie = new Phpfetcher_Util_Trie($arrSchemes); 20 | print_trie($trie); 21 | 22 | echo "delete 'abc'\n"; 23 | $trie->delete("abc"); 24 | print_trie($trie); 25 | 26 | echo "delete 'ftp'\n"; 27 | $trie->delete("ftp"); 28 | print_trie($trie); 29 | 30 | echo "delete 'http'\n"; 31 | $trie->delete("http"); 32 | print_trie($trie); 33 | 34 | echo "insert 'ftp'\n"; 35 | $trie->insert("ftp"); 36 | print_trie($trie); 37 | 38 | echo "delete 'https'\n"; 39 | $trie->delete("https"); 40 | print_trie($trie); 41 | 42 | echo "insert 'http'\n"; 43 | $trie->insert("http"); 44 | print_trie($trie); 45 | -------------------------------------------------------------------------------- /tests/Page_test.php: -------------------------------------------------------------------------------- 1 | init(); 6 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 7 | $page->setConfField('url', 'http://news.qq.com/a/20140921/000030.htm'); 8 | $page->read(); 9 | //echo $page->getContent(); 10 | //$DOMElement_id_oneKey = $page->selId('oneKey'); 11 | //var_dump($DOMElement_id_oneKey); 12 | //echo "\n"; 13 | //var_dump($DOMElement_id_oneKey->parentNode); 14 | //echo "\n"; 15 | //var_dump($DOMElement_id_oneKey->childNodes); 16 | //echo $DOMElement_id_oneKey->nodeValue; 17 | //print_r($page->xpath('//meta[@http-equiv]')); 18 | //var_dump($page->xpath2('//meta[@http-equiv]')); 19 | $res = $page->xpath('//title'); 20 | //$res = $page->xpath('a'); 21 | var_dump($res->item(0)->nodeValue); 22 | //var_dump($res->item(1)->nodeValue); 23 | /* 24 | $arrLinks = array(); 25 | $res = $page->xpath('//a/@href'); 26 | for($i = 0; $i < $res->length;++$i) { 27 | //var_dump($res->item($i)); 28 | $arrLinks[] = $res->item($i)->nodeValue; 29 | } 30 | */ 31 | //$arrLinks = $page->getHyperLinks(); 32 | //print_r($arrLinks); 33 | 34 | 35 | 36 | ?> 37 | -------------------------------------------------------------------------------- /demo/single_page.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com/a/20140927/026557.htm', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | ) , 35 | ); 36 | 37 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 38 | $crawler->setFetchJobs($arrJobs); 39 | $crawler->run(); 40 | -------------------------------------------------------------------------------- /demo/multi_page.php: -------------------------------------------------------------------------------- 1 | sel('//h1', 0)->plaintext); 11 | if (!empty($strFirstH1)) { 12 | echo $page->sel('//h1', 0)->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | '#news\.qq\.com/a/\d+/\d+\.htm$#', 30 | ), 31 | //爬虫从开始页面算起,最多爬取的深度,设置为2表示爬取深度为1 32 | //Crawler's max following depth, 1 stands for only crawl the start page 33 | 'max_depth' => 2, 34 | 35 | ) , 36 | ); 37 | 38 | $crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 39 | //$crawler->setFetchJobs($arrJobs); 40 | //$crawler->run(); 41 | -------------------------------------------------------------------------------- /demo/crawl_baidu_page.php: -------------------------------------------------------------------------------- 1 | sel('//h3/a'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | echo $res[$i]->getAttribute('href'); 15 | echo "\n"; 16 | echo "\n"; 17 | } 18 | } 19 | } 20 | 21 | $crawler = new mycrawler(); 22 | $arrJobs = array( 23 | //任务的名字随便起,这里把名字叫qqnews 24 | //the key is the name of a job, here names it qqnews 25 | 'qqnews' => array( 26 | 'start_page' => 'https://www.baidu.com/s?wd=facebook', //起始网页 27 | 'link_rules' => array( 28 | /* 29 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 30 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 31 | */ 32 | ), 33 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 34 | //Crawler's max following depth, 1 stands for only crawl the start page 35 | 'max_depth' => 1, 36 | 37 | ) , 38 | ); 39 | 40 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 41 | $crawler->setFetchJobs($arrJobs); 42 | $crawler->run(); 43 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Abstract.php: -------------------------------------------------------------------------------- 1 | 标签的内容,以数组形式返回 65 | abstract function getHyperLinks(); 66 | } 67 | ?> 68 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Abstract.php: -------------------------------------------------------------------------------- 1 | _arrExtraInfo[$field]; 33 | } 34 | return $arrOutput; 35 | } 36 | */ 37 | 38 | /* 39 | public function setExtraInfo($arrInput = array()) { 40 | if (!is_array($arrInput) || empty($arrInput)) { 41 | return FALSE; 42 | } 43 | foreach ($arrInput as $key => $value) { 44 | $this->_arrExtraInfo[$key] = $value; 45 | } 46 | return TRUE; 47 | } 48 | */ 49 | 50 | /* 51 | //修改一条已有的规则 52 | public function setFetchJobByName() { 53 | Phpfetcher_Error::notice('not implemented'); 54 | } 55 | */ 56 | 57 | //运行爬虫 58 | abstract function &run($arrInput = array()); 59 | } 60 | ?> 61 | -------------------------------------------------------------------------------- /demo/get_picture.php: -------------------------------------------------------------------------------- 1 | sel("//p"); 10 | for ($i = 0; $i < count($objContent); ++$i) { 11 | $objPic = $objContent[$i]->find("img"); 12 | for ($j = 0; $j < count($objPic); ++$j) { 13 | echo $objPic[$j]->getAttribute('src') . "\n"; 14 | echo $objPic[$j]->getAttribute('alt') . "\n"; 15 | echo $objContent[$i]->plaintext . "\n"; 16 | echo $objContent[$i]->outertext() . "\n"; 17 | } 18 | } 19 | 20 | ////打印处当前页面的title 21 | //$res = $page->sel('//title'); 22 | //for ($i = 0; $i < count($res); ++$i) { 23 | // echo $res[$i]->plaintext; 24 | // echo "\n"; 25 | //} 26 | } 27 | } 28 | 29 | $crawler = new mycrawler(); 30 | $arrJobs = array( 31 | //任务的名字随便起,这里把名字叫qqnews 32 | //the key is the name of a job, here names it qqnews 33 | 'qqnews' => array( 34 | 'start_page' => 'http://news.163.com/16/0325/21/BJ1I6PN40001124J.html', //起始网页 35 | 'link_rules' => array( 36 | /* 37 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 38 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 39 | */ 40 | ), 41 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 42 | //Crawler's max following depth, 1 stands for only crawl the start page 43 | 'max_depth' => 1, 44 | 45 | ) , 46 | ); 47 | 48 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 49 | $crawler->setFetchJobs($arrJobs); 50 | $crawler->run(); 51 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/SimpleHtmlDom.php: -------------------------------------------------------------------------------- 1 | _dom, 'clear')) { 17 | $this->_dom->clear(); 18 | } 19 | } 20 | 21 | public function getElementById($id) { 22 | $strMethodName = 'getElementById'; 23 | if (method_exists($this->_dom, $strMethodName)) { 24 | return $this->_dom->getElementById($id); 25 | } else { 26 | Phpfetcher_Log::warning("method $strMethodName not exists"); 27 | return FALSE; 28 | } 29 | } 30 | 31 | public function getElementsByTagName($tag) { 32 | $strMethodName = 'getElementsByTagName'; 33 | if (method_exists($this->_dom, $strMethodName)) { 34 | return $this->_dom->getElementsByTagName($tag); 35 | } else { 36 | Phpfetcher_Log::warning("method $strMethodName not exists"); 37 | return FALSE; 38 | } 39 | } 40 | 41 | public function loadHTML($content) { 42 | if (NULL === $this->_dom) { 43 | if (function_exists('str_get_html')) { 44 | $this->_dom = str_get_html($content); 45 | } 46 | } else { 47 | if (method_exists($this->_dom, 'load')) { 48 | $this->_dom->load($content); 49 | } 50 | } 51 | 52 | return $this; 53 | } 54 | 55 | /** 56 | * @deprecated 57 | */ 58 | public function sel($pattern = '', $idx = NULL, $node = NULL) { 59 | return $this->find($pattern, $idx); 60 | } 61 | 62 | public function find($pattern = '', $idx = NULL) { 63 | $strMethodName = 'find'; 64 | if (method_exists($this->_dom, $strMethodName)) { 65 | return $this->_dom->find($pattern, $idx); 66 | } else { 67 | Phpfetcher_Log::warning("method $strMethodName not exists"); 68 | return FALSE; 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Phpfetcher/Log.php: -------------------------------------------------------------------------------- 1 | $intTraceDepth) { 62 | $intDepth = $intTraceDepth; 63 | } 64 | $arrTargetTrace = $arrTrace[$intDepth]; 65 | unset($arrTrace); 66 | if (isset($arrTargetTrace['file'])) { 67 | $arrTargetTrace['file'] = basename($arrTargetTrace['file']); 68 | } 69 | 70 | $strPrepend = strval(@date("Y-m-d H:i:s")) . " {$arrTargetTrace['file']} {$arrTargetTrace['class']} {$arrTargetTrace['function']} {$arrTargetTrace['line']} " . $strPrepend . ' '; 71 | 72 | $strMsg = $strPrepend . $strMsg . $strAppend; 73 | 74 | echo $strMsg; 75 | } 76 | } 77 | ?> 78 | -------------------------------------------------------------------------------- /Phpfetcher/Util/Trie.php: -------------------------------------------------------------------------------- 1 | _arrTrieRoot = array( 14 | 'children' => array(), 15 | 'count' => 0, 16 | ); 17 | foreach ($arrStrings as $str) { 18 | $this->insert($str); 19 | } 20 | } 21 | 22 | public function insert($str) { 23 | try { 24 | $str = strval($str); 25 | $intLen = strlen($str); 26 | $arrCurNode = &$this->_arrTrieRoot; 27 | 28 | for ($i = 0; $i < $intLen; ++$i) { 29 | if (!isset($arrCurNode['children'][$str[$i]])) { 30 | $arrCurNode['children'][$str[$i]] = array( 31 | 'children' => array(), 32 | 'count' => 0, 33 | ); 34 | } 35 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 36 | } 37 | 38 | $arrCurNode['count'] += 1; 39 | unset($arrCurNode); 40 | 41 | } catch (Exception $e) { 42 | Phpfetcher_Log::fatal($e->getMessage()); 43 | return false; 44 | } 45 | 46 | return true; 47 | } 48 | 49 | public function delete($str) { 50 | $arrCurNode = &$this->_locateNode($str); 51 | if (!is_null($arrCurNode) && $arrCurNode['count'] > 0) { 52 | $arrCurNode['count'] -= 1; 53 | } 54 | unset($arrCurNode); 55 | return true; 56 | } 57 | 58 | public function has($str) { 59 | $arrTargetNode = &$this->_locateNode($str); 60 | $bolRes = false; 61 | if (!is_null($arrTargetNode) && $arrTargetNode['count'] > 0) { 62 | $bolRes = true; 63 | } 64 | unset($arrTargetNode); 65 | return $bolRes; 66 | } 67 | 68 | protected function &_locateNode($str) { 69 | $str = strval($str); 70 | $intLen = strlen($str); 71 | $arrCurNode = &$this->_arrTrieRoot; 72 | 73 | for ($i = 0; $i < $intLen; ++$i) { 74 | if (!isset($arrCurNode['children'][$str[$i]])) { 75 | return null; 76 | } 77 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 78 | } 79 | 80 | return $arrCurNode; 81 | } 82 | 83 | //public function startsWith($str) { 84 | // $str = strval($str); 85 | // //TODO 86 | //} 87 | }; 88 | -------------------------------------------------------------------------------- /demo/crawl_with_headers.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://jianli.58.com/resume/93489192884492', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | //某些页面做了防抓取策略,可以通过修改UA,或者添加必要的HTTP Header来防止屏蔽 35 | //Some pages may prevent crawlers from working, you may change UA or add 36 | // necessary HTTP Headers to prevent this. 37 | 'page_conf' => array( 38 | 'http_header' => array( 39 | //如果本例子对于你来说运行不成功(发生了错误),那么请将下面的Header 40 | // 替换成与你浏览器请求Header一样的内容,但是不要添加Accept-Encoding 41 | // 这个Header 42 | //If this example can not run successfully, please replace the Headers 43 | // below with the ones exactly you see from your browser. Remember 44 | // not to add Accept-Encoding header. 45 | 'Host: jianli.m.58.com', 46 | 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 47 | 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | 'Cookie: 58home=tj; id58=c5/ns1enV2k5MFGqLUAXAg==; city=tj; 58tj_uuid=1cf71e54-dd15-4922-8228-b6bb809edbfd; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; myfeet_tooltip=end; als=0; Hm_lvt_2557cda77f2e9a8b94531c9501582142=1470585797; Hm_lpvt_2557cda77f2e9a8b94531c9501582142=1470585797; 4drh9g=test insert val', 49 | 'Connection: keep-alive', 50 | 'Cache-Control: max-age=0', 51 | 52 | //不要添加Accept-Encoding的Header 53 | //Do not add Accept-Encoding Header 54 | //'Accept-Encoding: gzip, deflate' 55 | ), 56 | ), 57 | ) , 58 | ); 59 | 60 | $crawler->setFetchJobs($arrJobs); 61 | $crawler->run(); 62 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Default.php: -------------------------------------------------------------------------------- 1 | CURLOPT_HEADER, 13 | 'exclude_body' => CURLOPT_NOBODY, 14 | 'is_post' => CURLOPT_POST, 15 | 'is_verbose' => CURLOPT_VERBOSE, 16 | 'return_transfer'=> CURLOPT_RETURNTRANSFER, 17 | 18 | /* int */ 19 | 'buffer_size' => CURLOPT_BUFFERSIZE, 20 | 'connect_timeout' => CURLOPT_CONNECTTIMEOUT, 21 | 'connect_timeout_ms' => CURLOPT_CONNECTTIMEOUT_MS, 22 | 'dns_cache_timeout' => CURLOPT_DNS_CACHE_TIMEOUT, 23 | 'max_redirs' => CURLOPT_MAXREDIRS, 24 | 'port' => CURLOPT_PORT, 25 | 'timeout' => CURLOPT_TIMEOUT, 26 | 'timeout_ms' => CURLOPT_TIMEOUT_MS, 27 | 28 | /* string */ 29 | 'cookie' => CURLOPT_COOKIE, 30 | 'cookie_file' => CURLOPT_COOKIEFILE, 31 | 'cookie_jar' => CURLOPT_COOKIEJAR, 32 | 'post_fields' => CURLOPT_POSTFIELDS, 33 | 'url' => CURLOPT_URL, 34 | 'user_agent' => CURLOPT_USERAGENT, 35 | 'user_pwd' => CURLOPT_USERPWD, 36 | 37 | /* array */ 38 | 'http_header' => CURLOPT_HTTPHEADER, 39 | 40 | /* stream resource */ 41 | 'file' => CURLOPT_FILE, 42 | 43 | /* function or a Closure */ 44 | 'write_function' => CURLOPT_WRITEFUNCTION, 45 | 46 | /* https */ 47 | 'ssl_verifypeer' => CURLOPT_SSL_VERIFYPEER, 48 | ); 49 | 50 | protected $_arrDefaultConf = array( 51 | 'connect_timeout' => 10, 52 | 'max_redirs' => 10, 53 | 'return_transfer' => 1, //need this 54 | 'timeout' => 15, 55 | 'url' => NULL, 56 | 'user_agent' => 'firefox', 57 | 'ssl_verifypeer' => false, 58 | ); 59 | 60 | protected $_arrConf = array(); 61 | protected $_arrExtraInfo = array(); 62 | protected $_bolCloseCurlHandle = FALSE; 63 | protected $_curlHandle = NULL; 64 | protected $_dom = NULL; 65 | //protected $_xml = NULL; 66 | 67 | public function __construct() { 68 | } 69 | public function __destruct() { 70 | if ($this->_bolCloseCurlHandle) { 71 | curl_close($this->_curlHandle); 72 | } 73 | } 74 | 75 | public static function formatRes($data, $errcode, $errmsg = NULL) { 76 | if ($errmsg === NULL) { 77 | $errmsg = Phpfetcher_Error::getErrmsg($errcode); 78 | } 79 | return array('errcode' => $errcode, 'errmsg' => $errmsg, 'res' => $data); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @desc get configurations. 85 | */ 86 | public function getConf() { 87 | return $this->_arrConf; 88 | } 89 | 90 | /** 91 | * @author xuruiqi 92 | * @param $key: specified field 93 | * @return 94 | * bool : false when field doesn't exist 95 | * mixed : otherwise 96 | * @desc get a specified configuration. 97 | */ 98 | public function getConfField($key) { 99 | if (isset($this->_arrConf[$key])) { 100 | return self::formatRes($this->_arrConf[$key], Phpfetcher_Error::ERR_SUCCESS); 101 | } else { 102 | return self::formatRes(NULL, Phpfetcher_Error::ERR_FIELD_NOT_SET); 103 | } 104 | } 105 | 106 | public function getContent() { 107 | return $this->_strContent; 108 | } 109 | 110 | public function getExtraInfo($arrInput) { 111 | $arrOutput = array(); 112 | foreach ($arrInput as $req_key) { 113 | $arrOutput[$req_key] = $this->_arrExtraInfo[$req_key]; 114 | } 115 | return $arrOutput; 116 | } 117 | 118 | public function getHyperLinks() { 119 | $arrLinks = array(); 120 | $res = $this->sel('//a'); 121 | for ($i = 0; $i < count($res); ++$i) { 122 | $arrLinks[] = $res[$i]->href; 123 | } 124 | /* 125 | foreach ($res as $node) { 126 | $arrLinks[] = $node->href; 127 | } 128 | */ 129 | return $arrLinks; 130 | } 131 | 132 | /** 133 | * @author xuruiqi 134 | * @param 135 | * @return 136 | * string : current page's url 137 | * @desc get this page's URL. 138 | */ 139 | public function getUrl() { 140 | $arrRet = $this->getConfField('url'); 141 | return strval($arrRet['res']); 142 | } 143 | 144 | /** 145 | * @author xuruiqi 146 | * @param 147 | * array $conf : configurations 148 | * bool $clear_default : whether to clear default options not set in $conf 149 | * @return 150 | * @desc initialize this instance with specified or default configuration 151 | */ 152 | public function init($curl_handle = NULL, $conf = array()) { 153 | $this->_curlHandle = $curl_handle; 154 | if (empty($this->_curlHandle)) { 155 | $this->_curlHandle = curl_init(); 156 | $this->_bolCloseCurlHandle = TRUE; 157 | } 158 | $this->_arrConf = $this->_arrDefaultConf; 159 | 160 | $this->setConf($conf, TRUE); 161 | 162 | return $this; 163 | } 164 | 165 | /** 166 | * @author xuruiqi 167 | * @param 168 | * array $ids : elements' ids 169 | * @return 170 | * array : array of DOMElement, with keys equal each of ids 171 | * NULL : if $this->_dom equals NULL 172 | * @desc select spcified elements with their ids. 173 | */ 174 | /* 175 | public function mselId($ids) { 176 | if ($this->_dom === NULL) { 177 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 178 | return NULL; 179 | } 180 | 181 | $arrOutput = array(); 182 | foreach ($ids as $id) { 183 | $arrOutput[$id] = $this->selId($id); 184 | } 185 | return $arrOutput; 186 | } 187 | */ 188 | 189 | /** 190 | * @author xuruiqi 191 | * @param 192 | * array $tags : elements' tags 193 | * @return 194 | * array : array of DOMNodeList, with keys equal each of tags 195 | * NULL : if $this->_dom equals NULL 196 | * @desc select spcified elements with their tags 197 | */ 198 | /* 199 | public function mselTagName($tags) { 200 | if ($this->_dom === NULL) { 201 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 202 | return NULL; 203 | } 204 | 205 | $arrOutput = array(); 206 | foreach ($tags as $tag) { 207 | $arrOutput[$tag] = $this->selId($tag); 208 | } 209 | return $arrOutput; 210 | } 211 | */ 212 | 213 | 214 | /** 215 | * @author xuruiqi 216 | * @param 217 | * array $conf : configurations 218 | * bool $clear_previous_conf : if TRUE, then before set $conf, reset current configuration to its default value 219 | * @return 220 | * array : previous conf 221 | * @desc set configurations. 222 | */ 223 | public function setConf($conf = array(), $clear_previous_conf = FALSE) { 224 | $previous_conf = $this->_arrConf; 225 | if ($clear_previous_conf === TRUE) { 226 | $this->_arrConf = $this->_arrDefaultConf; 227 | } 228 | foreach ($conf as $k => $v) { 229 | $this->_arrConf[$k] = $v; 230 | } 231 | 232 | $bolRes = TRUE; 233 | if ($clear_previous_conf === TRUE) { 234 | $bolRes = $this->_setConf($this->_arrConf); 235 | } else { 236 | $bolRes = $this->_setConf($conf); 237 | } 238 | 239 | if ($bolRes != TRUE) { 240 | $this->_arrConf = $previous_conf; 241 | $this->_setConf($this->_arrConf); 242 | return $bolRes; 243 | } 244 | 245 | return $previous_conf; 246 | } 247 | 248 | protected function _setConf($conf = array()) { 249 | $arrCurlOpts = array(); 250 | foreach ($conf as $k => $v) { 251 | if (isset(self::$_arrField2CurlOpt[$k])) { 252 | $arrCurlOpts[self::$_arrField2CurlOpt[$k]] = $v; 253 | } else { 254 | //currently only curl options can be set 255 | $arrCurlOpts[$k] = $v; 256 | } 257 | } 258 | return curl_setopt_array($this->_curlHandle, $arrCurlOpts); 259 | } 260 | 261 | public function setExtraInfo($arrInput) { 262 | foreach ($arrInput as $key => $val) { 263 | $this->_arrExtraInfo[$key] = $val; 264 | } 265 | } 266 | 267 | /** 268 | * @author xuruiqi 269 | * @param 270 | * string $id : specifed element id 271 | * @return 272 | * object : DOMElement or NULL is not found 273 | * NULL : if $this->_dom equals NULL 274 | * @desc select a spcified element via its id. 275 | */ 276 | public function selId($id) { 277 | if ($this->_dom === NULL) { 278 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 279 | return NULL; 280 | } 281 | 282 | return $this->_dom->getElementById($id); 283 | } 284 | 285 | /** 286 | * @author xuruiqi 287 | * @param 288 | * string $tag : specifed elements' tag name 289 | * @return 290 | * object : a traversable DOMNodeList object containing all the matched elements 291 | * NULL : if $this->_dom equals NULL 292 | * @desc select spcified elements via its tag name. 293 | */ 294 | public function selTagName($tag) { 295 | if ($this->_dom === NULL) { 296 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 297 | return NULL; 298 | } 299 | 300 | return $this->_dom->getElementsByTagName($tag); 301 | } 302 | 303 | public function setConfField($field, $value) { 304 | $this->_arrConf[$field] = $value; 305 | return $this->_setConfField($field, $value); 306 | } 307 | 308 | protected function _setConfField($field, $value) { 309 | if (isset(self::$_arrField2CurlOpt[$field])) { 310 | return curl_setopt($this->_curlHandle, self::$_arrField2CurlOpt[$field], $value); 311 | } else { 312 | //currently only curl options can be set 313 | return curl_setopt($this->_curlHandle, $field, $value); 314 | } 315 | } 316 | 317 | /** 318 | * @author xuruiqi 319 | * @param 320 | * string $url : the URL 321 | * @return 322 | * string : previous URL 323 | * @desc set this page's URL. 324 | */ 325 | public function setUrl($url) { 326 | $previous_url = $this->_arrConf['url']; 327 | $this->setConfField('url', $url); 328 | return $previous_url; 329 | } 330 | 331 | /** 332 | * @author xuruiqi 333 | * @param 334 | * array $arrHeaderList : header list 335 | * e.g. 336 | * array( 337 | * ... 338 | * "Cookie: xxxxx", 339 | * ... 340 | * "Header_n: header_n_value", 341 | * ) 342 | * @return 343 | * this 344 | * @desc set header of the next fetch 345 | */ 346 | public function &setHeaders($arrHeaderList) { 347 | $this->setConf(array( 348 | "http_header" => $arrHeaderList 349 | )); 350 | return $this; 351 | } 352 | 353 | /** 354 | * @author xuruiqi 355 | * @param 356 | * @return 357 | * string : return page's content 358 | * bool : if failed return FALSE 359 | * @desc get page's content, and save it into member variable <_strContent> 360 | */ 361 | public function read() { 362 | $this->_strContent = curl_exec($this->_curlHandle); 363 | if ($this->_strContent != FALSE) { 364 | $matches = array(); 365 | preg_match('#charset="?([a-zA-Z0-9-\._]+)"?#', $this->_strContent, $matches); 366 | if (!empty($matches[1])) { 367 | //Phpfetcher_Log::notice("Convert content from {$matches[1]} to UTF-8\n"); 368 | $this->_strContent = mb_convert_encoding($this->_strContent, 'UTF-8', $matches[1]); 369 | } 370 | 371 | /* 372 | $this->_dom = new DOMDocument(); //DOMDocument's compatibility is bad 373 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 374 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 375 | $this->_dom = NULL; 376 | $this->_domxpath = NULL; 377 | } else { 378 | $this->_domxpath = new DOMXPath($this->_dom); 379 | } 380 | */ 381 | 382 | $this->_dom = new Phpfetcher_Dom_SimpleHtmlDom(); 383 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 384 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 385 | $this->_dom = NULL; 386 | } 387 | } 388 | return $this->_strContent; 389 | } 390 | 391 | /** 392 | * @author xuruiqi 393 | * @param 394 | * string $strPath : xpath's path 395 | * [DOMNode $contextnode : The optional contextnode can be specified for doing relative XPath queries. By default, the queries are relative to the root element.] 396 | * 397 | * @return 398 | * DOMNodelist : DOMNodelist object 399 | * NULL : if $this->_dom equals NULL 400 | * false : if error occurs 401 | * @desc select corresponding content use xpath 402 | */ 403 | public function sel($strPath, $intIndex = NULL, $contextnode = NULL) { 404 | if ($this->_dom === NULL) { 405 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 406 | return NULL; 407 | } 408 | 409 | if ($contextnode !== NULL) { 410 | //$res = $this->_domxpath->query($strPath, $contextnode); 411 | Phpfetcher_Log::warning('param contextnode is no use because of this function\'s inability'); 412 | $res = $this->_dom->sel($strPath, $intIndex); 413 | } else { 414 | //$res = $this->_domxpath->query($strPath); 415 | $res = $this->_dom->sel($strPath, $intIndex); 416 | } 417 | 418 | return $res; 419 | } 420 | } 421 | ?> 422 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Default.php: -------------------------------------------------------------------------------- 1 | self::STR_TYPE, 24 | 'link_rules' => self::ARR_TYPE, 25 | 'max_depth' => self::INT_TYPE, 26 | 'max_pages' => self::INT_TYPE, 27 | ); 28 | 29 | /* 30 | protected static $arrJobDefaultFields = array( 31 | 'max_depth' => self::MAX_DEPTH, 32 | 'max_pages' => self::MAX_PAGE_NUM, 33 | ); 34 | */ 35 | 36 | protected $_arrFetchJobs = array(); 37 | protected $_arrHash = array(); 38 | protected $_arrAdditionalUrls = array(); 39 | protected $_objSchemeTrie = array(); //合法url scheme的字典树 40 | //protected $_objPage = NULL; //Phpfetcher_Page_Default; 41 | 42 | public function __construct($arrInitParam = array()) { 43 | if (!isset($arrInitParam['url_schemes'])) { 44 | $arrInitParam['url_schemes'] = array("http", "https", "ftp"); 45 | } 46 | 47 | $this->_objSchemeTrie = 48 | new Phpfetcher_Util_Trie($arrInitParam['url_schemes']); 49 | } 50 | 51 | /** 52 | * @author xuruiqi 53 | * @param 54 | * array $arrInput: 55 | * array <任务名1> : 56 | * string 'start_page', //爬虫的起始页面 57 | * array 'link_rules': //爬虫跟踪的超链接需要满足的正则表达式,依次检查规则,匹配其中任何一条即可 58 | * string 0, //正则表达式1 59 | * string 1, //正则表达式2 60 | * ... 61 | * string n-1, //正则表达式n 62 | * int 'max_depth' , //爬虫最大的跟踪深度,目前限制最大值不超过20 63 | * int 'max_pages' , //最多爬取的页面数,默认指定为-1,表示没有限制 64 | * array <任务名2> : 65 | * ... 66 | * ... 67 | * ... 68 | * array <任务名n-1>: 69 | * ... 70 | * ... 71 | * 72 | * @return 73 | * Object $this : returns the instance itself 74 | * @desc add by what rules the crawler should fetch the pages 75 | * if a job has already been in jobs queue, new rules will 76 | * cover the old ones. 77 | */ 78 | public function &addFetchJobs($arrInput = array()) { 79 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_ADD); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @param 85 | * array $arrInput : 86 | * mixed 0 : 87 | * 任务名 88 | * mixed 1 : 89 | * 任务名 90 | * ... ... 91 | * @return 92 | * Object $this : returns the instance itself 93 | * @desc delete fetch rules according to job names 94 | */ 95 | public function &delFetchJobs($arrInput = array()) { 96 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_DEL); 97 | } 98 | 99 | public function getFetchJobByName($job_name) { 100 | return $this->_arrFetchJobs[$strJobName]; 101 | } 102 | 103 | public function getFetchJobs() { 104 | return $this->_arrFetchJobs; 105 | } 106 | 107 | /* 108 | public function handlePage() { 109 | //由用户继承本类并实现此方法 110 | } 111 | */ 112 | 113 | /** 114 | * @author xuruiqi 115 | * @param : 116 | * //$intOptType === MODIFY_JOBS_SET|MODIFY_JOBS_ADD, 117 | * $arrInput参见addFetchJobs的入参$arrInput 118 | * //$intOptType === MODIFY_JOBS_DEL, 119 | * $arrInput参见delFetchJobs的入参$arrInput 120 | * 121 | * @return 122 | * Object $this : returns the instance itself 123 | * @desc set fetch rules. 124 | */ 125 | protected function &_modifyFetchJobs($arrInput = array(), $intOptType) { 126 | $arrInvalidJobs = array(); 127 | if ($intOptType === self::MODIFY_JOBS_SET || $intOptType === self::MODIFY_JOBS_ADD) { 128 | if ($intOptType === self::MODIFY_JOBS_SET) { 129 | $this->_arrFetchJobs = array(); 130 | } 131 | foreach ($arrInput as $job_name => $job_rules) { 132 | $this->_correctJobParam($job_rules); 133 | if ($this->_isJobValid($job_rules)) { 134 | $this->_arrFetchJobs[$job_name] = $job_rules; 135 | } else { 136 | $arrInvalidJobs[] = $job_name; 137 | } 138 | } 139 | } else if ($intOptType === self::MODIFY_JOBS_DEL) { 140 | foreach ($arrInput as $job_name) { 141 | unset($this->_arrFetchJobs[$job_name]); 142 | } 143 | } else { 144 | Phpfetcher_Log::warning("Unknown options for fetch jobs [{$intOptType}]"); 145 | } 146 | 147 | 148 | if (!empty($arrInvalidJobs)) { 149 | Phpfetcher_Log::notice('Invalid jobs:' . implode(',', $arrInvalidJobs)); 150 | } 151 | return $this; 152 | } 153 | 154 | /** 155 | * @author xuruiqi 156 | * @param : 参见addFetchJobs的入参$arrInput 157 | * 158 | * @return 159 | * Object $this : returns the instance itself 160 | * @desc set fetch jobs. 161 | */ 162 | public function &setFetchJobs($arrInput = array()) { 163 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_SET); 164 | } 165 | 166 | /** 167 | * @author xuruiqi 168 | * @param 169 | * array $arrInput : //运行设定 170 | * string 'page_class_name' : //指定要使用的Page类型,必须是 171 | * //Phpfetcher_Page_Abstract的 172 | * //子类 173 | * [array 'page_conf'] : //Page调用setConf时的输入参数,可选 174 | * @return 175 | * obj $this 176 | * @desc 177 | */ 178 | public function &run($arrInput = array()) { 179 | if (empty($this->_arrFetchJobs)) { 180 | Phpfetcher_Log::warning("No fetch jobs."); 181 | return $this; 182 | } 183 | 184 | //构建Page对象 185 | $objPage = NULL; 186 | $strPageClassName = self::DEFAULT_PAGE_CLASS; 187 | if (!empty($arrInput['page_class_name'])) { 188 | $strPageClassName = strval($arrInput['page_class_name']); 189 | } 190 | try { 191 | if (!class_exists($strPageClassName, TRUE)) { 192 | throw new Exception("[$strPageClassName] class not exists!"); 193 | } 194 | 195 | $objPage = new $strPageClassName; 196 | if (!($objPage instanceof Phpfetcher_Page_Abstract)) { 197 | throw new Exception("[$strPageClassName] is not an instance of " . self::ABSTRACT_PAGE_CLASS); 198 | } 199 | } catch (Exception $e) { 200 | Phpfetcher_Log::fatal($e->getMessage()); 201 | return $this; 202 | } 203 | 204 | //初始化Page对象 205 | $arrPageConf = empty($arrInput['page_conf']) ? array() : $arrInput['page_conf']; 206 | $objPage->init(); 207 | if (!empty($arrPageConf)) { 208 | if(isset($arrPageConf['url'])) { 209 | unset($arrPageConf['url']); 210 | } 211 | $objPage->setConf($arrPageConf); 212 | } 213 | 214 | //遍历任务队列 215 | foreach ($this->_arrFetchJobs as $job_name => $job_rules) { 216 | if (!($this->_isJobValid($job_rules))) { 217 | Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]"); 218 | continue; 219 | } 220 | 221 | //检查是否需要设置curl配置 222 | if (!empty($job_rules['page_conf'])) { 223 | $objPage->setConf($job_rules['page_conf']); 224 | } 225 | 226 | $intDepth = 0; 227 | $intPageNum = 0; 228 | $arrIndice = array(0, 1); 229 | $arrJobs = array( 230 | 0 => array($job_rules['start_page']), 231 | 1 => array(), 232 | ); 233 | 234 | //开始爬取 235 | while (!empty($arrJobs[$arrIndice[0]]) 236 | && ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth']) 237 | && ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 238 | 239 | $intDepth += 1; 240 | $intPopIndex = $arrIndice[0]; 241 | $intPushIndex = $arrIndice[1]; 242 | $arrJobs[$intPushIndex] = array(); 243 | foreach ($arrJobs[$intPopIndex] as $url) { 244 | if (!($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 245 | break; 246 | } 247 | $objPage->setUrl($url); 248 | $objPage->read(); 249 | 250 | //获取所有的超链接 251 | $arrLinks = $objPage->getHyperLinks(); 252 | 253 | //解析当前URL的各个组成部分,以应对超链接中存在站内链接 254 | //的情况,如"/entry"等形式的URL 255 | $strCurUrl = $objPage->getUrl(); 256 | $arrUrlComponents = parse_url($strCurUrl); 257 | 258 | //匹配超链接 259 | foreach ($job_rules['link_rules'] as $link_rule) { 260 | foreach ($arrLinks as $link) { 261 | if (substr($link, 0, 2) == "//") { 262 | $link = substr($link, 1); 263 | } 264 | 265 | if (preg_match($link_rule, $link) === 1 266 | && !$this->getHash($link)) { 267 | 268 | //拼出实际的URL 269 | $real_link = $link; 270 | 271 | //不使用strpos,防止扫描整个字符串 272 | //这里只需要扫描前6个字符即可 273 | $colon_pos = false; 274 | for ($i = 0; $i <= 5; ++$i) { 275 | if ($link[$i] == ':') { 276 | $colon_pos = $i; 277 | break; 278 | } 279 | } 280 | 281 | if ($colon_pos === false 282 | || !$this->_objSchemeTrie->has( 283 | substr($link, 0, $colon_pos))) { 284 | //将站内地址转换为完整地址 285 | $real_link = $arrUrlComponents['scheme'] 286 | . "://" 287 | . $arrUrlComponents['host'] 288 | . (isset($arrUrlComponents['port']) 289 | && strlen($arrUrlComponents['port']) != 0 ? 290 | ":{$arrUrlComponents['port']}" : 291 | "") 292 | . ($link[0] == '/' ? 293 | $link : "/$link"); 294 | } 295 | 296 | $this->setHash($link, true); 297 | $this->setHash($real_link, true); 298 | $arrJobs[$intPushIndex][] = $real_link; 299 | } 300 | } 301 | } 302 | 303 | //由用户实现handlePage函数 304 | $objPage->setExtraInfo(array('job_name' => $job_name )); 305 | $this->handlePage($objPage); 306 | $intPageNum += 1; 307 | } 308 | 309 | if (!empty($this->_arrAdditionalUrls)) { 310 | $arrJobs[$intPushIndex] = 311 | array_merge($arrJobs[$intPushIndex], 312 | $this->_arrAdditionalUrls); 313 | $this->_arrAdditionalUrls = array(); 314 | } 315 | 316 | self::_swap($arrIndice[0], $arrIndice[1]); 317 | } 318 | } 319 | return $this; 320 | } 321 | 322 | protected function _correctJobParam(&$job_rules) { 323 | /* 324 | foreach (self::$arrJobDefaultFields as $field => $value) { 325 | if (!isset($job_rules[$field]) || ($job_rules[''])) 326 | } 327 | */ 328 | if (!isset($job_rules['max_depth']) || (self::MAX_DEPTH !== -1 && self::MAX_DEPTH < $job_rules['max_depth'])) { 329 | $job_rules['max_depth'] = self::MAX_DEPTH; 330 | } 331 | 332 | if (!isset($job_rules['max_pages']) || (self::MAX_PAGE_NUM !== -1 && self::MAX_PAGE_NUM < $job_rules['max_pages'])) { 333 | $job_rules['max_pages'] = self::MAX_PAGE_NUM; 334 | } 335 | } 336 | 337 | /** 338 | * @author xuruiqi 339 | * @desc check if a rule is valid 340 | */ 341 | protected function _isJobValid($arrRule) { 342 | foreach (self::$arrJobFieldTypes as $field => $type) { 343 | if (!isset($arrRule[$field]) || ($type === self::ARR_TYPE && !is_array($arrRule[$field]))) { 344 | return FALSE; 345 | } 346 | } 347 | return TRUE; 348 | } 349 | 350 | protected static function _swap(&$a, &$b) { 351 | $tmp = $a; 352 | $a = $b; 353 | $b = $tmp; 354 | } 355 | 356 | public function getHash($strRawKey) { 357 | $strRawKey = strval($strRawKey); 358 | $strKey = md5($strRawKey); 359 | if (isset($this->_arrHash[$strKey])) { 360 | return $this->_arrHash[$strKey]; 361 | } 362 | return NULL; 363 | } 364 | 365 | public function setHash($strRawKey, $value) { 366 | $strRawKey = strval($strRawKey); 367 | $strKey = md5($strRawKey); 368 | $this->_arrHash[$strKey] = $value; 369 | } 370 | 371 | public function setHashIfNotExist($strRawKey, $value) { 372 | $strRawKey = strval($strRawKey); 373 | $strKey = md5($strRawKey); 374 | 375 | $bolExist = true; 376 | if (!isset($this->_arrHash[$strKey])) { 377 | $this->_arrHash[$strKey] = $value; 378 | $bolExist = false; 379 | } 380 | 381 | return $bolExist; 382 | } 383 | 384 | public function clearHash() { 385 | $this->_arrHash = array(); 386 | } 387 | 388 | public function addAdditionalUrls($url) { 389 | if (!is_array($url)) { 390 | $url = array($url); 391 | } 392 | 393 | $intAddedNum = 0; 394 | foreach ($url as $strUrl) { 395 | $strUrl = strval($strUrl); 396 | 397 | if ($this->setHashIfNotExist($strUrl, true) === false) { 398 | $this->_arrAdditionalUrls[] = $strUrl; 399 | ++$intAddedNum; 400 | } 401 | } 402 | 403 | return $intAddedNum; 404 | } 405 | }; 406 | ?> 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Phpfetcher - a simple web crawler framework 2 | 3 | ## 重要修改记录 - Important Improvements Log 4 | 2017-03-13 支持形如“//xxx.com/abc/def”的超链接 5 |       Support hyperlinks like "//xxx.com/abc/def" 6 | 2016-09-08 支持HTTPS 7 |       Support HTTPS websites 8 | 2016-08-08 支持对爬虫设置Header。 9 |       Crawlers with Headers supported. 10 | 2016-03-26 PHP7测试通过。 11 |       Have PHP7 tested. 12 | 2015-10-26 可以爬取网站内链(如"/entry"的超链接)。 13 |       Able to crawl website internal hyper links(say "/entry"). 14 | 15 | ## 中文说明(Scroll Down to See The English Description) 16 | 一个PHP爬虫框架 17 | 框架的起源请参见:http://blog.reetsee.com/archives/366 18 | PHP需要启用curl扩展以及mbstring扩展 19 | 支持PHP5,PHP7 20 | ### 1 例子 21 | 下面的所有例子请在`demo`目录下执行,即假设例子对应的文件名是`hello_world.php`,运行例子时你执行的命令应该是`php hello_world.php`而不是`php demo/hello_world.php` 22 | #### 1.1 获取页面中``标签的内容 23 | 指定一个新闻页面:`http://news.qq.com/a/20140927/026557.htm`,然后获取网页HTML中的`<title>`标签的内容来获取标题 24 | 请运行`single_page.php`例子,得到的输出如下: 25 | ``` 26 | $> php single_page.php 27 | 王思聪回应遭警方调查:带弓箭不犯法 我是绿箭侠_新闻_腾讯网 28 | ``` 29 | #### 1.2 获取腾讯新闻主页的大部分新闻标题 30 | 指定一个种子页面:`http://news.qq.com`,跟踪这个页面的超链接,被跟踪的超链接能被正则表达式`#news\.qq\.com/a/\d+/\d+\.htm$#`匹配,例如`news.qq.com/a/20140927/026557.html`,就会被跟踪。爬虫对于所有爬取的网页(包括起始页`news.qq.com`),抓取所有的`<h1>`标签,并打印内容 31 | 请运行`multi_page.php`,得到的输出如下: 32 | ``` 33 | $> php multi_page.php 34 | 腾讯新闻——事实派 35 | 习近平访英前接受采访 谈及南海问题及足球等 36 | 习近平夫妇访英行程确定 将与女王共进私人午宴 37 | 李克强:让能干事的地方获得更多支持 38 | 环保部:我国40个城市已出现空气质量重污染 39 | 京津冀形成两个重污染带 太行燕山东南污染重 40 | 铁路部门回应“车票丢失被迫补票”:到站再退款 41 | 女大学生火车票遗失被要求补全票 铁路局:没做错 42 | 今日话题:丢失火车票要重买,老黄历何时改 43 | 外媒:两名藏僧被俄驱逐出境 44 | 广西北海民众聚众阻挠海事码头建设 16人被刑拘 45 | 河南一村民被政府人员土埋 官方称系邻里纠纷 46 | 餐厅用掺老鼠屎黄豆做咸菜 老板:都是中药材 47 | ``` 48 | #### 1.3 获取标签属性值 + 指定额外要跟踪的URL 49 | 这个例子用来展现怎么提取HTML标签中的属性以及爬虫运行的过程中如何临时添加需要抓取的URL。我们检查`news.163.com`页面的`<iframe>`标签,并让爬虫进入到iframe标签所指向的URL。 50 | 请运行`iframe_example.php`,得到的输出如下: 51 | ``` 52 | $> php iframe_example.php 53 | +++ enter page: [http://news.163.com] +++ 54 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] 55 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] 56 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] 57 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] 58 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] 59 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] 60 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] 61 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] 62 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] 63 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] 64 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] 65 | --- leave page: [http://news.163.com] --- 66 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] +++ 67 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] --- 68 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] +++ 69 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] --- 70 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] +++ 71 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] --- 72 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] +++ 73 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] --- 74 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] +++ 75 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] --- 76 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] +++ 77 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] --- 78 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] +++ 79 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] --- 80 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] +++ 81 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] --- 82 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] +++ 83 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] --- 84 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] +++ 85 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] --- 86 | Done! 87 | ``` 88 | 这和直接在`$arrJobs['link_rules']`指定爬取规则有什么不同呢?不同点如下: 89 | 1. 爬虫默认只爬取`<a>`标签,并将`<a>`标签的`href`属性作为要爬取的地址放入爬取队列中,而地址需要满足的规则就是由`$arrJobs['link_rules']`来决定的。而`<iframe>`标签原本并不是爬虫爬取的目标,并且其地址放在标签的`src`属性中; 90 | 2. 之前的例子中,要爬取的URL都是框架自动添加的,而这个例子中,要爬取的`<iframe>`地址是我们通过调用`$this->addAdditionalUrls($strSrc);`手动添加的。 91 | #### 1.4 爬取百度搜索结果 92 | 只要你对一个网站的网页结构有一定了解,你就能获取到你想要的所有信息,通过观察百度的搜索结果页,可以发现大多数搜索结果对应的DOM元素有这样的规律:`<h3><a href="我是结果链接地址">我是结果描述文字</a></h3>`,因此我们只要提取`<h3>`标签下的`<a>`标签的文本内容及`href`属性。 93 | 请运行`crawl_baidu_page.php`,这个程序会打印关键字`facebook`的搜索结果,得到的输出如下: 94 | ``` 95 | $> php crawl_baidu_page.php 96 | Facebook 97 | http://www.baidu.com/link?url=AtASutoPNIKCLMMz_CTeuhoe97gXt5N2JagWcZm0eUO-dvRdInYNWVhk7UVGiSNi 98 | 99 | Facebook_百度百科 100 | http://www.baidu.com/link?url=9D5oa_7E1ezSVwfx4hGVRtObcvmruI0UCR_cOTWEnj74p7AiWY_ESYXyvnyVHlXXHOYHh94UaZdiUpnGdS5qQa 101 | 102 | facebook - facebook官网_facebook注册 103 | http://www.baidu.com/link?url=3CmiG8W9me4-Xc0WkdDvsLT71hMN37s3o1M11T5VnbN-PFBnCgoCoXJ9-8iIPijf 104 | 105 | facebook中文网 - facebook中文官网 facebook网址登陆 106 | http://www.baidu.com/link?url=yJqsEl7U_elBeIsW4i108vaaFNTugzb8nWM8h9kXS0zDdKbBhWEUbcRm7ALY3rQF 107 | 108 | facebook吧_百度贴吧 109 | http://www.baidu.com/link?url=mWmpR1_PTCFQuJTmE_TarbSDvvHhhim4w15fQ8dipvJRwLY5twIb17hivcOcUGa-v_mbDS0Bfd4SVh7mjHz4mK 110 | 111 | facebook的最新相关信息 112 | http://www.baidu.com/link?url=ARSNH3CTzh9HyGL8VgmREUTI1JC8VNmJ3FPHJn32l_nHFnjKGWdbexnZmsQ7090JoTKVeRVYXlixLaxnjH6yDJt8ln7IJsoihEXPY9B7-m3 113 | 114 | Facebook 115 | http://www.baidu.com/link?url=G7GoImtCer71s9xQ0C5rlbCbGN6toa3fONlouj8nlHkIAJg3TrazM4FFw-9sjSzU 116 | 117 | Facebook[FB]_美股实时行情_新浪财经 118 | http://www.baidu.com/link?url=AtASutoPNIKCLMMz_CTeuh_n1s-MJ2bubaCG7gsoyh81Oj-9lYKqY4Wv8iYx8OuUhnaOL6R9M8WJTnc5qcrrF8s_vP2R9W0dURAaLW6zT5_ 119 | 120 | facebook中文网 - facebook官网注册! 121 | http://www.baidu.com/link?url=LDR4I-ZA2VI4YuVk-hLH_SvxNwcynRZJ6qtD1go0wc68Q08viPvLh3-wXvoW3ILS 122 | 123 | 为什么中国出不了Facebook和Twitter?-月光博客 124 | http://www.baidu.com/link?url=g7e5dKdgTPcIKOwybAPc7mk7omwz94u0xWuZ_9-nS1AGfdotydkziu7vqCRbrVK0T6rTCUSA3Al5mL4Rcl7YY_ 125 | ``` 126 | #### 1.5 为你的爬虫添加HTTP Headers 127 | 有时候某些网站必须要求登录用户才能查看内容,或者需要用户的Header里的某些信息校验通过后(例如Cookie),才能进行浏览。我们可以为爬虫添加HTTP Header,使得网页可以被爬取到。 128 | 请运行`crawl_with_headers.php`,这个程序会打印简历页面的标题,得到的输出如下: 129 | ``` 130 | $> php crawl_with_headers.php 131 | 【吴文博简历】 - 出纳简历 - 58同城 132 | ``` 133 | 如果你运行时出现以下错误: 134 | ``` 135 | 2016-08-07 16:33:17 Default.php Phpfetcher_Page_Default sel 116 Warning: $this->_dom is NULL! 136 | 2016-08-07 16:33:17 crawl_with_headers.php Phpfetcher_Page_Default sel 10 Warning: $this->_dom is NULL! 137 | ``` 138 | 请将文件中的`http_header`数组替换成你的浏览器访问这个网页时的Request HEADER,然后再尝试几次。注意:`Accept-Encoding`这个Header不要添加进去。 139 | ### 2 获取HTML页面中某个元素的所有信息 140 | 可以参考例子1.3以及1.4,实际上主要使用以下四样东西: 141 | 1. xpath,它是用来描述你要查找的HTML标签的语句,可以参考[http://www.w3school.com.cn/xpath/](http://www.w3school.com.cn/xpath/); 142 | 2. `find`方法,如所有例子中都有的`$page->find('xpath语句')`,调用这个方法后会得到一个数组,数组的内容就是所有满足要求的DOM元素的实例; 143 | 3. simplehtmldom的`plaintext`成员,例如例子中的`$res[$i]->plaintext`,保存着DOM元素包裹的文本内容; 144 | 4. simplehtmldom的`getAttribute`方法,例如例子`crawl_baidu_page.php`中的`$res[$i]->getAttribute('href')`,这样你就可以获得对应元素的属性值了。 145 | 基本上熟悉了上面四点,你就能较好地在Phpfetcher中操控DOM元素。 146 | Phpfetcher解析HTML时使用了simplehtmldom这个开源项目的内容,更多关于它的API可以参见[http://simplehtmldom.sourceforge.net/](http://simplehtmldom.sourceforge.net/),或者[Drupal API的描述](http://api.drupal.psu.edu/api/drupal/modules%21contrib%21simplehtmldom%21simplehtmldom%21simple_html_dom.php/cis7)。 147 | 你也可以直接修改本项目中的Phpfetcher/Page/Default.php以及Phpfetcher/Dom/SimpleHtmlDom.php文件,来更好地实现你的需求。 148 | ### 3 修改user-agent 149 | 之前出现过一个问题就是Phpfetcher由于使用了`phpfetcher`这个user-agent遭到屏蔽。关于什么是user-agent,大家可以搜一下,它可以看成是浏览器对自己的一种标识,例如火狐的UA中会有`Firefox`,Chrome的UA中会有`Chrome`,手机的浏览器中多数会带上`Mobile`字样等,如`Chrome Mobile`、`Safari Mobile`等; 150 | 当然UA并不是什么神圣、高深的东西,这个东西随便改。以前百度屏蔽360浏览器的请求时,360浏览器就可以通过修改自己的UA来绕过百度的UA检测(当然百度的屏蔽不止检测UA这一项) 151 | 如果大家在使用Phpfetcher过程中,发现有网页返回`Forbidden`等情况,就可以考虑修改一下UA。 152 | 直接修改文件`Phpfetcher/Dom/Default.php`中`'user_agent' = 'firefox'`这一行,将`firefox`替换成一个看起来更靠谱的UA。 153 | ``` 154 | protected $_arrDefaultConf = array( 155 | 'connect_timeout' => 10, 156 | 'max_redirs' => 10, 157 | 'return_transfer' => 1, //need this 158 | 'timeout' => 15, 159 | 'url' => NULL, 160 | 'user_agent' => 'firefox' 161 | ); 162 | ``` 163 | 如果替换UA后还是被屏蔽,那就有可能是其它原因了,例如是你的IP被屏蔽了等。 164 | ### 4 结语 165 | 这个框架还有很多不完善的地方,例如怎么使用多线程进行爬取、怎么样模拟登录状态进行爬取等。 166 | 但目前框架能适应大多数需求,暂时也比较简单易维护,短期内不会往更复杂的方向发展。 167 | 然而设计上的缺陷还是有不少的,例如有没有办法不修改源码去修改UA,去修改CURL的参数等,这些都是可以改进的。不过还是那句,在需求不强烈前,就不去进一步修改现有的结构了。 168 | 祝大家用得开心。 169 | ## English Description 170 | A PHP web crawler framework 171 | The origin of this framework please refer to: http://blog.reetsee.com/archives/366 172 | PHP need to be compiled with curl and mbstring extentions 173 | PHP5, PHP7 are supported 174 | ### 1 Examples 175 | Please run the following examples under `demo` directory, assume you want to run `hello_world.php`, use `php hellow_world.php` rather than `php demo/hello_world.php`. 176 | #### 1.1 Get Plaintext of `<title>` Tags 177 | Specify a target page, say `http://news.qq.com/a/20140927/026557.htm`, then get all the plaintext in the `<title>` tags to get the title of the page 178 | Please run the `single_page.php` example, and you will get the following output: 179 | ``` 180 | $> php single_page.php 181 | 王思聪回应遭警方调查:带弓箭不犯法 我是绿箭侠_新闻_腾讯网 182 | ``` 183 | #### 1.2 Get Titles of News from The Homepage of Tencent News 184 | Sepcify a seed page, say `http://news.qq.com`, the homepage of tencent news, follow the links of on this page, which satisfy the regrex `#news\.qq\.com/a/\d+/\d+\.htm$#`(e.g. `news.qq.com/a/20140927/026557.html`). The crawlers will inspect `<h1>` tags of all the pages(including the homepage `news.qq.com`), and print the plaintext inside the tags. 185 | Please run `multi_page.php`, and you will get the following output: 186 | ``` 187 | $> php multi_page.php 188 | 腾讯新闻——事实派 189 | 习近平访英前接受采访 谈及南海问题及足球等 190 | 习近平夫妇访英行程确定 将与女王共进私人午宴 191 | 李克强:让能干事的地方获得更多支持 192 | 环保部:我国40个城市已出现空气质量重污染 193 | 京津冀形成两个重污染带 太行燕山东南污染重 194 | 铁路部门回应“车票丢失被迫补票”:到站再退款 195 | 女大学生火车票遗失被要求补全票 铁路局:没做错 196 | 今日话题:丢失火车票要重买,老黄历何时改 197 | 外媒:两名藏僧被俄驱逐出境 198 | 广西北海民众聚众阻挠海事码头建设 16人被刑拘 199 | 河南一村民被政府人员土埋 官方称系邻里纠纷 200 | 餐厅用掺老鼠屎黄豆做咸菜 老板:都是中药材 201 | ``` 202 | #### 1.3 Get Attributes of HTML Tags + Add Additional Crawling URLs 203 | This example shows how to get attributes of HTML tags, and how to add URLs to be crawled after starting a crawling job. We will ask the crawlers to inspect all the `<iframe>` tags on page `news.163.com`, and make crawlers follow the links where `<iframe>` tags point to. 204 | Please run `iframe_example.php`, and you will get the following output: 205 | ``` 206 | $> php iframe_example.php 207 | +++ enter page: [http://news.163.com] +++ 208 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] 209 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] 210 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] 211 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] 212 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] 213 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] 214 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] 215 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] 216 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] 217 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] 218 | found iframe url=[http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] 219 | --- leave page: [http://news.163.com] --- 220 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] +++ 221 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=1] --- 222 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] +++ 223 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=1] --- 224 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] +++ 225 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=2] --- 226 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] +++ 227 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo540x60&location=2] --- 228 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] +++ 229 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=3] --- 230 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] +++ 231 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=4] --- 232 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] +++ 233 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x150&location=1] --- 234 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] +++ 235 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=5] --- 236 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] +++ 237 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=6] --- 238 | +++ enter page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] +++ 239 | --- leave page: [http://g.163.com/r?site=netease&affiliate=news&cat=homepage&type=logo300x250&location=7] --- 240 | Done! 241 | ``` 242 | What is the difference compared to setting crawling rules in `$arrJobs['link_rules']`. Answers below: 243 | 1. Crawlers only inspect `<a>` tags, and enqueue the value of the `href` attribute, which must satisfy rules listed in the `$arrJobsb['link_rules']` array, of the tag. While crawlers do not recognise `<iframe>` tags, plus the corresponding URL is pointed to by `src` attribute of the tags; 244 | 2. We tell the crawlers which links should be additinally followed during their run time using `$this->addAdditionalUrls($strSrc);`, rather than setting the rules before they start to work. 245 | #### 1.4 Crawling Baidu Search Engine Results 246 | As long as you know something about the structure of a web page, you will get anything you want from the page. After looking inside the HTML codes of searching-result pages from Baidu, we can find out that every result entry locates in DOMs which has the folloing format: `<h3><a href="I am link"><em>I am description</em></a></h3>`. Thus we only need to retrieve the plaintext and `href` attribute of `<a>` tags whose direct parents are `<h3>` tags. 247 | Please run `crawl_baidu_page.php`, which prints the searching results of 'facebook', and you will get the following output: 248 | ``` 249 | $> php crawl_baidu_page.php 250 | Facebook 251 | http://www.baidu.com/link?url=AtASutoPNIKCLMMz_CTeuhoe97gXt5N2JagWcZm0eUO-dvRdInYNWVhk7UVGiSNi 252 | 253 | Facebook_百度百科 254 | http://www.baidu.com/link?url=9D5oa_7E1ezSVwfx4hGVRtObcvmruI0UCR_cOTWEnj74p7AiWY_ESYXyvnyVHlXXHOYHh94UaZdiUpnGdS5qQa 255 | 256 | facebook - facebook官网_facebook注册 257 | http://www.baidu.com/link?url=3CmiG8W9me4-Xc0WkdDvsLT71hMN37s3o1M11T5VnbN-PFBnCgoCoXJ9-8iIPijf 258 | 259 | facebook中文网 - facebook中文官网 facebook网址登陆 260 | http://www.baidu.com/link?url=yJqsEl7U_elBeIsW4i108vaaFNTugzb8nWM8h9kXS0zDdKbBhWEUbcRm7ALY3rQF 261 | 262 | facebook吧_百度贴吧 263 | http://www.baidu.com/link?url=mWmpR1_PTCFQuJTmE_TarbSDvvHhhim4w15fQ8dipvJRwLY5twIb17hivcOcUGa-v_mbDS0Bfd4SVh7mjHz4mK 264 | 265 | facebook的最新相关信息 266 | http://www.baidu.com/link?url=ARSNH3CTzh9HyGL8VgmREUTI1JC8VNmJ3FPHJn32l_nHFnjKGWdbexnZmsQ7090JoTKVeRVYXlixLaxnjH6yDJt8ln7IJsoihEXPY9B7-m3 267 | 268 | Facebook 269 | http://www.baidu.com/link?url=G7GoImtCer71s9xQ0C5rlbCbGN6toa3fONlouj8nlHkIAJg3TrazM4FFw-9sjSzU 270 | 271 | Facebook[FB]_美股实时行情_新浪财经 272 | http://www.baidu.com/link?url=AtASutoPNIKCLMMz_CTeuh_n1s-MJ2bubaCG7gsoyh81Oj-9lYKqY4Wv8iYx8OuUhnaOL6R9M8WJTnc5qcrrF8s_vP2R9W0dURAaLW6zT5_ 273 | 274 | facebook中文网 - facebook官网注册! 275 | http://www.baidu.com/link?url=LDR4I-ZA2VI4YuVk-hLH_SvxNwcynRZJ6qtD1go0wc68Q08viPvLh3-wXvoW3ILS 276 | 277 | 为什么中国出不了Facebook和Twitter?-月光博客 278 | http://www.baidu.com/link?url=g7e5dKdgTPcIKOwybAPc7mk7omwz94u0xWuZ_9-nS1AGfdotydkziu7vqCRbrVK0T6rTCUSA3Al5mL4Rcl7YY_ 279 | ``` 280 | #### 1.5 Add HTTP Headers For Your Crawlers 281 | Sometimes websites only allow logined users to access, or sometimes websites display different content according to HTTP Request Headers. 282 | We can manually add Headers to our crawlers so that we can fetch the expected content. 283 | Please run `crawl_with_headers.php`, this demo will print the title of the page, which outputs: 284 | ``` 285 | $> php crawl_with_headers.php 286 | 【吴文博简历】 - 出纳简历 - 58同城 287 | ``` 288 | If you encouter the following errors: 289 | ``` 290 | 2016-08-07 16:33:17 Default.php Phpfetcher_Page_Default sel 116 Warning: $this->_dom is NULL! 291 | 2016-08-07 16:33:17 crawl_with_headers.php Phpfetcher_Page_Default sel 10 Warning: $this->_dom is NULL! 292 | ``` 293 | Please replace `http_header` array in the file with your own Request HEADER, then try a few times. Warning: `Accept-Encoding` Header should not be added. 294 | ### 2 Get All The Infomation of An HTML Tag 295 | Please use example 1.3 and 1.4 as references. Actually you mainly have to know the following four techniques: 296 | 1. xpath, it is used to describe what kind of HTML tag you are looking for, learn more about xpath: [www.w3schools.com/xsl/xpath_syntax.asp](www.w3schools.com/xsl/xpath_syntax.asp); 297 | 2. `find` method, all the examples above use `$page->find('xpath query')`, after calling this method you will get an array, which will contain all the qualified DOM elements; 298 | 3. Member `plaintext` of simplehtmldom, say `$res[$i]->plaintext`, which stores plain text that the DOM element wraps; 299 | 4. Method `getAttribute` of simplehtmldom, say `$res[$i]->getAttribute('href')` in the `crawl_baidu_page.php` example, from which you can get the attribute of the specified tag. 300 | Generally speaking, once you are familiar with the above four, you handle DOMs in Phpfetcher well. 301 | Phpfetcher parse HTMLs using simplehtmldom, an opensourced project, view it on [http://simplehtmldom.sourceforge.net/](http://simplehtmldom.sourceforge.net/) or learn more about its API with [Drupal API](http://api.drupal.psu.edu/api/drupal/modules%21contrib%21simplehtmldom%21simplehtmldom%21simple_html_dom.php/cis7) 302 | ### 3 Modify User-agent 303 | Previously I encoutered a problem that a website returned `Forbidden` like response due the forbidden user-agent of Phpfetcher, which I set to 'phpfetcher'. You can Google more about user-agent if you want. 304 | Usually speaking, web browers have their own user-agents, say Firefox may include `Firefox` in its user-agent, Chrome may include `Chrome`. Web browers on mobile phones may have `Mobile` in their user-agents, such as `Chrome Mobile`, `Safari Mobile`, etc. 305 | UA(user-agent) is not something holy that we can not touch, but something we can make it whatever we want. 306 | Some websites may forbid access from some web browers, thus when you encouter a weird `Forbidden` issue, consider modify the UA of Phpfetcher, it resides in the line `'user_agent' = 'firefox'` of file `Phpfetcher/Dom/Default.php`, replace the UA `firefox` with something more convincible. 307 | ``` 308 | protected $_arrDefaultConf = array( 309 | 'connect_timeout' => 10, 310 | 'max_redirs' => 10, 311 | 'return_transfer' => 1, //need this 312 | 'timeout' => 15, 313 | 'url' => NULL, 314 | 'user_agent' => 'firefox' 315 | ); 316 | ``` 317 | If you did not solve the problem, consider other reasons like IP forbidden. 318 | ### 4 Summary 319 | There are still lots of imperfect sides of Phpfetcher, including multi-threading, carwling with logged in states, etc. 320 | But that is probably what makes this framework easy to learn, to maintain. 321 | I will not deny that there are many designing problems despite of the lack of features, and I will push the project forward once more and more developers demand more and more necessary features. 322 | Until now, this framework meets most of the demands of its little user group. 323 | I hope you enjoy using Phpfetcher! 324 | 325 | ## TODO 326 | 1. 支持配置爬虫使用的HTTP头以及Cookies。 To support configuring the HTTP Headers and Cookies for the crawlers. 327 | 2. 支持协程/多线程调度爬虫。 To support coroutine/multi-thread crawlers. 328 | -------------------------------------------------------------------------------- /tests/simple_html_dom.php: -------------------------------------------------------------------------------- 1 | <?php 2 | /** 3 | * Website: http://sourceforge.net/projects/simplehtmldom/ 4 | * Additional projects that may be used: http://sourceforge.net/projects/debugobject/ 5 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6 | * Contributions by: 7 | * Yousuke Kumakura (Attribute filters) 8 | * Vadim Voituk (Negative indexes supports of "find" method) 9 | * Antcs (Constructor with automatically load contents either text or file/url) 10 | * 11 | * all affected sections have comments starting with "PaperG" 12 | * 13 | * Paperg - Added case insensitive testing of the value of the selector. 14 | * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately. 15 | * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source, 16 | * it will almost always be smaller by some amount. 17 | * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from. 18 | * but for most purposes, it's a really good estimation. 19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. 20 | * Allow the user to tell us how much they trust the html. 21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. 22 | * This allows for us to find tags based on the text they contain. 23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. 24 | * Paperg: added parse_charset so that we know about the character set of the source document. 25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the 26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. 27 | * 28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. 29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. 30 | * 31 | * Licensed under The MIT License 32 | * Redistributions of files must retain the above copyright notice. 33 | * 34 | * @author S.C. Chen <me578022@gmail.com> 35 | * @author John Schlick 36 | * @author Rus Carroll 37 | * @version 1.5 ($Rev: 210 $) 38 | * @package PlaceLocalInclude 39 | * @subpackage simple_html_dom 40 | */ 41 | 42 | /** 43 | * All of the Defines for the classes below. 44 | * @author S.C. Chen <me578022@gmail.com> 45 | */ 46 | define('HDOM_TYPE_ELEMENT', 1); 47 | define('HDOM_TYPE_COMMENT', 2); 48 | define('HDOM_TYPE_TEXT', 3); 49 | define('HDOM_TYPE_ENDTAG', 4); 50 | define('HDOM_TYPE_ROOT', 5); 51 | define('HDOM_TYPE_UNKNOWN', 6); 52 | define('HDOM_QUOTE_DOUBLE', 0); 53 | define('HDOM_QUOTE_SINGLE', 1); 54 | define('HDOM_QUOTE_NO', 3); 55 | define('HDOM_INFO_BEGIN', 0); 56 | define('HDOM_INFO_END', 1); 57 | define('HDOM_INFO_QUOTE', 2); 58 | define('HDOM_INFO_SPACE', 3); 59 | define('HDOM_INFO_TEXT', 4); 60 | define('HDOM_INFO_INNER', 5); 61 | define('HDOM_INFO_OUTER', 6); 62 | define('HDOM_INFO_ENDSPACE',7); 63 | define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 64 | define('DEFAULT_BR_TEXT', "\r\n"); 65 | define('DEFAULT_SPAN_TEXT', " "); 66 | define('MAX_FILE_SIZE', 600000); 67 | // helper functions 68 | // ----------------------------------------------------------------------------- 69 | // get html dom from file 70 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. 71 | function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 72 | { 73 | // We DO force the tags to be terminated. 74 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 75 | // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. 76 | $contents = file_get_contents($url, $use_include_path, $context, $offset); 77 | // Paperg - use our own mechanism for getting the contents as we want to control the timeout. 78 | //$contents = retrieve_url_contents($url); 79 | if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) 80 | { 81 | return false; 82 | } 83 | // The second parameter can force the selectors to all be lowercase. 84 | $dom->load($contents, $lowercase, $stripRN); 85 | return $dom; 86 | } 87 | 88 | // get html dom from string 89 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 90 | { 91 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 92 | if (empty($str) || strlen($str) > MAX_FILE_SIZE) 93 | { 94 | $dom->clear(); 95 | return false; 96 | } 97 | $dom->load($str, $lowercase, $stripRN); 98 | return $dom; 99 | } 100 | 101 | // dump html dom tree 102 | function dump_html_tree($node, $show_attr=true, $deep=0) 103 | { 104 | $node->dump($node); 105 | } 106 | 107 | 108 | /** 109 | * simple html dom node 110 | * PaperG - added ability for "find" routine to lowercase the value of the selector. 111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index 112 | * 113 | * @package PlaceLocalInclude 114 | */ 115 | class simple_html_dom_node 116 | { 117 | public $nodetype = HDOM_TYPE_TEXT; 118 | public $tag = 'text'; 119 | public $attr = array(); 120 | public $children = array(); 121 | public $nodes = array(); 122 | public $parent = null; 123 | // The "info" array - see HDOM_INFO_... for what each element contains. 124 | public $_ = array(); 125 | public $tag_start = 0; 126 | private $dom = null; 127 | 128 | function __construct($dom) 129 | { 130 | $this->dom = $dom; 131 | $dom->nodes[] = $this; 132 | } 133 | 134 | function __destruct() 135 | { 136 | $this->clear(); 137 | } 138 | 139 | function __toString() 140 | { 141 | return $this->outertext(); 142 | } 143 | 144 | // clean up memory due to php5 circular references memory leak... 145 | function clear() 146 | { 147 | $this->dom = null; 148 | $this->nodes = null; 149 | $this->parent = null; 150 | $this->children = null; 151 | } 152 | 153 | // dump node's tree 154 | function dump($show_attr=true, $deep=0) 155 | { 156 | $lead = str_repeat(' ', $deep); 157 | 158 | echo $lead.$this->tag; 159 | if ($show_attr && count($this->attr)>0) 160 | { 161 | echo '('; 162 | foreach ($this->attr as $k=>$v) 163 | echo "[$k]=>\"".$this->$k.'", '; 164 | echo ')'; 165 | } 166 | echo "\n"; 167 | 168 | if ($this->nodes) 169 | { 170 | foreach ($this->nodes as $c) 171 | { 172 | $c->dump($show_attr, $deep+1); 173 | } 174 | } 175 | } 176 | 177 | 178 | // Debugging function to dump a single dom node with a bunch of information about it. 179 | function dump_node($echo=true) 180 | { 181 | 182 | $string = $this->tag; 183 | if (count($this->attr)>0) 184 | { 185 | $string .= '('; 186 | foreach ($this->attr as $k=>$v) 187 | { 188 | $string .= "[$k]=>\"".$this->$k.'", '; 189 | } 190 | $string .= ')'; 191 | } 192 | if (count($this->_)>0) 193 | { 194 | $string .= ' $_ ('; 195 | foreach ($this->_ as $k=>$v) 196 | { 197 | if (is_array($v)) 198 | { 199 | $string .= "[$k]=>("; 200 | foreach ($v as $k2=>$v2) 201 | { 202 | $string .= "[$k2]=>\"".$v2.'", '; 203 | } 204 | $string .= ")"; 205 | } else { 206 | $string .= "[$k]=>\"".$v.'", '; 207 | } 208 | } 209 | $string .= ")"; 210 | } 211 | 212 | if (isset($this->text)) 213 | { 214 | $string .= " text: (" . $this->text . ")"; 215 | } 216 | 217 | $string .= " HDOM_INNER_INFO: '"; 218 | if (isset($node->_[HDOM_INFO_INNER])) 219 | { 220 | $string .= $node->_[HDOM_INFO_INNER] . "'"; 221 | } 222 | else 223 | { 224 | $string .= ' NULL '; 225 | } 226 | 227 | $string .= " children: " . count($this->children); 228 | $string .= " nodes: " . count($this->nodes); 229 | $string .= " tag_start: " . $this->tag_start; 230 | $string .= "\n"; 231 | 232 | if ($echo) 233 | { 234 | echo $string; 235 | return; 236 | } 237 | else 238 | { 239 | return $string; 240 | } 241 | } 242 | 243 | // returns the parent of node 244 | // If a node is passed in, it will reset the parent of the current node to that one. 245 | function parent($parent=null) 246 | { 247 | // I am SURE that this doesn't work properly. 248 | // It fails to unset the current node from it's current parents nodes or children list first. 249 | if ($parent !== null) 250 | { 251 | $this->parent = $parent; 252 | $this->parent->nodes[] = $this; 253 | $this->parent->children[] = $this; 254 | } 255 | 256 | return $this->parent; 257 | } 258 | 259 | // verify that node has children 260 | function has_child() 261 | { 262 | return !empty($this->children); 263 | } 264 | 265 | // returns children of node 266 | function children($idx=-1) 267 | { 268 | if ($idx===-1) 269 | { 270 | return $this->children; 271 | } 272 | if (isset($this->children[$idx])) 273 | { 274 | return $this->children[$idx]; 275 | } 276 | return null; 277 | } 278 | 279 | // returns the first child of node 280 | function first_child() 281 | { 282 | if (count($this->children)>0) 283 | { 284 | return $this->children[0]; 285 | } 286 | return null; 287 | } 288 | 289 | // returns the last child of node 290 | function last_child() 291 | { 292 | if (($count=count($this->children))>0) 293 | { 294 | return $this->children[$count-1]; 295 | } 296 | return null; 297 | } 298 | 299 | // returns the next sibling of node 300 | function next_sibling() 301 | { 302 | if ($this->parent===null) 303 | { 304 | return null; 305 | } 306 | 307 | $idx = 0; 308 | $count = count($this->parent->children); 309 | while ($idx<$count && $this!==$this->parent->children[$idx]) 310 | { 311 | ++$idx; 312 | } 313 | if (++$idx>=$count) 314 | { 315 | return null; 316 | } 317 | return $this->parent->children[$idx]; 318 | } 319 | 320 | // returns the previous sibling of node 321 | function prev_sibling() 322 | { 323 | if ($this->parent===null) return null; 324 | $idx = 0; 325 | $count = count($this->parent->children); 326 | while ($idx<$count && $this!==$this->parent->children[$idx]) 327 | ++$idx; 328 | if (--$idx<0) return null; 329 | return $this->parent->children[$idx]; 330 | } 331 | 332 | // function to locate a specific ancestor tag in the path to the root. 333 | function find_ancestor_tag($tag) 334 | { 335 | global $debug_object; 336 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 337 | 338 | // Start by including ourselves in the comparison. 339 | $returnDom = $this; 340 | 341 | while (!is_null($returnDom)) 342 | { 343 | if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); } 344 | 345 | if ($returnDom->tag == $tag) 346 | { 347 | break; 348 | } 349 | $returnDom = $returnDom->parent; 350 | } 351 | return $returnDom; 352 | } 353 | 354 | // get dom node's inner html 355 | function innertext() 356 | { 357 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 358 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 359 | 360 | $ret = ''; 361 | foreach ($this->nodes as $n) 362 | $ret .= $n->outertext(); 363 | return $ret; 364 | } 365 | 366 | // get dom node's outer text (with tag) 367 | function outertext() 368 | { 369 | global $debug_object; 370 | if (is_object($debug_object)) 371 | { 372 | $text = ''; 373 | if ($this->tag == 'text') 374 | { 375 | if (!empty($this->text)) 376 | { 377 | $text = " with text: " . $this->text; 378 | } 379 | } 380 | $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 381 | } 382 | 383 | if ($this->tag==='root') return $this->innertext(); 384 | 385 | // trigger callback 386 | if ($this->dom && $this->dom->callback!==null) 387 | { 388 | call_user_func_array($this->dom->callback, array($this)); 389 | } 390 | 391 | if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; 392 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 393 | 394 | // render begin tag 395 | if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) 396 | { 397 | $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 398 | } else { 399 | $ret = ""; 400 | } 401 | 402 | // render inner text 403 | if (isset($this->_[HDOM_INFO_INNER])) 404 | { 405 | // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added. 406 | if ($this->tag != "br") 407 | { 408 | $ret .= $this->_[HDOM_INFO_INNER]; 409 | } 410 | } else { 411 | if ($this->nodes) 412 | { 413 | foreach ($this->nodes as $n) 414 | { 415 | $ret .= $this->convert_text($n->outertext()); 416 | } 417 | } 418 | } 419 | 420 | // render end tag 421 | if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) 422 | $ret .= '</'.$this->tag.'>'; 423 | return $ret; 424 | } 425 | 426 | // get dom node's plain text 427 | function text() 428 | { 429 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 430 | switch ($this->nodetype) 431 | { 432 | case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 433 | case HDOM_TYPE_COMMENT: return ''; 434 | case HDOM_TYPE_UNKNOWN: return ''; 435 | } 436 | if (strcasecmp($this->tag, 'script')===0) return ''; 437 | if (strcasecmp($this->tag, 'style')===0) return ''; 438 | 439 | $ret = ''; 440 | // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL. 441 | // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening. 442 | // WHY is this happening? 443 | if (!is_null($this->nodes)) 444 | { 445 | foreach ($this->nodes as $n) 446 | { 447 | $ret .= $this->convert_text($n->text()); 448 | } 449 | 450 | // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. 451 | if ($this->tag == "span") 452 | { 453 | $ret .= $this->dom->default_span_text; 454 | } 455 | 456 | 457 | } 458 | return $ret; 459 | } 460 | 461 | function xmltext() 462 | { 463 | $ret = $this->innertext(); 464 | $ret = str_ireplace('<![CDATA[', '', $ret); 465 | $ret = str_replace(']]>', '', $ret); 466 | return $ret; 467 | } 468 | 469 | // build node's text with tag 470 | function makeup() 471 | { 472 | // text, comment, unknown 473 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 474 | 475 | $ret = '<'.$this->tag; 476 | $i = -1; 477 | 478 | foreach ($this->attr as $key=>$val) 479 | { 480 | ++$i; 481 | 482 | // skip removed attribute 483 | if ($val===null || $val===false) 484 | continue; 485 | 486 | $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 487 | //no value attr: nowrap, checked selected... 488 | if ($val===true) 489 | $ret .= $key; 490 | else { 491 | switch ($this->_[HDOM_INFO_QUOTE][$i]) 492 | { 493 | case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 494 | case HDOM_QUOTE_SINGLE: $quote = '\''; break; 495 | default: $quote = ''; 496 | } 497 | $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; 498 | } 499 | } 500 | $ret = $this->dom->restore_noise($ret); 501 | return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 502 | } 503 | 504 | // find elements by css selector 505 | //PaperG - added ability for find to lowercase the value of the selector. 506 | function find($selector, $idx=null, $lowercase=false) 507 | { 508 | $selectors = $this->parse_selector($selector); 509 | if (($count=count($selectors))===0) return array(); 510 | $found_keys = array(); 511 | 512 | // find each selector 513 | for ($c=0; $c<$count; ++$c) 514 | { 515 | // The change on the below line was documented on the sourceforge code tracker id 2788009 516 | // used to be: if (($levle=count($selectors[0]))===0) return array(); 517 | if (($levle=count($selectors[$c]))===0) return array(); 518 | if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); 519 | 520 | $head = array($this->_[HDOM_INFO_BEGIN]=>1); 521 | 522 | // handle descendant selectors, no recursive! 523 | for ($l=0; $l<$levle; ++$l) 524 | { 525 | $ret = array(); 526 | foreach ($head as $k=>$v) 527 | { 528 | $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; 529 | //PaperG - Pass this optional parameter on to the seek function. 530 | $n->seek($selectors[$c][$l], $ret, $lowercase); 531 | } 532 | $head = $ret; 533 | } 534 | 535 | foreach ($head as $k=>$v) 536 | { 537 | if (!isset($found_keys[$k])) 538 | { 539 | $found_keys[$k] = 1; 540 | } 541 | } 542 | } 543 | 544 | // sort keys 545 | ksort($found_keys); 546 | 547 | $found = array(); 548 | foreach ($found_keys as $k=>$v) 549 | $found[] = $this->dom->nodes[$k]; 550 | 551 | // return nth-element or array 552 | if (is_null($idx)) return $found; 553 | else if ($idx<0) $idx = count($found) + $idx; 554 | return (isset($found[$idx])) ? $found[$idx] : null; 555 | } 556 | 557 | // seek for given conditions 558 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector. 559 | protected function seek($selector, &$ret, $lowercase=false) 560 | { 561 | global $debug_object; 562 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 563 | 564 | list($tag, $key, $val, $exp, $no_key) = $selector; 565 | 566 | // xpath index 567 | if ($tag && $key && is_numeric($key)) 568 | { 569 | $count = 0; 570 | foreach ($this->children as $c) 571 | { 572 | if ($tag==='*' || $tag===$c->tag) { 573 | if (++$count==$key) { 574 | $ret[$c->_[HDOM_INFO_BEGIN]] = 1; 575 | return; 576 | } 577 | } 578 | } 579 | return; 580 | } 581 | 582 | $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 583 | if ($end==0) { 584 | $parent = $this->parent; 585 | while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { 586 | $end -= 1; 587 | $parent = $parent->parent; 588 | } 589 | $end += $parent->_[HDOM_INFO_END]; 590 | } 591 | 592 | for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { 593 | $node = $this->dom->nodes[$i]; 594 | 595 | $pass = true; 596 | 597 | if ($tag==='*' && !$key) { 598 | if (in_array($node, $this->children, true)) 599 | $ret[$i] = 1; 600 | continue; 601 | } 602 | 603 | // compare tag 604 | if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} 605 | // compare key 606 | if ($pass && $key) { 607 | if ($no_key) { 608 | if (isset($node->attr[$key])) $pass=false; 609 | } else { 610 | if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false; 611 | } 612 | } 613 | // compare value 614 | if ($pass && $key && $val && $val!=='*') { 615 | // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right? 616 | if ($key == "plaintext") { 617 | // $node->plaintext actually returns $node->text(); 618 | $nodeKeyValue = $node->text(); 619 | } else { 620 | // this is a normal search, we want the value of that attribute of the tag. 621 | $nodeKeyValue = $node->attr[$key]; 622 | } 623 | if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 624 | 625 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 626 | if ($lowercase) { 627 | $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue)); 628 | } else { 629 | $check = $this->match($exp, $val, $nodeKeyValue); 630 | } 631 | if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));} 632 | 633 | // handle multiple class 634 | if (!$check && strcasecmp($key, 'class')===0) { 635 | foreach (explode(' ',$node->attr[$key]) as $k) { 636 | // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form. 637 | if (!empty($k)) { 638 | if ($lowercase) { 639 | $check = $this->match($exp, strtolower($val), strtolower($k)); 640 | } else { 641 | $check = $this->match($exp, $val, $k); 642 | } 643 | if ($check) break; 644 | } 645 | } 646 | } 647 | if (!$check) $pass = false; 648 | } 649 | if ($pass) $ret[$i] = 1; 650 | unset($node); 651 | } 652 | // It's passed by reference so this is actually what this function returns. 653 | if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);} 654 | } 655 | 656 | protected function match($exp, $pattern, $value) { 657 | global $debug_object; 658 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 659 | 660 | switch ($exp) { 661 | case '=': 662 | return ($value===$pattern); 663 | case '!=': 664 | return ($value!==$pattern); 665 | case '^=': 666 | return preg_match("/^".preg_quote($pattern,'/')."/", $value); 667 | case '$=': 668 | return preg_match("/".preg_quote($pattern,'/')."$/", $value); 669 | case '*=': 670 | if ($pattern[0]=='/') { 671 | return preg_match($pattern, $value); 672 | } 673 | return preg_match("/".$pattern."/i", $value); 674 | } 675 | return false; 676 | } 677 | 678 | protected function parse_selector($selector_string) { 679 | global $debug_object; 680 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 681 | 682 | // pattern of CSS selectors, modified from mootools 683 | // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. 684 | // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check. 685 | // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured. 686 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. 687 | // farther study is required to determine of this should be documented or removed. 688 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 689 | $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 690 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 691 | if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} 692 | 693 | $selectors = array(); 694 | $result = array(); 695 | //print_r($matches); 696 | 697 | foreach ($matches as $m) { 698 | $m[0] = trim($m[0]); 699 | if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; 700 | // for browser generated xpath 701 | if ($m[1]==='tbody') continue; 702 | 703 | list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); 704 | if (!empty($m[2])) {$key='id'; $val=$m[2];} 705 | if (!empty($m[3])) {$key='class'; $val=$m[3];} 706 | if (!empty($m[4])) {$key=$m[4];} 707 | if (!empty($m[5])) {$exp=$m[5];} 708 | if (!empty($m[6])) {$val=$m[6];} 709 | 710 | // convert to lowercase 711 | if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} 712 | //elements that do NOT have the specified attribute 713 | if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} 714 | 715 | $result[] = array($tag, $key, $val, $exp, $no_key); 716 | if (trim($m[7])===',') { 717 | $selectors[] = $result; 718 | $result = array(); 719 | } 720 | } 721 | if (count($result)>0) 722 | $selectors[] = $result; 723 | return $selectors; 724 | } 725 | 726 | function __get($name) 727 | { 728 | if (isset($this->attr[$name])) 729 | { 730 | return $this->convert_text($this->attr[$name]); 731 | } 732 | switch ($name) 733 | { 734 | case 'outertext': return $this->outertext(); 735 | case 'innertext': return $this->innertext(); 736 | case 'plaintext': return $this->text(); 737 | case 'xmltext': return $this->xmltext(); 738 | default: return array_key_exists($name, $this->attr); 739 | } 740 | } 741 | 742 | function __set($name, $value) 743 | { 744 | global $debug_object; 745 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 746 | 747 | switch ($name) 748 | { 749 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 750 | case 'innertext': 751 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 752 | return $this->_[HDOM_INFO_INNER] = $value; 753 | } 754 | if (!isset($this->attr[$name])) 755 | { 756 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 757 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 758 | } 759 | $this->attr[$name] = $value; 760 | } 761 | 762 | function __isset($name) 763 | { 764 | switch ($name) 765 | { 766 | case 'outertext': return true; 767 | case 'innertext': return true; 768 | case 'plaintext': return true; 769 | } 770 | //no value attr: nowrap, checked selected... 771 | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 772 | } 773 | 774 | function __unset($name) { 775 | if (isset($this->attr[$name])) 776 | unset($this->attr[$name]); 777 | } 778 | 779 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same. 780 | function convert_text($text) 781 | { 782 | global $debug_object; 783 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 784 | 785 | $converted_text = $text; 786 | 787 | $sourceCharset = ""; 788 | $targetCharset = ""; 789 | 790 | if ($this->dom) 791 | { 792 | $sourceCharset = strtoupper($this->dom->_charset); 793 | $targetCharset = strtoupper($this->dom->_target_charset); 794 | } 795 | if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 796 | 797 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 798 | { 799 | // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 800 | if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) 801 | { 802 | $converted_text = $text; 803 | } 804 | else 805 | { 806 | $converted_text = iconv($sourceCharset, $targetCharset, $text); 807 | } 808 | } 809 | 810 | // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 811 | if ($targetCharset == 'UTF-8') 812 | { 813 | if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") 814 | { 815 | $converted_text = substr($converted_text, 3); 816 | } 817 | if (substr($converted_text, -3) == "\xef\xbb\xbf") 818 | { 819 | $converted_text = substr($converted_text, 0, -3); 820 | } 821 | } 822 | 823 | return $converted_text; 824 | } 825 | 826 | /** 827 | * Returns true if $string is valid UTF-8 and false otherwise. 828 | * 829 | * @param mixed $str String to be tested 830 | * @return boolean 831 | */ 832 | static function is_utf8($str) 833 | { 834 | $c=0; $b=0; 835 | $bits=0; 836 | $len=strlen($str); 837 | for($i=0; $i<$len; $i++) 838 | { 839 | $c=ord($str[$i]); 840 | if($c > 128) 841 | { 842 | if(($c >= 254)) return false; 843 | elseif($c >= 252) $bits=6; 844 | elseif($c >= 248) $bits=5; 845 | elseif($c >= 240) $bits=4; 846 | elseif($c >= 224) $bits=3; 847 | elseif($c >= 192) $bits=2; 848 | else return false; 849 | if(($i+$bits) > $len) return false; 850 | while($bits > 1) 851 | { 852 | $i++; 853 | $b=ord($str[$i]); 854 | if($b < 128 || $b > 191) return false; 855 | $bits--; 856 | } 857 | } 858 | } 859 | return true; 860 | } 861 | /* 862 | function is_utf8($string) 863 | { 864 | //this is buggy 865 | return (utf8_encode(utf8_decode($string)) == $string); 866 | } 867 | */ 868 | 869 | /** 870 | * Function to try a few tricks to determine the displayed size of an img on the page. 871 | * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. 872 | * 873 | * @author John Schlick 874 | * @version April 19 2012 875 | * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. 876 | */ 877 | function get_display_size() 878 | { 879 | global $debug_object; 880 | 881 | $width = -1; 882 | $height = -1; 883 | 884 | if ($this->tag !== 'img') 885 | { 886 | return false; 887 | } 888 | 889 | // See if there is aheight or width attribute in the tag itself. 890 | if (isset($this->attr['width'])) 891 | { 892 | $width = $this->attr['width']; 893 | } 894 | 895 | if (isset($this->attr['height'])) 896 | { 897 | $height = $this->attr['height']; 898 | } 899 | 900 | // Now look for an inline style. 901 | if (isset($this->attr['style'])) 902 | { 903 | // Thanks to user gnarf from stackoverflow for this regular expression. 904 | $attributes = array(); 905 | preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); 906 | foreach ($matches as $match) { 907 | $attributes[$match[1]] = $match[2]; 908 | } 909 | 910 | // If there is a width in the style attributes: 911 | if (isset($attributes['width']) && $width == -1) 912 | { 913 | // check that the last two characters are px (pixels) 914 | if (strtolower(substr($attributes['width'], -2)) == 'px') 915 | { 916 | $proposed_width = substr($attributes['width'], 0, -2); 917 | // Now make sure that it's an integer and not something stupid. 918 | if (filter_var($proposed_width, FILTER_VALIDATE_INT)) 919 | { 920 | $width = $proposed_width; 921 | } 922 | } 923 | } 924 | 925 | // If there is a width in the style attributes: 926 | if (isset($attributes['height']) && $height == -1) 927 | { 928 | // check that the last two characters are px (pixels) 929 | if (strtolower(substr($attributes['height'], -2)) == 'px') 930 | { 931 | $proposed_height = substr($attributes['height'], 0, -2); 932 | // Now make sure that it's an integer and not something stupid. 933 | if (filter_var($proposed_height, FILTER_VALIDATE_INT)) 934 | { 935 | $height = $proposed_height; 936 | } 937 | } 938 | } 939 | 940 | } 941 | 942 | // Future enhancement: 943 | // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. 944 | 945 | // Far future enhancement 946 | // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width 947 | // Note that in this case, the class or id will have the img subselector for it to apply to the image. 948 | 949 | // ridiculously far future development 950 | // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. 951 | 952 | $result = array('height' => $height, 953 | 'width' => $width); 954 | return $result; 955 | } 956 | 957 | // camel naming conventions 958 | function getAllAttributes() {return $this->attr;} 959 | function getAttribute($name) {return $this->__get($name);} 960 | function setAttribute($name, $value) {$this->__set($name, $value);} 961 | function hasAttribute($name) {return $this->__isset($name);} 962 | function removeAttribute($name) {$this->__set($name, null);} 963 | function getElementById($id) {return $this->find("#$id", 0);} 964 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 965 | function getElementByTagName($name) {return $this->find($name, 0);} 966 | function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} 967 | function parentNode() {return $this->parent();} 968 | function childNodes($idx=-1) {return $this->children($idx);} 969 | function firstChild() {return $this->first_child();} 970 | function lastChild() {return $this->last_child();} 971 | function nextSibling() {return $this->next_sibling();} 972 | function previousSibling() {return $this->prev_sibling();} 973 | function hasChildNodes() {return $this->has_child();} 974 | function nodeName() {return $this->tag;} 975 | function appendChild($node) {$node->parent($this); return $node;} 976 | 977 | } 978 | 979 | /** 980 | * simple html dom parser 981 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. 982 | * Paperg - change $size from protected to public so we can easily access it 983 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. 984 | * 985 | * @package PlaceLocalInclude 986 | */ 987 | class simple_html_dom 988 | { 989 | public $root = null; 990 | public $nodes = array(); 991 | public $callback = null; 992 | public $lowercase = false; 993 | // Used to keep track of how large the text was when we started. 994 | public $original_size; 995 | public $size; 996 | protected $pos; 997 | protected $doc; 998 | protected $char; 999 | protected $cursor; 1000 | protected $parent; 1001 | protected $noise = array(); 1002 | protected $token_blank = " \t\r\n"; 1003 | protected $token_equal = ' =/>'; 1004 | protected $token_slash = " />\r\n\t"; 1005 | protected $token_attr = ' >'; 1006 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 1007 | public $_charset = ''; 1008 | public $_target_charset = ''; 1009 | protected $default_br_text = ""; 1010 | public $default_span_text = ""; 1011 | 1012 | // use isset instead of in_array, performance boost about 30%... 1013 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); 1014 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); 1015 | // Known sourceforge issue #2977341 1016 | // B tags that are not closed cause us to return everything to the end of the document. 1017 | protected $optional_closing_tags = array( 1018 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), 1019 | 'th'=>array('th'=>1), 1020 | 'td'=>array('td'=>1), 1021 | 'li'=>array('li'=>1), 1022 | 'dt'=>array('dt'=>1, 'dd'=>1), 1023 | 'dd'=>array('dd'=>1, 'dt'=>1), 1024 | 'dl'=>array('dd'=>1, 'dt'=>1), 1025 | 'p'=>array('p'=>1), 1026 | 'nobr'=>array('nobr'=>1), 1027 | 'b'=>array('b'=>1), 1028 | 'option'=>array('option'=>1), 1029 | ); 1030 | 1031 | function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1032 | { 1033 | if ($str) 1034 | { 1035 | if (preg_match("/^http:\/\//i",$str) || is_file($str)) 1036 | { 1037 | $this->load_file($str); 1038 | } 1039 | else 1040 | { 1041 | $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1042 | } 1043 | } 1044 | // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. 1045 | if (!$forceTagsClosed) { 1046 | $this->optional_closing_array=array(); 1047 | } 1048 | $this->_target_charset = $target_charset; 1049 | } 1050 | 1051 | function __destruct() 1052 | { 1053 | $this->clear(); 1054 | } 1055 | 1056 | // load html from string 1057 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1058 | { 1059 | global $debug_object; 1060 | 1061 | // prepare 1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1063 | // strip out cdata 1064 | $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1065 | // strip out comments 1066 | $this->remove_noise("'<!--(.*?)-->'is"); 1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1068 | // Script tags removal now preceeds style tag removal. 1069 | // strip out <script> tags 1070 | $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1071 | $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1072 | // strip out <style> tags 1073 | $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1074 | $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1075 | // strip out preformatted tags 1076 | $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1077 | // strip out server side scripts 1078 | $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1079 | // strip smarty scripts 1080 | $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1081 | 1082 | // parsing 1083 | while ($this->parse()); 1084 | // end 1085 | $this->root->_[HDOM_INFO_END] = $this->cursor; 1086 | $this->parse_charset(); 1087 | 1088 | // make load function chainable 1089 | return $this; 1090 | 1091 | } 1092 | 1093 | // load html from file 1094 | function load_file() 1095 | { 1096 | $args = func_get_args(); 1097 | $this->load(call_user_func_array('file_get_contents', $args), true); 1098 | // Throw an error if we can't properly load the dom. 1099 | if (($error=error_get_last())!==null) { 1100 | $this->clear(); 1101 | return false; 1102 | } 1103 | } 1104 | 1105 | // set callback function 1106 | function set_callback($function_name) 1107 | { 1108 | $this->callback = $function_name; 1109 | } 1110 | 1111 | // remove callback function 1112 | function remove_callback() 1113 | { 1114 | $this->callback = null; 1115 | } 1116 | 1117 | // save dom as string 1118 | function save($filepath='') 1119 | { 1120 | $ret = $this->root->innertext(); 1121 | if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX); 1122 | return $ret; 1123 | } 1124 | 1125 | // find dom node by css selector 1126 | // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. 1127 | function find($selector, $idx=null, $lowercase=false) 1128 | { 1129 | return $this->root->find($selector, $idx, $lowercase); 1130 | } 1131 | 1132 | // clean up memory due to php5 circular references memory leak... 1133 | function clear() 1134 | { 1135 | foreach ($this->nodes as $n) {$n->clear(); $n = null;} 1136 | // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear. 1137 | if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;} 1138 | if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);} 1139 | if (isset($this->root)) {$this->root->clear(); unset($this->root);} 1140 | unset($this->doc); 1141 | unset($this->noise); 1142 | } 1143 | 1144 | function dump($show_attr=true) 1145 | { 1146 | $this->root->dump($show_attr); 1147 | } 1148 | 1149 | // prepare HTML data and init everything 1150 | protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1151 | { 1152 | $this->clear(); 1153 | 1154 | // set the length of content before we do anything to it. 1155 | $this->size = strlen($str); 1156 | // Save the original size of the html that we got in. It might be useful to someone. 1157 | $this->original_size = $this->size; 1158 | 1159 | //before we save the string as the doc... strip out the \r \n's if we are told to. 1160 | if ($stripRN) { 1161 | $str = str_replace("\r", " ", $str); 1162 | $str = str_replace("\n", " ", $str); 1163 | 1164 | // set the length of content since we have changed it. 1165 | $this->size = strlen($str); 1166 | } 1167 | 1168 | $this->doc = $str; 1169 | $this->pos = 0; 1170 | $this->cursor = 1; 1171 | $this->noise = array(); 1172 | $this->nodes = array(); 1173 | $this->lowercase = $lowercase; 1174 | $this->default_br_text = $defaultBRText; 1175 | $this->default_span_text = $defaultSpanText; 1176 | $this->root = new simple_html_dom_node($this); 1177 | $this->root->tag = 'root'; 1178 | $this->root->_[HDOM_INFO_BEGIN] = -1; 1179 | $this->root->nodetype = HDOM_TYPE_ROOT; 1180 | $this->parent = $this->root; 1181 | if ($this->size>0) $this->char = $this->doc[0]; 1182 | } 1183 | 1184 | // parse html content 1185 | protected function parse() 1186 | { 1187 | if (($s = $this->copy_until_char('<'))==='') 1188 | { 1189 | return $this->read_tag(); 1190 | } 1191 | 1192 | // text 1193 | $node = new simple_html_dom_node($this); 1194 | ++$this->cursor; 1195 | $node->_[HDOM_INFO_TEXT] = $s; 1196 | $this->link_nodes($node, false); 1197 | return true; 1198 | } 1199 | 1200 | // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. 1201 | // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec 1202 | // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. 1203 | protected function parse_charset() 1204 | { 1205 | global $debug_object; 1206 | 1207 | $charset = null; 1208 | 1209 | if (function_exists('get_last_retrieve_url_contents_content_type')) 1210 | { 1211 | $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1212 | $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1213 | if ($success) 1214 | { 1215 | $charset = $matches[1]; 1216 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);} 1217 | } 1218 | 1219 | } 1220 | 1221 | if (empty($charset)) 1222 | { 1223 | $el = $this->root->find('meta[http-equiv=Content-Type]',0, true); 1224 | if (!empty($el)) 1225 | { 1226 | $fullvalue = $el->content; 1227 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);} 1228 | 1229 | if (!empty($fullvalue)) 1230 | { 1231 | $success = preg_match('/charset=(.+)/i', $fullvalue, $matches); 1232 | if ($success) 1233 | { 1234 | $charset = $matches[1]; 1235 | } 1236 | else 1237 | { 1238 | // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1239 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1240 | $charset = 'ISO-8859-1'; 1241 | } 1242 | } 1243 | } 1244 | } 1245 | 1246 | // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1247 | if (empty($charset)) 1248 | { 1249 | // Use this in case mb_detect_charset isn't installed/loaded on this machine. 1250 | $charset = false; 1251 | if (function_exists('mb_detect_encoding')) 1252 | { 1253 | // Have php try to detect the encoding from the text given to us. 1254 | $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1255 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);} 1256 | } 1257 | 1258 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1259 | if ($charset === false) 1260 | { 1261 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');} 1262 | $charset = 'UTF-8'; 1263 | } 1264 | } 1265 | 1266 | // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1267 | if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1268 | { 1269 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} 1270 | $charset = 'CP1252'; 1271 | } 1272 | 1273 | if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);} 1274 | 1275 | return $this->_charset = $charset; 1276 | } 1277 | 1278 | // read tag info 1279 | protected function read_tag() 1280 | { 1281 | if ($this->char!=='<') 1282 | { 1283 | $this->root->_[HDOM_INFO_END] = $this->cursor; 1284 | return false; 1285 | } 1286 | $begin_tag_pos = $this->pos; 1287 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1288 | 1289 | // end tag 1290 | if ($this->char==='/') 1291 | { 1292 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1293 | // This represents the change in the simple_html_dom trunk from revision 180 to 181. 1294 | // $this->skip($this->token_blank_t); 1295 | $this->skip($this->token_blank); 1296 | $tag = $this->copy_until_char('>'); 1297 | 1298 | // skip attributes in end tag 1299 | if (($pos = strpos($tag, ' '))!==false) 1300 | $tag = substr($tag, 0, $pos); 1301 | 1302 | $parent_lower = strtolower($this->parent->tag); 1303 | $tag_lower = strtolower($tag); 1304 | 1305 | if ($parent_lower!==$tag_lower) 1306 | { 1307 | if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) 1308 | { 1309 | $this->parent->_[HDOM_INFO_END] = 0; 1310 | $org_parent = $this->parent; 1311 | 1312 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 1313 | $this->parent = $this->parent->parent; 1314 | 1315 | if (strtolower($this->parent->tag)!==$tag_lower) { 1316 | $this->parent = $org_parent; // restore origonal parent 1317 | if ($this->parent->parent) $this->parent = $this->parent->parent; 1318 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1319 | return $this->as_text_node($tag); 1320 | } 1321 | } 1322 | else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) 1323 | { 1324 | $this->parent->_[HDOM_INFO_END] = 0; 1325 | $org_parent = $this->parent; 1326 | 1327 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 1328 | $this->parent = $this->parent->parent; 1329 | 1330 | if (strtolower($this->parent->tag)!==$tag_lower) 1331 | { 1332 | $this->parent = $org_parent; // restore origonal parent 1333 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1334 | return $this->as_text_node($tag); 1335 | } 1336 | } 1337 | else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) 1338 | { 1339 | $this->parent->_[HDOM_INFO_END] = 0; 1340 | $this->parent = $this->parent->parent; 1341 | } 1342 | else 1343 | return $this->as_text_node($tag); 1344 | } 1345 | 1346 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1347 | if ($this->parent->parent) $this->parent = $this->parent->parent; 1348 | 1349 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1350 | return true; 1351 | } 1352 | 1353 | $node = new simple_html_dom_node($this); 1354 | $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1355 | ++$this->cursor; 1356 | $tag = $this->copy_until($this->token_slash); 1357 | $node->tag_start = $begin_tag_pos; 1358 | 1359 | // doctype, cdata & comments... 1360 | if (isset($tag[0]) && $tag[0]==='!') { 1361 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1362 | 1363 | if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { 1364 | $node->nodetype = HDOM_TYPE_COMMENT; 1365 | $node->tag = 'comment'; 1366 | } else { 1367 | $node->nodetype = HDOM_TYPE_UNKNOWN; 1368 | $node->tag = 'unknown'; 1369 | } 1370 | if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1371 | $this->link_nodes($node, true); 1372 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1373 | return true; 1374 | } 1375 | 1376 | // text 1377 | if ($pos=strpos($tag, '<')!==false) { 1378 | $tag = '<' . substr($tag, 0, -1); 1379 | $node->_[HDOM_INFO_TEXT] = $tag; 1380 | $this->link_nodes($node, false); 1381 | $this->char = $this->doc[--$this->pos]; // prev 1382 | return true; 1383 | } 1384 | 1385 | if (!preg_match("/^[\w-:]+$/", $tag)) { 1386 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1387 | if ($this->char==='<') { 1388 | $this->link_nodes($node, false); 1389 | return true; 1390 | } 1391 | 1392 | if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1393 | $this->link_nodes($node, false); 1394 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1395 | return true; 1396 | } 1397 | 1398 | // begin tag 1399 | $node->nodetype = HDOM_TYPE_ELEMENT; 1400 | $tag_lower = strtolower($tag); 1401 | $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1402 | 1403 | // handle optional closing tags 1404 | if (isset($this->optional_closing_tags[$tag_lower]) ) 1405 | { 1406 | while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) 1407 | { 1408 | $this->parent->_[HDOM_INFO_END] = 0; 1409 | $this->parent = $this->parent->parent; 1410 | } 1411 | $node->parent = $this->parent; 1412 | } 1413 | 1414 | $guard = 0; // prevent infinity loop 1415 | $space = array($this->copy_skip($this->token_blank), '', ''); 1416 | 1417 | // attributes 1418 | do 1419 | { 1420 | if ($this->char!==null && $space[0]==='') 1421 | { 1422 | break; 1423 | } 1424 | $name = $this->copy_until($this->token_equal); 1425 | if ($guard===$this->pos) 1426 | { 1427 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1428 | continue; 1429 | } 1430 | $guard = $this->pos; 1431 | 1432 | // handle endless '<' 1433 | if ($this->pos>=$this->size-1 && $this->char!=='>') { 1434 | $node->nodetype = HDOM_TYPE_TEXT; 1435 | $node->_[HDOM_INFO_END] = 0; 1436 | $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name; 1437 | $node->tag = 'text'; 1438 | $this->link_nodes($node, false); 1439 | return true; 1440 | } 1441 | 1442 | // handle mismatch '<' 1443 | if ($this->doc[$this->pos-1]=='<') { 1444 | $node->nodetype = HDOM_TYPE_TEXT; 1445 | $node->tag = 'text'; 1446 | $node->attr = array(); 1447 | $node->_[HDOM_INFO_END] = 0; 1448 | $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1); 1449 | $this->pos -= 2; 1450 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1451 | $this->link_nodes($node, false); 1452 | return true; 1453 | } 1454 | 1455 | if ($name!=='/' && $name!=='') { 1456 | $space[1] = $this->copy_skip($this->token_blank); 1457 | $name = $this->restore_noise($name); 1458 | if ($this->lowercase) $name = strtolower($name); 1459 | if ($this->char==='=') { 1460 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1461 | $this->parse_attr($node, $name, $space); 1462 | } 1463 | else { 1464 | //no value attr: nowrap, checked selected... 1465 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1466 | $node->attr[$name] = true; 1467 | if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev 1468 | } 1469 | $node->_[HDOM_INFO_SPACE][] = $space; 1470 | $space = array($this->copy_skip($this->token_blank), '', ''); 1471 | } 1472 | else 1473 | break; 1474 | } while ($this->char!=='>' && $this->char!=='/'); 1475 | 1476 | $this->link_nodes($node, true); 1477 | $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 1478 | 1479 | // check self closing 1480 | if ($this->copy_until_char_escape('>')==='/') 1481 | { 1482 | $node->_[HDOM_INFO_ENDSPACE] .= '/'; 1483 | $node->_[HDOM_INFO_END] = 0; 1484 | } 1485 | else 1486 | { 1487 | // reset parent 1488 | if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node; 1489 | } 1490 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1491 | 1492 | // If it's a BR tag, we need to set it's text to the default text. 1493 | // This way when we see it in plaintext, we can generate formatting that the user wants. 1494 | // since a br tag never has sub nodes, this works well. 1495 | if ($node->tag == "br") 1496 | { 1497 | $node->_[HDOM_INFO_INNER] = $this->default_br_text; 1498 | } 1499 | 1500 | return true; 1501 | } 1502 | 1503 | // parse attributes 1504 | protected function parse_attr($node, $name, &$space) 1505 | { 1506 | // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 1507 | // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one. 1508 | if (isset($node->attr[$name])) 1509 | { 1510 | return; 1511 | } 1512 | 1513 | $space[2] = $this->copy_skip($this->token_blank); 1514 | switch ($this->char) { 1515 | case '"': 1516 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 1517 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1518 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"')); 1519 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1520 | break; 1521 | case '\'': 1522 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; 1523 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1524 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\'')); 1525 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1526 | break; 1527 | default: 1528 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1529 | $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); 1530 | } 1531 | // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace. 1532 | $node->attr[$name] = str_replace("\r", "", $node->attr[$name]); 1533 | $node->attr[$name] = str_replace("\n", "", $node->attr[$name]); 1534 | // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case. 1535 | if ($name == "class") { 1536 | $node->attr[$name] = trim($node->attr[$name]); 1537 | } 1538 | } 1539 | 1540 | // link node's parent 1541 | protected function link_nodes(&$node, $is_child) 1542 | { 1543 | $node->parent = $this->parent; 1544 | $this->parent->nodes[] = $node; 1545 | if ($is_child) 1546 | { 1547 | $this->parent->children[] = $node; 1548 | } 1549 | } 1550 | 1551 | // as a text node 1552 | protected function as_text_node($tag) 1553 | { 1554 | $node = new simple_html_dom_node($this); 1555 | ++$this->cursor; 1556 | $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 1557 | $this->link_nodes($node, false); 1558 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1559 | return true; 1560 | } 1561 | 1562 | protected function skip($chars) 1563 | { 1564 | $this->pos += strspn($this->doc, $chars, $this->pos); 1565 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1566 | } 1567 | 1568 | protected function copy_skip($chars) 1569 | { 1570 | $pos = $this->pos; 1571 | $len = strspn($this->doc, $chars, $pos); 1572 | $this->pos += $len; 1573 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1574 | if ($len===0) return ''; 1575 | return substr($this->doc, $pos, $len); 1576 | } 1577 | 1578 | protected function copy_until($chars) 1579 | { 1580 | $pos = $this->pos; 1581 | $len = strcspn($this->doc, $chars, $pos); 1582 | $this->pos += $len; 1583 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1584 | return substr($this->doc, $pos, $len); 1585 | } 1586 | 1587 | protected function copy_until_char($char) 1588 | { 1589 | if ($this->char===null) return ''; 1590 | 1591 | if (($pos = strpos($this->doc, $char, $this->pos))===false) { 1592 | $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1593 | $this->char = null; 1594 | $this->pos = $this->size; 1595 | return $ret; 1596 | } 1597 | 1598 | if ($pos===$this->pos) return ''; 1599 | $pos_old = $this->pos; 1600 | $this->char = $this->doc[$pos]; 1601 | $this->pos = $pos; 1602 | return substr($this->doc, $pos_old, $pos-$pos_old); 1603 | } 1604 | 1605 | protected function copy_until_char_escape($char) 1606 | { 1607 | if ($this->char===null) return ''; 1608 | 1609 | $start = $this->pos; 1610 | while (1) 1611 | { 1612 | if (($pos = strpos($this->doc, $char, $start))===false) 1613 | { 1614 | $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1615 | $this->char = null; 1616 | $this->pos = $this->size; 1617 | return $ret; 1618 | } 1619 | 1620 | if ($pos===$this->pos) return ''; 1621 | 1622 | if ($this->doc[$pos-1]==='\\') { 1623 | $start = $pos+1; 1624 | continue; 1625 | } 1626 | 1627 | $pos_old = $this->pos; 1628 | $this->char = $this->doc[$pos]; 1629 | $this->pos = $pos; 1630 | return substr($this->doc, $pos_old, $pos-$pos_old); 1631 | } 1632 | } 1633 | 1634 | // remove noise from html content 1635 | // save the noise in the $this->noise array. 1636 | protected function remove_noise($pattern, $remove_tag=false) 1637 | { 1638 | global $debug_object; 1639 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1640 | 1641 | $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1642 | 1643 | for ($i=$count-1; $i>-1; --$i) 1644 | { 1645 | $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1646 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); } 1647 | $idx = ($remove_tag) ? 0 : 1; 1648 | $this->noise[$key] = $matches[$i][$idx][0]; 1649 | $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1650 | } 1651 | 1652 | // reset the length of content 1653 | $this->size = strlen($this->doc); 1654 | if ($this->size>0) 1655 | { 1656 | $this->char = $this->doc[0]; 1657 | } 1658 | } 1659 | 1660 | // restore noise to html content 1661 | function restore_noise($text) 1662 | { 1663 | global $debug_object; 1664 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1665 | 1666 | while (($pos=strpos($text, '___noise___'))!==false) 1667 | { 1668 | // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us... 1669 | if (strlen($text) > $pos+15) 1670 | { 1671 | $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1672 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); } 1673 | 1674 | if (isset($this->noise[$key])) 1675 | { 1676 | $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16); 1677 | } 1678 | else 1679 | { 1680 | // do this to prevent an infinite loop. 1681 | $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16); 1682 | } 1683 | } 1684 | else 1685 | { 1686 | // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem. 1687 | $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11); 1688 | } 1689 | } 1690 | return $text; 1691 | } 1692 | 1693 | // Sometimes we NEED one of the noise elements. 1694 | function search_noise($text) 1695 | { 1696 | global $debug_object; 1697 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1698 | 1699 | foreach($this->noise as $noiseElement) 1700 | { 1701 | if (strpos($noiseElement, $text)!==false) 1702 | { 1703 | return $noiseElement; 1704 | } 1705 | } 1706 | } 1707 | function __toString() 1708 | { 1709 | return $this->root->innertext(); 1710 | } 1711 | 1712 | function __get($name) 1713 | { 1714 | switch ($name) 1715 | { 1716 | case 'outertext': 1717 | return $this->root->innertext(); 1718 | case 'innertext': 1719 | return $this->root->innertext(); 1720 | case 'plaintext': 1721 | return $this->root->text(); 1722 | case 'charset': 1723 | return $this->_charset; 1724 | case 'target_charset': 1725 | return $this->_target_charset; 1726 | } 1727 | } 1728 | 1729 | // camel naming conventions 1730 | function childNodes($idx=-1) {return $this->root->childNodes($idx);} 1731 | function firstChild() {return $this->root->first_child();} 1732 | function lastChild() {return $this->root->last_child();} 1733 | function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();} 1734 | function createTextNode($value) {return @end(str_get_html($value)->nodes);} 1735 | function getElementById($id) {return $this->find("#$id", 0);} 1736 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 1737 | function getElementByTagName($name) {return $this->find($name, 0);} 1738 | function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);} 1739 | function loadFile() {$args = func_get_args();$this->load_file($args);} 1740 | } 1741 | 1742 | ?> -------------------------------------------------------------------------------- /Phpfetcher/Dom/simple_html_dom.php: -------------------------------------------------------------------------------- 1 | <?php 2 | /** 3 | * Website: http://sourceforge.net/projects/simplehtmldom/ 4 | * Additional projects that may be used: http://sourceforge.net/projects/debugobject/ 5 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) 6 | * Contributions by: 7 | * Yousuke Kumakura (Attribute filters) 8 | * Vadim Voituk (Negative indexes supports of "find" method) 9 | * Antcs (Constructor with automatically load contents either text or file/url) 10 | * 11 | * all affected sections have comments starting with "PaperG" 12 | * 13 | * Paperg - Added case insensitive testing of the value of the selector. 14 | * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately. 15 | * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source, 16 | * it will almost always be smaller by some amount. 17 | * We use this to determine how far into the file the tag in question is. This "percentage will never be accurate as the $dom->size is the "real" number of bytes the dom was created from. 18 | * but for most purposes, it's a really good estimation. 19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. 20 | * Allow the user to tell us how much they trust the html. 21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. 22 | * This allows for us to find tags based on the text they contain. 23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. 24 | * Paperg: added parse_charset so that we know about the character set of the source document. 25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the 26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. 27 | * 28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. 29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. 30 | * 31 | * Licensed under The MIT License 32 | * Redistributions of files must retain the above copyright notice. 33 | * 34 | * @author S.C. Chen <me578022@gmail.com> 35 | * @author John Schlick 36 | * @author Rus Carroll 37 | * @version 1.5 ($Rev: 210 $) 38 | * @package PlaceLocalInclude 39 | * @subpackage simple_html_dom 40 | */ 41 | 42 | /** 43 | * All of the Defines for the classes below. 44 | * @author S.C. Chen <me578022@gmail.com> 45 | */ 46 | define('HDOM_TYPE_ELEMENT', 1); 47 | define('HDOM_TYPE_COMMENT', 2); 48 | define('HDOM_TYPE_TEXT', 3); 49 | define('HDOM_TYPE_ENDTAG', 4); 50 | define('HDOM_TYPE_ROOT', 5); 51 | define('HDOM_TYPE_UNKNOWN', 6); 52 | define('HDOM_QUOTE_DOUBLE', 0); 53 | define('HDOM_QUOTE_SINGLE', 1); 54 | define('HDOM_QUOTE_NO', 3); 55 | define('HDOM_INFO_BEGIN', 0); 56 | define('HDOM_INFO_END', 1); 57 | define('HDOM_INFO_QUOTE', 2); 58 | define('HDOM_INFO_SPACE', 3); 59 | define('HDOM_INFO_TEXT', 4); 60 | define('HDOM_INFO_INNER', 5); 61 | define('HDOM_INFO_OUTER', 6); 62 | define('HDOM_INFO_ENDSPACE',7); 63 | define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 64 | define('DEFAULT_BR_TEXT', "\r\n"); 65 | define('DEFAULT_SPAN_TEXT', " "); 66 | define('MAX_FILE_SIZE', 100000000); //100M 67 | // helper functions 68 | // ----------------------------------------------------------------------------- 69 | // get html dom from file 70 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. 71 | function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 72 | { 73 | // We DO force the tags to be terminated. 74 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 75 | // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done. 76 | $contents = file_get_contents($url, $use_include_path, $context, $offset); 77 | // Paperg - use our own mechanism for getting the contents as we want to control the timeout. 78 | //$contents = retrieve_url_contents($url); 79 | if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) 80 | { 81 | return false; 82 | } 83 | // The second parameter can force the selectors to all be lowercase. 84 | $dom->load($contents, $lowercase, $stripRN); 85 | return $dom; 86 | } 87 | 88 | // get html dom from string 89 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 90 | { 91 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 92 | if (empty($str) || strlen($str) > MAX_FILE_SIZE) 93 | { 94 | $dom->clear(); 95 | return false; 96 | } 97 | $dom->load($str, $lowercase, $stripRN); 98 | return $dom; 99 | } 100 | 101 | // dump html dom tree 102 | function dump_html_tree($node, $show_attr=true, $deep=0) 103 | { 104 | $node->dump($node); 105 | } 106 | 107 | 108 | /** 109 | * simple html dom node 110 | * PaperG - added ability for "find" routine to lowercase the value of the selector. 111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index 112 | * 113 | * @package PlaceLocalInclude 114 | */ 115 | class simple_html_dom_node 116 | { 117 | public $nodetype = HDOM_TYPE_TEXT; 118 | public $tag = 'text'; 119 | public $attr = array(); 120 | public $children = array(); 121 | public $nodes = array(); 122 | public $parent = null; 123 | // The "info" array - see HDOM_INFO_... for what each element contains. 124 | public $_ = array(); 125 | public $tag_start = 0; 126 | private $dom = null; 127 | 128 | function __construct($dom) 129 | { 130 | $this->dom = $dom; 131 | $dom->nodes[] = $this; 132 | } 133 | 134 | function __destruct() 135 | { 136 | $this->clear(); 137 | } 138 | 139 | function __toString() 140 | { 141 | return $this->outertext(); 142 | } 143 | 144 | // clean up memory due to php5 circular references memory leak... 145 | function clear() 146 | { 147 | $this->dom = null; 148 | $this->nodes = null; 149 | $this->parent = null; 150 | $this->children = null; 151 | } 152 | 153 | // dump node's tree 154 | function dump($show_attr=true, $deep=0) 155 | { 156 | $lead = str_repeat(' ', $deep); 157 | 158 | echo $lead.$this->tag; 159 | if ($show_attr && count($this->attr)>0) 160 | { 161 | echo '('; 162 | foreach ($this->attr as $k=>$v) 163 | echo "[$k]=>\"".$this->$k.'", '; 164 | echo ')'; 165 | } 166 | echo "\n"; 167 | 168 | if ($this->nodes) 169 | { 170 | foreach ($this->nodes as $c) 171 | { 172 | $c->dump($show_attr, $deep+1); 173 | } 174 | } 175 | } 176 | 177 | 178 | // Debugging function to dump a single dom node with a bunch of information about it. 179 | function dump_node($echo=true) 180 | { 181 | 182 | $string = $this->tag; 183 | if (count($this->attr)>0) 184 | { 185 | $string .= '('; 186 | foreach ($this->attr as $k=>$v) 187 | { 188 | $string .= "[$k]=>\"".$this->$k.'", '; 189 | } 190 | $string .= ')'; 191 | } 192 | if (count($this->_)>0) 193 | { 194 | $string .= ' $_ ('; 195 | foreach ($this->_ as $k=>$v) 196 | { 197 | if (is_array($v)) 198 | { 199 | $string .= "[$k]=>("; 200 | foreach ($v as $k2=>$v2) 201 | { 202 | $string .= "[$k2]=>\"".$v2.'", '; 203 | } 204 | $string .= ")"; 205 | } else { 206 | $string .= "[$k]=>\"".$v.'", '; 207 | } 208 | } 209 | $string .= ")"; 210 | } 211 | 212 | if (isset($this->text)) 213 | { 214 | $string .= " text: (" . $this->text . ")"; 215 | } 216 | 217 | $string .= " HDOM_INNER_INFO: '"; 218 | if (isset($node->_[HDOM_INFO_INNER])) 219 | { 220 | $string .= $node->_[HDOM_INFO_INNER] . "'"; 221 | } 222 | else 223 | { 224 | $string .= ' NULL '; 225 | } 226 | 227 | $string .= " children: " . count($this->children); 228 | $string .= " nodes: " . count($this->nodes); 229 | $string .= " tag_start: " . $this->tag_start; 230 | $string .= "\n"; 231 | 232 | if ($echo) 233 | { 234 | echo $string; 235 | return; 236 | } 237 | else 238 | { 239 | return $string; 240 | } 241 | } 242 | 243 | // returns the parent of node 244 | // If a node is passed in, it will reset the parent of the current node to that one. 245 | function parent($parent=null) 246 | { 247 | // I am SURE that this doesn't work properly. 248 | // It fails to unset the current node from it's current parents nodes or children list first. 249 | if ($parent !== null) 250 | { 251 | $this->parent = $parent; 252 | $this->parent->nodes[] = $this; 253 | $this->parent->children[] = $this; 254 | } 255 | 256 | return $this->parent; 257 | } 258 | 259 | // verify that node has children 260 | function has_child() 261 | { 262 | return !empty($this->children); 263 | } 264 | 265 | // returns children of node 266 | function children($idx=-1) 267 | { 268 | if ($idx===-1) 269 | { 270 | return $this->children; 271 | } 272 | if (isset($this->children[$idx])) 273 | { 274 | return $this->children[$idx]; 275 | } 276 | return null; 277 | } 278 | 279 | // returns the first child of node 280 | function first_child() 281 | { 282 | if (count($this->children)>0) 283 | { 284 | return $this->children[0]; 285 | } 286 | return null; 287 | } 288 | 289 | // returns the last child of node 290 | function last_child() 291 | { 292 | if (($count=count($this->children))>0) 293 | { 294 | return $this->children[$count-1]; 295 | } 296 | return null; 297 | } 298 | 299 | // returns the next sibling of node 300 | function next_sibling() 301 | { 302 | if ($this->parent===null) 303 | { 304 | return null; 305 | } 306 | 307 | $idx = 0; 308 | $count = count($this->parent->children); 309 | while ($idx<$count && $this!==$this->parent->children[$idx]) 310 | { 311 | ++$idx; 312 | } 313 | if (++$idx>=$count) 314 | { 315 | return null; 316 | } 317 | return $this->parent->children[$idx]; 318 | } 319 | 320 | // returns the previous sibling of node 321 | function prev_sibling() 322 | { 323 | if ($this->parent===null) return null; 324 | $idx = 0; 325 | $count = count($this->parent->children); 326 | while ($idx<$count && $this!==$this->parent->children[$idx]) 327 | ++$idx; 328 | if (--$idx<0) return null; 329 | return $this->parent->children[$idx]; 330 | } 331 | 332 | // function to locate a specific ancestor tag in the path to the root. 333 | function find_ancestor_tag($tag) 334 | { 335 | global $debug_object; 336 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 337 | 338 | // Start by including ourselves in the comparison. 339 | $returnDom = $this; 340 | 341 | while (!is_null($returnDom)) 342 | { 343 | if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); } 344 | 345 | if ($returnDom->tag == $tag) 346 | { 347 | break; 348 | } 349 | $returnDom = $returnDom->parent; 350 | } 351 | return $returnDom; 352 | } 353 | 354 | // get dom node's inner html 355 | function innertext() 356 | { 357 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 358 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 359 | 360 | $ret = ''; 361 | foreach ($this->nodes as $n) 362 | $ret .= $n->outertext(); 363 | return $ret; 364 | } 365 | 366 | // get dom node's outer text (with tag) 367 | function outertext() 368 | { 369 | global $debug_object; 370 | if (is_object($debug_object)) 371 | { 372 | $text = ''; 373 | if ($this->tag == 'text') 374 | { 375 | if (!empty($this->text)) 376 | { 377 | $text = " with text: " . $this->text; 378 | } 379 | } 380 | $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); 381 | } 382 | 383 | if ($this->tag==='root') return $this->innertext(); 384 | 385 | // trigger callback 386 | if ($this->dom && $this->dom->callback!==null) 387 | { 388 | call_user_func_array($this->dom->callback, array($this)); 389 | } 390 | 391 | if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; 392 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 393 | 394 | // render begin tag 395 | if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) 396 | { 397 | $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 398 | } else { 399 | $ret = ""; 400 | } 401 | 402 | // render inner text 403 | if (isset($this->_[HDOM_INFO_INNER])) 404 | { 405 | // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added. 406 | if ($this->tag != "br") 407 | { 408 | $ret .= $this->_[HDOM_INFO_INNER]; 409 | } 410 | } else { 411 | if ($this->nodes) 412 | { 413 | foreach ($this->nodes as $n) 414 | { 415 | $ret .= $this->convert_text($n->outertext()); 416 | } 417 | } 418 | } 419 | 420 | // render end tag 421 | if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) 422 | $ret .= '</'.$this->tag.'>'; 423 | return $ret; 424 | } 425 | 426 | // get dom node's plain text 427 | function text() 428 | { 429 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 430 | switch ($this->nodetype) 431 | { 432 | case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 433 | case HDOM_TYPE_COMMENT: return ''; 434 | case HDOM_TYPE_UNKNOWN: return ''; 435 | } 436 | if (strcasecmp($this->tag, 'script')===0) return ''; 437 | if (strcasecmp($this->tag, 'style')===0) return ''; 438 | 439 | $ret = ''; 440 | // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL. 441 | // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening. 442 | // WHY is this happening? 443 | if (!is_null($this->nodes)) 444 | { 445 | foreach ($this->nodes as $n) 446 | { 447 | $ret .= $this->convert_text($n->text()); 448 | } 449 | 450 | // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. 451 | if ($this->tag == "span") 452 | { 453 | $ret .= $this->dom->default_span_text; 454 | } 455 | 456 | 457 | } 458 | return $ret; 459 | } 460 | 461 | function xmltext() 462 | { 463 | $ret = $this->innertext(); 464 | $ret = str_ireplace('<![CDATA[', '', $ret); 465 | $ret = str_replace(']]>', '', $ret); 466 | return $ret; 467 | } 468 | 469 | // build node's text with tag 470 | function makeup() 471 | { 472 | // text, comment, unknown 473 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 474 | 475 | $ret = '<'.$this->tag; 476 | $i = -1; 477 | 478 | foreach ($this->attr as $key=>$val) 479 | { 480 | ++$i; 481 | 482 | // skip removed attribute 483 | if ($val===null || $val===false) 484 | continue; 485 | 486 | $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 487 | //no value attr: nowrap, checked selected... 488 | if ($val===true) 489 | $ret .= $key; 490 | else { 491 | switch ($this->_[HDOM_INFO_QUOTE][$i]) 492 | { 493 | case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 494 | case HDOM_QUOTE_SINGLE: $quote = '\''; break; 495 | default: $quote = ''; 496 | } 497 | $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; 498 | } 499 | } 500 | $ret = $this->dom->restore_noise($ret); 501 | return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 502 | } 503 | 504 | // find elements by css selector 505 | //PaperG - added ability for find to lowercase the value of the selector. 506 | function find($selector, $idx=null, $lowercase=false) 507 | { 508 | $selectors = $this->parse_selector($selector); 509 | if (($count=count($selectors))===0) return array(); 510 | $found_keys = array(); 511 | 512 | // find each selector 513 | for ($c=0; $c<$count; ++$c) 514 | { 515 | // The change on the below line was documented on the sourceforge code tracker id 2788009 516 | // used to be: if (($levle=count($selectors[0]))===0) return array(); 517 | if (($levle=count($selectors[$c]))===0) return array(); 518 | if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); 519 | 520 | $head = array($this->_[HDOM_INFO_BEGIN]=>1); 521 | 522 | // handle descendant selectors, no recursive! 523 | for ($l=0; $l<$levle; ++$l) 524 | { 525 | $ret = array(); 526 | foreach ($head as $k=>$v) 527 | { 528 | $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; 529 | //PaperG - Pass this optional parameter on to the seek function. 530 | $n->seek($selectors[$c][$l], $ret, $lowercase); 531 | } 532 | $head = $ret; 533 | } 534 | 535 | foreach ($head as $k=>$v) 536 | { 537 | if (!isset($found_keys[$k])) 538 | { 539 | $found_keys[$k] = 1; 540 | } 541 | } 542 | } 543 | 544 | // sort keys 545 | ksort($found_keys); 546 | 547 | $found = array(); 548 | foreach ($found_keys as $k=>$v) 549 | $found[] = $this->dom->nodes[$k]; 550 | 551 | // return nth-element or array 552 | if (is_null($idx)) return $found; 553 | else if ($idx<0) $idx = count($found) + $idx; 554 | return (isset($found[$idx])) ? $found[$idx] : null; 555 | } 556 | 557 | // seek for given conditions 558 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector. 559 | protected function seek($selector, &$ret, $lowercase=false) 560 | { 561 | global $debug_object; 562 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 563 | 564 | list($tag, $key, $val, $exp, $no_key) = $selector; 565 | 566 | // xpath index 567 | if ($tag && $key && is_numeric($key)) 568 | { 569 | $count = 0; 570 | foreach ($this->children as $c) 571 | { 572 | if ($tag==='*' || $tag===$c->tag) { 573 | if (++$count==$key) { 574 | $ret[$c->_[HDOM_INFO_BEGIN]] = 1; 575 | return; 576 | } 577 | } 578 | } 579 | return; 580 | } 581 | 582 | $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 583 | if ($end==0) { 584 | $parent = $this->parent; 585 | while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { 586 | $end -= 1; 587 | $parent = $parent->parent; 588 | } 589 | $end += $parent->_[HDOM_INFO_END]; 590 | } 591 | 592 | for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { 593 | $node = $this->dom->nodes[$i]; 594 | 595 | $pass = true; 596 | 597 | if ($tag==='*' && !$key) { 598 | if (in_array($node, $this->children, true)) 599 | $ret[$i] = 1; 600 | continue; 601 | } 602 | 603 | // compare tag 604 | if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} 605 | // compare key 606 | if ($pass && $key) { 607 | if ($no_key) { 608 | if (isset($node->attr[$key])) $pass=false; 609 | } else { 610 | if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false; 611 | } 612 | } 613 | // compare value 614 | if ($pass && $key && $val && $val!=='*') { 615 | // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right? 616 | if ($key == "plaintext") { 617 | // $node->plaintext actually returns $node->text(); 618 | $nodeKeyValue = $node->text(); 619 | } else { 620 | // this is a normal search, we want the value of that attribute of the tag. 621 | $nodeKeyValue = $node->attr[$key]; 622 | } 623 | if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 624 | 625 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 626 | if ($lowercase) { 627 | $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue)); 628 | } else { 629 | $check = $this->match($exp, $val, $nodeKeyValue); 630 | } 631 | if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));} 632 | 633 | // handle multiple class 634 | if (!$check && strcasecmp($key, 'class')===0) { 635 | foreach (explode(' ',$node->attr[$key]) as $k) { 636 | // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form. 637 | if (!empty($k)) { 638 | if ($lowercase) { 639 | $check = $this->match($exp, strtolower($val), strtolower($k)); 640 | } else { 641 | $check = $this->match($exp, $val, $k); 642 | } 643 | if ($check) break; 644 | } 645 | } 646 | } 647 | if (!$check) $pass = false; 648 | } 649 | if ($pass) $ret[$i] = 1; 650 | unset($node); 651 | } 652 | // It's passed by reference so this is actually what this function returns. 653 | if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);} 654 | } 655 | 656 | protected function match($exp, $pattern, $value) { 657 | global $debug_object; 658 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 659 | 660 | switch ($exp) { 661 | case '=': 662 | return ($value===$pattern); 663 | case '!=': 664 | return ($value!==$pattern); 665 | case '^=': 666 | return preg_match("/^".preg_quote($pattern,'/')."/", $value); 667 | case '$=': 668 | return preg_match("/".preg_quote($pattern,'/')."$/", $value); 669 | case '*=': 670 | if ($pattern[0]=='/') { 671 | return preg_match($pattern, $value); 672 | } 673 | return preg_match("/".$pattern."/i", $value); 674 | } 675 | return false; 676 | } 677 | 678 | protected function parse_selector($selector_string) { 679 | global $debug_object; 680 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 681 | 682 | // pattern of CSS selectors, modified from mootools 683 | // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. 684 | // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check. 685 | // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured. 686 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. 687 | // farther study is required to determine of this should be documented or removed. 688 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 689 | $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 690 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 691 | if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} 692 | 693 | $selectors = array(); 694 | $result = array(); 695 | //print_r($matches); 696 | 697 | foreach ($matches as $m) { 698 | $m[0] = trim($m[0]); 699 | if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; 700 | // for browser generated xpath 701 | if ($m[1]==='tbody') continue; 702 | 703 | list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); 704 | if (!empty($m[2])) {$key='id'; $val=$m[2];} 705 | if (!empty($m[3])) {$key='class'; $val=$m[3];} 706 | if (!empty($m[4])) {$key=$m[4];} 707 | if (!empty($m[5])) {$exp=$m[5];} 708 | if (!empty($m[6])) {$val=$m[6];} 709 | 710 | // convert to lowercase 711 | if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} 712 | //elements that do NOT have the specified attribute 713 | if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} 714 | 715 | $result[] = array($tag, $key, $val, $exp, $no_key); 716 | if (trim($m[7])===',') { 717 | $selectors[] = $result; 718 | $result = array(); 719 | } 720 | } 721 | if (count($result)>0) 722 | $selectors[] = $result; 723 | return $selectors; 724 | } 725 | 726 | function __get($name) 727 | { 728 | if (isset($this->attr[$name])) 729 | { 730 | return $this->convert_text($this->attr[$name]); 731 | } 732 | switch ($name) 733 | { 734 | case 'outertext': return $this->outertext(); 735 | case 'innertext': return $this->innertext(); 736 | case 'plaintext': return $this->text(); 737 | case 'xmltext': return $this->xmltext(); 738 | default: return array_key_exists($name, $this->attr); 739 | } 740 | } 741 | 742 | function __set($name, $value) 743 | { 744 | global $debug_object; 745 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 746 | 747 | switch ($name) 748 | { 749 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 750 | case 'innertext': 751 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 752 | return $this->_[HDOM_INFO_INNER] = $value; 753 | } 754 | if (!isset($this->attr[$name])) 755 | { 756 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 757 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 758 | } 759 | $this->attr[$name] = $value; 760 | } 761 | 762 | function __isset($name) 763 | { 764 | switch ($name) 765 | { 766 | case 'outertext': return true; 767 | case 'innertext': return true; 768 | case 'plaintext': return true; 769 | } 770 | //no value attr: nowrap, checked selected... 771 | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 772 | } 773 | 774 | function __unset($name) { 775 | if (isset($this->attr[$name])) 776 | unset($this->attr[$name]); 777 | } 778 | 779 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same. 780 | function convert_text($text) 781 | { 782 | global $debug_object; 783 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} 784 | 785 | $converted_text = $text; 786 | 787 | $sourceCharset = ""; 788 | $targetCharset = ""; 789 | 790 | if ($this->dom) 791 | { 792 | $sourceCharset = strtoupper($this->dom->_charset); 793 | $targetCharset = strtoupper($this->dom->_target_charset); 794 | } 795 | if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 796 | 797 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 798 | { 799 | // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 800 | if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) 801 | { 802 | $converted_text = $text; 803 | } 804 | else 805 | { 806 | $converted_text = iconv($sourceCharset, $targetCharset, $text); 807 | } 808 | } 809 | 810 | // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 811 | if ($targetCharset == 'UTF-8') 812 | { 813 | if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") 814 | { 815 | $converted_text = substr($converted_text, 3); 816 | } 817 | if (substr($converted_text, -3) == "\xef\xbb\xbf") 818 | { 819 | $converted_text = substr($converted_text, 0, -3); 820 | } 821 | } 822 | 823 | return $converted_text; 824 | } 825 | 826 | /** 827 | * Returns true if $string is valid UTF-8 and false otherwise. 828 | * 829 | * @param mixed $str String to be tested 830 | * @return boolean 831 | */ 832 | static function is_utf8($str) 833 | { 834 | $c=0; $b=0; 835 | $bits=0; 836 | $len=strlen($str); 837 | for($i=0; $i<$len; $i++) 838 | { 839 | $c=ord($str[$i]); 840 | if($c > 128) 841 | { 842 | if(($c >= 254)) return false; 843 | elseif($c >= 252) $bits=6; 844 | elseif($c >= 248) $bits=5; 845 | elseif($c >= 240) $bits=4; 846 | elseif($c >= 224) $bits=3; 847 | elseif($c >= 192) $bits=2; 848 | else return false; 849 | if(($i+$bits) > $len) return false; 850 | while($bits > 1) 851 | { 852 | $i++; 853 | $b=ord($str[$i]); 854 | if($b < 128 || $b > 191) return false; 855 | $bits--; 856 | } 857 | } 858 | } 859 | return true; 860 | } 861 | /* 862 | function is_utf8($string) 863 | { 864 | //this is buggy 865 | return (utf8_encode(utf8_decode($string)) == $string); 866 | } 867 | */ 868 | 869 | /** 870 | * Function to try a few tricks to determine the displayed size of an img on the page. 871 | * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. 872 | * 873 | * @author John Schlick 874 | * @version April 19 2012 875 | * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. 876 | */ 877 | function get_display_size() 878 | { 879 | global $debug_object; 880 | 881 | $width = -1; 882 | $height = -1; 883 | 884 | if ($this->tag !== 'img') 885 | { 886 | return false; 887 | } 888 | 889 | // See if there is aheight or width attribute in the tag itself. 890 | if (isset($this->attr['width'])) 891 | { 892 | $width = $this->attr['width']; 893 | } 894 | 895 | if (isset($this->attr['height'])) 896 | { 897 | $height = $this->attr['height']; 898 | } 899 | 900 | // Now look for an inline style. 901 | if (isset($this->attr['style'])) 902 | { 903 | // Thanks to user gnarf from stackoverflow for this regular expression. 904 | $attributes = array(); 905 | preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); 906 | foreach ($matches as $match) { 907 | $attributes[$match[1]] = $match[2]; 908 | } 909 | 910 | // If there is a width in the style attributes: 911 | if (isset($attributes['width']) && $width == -1) 912 | { 913 | // check that the last two characters are px (pixels) 914 | if (strtolower(substr($attributes['width'], -2)) == 'px') 915 | { 916 | $proposed_width = substr($attributes['width'], 0, -2); 917 | // Now make sure that it's an integer and not something stupid. 918 | if (filter_var($proposed_width, FILTER_VALIDATE_INT)) 919 | { 920 | $width = $proposed_width; 921 | } 922 | } 923 | } 924 | 925 | // If there is a width in the style attributes: 926 | if (isset($attributes['height']) && $height == -1) 927 | { 928 | // check that the last two characters are px (pixels) 929 | if (strtolower(substr($attributes['height'], -2)) == 'px') 930 | { 931 | $proposed_height = substr($attributes['height'], 0, -2); 932 | // Now make sure that it's an integer and not something stupid. 933 | if (filter_var($proposed_height, FILTER_VALIDATE_INT)) 934 | { 935 | $height = $proposed_height; 936 | } 937 | } 938 | } 939 | 940 | } 941 | 942 | // Future enhancement: 943 | // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. 944 | 945 | // Far future enhancement 946 | // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width 947 | // Note that in this case, the class or id will have the img subselector for it to apply to the image. 948 | 949 | // ridiculously far future development 950 | // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. 951 | 952 | $result = array('height' => $height, 953 | 'width' => $width); 954 | return $result; 955 | } 956 | 957 | // camel naming conventions 958 | function getAllAttributes() {return $this->attr;} 959 | function getAttribute($name) {return $this->__get($name);} 960 | function setAttribute($name, $value) {$this->__set($name, $value);} 961 | function hasAttribute($name) {return $this->__isset($name);} 962 | function removeAttribute($name) {$this->__set($name, null);} 963 | function getElementById($id) {return $this->find("#$id", 0);} 964 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 965 | function getElementByTagName($name) {return $this->find($name, 0);} 966 | function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} 967 | function parentNode() {return $this->parent();} 968 | function childNodes($idx=-1) {return $this->children($idx);} 969 | function firstChild() {return $this->first_child();} 970 | function lastChild() {return $this->last_child();} 971 | function nextSibling() {return $this->next_sibling();} 972 | function previousSibling() {return $this->prev_sibling();} 973 | function hasChildNodes() {return $this->has_child();} 974 | function nodeName() {return $this->tag;} 975 | function appendChild($node) {$node->parent($this); return $node;} 976 | 977 | } 978 | 979 | /** 980 | * simple html dom parser 981 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. 982 | * Paperg - change $size from protected to public so we can easily access it 983 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. 984 | * 985 | * @package PlaceLocalInclude 986 | */ 987 | class simple_html_dom 988 | { 989 | public $root = null; 990 | public $nodes = array(); 991 | public $callback = null; 992 | public $lowercase = false; 993 | // Used to keep track of how large the text was when we started. 994 | public $original_size; 995 | public $size; 996 | protected $pos; 997 | protected $doc; 998 | protected $char; 999 | protected $cursor; 1000 | protected $parent; 1001 | protected $noise = array(); 1002 | protected $token_blank = " \t\r\n"; 1003 | protected $token_equal = ' =/>'; 1004 | protected $token_slash = " />\r\n\t"; 1005 | protected $token_attr = ' >'; 1006 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 1007 | public $_charset = ''; 1008 | public $_target_charset = ''; 1009 | protected $default_br_text = ""; 1010 | public $default_span_text = ""; 1011 | 1012 | // use isset instead of in_array, performance boost about 30%... 1013 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); 1014 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); 1015 | // Known sourceforge issue #2977341 1016 | // B tags that are not closed cause us to return everything to the end of the document. 1017 | protected $optional_closing_tags = array( 1018 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), 1019 | 'th'=>array('th'=>1), 1020 | 'td'=>array('td'=>1), 1021 | 'li'=>array('li'=>1), 1022 | 'dt'=>array('dt'=>1, 'dd'=>1), 1023 | 'dd'=>array('dd'=>1, 'dt'=>1), 1024 | 'dl'=>array('dd'=>1, 'dt'=>1), 1025 | 'p'=>array('p'=>1), 1026 | 'nobr'=>array('nobr'=>1), 1027 | 'b'=>array('b'=>1), 1028 | 'option'=>array('option'=>1), 1029 | ); 1030 | 1031 | function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1032 | { 1033 | if ($str) 1034 | { 1035 | if (preg_match("/^http:\/\//i",$str) || is_file($str)) 1036 | { 1037 | $this->load_file($str); 1038 | } 1039 | else 1040 | { 1041 | $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1042 | } 1043 | } 1044 | // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. 1045 | if (!$forceTagsClosed) { 1046 | $this->optional_closing_array=array(); 1047 | } 1048 | $this->_target_charset = $target_charset; 1049 | } 1050 | 1051 | function __destruct() 1052 | { 1053 | $this->clear(); 1054 | } 1055 | 1056 | // load html from string 1057 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1058 | { 1059 | global $debug_object; 1060 | 1061 | // prepare 1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1063 | // strip out cdata 1064 | $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1065 | // strip out comments 1066 | $this->remove_noise("'<!--(.*?)-->'is"); 1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1068 | // Script tags removal now preceeds style tag removal. 1069 | // strip out <script> tags 1070 | $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"); 1071 | $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"); 1072 | // strip out <style> tags 1073 | $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"); 1074 | $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"); 1075 | // strip out preformatted tags 1076 | $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"); 1077 | // strip out server side scripts 1078 | $this->remove_noise("'(<\?)(.*?)(\?>)'s", true); 1079 | // strip smarty scripts 1080 | $this->remove_noise("'(\{\w)(.*?)(\})'s", true); 1081 | 1082 | // parsing 1083 | while ($this->parse()); 1084 | // end 1085 | $this->root->_[HDOM_INFO_END] = $this->cursor; 1086 | $this->parse_charset(); 1087 | 1088 | // make load function chainable 1089 | return $this; 1090 | 1091 | } 1092 | 1093 | // load html from file 1094 | function load_file() 1095 | { 1096 | $args = func_get_args(); 1097 | $this->load(call_user_func_array('file_get_contents', $args), true); 1098 | // Throw an error if we can't properly load the dom. 1099 | if (($error=error_get_last())!==null) { 1100 | $this->clear(); 1101 | return false; 1102 | } 1103 | } 1104 | 1105 | // set callback function 1106 | function set_callback($function_name) 1107 | { 1108 | $this->callback = $function_name; 1109 | } 1110 | 1111 | // remove callback function 1112 | function remove_callback() 1113 | { 1114 | $this->callback = null; 1115 | } 1116 | 1117 | // save dom as string 1118 | function save($filepath='') 1119 | { 1120 | $ret = $this->root->innertext(); 1121 | if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX); 1122 | return $ret; 1123 | } 1124 | 1125 | // find dom node by css selector 1126 | // Paperg - allow us to specify that we want case insensitive testing of the value of the selector. 1127 | function find($selector, $idx=null, $lowercase=false) 1128 | { 1129 | return $this->root->find($selector, $idx, $lowercase); 1130 | } 1131 | 1132 | // clean up memory due to php5 circular references memory leak... 1133 | function clear() 1134 | { 1135 | foreach ($this->nodes as $n) {$n->clear(); $n = null;} 1136 | // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear. 1137 | if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;} 1138 | if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);} 1139 | if (isset($this->root)) {$this->root->clear(); unset($this->root);} 1140 | unset($this->doc); 1141 | unset($this->noise); 1142 | } 1143 | 1144 | function dump($show_attr=true) 1145 | { 1146 | $this->root->dump($show_attr); 1147 | } 1148 | 1149 | // prepare HTML data and init everything 1150 | protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1151 | { 1152 | $this->clear(); 1153 | 1154 | // set the length of content before we do anything to it. 1155 | $this->size = strlen($str); 1156 | // Save the original size of the html that we got in. It might be useful to someone. 1157 | $this->original_size = $this->size; 1158 | 1159 | //before we save the string as the doc... strip out the \r \n's if we are told to. 1160 | if ($stripRN) { 1161 | $str = str_replace("\r", " ", $str); 1162 | $str = str_replace("\n", " ", $str); 1163 | 1164 | // set the length of content since we have changed it. 1165 | $this->size = strlen($str); 1166 | } 1167 | 1168 | $this->doc = $str; 1169 | $this->pos = 0; 1170 | $this->cursor = 1; 1171 | $this->noise = array(); 1172 | $this->nodes = array(); 1173 | $this->lowercase = $lowercase; 1174 | $this->default_br_text = $defaultBRText; 1175 | $this->default_span_text = $defaultSpanText; 1176 | $this->root = new simple_html_dom_node($this); 1177 | $this->root->tag = 'root'; 1178 | $this->root->_[HDOM_INFO_BEGIN] = -1; 1179 | $this->root->nodetype = HDOM_TYPE_ROOT; 1180 | $this->parent = $this->root; 1181 | if ($this->size>0) $this->char = $this->doc[0]; 1182 | } 1183 | 1184 | // parse html content 1185 | protected function parse() 1186 | { 1187 | if (($s = $this->copy_until_char('<'))==='') 1188 | { 1189 | return $this->read_tag(); 1190 | } 1191 | 1192 | // text 1193 | $node = new simple_html_dom_node($this); 1194 | ++$this->cursor; 1195 | $node->_[HDOM_INFO_TEXT] = $s; 1196 | $this->link_nodes($node, false); 1197 | return true; 1198 | } 1199 | 1200 | // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later. 1201 | // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec 1202 | // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism. 1203 | protected function parse_charset() 1204 | { 1205 | global $debug_object; 1206 | 1207 | $charset = null; 1208 | 1209 | if (function_exists('get_last_retrieve_url_contents_content_type')) 1210 | { 1211 | $contentTypeHeader = get_last_retrieve_url_contents_content_type(); 1212 | $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches); 1213 | if ($success) 1214 | { 1215 | $charset = $matches[1]; 1216 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);} 1217 | } 1218 | 1219 | } 1220 | 1221 | if (empty($charset)) 1222 | { 1223 | $el = $this->root->find('meta[http-equiv=Content-Type]',0, true); 1224 | if (!empty($el)) 1225 | { 1226 | $fullvalue = $el->content; 1227 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);} 1228 | 1229 | if (!empty($fullvalue)) 1230 | { 1231 | $success = preg_match('/charset=(.+)/i', $fullvalue, $matches); 1232 | if ($success) 1233 | { 1234 | $charset = $matches[1]; 1235 | } 1236 | else 1237 | { 1238 | // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1239 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1240 | $charset = 'ISO-8859-1'; 1241 | } 1242 | } 1243 | } 1244 | } 1245 | 1246 | // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1247 | if (empty($charset)) 1248 | { 1249 | // Use this in case mb_detect_charset isn't installed/loaded on this machine. 1250 | $charset = false; 1251 | if (function_exists('mb_detect_encoding')) 1252 | { 1253 | // Have php try to detect the encoding from the text given to us. 1254 | $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1255 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);} 1256 | } 1257 | 1258 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1259 | if ($charset === false) 1260 | { 1261 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');} 1262 | $charset = 'UTF-8'; 1263 | } 1264 | } 1265 | 1266 | // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1267 | if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1268 | { 1269 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} 1270 | $charset = 'CP1252'; 1271 | } 1272 | 1273 | if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);} 1274 | 1275 | return $this->_charset = $charset; 1276 | } 1277 | 1278 | // read tag info 1279 | protected function read_tag() 1280 | { 1281 | if ($this->char!=='<') 1282 | { 1283 | $this->root->_[HDOM_INFO_END] = $this->cursor; 1284 | return false; 1285 | } 1286 | $begin_tag_pos = $this->pos; 1287 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1288 | 1289 | // end tag 1290 | if ($this->char==='/') 1291 | { 1292 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1293 | // This represents the change in the simple_html_dom trunk from revision 180 to 181. 1294 | // $this->skip($this->token_blank_t); 1295 | $this->skip($this->token_blank); 1296 | $tag = $this->copy_until_char('>'); 1297 | 1298 | // skip attributes in end tag 1299 | if (($pos = strpos($tag, ' '))!==false) 1300 | $tag = substr($tag, 0, $pos); 1301 | 1302 | $parent_lower = strtolower($this->parent->tag); 1303 | $tag_lower = strtolower($tag); 1304 | 1305 | if ($parent_lower!==$tag_lower) 1306 | { 1307 | if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) 1308 | { 1309 | $this->parent->_[HDOM_INFO_END] = 0; 1310 | $org_parent = $this->parent; 1311 | 1312 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 1313 | $this->parent = $this->parent->parent; 1314 | 1315 | if (strtolower($this->parent->tag)!==$tag_lower) { 1316 | $this->parent = $org_parent; // restore origonal parent 1317 | if ($this->parent->parent) $this->parent = $this->parent->parent; 1318 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1319 | return $this->as_text_node($tag); 1320 | } 1321 | } 1322 | else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) 1323 | { 1324 | $this->parent->_[HDOM_INFO_END] = 0; 1325 | $org_parent = $this->parent; 1326 | 1327 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) 1328 | $this->parent = $this->parent->parent; 1329 | 1330 | if (strtolower($this->parent->tag)!==$tag_lower) 1331 | { 1332 | $this->parent = $org_parent; // restore origonal parent 1333 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1334 | return $this->as_text_node($tag); 1335 | } 1336 | } 1337 | else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) 1338 | { 1339 | $this->parent->_[HDOM_INFO_END] = 0; 1340 | $this->parent = $this->parent->parent; 1341 | } 1342 | else 1343 | return $this->as_text_node($tag); 1344 | } 1345 | 1346 | $this->parent->_[HDOM_INFO_END] = $this->cursor; 1347 | if ($this->parent->parent) $this->parent = $this->parent->parent; 1348 | 1349 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1350 | return true; 1351 | } 1352 | 1353 | $node = new simple_html_dom_node($this); 1354 | $node->_[HDOM_INFO_BEGIN] = $this->cursor; 1355 | ++$this->cursor; 1356 | $tag = $this->copy_until($this->token_slash); 1357 | $node->tag_start = $begin_tag_pos; 1358 | 1359 | // doctype, cdata & comments... 1360 | if (isset($tag[0]) && $tag[0]==='!') { 1361 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); 1362 | 1363 | if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { 1364 | $node->nodetype = HDOM_TYPE_COMMENT; 1365 | $node->tag = 'comment'; 1366 | } else { 1367 | $node->nodetype = HDOM_TYPE_UNKNOWN; 1368 | $node->tag = 'unknown'; 1369 | } 1370 | if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1371 | $this->link_nodes($node, true); 1372 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1373 | return true; 1374 | } 1375 | 1376 | // text 1377 | if ($pos=strpos($tag, '<')!==false) { 1378 | $tag = '<' . substr($tag, 0, -1); 1379 | $node->_[HDOM_INFO_TEXT] = $tag; 1380 | $this->link_nodes($node, false); 1381 | $this->char = $this->doc[--$this->pos]; // prev 1382 | return true; 1383 | } 1384 | 1385 | if (!preg_match("/^[\w-:]+$/", $tag)) { 1386 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); 1387 | if ($this->char==='<') { 1388 | $this->link_nodes($node, false); 1389 | return true; 1390 | } 1391 | 1392 | if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>'; 1393 | $this->link_nodes($node, false); 1394 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1395 | return true; 1396 | } 1397 | 1398 | // begin tag 1399 | $node->nodetype = HDOM_TYPE_ELEMENT; 1400 | $tag_lower = strtolower($tag); 1401 | $node->tag = ($this->lowercase) ? $tag_lower : $tag; 1402 | 1403 | // handle optional closing tags 1404 | if (isset($this->optional_closing_tags[$tag_lower]) ) 1405 | { 1406 | while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) 1407 | { 1408 | $this->parent->_[HDOM_INFO_END] = 0; 1409 | $this->parent = $this->parent->parent; 1410 | } 1411 | $node->parent = $this->parent; 1412 | } 1413 | 1414 | $guard = 0; // prevent infinity loop 1415 | $space = array($this->copy_skip($this->token_blank), '', ''); 1416 | 1417 | // attributes 1418 | do 1419 | { 1420 | if ($this->char!==null && $space[0]==='') 1421 | { 1422 | break; 1423 | } 1424 | $name = $this->copy_until($this->token_equal); 1425 | if ($guard===$this->pos) 1426 | { 1427 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1428 | continue; 1429 | } 1430 | $guard = $this->pos; 1431 | 1432 | // handle endless '<' 1433 | if ($this->pos>=$this->size-1 && $this->char!=='>') { 1434 | $node->nodetype = HDOM_TYPE_TEXT; 1435 | $node->_[HDOM_INFO_END] = 0; 1436 | $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name; 1437 | $node->tag = 'text'; 1438 | $this->link_nodes($node, false); 1439 | return true; 1440 | } 1441 | 1442 | // handle mismatch '<' 1443 | if ($this->doc[$this->pos-1]=='<') { 1444 | $node->nodetype = HDOM_TYPE_TEXT; 1445 | $node->tag = 'text'; 1446 | $node->attr = array(); 1447 | $node->_[HDOM_INFO_END] = 0; 1448 | $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1); 1449 | $this->pos -= 2; 1450 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1451 | $this->link_nodes($node, false); 1452 | return true; 1453 | } 1454 | 1455 | if ($name!=='/' && $name!=='') { 1456 | $space[1] = $this->copy_skip($this->token_blank); 1457 | $name = $this->restore_noise($name); 1458 | if ($this->lowercase) $name = strtolower($name); 1459 | if ($this->char==='=') { 1460 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1461 | $this->parse_attr($node, $name, $space); 1462 | } 1463 | else { 1464 | //no value attr: nowrap, checked selected... 1465 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1466 | $node->attr[$name] = true; 1467 | if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev 1468 | } 1469 | $node->_[HDOM_INFO_SPACE][] = $space; 1470 | $space = array($this->copy_skip($this->token_blank), '', ''); 1471 | } 1472 | else 1473 | break; 1474 | } while ($this->char!=='>' && $this->char!=='/'); 1475 | 1476 | $this->link_nodes($node, true); 1477 | $node->_[HDOM_INFO_ENDSPACE] = $space[0]; 1478 | 1479 | // check self closing 1480 | if ($this->copy_until_char_escape('>')==='/') 1481 | { 1482 | $node->_[HDOM_INFO_ENDSPACE] .= '/'; 1483 | $node->_[HDOM_INFO_END] = 0; 1484 | } 1485 | else 1486 | { 1487 | // reset parent 1488 | if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node; 1489 | } 1490 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1491 | 1492 | // If it's a BR tag, we need to set it's text to the default text. 1493 | // This way when we see it in plaintext, we can generate formatting that the user wants. 1494 | // since a br tag never has sub nodes, this works well. 1495 | if ($node->tag == "br") 1496 | { 1497 | $node->_[HDOM_INFO_INNER] = $this->default_br_text; 1498 | } 1499 | 1500 | return true; 1501 | } 1502 | 1503 | // parse attributes 1504 | protected function parse_attr($node, $name, &$space) 1505 | { 1506 | // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037 1507 | // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one. 1508 | if (isset($node->attr[$name])) 1509 | { 1510 | return; 1511 | } 1512 | 1513 | $space[2] = $this->copy_skip($this->token_blank); 1514 | switch ($this->char) { 1515 | case '"': 1516 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 1517 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1518 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"')); 1519 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1520 | break; 1521 | case '\'': 1522 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE; 1523 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1524 | $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\'')); 1525 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1526 | break; 1527 | default: 1528 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; 1529 | $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr)); 1530 | } 1531 | // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace. 1532 | $node->attr[$name] = str_replace("\r", "", $node->attr[$name]); 1533 | $node->attr[$name] = str_replace("\n", "", $node->attr[$name]); 1534 | // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case. 1535 | if ($name == "class") { 1536 | $node->attr[$name] = trim($node->attr[$name]); 1537 | } 1538 | } 1539 | 1540 | // link node's parent 1541 | protected function link_nodes(&$node, $is_child) 1542 | { 1543 | $node->parent = $this->parent; 1544 | $this->parent->nodes[] = $node; 1545 | if ($is_child) 1546 | { 1547 | $this->parent->children[] = $node; 1548 | } 1549 | } 1550 | 1551 | // as a text node 1552 | protected function as_text_node($tag) 1553 | { 1554 | $node = new simple_html_dom_node($this); 1555 | ++$this->cursor; 1556 | $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>'; 1557 | $this->link_nodes($node, false); 1558 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1559 | return true; 1560 | } 1561 | 1562 | protected function skip($chars) 1563 | { 1564 | $this->pos += strspn($this->doc, $chars, $this->pos); 1565 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1566 | } 1567 | 1568 | protected function copy_skip($chars) 1569 | { 1570 | $pos = $this->pos; 1571 | $len = strspn($this->doc, $chars, $pos); 1572 | $this->pos += $len; 1573 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1574 | if ($len===0) return ''; 1575 | return substr($this->doc, $pos, $len); 1576 | } 1577 | 1578 | protected function copy_until($chars) 1579 | { 1580 | $pos = $this->pos; 1581 | $len = strcspn($this->doc, $chars, $pos); 1582 | $this->pos += $len; 1583 | $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next 1584 | return substr($this->doc, $pos, $len); 1585 | } 1586 | 1587 | protected function copy_until_char($char) 1588 | { 1589 | if ($this->char===null) return ''; 1590 | 1591 | if (($pos = strpos($this->doc, $char, $this->pos))===false) { 1592 | $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1593 | $this->char = null; 1594 | $this->pos = $this->size; 1595 | return $ret; 1596 | } 1597 | 1598 | if ($pos===$this->pos) return ''; 1599 | $pos_old = $this->pos; 1600 | $this->char = $this->doc[$pos]; 1601 | $this->pos = $pos; 1602 | return substr($this->doc, $pos_old, $pos-$pos_old); 1603 | } 1604 | 1605 | protected function copy_until_char_escape($char) 1606 | { 1607 | if ($this->char===null) return ''; 1608 | 1609 | $start = $this->pos; 1610 | while (1) 1611 | { 1612 | if (($pos = strpos($this->doc, $char, $start))===false) 1613 | { 1614 | $ret = substr($this->doc, $this->pos, $this->size-$this->pos); 1615 | $this->char = null; 1616 | $this->pos = $this->size; 1617 | return $ret; 1618 | } 1619 | 1620 | if ($pos===$this->pos) return ''; 1621 | 1622 | if ($this->doc[$pos-1]==='\\') { 1623 | $start = $pos+1; 1624 | continue; 1625 | } 1626 | 1627 | $pos_old = $this->pos; 1628 | $this->char = $this->doc[$pos]; 1629 | $this->pos = $pos; 1630 | return substr($this->doc, $pos_old, $pos-$pos_old); 1631 | } 1632 | } 1633 | 1634 | // remove noise from html content 1635 | // save the noise in the $this->noise array. 1636 | protected function remove_noise($pattern, $remove_tag=false) 1637 | { 1638 | global $debug_object; 1639 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1640 | 1641 | $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1642 | 1643 | for ($i=$count-1; $i>-1; --$i) 1644 | { 1645 | $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1646 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); } 1647 | $idx = ($remove_tag) ? 0 : 1; 1648 | $this->noise[$key] = $matches[$i][$idx][0]; 1649 | $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1650 | } 1651 | 1652 | // reset the length of content 1653 | $this->size = strlen($this->doc); 1654 | if ($this->size>0) 1655 | { 1656 | $this->char = $this->doc[0]; 1657 | } 1658 | } 1659 | 1660 | // restore noise to html content 1661 | function restore_noise($text) 1662 | { 1663 | global $debug_object; 1664 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1665 | 1666 | while (($pos=strpos($text, '___noise___'))!==false) 1667 | { 1668 | // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us... 1669 | if (strlen($text) > $pos+15) 1670 | { 1671 | $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1672 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); } 1673 | 1674 | if (isset($this->noise[$key])) 1675 | { 1676 | $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16); 1677 | } 1678 | else 1679 | { 1680 | // do this to prevent an infinite loop. 1681 | $text = substr($text, 0, $pos).'UNDEFINED NOISE FOR KEY: '.$key . substr($text, $pos+16); 1682 | } 1683 | } 1684 | else 1685 | { 1686 | // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem. 1687 | $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11); 1688 | } 1689 | } 1690 | return $text; 1691 | } 1692 | 1693 | // Sometimes we NEED one of the noise elements. 1694 | function search_noise($text) 1695 | { 1696 | global $debug_object; 1697 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } 1698 | 1699 | foreach($this->noise as $noiseElement) 1700 | { 1701 | if (strpos($noiseElement, $text)!==false) 1702 | { 1703 | return $noiseElement; 1704 | } 1705 | } 1706 | } 1707 | function __toString() 1708 | { 1709 | return $this->root->innertext(); 1710 | } 1711 | 1712 | function __get($name) 1713 | { 1714 | switch ($name) 1715 | { 1716 | case 'outertext': 1717 | return $this->root->innertext(); 1718 | case 'innertext': 1719 | return $this->root->innertext(); 1720 | case 'plaintext': 1721 | return $this->root->text(); 1722 | case 'charset': 1723 | return $this->_charset; 1724 | case 'target_charset': 1725 | return $this->_target_charset; 1726 | } 1727 | } 1728 | 1729 | // camel naming conventions 1730 | function childNodes($idx=-1) {return $this->root->childNodes($idx);} 1731 | function firstChild() {return $this->root->first_child();} 1732 | function lastChild() {return $this->root->last_child();} 1733 | function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();} 1734 | function createTextNode($value) {return @end(str_get_html($value)->nodes);} 1735 | function getElementById($id) {return $this->find("#$id", 0);} 1736 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 1737 | function getElementByTagName($name) {return $this->find($name, 0);} 1738 | function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);} 1739 | function loadFile() {$args = func_get_args();$this->load_file($args);} 1740 | } 1741 | 1742 | ?> 1743 | --------------------------------------------------------------------------------