├── .gitignore ├── tests ├── phpinfo_test.php ├── Page_test.html ├── errcode_test.php ├── autoload.php ├── return_this_test.php ├── test_simple_html_dom.php ├── Crawler_test.php ├── fetcher_db_test.php ├── curl_test.php ├── test_phpfetcher_util_trie.php ├── Page_test.php └── simple_html_dom.php ├── Phpfetcher ├── Manager │ └── Abstract.php ├── Dom │ ├── Abstract.php │ ├── SimpleHtmlDom.php │ └── simple_html_dom.php ├── Error.php ├── Page │ ├── Abstract.php │ └── Default.php ├── Crawler │ ├── Abstract.php │ └── Default.php ├── Log.php └── Util │ └── Trie.php ├── phpfetcher.php ├── deploy.sh ├── demo ├── structure ├── iframe_example.php ├── single_page.php ├── multi_page.php ├── crawl_baidu_page.php ├── get_picture.php └── crawl_with_headers.php └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | *.swp 3 | -------------------------------------------------------------------------------- /tests/phpinfo_test.php: -------------------------------------------------------------------------------- 1 | 4 | -------------------------------------------------------------------------------- /tests/Page_test.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanfank/phpfetcher/HEAD/tests/Page_test.html -------------------------------------------------------------------------------- /tests/errcode_test.php: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /Phpfetcher/Manager/Abstract.php: -------------------------------------------------------------------------------- 1 | 11 | -------------------------------------------------------------------------------- /tests/autoload.php: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /tests/return_this_test.php: -------------------------------------------------------------------------------- 1 | getInstance()->foo = 2; 11 | echo $objFoo->foo; 12 | ?> 13 | -------------------------------------------------------------------------------- /phpfetcher.php: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /tests/test_simple_html_dom.php: -------------------------------------------------------------------------------- 1 | find('//h1', 0)->plaintext); 7 | $res = $html->find('//p'); 8 | for($i = 0; $i < count($res); ++$i) { 9 | $arrContent = 10 | } 11 | for($i = 0; $i < $res->length; ++$i) { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/Abstract.php: -------------------------------------------------------------------------------- 1 | 'Success', 16 | self::ERR_INVALID_FIELD => 'Invalid field in array', 17 | self::ERR_FIELD_NOT_SET => 'Accessing a non-set field', 18 | ); 19 | 20 | public static function getErrmsg($errcode) { 21 | return self::$_arrErrcode2Errmsg[$errcode] . 
"\n"; 22 | } 23 | } 24 | ?> 25 | -------------------------------------------------------------------------------- /tests/Crawler_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'blog.reetsee' => array( 12 | 'start_page' => 'http://blog.reetsee.com', 13 | 'link_rules' => array( 14 | '/blog\.reetsee\.com/', 15 | '/wordpress/', 16 | ), 17 | ), 18 | 'qq' => array( 19 | 'start_page' => 'http://news.qq.com', 20 | 'link_rules' => array( 21 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 22 | ), 23 | 'max_depth' => 4, 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /tests/fetcher_db_test.php: -------------------------------------------------------------------------------- 1 | getHyperLinks()); 6 | } 7 | } 8 | 9 | $crawler = new mycrawler(); 10 | $arrFetchJobs = array( 11 | 'tencent' => array( 12 | 'start_page' => 'http://news.qq.com', 13 | 'link_rules' => array( 14 | '/(.*)\/a\/(\d{8})\/(\d+)\.htm/', 15 | ), 16 | 'max_depth' => 4, 17 | ), 18 | 'reetsee' => array( 19 | 'start_page' => 'http://blog.reetsee.com', 20 | 'link_rules' => array( 21 | '/blog\.reetsee\.com/', 22 | '/wordpress/', 23 | ), 24 | ), 25 | ); 26 | $crawler->setFetchJobs($arrFetchJobs)->run(); 27 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 28 | 29 | ?> 30 | -------------------------------------------------------------------------------- /demo/structure: -------------------------------------------------------------------------------- 1 | |-phpfetcher.php //使用时include这个文件即可 2 | |-Phpfetcher //此文件夹与phpfetcher.php要在同一目录下 3 | |-Error.php //出错处理相关 4 | |-Log.php //日志相关 5 | |-Manager //暂时无用 6 | |-Crawler //Crawler目录,存放爬虫相关的类代码 7 | |-Abstract.php //爬虫基类Phpfetcher_Crawler_Abstrct 8 | |-Default.php //默认提供的爬虫类Phpfetcher_Crawler_Default 9 | |-Page //Page目录,存放页面相关的类代码 10 | |-Abstract.php //页面基类Phpfetcher_Page_Abstract 11 | |-Default.php //默认提供的页面类Phpfetcher_Page_Default 12 | |-Dom //Dom目录,存放Dom相关的类代码 13 | |-Abstract.php //Dom基类Phpfetcher_Dom_Abstract 14 | |-SimpleHtmlDom.php //默认提供的Dom类Phpfetcher_Dom_SimpleHtmlDom 15 | |-simple_html_dom.php //Phpfetcher_Dom_SimpleHtmlDom实际使用了simple_html_dom中的代码 16 | -------------------------------------------------------------------------------- /tests/curl_test.php: -------------------------------------------------------------------------------- 1 | 0) { 17 | echo "available!\n"; 18 | } 19 | curl_close($objCurl); 20 | return; 21 | } 22 | 23 | $c = 'a'; 24 | for ($i = 0; $i < 26; $i++) { 25 | getAvailableDomain($strName . $c, $intMaxDepth); 26 | ++$c; 27 | } 28 | } 29 | 30 | getAvailableDomain('', 3); 31 | ?> 32 | -------------------------------------------------------------------------------- /demo/iframe_example.php: -------------------------------------------------------------------------------- 1 | getUrl() . "] +++\n"; 9 | 10 | //选取所有包含src属性的iframe元素 11 | $arrIframes = $page->sel('//iframe[@src]'); 12 | for ($i = 0; $i < count($arrIframes); ++$i) { 13 | $strSrc = $arrIframes[$i]->src; 14 | echo "found iframe url=[" . $strSrc . "]\n"; 15 | $this->addAdditionalUrls($strSrc); 16 | } 17 | echo "--- leave page: [" . $page->getUrl() . 
"] ---\n"; 18 | } 19 | 20 | }; 21 | 22 | $crawler = new mycrawler(); 23 | $arrJobs = array( 24 | '163' => array( 25 | 'start_page' => 'http://news.163.com', 26 | 'link_rules' => array(), 27 | 'max_depth' => 2, 28 | ) , 29 | ); 30 | 31 | $crawler->setFetchJobs($arrJobs)->run(); 32 | 33 | echo "Done!\n"; 34 | -------------------------------------------------------------------------------- /tests/test_phpfetcher_util_trie.php: -------------------------------------------------------------------------------- 1 | has("ftp"), true) . "\n"; 9 | echo "has http:" . var_export($trie->has("http"), true) . "\n"; 10 | echo "has https:" . var_export($trie->has("https"), true) . "\n"; 11 | echo "\n"; 12 | } 13 | 14 | $arrSchemes = array( 15 | "http", 16 | "https", 17 | "ftp", 18 | ); 19 | $trie = new Phpfetcher_Util_Trie($arrSchemes); 20 | print_trie($trie); 21 | 22 | echo "delete 'abc'\n"; 23 | $trie->delete("abc"); 24 | print_trie($trie); 25 | 26 | echo "delete 'ftp'\n"; 27 | $trie->delete("ftp"); 28 | print_trie($trie); 29 | 30 | echo "delete 'http'\n"; 31 | $trie->delete("http"); 32 | print_trie($trie); 33 | 34 | echo "insert 'ftp'\n"; 35 | $trie->insert("ftp"); 36 | print_trie($trie); 37 | 38 | echo "delete 'https'\n"; 39 | $trie->delete("https"); 40 | print_trie($trie); 41 | 42 | echo "insert 'http'\n"; 43 | $trie->insert("http"); 44 | print_trie($trie); 45 | -------------------------------------------------------------------------------- /tests/Page_test.php: -------------------------------------------------------------------------------- 1 | init(); 6 | //$page->setConfField('url', 'http://tech.qq.com/a/20140715/073002.htm'); 7 | $page->setConfField('url', 'http://news.qq.com/a/20140921/000030.htm'); 8 | $page->read(); 9 | //echo $page->getContent(); 10 | //$DOMElement_id_oneKey = $page->selId('oneKey'); 11 | //var_dump($DOMElement_id_oneKey); 12 | //echo "\n"; 13 | //var_dump($DOMElement_id_oneKey->parentNode); 14 | //echo "\n"; 15 | //var_dump($DOMElement_id_oneKey->childNodes); 16 | //echo $DOMElement_id_oneKey->nodeValue; 17 | //print_r($page->xpath('//meta[@http-equiv]')); 18 | //var_dump($page->xpath2('//meta[@http-equiv]')); 19 | $res = $page->xpath('//title'); 20 | //$res = $page->xpath('a'); 21 | var_dump($res->item(0)->nodeValue); 22 | //var_dump($res->item(1)->nodeValue); 23 | /* 24 | $arrLinks = array(); 25 | $res = $page->xpath('//a/@href'); 26 | for($i = 0; $i < $res->length;++$i) { 27 | //var_dump($res->item($i)); 28 | $arrLinks[] = $res->item($i)->nodeValue; 29 | } 30 | */ 31 | //$arrLinks = $page->getHyperLinks(); 32 | //print_r($arrLinks); 33 | 34 | 35 | 36 | ?> 37 | -------------------------------------------------------------------------------- /demo/single_page.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com/a/20140927/026557.htm', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | ) , 35 | ); 36 | 37 | 
//$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 38 | $crawler->setFetchJobs($arrJobs); 39 | $crawler->run(); 40 | -------------------------------------------------------------------------------- /demo/multi_page.php: -------------------------------------------------------------------------------- 1 | sel('//h1', 0)->plaintext); 11 | if (!empty($strFirstH1)) { 12 | echo $page->sel('//h1', 0)->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://news.qq.com', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | '#news\.qq\.com/a/\d+/\d+\.htm$#', 30 | ), 31 | //爬虫从开始页面算起,最多爬取的深度,设置为2表示爬取深度为1 32 | //Crawler's max following depth, 1 stands for only crawl the start page 33 | 'max_depth' => 2, 34 | 35 | ) , 36 | ); 37 | 38 | $crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 39 | //$crawler->setFetchJobs($arrJobs); 40 | //$crawler->run(); 41 | -------------------------------------------------------------------------------- /demo/crawl_baidu_page.php: -------------------------------------------------------------------------------- 1 | sel('//h3/a'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | echo $res[$i]->getAttribute('href'); 15 | echo "\n"; 16 | echo "\n"; 17 | } 18 | } 19 | } 20 | 21 | $crawler = new mycrawler(); 22 | $arrJobs = array( 23 | //任务的名字随便起,这里把名字叫qqnews 24 | //the key is the name of a job, here names it qqnews 25 | 'qqnews' => array( 26 | 'start_page' => 'https://www.baidu.com/s?wd=facebook', //起始网页 27 | 'link_rules' => array( 28 | /* 29 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 30 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 31 | */ 32 | ), 33 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 34 | //Crawler's max following depth, 1 stands for only crawl the start page 35 | 'max_depth' => 1, 36 | 37 | ) , 38 | ); 39 | 40 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 41 | $crawler->setFetchJobs($arrJobs); 42 | $crawler->run(); 43 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Abstract.php: -------------------------------------------------------------------------------- 1 | 标签的内容,以数组形式返回 65 | abstract function getHyperLinks(); 66 | } 67 | ?> 68 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Abstract.php: -------------------------------------------------------------------------------- 1 | _arrExtraInfo[$field]; 33 | } 34 | return $arrOutput; 35 | } 36 | */ 37 | 38 | /* 39 | public function setExtraInfo($arrInput = array()) { 40 | if (!is_array($arrInput) || empty($arrInput)) { 41 | return FALSE; 42 | } 43 | foreach ($arrInput as $key => $value) { 44 | $this->_arrExtraInfo[$key] = $value; 45 | } 46 | return TRUE; 47 | } 48 | */ 49 | 50 | /* 51 | //修改一条已有的规则 52 | public function setFetchJobByName() { 53 | Phpfetcher_Error::notice('not implemented'); 54 | } 55 | */ 56 | 57 | //运行爬虫 58 | abstract function &run($arrInput = array()); 59 | } 60 | ?> 61 | -------------------------------------------------------------------------------- /demo/get_picture.php: 
-------------------------------------------------------------------------------- 1 | sel("//p"); 10 | for ($i = 0; $i < count($objContent); ++$i) { 11 | $objPic = $objContent[$i]->find("img"); 12 | for ($j = 0; $j < count($objPic); ++$j) { 13 | echo $objPic[$j]->getAttribute('src') . "\n"; 14 | echo $objPic[$j]->getAttribute('alt') . "\n"; 15 | echo $objContent[$i]->plaintext . "\n"; 16 | echo $objContent[$i]->outertext() . "\n"; 17 | } 18 | } 19 | 20 | ////打印处当前页面的title 21 | //$res = $page->sel('//title'); 22 | //for ($i = 0; $i < count($res); ++$i) { 23 | // echo $res[$i]->plaintext; 24 | // echo "\n"; 25 | //} 26 | } 27 | } 28 | 29 | $crawler = new mycrawler(); 30 | $arrJobs = array( 31 | //任务的名字随便起,这里把名字叫qqnews 32 | //the key is the name of a job, here names it qqnews 33 | 'qqnews' => array( 34 | 'start_page' => 'http://news.163.com/16/0325/21/BJ1I6PN40001124J.html', //起始网页 35 | 'link_rules' => array( 36 | /* 37 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 38 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 39 | */ 40 | ), 41 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 42 | //Crawler's max following depth, 1 stands for only crawl the start page 43 | 'max_depth' => 1, 44 | 45 | ) , 46 | ); 47 | 48 | //$crawler->setFetchJobs($arrJobs)->run(); //这一行的效果和下面两行的效果一样 49 | $crawler->setFetchJobs($arrJobs); 50 | $crawler->run(); 51 | -------------------------------------------------------------------------------- /Phpfetcher/Dom/SimpleHtmlDom.php: -------------------------------------------------------------------------------- 1 | _dom, 'clear')) { 17 | $this->_dom->clear(); 18 | } 19 | } 20 | 21 | public function getElementById($id) { 22 | $strMethodName = 'getElementById'; 23 | if (method_exists($this->_dom, $strMethodName)) { 24 | return $this->_dom->getElementById($id); 25 | } else { 26 | Phpfetcher_Log::warning("method $strMethodName not exists"); 27 | return FALSE; 28 | } 29 | } 30 | 31 | public function getElementsByTagName($tag) { 32 | $strMethodName = 'getElementsByTagName'; 33 | if (method_exists($this->_dom, $strMethodName)) { 34 | return $this->_dom->getElementsByTagName($tag); 35 | } else { 36 | Phpfetcher_Log::warning("method $strMethodName not exists"); 37 | return FALSE; 38 | } 39 | } 40 | 41 | public function loadHTML($content) { 42 | if (NULL === $this->_dom) { 43 | if (function_exists('str_get_html')) { 44 | $this->_dom = str_get_html($content); 45 | } 46 | } else { 47 | if (method_exists($this->_dom, 'load')) { 48 | $this->_dom->load($content); 49 | } 50 | } 51 | 52 | return $this; 53 | } 54 | 55 | /** 56 | * @deprecated 57 | */ 58 | public function sel($pattern = '', $idx = NULL, $node = NULL) { 59 | return $this->find($pattern, $idx); 60 | } 61 | 62 | public function find($pattern = '', $idx = NULL) { 63 | $strMethodName = 'find'; 64 | if (method_exists($this->_dom, $strMethodName)) { 65 | return $this->_dom->find($pattern, $idx); 66 | } else { 67 | Phpfetcher_Log::warning("method $strMethodName not exists"); 68 | return FALSE; 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Phpfetcher/Log.php: -------------------------------------------------------------------------------- 1 | $intTraceDepth) { 62 | $intDepth = $intTraceDepth; 63 | } 64 | $arrTargetTrace = $arrTrace[$intDepth]; 65 | unset($arrTrace); 66 | if (isset($arrTargetTrace['file'])) { 67 | $arrTargetTrace['file'] = basename($arrTargetTrace['file']); 68 | } 69 | 70 | $strPrepend = strval(@date("Y-m-d H:i:s")) . 
" {$arrTargetTrace['file']} {$arrTargetTrace['class']} {$arrTargetTrace['function']} {$arrTargetTrace['line']} " . $strPrepend . ' '; 71 | 72 | $strMsg = $strPrepend . $strMsg . $strAppend; 73 | 74 | echo $strMsg; 75 | } 76 | } 77 | ?> 78 | -------------------------------------------------------------------------------- /Phpfetcher/Util/Trie.php: -------------------------------------------------------------------------------- 1 | _arrTrieRoot = array( 14 | 'children' => array(), 15 | 'count' => 0, 16 | ); 17 | foreach ($arrStrings as $str) { 18 | $this->insert($str); 19 | } 20 | } 21 | 22 | public function insert($str) { 23 | try { 24 | $str = strval($str); 25 | $intLen = strlen($str); 26 | $arrCurNode = &$this->_arrTrieRoot; 27 | 28 | for ($i = 0; $i < $intLen; ++$i) { 29 | if (!isset($arrCurNode['children'][$str[$i]])) { 30 | $arrCurNode['children'][$str[$i]] = array( 31 | 'children' => array(), 32 | 'count' => 0, 33 | ); 34 | } 35 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 36 | } 37 | 38 | $arrCurNode['count'] += 1; 39 | unset($arrCurNode); 40 | 41 | } catch (Exception $e) { 42 | Phpfetcher_Log::fatal($e->getMessage()); 43 | return false; 44 | } 45 | 46 | return true; 47 | } 48 | 49 | public function delete($str) { 50 | $arrCurNode = &$this->_locateNode($str); 51 | if (!is_null($arrCurNode) && $arrCurNode['count'] > 0) { 52 | $arrCurNode['count'] -= 1; 53 | } 54 | unset($arrCurNode); 55 | return true; 56 | } 57 | 58 | public function has($str) { 59 | $arrTargetNode = &$this->_locateNode($str); 60 | $bolRes = false; 61 | if (!is_null($arrTargetNode) && $arrTargetNode['count'] > 0) { 62 | $bolRes = true; 63 | } 64 | unset($arrTargetNode); 65 | return $bolRes; 66 | } 67 | 68 | protected function &_locateNode($str) { 69 | $str = strval($str); 70 | $intLen = strlen($str); 71 | $arrCurNode = &$this->_arrTrieRoot; 72 | 73 | for ($i = 0; $i < $intLen; ++$i) { 74 | if (!isset($arrCurNode['children'][$str[$i]])) { 75 | return null; 76 | } 77 | $arrCurNode = &$arrCurNode['children'][$str[$i]]; 78 | } 79 | 80 | return $arrCurNode; 81 | } 82 | 83 | //public function startsWith($str) { 84 | // $str = strval($str); 85 | // //TODO 86 | //} 87 | }; 88 | -------------------------------------------------------------------------------- /demo/crawl_with_headers.php: -------------------------------------------------------------------------------- 1 | sel('//title'); 11 | for ($i = 0; $i < count($res); ++$i) { 12 | echo $res[$i]->plaintext; 13 | echo "\n"; 14 | } 15 | } 16 | } 17 | 18 | $crawler = new mycrawler(); 19 | $arrJobs = array( 20 | //任务的名字随便起,这里把名字叫qqnews 21 | //the key is the name of a job, here names it qqnews 22 | 'qqnews' => array( 23 | 'start_page' => 'http://jianli.58.com/resume/93489192884492', //起始网页 24 | 'link_rules' => array( 25 | /* 26 | * 所有在这里列出的正则规则,只要能匹配到超链接,那么那条爬虫就会爬到那条超链接 27 | * Regex rules are listed here, the crawler will follow any hyperlinks once the regex matches 28 | */ 29 | ), 30 | //爬虫从开始页面算起,最多爬取的深度,设置为1表示只爬取起始页面 31 | //Crawler's max following depth, 1 stands for only crawl the start page 32 | 'max_depth' => 1, 33 | 34 | //某些页面做了防抓取策略,可以通过修改UA,或者添加必要的HTTP Header来防止屏蔽 35 | //Some pages may prevent crawlers from working, you may change UA or add 36 | // necessary HTTP Headers to prevent this. 
37 | 'page_conf' => array( 38 | 'http_header' => array( 39 | //如果本例子对于你来说运行不成功(发生了错误),那么请将下面的Header 40 | // 替换成与你浏览器请求Header一样的内容,但是不要添加Accept-Encoding 41 | // 这个Header 42 | //If this example can not run successfully, please replace the Headers 43 | // below with the ones exactly you see from your browser. Remember 44 | // not to add Accept-Encoding header. 45 | 'Host: jianli.m.58.com', 46 | 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 47 | 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | 'Cookie: 58home=tj; id58=c5/ns1enV2k5MFGqLUAXAg==; city=tj; 58tj_uuid=1cf71e54-dd15-4922-8228-b6bb809edbfd; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; myfeet_tooltip=end; als=0; Hm_lvt_2557cda77f2e9a8b94531c9501582142=1470585797; Hm_lpvt_2557cda77f2e9a8b94531c9501582142=1470585797; 4drh9g=test insert val', 49 | 'Connection: keep-alive', 50 | 'Cache-Control: max-age=0', 51 | 52 | //不要添加Accept-Encoding的Header 53 | //Do not add Accept-Encoding Header 54 | //'Accept-Encoding: gzip, deflate' 55 | ), 56 | ), 57 | ) , 58 | ); 59 | 60 | $crawler->setFetchJobs($arrJobs); 61 | $crawler->run(); 62 | -------------------------------------------------------------------------------- /Phpfetcher/Page/Default.php: -------------------------------------------------------------------------------- 1 | CURLOPT_HEADER, 13 | 'exclude_body' => CURLOPT_NOBODY, 14 | 'is_post' => CURLOPT_POST, 15 | 'is_verbose' => CURLOPT_VERBOSE, 16 | 'return_transfer'=> CURLOPT_RETURNTRANSFER, 17 | 18 | /* int */ 19 | 'buffer_size' => CURLOPT_BUFFERSIZE, 20 | 'connect_timeout' => CURLOPT_CONNECTTIMEOUT, 21 | 'connect_timeout_ms' => CURLOPT_CONNECTTIMEOUT_MS, 22 | 'dns_cache_timeout' => CURLOPT_DNS_CACHE_TIMEOUT, 23 | 'max_redirs' => CURLOPT_MAXREDIRS, 24 | 'port' => CURLOPT_PORT, 25 | 'timeout' => CURLOPT_TIMEOUT, 26 | 'timeout_ms' => CURLOPT_TIMEOUT_MS, 27 | 28 | /* string */ 29 | 'cookie' => CURLOPT_COOKIE, 30 | 'cookie_file' => CURLOPT_COOKIEFILE, 31 | 'cookie_jar' => CURLOPT_COOKIEJAR, 32 | 'post_fields' => CURLOPT_POSTFIELDS, 33 | 'url' => CURLOPT_URL, 34 | 'user_agent' => CURLOPT_USERAGENT, 35 | 'user_pwd' => CURLOPT_USERPWD, 36 | 37 | /* array */ 38 | 'http_header' => CURLOPT_HTTPHEADER, 39 | 40 | /* stream resource */ 41 | 'file' => CURLOPT_FILE, 42 | 43 | /* function or a Closure */ 44 | 'write_function' => CURLOPT_WRITEFUNCTION, 45 | 46 | /* https */ 47 | 'ssl_verifypeer' => CURLOPT_SSL_VERIFYPEER, 48 | ); 49 | 50 | protected $_arrDefaultConf = array( 51 | 'connect_timeout' => 10, 52 | 'max_redirs' => 10, 53 | 'return_transfer' => 1, //need this 54 | 'timeout' => 15, 55 | 'url' => NULL, 56 | 'user_agent' => 'firefox', 57 | 'ssl_verifypeer' => false, 58 | ); 59 | 60 | protected $_arrConf = array(); 61 | protected $_arrExtraInfo = array(); 62 | protected $_bolCloseCurlHandle = FALSE; 63 | protected $_curlHandle = NULL; 64 | protected $_dom = NULL; 65 | //protected $_xml = NULL; 66 | 67 | public function __construct() { 68 | } 69 | public function __destruct() { 70 | if ($this->_bolCloseCurlHandle) { 71 | curl_close($this->_curlHandle); 72 | } 73 | } 74 | 75 | public static function formatRes($data, $errcode, $errmsg = NULL) { 76 | if ($errmsg === NULL) { 77 | $errmsg = Phpfetcher_Error::getErrmsg($errcode); 78 | } 79 | return array('errcode' => $errcode, 'errmsg' => $errmsg, 'res' => $data); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @desc get configurations. 
85 | */ 86 | public function getConf() { 87 | return $this->_arrConf; 88 | } 89 | 90 | /** 91 | * @author xuruiqi 92 | * @param $key: specified field 93 | * @return 94 | * bool : false when field doesn't exist 95 | * mixed : otherwise 96 | * @desc get a specified configuration. 97 | */ 98 | public function getConfField($key) { 99 | if (isset($this->_arrConf[$key])) { 100 | return self::formatRes($this->_arrConf[$key], Phpfetcher_Error::ERR_SUCCESS); 101 | } else { 102 | return self::formatRes(NULL, Phpfetcher_Error::ERR_FIELD_NOT_SET); 103 | } 104 | } 105 | 106 | public function getContent() { 107 | return $this->_strContent; 108 | } 109 | 110 | public function getExtraInfo($arrInput) { 111 | $arrOutput = array(); 112 | foreach ($arrInput as $req_key) { 113 | $arrOutput[$req_key] = $this->_arrExtraInfo[$req_key]; 114 | } 115 | return $arrOutput; 116 | } 117 | 118 | public function getHyperLinks() { 119 | $arrLinks = array(); 120 | $res = $this->sel('//a'); 121 | for ($i = 0; $i < count($res); ++$i) { 122 | $arrLinks[] = $res[$i]->href; 123 | } 124 | /* 125 | foreach ($res as $node) { 126 | $arrLinks[] = $node->href; 127 | } 128 | */ 129 | return $arrLinks; 130 | } 131 | 132 | /** 133 | * @author xuruiqi 134 | * @param 135 | * @return 136 | * string : current page's url 137 | * @desc get this page's URL. 138 | */ 139 | public function getUrl() { 140 | $arrRet = $this->getConfField('url'); 141 | return strval($arrRet['res']); 142 | } 143 | 144 | /** 145 | * @author xuruiqi 146 | * @param 147 | * array $conf : configurations 148 | * bool $clear_default : whether to clear default options not set in $conf 149 | * @return 150 | * @desc initialize this instance with specified or default configuration 151 | */ 152 | public function init($curl_handle = NULL, $conf = array()) { 153 | $this->_curlHandle = $curl_handle; 154 | if (empty($this->_curlHandle)) { 155 | $this->_curlHandle = curl_init(); 156 | $this->_bolCloseCurlHandle = TRUE; 157 | } 158 | $this->_arrConf = $this->_arrDefaultConf; 159 | 160 | $this->setConf($conf, TRUE); 161 | 162 | return $this; 163 | } 164 | 165 | /** 166 | * @author xuruiqi 167 | * @param 168 | * array $ids : elements' ids 169 | * @return 170 | * array : array of DOMElement, with keys equal each of ids 171 | * NULL : if $this->_dom equals NULL 172 | * @desc select spcified elements with their ids. 
173 | */ 174 | /* 175 | public function mselId($ids) { 176 | if ($this->_dom === NULL) { 177 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 178 | return NULL; 179 | } 180 | 181 | $arrOutput = array(); 182 | foreach ($ids as $id) { 183 | $arrOutput[$id] = $this->selId($id); 184 | } 185 | return $arrOutput; 186 | } 187 | */ 188 | 189 | /** 190 | * @author xuruiqi 191 | * @param 192 | * array $tags : elements' tags 193 | * @return 194 | * array : array of DOMNodeList, with keys equal each of tags 195 | * NULL : if $this->_dom equals NULL 196 | * @desc select spcified elements with their tags 197 | */ 198 | /* 199 | public function mselTagName($tags) { 200 | if ($this->_dom === NULL) { 201 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 202 | return NULL; 203 | } 204 | 205 | $arrOutput = array(); 206 | foreach ($tags as $tag) { 207 | $arrOutput[$tag] = $this->selId($tag); 208 | } 209 | return $arrOutput; 210 | } 211 | */ 212 | 213 | 214 | /** 215 | * @author xuruiqi 216 | * @param 217 | * array $conf : configurations 218 | * bool $clear_previous_conf : if TRUE, then before set $conf, reset current configuration to its default value 219 | * @return 220 | * array : previous conf 221 | * @desc set configurations. 222 | */ 223 | public function setConf($conf = array(), $clear_previous_conf = FALSE) { 224 | $previous_conf = $this->_arrConf; 225 | if ($clear_previous_conf === TRUE) { 226 | $this->_arrConf = $this->_arrDefaultConf; 227 | } 228 | foreach ($conf as $k => $v) { 229 | $this->_arrConf[$k] = $v; 230 | } 231 | 232 | $bolRes = TRUE; 233 | if ($clear_previous_conf === TRUE) { 234 | $bolRes = $this->_setConf($this->_arrConf); 235 | } else { 236 | $bolRes = $this->_setConf($conf); 237 | } 238 | 239 | if ($bolRes != TRUE) { 240 | $this->_arrConf = $previous_conf; 241 | $this->_setConf($this->_arrConf); 242 | return $bolRes; 243 | } 244 | 245 | return $previous_conf; 246 | } 247 | 248 | protected function _setConf($conf = array()) { 249 | $arrCurlOpts = array(); 250 | foreach ($conf as $k => $v) { 251 | if (isset(self::$_arrField2CurlOpt[$k])) { 252 | $arrCurlOpts[self::$_arrField2CurlOpt[$k]] = $v; 253 | } else { 254 | //currently only curl options can be set 255 | $arrCurlOpts[$k] = $v; 256 | } 257 | } 258 | return curl_setopt_array($this->_curlHandle, $arrCurlOpts); 259 | } 260 | 261 | public function setExtraInfo($arrInput) { 262 | foreach ($arrInput as $key => $val) { 263 | $this->_arrExtraInfo[$key] = $val; 264 | } 265 | } 266 | 267 | /** 268 | * @author xuruiqi 269 | * @param 270 | * string $id : specifed element id 271 | * @return 272 | * object : DOMElement or NULL is not found 273 | * NULL : if $this->_dom equals NULL 274 | * @desc select a spcified element via its id. 275 | */ 276 | public function selId($id) { 277 | if ($this->_dom === NULL) { 278 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 279 | return NULL; 280 | } 281 | 282 | return $this->_dom->getElementById($id); 283 | } 284 | 285 | /** 286 | * @author xuruiqi 287 | * @param 288 | * string $tag : specifed elements' tag name 289 | * @return 290 | * object : a traversable DOMNodeList object containing all the matched elements 291 | * NULL : if $this->_dom equals NULL 292 | * @desc select spcified elements via its tag name. 
293 | */ 294 | public function selTagName($tag) { 295 | if ($this->_dom === NULL) { 296 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 297 | return NULL; 298 | } 299 | 300 | return $this->_dom->getElementsByTagName($tag); 301 | } 302 | 303 | public function setConfField($field, $value) { 304 | $this->_arrConf[$field] = $value; 305 | return $this->_setConfField($field, $value); 306 | } 307 | 308 | protected function _setConfField($field, $value) { 309 | if (isset(self::$_arrField2CurlOpt[$field])) { 310 | return curl_setopt($this->_curlHandle, self::$_arrField2CurlOpt[$field], $value); 311 | } else { 312 | //currently only curl options can be set 313 | return curl_setopt($this->_curlHandle, $field, $value); 314 | } 315 | } 316 | 317 | /** 318 | * @author xuruiqi 319 | * @param 320 | * string $url : the URL 321 | * @return 322 | * string : previous URL 323 | * @desc set this page's URL. 324 | */ 325 | public function setUrl($url) { 326 | $previous_url = $this->_arrConf['url']; 327 | $this->setConfField('url', $url); 328 | return $previous_url; 329 | } 330 | 331 | /** 332 | * @author xuruiqi 333 | * @param 334 | * array $arrHeaderList : header list 335 | * e.g. 336 | * array( 337 | * ... 338 | * "Cookie: xxxxx", 339 | * ... 340 | * "Header_n: header_n_value", 341 | * ) 342 | * @return 343 | * this 344 | * @desc set header of the next fetch 345 | */ 346 | public function &setHeaders($arrHeaderList) { 347 | $this->setConf(array( 348 | "http_header" => $arrHeaderList 349 | )); 350 | return $this; 351 | } 352 | 353 | /** 354 | * @author xuruiqi 355 | * @param 356 | * @return 357 | * string : return page's content 358 | * bool : if failed return FALSE 359 | * @desc get page's content, and save it into member variable <_strContent> 360 | */ 361 | public function read() { 362 | $this->_strContent = curl_exec($this->_curlHandle); 363 | if ($this->_strContent != FALSE) { 364 | $matches = array(); 365 | preg_match('#charset="?([a-zA-Z0-9-\._]+)"?#', $this->_strContent, $matches); 366 | if (!empty($matches[1])) { 367 | //Phpfetcher_Log::notice("Convert content from {$matches[1]} to UTF-8\n"); 368 | $this->_strContent = mb_convert_encoding($this->_strContent, 'UTF-8', $matches[1]); 369 | } 370 | 371 | /* 372 | $this->_dom = new DOMDocument(); //DOMDocument's compatibility is bad 373 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 374 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 375 | $this->_dom = NULL; 376 | $this->_domxpath = NULL; 377 | } else { 378 | $this->_domxpath = new DOMXPath($this->_dom); 379 | } 380 | */ 381 | 382 | $this->_dom = new Phpfetcher_Dom_SimpleHtmlDom(); 383 | if (@$this->_dom->loadHTML($this->_strContent) == FALSE) { 384 | Phpfetcher_Log::warning('Failed to call $this->_dom->loadHTML'); 385 | $this->_dom = NULL; 386 | } 387 | } 388 | return $this->_strContent; 389 | } 390 | 391 | /** 392 | * @author xuruiqi 393 | * @param 394 | * string $strPath : xpath's path 395 | * [DOMNode $contextnode : The optional contextnode can be specified for doing relative XPath queries. By default, the queries are relative to the root element.] 
396 | * 397 | * @return 398 | * DOMNodelist : DOMNodelist object 399 | * NULL : if $this->_dom equals NULL 400 | * false : if error occurs 401 | * @desc select corresponding content use xpath 402 | */ 403 | public function sel($strPath, $intIndex = NULL, $contextnode = NULL) { 404 | if ($this->_dom === NULL) { 405 | Phpfetcher_Log::warning('$this->_dom is NULL!'); 406 | return NULL; 407 | } 408 | 409 | if ($contextnode !== NULL) { 410 | //$res = $this->_domxpath->query($strPath, $contextnode); 411 | Phpfetcher_Log::warning('param contextnode is no use because of this function\'s inability'); 412 | $res = $this->_dom->sel($strPath, $intIndex); 413 | } else { 414 | //$res = $this->_domxpath->query($strPath); 415 | $res = $this->_dom->sel($strPath, $intIndex); 416 | } 417 | 418 | return $res; 419 | } 420 | } 421 | ?> 422 | -------------------------------------------------------------------------------- /Phpfetcher/Crawler/Default.php: -------------------------------------------------------------------------------- 1 | self::STR_TYPE, 24 | 'link_rules' => self::ARR_TYPE, 25 | 'max_depth' => self::INT_TYPE, 26 | 'max_pages' => self::INT_TYPE, 27 | ); 28 | 29 | /* 30 | protected static $arrJobDefaultFields = array( 31 | 'max_depth' => self::MAX_DEPTH, 32 | 'max_pages' => self::MAX_PAGE_NUM, 33 | ); 34 | */ 35 | 36 | protected $_arrFetchJobs = array(); 37 | protected $_arrHash = array(); 38 | protected $_arrAdditionalUrls = array(); 39 | protected $_objSchemeTrie = array(); //合法url scheme的字典树 40 | //protected $_objPage = NULL; //Phpfetcher_Page_Default; 41 | 42 | public function __construct($arrInitParam = array()) { 43 | if (!isset($arrInitParam['url_schemes'])) { 44 | $arrInitParam['url_schemes'] = array("http", "https", "ftp"); 45 | } 46 | 47 | $this->_objSchemeTrie = 48 | new Phpfetcher_Util_Trie($arrInitParam['url_schemes']); 49 | } 50 | 51 | /** 52 | * @author xuruiqi 53 | * @param 54 | * array $arrInput: 55 | * array <任务名1> : 56 | * string 'start_page', //爬虫的起始页面 57 | * array 'link_rules': //爬虫跟踪的超链接需要满足的正则表达式,依次检查规则,匹配其中任何一条即可 58 | * string 0, //正则表达式1 59 | * string 1, //正则表达式2 60 | * ... 61 | * string n-1, //正则表达式n 62 | * int 'max_depth' , //爬虫最大的跟踪深度,目前限制最大值不超过20 63 | * int 'max_pages' , //最多爬取的页面数,默认指定为-1,表示没有限制 64 | * array <任务名2> : 65 | * ... 66 | * ... 67 | * ... 68 | * array <任务名n-1>: 69 | * ... 70 | * ... 71 | * 72 | * @return 73 | * Object $this : returns the instance itself 74 | * @desc add by what rules the crawler should fetch the pages 75 | * if a job has already been in jobs queue, new rules will 76 | * cover the old ones. 77 | */ 78 | public function &addFetchJobs($arrInput = array()) { 79 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_ADD); 80 | } 81 | 82 | /** 83 | * @author xuruiqi 84 | * @param 85 | * array $arrInput : 86 | * mixed 0 : 87 | * 任务名 88 | * mixed 1 : 89 | * 任务名 90 | * ... ... 
91 | * @return 92 | * Object $this : returns the instance itself 93 | * @desc delete fetch rules according to job names 94 | */ 95 | public function &delFetchJobs($arrInput = array()) { 96 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_DEL); 97 | } 98 | 99 | public function getFetchJobByName($job_name) { 100 | return $this->_arrFetchJobs[$strJobName]; 101 | } 102 | 103 | public function getFetchJobs() { 104 | return $this->_arrFetchJobs; 105 | } 106 | 107 | /* 108 | public function handlePage() { 109 | //由用户继承本类并实现此方法 110 | } 111 | */ 112 | 113 | /** 114 | * @author xuruiqi 115 | * @param : 116 | * //$intOptType === MODIFY_JOBS_SET|MODIFY_JOBS_ADD, 117 | * $arrInput参见addFetchJobs的入参$arrInput 118 | * //$intOptType === MODIFY_JOBS_DEL, 119 | * $arrInput参见delFetchJobs的入参$arrInput 120 | * 121 | * @return 122 | * Object $this : returns the instance itself 123 | * @desc set fetch rules. 124 | */ 125 | protected function &_modifyFetchJobs($arrInput = array(), $intOptType) { 126 | $arrInvalidJobs = array(); 127 | if ($intOptType === self::MODIFY_JOBS_SET || $intOptType === self::MODIFY_JOBS_ADD) { 128 | if ($intOptType === self::MODIFY_JOBS_SET) { 129 | $this->_arrFetchJobs = array(); 130 | } 131 | foreach ($arrInput as $job_name => $job_rules) { 132 | $this->_correctJobParam($job_rules); 133 | if ($this->_isJobValid($job_rules)) { 134 | $this->_arrFetchJobs[$job_name] = $job_rules; 135 | } else { 136 | $arrInvalidJobs[] = $job_name; 137 | } 138 | } 139 | } else if ($intOptType === self::MODIFY_JOBS_DEL) { 140 | foreach ($arrInput as $job_name) { 141 | unset($this->_arrFetchJobs[$job_name]); 142 | } 143 | } else { 144 | Phpfetcher_Log::warning("Unknown options for fetch jobs [{$intOptType}]"); 145 | } 146 | 147 | 148 | if (!empty($arrInvalidJobs)) { 149 | Phpfetcher_Log::notice('Invalid jobs:' . implode(',', $arrInvalidJobs)); 150 | } 151 | return $this; 152 | } 153 | 154 | /** 155 | * @author xuruiqi 156 | * @param : 参见addFetchJobs的入参$arrInput 157 | * 158 | * @return 159 | * Object $this : returns the instance itself 160 | * @desc set fetch jobs. 161 | */ 162 | public function &setFetchJobs($arrInput = array()) { 163 | return $this->_modifyFetchJobs($arrInput, self::MODIFY_JOBS_SET); 164 | } 165 | 166 | /** 167 | * @author xuruiqi 168 | * @param 169 | * array $arrInput : //运行设定 170 | * string 'page_class_name' : //指定要使用的Page类型,必须是 171 | * //Phpfetcher_Page_Abstract的 172 | * //子类 173 | * [array 'page_conf'] : //Page调用setConf时的输入参数,可选 174 | * @return 175 | * obj $this 176 | * @desc 177 | */ 178 | public function &run($arrInput = array()) { 179 | if (empty($this->_arrFetchJobs)) { 180 | Phpfetcher_Log::warning("No fetch jobs."); 181 | return $this; 182 | } 183 | 184 | //构建Page对象 185 | $objPage = NULL; 186 | $strPageClassName = self::DEFAULT_PAGE_CLASS; 187 | if (!empty($arrInput['page_class_name'])) { 188 | $strPageClassName = strval($arrInput['page_class_name']); 189 | } 190 | try { 191 | if (!class_exists($strPageClassName, TRUE)) { 192 | throw new Exception("[$strPageClassName] class not exists!"); 193 | } 194 | 195 | $objPage = new $strPageClassName; 196 | if (!($objPage instanceof Phpfetcher_Page_Abstract)) { 197 | throw new Exception("[$strPageClassName] is not an instance of " . self::ABSTRACT_PAGE_CLASS); 198 | } 199 | } catch (Exception $e) { 200 | Phpfetcher_Log::fatal($e->getMessage()); 201 | return $this; 202 | } 203 | 204 | //初始化Page对象 205 | $arrPageConf = empty($arrInput['page_conf']) ? 
array() : $arrInput['page_conf']; 206 | $objPage->init(); 207 | if (!empty($arrPageConf)) { 208 | if(isset($arrPageConf['url'])) { 209 | unset($arrPageConf['url']); 210 | } 211 | $objPage->setConf($arrPageConf); 212 | } 213 | 214 | //遍历任务队列 215 | foreach ($this->_arrFetchJobs as $job_name => $job_rules) { 216 | if (!($this->_isJobValid($job_rules))) { 217 | Phpfetcher_Log::warning("Job rules invalid [" . serialize($job_rules) . "]"); 218 | continue; 219 | } 220 | 221 | //检查是否需要设置curl配置 222 | if (!empty($job_rules['page_conf'])) { 223 | $objPage->setConf($job_rules['page_conf']); 224 | } 225 | 226 | $intDepth = 0; 227 | $intPageNum = 0; 228 | $arrIndice = array(0, 1); 229 | $arrJobs = array( 230 | 0 => array($job_rules['start_page']), 231 | 1 => array(), 232 | ); 233 | 234 | //开始爬取 235 | while (!empty($arrJobs[$arrIndice[0]]) 236 | && ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth']) 237 | && ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 238 | 239 | $intDepth += 1; 240 | $intPopIndex = $arrIndice[0]; 241 | $intPushIndex = $arrIndice[1]; 242 | $arrJobs[$intPushIndex] = array(); 243 | foreach ($arrJobs[$intPopIndex] as $url) { 244 | if (!($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) { 245 | break; 246 | } 247 | $objPage->setUrl($url); 248 | $objPage->read(); 249 | 250 | //获取所有的超链接 251 | $arrLinks = $objPage->getHyperLinks(); 252 | 253 | //解析当前URL的各个组成部分,以应对超链接中存在站内链接 254 | //的情况,如"/entry"等形式的URL 255 | $strCurUrl = $objPage->getUrl(); 256 | $arrUrlComponents = parse_url($strCurUrl); 257 | 258 | //匹配超链接 259 | foreach ($job_rules['link_rules'] as $link_rule) { 260 | foreach ($arrLinks as $link) { 261 | if (substr($link, 0, 2) == "//") { 262 | $link = substr($link, 1); 263 | } 264 | 265 | if (preg_match($link_rule, $link) === 1 266 | && !$this->getHash($link)) { 267 | 268 | //拼出实际的URL 269 | $real_link = $link; 270 | 271 | //不使用strpos,防止扫描整个字符串 272 | //这里只需要扫描前6个字符即可 273 | $colon_pos = false; 274 | for ($i = 0; $i <= 5; ++$i) { 275 | if ($link[$i] == ':') { 276 | $colon_pos = $i; 277 | break; 278 | } 279 | } 280 | 281 | if ($colon_pos === false 282 | || !$this->_objSchemeTrie->has( 283 | substr($link, 0, $colon_pos))) { 284 | //将站内地址转换为完整地址 285 | $real_link = $arrUrlComponents['scheme'] 286 | . "://" 287 | . $arrUrlComponents['host'] 288 | . (isset($arrUrlComponents['port']) 289 | && strlen($arrUrlComponents['port']) != 0 ? 290 | ":{$arrUrlComponents['port']}" : 291 | "") 292 | . ($link[0] == '/' ? 
293 | $link : "/$link"); 294 | } 295 | 296 | $this->setHash($link, true); 297 | $this->setHash($real_link, true); 298 | $arrJobs[$intPushIndex][] = $real_link; 299 | } 300 | } 301 | } 302 | 303 | //由用户实现handlePage函数 304 | $objPage->setExtraInfo(array('job_name' => $job_name )); 305 | $this->handlePage($objPage); 306 | $intPageNum += 1; 307 | } 308 | 309 | if (!empty($this->_arrAdditionalUrls)) { 310 | $arrJobs[$intPushIndex] = 311 | array_merge($arrJobs[$intPushIndex], 312 | $this->_arrAdditionalUrls); 313 | $this->_arrAdditionalUrls = array(); 314 | } 315 | 316 | self::_swap($arrIndice[0], $arrIndice[1]); 317 | } 318 | } 319 | return $this; 320 | } 321 | 322 | protected function _correctJobParam(&$job_rules) { 323 | /* 324 | foreach (self::$arrJobDefaultFields as $field => $value) { 325 | if (!isset($job_rules[$field]) || ($job_rules[''])) 326 | } 327 | */ 328 | if (!isset($job_rules['max_depth']) || (self::MAX_DEPTH !== -1 && self::MAX_DEPTH < $job_rules['max_depth'])) { 329 | $job_rules['max_depth'] = self::MAX_DEPTH; 330 | } 331 | 332 | if (!isset($job_rules['max_pages']) || (self::MAX_PAGE_NUM !== -1 && self::MAX_PAGE_NUM < $job_rules['max_pages'])) { 333 | $job_rules['max_pages'] = self::MAX_PAGE_NUM; 334 | } 335 | } 336 | 337 | /** 338 | * @author xuruiqi 339 | * @desc check if a rule is valid 340 | */ 341 | protected function _isJobValid($arrRule) { 342 | foreach (self::$arrJobFieldTypes as $field => $type) { 343 | if (!isset($arrRule[$field]) || ($type === self::ARR_TYPE && !is_array($arrRule[$field]))) { 344 | return FALSE; 345 | } 346 | } 347 | return TRUE; 348 | } 349 | 350 | protected static function _swap(&$a, &$b) { 351 | $tmp = $a; 352 | $a = $b; 353 | $b = $tmp; 354 | } 355 | 356 | public function getHash($strRawKey) { 357 | $strRawKey = strval($strRawKey); 358 | $strKey = md5($strRawKey); 359 | if (isset($this->_arrHash[$strKey])) { 360 | return $this->_arrHash[$strKey]; 361 | } 362 | return NULL; 363 | } 364 | 365 | public function setHash($strRawKey, $value) { 366 | $strRawKey = strval($strRawKey); 367 | $strKey = md5($strRawKey); 368 | $this->_arrHash[$strKey] = $value; 369 | } 370 | 371 | public function setHashIfNotExist($strRawKey, $value) { 372 | $strRawKey = strval($strRawKey); 373 | $strKey = md5($strRawKey); 374 | 375 | $bolExist = true; 376 | if (!isset($this->_arrHash[$strKey])) { 377 | $this->_arrHash[$strKey] = $value; 378 | $bolExist = false; 379 | } 380 | 381 | return $bolExist; 382 | } 383 | 384 | public function clearHash() { 385 | $this->_arrHash = array(); 386 | } 387 | 388 | public function addAdditionalUrls($url) { 389 | if (!is_array($url)) { 390 | $url = array($url); 391 | } 392 | 393 | $intAddedNum = 0; 394 | foreach ($url as $strUrl) { 395 | $strUrl = strval($strUrl); 396 | 397 | if ($this->setHashIfNotExist($strUrl, true) === false) { 398 | $this->_arrAdditionalUrls[] = $strUrl; 399 | ++$intAddedNum; 400 | } 401 | } 402 | 403 | return $intAddedNum; 404 | } 405 | }; 406 | ?> 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Phpfetcher - a simple web crawler framework 2 | 3 | ## 重要修改记录 - Important Improvements Log 4 | 2017-03-13 支持形如“//xxx.com/abc/def”的超链接 5 | Support hyperlinks like "//xxx.com/abc/def" 6 | 2016-09-08 支持HTTPS 7 | Support HTTPS websites 8 | 2016-08-08 支持对爬虫设置Header。 9 | Crawlers with Headers supported. 10 | 2016-03-26 PHP7测试通过。 11 | Have PHP7 tested. 
12 | 2015-10-26 可以爬取网站内链(如"/entry"的超链接)。 13 | Able to crawl website internal hyperlinks (say "/entry"). 14 | 15 | ## 中文说明(Scroll Down to See The English Description) 16 | A simple PHP crawler framework. 17 | For the origin of this framework, see: http://blog.reetsee.com/archives/366 18 | PHP must have the curl and mbstring extensions enabled. 19 | Supports PHP5 and PHP7. 20 | ### 1 Examples 21 | Run every example below from inside the `demo` directory; that is, if an example's file name is `hello_world.php`, the command you run should be `php hello_world.php`, not `php demo/hello_world.php`. 22 | #### 1.1 获取页面中`
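The demo scripts all follow the same pattern: subclass `Phpfetcher_Crawler_Default`, implement `handlePage()`, then set the fetch jobs and run. Below is a minimal sketch of that pattern, condensed from `demo/single_page.php`; the class name `mycrawler` and the relative `require_once` path are illustrative only.

```php
<?php
// Minimal usage sketch, condensed from demo/single_page.php.
// Run it from inside the demo/ directory, as the note above says,
// so the relative path to phpfetcher.php resolves.
require_once '../phpfetcher.php';

class mycrawler extends Phpfetcher_Crawler_Default {
    // Called once for every page the crawler fetches.
    public function handlePage($page) {
        // Print the text of each <title> element on the page.
        $res = $page->sel('//title');
        for ($i = 0; $i < count($res); ++$i) {
            echo $res[$i]->plaintext . "\n";
        }
    }
}

$crawler = new mycrawler();
$crawler->setFetchJobs(array(
    'qqnews' => array(
        'start_page' => 'http://news.qq.com/a/20140927/026557.htm', // page to start from
        'link_rules' => array(), // regexes for hyperlinks to follow; empty = follow none
        'max_depth'  => 1,       // 1 means only the start page is crawled
    ),
))->run();
```

`setFetchJobs()` returns the crawler itself, so it can be chained with `run()` or called on two separate lines, exactly as the demos show.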