├── README.md ├── composer.json ├── example.php └── src ├── CekBNI.php ├── bni-cookie.txt └── simple_html_dom.php /README.md: -------------------------------------------------------------------------------- 1 | ## Script Cek mutasi BNI 2 | 3 | Hasil waktu luang pas weekend. 4 | 5 | Cara pakai. 6 | 7 | require_once 'src/CekBNI.php'; 8 | $config = [ 9 | 'credential' => [ 10 | 'username' => 'jenengmu', 11 | 'password' => 'passwordmu' 12 | ], 13 | 'nomor_rekening' => '0400xxxxxx', //No. Rekening 14 | 'range' => [ 15 | 'tgl_akhir' => date('d-M-Y',strtotime('2018-07-31')), 16 | 'tgl_awal' => date('d-M-Y',strtotime('2018-07-01')) 17 | ], 18 | ]; 19 | 20 | $bni = new CekBNI($config); 21 | var_dump($bni->toArray()); 22 | 23 | lebih lengkap lihat example.php 24 | 25 | [Butuh bantuan lain? hubungi saya via telegram](https://t.me/galihazizif) -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "galihazizif/mutasi-bni", 3 | "type": "library", 4 | "description": "Cek mutasi internet banking BNI", 5 | "keywords": [ 6 | "galihazizif", 7 | "mutasi-bni" 8 | ], 9 | "homepage": "https://github.com/galihazizif/mutasi-bni", 10 | "license": "MIT", 11 | "authors": [ 12 | { 13 | "name": "Galih Azizi F", 14 | "email": "galih@rempoah.com", 15 | "homepage": "https://rempoah.com", 16 | "role": "Developer" 17 | } 18 | ], 19 | "require": { 20 | "php" : "~7.0" 21 | }, 22 | "require-dev": { 23 | "phpunit/phpunit" : ">=7.0", 24 | "squizlabs/php_codesniffer": "^3.0" 25 | }, 26 | "autoload": { 27 | "psr-4": { 28 | "galihazizif\\mutasi-bni\\": "src" 29 | } 30 | }, 31 | "scripts": { 32 | "test": "phpunit", 33 | "check-style": "phpcs src tests", 34 | "fix-style": "phpcbf src tests" 35 | }, 36 | "extra": { 37 | "branch-alias": { 38 | "dev-master": "1.0-dev" 39 | } 40 | }, 41 | "config": { 42 | "sort-packages": true 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /example.php: -------------------------------------------------------------------------------- 1 | [ 6 | 'username' => 'username-internet-banking', 7 | 'password' => 'password-internet-banking' 8 | ], 9 | 'nomor_rekening' => '040xxxxxx', //No. Rekening 10 | 'range' => [ 11 | 'tgl_akhir' => date('d-M-Y',strtotime('2018-07-31')), 12 | 'tgl_awal' => date('d-M-Y',strtotime('2018-07-01')) 13 | ], 14 | ]; 15 | 16 | $bni = new CekBNI($config); 17 | var_dump($bni->toArray()); 18 | -------------------------------------------------------------------------------- /src/CekBNI.php: -------------------------------------------------------------------------------- 1 | cookie; 32 | $this->dom = new simple_html_dom(); 33 | $this->config = $config; 34 | $ch = curl_init(); 35 | curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie); 36 | curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie); 37 | curl_setopt($ch, CURLOPT_USERAGENT, self::ua); 38 | curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0); 39 | curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0); 40 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 41 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 42 | $this->ch = $ch; 43 | 44 | $this->prepareLogin(); 45 | $this->login(); 46 | $this->getMutasi(); 47 | $this->logout(); 48 | }catch(\Exception $e){ 49 | echo $e->getMessage().PHP_EOL; 50 | } 51 | 52 | } 53 | 54 | function get_string_between($string, $start, $end){ 55 | $string = ' ' . $string; 56 | $ini = strpos($string, $start); 57 | if ($ini == 0) return ''; 58 | $ini += strlen($start); 59 | $len = strpos($string, $end, $ini) - $ini; 60 | return substr($string, $ini, $len); 61 | } 62 | 63 | private function prepareLogin(){ 64 | $ch = $this->ch; 65 | curl_setopt($ch, CURLOPT_URL,self::urlPrepareLogin); 66 | $this->result = curl_exec($ch); 67 | } 68 | 69 | private function login(){ 70 | $dom = $this->dom; 71 | $dom->load($this->result); 72 | $form = $dom->find('form',0); 73 | $config = $this->config; 74 | 75 | $postdata = "Num_Field_Err=%22Please+enter+digits+only%21%22&Mand_Field_Err=%22Mandatory+field+is+empty%21%22&CorpId=".urlencode($config['credential']['username'])."&PassWord=".urlencode($config['credential']['password'])."&__AUTHENTICATE__=Login&CancelPage=HomePage.xml&USER_TYPE=1&MBLocale=bh&language=bh&AUTHENTICATION_REQUEST=True&__JS_ENCRYPT_KEY__=&JavaScriptEnabled=N&deviceID=&machineFingerPrint=&deviceType=&browserType=&uniqueURLStatus=disabled&imc_service_page=SignOnRetRq&Alignment=LEFT&page=SignOnRetRq&locale=en&PageName=Thin_SignOnRetRq.xml&serviceType=Dynamic"; 76 | 77 | $ch = $this->ch; 78 | curl_setopt($ch, CURLOPT_URL, $form->action); 79 | curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata); 80 | curl_setopt($ch, CURLOPT_POST, 1); 81 | $result = curl_exec($ch); 82 | $this->result = $result; 83 | 84 | } 85 | 86 | private function getMutasi(){ 87 | $dom = $this->dom; 88 | $config = $this->config; 89 | $ch = $this->ch; 90 | $dom->load($this->result); 91 | $form = $dom->find('form',0); 92 | 93 | $anchor = $dom->find("#MBMenuList", 0); 94 | parse_str($anchor->href,$parameters); 95 | $this->parameters = $parameters; 96 | $postdata = "Num_Field_Err=%22Please+enter+digits+only%21%22&Mand_Field_Err=%22Mandatory+field+is+empty%21%22&acc1=OPR%7C0000000".$config['nomor_rekening']."%7CTab+BNI+iB+Hasanah+Wadiah+IDR&TxnPeriod=LastMonth&Search_Option=Date&txnSrcFromDate=".$config['range']['tgl_awal']."&txnSrcToDate=".$config['range']['tgl_akhir']."&FullStmtInqRq=Lanjut&MAIN_ACCOUNT_TYPE=OPR&mbparam=".urlencode($parameters['mbparam'])."&uniqueURLStatus=disabled&imc_service_page=AccountIDSelectRq&Alignment=LEFT&page=AccountIDSelectRq&locale=bh&PageName=FullStmtInqRq&serviceType=Dynamic"; 97 | $this->actionUrl = $form->action; 98 | curl_setopt($ch, CURLOPT_URL, $form->action); 99 | curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata); 100 | curl_setopt($ch, CURLOPT_REFERER, $this->actionUrl); 101 | curl_setopt($ch, CURLOPT_POST, 1); 102 | $result = curl_exec($ch); 103 | 104 | $dom->clear(); 105 | $dom->load($result); 106 | $finalResult = $result; 107 | 108 | $nextData = $dom->find("#NextData", 0); 109 | 110 | $ch = $this->ch; 111 | while($nextData != null){ 112 | $nextUrl = $this->get_string_between($nextData->getAttribute('href'),"'","'"); 113 | curl_setopt($ch, CURLOPT_URL, $nextUrl); 114 | curl_setopt($ch, CURLOPT_REFERER, $this->actionUrl); 115 | $data = curl_exec($ch); 116 | // echo $dom->getElementByTagName("body")->innertext(); 117 | $finalResult = $finalResult.$data; 118 | $dom->clear(); 119 | $dom->load($data); 120 | $nextData = $dom->getElementById('NextData'); 121 | } 122 | 123 | $this->result = $finalResult; 124 | // file_put_contents("bni.hasil_mutasi.html",$finalResult); 125 | 126 | } 127 | 128 | private function logout(){ 129 | 130 | $mbparam = $this->parameters['mbparam']; 131 | $postdata = "Num_Field_Err=%22Please+enter+digits+only%21%22&Mand_Field_Err=%22Mandatory+field+is+empty%21%22&__LOGOUT__=Keluar&mbparam=".urlencode($mbparam)."&uniqueURLStatus=disabled&imc_service_page=SignOffUrlRq&Alignment=LEFT&page=SignOffUrlRq&locale=bh&PageName=LoginRs&serviceType=Dynamic"; 132 | 133 | $ch = $this->ch; 134 | curl_setopt($ch, CURLOPT_URL, $this->actionUrl); 135 | curl_setopt($ch, CURLOPT_POSTFIELDS, $postdata); 136 | curl_setopt($ch, CURLOPT_REFERER, $this->actionUrl); 137 | curl_setopt($ch, CURLOPT_POST, 1); 138 | curl_exec($ch); 139 | // echo "Logout".PHP_EOL; 140 | } 141 | 142 | private function parseResult(){ 143 | // $this->result = file_get_contents("bni.hasil_mutasi.html"); 144 | $dom = $this->dom; 145 | $dom->load($this->result); 146 | $transData = []; 147 | $orients = $dom->getElementsById('orient'); 148 | 149 | foreach($orients as $orient){ 150 | $str = ''; 151 | $tables = $orient->getElementsByTagName('table'); 152 | foreach($tables as $table){ 153 | $span = $table->getElementByTagName('td')->getElementsByTagName('span'); 154 | if(!empty($span[1])){ 155 | if(preg_match("[\d\d\-\w\w\w\-\d\d\d\d]", $span[1]->innertext())){ 156 | $str.="##"; 157 | } 158 | $str.=$span[1]->innertext(); 159 | $str.=PHP_EOL; 160 | } 161 | } 162 | 163 | $exploded = explode("##",$str); 164 | unset($exploded[0]); 165 | 166 | foreach($exploded as $data){ 167 | $d = explode(PHP_EOL,$data); 168 | if(isset($d[5])) 169 | unset($d[5]); 170 | if(isset($d[6])) 171 | unset($d[6]); 172 | $transData[] = $d; 173 | } 174 | } 175 | 176 | return $transData; 177 | } 178 | 179 | public function toArray(){ 180 | try{ 181 | return $this->parseResult(); 182 | }catch(\Exception $e){ 183 | echo $e->getMessage(); 184 | } 185 | } 186 | 187 | public function toJson(){ 188 | try{ 189 | return json_encode($this->parseResult()); 190 | }catch(\Exception $e){ 191 | echo $e->getMessage(); 192 | } 193 | } 194 | 195 | } 196 | 197 | 198 | /*$config = [ 199 | 'credential' => [ 200 | 'username' => 'username_internet_banking', 201 | 'password' => 'password_internet_banking' 202 | ], 203 | 'nomor_rekening' => 'nomor_rekening' //0210123456, 204 | 'range' => [ 205 | 'tgl_akhir' => date('d-M-Y',strtotime('2018-07-34')), 206 | 'tgl_awal' => date('d-M-Y',strtotime('2018-07-01')) 207 | ], 208 | ]; 209 | 210 | $bni = new CekBNI($config);*/ 211 | 212 | 213 | ?> -------------------------------------------------------------------------------- /src/bni-cookie.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/docs/http-cookies.html 3 | # This file was generated by libcurl! Edit at your own risk. 4 | 5 | #HttpOnly_ibank.bni.co.id FALSE / FALSE 0 bnicookies !VYL15q0WI4mpqOEtqjjcXyj8TW7bT8hYmMUSSBc1pZU27j7wkIcat1XNFg6BfKwbeyt+i0B1P9eo3A== 6 | #HttpOnly_ibank.bni.co.id FALSE /MBAWeb/ FALSE 0 f5avrbbbbbbbbbbbbbbbb ALADGLDIEDIDHBIIAINDMMHMEGOHLNIEALLHIHIIJMCDKALNNDHOBNANBIFAPPNCKHKLPPAIJBGACOJDJDPMJMCIFEEBBMFOJEKCLDKIPADJAONFANMCKNPNFOJFNGNA 7 | ibank.bni.co.id FALSE /MBAWeb/ FALSE 0 f5_cspm 1234 8 | -------------------------------------------------------------------------------- /src/simple_html_dom.php: -------------------------------------------------------------------------------- 1 | size is the "real" number of bytes the dom was created from. 17 | * but for most purposes, it's a really good estimation. 18 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. 19 | * Allow the user to tell us how much they trust the html. 20 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. 21 | * This allows for us to find tags based on the text they contain. 22 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. 23 | * Paperg: added parse_charset so that we know about the character set of the source document. 24 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type availalbe, we will assume it's returning the content-type header from the 25 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. 26 | * 27 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. 28 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. 29 | * 30 | * Licensed under The MIT License 31 | * Redistributions of files must retain the above copyright notice. 32 | * 33 | * @author S.C. Chen 34 | * @author John Schlick 35 | * @author Rus Carroll 36 | * @version 1.5 ($Rev: 196 $) 37 | * @package PlaceLocalInclude 38 | * @subpackage simple_html_dom 39 | */ 40 | 41 | /** 42 | * All of the Defines for the classes below. 43 | * @author S.C. Chen 44 | */ 45 | define('HDOM_TYPE_ELEMENT', 1); 46 | define('HDOM_TYPE_COMMENT', 2); 47 | define('HDOM_TYPE_TEXT', 3); 48 | define('HDOM_TYPE_ENDTAG', 4); 49 | define('HDOM_TYPE_ROOT', 5); 50 | define('HDOM_TYPE_UNKNOWN', 6); 51 | define('HDOM_QUOTE_DOUBLE', 0); 52 | define('HDOM_QUOTE_SINGLE', 1); 53 | define('HDOM_QUOTE_NO', 3); 54 | define('HDOM_INFO_BEGIN', 0); 55 | define('HDOM_INFO_END', 1); 56 | define('HDOM_INFO_QUOTE', 2); 57 | define('HDOM_INFO_SPACE', 3); 58 | define('HDOM_INFO_TEXT', 4); 59 | define('HDOM_INFO_INNER', 5); 60 | define('HDOM_INFO_OUTER', 6); 61 | define('HDOM_INFO_ENDSPACE',7); 62 | define('DEFAULT_TARGET_CHARSET', 'UTF-8'); 63 | define('DEFAULT_BR_TEXT', "\r\n"); 64 | define('DEFAULT_SPAN_TEXT', " "); 65 | if (!defined('MAX_FILE_SIZE')) { 66 | define('MAX_FILE_SIZE', 600000); 67 | } 68 | 69 | // helper functions 70 | // ----------------------------------------------------------------------------- 71 | // get html dom from file 72 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. 73 | function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 74 | { 75 | // We DO force the tags to be terminated. 76 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 77 | 78 | do { 79 | $repeat = false; 80 | if ($context!==NULL) 81 | { 82 | // Test if "Accept-Encoding: gzip" has been set in $context 83 | $params = stream_context_get_params($context); 84 | if (isset($params['options']['http']['header']) && preg_match('/gzip/', $params['options']['http']['header']) !== false) 85 | { 86 | $contents = file_get_contents('compress.zlib://'.$url, $use_include_path, $context, $offset); 87 | } 88 | else 89 | { 90 | $contents = file_get_contents($url, $use_include_path, $context, $offset); 91 | } 92 | } 93 | else 94 | { 95 | $contents = file_get_contents($url, $use_include_path, NULL, $offset); 96 | } 97 | 98 | // test if the URL doesn't return a 200 status 99 | if (isset($http_response_header) && strpos($http_response_header[0], '200') === false) { 100 | // has a 301 redirect header been sent? 101 | $pattern = "/^Location:\s*(.*)$/i"; 102 | $location_headers = preg_grep($pattern, $http_response_header); 103 | 104 | if (!empty($location_headers) && preg_match($pattern, array_values($location_headers)[0], $matches)) { 105 | // set the URL to that returned via the redirect header and repeat this loop 106 | $url = $matches[1]; 107 | $repeat = true; 108 | } 109 | } 110 | } while ($repeat); 111 | 112 | // stop processing if the header isn't a good responce 113 | if (isset($http_response_header) && strpos($http_response_header[0], '200') === false) 114 | { 115 | return false; 116 | } 117 | 118 | // stop processing if the contents are too big 119 | if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) 120 | { 121 | return false; 122 | } 123 | // The second parameter can force the selectors to all be lowercase. 124 | $dom->load($contents, $lowercase, $stripRN); 125 | return $dom; 126 | } 127 | 128 | // get html dom from string 129 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 130 | { 131 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText); 132 | if (empty($str) || strlen($str) > MAX_FILE_SIZE) 133 | { 134 | $dom->clear(); 135 | return false; 136 | } 137 | $dom->load($str, $lowercase, $stripRN); 138 | return $dom; 139 | } 140 | 141 | // dump html dom tree 142 | function dump_html_tree($node, $show_attr=true, $deep=0) 143 | { 144 | $node->dump($node); 145 | } 146 | 147 | 148 | /** 149 | * simple html dom node 150 | * PaperG - added ability for "find" routine to lowercase the value of the selector. 151 | * PaperG - added $tag_start to track the start position of the tag in the total byte index 152 | * 153 | * @package PlaceLocalInclude 154 | */ 155 | class simple_html_dom_node 156 | { 157 | public $nodetype = HDOM_TYPE_TEXT; 158 | public $tag = 'text'; 159 | public $attr = array(); 160 | public $children = array(); 161 | public $nodes = array(); 162 | public $parent = null; 163 | // The "info" array - see HDOM_INFO_... for what each element contains. 164 | public $_ = array(); 165 | public $tag_start = 0; 166 | private $dom = null; 167 | 168 | function __construct($dom) 169 | { 170 | $this->dom = $dom; 171 | $dom->nodes[] = $this; 172 | } 173 | 174 | function __destruct() 175 | { 176 | $this->clear(); 177 | } 178 | 179 | function __toString() 180 | { 181 | return $this->outertext(); 182 | } 183 | 184 | // clean up memory due to php5 circular references memory leak... 185 | function clear() 186 | { 187 | $this->dom = null; 188 | $this->nodes = null; 189 | $this->parent = null; 190 | $this->children = null; 191 | } 192 | 193 | // dump node's tree 194 | function dump($show_attr=true, $deep=0) 195 | { 196 | $lead = str_repeat(' ', $deep); 197 | 198 | echo $lead.$this->tag; 199 | if ($show_attr && count($this->attr)>0) 200 | { 201 | echo '('; 202 | foreach ($this->attr as $k=>$v) 203 | echo "[$k]=>\"".$this->$k.'", '; 204 | echo ')'; 205 | } 206 | echo "\n"; 207 | 208 | if ($this->nodes) 209 | { 210 | foreach ($this->nodes as $c) 211 | { 212 | $c->dump($show_attr, $deep+1); 213 | } 214 | } 215 | } 216 | 217 | 218 | // Debugging function to dump a single dom node with a bunch of information about it. 219 | function dump_node($echo=true) 220 | { 221 | 222 | $string = $this->tag; 223 | if (count($this->attr)>0) 224 | { 225 | $string .= '('; 226 | foreach ($this->attr as $k=>$v) 227 | { 228 | $string .= "[$k]=>\"".$this->$k.'", '; 229 | } 230 | $string .= ')'; 231 | } 232 | if (count($this->_)>0) 233 | { 234 | $string .= ' $_ ('; 235 | foreach ($this->_ as $k=>$v) 236 | { 237 | if (is_array($v)) 238 | { 239 | $string .= "[$k]=>("; 240 | foreach ($v as $k2=>$v2) 241 | { 242 | $string .= "[$k2]=>\"".$v2.'", '; 243 | } 244 | $string .= ")"; 245 | } else { 246 | $string .= "[$k]=>\"".$v.'", '; 247 | } 248 | } 249 | $string .= ")"; 250 | } 251 | 252 | if (isset($this->text)) 253 | { 254 | $string .= " text: (" . $this->text . ")"; 255 | } 256 | 257 | $string .= " HDOM_INNER_INFO: '"; 258 | if (isset($node->_[HDOM_INFO_INNER])) 259 | { 260 | $string .= $node->_[HDOM_INFO_INNER] . "'"; 261 | } 262 | else 263 | { 264 | $string .= ' NULL '; 265 | } 266 | 267 | $string .= " children: " . count($this->children); 268 | $string .= " nodes: " . count($this->nodes); 269 | $string .= " tag_start: " . $this->tag_start; 270 | $string .= "\n"; 271 | 272 | if ($echo) 273 | { 274 | echo $string; 275 | return; 276 | } 277 | else 278 | { 279 | return $string; 280 | } 281 | } 282 | 283 | // returns the parent of node 284 | // If a node is passed in, it will reset the parent of the current node to that one. 285 | function parent($parent=null) 286 | { 287 | // I am SURE that this doesn't work properly. 288 | // It fails to unset the current node from it's current parents nodes or children list first. 289 | if ($parent !== null) 290 | { 291 | $this->parent = $parent; 292 | $this->parent->nodes[] = $this; 293 | $this->parent->children[] = $this; 294 | } 295 | 296 | return $this->parent; 297 | } 298 | 299 | // verify that node has children 300 | function has_child() 301 | { 302 | return !empty($this->children); 303 | } 304 | 305 | // returns children of node 306 | function children($idx=-1) 307 | { 308 | if ($idx===-1) 309 | { 310 | return $this->children; 311 | } 312 | if (isset($this->children[$idx])) return $this->children[$idx]; 313 | return null; 314 | } 315 | 316 | // returns the first child of node 317 | function first_child() 318 | { 319 | if (count($this->children)>0) 320 | { 321 | return $this->children[0]; 322 | } 323 | return null; 324 | } 325 | 326 | // returns the last child of node 327 | function last_child() 328 | { 329 | if (($count=count($this->children))>0) 330 | { 331 | return $this->children[$count-1]; 332 | } 333 | return null; 334 | } 335 | 336 | // returns the next sibling of node 337 | function next_sibling() 338 | { 339 | if ($this->parent===null) 340 | { 341 | return null; 342 | } 343 | 344 | $idx = 0; 345 | $count = count($this->parent->children); 346 | while ($idx<$count && $this!==$this->parent->children[$idx]) 347 | { 348 | ++$idx; 349 | } 350 | if (++$idx>=$count) 351 | { 352 | return null; 353 | } 354 | return $this->parent->children[$idx]; 355 | } 356 | 357 | // returns the previous sibling of node 358 | function prev_sibling() 359 | { 360 | if ($this->parent===null) return null; 361 | $idx = 0; 362 | $count = count($this->parent->children); 363 | while ($idx<$count && $this!==$this->parent->children[$idx]) 364 | ++$idx; 365 | if (--$idx<0) return null; 366 | return $this->parent->children[$idx]; 367 | } 368 | 369 | // function to locate a specific ancestor tag in the path to the root. 370 | function find_ancestor_tag($tag) 371 | { 372 | global $debugObject; 373 | if (is_object($debugObject)) { $debugObject->debugLogEntry(1); } 374 | 375 | // Start by including ourselves in the comparison. 376 | $returnDom = $this; 377 | 378 | while (!is_null($returnDom)) 379 | { 380 | if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag); } 381 | 382 | if ($returnDom->tag == $tag) 383 | { 384 | break; 385 | } 386 | $returnDom = $returnDom->parent; 387 | } 388 | return $returnDom; 389 | } 390 | 391 | // get dom node's inner html 392 | function innertext() 393 | { 394 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 395 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 396 | 397 | $ret = ''; 398 | foreach ($this->nodes as $n) 399 | $ret .= $n->outertext(); 400 | return $ret; 401 | } 402 | 403 | // get dom node's outer text (with tag) 404 | function outertext() 405 | { 406 | global $debugObject; 407 | if (is_object($debugObject)) 408 | { 409 | $text = ''; 410 | if ($this->tag == 'text') 411 | { 412 | if (!empty($this->text)) 413 | { 414 | $text = " with text: " . $this->text; 415 | } 416 | } 417 | $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text); 418 | } 419 | 420 | if ($this->tag==='root') return $this->innertext(); 421 | 422 | // trigger callback 423 | if ($this->dom && $this->dom->callback!==null) 424 | { 425 | call_user_func_array($this->dom->callback, array($this)); 426 | } 427 | 428 | if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER]; 429 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 430 | 431 | // render begin tag 432 | if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) 433 | { 434 | $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup(); 435 | } else { 436 | $ret = ""; 437 | } 438 | 439 | // render inner text 440 | if (isset($this->_[HDOM_INFO_INNER])) 441 | { 442 | // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added. 443 | if ($this->tag != "br") 444 | { 445 | $ret .= $this->_[HDOM_INFO_INNER]; 446 | } 447 | } else { 448 | if ($this->nodes) 449 | { 450 | foreach ($this->nodes as $n) 451 | { 452 | $ret .= $this->convert_text($n->outertext()); 453 | } 454 | } 455 | } 456 | 457 | // render end tag 458 | if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0) 459 | $ret .= 'tag.'>'; 460 | return $ret; 461 | } 462 | 463 | // get dom node's plain text 464 | function text() 465 | { 466 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; 467 | switch ($this->nodetype) 468 | { 469 | case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 470 | case HDOM_TYPE_COMMENT: return ''; 471 | case HDOM_TYPE_UNKNOWN: return ''; 472 | } 473 | if (strcasecmp($this->tag, 'script')===0) return ''; 474 | if (strcasecmp($this->tag, 'style')===0) return ''; 475 | 476 | $ret = ''; 477 | // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL. 478 | // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening. 479 | // WHY is this happening? 480 | if (!is_null($this->nodes)) 481 | { 482 | foreach ($this->nodes as $n) 483 | { 484 | $ret .= $this->convert_text($n->text()); 485 | } 486 | 487 | // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all. 488 | if ($this->tag == "span") 489 | { 490 | $ret .= $this->dom->default_span_text; 491 | } 492 | 493 | 494 | } 495 | return $ret; 496 | } 497 | 498 | function xmltext() 499 | { 500 | $ret = $this->innertext(); 501 | $ret = str_ireplace('', '', $ret); 503 | return $ret; 504 | } 505 | 506 | // build node's text with tag 507 | function makeup() 508 | { 509 | // text, comment, unknown 510 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); 511 | 512 | $ret = '<'.$this->tag; 513 | $i = -1; 514 | 515 | foreach ($this->attr as $key=>$val) 516 | { 517 | ++$i; 518 | 519 | // skip removed attribute 520 | if ($val===null || $val===false) 521 | continue; 522 | 523 | $ret .= $this->_[HDOM_INFO_SPACE][$i][0]; 524 | //no value attr: nowrap, checked selected... 525 | if ($val===true) 526 | $ret .= $key; 527 | else { 528 | switch ($this->_[HDOM_INFO_QUOTE][$i]) 529 | { 530 | case HDOM_QUOTE_DOUBLE: $quote = '"'; break; 531 | case HDOM_QUOTE_SINGLE: $quote = '\''; break; 532 | default: $quote = ''; 533 | } 534 | $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote; 535 | } 536 | } 537 | $ret = $this->dom->restore_noise($ret); 538 | return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>'; 539 | } 540 | 541 | // find elements by css selector 542 | //PaperG - added ability for find to lowercase the value of the selector. 543 | function find($selector, $idx=null, $lowercase=false) 544 | { 545 | $selectors = $this->parse_selector($selector); 546 | if (($count=count($selectors))===0) return array(); 547 | $found_keys = array(); 548 | 549 | // find each selector 550 | for ($c=0; $c<$count; ++$c) 551 | { 552 | // The change on the below line was documented on the sourceforge code tracker id 2788009 553 | // used to be: if (($levle=count($selectors[0]))===0) return array(); 554 | if (($levle=count($selectors[$c]))===0) return array(); 555 | if (!isset($this->_[HDOM_INFO_BEGIN])) return array(); 556 | 557 | $head = array($this->_[HDOM_INFO_BEGIN]=>1); 558 | 559 | // handle descendant selectors, no recursive! 560 | for ($l=0; $l<$levle; ++$l) 561 | { 562 | $ret = array(); 563 | foreach ($head as $k=>$v) 564 | { 565 | $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k]; 566 | //PaperG - Pass this optional parameter on to the seek function. 567 | $n->seek($selectors[$c][$l], $ret, $lowercase); 568 | } 569 | $head = $ret; 570 | } 571 | 572 | foreach ($head as $k=>$v) 573 | { 574 | if (!isset($found_keys[$k])) 575 | $found_keys[$k] = 1; 576 | } 577 | } 578 | 579 | // sort keys 580 | ksort($found_keys); 581 | 582 | $found = array(); 583 | foreach ($found_keys as $k=>$v) 584 | $found[] = $this->dom->nodes[$k]; 585 | 586 | // return nth-element or array 587 | if (is_null($idx)) return $found; 588 | else if ($idx<0) $idx = count($found) + $idx; 589 | return (isset($found[$idx])) ? $found[$idx] : null; 590 | } 591 | 592 | // seek for given conditions 593 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector. 594 | protected function seek($selector, &$ret, $lowercase=false) 595 | { 596 | global $debugObject; 597 | if (is_object($debugObject)) { $debugObject->debugLogEntry(1); } 598 | 599 | list($tag, $key, $val, $exp, $no_key) = $selector; 600 | 601 | // xpath index 602 | if ($tag && $key && is_numeric($key)) 603 | { 604 | $count = 0; 605 | foreach ($this->children as $c) 606 | { 607 | if ($tag==='*' || $tag===$c->tag) { 608 | if (++$count==$key) { 609 | $ret[$c->_[HDOM_INFO_BEGIN]] = 1; 610 | return; 611 | } 612 | } 613 | } 614 | return; 615 | } 616 | 617 | $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0; 618 | if ($end==0) { 619 | $parent = $this->parent; 620 | while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) { 621 | $end -= 1; 622 | $parent = $parent->parent; 623 | } 624 | $end += $parent->_[HDOM_INFO_END]; 625 | } 626 | 627 | for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { 628 | $node = $this->dom->nodes[$i]; 629 | 630 | $pass = true; 631 | 632 | if ($tag==='*' && !$key) { 633 | if (in_array($node, $this->children, true)) 634 | $ret[$i] = 1; 635 | continue; 636 | } 637 | 638 | // compare tag 639 | if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;} 640 | // compare key 641 | if ($pass && $key) { 642 | if ($no_key) { 643 | if (isset($node->attr[$key])) $pass=false; 644 | } else { 645 | if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false; 646 | } 647 | } 648 | // compare value 649 | if ($pass && $key && $val && $val!=='*') { 650 | // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right? 651 | if ($key == "plaintext") { 652 | // $node->plaintext actually returns $node->text(); 653 | $nodeKeyValue = $node->text(); 654 | } else { 655 | // this is a normal search, we want the value of that attribute of the tag. 656 | $nodeKeyValue = $node->attr[$key]; 657 | } 658 | if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 659 | 660 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 661 | if ($lowercase) { 662 | $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue)); 663 | } else { 664 | $check = $this->match($exp, $val, $nodeKeyValue); 665 | } 666 | if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));} 667 | 668 | // handle multiple class 669 | if (!$check && strcasecmp($key, 'class')===0) { 670 | foreach (explode(' ',$node->attr[$key]) as $k) { 671 | // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form. 672 | if (!empty($k)) { 673 | if ($lowercase) { 674 | $check = $this->match($exp, strtolower($val), strtolower($k)); 675 | } else { 676 | $check = $this->match($exp, $val, $k); 677 | } 678 | if ($check) break; 679 | } 680 | } 681 | } 682 | if (!$check) $pass = false; 683 | } 684 | if ($pass) $ret[$i] = 1; 685 | unset($node); 686 | } 687 | // It's passed by reference so this is actually what this function returns. 688 | if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);} 689 | } 690 | 691 | protected function match($exp, $pattern, $value) { 692 | global $debugObject; 693 | if (is_object($debugObject)) {$debugObject->debugLogEntry(1);} 694 | 695 | switch ($exp) { 696 | case '=': 697 | return ($value===$pattern); 698 | case '!=': 699 | return ($value!==$pattern); 700 | case '^=': 701 | return preg_match("/^".preg_quote($pattern,'/')."/", $value); 702 | case '$=': 703 | return preg_match("/".preg_quote($pattern,'/')."$/", $value); 704 | case '*=': 705 | if ($pattern[0]=='/') { 706 | return preg_match($pattern, $value); 707 | } 708 | return preg_match("/".$pattern."/i", $value); 709 | } 710 | return false; 711 | } 712 | 713 | protected function parse_selector($selector_string) { 714 | global $debugObject; 715 | if (is_object($debugObject)) {$debugObject->debugLogEntry(1);} 716 | 717 | // pattern of CSS selectors, modified from mootools 718 | // Paperg: Add the colon to the attrbute, so that it properly finds like google does. 719 | // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check. 720 | // Notice the \[ starting the attbute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured. 721 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression. 722 | // farther study is required to determine of this should be documented or removed. 723 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 724 | $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 725 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 726 | if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);} 727 | 728 | $selectors = array(); 729 | $result = array(); 730 | //print_r($matches); 731 | 732 | foreach ($matches as $m) { 733 | $m[0] = trim($m[0]); 734 | if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue; 735 | // for browser generated xpath 736 | if ($m[1]==='tbody') continue; 737 | 738 | list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false); 739 | if (!empty($m[2])) {$key='id'; $val=$m[2];} 740 | if (!empty($m[3])) {$key='class'; $val=$m[3];} 741 | if (!empty($m[4])) {$key=$m[4];} 742 | if (!empty($m[5])) {$exp=$m[5];} 743 | if (!empty($m[6])) {$val=$m[6];} 744 | 745 | // convert to lowercase 746 | if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} 747 | //elements that do NOT have the specified attribute 748 | if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;} 749 | 750 | $result[] = array($tag, $key, $val, $exp, $no_key); 751 | if (trim($m[7])===',') { 752 | $selectors[] = $result; 753 | $result = array(); 754 | } 755 | } 756 | if (count($result)>0) 757 | $selectors[] = $result; 758 | return $selectors; 759 | } 760 | 761 | function __get($name) { 762 | if (isset($this->attr[$name])) 763 | { 764 | return $this->convert_text($this->attr[$name]); 765 | } 766 | switch ($name) { 767 | case 'outertext': return $this->outertext(); 768 | case 'innertext': return $this->innertext(); 769 | case 'plaintext': return $this->text(); 770 | case 'xmltext': return $this->xmltext(); 771 | default: return array_key_exists($name, $this->attr); 772 | } 773 | } 774 | 775 | function __set($name, $value) { 776 | switch ($name) { 777 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 778 | case 'innertext': 779 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 780 | return $this->_[HDOM_INFO_INNER] = $value; 781 | } 782 | if (!isset($this->attr[$name])) { 783 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 784 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 785 | } 786 | $this->attr[$name] = $value; 787 | } 788 | 789 | function __isset($name) { 790 | switch ($name) { 791 | case 'outertext': return true; 792 | case 'innertext': return true; 793 | case 'plaintext': return true; 794 | } 795 | //no value attr: nowrap, checked selected... 796 | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); 797 | } 798 | 799 | function __unset($name) { 800 | if (isset($this->attr[$name])) 801 | unset($this->attr[$name]); 802 | } 803 | 804 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same. 805 | function convert_text($text) 806 | { 807 | global $debugObject; 808 | if (is_object($debugObject)) {$debugObject->debugLogEntry(1);} 809 | 810 | $converted_text = $text; 811 | 812 | $sourceCharset = ""; 813 | $targetCharset = ""; 814 | 815 | if ($this->dom) 816 | { 817 | $sourceCharset = strtoupper($this->dom->_charset); 818 | $targetCharset = strtoupper($this->dom->_target_charset); 819 | } 820 | if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 821 | 822 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 823 | { 824 | // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 825 | if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) 826 | { 827 | $converted_text = $text; 828 | } 829 | else 830 | { 831 | $converted_text = iconv($sourceCharset, $targetCharset, $text); 832 | } 833 | } 834 | 835 | // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. 836 | if ($targetCharset == 'UTF-8') 837 | { 838 | if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") 839 | { 840 | $converted_text = substr($converted_text, 3); 841 | } 842 | if (substr($converted_text, -3) == "\xef\xbb\xbf") 843 | { 844 | $converted_text = substr($converted_text, 0, -3); 845 | } 846 | } 847 | 848 | return $converted_text; 849 | } 850 | 851 | /** 852 | * Returns true if $string is valid UTF-8 and false otherwise. 853 | * 854 | * @param mixed $str String to be tested 855 | * @return boolean 856 | */ 857 | static function is_utf8($str) 858 | { 859 | $c=0; $b=0; 860 | $bits=0; 861 | $len=strlen($str); 862 | for($i=0; $i<$len; $i++) 863 | { 864 | $c=ord($str[$i]); 865 | if($c > 128) 866 | { 867 | if(($c >= 254)) return false; 868 | elseif($c >= 252) $bits=6; 869 | elseif($c >= 248) $bits=5; 870 | elseif($c >= 240) $bits=4; 871 | elseif($c >= 224) $bits=3; 872 | elseif($c >= 192) $bits=2; 873 | else return false; 874 | if(($i+$bits) > $len) return false; 875 | while($bits > 1) 876 | { 877 | $i++; 878 | $b=ord($str[$i]); 879 | if($b < 128 || $b > 191) return false; 880 | $bits--; 881 | } 882 | } 883 | } 884 | return true; 885 | } 886 | /* 887 | function is_utf8($string) 888 | { 889 | //this is buggy 890 | return (utf8_encode(utf8_decode($string)) == $string); 891 | } 892 | */ 893 | 894 | /** 895 | * Function to try a few tricks to determine the displayed size of an img on the page. 896 | * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types. 897 | * 898 | * @author John Schlick 899 | * @version April 19 2012 900 | * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out. 901 | */ 902 | function get_display_size() 903 | { 904 | global $debugObject; 905 | 906 | $width = -1; 907 | $height = -1; 908 | 909 | if ($this->tag !== 'img') 910 | { 911 | return false; 912 | } 913 | 914 | // See if there is aheight or width attribute in the tag itself. 915 | if (isset($this->attr['width'])) 916 | { 917 | $width = $this->attr['width']; 918 | } 919 | 920 | if (isset($this->attr['height'])) 921 | { 922 | $height = $this->attr['height']; 923 | } 924 | 925 | // Now look for an inline style. 926 | if (isset($this->attr['style'])) 927 | { 928 | // Thanks to user gnarf from stackoverflow for this regular expression. 929 | $attributes = array(); 930 | preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); 931 | foreach ($matches as $match) { 932 | $attributes[$match[1]] = $match[2]; 933 | } 934 | 935 | // If there is a width in the style attributes: 936 | if (isset($attributes['width']) && $width == -1) 937 | { 938 | // check that the last two characters are px (pixels) 939 | if (strtolower(substr($attributes['width'], -2)) == 'px') 940 | { 941 | $proposed_width = substr($attributes['width'], 0, -2); 942 | // Now make sure that it's an integer and not something stupid. 943 | if (filter_var($proposed_width, FILTER_VALIDATE_INT)) 944 | { 945 | $width = $proposed_width; 946 | } 947 | } 948 | } 949 | 950 | // If there is a width in the style attributes: 951 | if (isset($attributes['height']) && $height == -1) 952 | { 953 | // check that the last two characters are px (pixels) 954 | if (strtolower(substr($attributes['height'], -2)) == 'px') 955 | { 956 | $proposed_height = substr($attributes['height'], 0, -2); 957 | // Now make sure that it's an integer and not something stupid. 958 | if (filter_var($proposed_height, FILTER_VALIDATE_INT)) 959 | { 960 | $height = $proposed_height; 961 | } 962 | } 963 | } 964 | 965 | } 966 | 967 | // Future enhancement: 968 | // Look in the tag to see if there is a class or id specified that has a height or width attribute to it. 969 | 970 | // Far future enhancement 971 | // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width 972 | // Note that in this case, the class or id will have the img subselector for it to apply to the image. 973 | 974 | // ridiculously far future development 975 | // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page. 976 | 977 | $result = array('height' => $height, 978 | 'width' => $width); 979 | return $result; 980 | } 981 | 982 | // camel naming conventions 983 | function getAllAttributes() {return $this->attr;} 984 | function getAttribute($name) {return $this->__get($name);} 985 | function setAttribute($name, $value) {$this->__set($name, $value);} 986 | function hasAttribute($name) {return $this->__isset($name);} 987 | function removeAttribute($name) {$this->__set($name, null);} 988 | function getElementById($id) {return $this->find("#$id", 0);} 989 | function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);} 990 | function getElementByTagName($name) {return $this->find($name, 0);} 991 | function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);} 992 | function parentNode() {return $this->parent();} 993 | function childNodes($idx=-1) {return $this->children($idx);} 994 | function firstChild() {return $this->first_child();} 995 | function lastChild() {return $this->last_child();} 996 | function nextSibling() {return $this->next_sibling();} 997 | function previousSibling() {return $this->prev_sibling();} 998 | function hasChildNodes() {return $this->has_child();} 999 | function nodeName() {return $this->tag;} 1000 | function appendChild($node) {$node->parent($this); return $node;} 1001 | 1002 | } 1003 | 1004 | /** 1005 | * simple html dom parser 1006 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. 1007 | * Paperg - change $size from protected to public so we can easily access it 1008 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. 1009 | * 1010 | * @package PlaceLocalInclude 1011 | */ 1012 | class simple_html_dom 1013 | { 1014 | public $root = null; 1015 | public $nodes = array(); 1016 | public $callback = null; 1017 | public $lowercase = false; 1018 | // Used to keep track of how large the text was when we started. 1019 | public $original_size; 1020 | public $size; 1021 | protected $pos; 1022 | protected $doc; 1023 | protected $char; 1024 | protected $cursor; 1025 | protected $parent; 1026 | protected $noise = array(); 1027 | protected $token_blank = " \t\r\n"; 1028 | protected $token_equal = ' =/>'; 1029 | protected $token_slash = " />\r\n\t"; 1030 | protected $token_attr = ' >'; 1031 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. 1032 | public $_charset = ''; 1033 | public $_target_charset = ''; 1034 | protected $default_br_text = ""; 1035 | public $default_span_text = ""; 1036 | 1037 | // use isset instead of in_array, performance boost about 30%... 1038 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); 1039 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); 1040 | // Known sourceforge issue #2977341 1041 | // B tags that are not closed cause us to return everything to the end of the document. 1042 | protected $optional_closing_tags = array( 1043 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), 1044 | 'th'=>array('th'=>1), 1045 | 'td'=>array('td'=>1), 1046 | 'li'=>array('li'=>1), 1047 | 'dt'=>array('dt'=>1, 'dd'=>1), 1048 | 'dd'=>array('dd'=>1, 'dt'=>1), 1049 | 'dl'=>array('dd'=>1, 'dt'=>1), 1050 | 'p'=>array('p'=>1), 1051 | 'nobr'=>array('nobr'=>1), 1052 | 'b'=>array('b'=>1), 1053 | 'option'=>array('option'=>1), 1054 | ); 1055 | 1056 | function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1057 | { 1058 | if ($str) 1059 | { 1060 | if (preg_match("/^http:\/\//i",$str) || is_file($str)) 1061 | { 1062 | $this->load_file($str); 1063 | } 1064 | else 1065 | { 1066 | $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1067 | } 1068 | } 1069 | // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html. 1070 | if (!$forceTagsClosed) { 1071 | $this->optional_closing_array=array(); 1072 | } 1073 | $this->_target_charset = $target_charset; 1074 | } 1075 | 1076 | function __destruct() 1077 | { 1078 | $this->clear(); 1079 | } 1080 | 1081 | // load html from string 1082 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) 1083 | { 1084 | global $debugObject; 1085 | 1086 | // prepare 1087 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1088 | // strip out comments 1089 | $this->remove_noise("''is"); 1090 | // strip out cdata 1091 | $this->remove_noise("''is", true); 1092 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1093 | // Script tags removal now preceeds style tag removal. 1094 | // strip out