/*
 * e-MEC-API - API for querying data from http://emec.mec.gov.br/
 *
 * Repository layout:
 *   README.md
 *   api.php                  (this file)
 *   src/simple_html_dom.php
 *
 * @website www.Alisson.eng.br
 */

class MECApi
{
    /** Root of every e-MEC endpoint used below. */
    const BASE_URL = 'http://emec.mec.gov.br/emec';

    /**
     * List the municipalities of a state together with their codes.
     *
     * @param string $sigla Two-letter, upper-case state abbreviation (e.g. 'MG').
     * @return array Map of municipality name => municipality code; empty when
     *               the request fails or the response is not a JSON array.
     */
    function get_municipios($sigla /* two upper-case letters */)
    {
        $url = self::BASE_URL . '/comum/json/selecionar-municipio/' . md5('sg_uf') . '/' . base64_encode($sigla);
        $str = file_get_contents($url);
        if ($str === false) {
            return array();
        }

        $rows = json_decode($str, true);
        if (!is_array($rows)) {
            // array_column() rejects null, so bail out on malformed JSON.
            return array();
        }
        return array_column($rows, 'co_municipio', 'ds_municipio');
    }

    /**
     * Fetch the institutions registered for a municipality.
     *
     * @param string $cod_municipio Municipality code as returned by get_municipios().
     * @param string $sigla_uf      State abbreviation sent with the search form.
     *                              Defaults to 'MG', the value that used to be
     *                              hard-coded in the request body.
     * @return array Map of institution code => array with the keys 'IES',
     *               'organizacao', 'categoria', 'CI', 'IGC' and 'situacao';
     *               empty when nothing could be fetched or parsed.
     */
    function get_instituicoes($cod_municipio, $sigla_uf = 'MG')
    {
        include_once(__DIR__ . '/src/simple_html_dom.php');

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, self::BASE_URL . '/nova-index/listar-consulta-avancada/list/1000');
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:25.0) Gecko/20100101 Firefox/25.0');
        curl_setopt($ch, CURLOPT_ENCODING, "gzip");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->build_search_body($sigla_uf, $cod_municipio));
        $buffer = curl_exec($ch);
        curl_close($ch);

        $array = array();
        $dom = $this->load_dom($buffer);
        if ($dom === null) {
            return $array;
        }

        foreach ($dom->getElementsByTagName('tr') as $row) {
            $cols = $row->getElementsByTagName('td');
            // Result rows have exactly eight cells; anything else is layout.
            if ($cols->length == 8) {
                $array[$cols->item(0)->nodeValue] = array(
                    'IES' => $cols->item(1)->nodeValue,
                    'organizacao' => $cols->item(2)->nodeValue,
                    'categoria' => $cols->item(3)->nodeValue,
                    'CI' => $cols->item(4)->nodeValue,
                    'IGC' => $cols->item(5)->nodeValue,
                    'situacao' => $cols->item(6)->nodeValue
                );
            }
        }
        return $array;
    }

    /**
     * Fetch the address of every campus of an institution.
     *
     * @param string $cod Institution code.
     * @return array Map of address code => array with the keys 'denominacao',
     *               'endereco', 'polo', 'municipio' and 'UF'; empty on failure.
     */
    function get_instituicao_enderecos($cod)
    {
        $html = file_get_contents(self::BASE_URL . '/consulta-ies/listar-endereco/d96957f455f6405d14c6542552b0f6eb/' . base64_encode($cod) . '/list/1000');

        include_once(__DIR__ . '/src/simple_html_dom.php');

        $array = array();
        $dom = $this->load_dom($html);
        if ($dom === null) {
            return $array;
        }

        foreach ($this->tbody_records($dom, 6) as $cols) {
            $array[trim($cols->item(0)->nodeValue)] = array(
                'denominacao' => trim($cols->item(1)->nodeValue),
                'endereco' => trim($cols->item(2)->nodeValue),
                'polo' => trim($cols->item(3)->nodeValue),
                'municipio' => trim($cols->item(4)->nodeValue),
                // Keep only the upper-case letters of the state abbreviation.
                // The former class "[^A-Z{2}]" also let '{', '2' and '}' through.
                'UF' => preg_replace("/[^A-Z]/", "", $cols->item(5)->nodeValue)
            );
        }
        return $array;
    }

    /**
     * Fetch the courses offered at one campus of an institution.
     *
     * @param string $cod_endereco    Address (campus) code.
     * @param string $cod_instituicao Institution code.
     * @return array List of course names reduced to ASCII letters; empty on failure.
     */
    function get_instituicao_cursos($cod_endereco, $cod_instituicao)
    {
        $html = file_get_contents(self::BASE_URL . '/consulta-ies/listar-curso-endereco/d96957f455f6405d14c6542552b0f6eb/' . base64_encode($cod_instituicao) . '/aa547dc9e0377b562e2354d29f06085f/' . base64_encode($cod_endereco) . '/list/1000');

        include_once(__DIR__ . '/src/simple_html_dom.php');

        $array = array();
        $dom = $this->load_dom($html);
        if ($dom === null) {
            return $array;
        }

        foreach ($this->tbody_records($dom, 1) as $cols) {
            // NOTE(review): this pattern also drops spaces and accented letters,
            // so multi-word or accented names are altered - confirm that is wanted.
            $array[] = preg_replace("/[^A-Za-z]/", "", $cols->item(0)->nodeValue);
        }
        return $array;
    }

    /**
     * Build the url-encoded body of the advanced-search form.
     * Field order and encoding match the request that was previously sent
     * as one literal string.
     */
    private function build_search_body($sigla_uf, $cod_municipio)
    {
        $fields = array(
            'hid_template' => 'listar-consulta-avancada-ies',
            'hid_order' => 'ies.no_ies ASC',
            'hid_no_cidade_avancada' => '',
            'hid_no_regiao_avancada' => '',
            'hid_no_pais_avancada' => '',
            'hid_co_pais_avancada' => '',
            'rad_buscar_por' => 'IES',
            'txt_no_ies' => '',
            'txt_no_curso' => '',
            'txt_no_especializacao' => '',
            'sel_co_area' => '',
            'sel_sg_uf' => $sigla_uf,
            'sel_co_municipio' => $cod_municipio,
            'chk_tp_natureza_gn' => array('3', '1', '2', '5', '4', '6', '7'),
            'sel_st_gratuito' => '',
            'chk_tp_organizacao_gn' => array(
                '10022,10024,10023,10027',
                '10019,10020,10021,10026',
                '10026,10019',
                '10028,10029'
            ),
            'sel_no_indice_ies' => '',
            'sel_co_indice_ies' => '',
            'sel_no_indice_curso' => '',
            'sel_co_indice_curso' => '',
            'sel_co_situacao_funcionamento_ies' => '10035',
            'sel_co_situacao_funcionamento_curso' => '9',
            'sel_st_funcionamento_especializacao' => ''
        );

        $pairs = array();
        foreach ($fields as $name => $value) {
            $key = urlencode('data[CONSULTA_AVANCADA][' . $name . ']');
            if (is_array($value)) {
                // Multi-valued checkboxes are sent as repeated "name[]" entries.
                foreach ($value as $item) {
                    $pairs[] = $key . '%5B%5D=' . urlencode($item);
                }
            } else {
                $pairs[] = $key . '=' . urlencode((string) $value);
            }
        }
        $pairs[] = 'captcha=';

        return implode('&', $pairs);
    }

    /**
     * Parse an HTML string into a DOMDocument.
     * Returns null for a failed fetch (false) or blank markup, which
     * DOMDocument::loadHTML() cannot handle.
     */
    private function load_dom($html)
    {
        if (!is_string($html) || trim($html) === '') {
            return null;
        }

        // Collect parser warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument;
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $loaded ? $dom : null;
    }

    /**
     * Collect the <td> lists of every data record found inside <tbody> elements.
     * Each <tr> holding at least $min_cells cells is one record. A <tbody>
     * without such a row falls back to its flat cell list, which is how the
     * table used to be read.
     */
    private function tbody_records($dom, $min_cells)
    {
        $records = array();
        foreach ($dom->getElementsByTagName('tbody') as $tbody) {
            $matched = false;
            foreach ($tbody->getElementsByTagName('tr') as $row) {
                $cells = $row->getElementsByTagName('td');
                if ($cells->length >= $min_cells) {
                    $records[] = $cells;
                    $matched = true;
                }
            }
            if (!$matched) {
                $cells = $tbody->getElementsByTagName('td');
                if ($cells->length >= $min_cells) {
                    $records[] = $cells;
                }
            }
        }
        return $records;
    }
}

$mec = new MECApi;

// NOTE(review): the echoed string is a bare newline in this copy of the file;
// presumably markup was lost when the file was extracted - confirm.
echo "\n";

// Examples:

// print_r($mec->get_municipios('MG')); // Lists every municipality of Minas Gerais
// print_r($mec->get_instituicoes('000000003106200')); // Lists the institutions of Belo Horizonte (code 000000003106200)
// print_r($mec->get_instituicao_enderecos('575')); // Lists the address of each UFMG campus (institution code 575)
// print_r($mec->get_instituicao_cursos('34819', '575')); // Lists the courses of "Campus Saúde" (address code 34819) of UFMG (institution code 575)


--------------------------------------------------------------------------------
/src/simple_html_dom.php:
--------------------------------------------------------------------------------
   1 | size is the "real" number of bytes the dom was created from.
  17 |  *  but for most purposes, it's a really good estimation.
  18 |  * Paperg - Added the forceTagsClosed to the dom constructor.  Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
  19 |  * Allow the user to tell us how much they trust the html.
  20 |  * Paperg add the text and plaintext to the selectors for the find syntax.  plaintext implies text in the innertext of a node.  text implies that the tag is a text node.
  21 |  * This allows for us to find tags based on the text they contain.
  22 |  * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
  23 |  * Paperg: added parse_charset so that we know about the character set of the source document.
   24 |  *  NOTE:  If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
  25 |  *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
  26 |  *
  27 |  * Found infinite loop in the case of broken html in restore_noise.  Rewrote to protect from that.
  28 |  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
  29 |  *
  30 |  * Licensed under The MIT License
  31 |  * Redistributions of files must retain the above copyright notice.
  32 |  *
  33 |  * @author S.C. Chen 
  34 |  * @author John Schlick
  35 |  * @author Rus Carroll
  36 |  * @version 1.5 ($Rev: 196 $)
  37 |  * @package PlaceLocalInclude
  38 |  * @subpackage simple_html_dom
  39 |  */
  40 | 
  41 | /**
  42 |  * All of the Defines for the classes below.
  43 |  * @author S.C. Chen 
  44 |  */
  45 | define('HDOM_TYPE_ELEMENT', 1);
  46 | define('HDOM_TYPE_COMMENT', 2);
  47 | define('HDOM_TYPE_TEXT',    3);
  48 | define('HDOM_TYPE_ENDTAG',  4);
  49 | define('HDOM_TYPE_ROOT',    5);
  50 | define('HDOM_TYPE_UNKNOWN', 6);
  51 | define('HDOM_QUOTE_DOUBLE', 0);
  52 | define('HDOM_QUOTE_SINGLE', 1);
  53 | define('HDOM_QUOTE_NO',     3);
  54 | define('HDOM_INFO_BEGIN',   0);
  55 | define('HDOM_INFO_END',     1);
  56 | define('HDOM_INFO_QUOTE',   2);
  57 | define('HDOM_INFO_SPACE',   3);
  58 | define('HDOM_INFO_TEXT',    4);
  59 | define('HDOM_INFO_INNER',   5);
  60 | define('HDOM_INFO_OUTER',   6);
  61 | define('HDOM_INFO_ENDSPACE',7);
  62 | define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  63 | define('DEFAULT_BR_TEXT', "\r\n");
  64 | define('DEFAULT_SPAN_TEXT', " ");
  65 | define('MAX_FILE_SIZE', 600000);
  66 | // helper functions
  67 | // -----------------------------------------------------------------------------
  68 | // get html dom from file
  69 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
  70 | function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  71 | {
  72 |     // We DO force the tags to be terminated.
  73 |     $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  74 |     // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done.
  75 |     $contents = file_get_contents($url, $use_include_path, $context, $offset);
  76 |     // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
  77 |     //$contents = retrieve_url_contents($url);
  78 |     if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
  79 |     {
  80 |         return false;
  81 |     }
  82 |     // The second parameter can force the selectors to all be lowercase.
  83 |     $dom->load($contents, $lowercase, $stripRN);
  84 |     return $dom;
  85 | }
  86 | 
  87 | // get html dom from string
  88 | function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  89 | {
  90 |     $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  91 |     if (empty($str) || strlen($str) > MAX_FILE_SIZE)
  92 |     {
  93 |         $dom->clear();
  94 |         return false;
  95 |     }
  96 |     $dom->load($str, $lowercase, $stripRN);
  97 |     return $dom;
  98 | }
  99 | 
 100 | // dump html dom tree
 101 | function dump_html_tree($node, $show_attr=true, $deep=0)
 102 | {
 103 |     $node->dump($node);
 104 | }
 105 | 
 106 | 
 107 | /**
 108 |  * simple html dom node
 109 |  * PaperG - added ability for "find" routine to lowercase the value of the selector.
 110 |  * PaperG - added $tag_start to track the start position of the tag in the total byte index
 111 |  *
 112 |  * @package PlaceLocalInclude
 113 |  */
 114 | class simple_html_dom_node
 115 | {
 116 |     public $nodetype = HDOM_TYPE_TEXT;
 117 |     public $tag = 'text';
 118 |     public $attr = array();
 119 |     public $children = array();
 120 |     public $nodes = array();
 121 |     public $parent = null;
 122 |     // The "info" array - see HDOM_INFO_... for what each element contains.
 123 |     public $_ = array();
 124 |     public $tag_start = 0;
 125 |     private $dom = null;
 126 | 
 127 |     function __construct($dom)
 128 |     {
 129 |         $this->dom = $dom;
 130 |         $dom->nodes[] = $this;
 131 |     }
 132 | 
 133 |     function __destruct()
 134 |     {
 135 |         $this->clear();
 136 |     }
 137 | 
 138 |     function __toString()
 139 |     {
 140 |         return $this->outertext();
 141 |     }
 142 | 
 143 |     // clean up memory due to php5 circular references memory leak...
 144 |     function clear()
 145 |     {
 146 |         $this->dom = null;
 147 |         $this->nodes = null;
 148 |         $this->parent = null;
 149 |         $this->children = null;
 150 |     }
 151 | 
 152 |     // dump node's tree
 153 |     function dump($show_attr=true, $deep=0)
 154 |     {
 155 |         $lead = str_repeat('    ', $deep);
 156 | 
 157 |         echo $lead.$this->tag;
 158 |         if ($show_attr && count($this->attr)>0)
 159 |         {
 160 |             echo '(';
 161 |             foreach ($this->attr as $k=>$v)
 162 |                 echo "[$k]=>\"".$this->$k.'", ';
 163 |             echo ')';
 164 |         }
 165 |         echo "\n";
 166 | 
 167 |         if ($this->nodes)
 168 |         {
 169 |             foreach ($this->nodes as $c)
 170 |             {
 171 |                 $c->dump($show_attr, $deep+1);
 172 |             }
 173 |         }
 174 |     }
 175 | 
 176 | 
 177 |     // Debugging function to dump a single dom node with a bunch of information about it.
 178 |     function dump_node($echo=true)
 179 |     {
 180 | 
 181 |         $string = $this->tag;
 182 |         if (count($this->attr)>0)
 183 |         {
 184 |             $string .= '(';
 185 |             foreach ($this->attr as $k=>$v)
 186 |             {
 187 |                 $string .= "[$k]=>\"".$this->$k.'", ';
 188 |             }
 189 |             $string .= ')';
 190 |         }
 191 |         if (count($this->_)>0)
 192 |         {
 193 |             $string .= ' $_ (';
 194 |             foreach ($this->_ as $k=>$v)
 195 |             {
 196 |                 if (is_array($v))
 197 |                 {
 198 |                     $string .= "[$k]=>(";
 199 |                     foreach ($v as $k2=>$v2)
 200 |                     {
 201 |                         $string .= "[$k2]=>\"".$v2.'", ';
 202 |                     }
 203 |                     $string .= ")";
 204 |                 } else {
 205 |                     $string .= "[$k]=>\"".$v.'", ';
 206 |                 }
 207 |             }
 208 |             $string .= ")";
 209 |         }
 210 | 
 211 |         if (isset($this->text))
 212 |         {
 213 |             $string .= " text: (" . $this->text . ")";
 214 |         }
 215 | 
 216 |         $string .= " HDOM_INNER_INFO: '";
 217 |         if (isset($node->_[HDOM_INFO_INNER]))
 218 |         {
 219 |             $string .= $node->_[HDOM_INFO_INNER] . "'";
 220 |         }
 221 |         else
 222 |         {
 223 |             $string .= ' NULL ';
 224 |         }
 225 | 
 226 |         $string .= " children: " . count($this->children);
 227 |         $string .= " nodes: " . count($this->nodes);
 228 |         $string .= " tag_start: " . $this->tag_start;
 229 |         $string .= "\n";
 230 | 
 231 |         if ($echo)
 232 |         {
 233 |             echo $string;
 234 |             return;
 235 |         }
 236 |         else
 237 |         {
 238 |             return $string;
 239 |         }
 240 |     }
 241 | 
 242 |     // returns the parent of node
 243 |     // If a node is passed in, it will reset the parent of the current node to that one.
 244 |     function parent($parent=null)
 245 |     {
 246 |         // I am SURE that this doesn't work properly.
 247 |         // It fails to unset the current node from it's current parents nodes or children list first.
 248 |         if ($parent !== null)
 249 |         {
 250 |             $this->parent = $parent;
 251 |             $this->parent->nodes[] = $this;
 252 |             $this->parent->children[] = $this;
 253 |         }
 254 | 
 255 |         return $this->parent;
 256 |     }
 257 | 
 258 |     // verify that node has children
 259 |     function has_child()
 260 |     {
 261 |         return !empty($this->children);
 262 |     }
 263 | 
 264 |     // returns children of node
 265 |     function children($idx=-1)
 266 |     {
 267 |         if ($idx===-1)
 268 |         {
 269 |             return $this->children;
 270 |         }
 271 |         if (isset($this->children[$idx])) return $this->children[$idx];
 272 |         return null;
 273 |     }
 274 | 
 275 |     // returns the first child of node
 276 |     function first_child()
 277 |     {
 278 |         if (count($this->children)>0)
 279 |         {
 280 |             return $this->children[0];
 281 |         }
 282 |         return null;
 283 |     }
 284 | 
 285 |     // returns the last child of node
 286 |     function last_child()
 287 |     {
 288 |         if (($count=count($this->children))>0)
 289 |         {
 290 |             return $this->children[$count-1];
 291 |         }
 292 |         return null;
 293 |     }
 294 | 
 295 |     // returns the next sibling of node
 296 |     function next_sibling()
 297 |     {
 298 |         if ($this->parent===null)
 299 |         {
 300 |             return null;
 301 |         }
 302 | 
 303 |         $idx = 0;
 304 |         $count = count($this->parent->children);
 305 |         while ($idx<$count && $this!==$this->parent->children[$idx])
 306 |         {
 307 |             ++$idx;
 308 |         }
 309 |         if (++$idx>=$count)
 310 |         {
 311 |             return null;
 312 |         }
 313 |         return $this->parent->children[$idx];
 314 |     }
 315 | 
 316 |     // returns the previous sibling of node
 317 |     function prev_sibling()
 318 |     {
 319 |         if ($this->parent===null) return null;
 320 |         $idx = 0;
 321 |         $count = count($this->parent->children);
 322 |         while ($idx<$count && $this!==$this->parent->children[$idx])
 323 |             ++$idx;
 324 |         if (--$idx<0) return null;
 325 |         return $this->parent->children[$idx];
 326 |     }
 327 | 
 328 |     // function to locate a specific ancestor tag in the path to the root.
 329 |     function find_ancestor_tag($tag)
 330 |     {
 331 |         global $debugObject;
 332 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 333 | 
 334 |         // Start by including ourselves in the comparison.
 335 |         $returnDom = $this;
 336 | 
 337 |         while (!is_null($returnDom))
 338 |         {
 339 |             if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag); }
 340 | 
 341 |             if ($returnDom->tag == $tag)
 342 |             {
 343 |                 break;
 344 |             }
 345 |             $returnDom = $returnDom->parent;
 346 |         }
 347 |         return $returnDom;
 348 |     }
 349 | 
 350 |     // get dom node's inner html
 351 |     function innertext()
 352 |     {
 353 |         if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
 354 |         if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 355 | 
 356 |         $ret = '';
 357 |         foreach ($this->nodes as $n)
 358 |             $ret .= $n->outertext();
 359 |         return $ret;
 360 |     }
 361 | 
 362 |     // get dom node's outer text (with tag)
 363 |     function outertext()
 364 |     {
 365 |         global $debugObject;
 366 |         if (is_object($debugObject))
 367 |         {
 368 |             $text = '';
 369 |             if ($this->tag == 'text')
 370 |             {
 371 |                 if (!empty($this->text))
 372 |                 {
 373 |                     $text = " with text: " . $this->text;
 374 |                 }
 375 |             }
 376 |             $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
 377 |         }
 378 | 
 379 |         if ($this->tag==='root') return $this->innertext();
 380 | 
 381 |         // trigger callback
 382 |         if ($this->dom && $this->dom->callback!==null)
 383 |         {
 384 |             call_user_func_array($this->dom->callback, array($this));
 385 |         }
 386 | 
 387 |         if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
 388 |         if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 389 | 
 390 |         // render begin tag
 391 |         if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
 392 |         {
 393 |             $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
 394 |         } else {
 395 |             $ret = "";
 396 |         }
 397 | 
 398 |         // render inner text
 399 |         if (isset($this->_[HDOM_INFO_INNER]))
 400 |         {
 401 |             // If it's a br tag...  don't return the HDOM_INNER_INFO that we may or may not have added.
 402 |             if ($this->tag != "br")
 403 |             {
 404 |                 $ret .= $this->_[HDOM_INFO_INNER];
 405 |             }
 406 |         } else {
 407 |             if ($this->nodes)
 408 |             {
 409 |                 foreach ($this->nodes as $n)
 410 |                 {
 411 |                     $ret .= $this->convert_text($n->outertext());
 412 |                 }
 413 |             }
 414 |         }
 415 | 
 416 |         // render end tag
 417 |         if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
 418 |             $ret .= 'tag.'>';
 419 |         return $ret;
 420 |     }
 421 | 
 422 |     // get dom node's plain text
 423 |     function text()
 424 |     {
 425 |         if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
 426 |         switch ($this->nodetype)
 427 |         {
 428 |             case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 429 |             case HDOM_TYPE_COMMENT: return '';
 430 |             case HDOM_TYPE_UNKNOWN: return '';
 431 |         }
 432 |         if (strcasecmp($this->tag, 'script')===0) return '';
 433 |         if (strcasecmp($this->tag, 'style')===0) return '';
 434 | 
 435 |         $ret = '';
 436 |         // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
 437 |         // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
 438 |         // WHY is this happening?
 439 |         if (!is_null($this->nodes))
 440 |         {
 441 |             foreach ($this->nodes as $n)
 442 |             {
 443 |                 $ret .= $this->convert_text($n->text());
 444 |             }
 445 | 
 446 |             // If this node is a span... add a space at the end of it so multiple spans don't run into each other.  This is plaintext after all.
 447 |             if ($this->tag == "span")
 448 |             {
 449 |                 $ret .= $this->dom->default_span_text;
 450 |             }
 451 | 
 452 | 
 453 |         }
 454 |         return $ret;
 455 |     }
 456 | 
 457 |     function xmltext()
 458 |     {
 459 |         $ret = $this->innertext();
 460 |         $ret = str_ireplace('', '', $ret);
 462 |         return $ret;
 463 |     }
 464 | 
 465 |     // build node's text with tag
 466 |     function makeup()
 467 |     {
 468 |         // text, comment, unknown
 469 |         if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 470 | 
 471 |         $ret = '<'.$this->tag;
 472 |         $i = -1;
 473 | 
 474 |         foreach ($this->attr as $key=>$val)
 475 |         {
 476 |             ++$i;
 477 | 
 478 |             // skip removed attribute
 479 |             if ($val===null || $val===false)
 480 |                 continue;
 481 | 
 482 |             $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
 483 |             //no value attr: nowrap, checked selected...
 484 |             if ($val===true)
 485 |                 $ret .= $key;
 486 |             else {
 487 |                 switch ($this->_[HDOM_INFO_QUOTE][$i])
 488 |                 {
 489 |                     case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
 490 |                     case HDOM_QUOTE_SINGLE: $quote = '\''; break;
 491 |                     default: $quote = '';
 492 |                 }
 493 |                 $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
 494 |             }
 495 |         }
 496 |         $ret = $this->dom->restore_noise($ret);
 497 |         return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
 498 |     }
 499 | 
 500 |     // find elements by css selector
 501 |     //PaperG - added ability for find to lowercase the value of the selector.
 502 |     function find($selector, $idx=null, $lowercase=false)
 503 |     {
 504 |         $selectors = $this->parse_selector($selector);
 505 |         if (($count=count($selectors))===0) return array();
 506 |         $found_keys = array();
 507 | 
 508 |         // find each selector
 509 |         for ($c=0; $c<$count; ++$c)
 510 |         {
 511 |             // The change on the below line was documented on the sourceforge code tracker id 2788009
 512 |             // used to be: if (($levle=count($selectors[0]))===0) return array();
 513 |             if (($levle=count($selectors[$c]))===0) return array();
 514 |             if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
 515 | 
 516 |             $head = array($this->_[HDOM_INFO_BEGIN]=>1);
 517 | 
 518 |             // handle descendant selectors, no recursive!
 519 |             for ($l=0; $l<$levle; ++$l)
 520 |             {
 521 |                 $ret = array();
 522 |                 foreach ($head as $k=>$v)
 523 |                 {
 524 |                     $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
 525 |                     //PaperG - Pass this optional parameter on to the seek function.
 526 |                     $n->seek($selectors[$c][$l], $ret, $lowercase);
 527 |                 }
 528 |                 $head = $ret;
 529 |             }
 530 | 
 531 |             foreach ($head as $k=>$v)
 532 |             {
 533 |                 if (!isset($found_keys[$k]))
 534 |                     $found_keys[$k] = 1;
 535 |             }
 536 |         }
 537 | 
 538 |         // sort keys
 539 |         ksort($found_keys);
 540 | 
 541 |         $found = array();
 542 |         foreach ($found_keys as $k=>$v)
 543 |             $found[] = $this->dom->nodes[$k];
 544 | 
 545 |         // return nth-element or array
 546 |         if (is_null($idx)) return $found;
 547 |         else if ($idx<0) $idx = count($found) + $idx;
 548 |         return (isset($found[$idx])) ? $found[$idx] : null;
 549 |     }
 550 | 
 551 |     // seek for given conditions
 552 |     // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
 553 |     protected function seek($selector, &$ret, $lowercase=false)
 554 |     {
 555 |         global $debugObject;
 556 |         if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 557 | 
 558 |         list($tag, $key, $val, $exp, $no_key) = $selector;
 559 | 
 560 |         // xpath index
 561 |         if ($tag && $key && is_numeric($key))
 562 |         {
 563 |             $count = 0;
 564 |             foreach ($this->children as $c)
 565 |             {
 566 |                 if ($tag==='*' || $tag===$c->tag) {
 567 |                     if (++$count==$key) {
 568 |                         $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
 569 |                         return;
 570 |                     }
 571 |                 }
 572 |             }
 573 |             return;
 574 |         }
 575 | 
 576 |         $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
 577 |         if ($end==0) {
 578 |             $parent = $this->parent;
 579 |             while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
 580 |                 $end -= 1;
 581 |                 $parent = $parent->parent;
 582 |             }
 583 |             $end += $parent->_[HDOM_INFO_END];
 584 |         }
 585 | 
 586 |         for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
 587 |             $node = $this->dom->nodes[$i];
 588 | 
 589 |             $pass = true;
 590 | 
 591 |             if ($tag==='*' && !$key) {
 592 |                 if (in_array($node, $this->children, true))
 593 |                     $ret[$i] = 1;
 594 |                 continue;
 595 |             }
 596 | 
 597 |             // compare tag
 598 |             if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
 599 |             // compare key
 600 |             if ($pass && $key) {
 601 |                 if ($no_key) {
 602 |                     if (isset($node->attr[$key])) $pass=false;
 603 |                 } else {
 604 |                     if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
 605 |                 }
 606 |             }
 607 |             // compare value
 608 |             if ($pass && $key && $val  && $val!=='*') {
 609 |                 // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
 610 |                 if ($key == "plaintext") {
 611 |                     // $node->plaintext actually returns $node->text();
 612 |                     $nodeKeyValue = $node->text();
 613 |                 } else {
 614 |                     // this is a normal search, we want the value of that attribute of the tag.
 615 |                     $nodeKeyValue = $node->attr[$key];
 616 |                 }
 617 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
 618 | 
 619 |                 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
 620 |                 if ($lowercase) {
 621 |                     $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
 622 |                 } else {
 623 |                     $check = $this->match($exp, $val, $nodeKeyValue);
 624 |                 }
 625 |                 if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}
 626 | 
 627 |                 // handle multiple class
 628 |                 if (!$check && strcasecmp($key, 'class')===0) {
 629 |                     foreach (explode(' ',$node->attr[$key]) as $k) {
 630 |                         // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
 631 |                         if (!empty($k)) {
 632 |                             if ($lowercase) {
 633 |                                 $check = $this->match($exp, strtolower($val), strtolower($k));
 634 |                             } else {
 635 |                                 $check = $this->match($exp, $val, $k);
 636 |                             }
 637 |                             if ($check) break;
 638 |                         }
 639 |                     }
 640 |                 }
 641 |                 if (!$check) $pass = false;
 642 |             }
 643 |             if ($pass) $ret[$i] = 1;
 644 |             unset($node);
 645 |         }
 646 |         // It's passed by reference so this is actually what this function returns.
 647 |         if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
 648 |     }
 649 | 
 650 |     protected function match($exp, $pattern, $value) {
 651 |         global $debugObject;
 652 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 653 | 
 654 |         switch ($exp) {
 655 |             case '=':
 656 |                 return ($value===$pattern);
 657 |             case '!=':
 658 |                 return ($value!==$pattern);
 659 |             case '^=':
 660 |                 return preg_match("/^".preg_quote($pattern,'/')."/", $value);
 661 |             case '$=':
 662 |                 return preg_match("/".preg_quote($pattern,'/')."$/", $value);
 663 |             case '*=':
 664 |                 if ($pattern[0]=='/') {
 665 |                     return preg_match($pattern, $value);
 666 |                 }
 667 |                 return preg_match("/".$pattern."/i", $value);
 668 |         }
 669 |         return false;
 670 |     }
 671 | 
 672 |     protected function parse_selector($selector_string) {
 673 |         global $debugObject;
 674 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 675 | 
 676 |         // pattern of CSS selectors, modified from mootools
 677 |         // Paperg: Add the colon to the attrbute, so that it properly finds  like google does.
 678 |         // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.
 679 | // Notice the \[ starting the attbute?  and the @? following?  This implies that an attribute can begin with an @ sign that is not captured.
 680 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
 681 | // farther study is required to determine of this should be documented or removed.
 682 | //        $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 683 |         $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 684 |         preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
 685 |         if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
 686 | 
 687 |         $selectors = array();
 688 |         $result = array();
 689 |         //print_r($matches);
 690 | 
 691 |         foreach ($matches as $m) {
 692 |             $m[0] = trim($m[0]);
 693 |             if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
 694 |             // for browser generated xpath
 695 |             if ($m[1]==='tbody') continue;
 696 | 
 697 |             list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
 698 |             if (!empty($m[2])) {$key='id'; $val=$m[2];}
 699 |             if (!empty($m[3])) {$key='class'; $val=$m[3];}
 700 |             if (!empty($m[4])) {$key=$m[4];}
 701 |             if (!empty($m[5])) {$exp=$m[5];}
 702 |             if (!empty($m[6])) {$val=$m[6];}
 703 | 
 704 |             // convert to lowercase
 705 |             if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
 706 |             //elements that do NOT have the specified attribute
 707 |             if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
 708 | 
 709 |             $result[] = array($tag, $key, $val, $exp, $no_key);
 710 |             if (trim($m[7])===',') {
 711 |                 $selectors[] = $result;
 712 |                 $result = array();
 713 |             }
 714 |         }
 715 |         if (count($result)>0)
 716 |             $selectors[] = $result;
 717 |         return $selectors;
 718 |     }
 719 | 
 720 |     function __get($name) {
 721 |         if (isset($this->attr[$name]))
 722 |         {
 723 |             return $this->convert_text($this->attr[$name]);
 724 |         }
 725 |         switch ($name) {
 726 |             case 'outertext': return $this->outertext();
 727 |             case 'innertext': return $this->innertext();
 728 |             case 'plaintext': return $this->text();
 729 |             case 'xmltext': return $this->xmltext();
 730 |             default: return array_key_exists($name, $this->attr);
 731 |         }
 732 |     }
 733 | 
 734 |     function __set($name, $value) {
 735 |         switch ($name) {
 736 |             case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
 737 |             case 'innertext':
 738 |                 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
 739 |                 return $this->_[HDOM_INFO_INNER] = $value;
 740 |         }
 741 |         if (!isset($this->attr[$name])) {
 742 |             $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
 743 |             $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
 744 |         }
 745 |         $this->attr[$name] = $value;
 746 |     }
 747 | 
 748 |     function __isset($name) {
 749 |         switch ($name) {
 750 |             case 'outertext': return true;
 751 |             case 'innertext': return true;
 752 |             case 'plaintext': return true;
 753 |         }
 754 |         //no value attr: nowrap, checked selected...
 755 |         return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
 756 |     }
 757 | 
 758 |     function __unset($name) {
 759 |         if (isset($this->attr[$name]))
 760 |             unset($this->attr[$name]);
 761 |     }
 762 | 
 763 |     // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
 764 |     function convert_text($text)
 765 |     {
 766 |         global $debugObject;
 767 |         if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 768 | 
 769 |         $converted_text = $text;
 770 | 
 771 |         $sourceCharset = "";
 772 |         $targetCharset = "";
 773 | 
 774 |         if ($this->dom)
 775 |         {
 776 |             $sourceCharset = strtoupper($this->dom->_charset);
 777 |             $targetCharset = strtoupper($this->dom->_target_charset);
 778 |         }
 779 |         if (is_object($debugObject)) {$debugObject->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
 780 | 
 781 |         if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
 782 |         {
 783 |             // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
 784 |             if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
 785 |             {
 786 |                 $converted_text = $text;
 787 |             }
 788 |             else
 789 |             {
 790 |                 $converted_text = iconv($sourceCharset, $targetCharset, $text);
 791 |             }
 792 |         }
 793 | 
 794 |         // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
 795 |         if ($targetCharset == 'UTF-8')
 796 |         {
 797 |             if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
 798 |             {
 799 |                 $converted_text = substr($converted_text, 3);
 800 |             }
 801 |             if (substr($converted_text, -3) == "\xef\xbb\xbf")
 802 |             {
 803 |                 $converted_text = substr($converted_text, 0, -3);
 804 |             }
 805 |         }
 806 | 
 807 |         return $converted_text;
 808 |     }
 809 | 
 810 |     /**
 811 |     * Returns true if $string is valid UTF-8 and false otherwise.
 812 |     *
 813 |     * @param mixed $str String to be tested
 814 |     * @return boolean
 815 |     */
 816 |     static function is_utf8($str)
 817 |     {
 818 |         $c=0; $b=0;
 819 |         $bits=0;
 820 |         $len=strlen($str);
 821 |         for($i=0; $i<$len; $i++)
 822 |         {
 823 |             $c=ord($str[$i]);
 824 |             if($c > 128)
 825 |             {
 826 |                 if(($c >= 254)) return false;
 827 |                 elseif($c >= 252) $bits=6;
 828 |                 elseif($c >= 248) $bits=5;
 829 |                 elseif($c >= 240) $bits=4;
 830 |                 elseif($c >= 224) $bits=3;
 831 |                 elseif($c >= 192) $bits=2;
 832 |                 else return false;
 833 |                 if(($i+$bits) > $len) return false;
 834 |                 while($bits > 1)
 835 |                 {
 836 |                     $i++;
 837 |                     $b=ord($str[$i]);
 838 |                     if($b < 128 || $b > 191) return false;
 839 |                     $bits--;
 840 |                 }
 841 |             }
 842 |         }
 843 |         return true;
 844 |     }
 845 |     /*
 846 |     function is_utf8($string)
 847 |     {
 848 |         //this is buggy
 849 |         return (utf8_encode(utf8_decode($string)) == $string);
 850 |     }
 851 |     */
 852 | 
 853 |     /**
 854 |      * Function to try a few tricks to determine the displayed size of an img on the page.
 855 |      * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 856 |      *
 857 |      * @author John Schlick
 858 |      * @version April 19 2012
 859 |      * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
 860 |      */
 861 |     function get_display_size()
 862 |     {
 863 |         global $debugObject;
 864 | 
 865 |         $width = -1;
 866 |         $height = -1;
 867 | 
 868 |         if ($this->tag !== 'img')
 869 |         {
 870 |             return false;
 871 |         }
 872 | 
 873 |         // See if there is aheight or width attribute in the tag itself.
 874 |         if (isset($this->attr['width']))
 875 |         {
 876 |             $width = $this->attr['width'];
 877 |         }
 878 | 
 879 |         if (isset($this->attr['height']))
 880 |         {
 881 |             $height = $this->attr['height'];
 882 |         }
 883 | 
 884 |         // Now look for an inline style.
 885 |         if (isset($this->attr['style']))
 886 |         {
 887 |             // Thanks to user gnarf from stackoverflow for this regular expression.
 888 |             $attributes = array();
 889 |             preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
 890 |             foreach ($matches as $match) {
 891 |               $attributes[$match[1]] = $match[2];
 892 |             }
 893 | 
 894 |             // If there is a width in the style attributes:
 895 |             if (isset($attributes['width']) && $width == -1)
 896 |             {
 897 |                 // check that the last two characters are px (pixels)
 898 |                 if (strtolower(substr($attributes['width'], -2)) == 'px')
 899 |                 {
 900 |                     $proposed_width = substr($attributes['width'], 0, -2);
 901 |                     // Now make sure that it's an integer and not something stupid.
 902 |                     if (filter_var($proposed_width, FILTER_VALIDATE_INT))
 903 |                     {
 904 |                         $width = $proposed_width;
 905 |                     }
 906 |                 }
 907 |             }
 908 | 
 909 |             // If there is a width in the style attributes:
 910 |             if (isset($attributes['height']) && $height == -1)
 911 |             {
 912 |                 // check that the last two characters are px (pixels)
 913 |                 if (strtolower(substr($attributes['height'], -2)) == 'px')
 914 |                 {
 915 |                     $proposed_height = substr($attributes['height'], 0, -2);
 916 |                     // Now make sure that it's an integer and not something stupid.
 917 |                     if (filter_var($proposed_height, FILTER_VALIDATE_INT))
 918 |                     {
 919 |                         $height = $proposed_height;
 920 |                     }
 921 |                 }
 922 |             }
 923 | 
 924 |         }
 925 | 
 926 |         // Future enhancement:
 927 |         // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
 928 | 
 929 |         // Far future enhancement
 930 |         // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
 931 |         // Note that in this case, the class or id will have the img subselector for it to apply to the image.
 932 | 
 933 |         // ridiculously far future development
 934 |         // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
 935 | 
 936 |         $result = array('height' => $height,
 937 |                         'width' => $width);
 938 |         return $result;
 939 |     }
 940 | 
 941 |     // camel naming conventions
 942 |     function getAllAttributes() {return $this->attr;}
 943 |     function getAttribute($name) {return $this->__get($name);}
 944 |     function setAttribute($name, $value) {$this->__set($name, $value);}
 945 |     function hasAttribute($name) {return $this->__isset($name);}
 946 |     function removeAttribute($name) {$this->__set($name, null);}
 947 |     function getElementById($id) {return $this->find("#$id", 0);}
 948 |     function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
 949 |     function getElementByTagName($name) {return $this->find($name, 0);}
 950 |     function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
 951 |     function parentNode() {return $this->parent();}
 952 |     function childNodes($idx=-1) {return $this->children($idx);}
 953 |     function firstChild() {return $this->first_child();}
 954 |     function lastChild() {return $this->last_child();}
 955 |     function nextSibling() {return $this->next_sibling();}
 956 |     function previousSibling() {return $this->prev_sibling();}
 957 |     function hasChildNodes() {return $this->has_child();}
 958 |     function nodeName() {return $this->tag;}
 959 |     function appendChild($node) {$node->parent($this); return $node;}
 960 | 
 961 | }
 962 | 
 963 | /**
 964 |  * simple html dom parser
 965 |  * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
 966 |  * Paperg - change $size from protected to public so we can easily access it
 967 |  * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not.  Default is to NOT trust it.
 968 |  *
 969 |  * @package PlaceLocalInclude
 970 |  */
 971 | class simple_html_dom
 972 | {
 973 |     public $root = null;
 974 |     public $nodes = array();
 975 |     public $callback = null;
 976 |     public $lowercase = false;
 977 |     // Used to keep track of how large the text was when we started.
 978 |     public $original_size;
 979 |     public $size;
 980 |     protected $pos;
 981 |     protected $doc;
 982 |     protected $char;
 983 |     protected $cursor;
 984 |     protected $parent;
 985 |     protected $noise = array();
 986 |     protected $token_blank = " \t\r\n";
 987 |     protected $token_equal = ' =/>';
 988 |     protected $token_slash = " />\r\n\t";
 989 |     protected $token_attr = ' >';
 990 |     // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
 991 |     public $_charset = '';
 992 |     public $_target_charset = '';
 993 |     protected $default_br_text = "";
 994 |     public $default_span_text = "";
 995 | 
 996 |     // use isset instead of in_array, performance boost about 30%...
 997 |     protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
 998 |     protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
 999 |     // Known sourceforge issue #2977341
1000 |     // B tags that are not closed cause us to return everything to the end of the document.
1001 |     protected $optional_closing_tags = array(
1002 |         'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1003 |         'th'=>array('th'=>1),
1004 |         'td'=>array('td'=>1),
1005 |         'li'=>array('li'=>1),
1006 |         'dt'=>array('dt'=>1, 'dd'=>1),
1007 |         'dd'=>array('dd'=>1, 'dt'=>1),
1008 |         'dl'=>array('dd'=>1, 'dt'=>1),
1009 |         'p'=>array('p'=>1),
1010 |         'nobr'=>array('nobr'=>1),
1011 |         'b'=>array('b'=>1),
1012 | 		'option'=>array('option'=>1),
1013 |     );
1014 | 
1015 |     function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1016 |     {
1017 |         if ($str)
1018 |         {
1019 |             if (preg_match("/^http:\/\//i",$str) || is_file($str))
1020 |             {
1021 |                 $this->load_file($str);
1022 |             }
1023 |             else
1024 |             {
1025 |                 $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1026 |             }
1027 |         }
1028 |         // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
1029 |         if (!$forceTagsClosed) {
1030 |             $this->optional_closing_array=array();
1031 |         }
1032 |         $this->_target_charset = $target_charset;
1033 |     }
1034 | 
1035 |     function __destruct()
1036 |     {
1037 |         $this->clear();
1038 |     }
1039 | 
1040 |     // load html from string
1041 |     function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1042 |     {
1043 |         global $debugObject;
1044 | 
1045 |         // prepare
1046 |         $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1047 |         // strip out comments
1048 |         $this->remove_noise("''is");
1049 |         // strip out cdata
1050 |         $this->remove_noise("''is", true);
1051 |         // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1052 |         // Script tags removal now preceeds style tag removal.
1053 |         // strip out