├── README.md
├── crawler.php
└── simple_html_dom.php
/README.md:
--------------------------------------------------------------------------------
1 |
Simple PHP Web Crawler
2 | A Web Crawler created in PHP.
3 | See Blog Post @ http://www.subinsb.com/2013/10/simple-web-crawler-in-php.html
4 |
--------------------------------------------------------------------------------
/crawler.php:
--------------------------------------------------------------------------------
1 |
2 | include("simple_html_dom.php");
3 | $crawled_urls=array();
4 | $found_urls=array();
/**
 * Resolve a link found in a document against the URL of that document.
 *
 * @param string $rel  Link as written in the page (relative or absolute).
 * @param string $base Absolute URL the link is relative to.
 * @return string Absolute URL; $rel unchanged when it already has a scheme,
 *                $base when $rel is empty.
 */
function rel2abs($rel, $base)
{
    $rel = (string) $rel;

    // An empty reference designates the base document itself. Returning here
    // also keeps the $rel[0] look-ups below away from an empty string.
    if ($rel === '') {
        return $base;
    }
    // Anything that already carries a scheme (http:, mailto:, javascript:, ...)
    // needs no resolution.
    if (parse_url($rel, PHP_URL_SCHEME) != '') {
        return $rel;
    }
    // Fragment-only and query-only references attach to the base as it stands.
    if ($rel[0] == '#' || $rel[0] == '?') {
        return $base . $rel;
    }

    // Read the components explicitly rather than extract()ing them: a base
    // without a path or scheme would otherwise leave those variables undefined.
    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = array();
    }
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    $host = isset($parts['host']) ? $parts['host'] : '';
    if (isset($parts['port'])) {
        // Keep a non-default port so the result stays on the same origin.
        $host .= ':' . $parts['port'];
    }
    $path = isset($parts['path']) ? $parts['path'] : '';

    // Protocol-relative link ("//host/path"): only the scheme is inherited.
    if (substr($rel, 0, 2) == '//') {
        return $scheme . ':' . $rel;
    }

    // Drop the last path segment (the "file" part) of the base.
    $path = preg_replace('#/[^/]*$#', '', $path);
    // A root-relative link discards the base directory entirely.
    if ($rel[0] == '/') {
        $path = '';
    }

    $abs = "$host$path/$rel";

    // Collapse "//" and "/./", then fold "dir/../" pairs, until nothing changes.
    $re = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $abs = preg_replace($re, '/', $abs, -1, $n);
    } while ($n > 0);

    // Any "../" left over would climb above the root; drop it.
    $abs = str_replace("../", "", $abs);

    return $scheme . '://' . $abs;
}
/**
 * Turn a link taken from a page into an absolute URL.
 *
 * @param string $u Link as found in the page (absolute, protocol-relative or relative).
 * @param string $b URL of the page the link was found on.
 * @return string Absolute URL (non-HTTP schemes such as mailto: come back
 *                unchanged via rel2abs()).
 */
function perfect_url($u, $b)
{
    $bp = parse_url($b);
    if (!is_array($bp)) {
        $bp = array();
    }
    // Components may be missing (e.g. "http://host" has no path), so read
    // them defensively instead of indexing blindly.
    $scheme = !empty($bp['scheme']) ? $bp['scheme'] : 'http';
    $path = isset($bp['path']) ? $bp['path'] : '';

    // Any base other than the bare root is reduced to "scheme://host/".
    // NOTE(review): this resolves relative links against the site root rather
    // than the page's own directory; kept as-is because the crawl output
    // depends on it - confirm whether that is intended.
    if ($path != '/') {
        $host = isset($bp['host']) ? $bp['host'] : '';
        if (isset($bp['port'])) {
            $host .= ':' . $bp['port'];
        }
        $b = $scheme . '://' . $host . '/';
    }

    $u = (string) $u;

    // Protocol-relative links inherit the scheme of the page they appear on.
    if (substr($u, 0, 2) == '//') {
        $u = $scheme . ':' . $u;
    }

    // Only genuine http(s) URLs are final; a relative link that merely starts
    // with "http" (e.g. "httpd-docs/") still has to be resolved.
    if (!preg_match('#^https?://#i', $u)) {
        $u = rel2abs($u, $b);
    }
    return $u;
}
/**
 * Fetch one page and print every link on it that has not been printed before.
 *
 * Uses two global arrays, both keyed by the urlencode()'d URL:
 * $crawled_urls (URL => YmdHis timestamp of the last fetch) and
 * $found_urls (URL => 1 once the link has been printed).
 *
 * @param string $u URL of the page to crawl.
 * @return void
 */
function crawl_site($u)
{
    // $found_urls must be imported too; without it the de-duplication below
    // works on an undefined local and never remembers anything.
    global $crawled_urls, $found_urls;

    $uen = urlencode($u);

    // Skip pages that were fetched within the last 25 seconds.
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));
    if (array_key_exists($uen, $crawled_urls) && $crawled_urls[$uen] >= $cutoff) {
        return;
    }

    $html = file_get_html($u);
    $crawled_urls[$uen] = date("YmdHis");

    // file_get_html() returns false when the page is unreachable, empty or
    // larger than MAX_FILE_SIZE; there is nothing to scan in that case.
    if (!$html) {
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        $enurl = urlencode($url);

        $isNavigable = $url != ''
            && substr($url, 0, 4) != "mail"   // mailto: links
            && substr($url, 0, 4) != "java";  // javascript: pseudo-links
        if ($isNavigable && !array_key_exists($enurl, $found_urls)) {
            $found_urls[$enurl] = 1;
            // NOTE(review): URLs are printed back to back with no separator,
            // exactly as before - confirm the intended output format.
            echo $url;
        }
    }

    // Release the parser's circular references once the links are collected.
    $html->clear();
}
47 | crawl_site("http://www.subinsb.com");
48 | ?>
49 |
--------------------------------------------------------------------------------
/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real" number of bytes the dom was created from.
18 | * but for most purposes, it's a really good estimation.
19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 | * Allow the user to tell us how much they trust the html.
21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 | * This allows for us to find tags based on the text they contain.
23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 | * Paperg: added parse_charset so that we know about the character set of the source document.
25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27 | *
28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30 | *
31 | * Licensed under The MIT License
32 | * Redistributions of files must retain the above copyright notice.
33 | *
34 | * @author S.C. Chen
35 | * @author John Schlick
36 | * @author Rus Carroll
37 | * @version 1.5 ($Rev: 208 $)
38 | * @package PlaceLocalInclude
39 | * @subpackage simple_html_dom
40 | */
41 |
42 | /**
43 | * All of the Defines for the classes below.
44 | * @author S.C. Chen
45 | */
46 | define('HDOM_TYPE_ELEMENT', 1);
47 | define('HDOM_TYPE_COMMENT', 2);
48 | define('HDOM_TYPE_TEXT', 3);
49 | define('HDOM_TYPE_ENDTAG', 4);
50 | define('HDOM_TYPE_ROOT', 5);
51 | define('HDOM_TYPE_UNKNOWN', 6);
52 | define('HDOM_QUOTE_DOUBLE', 0);
53 | define('HDOM_QUOTE_SINGLE', 1);
54 | define('HDOM_QUOTE_NO', 3);
55 | define('HDOM_INFO_BEGIN', 0);
56 | define('HDOM_INFO_END', 1);
57 | define('HDOM_INFO_QUOTE', 2);
58 | define('HDOM_INFO_SPACE', 3);
59 | define('HDOM_INFO_TEXT', 4);
60 | define('HDOM_INFO_INNER', 5);
61 | define('HDOM_INFO_OUTER', 6);
62 | define('HDOM_INFO_ENDSPACE',7);
63 | define('DEFAULT_TARGET_CHARSET', 'UTF-8');
64 | define('DEFAULT_BR_TEXT', "\r\n");
65 | define('DEFAULT_SPAN_TEXT', " ");
66 | define('MAX_FILE_SIZE', 600000);
67 | // helper functions
68 | // -----------------------------------------------------------------------------
69 | // get html dom from file
70 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Load a file or URL and parse it into a simple_html_dom object.
 *
 * @param string        $url              Path or URL to read.
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context forwarded to file_get_contents().
 * @param int           $offset           Byte offset to start reading at; negative values mean "from the start".
 * @param int           $maxLen           Maximum number of bytes to read; values <= 0 mean "no limit".
 * @param bool          $lowercase        Forwarded to the parser (constructor and load()).
 * @param bool          $forceTagsClosed  Forwarded to the parser constructor.
 * @param string        $target_charset   Forwarded to the parser constructor.
 * @param bool          $stripRN          Forwarded to the parser (constructor and load()).
 * @param string        $defaultBRText    Forwarded to the parser constructor.
 * @param string        $defaultSpanText  Forwarded to the parser constructor.
 * @return simple_html_dom|false The loaded DOM, or false when the content is
 *                               unreadable, empty or larger than MAX_FILE_SIZE.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Since PHP 7.1 a negative offset makes file_get_contents() seek from the
    // END of the stream, which fails outright on non-seekable (HTTP) streams.
    // The historical default of -1 always meant "read from the beginning".
    if ($offset < 0)
    {
        $offset = 0;
    }

    // Only pass a length when the caller actually asked for a limit; PHP 8
    // rejects negative lengths.
    if ($maxLen > 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Same clean-up as str_get_html(): drop the unused parser before bailing out.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87 |
88 | // get html dom from string
/**
 * Parse an HTML string into a simple_html_dom object.
 *
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $parser = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($usable)
    {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }

    // Nothing to parse: release the parser that was created up front.
    $parser->clear();
    return false;
}
100 |
101 | // dump html dom tree
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node      Node exposing dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Starting indentation depth.
 * @return void
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; previously the node itself was passed in
    // the $show_attr slot and both options were silently ignored.
    $node->dump($show_attr, $deep);
}
106 |
107 |
108 | /**
109 | * simple html dom node
110 | * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index
112 | *
113 | * @package PlaceLocalInclude
114 | */
115 | class simple_html_dom_node
116 | {
117 | public $nodetype = HDOM_TYPE_TEXT;
118 | public $tag = 'text';
119 | public $attr = array();
120 | public $children = array();
121 | public $nodes = array();
122 | public $parent = null;
123 | // The "info" array - see HDOM_INFO_... for what each element contains.
124 | public $_ = array();
125 | public $tag_start = 0;
126 | private $dom = null;
127 |
128 | function __construct($dom)
129 | {
130 | $this->dom = $dom;
131 | $dom->nodes[] = $this;
132 | }
133 |
134 | function __destruct()
135 | {
136 | $this->clear();
137 | }
138 |
139 | function __toString()
140 | {
141 | return $this->outertext();
142 | }
143 |
144 | // clean up memory due to php5 circular references memory leak...
145 | function clear()
146 | {
147 | $this->dom = null;
148 | $this->nodes = null;
149 | $this->parent = null;
150 | $this->children = null;
151 | }
152 |
153 | // dump node's tree
154 | function dump($show_attr=true, $deep=0)
155 | {
156 | $lead = str_repeat(' ', $deep);
157 |
158 | echo $lead.$this->tag;
159 | if ($show_attr && count($this->attr)>0)
160 | {
161 | echo '(';
162 | foreach ($this->attr as $k=>$v)
163 | echo "[$k]=>\"".$this->$k.'", ';
164 | echo ')';
165 | }
166 | echo "\n";
167 |
168 | if ($this->nodes)
169 | {
170 | foreach ($this->nodes as $c)
171 | {
172 | $c->dump($show_attr, $deep+1);
173 | }
174 | }
175 | }
176 |
177 |
178 | // Debugging function to dump a single dom node with a bunch of information about it.
// Debugging function to dump a single dom node with a bunch of information about it.
// Echoes the description when $echo is true (and returns nothing); otherwise
// returns it as a string.
function dump_node($echo=true)
{
    $string = $this->tag;

    // Attributes, read through __get so they appear charset-converted.
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }

    // The "info" array (see the HDOM_INFO_... constants); nested arrays are
    // expanded one level deep.
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    // Report this node's own inner info. The previous code read an undefined
    // $node variable here, so this section always printed NULL.
    $string .= " HDOM_INNER_INFO: '";
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
242 |
243 | // returns the parent of node
244 | // If a node is passed in, it will reset the parent of the current node to that one.
245 | function parent($parent=null)
246 | {
247 | // I am SURE that this doesn't work properly.
248 | // It fails to unset the current node from it's current parents nodes or children list first.
249 | if ($parent !== null)
250 | {
251 | $this->parent = $parent;
252 | $this->parent->nodes[] = $this;
253 | $this->parent->children[] = $this;
254 | }
255 |
256 | return $this->parent;
257 | }
258 |
259 | // verify that node has children
260 | function has_child()
261 | {
262 | return !empty($this->children);
263 | }
264 |
265 | // returns children of node
266 | function children($idx=-1)
267 | {
268 | if ($idx===-1)
269 | {
270 | return $this->children;
271 | }
272 | if (isset($this->children[$idx]))
273 | {
274 | return $this->children[$idx];
275 | }
276 | return null;
277 | }
278 |
279 | // returns the first child of node
280 | function first_child()
281 | {
282 | if (count($this->children)>0)
283 | {
284 | return $this->children[0];
285 | }
286 | return null;
287 | }
288 |
289 | // returns the last child of node
290 | function last_child()
291 | {
292 | if (($count=count($this->children))>0)
293 | {
294 | return $this->children[$count-1];
295 | }
296 | return null;
297 | }
298 |
299 | // returns the next sibling of node
300 | function next_sibling()
301 | {
302 | if ($this->parent===null)
303 | {
304 | return null;
305 | }
306 |
307 | $idx = 0;
308 | $count = count($this->parent->children);
309 | while ($idx<$count && $this!==$this->parent->children[$idx])
310 | {
311 | ++$idx;
312 | }
313 | if (++$idx>=$count)
314 | {
315 | return null;
316 | }
317 | return $this->parent->children[$idx];
318 | }
319 |
320 | // returns the previous sibling of node
// returns the previous sibling of node, or null when there is none
function prev_sibling()
{
    if ($this->parent===null) return null;

    // Locate this node among the parent's children.
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;

    // Not listed among the children at all: there is no sibling to report
    // (mirrors next_sibling(); previously this fell through and returned the
    // parent's last child).
    if ($idx===$count) return null;

    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}
331 |
332 | // function to locate a specific ancestor tag in the path to the root.
333 | function find_ancestor_tag($tag)
334 | {
335 | global $debug_object;
336 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
337 |
338 | // Start by including ourselves in the comparison.
339 | $returnDom = $this;
340 |
341 | while (!is_null($returnDom))
342 | {
343 | if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }
344 |
345 | if ($returnDom->tag == $tag)
346 | {
347 | break;
348 | }
349 | $returnDom = $returnDom->parent;
350 | }
351 | return $returnDom;
352 | }
353 |
354 | // get dom node's inner html
355 | function innertext()
356 | {
357 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
358 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
359 |
360 | $ret = '';
361 | foreach ($this->nodes as $n)
362 | $ret .= $n->outertext();
363 | return $ret;
364 | }
365 |
366 | // get dom node's outer text (with tag)
// get dom node's outer text (with tag)
// Resolution order: root node -> its inner text; an explicit outer-text
// override; raw text for text-like nodes; otherwise begin tag + inner
// content + end tag.
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    // isset() guards both the BEGIN index and the node slot, so a node without
    // begin information renders no tag instead of raising an index warning.
    if ($this->dom && isset($this->_[HDOM_INFO_BEGIN], $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = "";
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag - a proper closing tag ("</tag>"), not a bare "tag>"
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
425 |
426 | // get dom node's plain text
427 | function text()
428 | {
429 | if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
430 | switch ($this->nodetype)
431 | {
432 | case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
433 | case HDOM_TYPE_COMMENT: return '';
434 | case HDOM_TYPE_UNKNOWN: return '';
435 | }
436 | if (strcasecmp($this->tag, 'script')===0) return '';
437 | if (strcasecmp($this->tag, 'style')===0) return '';
438 |
439 | $ret = '';
440 | // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
441 | // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
442 | // WHY is this happening?
443 | if (!is_null($this->nodes))
444 | {
445 | foreach ($this->nodes as $n)
446 | {
447 | $ret .= $this->convert_text($n->text());
448 | }
449 |
450 | // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
451 | if ($this->tag == "span")
452 | {
453 | $ret .= $this->dom->default_span_text;
454 | }
455 |
456 |
457 | }
458 | return $ret;
459 | }
460 |
// get dom node's inner text with the CDATA wrapper markers removed
function xmltext()
{
    $ret = $this->innertext();
    // Strip the opening marker case-insensitively and the closing marker
    // literally, leaving only the wrapped content. (The search strings had
    // been reduced to '' which made this method a no-op.)
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
468 |
469 | // build node's text with tag
470 | function makeup()
471 | {
472 | // text, comment, unknown
473 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
474 |
475 | $ret = '<'.$this->tag;
476 | $i = -1;
477 |
478 | foreach ($this->attr as $key=>$val)
479 | {
480 | ++$i;
481 |
482 | // skip removed attribute
483 | if ($val===null || $val===false)
484 | continue;
485 |
486 | $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
487 | //no value attr: nowrap, checked selected...
488 | if ($val===true)
489 | $ret .= $key;
490 | else {
491 | switch ($this->_[HDOM_INFO_QUOTE][$i])
492 | {
493 | case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
494 | case HDOM_QUOTE_SINGLE: $quote = '\''; break;
495 | default: $quote = '';
496 | }
497 | $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
498 | }
499 | }
500 | $ret = $this->dom->restore_noise($ret);
501 | return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
502 | }
503 |
504 | // find elements by css selector
505 | //PaperG - added ability for find to lowercase the value of the selector.
506 | function find($selector, $idx=null, $lowercase=false)
507 | {
508 | $selectors = $this->parse_selector($selector);
509 | if (($count=count($selectors))===0) return array();
510 | $found_keys = array();
511 |
512 | // find each selector
513 | for ($c=0; $c<$count; ++$c)
514 | {
515 | // The change on the below line was documented on the sourceforge code tracker id 2788009
516 | // used to be: if (($levle=count($selectors[0]))===0) return array();
517 | if (($levle=count($selectors[$c]))===0) return array();
518 | if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
519 |
520 | $head = array($this->_[HDOM_INFO_BEGIN]=>1);
521 |
522 | // handle descendant selectors, no recursive!
523 | for ($l=0; $l<$levle; ++$l)
524 | {
525 | $ret = array();
526 | foreach ($head as $k=>$v)
527 | {
528 | $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
529 | //PaperG - Pass this optional parameter on to the seek function.
530 | $n->seek($selectors[$c][$l], $ret, $lowercase);
531 | }
532 | $head = $ret;
533 | }
534 |
535 | foreach ($head as $k=>$v)
536 | {
537 | if (!isset($found_keys[$k]))
538 | {
539 | $found_keys[$k] = 1;
540 | }
541 | }
542 | }
543 |
544 | // sort keys
545 | ksort($found_keys);
546 |
547 | $found = array();
548 | foreach ($found_keys as $k=>$v)
549 | $found[] = $this->dom->nodes[$k];
550 |
551 | // return nth-element or array
552 | if (is_null($idx)) return $found;
553 | else if ($idx<0) $idx = count($found) + $idx;
554 | return (isset($found[$idx])) ? $found[$idx] : null;
555 | }
556 |
557 | // seek for given conditions
558 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
559 | protected function seek($selector, &$ret, $lowercase=false)
560 | {
561 | global $debug_object;
562 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
563 |
564 | list($tag, $key, $val, $exp, $no_key) = $selector;
565 |
566 | // xpath index
567 | if ($tag && $key && is_numeric($key))
568 | {
569 | $count = 0;
570 | foreach ($this->children as $c)
571 | {
572 | if ($tag==='*' || $tag===$c->tag) {
573 | if (++$count==$key) {
574 | $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
575 | return;
576 | }
577 | }
578 | }
579 | return;
580 | }
581 |
582 | $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
583 | if ($end==0) {
584 | $parent = $this->parent;
585 | while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
586 | $end -= 1;
587 | $parent = $parent->parent;
588 | }
589 | $end += $parent->_[HDOM_INFO_END];
590 | }
591 |
592 | for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
593 | $node = $this->dom->nodes[$i];
594 |
595 | $pass = true;
596 |
597 | if ($tag==='*' && !$key) {
598 | if (in_array($node, $this->children, true))
599 | $ret[$i] = 1;
600 | continue;
601 | }
602 |
603 | // compare tag
604 | if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
605 | // compare key
606 | if ($pass && $key) {
607 | if ($no_key) {
608 | if (isset($node->attr[$key])) $pass=false;
609 | } else {
610 | if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
611 | }
612 | }
613 | // compare value
614 | if ($pass && $key && $val && $val!=='*') {
615 | // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
616 | if ($key == "plaintext") {
617 | // $node->plaintext actually returns $node->text();
618 | $nodeKeyValue = $node->text();
619 | } else {
620 | // this is a normal search, we want the value of that attribute of the tag.
621 | $nodeKeyValue = $node->attr[$key];
622 | }
623 | if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
624 |
625 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
626 | if ($lowercase) {
627 | $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
628 | } else {
629 | $check = $this->match($exp, $val, $nodeKeyValue);
630 | }
631 | if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}
632 |
633 | // handle multiple class
634 | if (!$check && strcasecmp($key, 'class')===0) {
635 | foreach (explode(' ',$node->attr[$key]) as $k) {
636 | // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
637 | if (!empty($k)) {
638 | if ($lowercase) {
639 | $check = $this->match($exp, strtolower($val), strtolower($k));
640 | } else {
641 | $check = $this->match($exp, $val, $k);
642 | }
643 | if ($check) break;
644 | }
645 | }
646 | }
647 | if (!$check) $pass = false;
648 | }
649 | if ($pass) $ret[$i] = 1;
650 | unset($node);
651 | }
652 | // It's passed by reference so this is actually what this function returns.
653 | if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
654 | }
655 |
// Test $value against $pattern using the selector operator $exp.
// '=' and '!=' compare strictly and return a bool; '^=', '$=' and '*=' return
// preg_match()'s result; any other operator yields false.
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        // prefix match on the literal pattern
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        // suffix match on the literal pattern
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        // A pattern that starts with '/' is used as a complete regex as given;
        // anything else is wrapped into a case-insensitive regex.
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
677 |
// Split a CSS-like selector string into groups of conditions.
// Returns one entry per comma-separated selector; each entry is a list of
// array($tag, $key, $val, $exp, $no_key) tuples, one per descendant level.
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    // pattern of CSS selectors, modified from mootools
    // Paperg: Add the colon to the attribute, so that it properly finds like google does.
    // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
    // The optional @ after \[ means an attribute specifier may start with an @ sign that is NOT captured.
    // The hyphen sits LAST in each character class: "\w-:" is rejected by
    // PCRE2 (PHP >= 7.3) as an invalid range, which made every find() fail.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        // The explicit cast keeps the historical result (a missing key becomes
        // '') without passing null to strtolower(), deprecated since PHP 8.1.
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        //elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // a comma closes the current selector and starts the next one
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
725 |
726 | function __get($name)
727 | {
728 | if (isset($this->attr[$name]))
729 | {
730 | return $this->convert_text($this->attr[$name]);
731 | }
732 | switch ($name)
733 | {
734 | case 'outertext': return $this->outertext();
735 | case 'innertext': return $this->innertext();
736 | case 'plaintext': return $this->text();
737 | case 'xmltext': return $this->xmltext();
738 | default: return array_key_exists($name, $this->attr);
739 | }
740 | }
741 |
742 | function __set($name, $value)
743 | {
744 | global $debug_object;
745 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
746 |
747 | switch ($name)
748 | {
749 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
750 | case 'innertext':
751 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
752 | return $this->_[HDOM_INFO_INNER] = $value;
753 | }
754 | if (!isset($this->attr[$name]))
755 | {
756 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
757 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
758 | }
759 | $this->attr[$name] = $value;
760 | }
761 |
762 | function __isset($name)
763 | {
764 | switch ($name)
765 | {
766 | case 'outertext': return true;
767 | case 'innertext': return true;
768 | case 'plaintext': return true;
769 | }
770 | //no value attr: nowrap, checked selected...
771 | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
772 | }
773 |
774 | function __unset($name) {
775 | if (isset($this->attr[$name]))
776 | unset($this->attr[$name]);
777 | }
778 |
779 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
780 | function convert_text($text)
781 | {
782 | global $debug_object;
783 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
784 |
785 | $converted_text = $text;
786 |
787 | $sourceCharset = "";
788 | $targetCharset = "";
789 |
790 | if ($this->dom)
791 | {
792 | $sourceCharset = strtoupper($this->dom->_charset);
793 | $targetCharset = strtoupper($this->dom->_target_charset);
794 | }
795 | if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
796 |
797 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
798 | {
799 | // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
800 | if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
801 | {
802 | $converted_text = $text;
803 | }
804 | else
805 | {
806 | $converted_text = iconv($sourceCharset, $targetCharset, $text);
807 | }
808 | }
809 |
810 | // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
811 | if ($targetCharset == 'UTF-8')
812 | {
813 | if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
814 | {
815 | $converted_text = substr($converted_text, 3);
816 | }
817 | if (substr($converted_text, -3) == "\xef\xbb\xbf")
818 | {
819 | $converted_text = substr($converted_text, 0, -3);
820 | }
821 | }
822 |
823 | return $converted_text;
824 | }
825 |
826 | /**
827 | * Returns true if $string is valid UTF-8 and false otherwise.
828 | *
829 | * @param mixed $str String to be tested
830 | * @return boolean
831 | */
static function is_utf8($str)
{
    $c=0; $b=0;
    $bits=0;
    $len=strlen($str);
    for($i=0; $i<$len; $i++)
    {
        $c=ord($str[$i]);
        // Every byte with the high bit set needs checking. 0x80 (128) itself
        // is a continuation byte and can never start a sequence, so the
        // comparison must be inclusive.
        if($c >= 128)
        {
            // Work out the sequence length announced by the lead byte.
            if(($c >= 254)) return false;
            elseif($c >= 252) $bits=6;
            elseif($c >= 248) $bits=5;
            elseif($c >= 240) $bits=4;
            elseif($c >= 224) $bits=3;
            elseif($c >= 192) $bits=2;
            else return false; // stray continuation byte (0x80-0xBF)
            // The announced sequence must fit inside the string.
            if(($i+$bits) > $len) return false;
            // Every following byte must be a continuation byte (0x80-0xBF).
            while($bits > 1)
            {
                $i++;
                $b=ord($str[$i]);
                if($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
861 | /*
862 | function is_utf8($string)
863 | {
864 | //this is buggy
865 | return (utf8_encode(utf8_decode($string)) == $string);
866 | }
867 | */
868 |
869 | /**
870 | * Function to try a few tricks to determine the displayed size of an img on the page.
871 | * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
872 | *
873 | * @author John Schlick
874 | * @version April 19 2012
875 | * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
876 | */
// Looks at the tag's width/height attributes first and falls back to pixel
// values in the inline style; each dimension that cannot be determined is
// reported as -1. Returns false for anything that is not an img tag.
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    // -1 marks "unknown". Key order matches the array this method has always returned.
    $size = array('height' => -1, 'width' => -1);

    // A width/height attribute on the tag itself wins over the inline style.
    foreach (array('width', 'height') as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $declarations[$match[1]] = $match[2];
        }

        foreach (array('width', 'height') as $dimension)
        {
            // Only fill in dimensions that are still unknown and that the style declares.
            if ($size[$dimension] != -1 || !isset($declarations[$dimension]))
            {
                continue;
            }

            // The value capture is greedy and may carry trailing whitespace
            // ("100px ;"), which would hide the unit from the check below.
            $value = rtrim($declarations[$dimension]);

            // Only pixel values are understood.
            if (strtolower(substr($value, -2)) != 'px')
            {
                continue;
            }

            $pixels = substr($value, 0, -2);
            // Compare against false explicitly: filter_var() returns int 0 for
            // "0", which a plain truthiness test would reject.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $size;
}
956 |
957 | // camel naming conventions
/**
 * Camel-case alias that exposes this node's attribute collection.
 *
 * @return mixed Whatever is currently stored in $this->attr.
 */
function getAllAttributes()
{
    $all = $this->attr;
    return $all;
}
/**
 * Camel-case alias for reading one attribute; delegates to __get().
 *
 * @param mixed $name Attribute name, passed to __get() unchanged.
 * @return mixed The value __get() returns for $name.
 */
function getAttribute($name)
{
    $value = $this->__get($name);
    return $value;
}
/**
 * Camel-case alias for writing one attribute; delegates to __set().
 *
 * @param mixed $name  Attribute name, passed to __set() unchanged.
 * @param mixed $value New value, passed to __set() unchanged.
 * @return void The result of __set() is deliberately not returned.
 */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}
/**
 * Camel-case alias for testing an attribute; delegates to __isset().
 *
 * @param mixed $name Attribute name, passed to __isset() unchanged.
 * @return mixed The value __isset() returns for $name.
 */
function hasAttribute($name)
{
    $present = $this->__isset($name);
    return $present;
}
/**
 * Camel-case alias for dropping an attribute.
 *
 * Implemented by handing null to __set() for the given name; how __set()
 * treats a null value is defined elsewhere in this class.
 *
 * @param mixed $name Attribute name, passed to __set() unchanged.
 * @return void
 */
function removeAttribute($name)
{
    $cleared = null;
    $this->__set($name, $cleared);
}
/**
 * Camel-case alias: look up the first match for the selector "#<id>".
 *
 * @param mixed $id Identifier appended after '#' to form the selector.
 * @return mixed Whatever find() returns for index 0.
 */
function getElementById($id)
{
    $selector = '#' . $id;
    return $this->find($selector, 0);
}
/**
 * Camel-case alias: run find() with the selector "#<id>".
 *
 * @param mixed $id  Identifier appended after '#' to form the selector.
 * @param mixed $idx Index forwarded to find(); defaults to null.
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementsById($id, $idx=null)
{
    $selector = '#' . $id;
    return $this->find($selector, $idx);
}
/**
 * Camel-case alias: first find() match using the tag name as the selector.
 *
 * @param mixed $name Selector passed to find() unchanged.
 * @return mixed Whatever find() returns for index 0.
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);
    return $first;
}
/**
 * Camel-case alias: run find() using the tag name as the selector.
 *
 * @param mixed $name Selector passed to find() unchanged.
 * @param mixed $idx  Index forwarded to find(); defaults to null.
 * @return mixed Whatever find() returns for that selector and index.
 */
function getElementsByTagName($name, $idx=null)
{
    $matches = $this->find($name, $idx);
    return $matches;
}
/**
 * Camel-case alias for parent(), called with no arguments.
 *
 * @return mixed Whatever parent() returns.
 */
function parentNode()
{
    $up = $this->parent();
    return $up;
}
/**
 * Camel-case alias for children().
 *
 * @param mixed $idx Index forwarded to children(); defaults to -1.
 * @return mixed Whatever children() returns for $idx.
 */
function childNodes($idx=-1)
{
    $kids = $this->children($idx);
    return $kids;
}
/**
 * Camel-case alias for first_child().
 *
 * @return mixed Whatever first_child() returns.
 */
function firstChild()
{
    $first = $this->first_child();
    return $first;
}
/**
 * Camel-case alias for last_child().
 *
 * @return mixed Whatever last_child() returns.
 */
function lastChild()
{
    $last = $this->last_child();
    return $last;
}
/**
 * Camel-case alias for next_sibling().
 *
 * @return mixed Whatever next_sibling() returns.
 */
function nextSibling()
{
    $following = $this->next_sibling();
    return $following;
}
/**
 * Camel-case alias for prev_sibling().
 *
 * @return mixed Whatever prev_sibling() returns.
 */
function previousSibling()
{
    $preceding = $this->prev_sibling();
    return $preceding;
}
/**
 * Camel-case alias for has_child().
 *
 * @return mixed Whatever has_child() returns.
 */
function hasChildNodes()
{
    $answer = $this->has_child();
    return $answer;
}
/**
 * Camel-case alias that exposes this node's tag.
 *
 * @return mixed Whatever is currently stored in $this->tag.
 */
function nodeName()
{
    $tagName = $this->tag;
    return $tagName;
}
/**
 * Camel-case alias for attaching a node beneath this one.
 *
 * Calls $node->parent($this) and hands the same node back. The actual
 * re-parenting is done by parent(), which is defined elsewhere in this class.
 *
 * @param mixed $node Node object exposing a parent() method.
 * @return mixed The $node argument, returned after parent() has been called on it.
 */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
976 |
977 | }
978 |
979 | /**
980 | * simple html dom parser
981 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
982 | * Paperg - change $size from protected to public so we can easily access it
983 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
984 | *
985 | * @package PlaceLocalInclude
986 | */
987 | class simple_html_dom
988 | {
// ---------------------------------------------------------------------
// Public parser state. These are filled in by methods defined elsewhere
// in this class.
// ---------------------------------------------------------------------
public $root = null;
public $nodes = array();
public $callback = null;
public $lowercase = false;

// Size of the text at the moment parsing started.
public $original_size;
// Public (rather than protected) so callers can read it directly.
public $size;

// ---------------------------------------------------------------------
// Internal parser state.
// ---------------------------------------------------------------------
protected $pos;
protected $doc;
protected $char;
protected $cursor;
protected $parent;
protected $noise = array();

// Character sets used while tokenizing.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Child nodes read these, which is why they have to be public.
public $_charset = '';
public $_target_charset = '';

// Default replacement texts; both start out empty.
protected $default_br_text = "";
public $default_span_text = "";

// Tag tables are keyed by tag name so membership can be tested with isset()
// instead of in_array() (reported as roughly 30% faster).
protected $self_closing_tags = array(
    'img'=>1,
    'br'=>1,
    'input'=>1,
    'meta'=>1,
    'link'=>1,
    'hr'=>1,
    'base'=>1,
    'embed'=>1,
    'spacer'=>1
);
protected $block_tags = array(
    'root'=>1,
    'body'=>1,
    'form'=>1,
    'div'=>1,
    'span'=>1,
    'table'=>1
);

// Tags whose closing tag is optional, each mapped to a set of tag names.
// Known sourceforge issue #2977341: B tags that are never closed make us
// return everything up to the end of the document.
protected $optional_closing_tags = array(
    'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
    'th'=>array('th'=>1),
    'td'=>array('td'=>1),
    'li'=>array('li'=>1),
    'dt'=>array('dt'=>1, 'dd'=>1),
    'dd'=>array('dd'=>1, 'dt'=>1),
    'dl'=>array('dd'=>1, 'dt'=>1),
    'p'=>array('p'=>1),
    'nobr'=>array('nobr'=>1),
    'b'=>array('b'=>1),
    'option'=>array('option'=>1),
);
1030 |
/**
 * Create a parser and, when $str is non-empty, load a document immediately.
 *
 * If $str starts with "http://" (case-insensitive) or names an existing file,
 * it is handed to load_file(); otherwise it is treated as markup and handed
 * to load() together with $lowercase, $stripRN, $defaultBRText and
 * $defaultSpanText. Note that load_file() receives only $str.
 *
 * @param mixed $str             Markup, file path, or http:// URL; null/empty loads nothing.
 * @param mixed $lowercase       Forwarded to load() for string input.
 * @param mixed $forceTagsClosed When false, the optional-closing-tag table is emptied
 *                               (i.e. the markup is trusted). Default is to NOT trust it.
 * @param mixed $target_charset  Stored in $this->_target_charset.
 * @param mixed $stripRN         Forwarded to load() for string input.
 * @param mixed $defaultBRText   Forwarded to load() for string input.
 * @param mixed $defaultSpanText Forwarded to load() for string input.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // The table lives in $optional_closing_tags; the previous code wrote to an
    // undeclared "optional_closing_array" property, so the flag did nothing.
    // It is applied before loading so that a document passed to the
    // constructor is parsed with the requested setting as well.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        if (preg_match("/^http:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1050 |
/**
 * Destructor: hands off to clear() when the object is destroyed.
 */
function __destruct() {
    $this->clear();
}
1055 |
1056 | // load html from string
1057 | function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
1058 | {
1059 | global $debug_object;
1060 |
1061 | // prepare
1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1063 | // strip out cdata
1064 | $this->remove_noise("''is", true);
1065 | // strip out comments
1066 | $this->remove_noise("''is");
1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1068 | // Script tags removal now precedes style tag removal.
1069 | // strip out