├── README.md
├── URLResolver.php
├── composer.json
├── lib
└── simple_html_dom.php
└── tools
└── resolve_url.php
/README.md:
--------------------------------------------------------------------------------
1 | Welcome to URLResolver.php
2 | ====================================================
3 |
4 | URLResolver.php is a PHP class that attempts to resolve URLs to a final,
5 | canonical link. On the web today, link shorteners, tracking codes and more can
6 | result in many different links that ultimately point to the same resource.
7 | By following HTTP redirects and parsing web pages for open graph and canonical
8 | URLs, URLResolver.php attempts to solve this issue.
9 |
10 | ## Patterns Recognized
11 |
12 | - Follows 301, 302, and 303 redirects found in HTTP headers
13 | - Follows [Open Graph] URL <meta> tags found in web page <head>
14 | - Follows [Canonical] URL <link> tags found in web page <head>
15 | - Aborts download quickly if content type is not an HTML page
16 |
17 | I am open to additional suggestions for improvement.
18 |
19 | ## Usage
20 |
21 | Resolving a URL can be as easy as:
22 |
23 | ``` php
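24 | <?php require_once('URLResolver.php');
25 |
26 | $resolver = new mattwright\URLResolver();
27 | print $resolver->resolveURL('http://goo.gl/0GMP1')->getURL();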
28 | ```
29 |
30 | If you installed this library using composer, you would change the first line above to:
31 |
32 | ``` php
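33 | <?php require_once('vendor/autoload.php');
34 | ```
35 |
36 | However, in most cases you will want to perform a little extra setup. The
37 | following code sets a user agent to identify your crawler (otherwise the default
38 | will be used) and also designates a temporary file that can be used for storing
39 | cookies during the session. Some web sites will test the browser for cookie
40 | support, so this will enhance your results.
41 |
42 | ``` php
43 | <?php require_once('URLResolver.php');
44 | $resolver = new mattwright\URLResolver();
45 |
46 | # Identify your crawler (otherwise the default will be used)
47 | $resolver->setUserAgent('Mozilla/5.0 (compatible; YourAppName/1.0; +http://www.example.com)');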
48 |
49 | # Designate a temporary file that will store cookies during the session.
50 | # Some web sites test the browser for cookie support, so this enhances results.
51 | $resolver->setCookieJar('/tmp/url_resolver.cookies');
52 |
53 | # resolveURL() returns an object that allows for additional information.
54 | $url = 'http://goo.gl/0GMP1';
55 | $url_result = $resolver->resolveURL($url);
56 |
57 | # Test to see if any error occurred while resolving the URL:
58 | if ($url_result->didErrorOccur()) {
59 | print "there was an error resolving $url:\n ";
60 | print $url_result->getErrorMessageString();
61 | }
62 |
63 | # Otherwise, print out the resolved URL. The [HTTP status code] will tell you
64 | # additional information about the success/failure. For instance, if the
65 | # link resulted in a 404 Not Found error, it would print '404: http://...'
66 | # The successful status code is 200.
67 | else {
68 | print $url_result->getHTTPStatusCode();
69 | print ': ';
70 | print $url_result->getURL();
71 | }
72 | ```
73 |
74 | ## Installation and Requirements
75 |
76 | #### License
77 | URLResolver.php is licensed under the [MIT License], viewable in the source code.
78 |
79 | #### Install with Composer
80 | [composer require mattwright/urlresolver](https://packagist.org/packages/mattwright/urlresolver)
81 |
82 | #### Download
83 | URLResolver.php as a [.tar.gz](https://github.com/mattwright/URLResolver.php/tarball/master) or [.zip](https://github.com/mattwright/URLResolver.php/zipball/master) file.
84 |
85 | #### Requirements
86 | - The [curl](http://php.net/manual/en/book.curl.php) extension must be installed as part of PHP
87 | - [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/) is required and included with the download.
88 |
89 | ## API
90 |
91 | ### URLResolver()
92 |
93 | `$resolver = new mattwright\URLResolver();`
94 | Create the URL resolver object that you call additional methods on.
95 |
96 | `$resolver->resolveURL($url);`
97 | $url is the link you want to resolve.
98 | Returns a [URLResult] object that contains the final, resolved URL.
99 |
100 | `$resolver->setUserAgent($user_agent);`
101 | Pass in a string that is sent to each web server to identify your crawler.
102 |
103 | `$resolver->setCookieJar($cookie_file); # Defaults to disable cookies`
104 | *** This file will be removed at the end of each resolveURL() call. ***
105 | Pass in the path to a file used to store cookies during each resolveURL() call.
106 | If no cookie file is set, cookies will be disabled and results may suffer.
107 | This file must not already exist.
108 | If it does, pass _true_ as second argument to enable overwrite.
109 |
110 | `$resolver->setMaxRedirects($max_redirects); # Defaults to 10`
111 | Set the maximum number of URL requests to attempt during each resolveURL() call.
112 |
113 | `$resolver->setMaxResponseDataSize($max_bytes); # Defaults to 120000`
114 | Pass in an integer specifying the maximum data to download per request.
115 | Multiple URL requests may occur during each resolveURL() call.
116 | Setting this too low may limit the usefulness of results (default 120000).
117 |
118 | `$resolver->setRequestTimeout($num_seconds); # Defaults to 30`
119 | Set the maximum amount of time, in seconds, any URL request can take.
120 | Multiple URL requests may occur during each resolveURL() call.
121 |
122 | `$resolver->setPreferCanonicalURL($value); # Defaults to false`
123 | Set $value to _true_ to prioritize canonical URL over Open Graph URL.
124 |
125 | `$resolver->isDebugMode($value); # Defaults to false`
126 | Set $value to _true_ to enable debug mode and _false_ to disable (the default).
127 | This will print out each link visited, along with status codes and link types.
128 |
129 | ### URLResolverResult()
130 |
131 | `$url_result = $resolver->resolveURL($url);`
132 | Retrieve the URLResolverResult() object representing the resolution of $url.
133 |
134 | `$url_result->getURL();`
135 | This is the best resolved URL we could obtain after following redirects.
136 |
137 | `$url_result->getHTTPStatusCode();`
138 | Returns the integer [HTTP status code] for the resolved URL.
139 | Examples: 200 - OK (success), 404 - Not Found, 301 - Moved Permanently, ...
140 |
141 | `$url_result->hasSuccessHTTPStatus();`
142 | Returns _true_ if the [HTTP status code] for the resolved URL is 200.
143 |
144 | `$url_result->hasRedirectHTTPStatus();`
145 | Returns _true_ if the [HTTP status code] for the resolved URL is 301, 302, or 303.
146 |
147 | `$url_result->getContentType();`
148 | Returns the value of the Content-Type [HTTP header] for the resolved URL.
149 | If header not provided, _null_ is returned. Examples: text/html, image/jpeg, ...
150 |
151 | `$url_result->getContentLength();`
152 | Returns the size of the fetched URL in bytes for the resolved URL.
153 | Determined only by the Content-Length [HTTP header]. _null_ returned otherwise.
154 |
155 | `$url_result->isOpenGraphURL();`
156 | Returns _true_ if resolved URL was marked as the Open Graph URL (og:url)
157 |
158 | `$url_result->isCanonicalURL();`
159 | Returns _true_ if resolved URL was marked as the Canonical URL (rel=canonical)
160 |
161 | `$url_result->isStartingURL();`
162 | Returns _true_ if resolved URL was also the URL you passed to resolveURL().
163 |
164 | `$url_result->didErrorOccur();`
165 | Returns _true_ if an error occurred while resolving the URL.
166 | If this returns _false_, $url_result is guaranteed to have a status code.
167 |
168 | `$url_result->getErrorMessageString();`
169 | Returns an explanation of what went wrong if didErrorOccur() returns _true_.
170 |
171 | `$url_result->didConnectionFail();`
172 | Returns _true_ if there was a connection error (no header or no body returned).
173 | May indicate a situation where you are more likely to try at least once more.
174 | If this returns _true_, didErrorOccur() will return _true_ as well.
175 |
176 | ## Changelog
177 | - v2.0 - January 17, 2019
178 | - Breaking change: namespaced the library for use with composer psr-4
179 | - Add requested option to prefer canonical URL over Open Graph
180 | - Minor fixes / improvements
181 | - Upgrade simple_html_dom to 1.8.1
182 |
183 | - v1.1 - June 3, 2014
184 | - Support http redirect code 303
185 |
186 | - v1.0 - December 3, 2011
187 | - Initial release supports http header redirects, og:url and rel=canonical
188 |
189 | [curl]: http://php.net/manual/en/book.curl.php
190 | [PHP Simple HTML DOM Parser]: http://simplehtmldom.sourceforge.net/
191 | [Open Graph]: https://developers.facebook.com/docs/opengraph/
192 | [Canonical]: http://www.google.com/support/webmasters/bin/answer.py?answer=139394
193 | [HTTP status code]: http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
194 | [HTTP header]: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html
195 | [MIT License]: http://en.wikipedia.org/wiki/MIT_License
196 |
--------------------------------------------------------------------------------
/URLResolver.php:
--------------------------------------------------------------------------------
1 | user_agent = $user_agent_string;
50 | }
51 | else {
52 | throw new Exception('URLResolver->setUserAgent() must be called with a string');
53 | }
54 |
55 | $this->closeCurl(); # Reset curl with new settings...
56 | }
57 |
# Designate the file used to store cookies during each resolveURL() call.
#
# The file is created (or truncated) right away to prove that it is writable.
# Note that closeCurl() deletes the cookie file whenever it exists, so the file
# does not persist between calls.
#
# $cookie_jar_filename - path of the cookie file
# $overwrite           - pass true to allow an already existing file to be
#                        overwritten (and later deleted)
#
# Throws an Exception if the file already exists and $overwrite is false, or
# if the file cannot be written.
public function setCookieJar($cookie_jar_filename, $overwrite = false) {
    if (!$overwrite && file_exists($cookie_jar_filename)) {
        throw new Exception("URLResolver->setCookieJar() found existing file $cookie_jar_filename.\nPass true as second argument to overwrite and delete.");
    }

    # Writing an empty string verifies that we have permission to use this path.
    if (file_put_contents($cookie_jar_filename, '') === false) {
        throw new Exception("URLResolver->setCookieJar() could not write to $cookie_jar_filename");
    }

    $this->cookie_jar = $cookie_jar_filename;

    $this->closeCurl(); # Reset curl with new settings...
}
71 |
# Set the maximum time, in seconds, that any single URL request may take.
# Only whole numbers (or numeric strings holding whole numbers) are accepted;
# anything else raises an Exception.
public function setRequestTimeout($seconds) {
    $is_whole_number = is_numeric($seconds) && (int)$seconds == $seconds;

    if (!$is_whole_number) {
        throw new Exception('URLResolver->setRequestTimeout() must be called with an integer');
    }

    $this->request_timeout = (int)$seconds;

    # Drop the current curl handle so the next request picks up the new timeout.
    $this->closeCurl();
}
82 |
# Set the maximum number of URL requests attempted during each resolveURL() call.
#
# $max_redirects - a whole number (or numeric string holding a whole number)
#
# Throws an Exception for anything else. Fractional values such as 2.5 are
# rejected rather than silently truncated, which matches the validation done
# by setRequestTimeout() and the wording of the error message.
public function setMaxRedirects($max_redirects) {
    if (!is_numeric($max_redirects) || (int)$max_redirects != $max_redirects) {
        throw new Exception('URLResolver->setMaxRedirects() must be called with an integer');
    }

    $this->max_redirects = (int)$max_redirects;

    $this->closeCurl(); # Reset curl with new settings...
}
93 |
# Set the maximum number of bytes downloaded per request (headers plus body).
#
# $max_bytes - a whole number (or numeric string holding a whole number)
#
# Throws an Exception for anything else. Fractional values are rejected rather
# than silently truncated, which matches the validation done by
# setRequestTimeout() and the wording of the error message.
public function setMaxResponseDataSize($max_bytes) {
    if (!is_numeric($max_bytes) || (int)$max_bytes != $max_bytes) {
        throw new Exception('URLResolver->setMaxResponseDataSize() must be called with an integer');
    }

    $this->max_response_data_size = (int)$max_bytes;

    $this->closeCurl(); # Reset curl with new settings...
}
104 |
# Choose whether a rel=canonical URL wins over an og:url when a page
# declares both and they differ. Any truthy $value enables the preference.
public function setPreferCanonicalURL($value) {
    $this->prefer_canonical_url = (bool)$value;
}
108 |
# Combined getter/setter for debug mode.
#
# $value - optional; when given (non-null) debug mode is switched on for a
#          truthy value and off for a falsy one. When omitted, the current
#          setting is left untouched.
#
# Returns the debug setting in effect after the call. In debug mode
# resolveURL() prints each link visited along with its status.
public function isDebugMode($value = null) {
    if (isset($value)) { $this->is_debug = $value ? true : false; }
    return $this->is_debug;
}
113 |
# Resolve $url by repeatedly fetching it and following whatever target each
# fetch reports (HTTP redirect, og:url or rel=canonical), for at most
# max_redirects requests.
#
# Returns the URLResolverResult picked by resolveURLResults(), or the failing
# result as soon as one fetch reports an error.
public function resolveURL($url) {
    $starting_url = $url;

    # How the URL we are about to fetch was discovered on the previous page.
    $found_via_open_graph = false;
    $found_via_canonical = false;

    $visited = array();
    for ($hop = 0; $hop < $this->max_redirects; $hop++) {
        # Debug mode: echo every URL before it is requested.
        if ($this->is_debug) {
            if ($hop) { print ' |- '; }
            print $url;
        }

        $current = $this->fetchURLResult($url);

        # The first request, or any request for the same URL, is the starting URL.
        if ($hop == 0 || $url == $starting_url) { $current->isStartingURL(true); }

        # Carry over the markup flags from the previous page, but never replace
        # a true value found on this page with a false one.
        if (!$current->isOpenGraphURL()) { $current->isOpenGraphURL($found_via_open_graph); }
        if (!$current->isCanonicalURL()) { $current->isCanonicalURL($found_via_canonical); }

        # Debug mode: finish the line with the status of the fetched URL.
        if ($this->is_debug) {
            print ' ' . $current->debugStatus() . "\n";
        }

        # Any error ends the resolution immediately with the failing result.
        if ($current->didErrorOccur()) {
            if ($this->is_debug) {
                print ' |! ' . $current->getURL() . ' ' . $current->debugStatus() . "\n";
                print ' \-> ' . $current->getErrorMessageString() . "\n";
                print "\n";
            }

            $this->closeCurl();
            return $current;
        }

        $next_url = $current->getRedirectTarget();
        $has_next_url = isset($next_url);
        $times_next_seen = 0;

        foreach ($visited as $earlier) {
            # Same URL and same status as an earlier fetch means we have looped.
            # The status is part of the test because some sites redirect back to
            # the same page after setting cookies, in which case the status
            # differs (200 instead of 301) and the page still has to be inspected.
            if ($earlier->getURL() == $current->getURL() &&
                $earlier->getHTTPStatusCode() == $current->getHTTPStatusCode()) {
                return $this->resolveURLResults($visited);
            }

            if (!$has_next_url || $next_url != $earlier->getURL()) { continue; }

            # The next URL was fetched before. Stop if that happened more than
            # once, or if the earlier fetch was not a redirect (og:url or
            # rel=canonical pointing back at a page already seen).
            $times_next_seen++;
            if ($times_next_seen > 1 || !$earlier->hasRedirectHTTPStatus()) {
                return $this->resolveURLResults($visited);
            }
        }

        $visited[] = $current;

        # Nothing further to follow: pick the best result seen so far.
        if (!$has_next_url) {
            return $this->resolveURLResults($visited);
        }

        $url = $next_url;
        $found_via_open_graph = $current->redirectTargetIsOpenGraphURL();
        $found_via_canonical = $current->redirectTargetIsCanonicalURL();
    }

    return $this->resolveURLResults($visited);
}
202 |
# Pick the single best result out of every result collected by resolveURL().
#
# Preference order: og:url with status 200, rel=canonical with status 200,
# any other status 200 result, a non-redirect failure, a redirect, and finally
# the result for the initial URL. Within each category the most recently
# fetched result wins. Returns null when there are no results at all.
private function resolveURLResults($url_results) {
    if (!isset($url_results) || count($url_results) < 1) {
        return null;
    }

    $best_open_graph = $best_canonical = $best_ok = null;
    $best_failure = $best_redirect = null;

    # Walk from the last fetch to the first so the deepest match per category is kept.
    for ($idx = count($url_results) - 1; $idx >= 0; $idx--) {
        $candidate = $url_results[$idx];

        if ($candidate->hasSuccessHTTPStatus()) {
            if ($candidate->isOpenGraphURL() && !$best_open_graph) {
                $best_open_graph = $candidate;
            }
            else if ($candidate->isCanonicalURL() && !$best_canonical) {
                $best_canonical = $candidate;
            }
            else if (!$best_ok) {
                $best_ok = $candidate;
            }
        }
        else if ($candidate->hasRedirectHTTPStatus()) {
            if (!$best_redirect) {
                $best_redirect = $candidate;
            }
        }
        # Neither success nor redirect: remember it as the failure candidate.
        else if (!$best_failure) {
            $best_failure = $candidate;
        }
    }

    # Fall back to the result for the initial URL, then take the first
    # category (in order of preference) that produced a result. Ending in an
    # HTTP failure is preferred over ending in a redirect.
    $chosen = $url_results[0];
    $by_preference = array($best_open_graph, $best_canonical, $best_ok, $best_failure, $best_redirect);
    foreach ($by_preference as $preferred) {
        if (isset($preferred)) {
            $chosen = $preferred;
            break;
        }
    }

    # Reset curl (and the cookie jar) so the next resolution starts clean.
    $this->closeCurl();

    if ($this->is_debug) {
        print ' |> ' . $chosen->getURL() . ' ' . $chosen->debugStatus() . "\n\n";
    }

    return $chosen;
}
275 |
# Download $url with curl and split the response into headers and body.
#
# Returns array($headers, $body), or array(null, null) when no complete header
# block was received. The download is aborted early when the Content-Type is
# not HTML or when more than max_response_data_size bytes have arrived.
private function fetchURL($url) {
    $curl = $this->initCurl();
    curl_setopt($curl, CURLOPT_URL, $url);

    $headers = '';
    $body = '';
    $header_length = 0;
    $byte_limit = $this->max_response_data_size;

    # curl hands us the raw response (headers included) chunk by chunk.
    # Returning 0 instead of the chunk length makes curl abort the transfer.
    # $this is not used inside the closure (not importable before PHP 5.4), so
    # the content type test duplicates contentTypeFromHeader() and
    # hasHTMLContentType().
    $on_data = function($handle, $chunk) use (&$headers, &$body, &$header_length, $byte_limit) {
        $body .= $chunk;

        if ($headers == '') {
            $separator_at = strpos($body, "\r\n\r\n");
            if ($separator_at !== false) {
                $header_length = $separator_at;
                $headers = substr($body, 0, $separator_at);
                $body = substr($body, $separator_at + 4);

                # Headers are complete: skip the body of images, videos, PDFs
                # and anything else that cannot contain redirect markup.
                if (preg_match('/^\s*Content-Type:\s*([^\s;\n]+)/im', $headers, $type_match) &&
                    stripos($type_match[1], 'html') === false) {
                    return 0;
                }
            }
        }

        # Stop once the configured maximum amount of data has been received.
        if (($header_length + strlen($body)) > $byte_limit) { return 0; }

        return strlen($chunk);
    };
    curl_setopt($curl, CURLOPT_WRITEFUNCTION, $on_data);

    curl_exec($curl);

    return ($headers === '') ? array(null, null) : array($headers, $body);
}
321 |
# Turn $uri (from a Location header, og:url, rel=canonical or meta refresh)
# into an absolute URL, using $url (the page it was found on) for any missing
# scheme, host or fragment.
#
# $uri - candidate link; must start with /, www. or http(s)://
# $url - absolute URL of the page that referenced $uri
#
# Returns the qualified URL, or null when $uri does not look like a link.
private function fullyQualifyURI($uri, $url) {
    $uri = trim($uri);

    # Only use this if it looks like a URL/URI (starts with /, www., or https?://)
    # Otherwise, we won't be able to understand it.
    if (!preg_match('/^(\/|www\.|https?:\/\/)/i', $uri)) { return null; }

    # If the link is to a domain only (//host or http://host), standardize it by
    # ensuring a trailing slash. Plain paths such as /about are left alone, and
    # so are links with a query string or fragment directly after the host.
    if (preg_match('/^(\/\/|https?:\/\/)[^\/?#]+$/i', $uri)) { $uri .= '/'; }

    # If the URL is localized, such as '/path/to/file', add the protocol and host back to the start.
    if (strpos($uri, '/') === 0) {

        # If a URI starts with //, then it means there is another domain, but use same protocol
        if (strpos($uri, '//') === 0) {
            # Schemes are case-insensitive, so match HTTP:// as well as http://
            if (preg_match('/^\s*([a-z]+:)/i', $url, $matches)) {
                $uri = $matches[1] . $uri;
            }
        }

        # Otherwise, add in the entire domain as well
        else {
            if (preg_match('/^\s*([a-z]+:\/\/[^\/]+)/i', $url, $matches)) {
                $uri = $matches[1] . $uri;
            }
        }
    }

    # In the wild, finding several URLs that start with www. and no scheme. Add protocol.
    if (strpos($uri, 'www.') === 0) {
        if (preg_match('/^\s*([a-z]+:\/\/)/i', $url, $matches)) {
            $uri = $matches[1] . $uri;
        }
    }

    # If the URL had a hash fragment attached to it and the URI no longer does, we will add it back in.
    $fragment_pos = strpos($url, '#');
    if ($fragment_pos !== false && strpos($uri, '#') === false) {
        $uri .= substr($url, $fragment_pos);
    }

    return $uri;
}
365 |
# Extract the media type (the part before any ';' parameters) from the
# Content-Type line of a raw header block. Returns null if there is none.
private function contentTypeFromHeader($headers) {
    $found = preg_match('/^\s*Content-Type:\s*([^\s;\n]+)/im', $headers, $captures);

    return $found ? $captures[1] : null;
}
372 |
# Fetch $url once and describe the outcome as a URLResolverResult.
#
# The result carries the HTTP status, content type and content length taken
# from the headers. When the response points somewhere else, the redirect
# target is recorded as well:
#   - redirect status: the Location header
#   - status 200 with HTML: og:url and/or rel=canonical found in <head>, or a
#     quick <meta http-equiv="refresh"> when the page has no <head> or <body>
#
# Connection problems and unparsable responses are reported through the
# result's error flags rather than by throwing.
private function fetchURLResult($url) {
    $result = new URLResolverResult($url);

    # Attempt to fetch the headers for this URL
    list($headers, $body) = $this->fetchURL($url);
    if (!isset($headers)) {
        $result->didConnectionFail(true, 'Could not retrieve headers');
        return $result;
    }

    # Parse the headers...
    if (preg_match('/^\s*HTTP\/[\d\.]+\s+(\d+)/i', $headers, $matches)) {
        $result->setHTTPStatusCode($matches[1]);
    }
    else {
        $result->didFatalErrorOccur(true, 'HTTP status code not found');
    }

    $result->setContentType($this->contentTypeFromHeader($headers));

    if (preg_match('/^\s*Content-Length:\s*(\d+)/im', $headers, $matches)) {
        $result->setContentLength($matches[1]);
    }

    if ($result->hasRedirectHTTPStatus()) {
        # Parse the location header to determine the redirect URL...
        if (preg_match('/^\s*Location:\s*([^\r\n]+)/im', $headers, $matches)) {
            $result->setRedirectTarget($this->fullyQualifyURI($matches[1], $url));
        }

        # A redirect status code with no location header is a fatal error...
        else { $result->didFatalErrorOccur(true, 'HTTP status code indicated redirect, no location found'); }

        return $result;
    }

    else if ($result->hasSuccessHTTPStatus()) {

        # If the content type for this page is something other than HTML, we do
        # not need to fetch it. This test will catch text/html and text/xhtml, etc.
        if (!$result->hasHTMLContentType()) { return $result; }

        if (!$body) {
            $result->didConnectionFail(true, 'Web page was empty');
            return $result;
        }

        # Load the HTML DOM using PHP Simple HTML DOM
        $html_dom = $this->loadHTMLDOM($body);

        # If the DOM could not be parsed, mark it as a fatal error. Reasonable
        # HTTP redirects may be available, but this notes it didn't get everything
        if (!$html_dom) {
            $result->didFatalErrorOccur(true, 'Could not parse web page');
            $this->closeHTMLDOM();
            return $result;
        }

        # If we cannot find the <head>, then we are done processing this page.
        $head = $html_dom->find('head', 0);
        if (!isset($head)) {
            # If there is no <head> and no <body> tag, then we will look for an
            # instant <meta http-equiv="refresh"> redirect instead.
            $body_tag = $html_dom->find('body', 0);
            if (!isset($body_tag)) {
                $meta_refresh_tag = $html_dom->find('meta[http-equiv=refresh]', 0);
                if (isset($meta_refresh_tag->content) &&
                    preg_match('/^\s*(\d+)\s*;\s*URL=(.*)/i', $meta_refresh_tag->content, $matches)) {
                    # Only a refresh delay of two seconds or less counts as a redirect.
                    # (The cast keeps "!" from binding to the delay before the comparison.)
                    if ((int)$matches[1] <= 2) {
                        # The URL may be wrapped in quotes: content="0; URL='http://...'"
                        $refresh_target = trim($matches[2], " \t\n\r\0\x0B'\"");
                        $result->setRedirectTarget($this->fullyQualifyURI($refresh_target, $url));
                    }
                }
            }

            # Don't mark as failed, some pages may just not have a <head>, but rare...
            $this->closeHTMLDOM();
            return $result;
        }

        # Determine if there are any redirects in the meta/link tags (og:url or rel=canonical)
        $redirect_url = null;

        # Locate the Open Graph URL meta tag and extract URL
        $og_tag = $head->find('meta[property=og:url]', 0);
        $og_url = (isset($og_tag) && isset($og_tag->content)) ?
            $this->fullyQualifyURI($og_tag->content, $url) : null;

        if (isset($og_url)) {
            $redirect_url = $og_url;
            $result->redirectTargetIsOpenGraphURL(true);
        }

        # Open Graph takes precedence over Canonical, but it can be both...
        $canonical_tag = $head->find('link[rel=canonical]', 0);
        $canonical_url = ((isset($canonical_tag) && isset($canonical_tag->href))) ?
            $this->fullyQualifyURI($canonical_tag->href, $url) : null;

        if (isset($canonical_url)) {
            if (isset($redirect_url)) {
                if ($canonical_url == $redirect_url) {
                    $result->redirectTargetIsCanonicalURL(true);
                }

                # If setPreferCanonicalURL(true) was called, then we use it over Open Graph
                else if ($this->prefer_canonical_url) {
                    $redirect_url = $canonical_url;
                    $result->redirectTargetIsCanonicalURL(true);
                    $result->redirectTargetIsOpenGraphURL(false);
                }
            }
            else {
                $redirect_url = $canonical_url;
                $result->redirectTargetIsCanonicalURL(true);
            }
        }

        # If a redirect was found, set the target and return it
        if ($redirect_url) {
            # If the redirect URL is the same as the current URL, don't set it, but update values.
            if ($redirect_url === $url) {
                $result->isOpenGraphURL($result->redirectTargetIsOpenGraphURL());
                $result->isCanonicalURL($result->redirectTargetIsCanonicalURL());
            }
            else {
                $result->setRedirectTarget($redirect_url);
            }
        }

        $this->closeHTMLDOM();
        return $result;
    }

    # Link had some other status code besides redirect or success...
    $this->closeHTMLDOM();
    return $result;
}
512 |
# Return the shared curl handle, creating and configuring it on first use.
# The handle lives until closeCurl() is called.
private function initCurl() {
    # Reuse the handle when one is already open.
    if (isset($this->curl)) { return $this->curl; }

    $this->curl = curl_init();
    $handle = $this->curl;

    $options = array(
        array(CURLOPT_RETURNTRANSFER, true),
        array(CURLOPT_TIMEOUT, $this->request_timeout),
        array(CURLOPT_CONNECTTIMEOUT, $this->request_timeout),
    );

    # Cookies are only enabled when a cookie jar file has been designated.
    if (isset($this->cookie_jar)) {
        $options[] = array(CURLOPT_COOKIEJAR, $this->cookie_jar);
        $options[] = array(CURLOPT_COOKIEFILE, $this->cookie_jar);
    }

    # Only send a User-Agent header when one has been configured.
    if (isset($this->user_agent)) {
        $options[] = array(CURLOPT_USERAGENT, $this->user_agent);
    }

    # SSL strictness is deliberately relaxed: we only want to discover
    # redirects, and some SSL links return nothing at all otherwise.
    $options[] = array(CURLOPT_SSL_VERIFYHOST, 0);
    $options[] = array(CURLOPT_SSL_VERIFYPEER, 0);

    # Include the response headers in the output so redirects can be parsed.
    $options[] = array(CURLOPT_HEADER, true);

    foreach ($options as $option) {
        curl_setopt($handle, $option[0], $option[1]);
    }

    return $this->curl;
}
543 |
# Close the curl handle (if open) and delete the cookie jar file (if it
# exists), so the next request starts from a clean slate.
private function closeCurl() {
    if (isset($this->curl)) {
        curl_close($this->curl);
        unset($this->curl);
    }

    # No cookie jar configured: nothing left to clean up.
    if (!isset($this->cookie_jar)) { return; }

    # Emptying the cookie jar means removing the file itself.
    if (file_exists($this->cookie_jar)) {
        unlink($this->cookie_jar);
    }
}
555 |
# Parse $html_content with PHP Simple HTML DOM and return the parser object.
# Any DOM still held from a previous page is released first.
private function loadHTMLDOM($html_content) {
    # closeHTMLDOM() does nothing when no DOM is currently loaded.
    $this->closeHTMLDOM();

    $this->html_dom = new \simple_html_dom();
    $parser = $this->html_dom;
    $parser->load($html_content);

    return $parser;
}
565 |
# Release the currently loaded HTML DOM, if there is one.
private function closeHTMLDOM() {
    if (!isset($this->html_dom)) { return; }

    $this->html_dom->clear();
    unset($this->html_dom);
}
572 | }
573 |
# Describes the outcome of fetching one URL: its HTTP status, content
# information, how the URL was discovered (og:url / rel=canonical), where it
# redirects to, and whether an error occurred.
class URLResolverResult {
    private $url;
    private $status;
    private $content_type;
    private $content_length;

    private $is_starting_point = false;
    private $is_open_graph = false;
    private $is_canonical = false;

    private $redirect;
    private $redirect_is_open_graph = false;
    private $redirect_is_canonical = false;

    private $failed = false;
    private $error = false;
    private $error_message = '';

    public function __construct($url) {
        $this->url = $url;
    }

    # This is the best resolved URL we could obtain after following redirects.
    public function getURL() { return $this->url; }

    # Returns the integer [HTTP status code] for the resolved URL, or null if
    # none has been set.
    # Examples: 200: OK (success), 404: Not Found, 301: Moved Permanently, ...
    public function getHTTPStatusCode() { return $this->status; }

    # The status is stored as an integer even when it arrives as a numeric
    # string (for instance straight from a regular expression match).
    public function setHTTPStatusCode($status) { $this->status = (int)$status; }

    # Returns _true_ if the [HTTP status code] for the resolved URL is 200.
    public function hasSuccessHTTPStatus() { return ($this->status == 200); }

    # Returns _true_ if the [HTTP status code] for the resolved URL is 301, 302, or 303.
    public function hasRedirectHTTPStatus() { return ($this->status == 301 || $this->status == 302 || $this->status == 303); }

    # Returns the value of the Content-Type [HTTP header] for the resolved URL.
    # If header not provided, _null_ is returned. Examples: text/html, image/jpeg, ...
    public function getContentType() { return $this->content_type; }
    public function setContentType($type) { $this->content_type = $type; }

    # Returns _true_ if $type (or, when omitted, the stored content type)
    # contains 'html', which covers text/html, application/xhtml+xml, etc.
    public function hasHTMLContentType($type=null) {
        if (!isset($type)) { $type = $this->content_type; }

        # No content type known at all. Answer directly instead of passing
        # null to stripos(), which is deprecated as of PHP 8.1.
        if (!isset($type)) { return false; }

        return (stripos($type, 'html') !== false);
    }

    # Returns the size of the fetched URL in bytes for the resolved URL.
    # Determined only by the Content-Length [HTTP header]. _null_ returned otherwise.
    public function getContentLength() { return $this->content_length; }
    public function setContentLength($length) { $this->content_length = $length; }

    # Returns true if resolved URL was marked as the Open Graph URL (og:url)
    # Pass a non-null $value to set the flag.
    public function isOpenGraphURL($value=null) {
        if (isset($value)) { $this->is_open_graph = $value ? true : false; }
        return $this->is_open_graph;
    }

    # Returns true if resolved URL was marked as the Canonical URL (rel=canonical)
    # Pass a non-null $value to set the flag.
    public function isCanonicalURL($value=null) {
        if (isset($value)) { $this->is_canonical = $value ? true : false; }
        return $this->is_canonical;
    }

    # Returns true if resolved URL was also the URL you passed to resolveURL().
    # Pass a non-null $value to set the flag.
    public function isStartingURL($value=null) {
        if (isset($value)) { $this->is_starting_point = $value ? true : false; }
        return $this->is_starting_point;
    }

    # Returns true if an error occurred while resolving the URL.
    # This is the case when either the connection-failure flag or the
    # fatal-error flag has been set.
    public function didErrorOccur() {
        return ($this->error || $this->failed);
    }

    # Returns an explanation of what went wrong if didErrorOccur() returns true.
    public function getErrorMessageString() {
        return ($this->error || $this->failed) ? $this->error_message : '';
    }

    # Returns _true_ if there was a connection error (no header or no body returned).
    # May indicate a situation where you are more likely to try at least once more.
    # If this returns _true_, didErrorOccur() will return _true_ as well.
    # Pass a non-null $value to set the flag together with $message.
    public function didConnectionFail($value=null, $message=null) {
        if (isset($value)) {
            $this->failed = $value ? true : false;
            $this->error_message = $message;
        }
        return $this->failed;
    }

    # Returns _true_ if a fatal (non-connection) error was recorded.
    # Pass a non-null $value to set the flag together with $message.
    public function didFatalErrorOccur($value=null, $message=null) {
        if (isset($value)) {
            $this->error = $value ? true : false;
            $this->error_message = $message;
        }
        return $this->error;
    }

    # The URL this result points to next (null when there is none).
    public function getRedirectTarget() { return $this->redirect; }
    public function setRedirectTarget($url) { $this->redirect = $url; }

    # Whether the redirect target was taken from an og:url tag.
    public function redirectTargetIsOpenGraphURL($value=null) {
        if (isset($value)) { $this->redirect_is_open_graph = $value ? true : false; }
        return $this->redirect_is_open_graph;
    }

    # Whether the redirect target was taken from a rel=canonical tag.
    public function redirectTargetIsCanonicalURL($value=null) {
        if (isset($value)) { $this->redirect_is_canonical = $value ? true : false; }
        return $this->redirect_is_canonical;
    }

    # Short summary used in debug output, e.g. "(200; og:url, rel=canonical)".
    public function debugStatus() {
        $attr = array();
        if ($this->failed || $this->error) { array_push($attr, 'ERROR'); }
        if ($this->is_open_graph) { array_push($attr, 'og:url'); }
        if ($this->is_canonical) { array_push($attr, 'rel=canonical'); }

        $status = '(' . $this->status;
        if (count($attr)) { $status .= '; ' . join(', ', $attr); }
        $status .= ')';

        return $status;
    }
}
698 |
699 | ?>
700 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "mattwright/urlresolver",
3 | "version": "2.0",
4 | "type": "library",
5 | "description": "PHP class that attempts to resolve URLs to a final, canonical link.",
6 | "license": "MIT",
7 | "keywords": ["url", "redirect", "resolve", "canonical", "link"],
8 | "homepage": "https://github.com/mattwright/URLResolver.php",
9 | "authors": [ { "name": "Matt Wright", "email": "mw@mattwright.com" } ],
10 | "require": {
11 | "php": ">=5.3",
12 | "ext-curl": "*",
13 | "ext-mbstring": "*"
14 | },
15 | "autoload": { "psr-4": { "mattwright\\": "." } }
16 | }
17 |
--------------------------------------------------------------------------------
/lib/simple_html_dom.php:
--------------------------------------------------------------------------------
1 | size is the "real"
21 | * number of bytes the dom was created from. But for most purposes, it's a
22 | * really good estimation.
23 | *
24 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
25 | * closed is great for malformed html, but it CAN lead to parsing errors.
26 | *
27 | * Allow the user to tell us how much they trust the html.
28 | *
29 | * Paperg add the text and plaintext to the selectors for the find syntax.
30 | * plaintext implies text in the innertext of a node. text implies that the
31 | * tag is a text node. This allows for us to find tags based on the text they
32 | * contain.
33 | *
34 | * Create find_ancestor_tag to see if a tag is - at any level - inside of
35 | * another specific tag.
36 | *
37 | * Paperg: added parse_charset so that we know about the character set of
38 | * the source document. NOTE: If the user's system has a routine called
39 | * get_last_retrieve_url_contents_content_type available, we will assume it's
40 | * returning the content-type header from the last transfer or curl_exec, and
41 | * we will parse that and use it in preference to any other method of charset
42 | * detection.
43 | *
44 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to
45 | * protect from that.
46 | *
47 | * PaperG (John Schlick) Added get_display_size for "IMG" tags.
48 | *
49 | * Licensed under The MIT License
50 | * Redistributions of files must retain the above copyright notice.
51 | *
52 | * @author S.C. Chen
53 | * @author John Schlick
54 | * @author Rus Carroll
55 | * @version Rev. 1.8.1 (247)
56 | * @package PlaceLocalInclude
57 | * @subpackage simple_html_dom
58 | */
59 |
60 | /**
61 | * All of the Defines for the classes below.
62 | * @author S.C. Chen
63 | */
64 | define('HDOM_TYPE_ELEMENT', 1);
65 | define('HDOM_TYPE_COMMENT', 2);
66 | define('HDOM_TYPE_TEXT', 3);
67 | define('HDOM_TYPE_ENDTAG', 4);
68 | define('HDOM_TYPE_ROOT', 5);
69 | define('HDOM_TYPE_UNKNOWN', 6);
70 | define('HDOM_QUOTE_DOUBLE', 0);
71 | define('HDOM_QUOTE_SINGLE', 1);
72 | define('HDOM_QUOTE_NO', 3);
73 | define('HDOM_INFO_BEGIN', 0);
74 | define('HDOM_INFO_END', 1);
75 | define('HDOM_INFO_QUOTE', 2);
76 | define('HDOM_INFO_SPACE', 3);
77 | define('HDOM_INFO_TEXT', 4);
78 | define('HDOM_INFO_INNER', 5);
79 | define('HDOM_INFO_OUTER', 6);
80 | define('HDOM_INFO_ENDSPACE', 7);
81 |
82 | /** The default target charset */
83 | defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
84 |
85 | /** The default <br> text used instead of <br> tags when returning text */
86 | defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
87 |
88 | /** The default <span> text used instead of <span> tags when returning text */
89 | defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
90 |
91 | /** The maximum file size the parser should load */
92 | defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
93 |
94 | /** Contents between curly braces "{" and "}" are interpreted as text */
95 | define('HDOM_SMARTY_AS_TEXT', 1);
96 |
97 | // helper functions
98 | // -----------------------------------------------------------------------------
/**
 * Get an HTML DOM object from a file or URL.
 *
 * The contents are read with file_get_contents() and loaded into a new
 * simple_html_dom instance.
 *
 * @param string $url Path or URL passed to file_get_contents()
 * @param bool $use_include_path Passed through to file_get_contents()
 * @param resource|null $context Stream context passed to file_get_contents()
 * @param int $offset Read offset passed to file_get_contents()
 * @param int $maxLen Maximum number of bytes to read. Values <= 0 (including
 * the default -1, i.e. PHP_STREAM_COPY_ALL) are replaced by MAX_FILE_SIZE.
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if nothing could be
 * read, the contents are empty, or they exceed $maxLen bytes.
 */
function file_get_html(
    $url,
    $use_include_path = false,
    $context = null,
    $offset = 0,
    $maxLen = -1,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Ensure maximum length is greater than zero
    if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    $contents = file_get_contents(
        $url,
        $use_include_path,
        $context,
        $offset,
        $maxLen);

    if (empty($contents) || strlen($contents) > $maxLen) {
        // Release the unused DOM before bailing out, as str_get_html() does.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
147 |
/**
 * Get an HTML DOM object from a string.
 *
 * @param string $str The HTML source
 * @param bool $lowercase Passed to the DOM constructor and to load()
 * @param bool $forceTagsClosed Passed to the DOM constructor
 * @param string $target_charset Passed to the DOM constructor
 * @param bool $stripRN Passed to the DOM constructor and to load()
 * @param string $defaultBRText Passed to the DOM constructor
 * @param string $defaultSpanText Passed to the DOM constructor
 * @return simple_html_dom|false The loaded DOM, or false if $str is empty or
 * longer than MAX_FILE_SIZE bytes.
 */
function str_get_html(
    $str,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $parser = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);

    if (!empty($str) && strlen($str) <= MAX_FILE_SIZE) {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }

    // Input rejected: release the unused DOM.
    $parser->clear();
    return false;
}
175 |
/**
 * Dump the tree below a node by delegating to the node's dump() method.
 *
 * @param object $node Node providing a dump($show_attr, $deep) method
 * @param bool $show_attr Forwarded to $node->dump()
 * @param int $deep Forwarded to $node->dump()
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Forward the caller's options; previously $node itself was passed as
    // $show_attr and both options were ignored.
    $node->dump($show_attr, $deep);
}
181 |
182 | /**
183 | * simple html dom node
184 | * PaperG - added ability for "find" routine to lowercase the value of the
185 | * selector.
186 | *
187 | * PaperG - added $tag_start to track the start position of the tag in the total
188 | * byte index
189 | *
190 | * @package PlaceLocalInclude
191 | */
192 | class simple_html_dom_node
193 | {
194 | /**
195 | * Node type
196 | *
197 | * Default is {@see HDOM_TYPE_TEXT}
198 | *
199 | * @var int
200 | */
201 | public $nodetype = HDOM_TYPE_TEXT;
202 |
203 | /**
204 | * Tag name
205 | *
206 | * Default is 'text'
207 | *
208 | * @var string
209 | */
210 | public $tag = 'text';
211 |
212 | /**
213 | * List of attributes
214 | *
215 | * @var array
216 | */
217 | public $attr = array();
218 |
219 | /**
220 | * List of child node objects
221 | *
222 | * @var array
223 | */
224 | public $children = array();
225 | public $nodes = array();
226 |
227 | /**
228 | * The parent node object
229 | *
230 | * @var object|null
231 | */
232 | public $parent = null;
233 |
234 | // The "info" array - see HDOM_INFO_... for what each element contains.
235 | public $_ = array();
236 |
237 | /**
238 | * Start position of the tag in the document
239 | *
240 | * @var int
241 | */
242 | public $tag_start = 0;
243 |
244 | /**
245 | * The DOM object
246 | *
247 | * @var object|null
248 | */
249 | private $dom = null;
250 |
/**
 * Create a node bound to a DOM object.
 *
 * The new node is appended to the DOM's node list
 * {@see simple_html_dom::$nodes} and keeps a reference to the DOM.
 *
 * @param object $dom The owning DOM object
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
261 |
/**
 * Drop the node's references (via clear()) when the object is destroyed.
 */
function __destruct()
{
    $this->clear();
}
266 |
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    $rendered = $this->outertext();
    return $rendered;
}
271 |
/**
 * Clean up memory (php5 circular references memory leak) by setting the
 * node's references to the DOM, its parent, its nodes and its children to
 * null.
 */
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
280 |
/**
 * Dump the node's tree: echo this node's tag (optionally with its
 * attributes) on one line, then recurse into every entry of $this->nodes.
 *
 * @param bool $show_attr Whether to print the attribute list
 * @param int $deep Nesting depth; the line is prefixed with that many copies
 * of the lead string
 * @return void
 */
function dump($show_attr = true, $deep = 0)
{
    echo str_repeat(' ', $deep) . $this->tag;

    $has_attributes = count($this->attr) > 0;
    if ($show_attr && $has_attributes) {
        echo '(';
        foreach ($this->attr as $name => $unused) {
            // The value is read through the property, not from the array.
            echo "[$name]=>\"" . $this->$name . '", ';
        }
        echo ')';
    }

    echo "\n";

    if (!$this->nodes) {
        return;
    }

    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
304 |
305 |
/**
 * Debugging function to dump a single dom node with a bunch of information
 * about it: tag, attributes, the "info" array, text, inner info and the
 * child/node counts.
 *
 * @param bool $echo True to echo the description, false to return it
 * @return string|void The description if $echo is false
 */
function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= ' text: (' . $this->text . ')';
    }

    $string .= " HDOM_INNER_INFO: '";

    // Read the inner info from this node; the former code tested an undefined
    // local variable, so this section always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= ' children: ' . count($this->children);
    $string .= ' nodes: ' . count($this->nodes);
    $string .= ' tag_start: ' . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    } else {
        return $string;
    }
}
359 |
/**
 * Return or set parent node
 *
 * When a parent is supplied, this node is also appended to that parent's
 * nodes and children lists. Note that it is NOT removed from the lists of
 * its previous parent first.
 *
 * @param object|null $parent (optional) The parent node, `null` to return
 * the current parent node.
 * @return object|null The parent node
 */
function parent($parent = null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $this->parent;
}
380 |
/**
 * Check whether the node has child nodes
 *
 * @return bool True if the children list is non-empty
 */
function has_child()
{
    if (empty($this->children)) {
        return false;
    }

    return true;
}
388 |
/**
 * Get child node at specified index
 *
 * @param int $idx The index of the child node to return, `-1` to return all
 * child nodes.
 * @return object|array|null The child node at the specified index, all child
 * nodes or null if the index is invalid.
 */
function children($idx = -1)
{
    // -1 (strictly) is the "give me everything" sentinel
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
409 |
/**
 * Get first child node
 *
 * @return object|null The first child node or null if the current node has
 * no child nodes.
 */
function first_child()
{
    // isset() instead of count(): clear() sets $children to null, and
    // counting a non-countable value raises a warning (a TypeError on PHP 8).
    return isset($this->children[0]) ? $this->children[0] : null;
}
426 |
/**
 * Get last child node
 *
 * @return object|null The last child node or null if the current node has
 * no child nodes.
 */
function last_child()
{
    // empty() also covers $children being null after clear(), where count()
    // would raise a warning (a TypeError on PHP 8).
    if (empty($this->children)) {
        return null;
    }

    $last = count($this->children) - 1;
    return $this->children[$last];
}
442 |
/**
 * Get next sibling node
 *
 * The node is located by identity in its parent's children list.
 *
 * @return object|null The following sibling, or null if the node has no
 * parent, is the last child, or is not in the parent's children list.
 */
function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node; $pos ends up at $total when it is not found
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($siblings[$pos] === $this) {
            break;
        }
    }

    $next = $pos + 1;

    return ($next < $total) ? $siblings[$next] : null;
}
468 |
/**
 * Get previous sibling node
 *
 * The node is located by identity in its parent's children list.
 *
 * Note: if the node is not found in that list, the search index ends one
 * past the end, so the parent's last child is returned (or null when the
 * list is empty).
 *
 * @return object|null The preceding sibling, or null if the node has no
 * parent or is the first child.
 */
function prev_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    $pos = 0;
    while ($pos < $total && $siblings[$pos] !== $this) {
        ++$pos;
    }

    $previous = $pos - 1;

    return ($previous < 0) ? null : $siblings[$previous];
}
490 |
/**
 * Traverse ancestors to the first matching tag.
 *
 * The search starts with the node itself and follows the ->parent links
 * until a node whose tag equals $tag (loose comparison) is found or the
 * chain ends with null.
 *
 * @param string $tag Tag to find
 * @return object|null First matching node in the DOM tree or null if no
 * match was found.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    for ($candidate = $this; $candidate !== null; $candidate = $candidate->parent) {
        if (is_object($debug_object)) {
            $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    // Ran off the top of the tree without a match
    return null;
}
523 |
/**
 * Get node's inner text (everything inside the opening and closing tags)
 *
 * Resolution order: the stored inner info, then the stored text info (with
 * noise restored by the DOM), then the concatenated outer text of all nodes.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        $raw = $this->_[HDOM_INFO_TEXT];
        return $this->dom->restore_noise($raw);
    }

    $html = '';
    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }

    return $html;
}
547 |
/**
 * Get node's outer text (everything including the opening and closing tags)
 *
 * The root node returns its inner text. For other nodes the DOM callback
 * (if any) is triggered first; then stored outer info or stored text info
 * is returned if present. Otherwise the markup is assembled from the begin
 * tag, the inner content and the end tag.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;

    if (is_object($debug_object)) {
        $text = '';

        if ($this->tag === 'text') {
            if (!empty($this->text)) {
                $text = ' with text: ' . $this->text;
            }
        }

        $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }

    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // render begin tag
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    } else {
        $ret = '';
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER])) {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we
        // may or may not have added.
        if ($this->tag !== 'br') {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes) {
            foreach ($this->nodes as $n) {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag (the closing tag needs its leading "</")
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $ret .= '</' . $this->tag . '>';
    }

    return $ret;
}
613 |
/**
 * Get node's plain text (everything excluding all tags)
 *
 * Stored inner info wins. Text nodes return their noise-restored text;
 * comment and unknown nodes, as well as script and style elements, return
 * an empty string. Otherwise the text of all nodes is concatenated and
 * trimmed.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    // Loose comparisons on purpose: they mirror switch semantics
    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    } elseif ($type == HDOM_TYPE_COMMENT) {
        return '';
    } elseif ($type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0) { return ''; }
    if (strcasecmp($this->tag, 'style') === 0) { return ''; }

    $plain = '';

    // $this->nodes can be null (e.g. after clear()); nothing to collect then.
    if (is_null($this->nodes)) {
        return trim($plain);
    }

    foreach ($this->nodes as $child) {
        // Start paragraph after a blank line
        if ($child->tag === 'p') {
            $plain .= "\n\n";
        }

        $plain .= $this->convert_text($child->text());

        // Separate consecutive spans so their text does not run together
        if ($child->tag === 'span') {
            $plain .= $this->dom->default_span_text;
        }
    }

    return trim($plain);
}
660 |
/**
 * Get node's xml text (inner text as a CDATA section)
 *
 * Returns the inner text with the CDATA section markers removed.
 *
 * @return string
 */
function xmltext()
{
    $ret = $this->innertext();
    // Strip the opening marker case-insensitively, then the closing marker
    $ret = str_ireplace('<![CDATA[', '', $ret);
    $ret = str_replace(']]>', '', $ret);
    return $ret;
}
673 |
/**
 * Build node's text with tag
 *
 * Nodes carrying text info (text, comment, unknown) return that text with
 * noise restored. Otherwise the opening tag is rebuilt from the tag name and
 * the attribute list, reusing the recorded spacing and quote style of each
 * attribute.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;
    $next_slot = 0;

    foreach ($this->attr as $name => $value) {
        // Spacing/quote info is indexed by attribute position, so the slot
        // advances even for removed attributes.
        $slot = $next_slot++;

        // skip removed attribute
        if ($value === null || $value === false) { continue; }

        $markup .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // no value attr: nowrap, checked selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        // Loose comparisons mirror switch semantics
        $quote_style = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($quote_style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($quote_style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name
            . $this->_[HDOM_INFO_SPACE][$slot][1]
            . '='
            . $this->_[HDOM_INFO_SPACE][$slot][2]
            . $quote
            . $value
            . $quote;
    }

    $markup = $this->dom->restore_noise($markup);

    return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
717 |
/**
 * Find elements by CSS selector
 *
 * Each comma-separated selector is resolved level by level with seek();
 * the matches of all selectors are merged and sorted by node index.
 *
 * @param string $selector The CSS selector
 * @param int|null $idx Index of element to return from the list of matching
 * elements (default: `null` = disabled). Negative values count from the end.
 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
 * enabled (default: `false`)
 * @return array|object|null A list of elements matching the specified CSS
 * selector or a single element if $idx is specified or null if no element
 * was found.
 */
function find($selector, $idx = null, $lowercase = false)
{
    $groups = $this->parse_selector($selector);
    $group_count = count($groups);
    if ($group_count === 0) { return array(); }

    $matched_keys = array();

    // find each selector
    for ($g = 0; $g < $group_count; ++$g) {
        $depth = count($groups[$g]);
        if ($depth === 0) { return array(); }
        if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

        $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
        $combinator = ' ';

        // handle descendant selectors, no recursive!
        for ($level = 0; $level < $depth; ++$level) {
            $hits = array();

            foreach ($frontier as $key => $unused) {
                $origin = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                // Pass the optional lowercase flag on to the seek function.
                $origin->seek($groups[$g][$level], $hits, $combinator, $lowercase);
            }

            $frontier = $hits;
            $combinator = $groups[$g][$level][4]; // Next Combinator
        }

        // Every entry has the value 1, so re-assigning an existing key is a no-op
        foreach ($frontier as $key => $unused) {
            $matched_keys[$key] = 1;
        }
    }

    // sort keys
    ksort($matched_keys);

    $found = array();
    foreach ($matched_keys as $key => $unused) {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (is_null($idx)) {
        return $found;
    }

    if ($idx < 0) {
        $idx = count($found) + $idx;
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
781 |
/**
 * Seek DOM elements by selector
 *
 * **Note**
 * The selector element must be compatible to a selector from
 * {@see simple_html_dom_node::parse_selector()}
 *
 * Candidate nodes are chosen relative to this node according to
 * $parent_cmd and then filtered by tag, id, classes and attributes. The
 * index (HDOM_INFO_BEGIN) of every matching node is added as a key to $ret.
 *
 * @param array $selector A selector element
 * @param array $ret An array of matches (filled by reference)
 * @param string $parent_cmd Combinator linking this node to the candidates
 * (' ' | '>' | '+' | '~')
 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
 * enabled (default: `false`)
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    list($tag, $id, $class, $attributes, $cmb) = $selector;
    $nodes = array();

    if ($parent_cmd === ' ') { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end == 0) {
            $parent = $this->parent;
            while (!isset($parent->_[HDOM_INFO_END]) && $parent !== null) {
                $end -= 1;
                $parent = $parent->parent;
            }
            $end += $parent->_[HDOM_INFO_END];
        }

        // Get list of target nodes
        $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
        $nodes_count = $end - $nodes_start;
        $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
    } elseif ($parent_cmd === '>') { // Child Combinator
        $nodes = $this->children;
    } elseif ($parent_cmd === '+'
        && $this->parent
        && in_array($this, $this->parent->children, true)) { // Next-Sibling Combinator
        // Strict in_array(): identity check, consistent with the strict
        // array_search() below, and no deep recursive object comparison.
        $index = array_search($this, $this->parent->children, true) + 1;
        // The last child has no next sibling; don't read past the list.
        if ($index < count($this->parent->children)) {
            $nodes[] = $this->parent->children[$index];
        }
    } elseif ($parent_cmd === '~'
        && $this->parent
        && in_array($this, $this->parent->children, true)) { // Subsequent Sibling Combinator
        $index = array_search($this, $this->parent->children, true);
        $nodes = array_slice($this->parent->children, $index);
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if(!$node->parent) {
            $pass = false;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if($pass && !in_array($node, $node->parent->children, true)) {
            $pass = false;
        }

        // Skip if tag doesn't match
        if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && $id !== '' && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && $id !== '' && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            // (temporary variable instead of explode()[0], which needs PHP 5.4)
            $node_ids = explode(' ', trim($node->attr['id']));
            $node_id = $node_ids[0];

            if($id !== $node_id) { $pass = false; }
        }

        // Check if all class(es) exist
        if ($pass && $class !== '' && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                $node_classes = explode(' ', $node->attr['class']);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach($class as $c) {
                    if(!in_array($c, $node_classes)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if ($pass
            && $attributes !== ''
            && is_array($attributes)
            && !empty($attributes)) {
            foreach($attributes as $a) {
                list (
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /**
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within it's parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (is_numeric($att_name)
                    && $att_expr === ''
                    && $att_val === '') {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) ++$count;
                        if ($c === $node) break;
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int)$att_name) continue;
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if ($att_name !== 'plaintext'
                        && !isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ($att_expr === '') continue;

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ($att_name === 'plaintext') {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'testing node: '
                        . $node->tag
                        . ' for attribute: '
                        . $att_name
                        . $att_expr
                        . $att_val
                        . ' where nodes value is: '
                        . $nodeKeyValue
                    );
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                if (is_object($debug_object)) {
                    $debug_object->debug_log(2,
                        'after match: '
                        . ($check ? 'true' : 'false')
                    );
                }

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list and clear node
        if ($pass) $ret[$node->_[HDOM_INFO_BEGIN]] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {
        $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
    }
}
1008 |
/**
 * Match value and pattern for a given CSS expression
 *
 * **Supported Expressions**
 *
 * | Expression | Description
 * | ---------- | -----------
 * | `=` | $value and $pattern must be equal
 * | `!=` | $value and $pattern must not be equal
 * | `^=` | $value must start with $pattern
 * | `$=` | $value must end with $pattern
 * | `*=` | $value must contain $pattern
 * | `|=` | $value must equal $pattern or start with $pattern followed by "-"
 * | `~=` | $value must contain $pattern as one of its space-separated words
 *
 * @param string $exp The expression.
 * @param string $pattern The pattern
 * @param string $value The value
 * @param string $case_sensitivity 'i' lowercases both $pattern and $value
 * before comparing; any other value leaves them untouched.
 * @return bool|int Truthy if $value matches $pattern (the regex-based
 * expressions return the preg_match() result); false for unknown expressions.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

    if ($case_sensitivity === 'i') {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /**
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             */
            // A bare prefix test would also accept e.g. "english" for "en".
            return $value === $pattern
                || strpos($value, $pattern . '-') === 0;
        case '~=':
            /**
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            return in_array($pattern, explode(' ', trim($value)), true);
    }
    return false;
}
1071 |
/**
 * Parse a CSS selector string into a nested array structure.
 *
 * @param string $selector_string CSS selector string
 * @return array List of selector groups. The structure is:
 *
 * ```php
 * array( // one entry per comma separated selector, i.e. 'img, p, div'
 *     array( // one entry per compound selector, i.e. 'img > p > div'
 *         array(
 *             [0], // (string) Element tag (lowercased when the DOM is in
 *                  // lowercase mode)
 *             [1], // (string) Element id
 *             [2], // (array|string) Class names, or '' if none were given
 *             [3], // (array|string) Attribute tests, or '' if none were
 *                  // given. Each test holds five elements: name,
 *                  // expression, value, inverted flag, case-sensitivity
 *             [4]  // (string) Combinator that follows the element, i.e.
 *                  // ' ', '>', '+' or '~'; '' when a comma follows
 *         )
 *     )
 * )
 * ```
 *
 * @link https://www.w3.org/TR/selectors/#compound Compound selector
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    /*
     * Pattern for one compound selector followed by its separator.
     *
     * Capture groups:
     *   [1] tag name      ([\w:\*-]*)
     *       Word characters, colons, asterisks and hyphens (may be empty).
     *   [2] id            (?:\#([\w-]+))
     *       A "#" followed by word characters and hyphens.
     *   [3] class names   \.([\w\.-]+)
     *       A "." followed by the class name; several classes may be
     *       chained (".foo.bar.baz") and are captured with their inner dots.
     *   [4] attributes    one or more "[...]" groups
     *       The name may contain a colon and may be prefixed with "!" to
     *       negate the test. A leading "@" is accepted but not captured.
     *   [5] separator     ([\/, >+~]+)
     *       The selector list separator / combinator.
     */
    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

    /*
     * Pattern for a single attribute test (derived from group [4] above):
     *   [1] name, [2] expression, [3] value, [4] case-sensitivity flag
     */
    $attribute_pattern = "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is";

    // The appended ' ' serves as pseudo separator for the last selector.
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER);

    if (is_object($debug_object)) {
        $debug_object->debug_log(2, 'Matches Array: ', $matches);
    }

    $selectors = array();
    $group = array();

    foreach ($matches as $m) {
        $full_match = trim($m[0]);

        // Nothing to do for empty matches and bare slashes
        if ($full_match === '' || $full_match === '/' || $full_match === '//') {
            continue;
        }

        // Tag name, lowercased if the DOM was loaded in lowercase mode
        $tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];

        $id = $m[2];

        // "foo.bar.baz" => array('foo', 'bar', 'baz')
        $classes = ($m[3] !== '') ? explode('.', $m[3]) : $m[3];

        // Break the attribute list up into individual tests
        $attribute_tests = $m[4];

        if ($attribute_tests !== '') {
            preg_match_all(
                $attribute_pattern,
                trim($attribute_tests),
                $attributes,
                PREG_SET_ORDER
            );

            $attribute_tests = array();

            foreach ($attributes as $att) {
                if (trim($att[0]) === '') {
                    continue; // Skip empty matches
                }

                // A leading "!" negates the attribute test
                $inverted = (isset($att[1][0]) && $att[1][0] === '!');
                $att_name = $inverted ? substr($att[1], 1) : $att[1];

                $attribute_tests[] = array(
                    $att_name,                                  // Name
                    isset($att[2]) ? $att[2] : '',              // Expression
                    isset($att[3]) ? $att[3] : '',              // Value
                    $inverted,                                  // Inverted Flag
                    isset($att[4]) ? strtolower($att[4]) : '',  // Case-Sensitivity
                );
            }
        }

        // A separator made of whitespace only is the descendant combinator
        $trimmed_separator = trim($m[5]);
        if ($m[5] !== '' && $trimmed_separator === '') {
            $separator = ' ';
        } else {
            $separator = $trimmed_separator;
        }

        // A comma ends the current selector and carries no combinator
        $is_list = ($separator === ',');
        if ($is_list) {
            $separator = '';
        }

        $group[] = array($tag, $id, $classes, $attribute_tests, $separator);

        if ($is_list) {
            $selectors[] = $group;
            $group = array();
        }
    }

    if (count($group) > 0) {
        $selectors[] = $group;
    }

    return $selectors;
}
1235 |
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * A set (non-null) attribute of that name wins and is returned after
 * passing through convert_text(). Otherwise the virtual properties
 * "outertext", "innertext", "plaintext" and "xmltext" are served by their
 * respective methods. Any other name yields a boolean telling whether the
 * attribute key exists at all.
 *
 * @param string $name Attribute or virtual property name
 * @return mixed
 */
function __get($name)
{
    if (isset($this->attr[$name])) {
        $raw = $this->attr[$name];
        return $this->convert_text($raw);
    }

    // Virtual property => method that produces its value
    $accessors = array(
        'outertext' => 'outertext',
        'innertext' => 'innertext',
        'plaintext' => 'text',
        'xmltext' => 'xmltext',
    );

    foreach ($accessors as $property => $method) {
        // Loose comparison on purpose: same semantics as a switch statement
        if ($name == $property) {
            return $this->$method();
        }
    }

    return array_key_exists($name, $this->attr);
}
1249 |
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * "outertext" is stored in the HDOM_INFO_OUTER slot. "innertext" is stored
 * in the HDOM_INFO_TEXT slot if that slot is already set, otherwise in
 * HDOM_INFO_INNER. In both cases the assigned value is returned.
 *
 * Every other name is treated as an attribute. If the attribute is not set
 * yet, a spacing entry and a double-quote entry are appended for it before
 * the value is stored.
 *
 * @param string $name Attribute or virtual property name
 * @param mixed $value Value to assign
 * @return mixed The assigned value for "outertext"/"innertext", else nothing
 */
function __set($name, $value)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    // Loose comparisons on purpose: same semantics as a switch statement
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    $is_new_attribute = !isset($this->attr[$name]);

    if ($is_new_attribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
1271 |
/**
 * Magic isset() handler.
 *
 * The virtual properties "outertext", "innertext" and "plaintext" always
 * count as set. For everything else the attribute list is consulted; the
 * key test also reports attributes that exist without a value (nowrap,
 * checked, selected, ...).
 *
 * @param string $name Attribute or virtual property name
 * @return bool
 */
function __isset($name)
{
    // Loose comparisons on purpose: same semantics as a switch statement
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
1282 |
/**
 * Magic unset() handler: removes the attribute if it is currently set.
 *
 * Attributes that are not set (missing or null) are left untouched.
 *
 * @param string $name Attribute name
 * @return void
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1287 |
/**
 * Convert text from the document charset to the target charset.
 *
 * PaperG - Function to convert the text from one character set to another
 * if the two sets are not the same.
 *
 * Source and target charset are read from the owning DOM ($this->dom). No
 * conversion takes place if there is no DOM, if either charset is empty or
 * if both are equal. If the target is UTF-8 and the text already is valid
 * UTF-8, the text is kept as is, because the reported source charset may
 * have been wrong.
 *
 * For a UTF-8 target a byte order mark at the start and at the end of the
 * result is removed.
 *
 * @param string $text Text to convert
 * @return string The converted text, or the unconverted text if no
 * conversion was necessary or iconv() could not convert it.
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }

    if (is_object($debug_object)) {
        $debug_object->debug_log(3,
            'source charset: '
            . $sourceCharset
            . ' target charaset: '
            . $targetCharset
        );
    }

    if (!empty($sourceCharset)
    && !empty($targetCharset)
    && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // Check if the reported encoding could have been incorrect and the
        // text is actually already UTF-8
        $is_already_utf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
            && $this->is_utf8($text);

        if (!$is_already_utf8) {
            $recoded = iconv($sourceCharset, $targetCharset, $text);

            // iconv() returns false on failure (unknown charset, illegal
            // input sequence). Keep the unconverted text in that case
            // instead of handing false back to the caller.
            if ($recoded !== false) {
                $converted_text = $recoded;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of
    // the utf-8 text we output.
    if ($targetCharset === 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }

        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
1339 |
/**
 * Returns true if $str is valid UTF-8 and false otherwise.
 *
 * Validation is delegated to PCRE: with the "u" modifier the subject is
 * checked for UTF-8 validity before matching, and preg_match() does not
 * return 1 for an invalid subject. In contrast to a hand-written byte
 * scan this also rejects a lone 0x80 byte, overlong encodings and the
 * obsolete 5 and 6 byte sequences. The empty string is valid.
 *
 * @param mixed $str String to be tested (cast to string before testing)
 * @return boolean
 */
static function is_utf8($str)
{
    // The empty pattern matches any valid subject, so the result only
    // depends on whether the subject passes PCRE's UTF-8 check.
    return preg_match('//u', (string)$str) === 1;
}
1372 |
/**
 * Function to try a few tricks to determine the displayed size of an img on
 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
 * other tag types.
 *
 * The "width" and "height" attributes of the tag take precedence. A
 * dimension that is still unknown afterwards is looked up in the inline
 * "style" attribute, where only integer pixel values (i.e. "120px") are
 * accepted.
 *
 * Not covered: sizes that come from CSS classes or ids, from rules of
 * parent elements or from external style sheets.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the
 * image on the page (-1 for a dimension we can't figure out), or false if
 * this element is not an img.
 */
function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    $size = array(
        'height' => -1,
        'width' => -1
    );

    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group is greedy and may carry trailing whitespace
            // ("100px ;"), which would defeat the "px" suffix test below.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            // Only consult the style if the tag attribute did not already
            // provide this dimension.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                continue;
            }

            $value = $declarations[$dimension];

            // check that the last two characters are px (pixels)
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            $proposed = substr($value, 0, -2);

            // Now make sure that it's an integer and not something stupid.
            // filter_var() returns the integer on success, so compare with
            // false explicitly; otherwise "0px" would be rejected.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $proposed;
            }
        }
    }

    return $size;
}
1467 |
// camel naming conventions
//
// The methods below are camelCase aliases that forward to the snake_case
// and magic methods of this class.

/**
 * Returns the attribute list of this element.
 *
 * @return array
 */
function getAllAttributes()
{
    return $this->attr;
}

/**
 * Alias for __get().
 *
 * @param string $name Attribute name
 * @return mixed
 */
function getAttribute($name)
{
    $value = $this->__get($name);
    return $value;
}

/**
 * Alias for __set(); does not return a value.
 *
 * @param string $name Attribute name
 * @param mixed $value Attribute value
 * @return void
 */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/**
 * Alias for __isset().
 *
 * @param string $name Attribute name
 * @return bool
 */
function hasAttribute($name)
{
    $exists = $this->__isset($name);
    return $exists;
}

/**
 * Assigns null to the attribute through __set().
 *
 * @param string $name Attribute name
 * @return void
 */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/**
 * Finds the first element matching the id selector "#$id".
 *
 * @param string $id Element id
 * @return mixed Result of find() for index 0
 */
function getElementById($id)
{
    $selector = '#' . $id;
    return $this->find($selector, 0);
}

/**
 * Finds elements matching the id selector "#$id".
 *
 * @param string $id Element id
 * @param int|null $idx Index passed on to find()
 * @return mixed Result of find()
 */
function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;
    return $this->find($selector, $idx);
}

/**
 * Finds the first element matching the tag name.
 *
 * @param string $name Tag name, passed to find() as selector
 * @return mixed Result of find() for index 0
 */
function getElementByTagName($name)
{
    $first = $this->find($name, 0);
    return $first;
}

/**
 * Finds elements matching the tag name.
 *
 * @param string $name Tag name, passed to find() as selector
 * @param int|null $idx Index passed on to find()
 * @return mixed Result of find()
 */
function getElementsByTagName($name, $idx = null)
{
    $found = $this->find($name, $idx);
    return $found;
}

/**
 * Alias for parent() without argument.
 *
 * @return mixed
 */
function parentNode()
{
    $parent = $this->parent();
    return $parent;
}

/**
 * Alias for children().
 *
 * @param int $idx Index passed on to children()
 * @return mixed
 */
function childNodes($idx = -1)
{
    $children = $this->children($idx);
    return $children;
}

/**
 * Alias for first_child().
 *
 * @return mixed
 */
function firstChild()
{
    $child = $this->first_child();
    return $child;
}

/**
 * Alias for last_child().
 *
 * @return mixed
 */
function lastChild()
{
    $child = $this->last_child();
    return $child;
}

/**
 * Alias for next_sibling().
 *
 * @return mixed
 */
function nextSibling()
{
    $sibling = $this->next_sibling();
    return $sibling;
}

/**
 * Alias for prev_sibling().
 *
 * @return mixed
 */
function previousSibling()
{
    $sibling = $this->prev_sibling();
    return $sibling;
}

/**
 * Alias for has_child().
 *
 * @return mixed
 */
function hasChildNodes()
{
    $has_children = $this->has_child();
    return $has_children;
}

/**
 * Returns the tag name of this element.
 *
 * @return string
 */
function nodeName()
{
    return $this->tag;
}

/**
 * Hands this element to the parent() method of $node and returns $node.
 *
 * @param object $node Node to append
 * @return object The node that was passed in
 */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
1559 |
1560 | }
1561 |
1562 | /**
1563 | * simple html dom parser
1564 | *
1565 | * Paperg - in the find routine: allow us to specify that we want case
1566 | * insensitive testing of the value of the selector.
1567 | *
1568 | * Paperg - change $size from protected to public so we can easily access it
1569 | *
1570 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1571 | * trust the html or not. Default is to NOT trust it.
1572 | *
1573 | * @package PlaceLocalInclude
1574 | */
1575 | class simple_html_dom
1576 | {
1577 | /**
1578 | * The root node of the document
1579 | *
1580 | * @var object
1581 | */
1582 | public $root = null;
1583 |
1584 | /**
1585 | * List of nodes in the current DOM
1586 | *
1587 | * @var array
1588 | */
1589 | public $nodes = array();
1590 |
1591 | /**
1592 | * Callback function to run for each element in the DOM.
1593 | *
1594 | * @var callable|null
1595 | */
1596 | public $callback = null;
1597 |
1598 | /**
1599 | * Indicates how tags and attributes are matched
1600 | *
1601 | * @var bool When set to **true** tags and attributes will be converted to
1602 | * lowercase before matching.
1603 | */
1604 | public $lowercase = false;
1605 |
1606 | /**
1607 | * Original document size
1608 | *
1609 | * Holds the original document size.
1610 | *
1611 | * @var int
1612 | */
1613 | public $original_size;
1614 |
1615 | /**
1616 | * Current document size
1617 | *
1618 | * Holds the current document size. The document size is determined by the
1619 | * string length of ({@see simple_html_dom::$doc}).
1620 | *
1621 | * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1622 | *
1623 | * @var int
1624 | * */
1625 | public $size;
1626 |
1627 | /**
1628 | * Current position in the document
1629 | *
1630 | * @var int
1631 | */
1632 | protected $pos;
1633 |
1634 | /**
1635 | * The document
1636 | *
1637 | * @var string
1638 | */
1639 | protected $doc;
1640 |
1641 | /**
1642 | * Current character
1643 | *
1644 | * Holds the current character at position {@see simple_html_dom::$pos} in
1645 | * the document {@see simple_html_dom::$doc}
1646 | *
1647 | * _Note_: Using this variable is more efficient than calling
1648 | * `substr($doc, $pos, 1)`
1649 | *
1650 | * @var string
1651 | */
1652 | protected $char;
1653 |
1654 | protected $cursor;
1655 |
1656 | /**
1657 | * Parent node of the next node detected by the parser
1658 | *
1659 | * @var object
1660 | */
1661 | protected $parent;
1662 | protected $noise = array();
1663 |
1664 | /**
1665 | * Tokens considered blank in HTML
1666 | *
1667 | * @var string
1668 | */
1669 | protected $token_blank = " \t\r\n";
1670 |
1671 | /**
1672 | * Tokens to identify the equal sign for attributes, stopping either at the
1673 | * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1674 | * "<html>")
1675 | *
1676 | * @var string
1677 | */
1678 | protected $token_equal = ' =/>';
1679 |
1680 | /**
1681 | * Tokens to identify the end of a tag name. A tag name either ends on the
1682 | * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1683 | *
1684 | * @var string
1685 | */
1686 | protected $token_slash = " />\r\n\t";
1687 |
1688 | /**
1689 | * Tokens to identify the end of an attribute
1690 | *
1691 | * @var string
1692 | */
1693 | protected $token_attr = ' >';
1694 |
1695 | // Note that this is referenced by a child node, and so it needs to be
1696 | // public for that node to see this information.
1697 | public $_charset = '';
1698 | public $_target_charset = '';
1699 |
1700 | /**
1701 | * Innertext for <br> elements
1702 | *
1703 | * @var string
1704 | */
1705 | protected $default_br_text = '';
1706 |
1707 | /**
1708 | * Suffix for <span> elements
1709 | *
1710 | * @var string
1711 | */
1712 | public $default_span_text = '';
1713 |
1714 | /**
1715 | * Defines a list of self-closing tags (Void elements) according to the HTML
1716 | * Specification
1717 | *
1718 | * _Remarks_:
1719 | * - Use `isset()` instead of `in_array()` on array elements to boost
1720 | * performance about 30%
1721 | * - Sort elements by name for better readability!
1722 | *
1723 | * @link https://www.w3.org/TR/html HTML Specification
1724 | * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1725 | */
1726 | protected $self_closing_tags = array(
1727 | 'area' => 1,
1728 | 'base' => 1,
1729 | 'br' => 1,
1730 | 'col' => 1,
1731 | 'embed' => 1,
1732 | 'hr' => 1,
1733 | 'img' => 1,
1734 | 'input' => 1,
1735 | 'link' => 1,
1736 | 'meta' => 1,
1737 | 'param' => 1,
1738 | 'source' => 1,
1739 | 'track' => 1,
1740 | 'wbr' => 1
1741 | );
1742 |
1743 | /**
1744 | * Defines a list of tags which - if closed - close all optional closing
1745 | * elements within if they haven't been closed yet. (So, an element where
1746 | * neither opening nor closing tag is omissible consistently closes every
1747 | * optional closing element within)
1748 | *
1749 | * _Remarks_:
1750 | * - Use `isset()` instead of `in_array()` on array elements to boost
1751 | * performance about 30%
1752 | * - Sort elements by name for better readability!
1753 | */
1754 | protected $block_tags = array(
1755 | 'body' => 1,
1756 | 'div' => 1,
1757 | 'form' => 1,
1758 | 'root' => 1,
1759 | 'span' => 1,
1760 | 'table' => 1
1761 | );
1762 |
1763 | /**
1764 | * Defines elements whose end tag is omissible.
1765 | *
1766 | * * key = Name of an element whose end tag is omissible.
1767 | * * value = Names of elements whose end tag is omissible, that are closed
1768 | * by the current element.
1769 | *
1770 | * _Remarks_:
1771 | * - Use `isset()` instead of `in_array()` on array elements to boost
1772 | * performance about 30%
1773 | * - Sort elements by name for better readability!
1774 | *
1775 | * **Example**
1776 | *
1777 | * An `li` element’s end tag may be omitted if the `li` element is immediately
1778 | * followed by another `li` element. To do that, add following element to the
1779 | * array:
1780 | *
1781 | * ```php
1782 | * 'li' => array('li'),
1783 | * ```
1784 | *
1785 | * With this, the following two examples are considered equal. Note that the
1786 | * second example is missing the closing tags on `li` elements.
1787 | *
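1788 | * ```html
1789 | * <ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul>
1790 | * ```
1791 | *
1792 | * (all `li` elements are closed explicitly)
1793 | *
1794 | * ```html
1795 | * <ul><li>Coffee<li>Tea<li>Milk</ul>
1796 | * ```
1797 | *
1798 | * (each `li` element is closed implicitly by the next `li` element)
1799 | *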
1800 | * @var array A two-dimensional array where the key is the name of an
1801 | * element whose end tag is omissible and the value is an array of elements
1802 | * whose end tag is omissible, that are closed by the current element.
1803 | *
1804 | * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1805 | *
1806 | * @todo The implementation of optional closing tags doesn't work in all cases
1807 | * because it only considers elements that close other optional closing
1808 | * tags, not taking into account that some (non-blocking) tags should close
1809 | * these optional closing tags. For example, the end tag for "p" is omissible
1810 | * and can be closed by an "address" element, whose end tag is NOT omissible.
1811 | * Currently a "p" element without closing tag stops at the next "p" element
1812 | * or blocking tag, even if it contains other elements.
1813 | *
1814 | * @todo Known sourceforge issue #2977341
1815 | * B tags that are not closed cause us to return everything to the end of
1816 | * the document.
1817 | */
1818 | protected $optional_closing_tags = array(
1819 | // Not optional, see
1820 | // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1821 | 'b' => array('b' => 1),
1822 | 'dd' => array('dd' => 1, 'dt' => 1),
1823 | // Not optional, see
1824 | // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1825 | 'dl' => array('dd' => 1, 'dt' => 1),
1826 | 'dt' => array('dd' => 1, 'dt' => 1),
1827 | 'li' => array('li' => 1),
1828 | 'optgroup' => array('optgroup' => 1, 'option' => 1),
1829 | 'option' => array('optgroup' => 1, 'option' => 1),
1830 | 'p' => array('p' => 1),
1831 | 'rp' => array('rp' => 1, 'rt' => 1),
1832 | 'rt' => array('rp' => 1, 'rt' => 1),
1833 | 'td' => array('td' => 1, 'th' => 1),
1834 | 'th' => array('td' => 1, 'th' => 1),
1835 | 'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1836 | );
1837 |
/**
 * Creates the parser and optionally loads a document right away.
 *
 * If $str starts with "http://" or names an existing file, it is loaded
 * through load_file(); any other non-empty string is parsed as markup
 * through load().
 *
 * @param string|null $str Markup, file name or URL to load (optional)
 * @param bool $lowercase Passed on to load()
 * @param bool $forceTagsClosed When false, the list of optional closing
 * tags is emptied, i.e. the markup is trusted to close its tags properly.
 * @param string $target_charset Charset that text is converted to
 * @param bool $stripRN Passed on to load()
 * @param string $defaultBRText Passed on to load()
 * @param string $defaultSpanText Passed on to load()
 * @param int $options Passed on to load()
 */
function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0)
{
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    //
    // This has to target the declared $optional_closing_tags property
    // (the previous code assigned to an undeclared property, so the flag
    // was ignored) and has to happen before a document passed via $str is
    // loaded below, otherwise that document would be parsed with the
    // default list.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    }

    $this->_target_charset = $target_charset;
}
1870 |
/**
 * Calls clear() when the object is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1875 |
1876 | // load html from string
1877 | function load(
1878 | $str,
1879 | $lowercase = true,
1880 | $stripRN = true,
1881 | $defaultBRText = DEFAULT_BR_TEXT,
1882 | $defaultSpanText = DEFAULT_SPAN_TEXT,
1883 | $options = 0)
1884 | {
1885 | global $debug_object;
1886 |
1887 | // prepare
1888 | $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1889 |
1890 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1891 | // Script tags removal now preceeds style tag removal.
1892 | // strip out