├── README.md
├── URLResolver.php
├── composer.json
├── lib
    └── simple_html_dom.php
└── tools
    └── resolve_url.php


/README.md:
--------------------------------------------------------------------------------
  1 | Welcome to URLResolver.php
  2 | ====================================================
  3 | 
  4 | URLResolver.php is a PHP class that attempts to resolve URLs to a final,
  5 | canonical link. On the web today, link shorteners, tracking codes and more can
  6 | result in many different links that ultimately point to the same resource.
  7 | By following HTTP redirects and parsing web pages for open graph and canonical
  8 | URLs, URLResolver.php attempts to solve this issue.
  9 | 
 10 | ## Patterns Recognized
 11 | 
 12 | - Follows 301, 302, and 303 redirects found in HTTP headers
 13 | - Follows [Open Graph] URL &lt;meta&gt; tags found in web page &lt;head&gt;
 14 | - Follows [Canonical] URL &lt;link&gt; tags found in web page &lt;head&gt;
 15 | - Aborts download quickly if content type is not an HTML page
 16 | 
 17 | I am open to additional suggestions for improvement.
 18 | 
 19 | ## Usage
 20 | 
 21 | Resolving a URL can be as easy as:
 22 | 
 23 | ``` php
 24 | <?php require_once('URLResolver.php');
 25 | 
 26 | $resolver = new mattwright\URLResolver();
 27 | print $resolver->resolveURL('http://goo.gl/0GMP1')->getURL();
 28 | ```
 29 | 
 30 | If you installed this library using composer, you would change the first line above to:
 31 | 
 32 | ``` php
 33 | <?php require_once('vendor/autoload.php');
 34 | ```
 35 | 
 36 | However, in most cases you will want to perform a little extra setup. The
 37 | following code sets a user agent to identify your crawler (otherwise the
 38 | default will be used) and also designates a temporary file that can be used
 39 | for storing cookies during the session.  Some web sites will test the browser
 40 | for cookie support, so this will enhance your results.
 41 | 
 42 | ``` php
 43 | <?php require_once('URLResolver.php');
 44 | $resolver = new mattwright\URLResolver();
 45 | 
 46 | # Identify your crawler (otherwise the default will be used)
 47 | $resolver->setUserAgent('Mozilla/5.0 (compatible; YourAppName/1.0; +http://www.example.com)');
 48 | 
 49 | # Designate a temporary file that will store cookies during the session.
 50 | # Some web sites test the browser for cookie support, so this enhances results.
 51 | $resolver->setCookieJar('/tmp/url_resolver.cookies');
 52 | 
 53 | # resolveURL() returns an object that allows for additional information.
 54 | $url = 'http://goo.gl/0GMP1';
 55 | $url_result = $resolver->resolveURL($url);
 56 | 
 57 | # Test to see if any error occurred while resolving the URL:
 58 | if ($url_result->didErrorOccur()) {
 59 | 	print "there was an error resolving $url:\n  ";
 60 | 	print $url_result->getErrorMessageString();
 61 | }
 62 | 
 63 | # Otherwise, print out the resolved URL.  The [HTTP status code] will tell you
 64 | # additional information about the success/failure. For instance, if the
 65 | # link resulted in a 404 Not Found error, it would print '404: http://...'
 66 | # The successful status code is 200.
 67 | else {
 68 | 	print $url_result->getHTTPStatusCode();
 69 | 	print ': ';
 70 | 	print $url_result->getURL();
 71 | }
 72 | ```
 73 | 
 74 | ## Installation and Requirements  
 75 |   
 76 | #### License
 77 | URLResolver.php is licensed under the [MIT License], viewable in the source code.
 78 | 
 79 | #### Install with Composer
 80 | [composer require mattwright/urlresolver](https://packagist.org/packages/mattwright/urlresolver)
 81 | 
 82 | #### Download
 83 | URLResolver.php as a [.tar.gz](https://github.com/mattwright/URLResolver.php/tarball/master) or [.zip](https://github.com/mattwright/URLResolver.php/zipball/master) file.
 84 | 
 85 | #### Requirements
 86 | - The [curl](http://php.net/manual/en/book.curl.php) extension must be installed as part of PHP
 87 | - [PHP Simple HTML DOM Parser](http://simplehtmldom.sourceforge.net/) is required and included with the download.
 88 | 
 89 | ## API
 90 | 
 91 | ### URLResolver()
 92 | 
 93 | `$resolver = new mattwright\URLResolver();`  
 94 | Create the URL resolver object that you call additional methods on.
 95 | 
 96 | `$resolver->resolveURL($url);`  
 97 | $url is the link you want to resolve.  
 98 | Returns a URLResolverResult object that contains the final, resolved URL.
 99 | 
100 | `$resolver->setUserAgent($user_agent);`  
101 | Pass in a string that is sent to each web server to identify your crawler.
102 | 
103 | `$resolver->setCookieJar($cookie_file);  # Defaults to disable cookies`  
104 | *** This file will be removed at the end of each resolveURL() call. ***  
105 | Pass in the path to a file used to store cookies during each resolveURL() call.  
106 | If no cookie file is set, cookies will be disabled and results may suffer.  
107 | This file must not already exist.
108 | If it does, pass _true_ as second argument to enable overwrite.
109 | 
110 | `$resolver->setMaxRedirects($max_redirects);  # Defaults to 10`  
111 | Set the maximum number of URL requests to attempt during each resolveURL() call.
112 | 
113 | `$resolver->setMaxResponseDataSize($max_bytes);  # Defaults to 120000`  
114 | Pass in an integer specifying the maximum data to download per request.  
115 | Multiple URL requests may occur during each resolveURL() call.  
116 | Setting this too low may limit the usefulness of results (default 120000).
117 | 
118 | `$resolver->setRequestTimeout($num_seconds);  # Defaults to 30`  
119 | Set the maximum amount of time, in seconds, any URL request can take.  
120 | Multiple URL requests may occur during each resolveURL() call.
121 | 
122 | `$resolver->setPreferCanonicalURL($value);  # Defaults to false`  
123 | Set $value to _true_ to prioritize canonical URL over Open Graph URL.
124 | 
125 | `$resolver->isDebugMode($value);  # Defaults to false`  
126 | Set $value to _true_ to enable debug mode and _false_ to disable (the default).  
127 | This will print out each link visited, along with status codes and link types.
128 | 
129 | ### URLResolverResult()
130 | 
131 | `$url_result = $resolver->resolveURL($url);`  
132 | Retrieve the URLResolverResult() object representing the resolution of $url.
133 | 
134 | `$url_result->getURL();`  
135 | This is the best resolved URL we could obtain after following redirects.
136 | 
137 | `$url_result->getHTTPStatusCode();`  
138 | Returns the integer [HTTP status code] for the resolved URL.  
139 | Examples: 200 - OK (success), 404 - Not Found, 301 - Moved Permanently, ...
140 | 
141 | `$url_result->hasSuccessHTTPStatus();`  
142 | Returns _true_ if the [HTTP status code] for the resolved URL is 200.
143 | 
144 | `$url_result->hasRedirectHTTPStatus();`  
145 | Returns _true_ if the [HTTP status code] for the resolved URL is 301, 302, or 303.
146 | 
147 | `$url_result->getContentType();`  
148 | Returns the value of the Content-Type [HTTP header] for the resolved URL.  
149 | If header not provided, _null_ is returned. Examples: text/html, image/jpeg, ...
150 | 
151 | `$url_result->getContentLength();`  
152 | Returns the size of the fetched URL in bytes for the resolved URL.  
153 | Determined only by the Content-Length [HTTP header]. _null_ returned otherwise.
154 | 
155 | `$url_result->isOpenGraphURL();`  
156 | Returns _true_ if resolved URL was marked as the Open Graph URL (og:url)
157 | 
158 | `$url_result->isCanonicalURL();`  
159 | Returns _true_ if resolved URL was marked as the Canonical URL (rel=canonical)
160 | 
161 | `$url_result->isStartingURL();`  
162 | Returns _true_ if resolved URL was also the URL you passed to resolveURL().
163 | 
164 | `$url_result->didErrorOccur();`  
165 | Returns _true_ if an error occurred while resolving the URL.  
166 | If this returns _false_, $url_result is guaranteed to have a status code.
167 | 
168 | `$url_result->getErrorMessageString();`  
169 | Returns an explanation of what went wrong if didErrorOccur() returns _true_.
170 | 
171 | `$url_result->didConnectionFail();`  
172 | Returns _true_ if there was a connection error (no header or no body returned).  
173 | May indicate a situation where you are more likely to try at least once more.  
174 | If this returns _true_, didErrorOccur() will return _true_ as well.
175 | 
176 | ## Changelog
177 | - v2.0 - January 17, 2019
178 | 	- Breaking change: namespaced the library for use with composer psr-4
179 | 	- Add requested option to prefer canonical URL over Open Graph
180 | 	- Minor fixes / improvements
181 | 	- Upgrade simple_html_dom to 1.8.1
182 | 
183 | - v1.1 - June 3, 2014
184 | 	- Support http redirect code 303
185 | 
186 | - v1.0 - December 3, 2011
187 | 	- Initial release supports http header redirects, og:url and rel=canonical
188 | 
189 | [curl]: http://php.net/manual/en/book.curl.php
190 | [PHP Simple HTML DOM Parser]: http://simplehtmldom.sourceforge.net/
191 | [Open Graph]: https://developers.facebook.com/docs/opengraph/
192 | [Canonical]: http://www.google.com/support/webmasters/bin/answer.py?answer=139394
193 | [HTTP status code]: http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
194 | [HTTP header]: http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html
195 | [MIT License]: http://en.wikipedia.org/wiki/MIT_License
196 | 


--------------------------------------------------------------------------------
/URLResolver.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /*******************************************************************************
  3 | * Copyright (c) 2011-2019 by Matt Wright and contributors
  4 | * https://github.com/mattwright/URLResolver.php
  5 | *
  6 | * Licensed under The MIT License
  7 | * Redistributions of files must retain the below copyright notice.
  8 | *
  9 | * Permission is hereby granted, free of charge, to any person obtaining a copy
 10 | * of this software and associated documentation files (the "Software"), to deal
 11 | * in the Software without restriction, including without limitation the rights
 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 | * copies of the Software, and to permit persons to whom the Software is
 14 | * furnished to do so, subject to the following conditions:
 15 | *
 16 | * The above copyright notice and this permission notice shall be included in
 17 | * all copies or substantial portions of the Software.
 18 | *
 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 25 | * THE SOFTWARE.
 26 | *******************************************************************************/
 27 | namespace mattwright;
 28 | require_once(__DIR__.'/lib/simple_html_dom.php');
 29 | 
 30 | class URLResolver {
 31 | 	private $curl;
 32 | 	private $html_dom;
 33 | 
 34 | 	private $is_debug = false;
 35 | 
 36 | 	private $user_agent;
 37 | 	private $cookie_jar;
 38 | 	private $request_timeout = 30;
 39 | 	private $max_redirects = 10;
 40 | 	private $max_response_data_size = 120000;
 41 | 	private $prefer_canonical_url = false;
 42 | 
	# Set the User-Agent string that is sent with each request.
	# Throws \Exception unless $user_agent_string is a non-empty string.
	# The curl handle is closed afterwards so the next request picks up the
	# new setting.
	public function setUserAgent($user_agent_string) {
		if (!is_string($user_agent_string) || !strlen($user_agent_string)) {
			# Fully qualified: this file lives in the mattwright namespace, so a bare
			# "Exception" would resolve to the non-existent mattwright\Exception.
			throw new \Exception('URLResolver->setUserAgent() must be called with a string');
		}

		$this->user_agent = $user_agent_string;

		$this->closeCurl(); # Reset curl with new settings...
	}

	# You must pass this function a filename to use as the cookie jar.
	# An \Exception will be thrown if the file exists (unless $overwrite is
	# true) or cannot be created.
	# This is because the library will also remove this file at the end
	# and we don't want to accidentally destroy any of your files.
	public function setCookieJar($cookie_jar_filename, $overwrite = false) {
		if (!$overwrite && file_exists($cookie_jar_filename)) {
			throw new \Exception("URLResolver->setCookieJar() found existing file $cookie_jar_filename.\nPass true as second argument to overwrite and delete.");
		}

		# Writing an empty file proves the location is writable before we rely on it.
		if (file_put_contents($cookie_jar_filename, '') === false) {
			throw new \Exception("URLResolver->setCookieJar() could not write to $cookie_jar_filename");
		}

		$this->cookie_jar = $cookie_jar_filename;

		# closeCurl() also removes the (empty) cookie jar file created above.
		$this->closeCurl(); # Reset curl with new settings...
	}
 71 | 
	# Set the maximum time, in seconds, that any single URL request may take.
	# The same value is used for the connect timeout and the total timeout.
	# Throws \Exception unless $seconds is an integer or an integer-valued
	# numeric (e.g. 5, '5', 5.0).
	public function setRequestTimeout($seconds) {
		if (!is_numeric($seconds) || (int)$seconds != $seconds) {
			# Fully qualified: a bare "Exception" would resolve to the
			# non-existent mattwright\Exception inside this namespace.
			throw new \Exception('URLResolver->setRequestTimeout() must be called with an integer');
		}

		$this->request_timeout = (int)$seconds;

		$this->closeCurl(); # Reset curl with new settings...
	}
 82 | 
	# Set the maximum number of URL requests attempted by one resolveURL() call.
	# Any numeric value is accepted and cast to int; anything else throws
	# \Exception.
	public function setMaxRedirects($max_redirects) {
		if (!is_numeric($max_redirects)) {
			# Fully qualified: a bare "Exception" would resolve to the
			# non-existent mattwright\Exception inside this namespace.
			throw new \Exception('URLResolver->setMaxRedirects() must be called with an integer');
		}

		$this->max_redirects = (int)$max_redirects;

		$this->closeCurl(); # Reset curl with new settings...
	}
 93 | 
	# Set the maximum number of bytes (headers plus body) downloaded per request.
	# Any numeric value is accepted and cast to int; anything else throws
	# \Exception.
	public function setMaxResponseDataSize($max_bytes) {
		if (!is_numeric($max_bytes)) {
			# Fully qualified: a bare "Exception" would resolve to the
			# non-existent mattwright\Exception inside this namespace.
			throw new \Exception('URLResolver->setMaxResponseDataSize() must be called with an integer');
		}

		$this->max_response_data_size = (int)$max_bytes;

		$this->closeCurl(); # Reset curl with new settings...
	}
104 | 
105 | 	public function setPreferCanonicalURL($value) {
106 | 		$this->prefer_canonical_url = $value ? true : false;
107 | 	}
108 | 
109 | 	public function isDebugMode($value) {
110 | 		if (isset($value)) { $this->is_debug = $value ? true : false; }
111 | 		return $this->is_debug;
112 | 	}
113 | 
	# Resolve $url by fetching it and then following each redirect target
	# (HTTP Location, og:url, rel=canonical, meta refresh) reported by
	# fetchURLResult(), for at most $this->max_redirects requests.
	#
	# Returns the URLResolverResult selected by resolveURLResults() from all
	# results collected so far, or the failing result as soon as a fetch reports
	# an error. resolveURLResults() returns null when no results were collected
	# (for example when max_redirects is less than 1).
	public function resolveURL($url) {
		$starting_url = $url;

		# Whether the URL about to be fetched was discovered via og:url / rel=canonical
		# markup on the previously fetched page.
		$url_is_open_graph = false;
		$url_is_canonical = false;

		$url_results = array();
		for ($i = 0; $i < $this->max_redirects; $i++) {
			# During debug mode, print out each URL that we visit.
			if ($this->is_debug) {
				if ($i) { print ' |- '; }
				print $url;
			}

			# Fetch the redirect information...
			$url_result = $this->fetchURLResult($url);

			# Mark this as the starting URL if it is the first or equals that URL
			if ($i == 0 || $url == $starting_url) { $url_result->isStartingURL(true); }

			# If we followed this URL because of some HTML markup, note that...
			# Don't allow it to overwrite a true value determined from markup with a false value...
			if (!$url_result->isOpenGraphURL()) { $url_result->isOpenGraphURL($url_is_open_graph); }
			if (!$url_result->isCanonicalURL()) { $url_result->isCanonicalURL($url_is_canonical); }

			# Also print a short status line regarding the URL once it is fetched
			if ($this->is_debug) {
				print ' ' . $url_result->debugStatus() . "\n";
			}

			# If an error occurs during the processing of this url, return
			# the result when that error happens
			if ($url_result->didErrorOccur()) {
				if ($this->is_debug) {
					print ' |! ' . $url_result->getURL() . ' ' . $url_result->debugStatus() . "\n";
					if ($url_result->didErrorOccur()) { print ' \->  ' . $url_result->getErrorMessageString() . "\n"; }
					print "\n";
				}

				# This path bypasses resolveURLResults(), so reset curl here instead.
				$this->closeCurl();
				return $url_result;
			}

			# Loop detection: compare the current result and the next target
			# against every result collected on earlier iterations. The current
			# result has not been pushed yet, so it is excluded from the results
			# handed to resolveURLResults() when a loop is detected.
			$next_url = $url_result->getRedirectTarget();
			$next_url_visited_count = 0;
			foreach ($url_results as $previous_result) {

				# If this result was for the same URL with the same status, then we have looped.
				# We need to check the status as well, because in some cases we may get
				# multiple redirected to establish cookies (New York Times) and so when we
				# return to the same page, we will have a different status (200 instead of 301)
				# and we will still want to check for the og:url in that case...
				if ($previous_result->getURL() == $url_result->getURL() &&
				    $previous_result->getHTTPStatusCode() == $url_result->getHTTPStatusCode()) {
					return $this->resolveURLResults($url_results);
				}

				# If the next URL to fetch has been previously fetched, decide whether to try again
				if (isset($next_url) && $next_url == $previous_result->getURL()) {
					$next_url_visited_count++;

					# We are done if we have already visited this URL twice. (looped)
					if ($next_url_visited_count > 1) {
						return $this->resolveURLResults($url_results);
					}

					# We are also done if we have been to this URL and it wasn't a redirect
					# (it could have been an og:url or a rel=canonical) (looped)
					if (!$previous_result->hasRedirectHTTPStatus()) {
						return $this->resolveURLResults($url_results);
					}
				}
			}

			array_push($url_results, $url_result);

			# If there is no next URL set, we're done.
			if (!isset($next_url)) {
				return $this->resolveURLResults($url_results);
			}

			# Carry the markup flags of the redirect target over to the next iteration.
			$url = $next_url;
			$url_is_open_graph = $url_result->redirectTargetIsOpenGraphURL();
			$url_is_canonical = $url_result->redirectTargetIsCanonicalURL();
		}

		# The request limit was reached; pick the best result seen so far.
		return $this->resolveURLResults($url_results);
	}
202 | 
203 | 	private function resolveURLResults($url_results) {
204 | 		# If no URL results were found, return null as failure...
205 | 		if (!isset($url_results) || count($url_results) < 1) {
206 | 			return null;
207 | 		}
208 | 
209 | 		$fail_url_result = $redirect_url_result = null;
210 | 		$ok_url_result = $og_url_result = $canonical_url_result = null;
211 | 
212 | 		foreach (array_reverse($url_results) as $url_result) {
213 | 			if ($url_result->hasSuccessHTTPStatus()) {
214 | 				if ($url_result->isOpenGraphURL() && !$og_url_result) {
215 | 					$og_url_result = $url_result;
216 | 				}
217 | 				else if ($url_result->isCanonicalURL() && !$canonical_url_result) {
218 | 					$canonical_url_result = $url_result;
219 | 				}
220 | 				else if (!$ok_url_result) {
221 | 					$ok_url_result = $url_result;
222 | 				}
223 | 			}
224 | 
225 | 			# If the URL had a redirect status, then we set that result type
226 | 			else if ($url_result->hasRedirectHTTPStatus()) {
227 | 				if (!$redirect_url_result) {
228 | 					$redirect_url_result = $url_result;
229 | 				}
230 | 			}
231 | 
232 | 			# Only set a failure result if it didn't have success or redirect status code
233 | 			else if (!$fail_url_result) {
234 | 				$fail_url_result = $url_result;
235 | 			}
236 | 		}
237 | 
238 | 		# Start with the result from our initial url
239 | 		$return_url = $url_results[0];
240 | 
241 | 		# The primary goal is to return a valid og:url...
242 | 		if (isset($og_url_result)) {
243 | 			$return_url = $og_url_result;
244 | 		}
245 | 
246 | 		# Canonical URLs are a close second...
247 | 		else if (isset($canonical_url_result)) {
248 | 			$return_url = $canonical_url_result;
249 | 		}
250 | 
251 | 		# Following that, we will take any status=200 link
252 | 		else if (isset($ok_url_result)) {
253 | 			$return_url = $ok_url_result;
254 | 		}
255 | 
256 | 		# Better to end in an HTTP failure than a redirect
257 | 		else if (isset($fail_url_result)) {
258 | 			$return_url = $fail_url_result;
259 | 		}
260 | 
261 | 		# We will still take the deepest redirect found if needed
262 | 		else if (isset($redirect_url_result)) {
263 | 			$return_url = $redirect_url_result;
264 | 		}
265 | 
266 | 		# Reset everything so that we start with a clean slate, cookie jar, etc. next time
267 | 		$this->closeCurl();
268 | 
269 | 		if ($this->is_debug) {
270 | 			print ' |> ' . $return_url->getURL() . ' ' . $return_url->debugStatus() . "\n\n";
271 | 		}
272 | 
273 | 		return $return_url;
274 | 	}
275 | 
	# Download $url with the shared curl handle and split the raw response into
	# its header block and body.
	#
	# Returns array($headers, $body). Both entries are null when no complete
	# header block (terminated by a blank line) was received.
	private function fetchURL($url) {
		$curl = $this->initCurl();
		curl_setopt($curl, CURLOPT_URL, $url);

		# Limit the download (headers plus body) to $this->max_response_data_size bytes
		$headers = $body = '';
		$header_length = 0;
		$max_data_length = $this->max_response_data_size;
		# The callback receives the response in chunks (headers included, see
		# CURLOPT_HEADER in initCurl()). Returning 0 instead of the chunk length
		# makes curl abort the transfer.
		curl_setopt($curl, CURLOPT_WRITEFUNCTION, function($handle, $data) use (&$headers, &$body, &$header_length, $max_data_length) {
			$body .= $data;

			# Until the header terminator has been seen, $body holds everything received so far
			if ($headers == '') {
				$headers_end = strpos($body, "\r\n\r\n");
				if ($headers_end !== false) {
					$header_length = $headers_end;
					$headers = substr($body, 0, $header_length);
					$body = substr($body, $header_length + 4);


					# Now that we have headers, if the content type is not HTML, we do
					# not need to download anything else. Prevents us from downloading
					# images, videos, PDFs, etc. that won't contain redirects

					# Until PHP 5.4, you can't import $this lexical variable into a closure,
					# so we will need to duplicate code from contentTypeFromHeader()
					# and hasHTMLContentType()
					if (preg_match('/^\s*Content-Type:\s*([^\s;\n]+)/im', $headers, $matches)) {
						if (stripos($matches[1], 'html') === false) { return 0; }
					}
				}
			}

			# If we have downloaded the maximum amount of content, we're done.
			if (($header_length + strlen($body)) > $max_data_length) { return 0; }

			return strlen($data);
		});


		# The return value is ignored on purpose: an aborted transfer is expected
		# and whatever was captured by the callback is still used.
		curl_exec($curl);

		if ($headers === '') { return array(null, null); }

		return array($headers, $body);
	}
321 | 
322 | 	private function fullyQualifyURI($uri, $url) {
323 | 		$uri = trim($uri);
324 | 
325 | 		# Only use this if it looks like a URL/URI (starts with /, www., or https?://)
326 | 		# Otherwise, we won't be able to understand it.
327 | 		if (!preg_match('/^(\/|www\.|https?:\/\/)/i', $uri)) { return null; }
328 | 
329 | 		# If the link is to a domain only, we will standardize it by ensuring a trailing slash
330 | 		if (preg_match('/^(\/|https?:\/\/)[^\/]+$/i', $uri)) { $uri .= '/'; }
331 | 
332 | 		# If the URL is localized, such as '/path/to/file', add the protocol and host back to the start.
333 | 		if (strpos($uri, '/') === 0) {
334 | 
335 | 			# If a URI starts with //, then it means there is another domain, but use same protocol
336 | 			if (strpos($uri, '//') === 0) {
337 | 				if (preg_match('/^\s*([a-z]+:)/', $url, $matches)) {
338 | 					$uri = $matches[1] . $uri;
339 | 				}
340 | 			}
341 | 
342 | 			# Otherwise, add in the entire domain as well
343 | 			else {
344 | 				if (preg_match('/^\s*([a-z]+:\/\/[^\/]+)/', $url, $matches)) {
345 | 					$uri = $matches[1] . $uri;
346 | 				}
347 | 			}
348 | 		}
349 | 
350 | 		# In the wild, finding several URLs that start with www. and no scheme. Add protocol.
351 | 		if (strpos($uri, 'www.') === 0) {
352 | 			if (preg_match('/^\s*([a-z]+:\/\/)/', $url, $matches)) {
353 | 				$uri = $matches[1] . $uri;
354 | 			}
355 | 		}
356 | 
357 | 		# If the URL had a hash fragment attached to it and the URI no longer does, we will add it back in.
358 | 		$fragment_pos = strpos($url, '#');
359 | 		if ($fragment_pos !== false && strpos($uri, '#') === false) {
360 | 			$uri .= substr($url, $fragment_pos);
361 | 		}
362 | 
363 | 		return $uri;
364 | 	}
365 | 
366 | 	private function contentTypeFromHeader($headers) {
367 | 		if (preg_match('/^\s*Content-Type:\s*([^\s;\n]+)/im', $headers, $matches)) {
368 | 			return $matches[1];
369 | 		}
370 | 		return null;
371 | 	}
372 | 
373 | 	private function fetchURLResult($url) {
374 | 		$result = new URLResolverResult($url);
375 | 
376 | 		# Attempt to fetch the headers for this URL
377 | 		list($headers, $body) = $this->fetchURL($url);
378 | 		if (!isset($headers)) {
379 | 			$result->didConnectionFail(true, 'Could not retrieve headers');
380 | 			return $result;
381 | 		}
382 | 
383 | 		# Parse the headers...
384 | 		if (preg_match('/^\s*HTTP\/[\d\.]+\s+(\d+)/i', $headers, $matches)) {
385 | 			$result->setHTTPStatusCode($matches[1]);
386 | 		}
387 | 		else {
388 | 			$result->didFatalErrorOccur(true, 'HTTP status code not found');
389 | 		}
390 | 
391 | 		$result->setContentType($this->contentTypeFromHeader($headers));
392 | 
393 | 		if (preg_match('/^\s*Content-Length:\s*(\d+)/im', $headers, $matches)) {
394 | 			$result->setContentLength($matches[1]);
395 | 		}
396 | 
397 | 		if ($result->hasRedirectHTTPStatus()) {
398 | 			# Parse the location header to determine the redirect URL...
399 | 			if (preg_match('/^\s*Location:\s*([^\r\n]+)/im', $headers, $matches)) {
400 | 				$result->setRedirectTarget($this->fullyQualifyURI($matches[1], $url));
401 | 			}
402 | 
403 | 			# A redirect status code with no location header is a fatal error...
404 | 			else { $result->didFatalErrorOccur(true, 'HTTP status code indicated redirect, no location found'); }
405 | 
406 | 			return $result;
407 | 		}
408 | 
409 | 		else if ($result->hasSuccessHTTPStatus()) {
410 | 
411 | 			# If the content type for this page is something other than HTML, we do
412 | 			# not need to fetch it. This test will catch text/html and text/xhtml, etc.
413 | 			if (!$result->hasHTMLContentType()) { return $result; }
414 | 
415 | 			if (!$body) {
416 | 				$result->didConnectionFail(true, 'Web page was empty');
417 | 				return $result;
418 | 			}
419 | 
420 | 			# Load the HTML DOM using PHP Simple HTML DOM
421 | 			$html_dom = $this->loadHTMLDOM($body);
422 | 
423 | 			# If the DOM could not be parsed, mark it as a fatal error. Reasonable
424 | 			# HTTP redirects may be available, but this notes it didn't get everything
425 | 			if (!$html_dom) {
426 | 				$result->didFatalErrorOccur(true, 'Could not parse web page');
427 | 				$this->closeHTMLDOM();
428 | 				return $result;
429 | 			}
430 | 
431 | 			# If we cannot find the <head>, then we are done processing this page.
432 | 			$head = $html_dom->find('head', 0);
433 | 			if (!isset($head)) {
434 | 				# If there is no <head> and no <body> tag, then we will look for an instant
435 | 				# <meta http-equiv="refresh" tag and use that if found... We don't want to
436 | 				# just always use the meta tag, as it is often used for noscript browsers
437 | 				# or for long-delayed page reloads... But some pages do return just a very
438 | 				# short noscript/meta refresh (t.co/pic.twitter.com) and it is good to catch
439 | 				$body_tag = $html_dom->find('body', 0);
440 | 				if (!isset($body_tag)) {
441 | 					$meta_refresh_tag = $html_dom->find('meta[http-equiv=refresh]', 0);
442 | 					if (isset($meta_refresh_tag->content) &&
443 | 				    	preg_match('/^\s*(\d+)\s*;\s*URL=(.*)/i', $meta_refresh_tag->content, $matches)) {
444 | 							if (!$matches[1] <= 2) {
445 | 								$result->setRedirectTarget($this->fullyQualifyURI($matches[2], $url));
446 | 							}
447 | 					}
448 | 				}
449 | 
450 | 				# Don't mark as failed, some pages may just not have a <head>, but rare...
451 | 				$this->closeHTMLDOM();
452 | 				return $result;
453 | 			}
454 | 
455 | 			# Determine if there are any redirects in the meta/link tags (og:url or rel=canonical)
456 | 			$redirect_url = null;
457 | 
458 | 			# Locate the Open Graph URL meta tag and extract URL
459 | 			$og_tag = $head->find('meta[property=og:url]', 0);
460 | 			$og_url = (isset($og_tag) && isset($og_tag->content)) ?
461 | 				$this->fullyQualifyURI($og_tag->content, $url) : null;
462 | 
463 | 			if (isset($og_url)) {
464 | 				$redirect_url = $og_url;
465 | 				$result->redirectTargetIsOpenGraphURL(true);
466 | 			}
467 | 
468 | 			# Open Graph takes precedence over Canonical, but it can be both...
469 | 			$canonical_tag = $head->find('link[rel=canonical]', 0);
470 | 			$canonical_url = ((isset($canonical_tag) && isset($canonical_tag->href))) ?
471 | 				$this->fullyQualifyURI($canonical_tag->href, $url) : null;
472 | 
473 | 			if (isset($canonical_url)) {
474 | 				if (isset($redirect_url)) {
475 | 					if ($canonical_url == $redirect_url) {
476 | 						$result->redirectTargetIsCanonicalURL(true);
477 | 					}
478 | 
479 | 					# If setPreferCanonicalURL(true) was called, then we use it over Open Graph
480 | 					else if ($this->prefer_canonical_url) {
481 | 						$redirect_url = $canonical_url;
482 | 						$result->redirectTargetIsCanonicalURL(true);
483 | 						$result->redirectTargetIsOpenGraphURL(false);
484 | 					}
485 | 				}
486 | 				else {
487 | 					$redirect_url = $canonical_url;
488 | 					$result->redirectTargetIsCanonicalURL(true);
489 | 				}
490 | 			}
491 | 
492 | 			# If a redirect was found, set the target and return it
493 | 			if ($redirect_url) {
494 | 				# If the redirect URL is the same as the current URL, don't set it, but update values.
495 | 				if ($redirect_url === $url) {
496 | 					$result->isOpenGraphURL($result->redirectTargetIsOpenGraphURL());
497 | 					$result->isCanonicalURL($result->redirectTargetIsCanonicalURL());
498 | 				}
499 | 				else {
500 | 					$result->setRedirectTarget($redirect_url);
501 | 				}
502 | 			}
503 | 
504 | 			$this->closeHTMLDOM();
505 | 			return $result;
506 | 		}
507 | 
508 | 		# Link had some other status code besides redirect or status...
509 | 		$this->closeHTMLDOM();
510 | 		return $result;
511 | 	}
512 | 
513 | 	private function initCurl() {
514 | 		# If curl has already been initialized (and not closed), just return the handle
515 | 		if (isset($this->curl)) { return $this->curl; }
516 | 
517 | 		$this->curl = curl_init();
518 | 
519 | 		curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, true);
520 | 		curl_setopt($this->curl, CURLOPT_TIMEOUT, $this->request_timeout);
521 | 		curl_setopt($this->curl, CURLOPT_CONNECTTIMEOUT, $this->request_timeout);
522 | 
523 | 		if (isset($this->cookie_jar)) {
524 | 			curl_setopt($this->curl, CURLOPT_COOKIEJAR, $this->cookie_jar);
525 | 			curl_setopt($this->curl, CURLOPT_COOKIEFILE, $this->cookie_jar);
526 | 		}
527 | 
528 | 		# If a User Agent has been set, set the curl option
529 | 		if (isset($this->user_agent)) {
530 | 			curl_setopt($this->curl, CURLOPT_USERAGENT, $this->user_agent);
531 | 		}
532 | 
533 | 		# We are not too concerned about the strictness of SSL when finding redirects
534 | 		# Without these, some SSL links just fail to return anything
535 | 		curl_setopt($this->curl, CURLOPT_SSL_VERIFYHOST, 0); 
536 | 		curl_setopt($this->curl, CURLOPT_SSL_VERIFYPEER, 0); 
537 | 
538 | 		# We want the headers returned to us to follow redirects
539 | 		curl_setopt($this->curl, CURLOPT_HEADER, true); 
540 | 
541 | 		return $this->curl;
542 | 	}
543 | 
	# Discard the current curl handle (if one is open) and delete the cookie jar
	# file (if one is configured and exists), so the next request starts with a
	# fresh handle and no cookies.
	private function closeCurl() {
		if (isset($this->curl)) {
			curl_close($this->curl);
			unset($this->curl);
		}

		# Empty the cookie jar (this deletes the file)
		# NOTE(review): curl presumably writes the cookie jar file when the handle
		# is closed, so this removal has to stay after the block above - confirm.
		if (isset($this->cookie_jar) && file_exists($this->cookie_jar)) {
			unlink($this->cookie_jar);
		}
	}
555 | 
556 | 	private function loadHTMLDOM($html_content) {
557 | 		if (isset($this->html_dom)) {
558 | 			$this->closeHTMLDOM();
559 | 		}
560 | 
561 | 		$this->html_dom = new \simple_html_dom();
562 | 		$this->html_dom->load($html_content);
563 | 		return $this->html_dom;
564 | 	}
565 | 
566 | 	private function closeHTMLDOM() {
567 | 		if (isset($this->html_dom)) {
568 | 			$this->html_dom->clear();
569 | 			unset($this->html_dom);
570 | 		}
571 | 	}
572 | }
573 | 
# Describes the outcome of resolving one URL: the URL itself, the HTTP status
# and header values recorded for it, the flags saying how it was found
# (starting point, og:url, rel=canonical), the redirect target recorded for it
# and any error that was flagged.
#
# Most flag methods double as getter and setter: called without an argument
# they return the flag, called with a non-null argument they first store it
# (converted to a boolean) and then return it.
class URLResolverResult {
	private $url;
	private $status;
	private $content_type;
	private $content_length;

	private $is_starting_point = false;
	private $is_open_graph = false;
	private $is_canonical = false;

	private $redirect;
	private $redirect_is_open_graph = false;
	private $redirect_is_canonical = false;

	private $failed = false;
	private $error = false;
	private $error_message = '';

	public function __construct($url) {
		$this->url = $url;
	}

	# This is the best resolved URL we could obtain after following redirects.
	public function getURL() { return $this->url; }

	# Returns the integer [HTTP status code] for the resolved URL.
	# Examples: 200: OK (success), 404: Not Found, 301: Moved Permanently, ...
	public function getHTTPStatusCode() { return $this->status; }
	public function setHTTPStatusCode($status) { $this->status = $status; }

	# Returns _true_ if the [HTTP status code] for the resolved URL is 200.
	public function hasSuccessHTTPStatus() { return ($this->status == 200); }

	# Returns _true_ if the [HTTP status code] for the resolved URL is 301, 302 or 303.
	public function hasRedirectHTTPStatus() {
		return ($this->status == 301 || $this->status == 302 || $this->status == 303);
	}

	# Returns the value of the Content-Type [HTTP header] for the resolved URL.
	# If header not provided, _null_ is returned. Examples: text/html, image/jpeg, ...
	public function getContentType() { return $this->content_type; }
	public function setContentType($type) { $this->content_type = $type; }

	# Returns _true_ if $type (or, when omitted, the stored Content-Type) contains
	# "html", compared case-insensitively. Returns _false_ when no type is known.
	public function hasHTMLContentType($type=null) {
		if (!isset($type)) { $type = $this->content_type; }

		# Nothing to inspect. stripos() must not be handed null: that is
		# deprecated since PHP 8.1, and the answer is false either way.
		if (!isset($type)) { return false; }

		return (stripos($type, 'html') !== false);
	}

	# Returns the size of the fetched URL in bytes for the resolved URL.
	# Determined only by the Content-Length [HTTP header]. _null_ returned otherwise.
	public function getContentLength() { return $this->content_length; }
	public function setContentLength($length) { $this->content_length = $length; }

	# Returns true if resolved URL was marked as the Open Graph URL (og:url)
	public function isOpenGraphURL($value=null) {
		if (isset($value)) { $this->is_open_graph = (bool) $value; }
		return $this->is_open_graph;
	}

	# Returns true if resolved URL was marked as the Canonical URL (rel=canonical)
	public function isCanonicalURL($value=null) {
		if (isset($value)) { $this->is_canonical = (bool) $value; }
		return $this->is_canonical;
	}

	# Returns true if resolved URL was also the URL you passed to resolveURL().
	public function isStartingURL($value=null) {
		if (isset($value)) { $this->is_starting_point = (bool) $value; }
		return $this->is_starting_point;
	}

	# Returns true if an error occurred while resolving the URL.
	# If this returns false, $url_result is guaranteed to have a status code.
	public function didErrorOccur() {
		return ($this->error || $this->failed);
	}

	# Returns an explanation of what went wrong if didErrorOccur() returns true.
	# Always returns a string: an error flagged without a message yields ''.
	public function getErrorMessageString() {
		if (!$this->error && !$this->failed) { return ''; }
		return (string) $this->error_message;
	}

	# Returns _true_ if there was a connection error (no header or no body returned).
	# May indicate a situation where you are more likely to try at least once more.
	# If this returns _true_, didErrorOccur() will return true as well.
	# Passing a non-null $value stores the flag and replaces the stored error
	# message with $message.
	public function didConnectionFail($value=null, $message=null) {
		if (isset($value)) {
			$this->failed = (bool) $value;
			$this->error_message = $message;
		}
		return $this->failed;
	}

	# Returns _true_ if a fatal error was flagged for this URL.
	# Passing a non-null $value stores the flag and replaces the stored error
	# message with $message.
	public function didFatalErrorOccur($value=null, $message=null) {
		if (isset($value)) {
			$this->error = (bool) $value;
			$this->error_message = $message;
		}
		return $this->error;
	}

	# The redirect target recorded for this URL (_null_ if none was set).
	public function getRedirectTarget() { return $this->redirect; }
	public function setRedirectTarget($url) { $this->redirect = $url; }

	# Returns true if the redirect target was marked as an Open Graph URL (og:url)
	public function redirectTargetIsOpenGraphURL($value=null) {
		if (isset($value)) { $this->redirect_is_open_graph = (bool) $value; }
		return $this->redirect_is_open_graph;
	}

	# Returns true if the redirect target was marked as a Canonical URL (rel=canonical)
	public function redirectTargetIsCanonicalURL($value=null) {
		if (isset($value)) { $this->redirect_is_canonical = (bool) $value; }
		return $this->redirect_is_canonical;
	}

	# Returns a short summary such as "(200)" or "(404; ERROR, og:url)":
	# the status code followed by whichever markers apply.
	public function debugStatus() {
		$attr = array();
		if ($this->failed || $this->error) { $attr[] = 'ERROR'; }
		if ($this->is_open_graph) { $attr[] = 'og:url'; }
		if ($this->is_canonical) { $attr[] = 'rel=canonical'; }

		$status = '(' . $this->status;
		if (count($attr)) { $status .= '; ' . join(', ', $attr); }

		return $status . ')';
	}
}
698 | 
699 | ?>
700 | 


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"name": "mattwright/urlresolver",
 3 | 	"version": "2.0",
 4 | 	"type": "library",
 5 | 	"description": "PHP class that attempts to resolve URLs to a final, canonical link.",
 6 | 	"license": "MIT",
 7 | 	"keywords": ["url", "redirect", "resolve", "canonical", "link"],
 8 | 	"homepage": "https://github.com/mattwright/URLResolver.php",
 9 | 	"authors": [ { "name": "Matt Wright", "email": "mw@mattwright.com" } ],
10 | 	"require": {
11 | 		"php": ">=5.3",
12 | 		"ext-curl": "*",
13 | 		"ext-mbstring": "*"
14 | 	},
15 | 	"autoload": { "psr-4": { "mattwright\\": "." } }
16 | }
17 | 


--------------------------------------------------------------------------------
/lib/simple_html_dom.php:
--------------------------------------------------------------------------------
   1 | <?php
   2 | /**
   3 |  * Website: http://sourceforge.net/projects/simplehtmldom/
   4 |  * Additional projects: http://sourceforge.net/projects/debugobject/
   5 |  * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
   6 |  * Contributions by:
   7 |  *	 Yousuke Kumakura (Attribute filters)
   8 |  *	 Vadim Voituk (Negative indexes supports of "find" method)
   9 |  *	 Antcs (Constructor with automatically load contents either text or file/url)
  10 |  *
  11 |  * all affected sections have comments starting with "PaperG"
  12 |  *
  13 |  * Paperg - Added case insensitive testing of the value of the selector.
  14 |  *
  15 |  * Paperg - Added tag_start for the starting index of tags - NOTE: This works
  16 |  * but not accurately. This tag_start gets counted AFTER \r\n have been crushed
  17 |  * out, and after the remove_noice calls so it will not reflect the REAL
  18 |  * position of the tag in the source, it will almost always be smaller by some
  19 |  * amount. We use this to determine how far into the file the tag in question
  20 |  * is. This "percentage" will never be accurate as the $dom->size is the "real"
  21 |  * number of bytes the dom was created from. But for most purposes, it's a
  22 |  * really good estimation.
  23 |  *
  24 |  * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags
  25 |  * closed is great for malformed html, but it CAN lead to parsing errors.
  26 |  *
  27 |  * Allow the user to tell us how much they trust the html.
  28 |  *
  29 |  * Paperg add the text and plaintext to the selectors for the find syntax.
  30 |  * plaintext implies text in the innertext of a node.  text implies that the
  31 |  * tag is a text node. This allows for us to find tags based on the text they
  32 |  * contain.
  33 |  *
  34 |  * Create find_ancestor_tag to see if a tag is - at any level - inside of
  35 |  * another specific tag.
  36 |  *
  37 |  * Paperg: added parse_charset so that we know about the character set of
  38 |  * the source document. NOTE: If the user's system has a routine called
  39 |  * get_last_retrieve_url_contents_content_type available, we will assume it's
  40 |  * returning the content-type header from the last transfer or curl_exec, and
  41 |  * we will parse that and use it in preference to any other method of charset
  42 |  * detection.
  43 |  *
  44 |  * Found infinite loop in the case of broken html in restore_noise. Rewrote to
  45 |  * protect from that.
  46 |  *
  47 |  * PaperG (John Schlick) Added get_display_size for "IMG" tags.
  48 |  *
  49 |  * Licensed under The MIT License
  50 |  * Redistributions of files must retain the above copyright notice.
  51 |  *
  52 |  * @author S.C. Chen <me578022@gmail.com>
  53 |  * @author John Schlick
  54 |  * @author Rus Carroll
  55 |  * @version Rev. 1.8.1 (247)
  56 |  * @package PlaceLocalInclude
  57 |  * @subpackage simple_html_dom
  58 |  */
  59 | 
  60 | /**
  61 |  * All of the Defines for the classes below.
  62 |  * @author S.C. Chen <me578022@gmail.com>
  63 |  */
  64 | define('HDOM_TYPE_ELEMENT', 1);
  65 | define('HDOM_TYPE_COMMENT', 2);
  66 | define('HDOM_TYPE_TEXT', 3);
  67 | define('HDOM_TYPE_ENDTAG', 4);
  68 | define('HDOM_TYPE_ROOT', 5);
  69 | define('HDOM_TYPE_UNKNOWN', 6);
  70 | define('HDOM_QUOTE_DOUBLE', 0);
  71 | define('HDOM_QUOTE_SINGLE', 1);
  72 | define('HDOM_QUOTE_NO', 3);
  73 | define('HDOM_INFO_BEGIN', 0);
  74 | define('HDOM_INFO_END', 1);
  75 | define('HDOM_INFO_QUOTE', 2);
  76 | define('HDOM_INFO_SPACE', 3);
  77 | define('HDOM_INFO_TEXT', 4);
  78 | define('HDOM_INFO_INNER', 5);
  79 | define('HDOM_INFO_OUTER', 6);
  80 | define('HDOM_INFO_ENDSPACE', 7);
  81 | 
  82 | /** The default target charset */
  83 | defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  84 | 
  85 | /** The default <br> text used instead of <br> tags when returning text */
  86 | defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
  87 | 
  88 | /** The default <span> text used instead of <span> tags when returning text */
  89 | defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
  90 | 
  91 | /** The maximum file size the parser should load */
  92 | defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
  93 | 
  94 | /** Contents between curly braces "{" and "}" are interpreted as text */
  95 | define('HDOM_SMARTY_AS_TEXT', 1);
  96 | 
  97 | // helper functions
  98 | // -----------------------------------------------------------------------------
  99 | // get html dom from file
 100 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
 101 | function file_get_html(
 102 | 	$url,
 103 | 	$use_include_path = false,
 104 | 	$context = null,
 105 | 	$offset = 0,
 106 | 	$maxLen = -1,
 107 | 	$lowercase = true,
 108 | 	$forceTagsClosed = true,
 109 | 	$target_charset = DEFAULT_TARGET_CHARSET,
 110 | 	$stripRN = true,
 111 | 	$defaultBRText = DEFAULT_BR_TEXT,
 112 | 	$defaultSpanText = DEFAULT_SPAN_TEXT)
 113 | {
 114 | 	// Ensure maximum length is greater than zero
 115 | 	if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
 116 | 
 117 | 	// We DO force the tags to be terminated.
 118 | 	$dom = new simple_html_dom(
 119 | 		null,
 120 | 		$lowercase,
 121 | 		$forceTagsClosed,
 122 | 		$target_charset,
 123 | 		$stripRN,
 124 | 		$defaultBRText,
 125 | 		$defaultSpanText);
 126 | 
 127 | 	/**
 128 | 	 * For sourceforge users: uncomment the next line and comment the
 129 | 	 * retrieve_url_contents line 2 lines down if it is not already done.
 130 | 	 */
 131 | 	$contents = file_get_contents(
 132 | 		$url,
 133 | 		$use_include_path,
 134 | 		$context,
 135 | 		$offset,
 136 | 		$maxLen);
 137 | 
 138 | 	// Paperg - use our own mechanism for getting the contents as we want to
 139 | 	// control the timeout.
 140 | 	// $contents = retrieve_url_contents($url);
 141 | 	if (empty($contents) || strlen($contents) > $maxLen) { return false; }
 142 | 
 143 | 	// The second parameter can force the selectors to all be lowercase.
 144 | 	$dom->load($contents, $lowercase, $stripRN);
 145 | 	return $dom;
 146 | }
 147 | 
 148 | // get html dom from string
 149 | function str_get_html(
 150 | 	$str,
 151 | 	$lowercase = true,
 152 | 	$forceTagsClosed = true,
 153 | 	$target_charset = DEFAULT_TARGET_CHARSET,
 154 | 	$stripRN = true,
 155 | 	$defaultBRText = DEFAULT_BR_TEXT,
 156 | 	$defaultSpanText = DEFAULT_SPAN_TEXT)
 157 | {
 158 | 	$dom = new simple_html_dom(
 159 | 		null,
 160 | 		$lowercase,
 161 | 		$forceTagsClosed,
 162 | 		$target_charset,
 163 | 		$stripRN,
 164 | 		$defaultBRText,
 165 | 		$defaultSpanText);
 166 | 
 167 | 	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
 168 | 		$dom->clear();
 169 | 		return false;
 170 | 	}
 171 | 
 172 | 	$dom->load($str, $lowercase, $stripRN);
 173 | 	return $dom;
 174 | }
 175 | 
 176 | // dump html dom tree
 177 | function dump_html_tree($node, $show_attr = true, $deep = 0)
 178 | {
 179 | 	$node->dump($node);
 180 | }
 181 | 
 182 | /**
 183 |  * simple html dom node
 184 |  * PaperG - added ability for "find" routine to lowercase the value of the
 185 |  * selector.
 186 |  *
 187 |  * PaperG - added $tag_start to track the start position of the tag in the total
 188 |  * byte index
 189 |  *
 190 |  * @package PlaceLocalInclude
 191 |  */
 192 | class simple_html_dom_node
 193 | {
 194 | 	/**
 195 | 	 * Node type
 196 | 	 *
 197 | 	 * Default is {@see HDOM_TYPE_TEXT}
 198 | 	 *
 199 | 	 * @var int
 200 | 	 */
 201 | 	public $nodetype = HDOM_TYPE_TEXT;
 202 | 
 203 | 	/**
 204 | 	 * Tag name
 205 | 	 *
 206 | 	 * Default is 'text'
 207 | 	 *
 208 | 	 * @var string
 209 | 	 */
 210 | 	public $tag = 'text';
 211 | 
 212 | 	/**
 213 | 	 * List of attributes
 214 | 	 *
 215 | 	 * @var array
 216 | 	 */
 217 | 	public $attr = array();
 218 | 
 219 | 	/**
 220 | 	 * List of child node objects
 221 | 	 *
 222 | 	 * @var array
 223 | 	 */
 224 | 	public $children = array();
 225 | 	public $nodes = array();
 226 | 
 227 | 	/**
 228 | 	 * The parent node object
 229 | 	 *
 230 | 	 * @var object|null
 231 | 	 */
 232 | 	public $parent = null;
 233 | 
 234 | 	// The "info" array - see HDOM_INFO_... for what each element contains.
 235 | 	public $_ = array();
 236 | 
 237 | 	/**
 238 | 	 * Start position of the tag in the document
 239 | 	 *
 240 | 	 * @var int
 241 | 	 */
 242 | 	public $tag_start = 0;
 243 | 
 244 | 	/**
 245 | 	 * The DOM object
 246 | 	 *
 247 | 	 * @var object|null
 248 | 	 */
 249 | 	private $dom = null;
 250 | 
	/**
	 * Create a node that belongs to the given DOM
	 *
	 * The new node remembers the DOM and appends itself to the DOM's list of
	 * nodes {@see simple_html_dom::$nodes}.
	 *
	 * @param object $dom The DOM object this node is part of
	 */
	function __construct($dom)
	{
		// Keep a back reference, then register with the owning DOM
		$this->dom = $dom;
		$dom->nodes[] = $this;
	}
 261 | 
	/**
	 * Drop all references held by this node when it is destroyed
	 */
	function __destruct()
	{
		$this->clear();
	}
 266 | 
	/**
	 * String conversion yields the node's outer text
	 *
	 * @return string
	 */
	function __toString()
	{
		return $this->outertext();
	}
 271 | 
	/**
	 * Clean up memory due to php5 circular references memory leak
	 *
	 * Sets the references to the DOM, the parent, the child elements and the
	 * child nodes to null.
	 *
	 * @return void
	 */
	function clear()
	{
		// Links upwards
		$this->dom = null;
		$this->parent = null;

		// Links downwards
		$this->nodes = null;
		$this->children = null;
	}
 280 | 
	/**
	 * Dump node's tree
	 *
	 * Echoes one line per node: the tag name, indented by nesting depth and
	 * optionally followed by the node's attributes, then recurses into
	 * the entries of $nodes.
	 *
	 * @param bool $show_attr Print the attribute list after the tag name
	 * @param int $deep Current nesting depth (number of indent repetitions)
	 * @return void
	 */
	function dump($show_attr = true, $deep = 0)
	{
		$line = str_repeat('	', $deep) . $this->tag;

		if ($show_attr && count($this->attr) > 0) {
			$line .= '(';
			foreach ($this->attr as $name => $unused) {
				// Read the value through property access, not from the raw array
				$line .= "[$name]=>\"" . $this->$name . '", ';
			}
			$line .= ')';
		}

		echo $line . "\n";

		if (!$this->nodes) {
			return;
		}

		$child_depth = $deep + 1;
		foreach ($this->nodes as $child) {
			$child->dump($show_attr, $child_depth);
		}
	}
 304 | 
 305 | 
	/**
	 * Debugging function to dump a single dom node with a bunch of
	 * information about it.
	 *
	 * The description contains the tag name, the attributes, the contents of
	 * the info array ($_), the text (if set), the inner info, the number of
	 * children and nodes and the tag start position.
	 *
	 * @param bool $echo Echo the description (default) instead of returning it
	 * @return string|void The description if $echo is false, nothing otherwise
	 */
	function dump_node($echo = true)
	{
		$string = $this->tag;

		if (count($this->attr) > 0) {
			$string .= '(';
			foreach ($this->attr as $k => $v) {
				$string .= "[$k]=>\"" . $this->$k . '", ';
			}
			$string .= ')';
		}

		if (count($this->_) > 0) {
			$string .= ' $_ (';
			foreach ($this->_ as $k => $v) {
				if (is_array($v)) {
					$string .= "[$k]=>(";
					foreach ($v as $k2 => $v2) {
						$string .= "[$k2]=>\"" . $v2 . '", ';
					}
					$string .= ')';
				} else {
					$string .= "[$k]=>\"" . $v . '", ';
				}
			}
			$string .= ')';
		}

		if (isset($this->text)) {
			$string .= ' text: (' . $this->text . ')';
		}

		$string .= ' HDOM_INNER_INFO: ';

		// Inspect this node's own info array. The previous code read an
		// undefined $node variable, so this branch could never be taken.
		if (isset($this->_[HDOM_INFO_INNER])) {
			$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
		} else {
			$string .= 'NULL';
		}

		$string .= ' children: ' . count($this->children);
		$string .= ' nodes: ' . count($this->nodes);
		$string .= ' tag_start: ' . $this->tag_start;
		$string .= "\n";

		if ($echo) {
			echo $string;
			return;
		}

		return $string;
	}
 359 | 
	/**
	 * Return or set parent node
	 *
	 * When a parent is given, this node is appended to that parent's $nodes
	 * and $children lists.
	 *
	 * Known limitation: the node is NOT removed from the $nodes / $children
	 * lists of its previous parent first.
	 *
	 * @param object|null $parent (optional) The new parent node, `null` to
	 * just return the current parent node.
	 * @return object|null The parent node
	 */
	function parent($parent = null)
	{
		// Getter mode
		if ($parent === null) {
			return $this->parent;
		}

		// Setter mode: attach to the new parent
		$this->parent = $parent;
		$parent->nodes[] = $this;
		$parent->children[] = $this;

		return $this->parent;
	}
 380 | 
	/**
	 * Check whether the list of child elements is non-empty
	 *
	 * @return bool True if the node has at least one child node
	 */
	function has_child()
	{
		return empty($this->children) ? false : true;
	}
 388 | 
	/**
	 * Get child node at specified index
	 *
	 * @param int $idx The index of the child node to return, `-1` (the
	 * default) to return all child nodes.
	 * @return object|array|null The child node at the specified index, all
	 * child nodes or null if the index is invalid.
	 */
	function children($idx = -1)
	{
		// A concrete index was requested
		if ($idx !== -1) {
			return isset($this->children[$idx]) ? $this->children[$idx] : null;
		}

		// -1 means "the whole list"
		return $this->children;
	}
 409 | 
	/**
	 * Get first child node
	 *
	 * @return object|null The first child node or null if the current node
	 * has no child nodes.
	 *
	 * @todo Use `empty()` instead of `count()` to improve performance on large
	 * arrays.
	 */
	function first_child()
	{
		$has_children = count($this->children) > 0;

		return $has_children ? $this->children[0] : null;
	}
 426 | 
	/**
	 * Get last child node
	 *
	 * @return object|null The last child node or null if the current node has
	 * no child nodes.
	 *
	 * @todo Use `end()` to slightly improve performance on large arrays.
	 */
	function last_child()
	{
		$total = count($this->children);

		if ($total <= 0) {
			return null;
		}

		return $this->children[$total - 1];
	}
 442 | 
	/**
	 * Get next sibling node
	 *
	 * Looks this node up in the parent's list of child elements and returns
	 * the entry that follows it.
	 *
	 * @return object|null The sibling node or null if the current node has no
	 * parent, is the last child or is not listed among the parent's children.
	 */
	function next_sibling()
	{
		if ($this->parent === null) {
			return null;
		}

		$siblings = $this->parent->children;
		$total = count($siblings);

		// Locate this node; $pos ends up equal to $total if it is not listed
		for ($pos = 0; $pos < $total; ++$pos) {
			if ($siblings[$pos] === $this) {
				break;
			}
		}

		$next = $pos + 1;

		return ($next < $total) ? $siblings[$next] : null;
	}
 468 | 
	/**
	 * Get previous sibling node
	 *
	 * Looks this node up in the parent's list of child elements and returns
	 * the entry that precedes it.
	 *
	 * @return object|null The sibling node or null if the current node has no
	 * parent, is the first child or is not listed among the parent's children.
	 */
	function prev_sibling()
	{
		if ($this->parent === null) { return null; }

		$siblings = $this->parent->children;

		// The parent may already have been cleared
		if (!is_array($siblings)) { return null; }

		$idx = array_search($this, $siblings, true);

		// A node that is not listed has no previous sibling. The previous
		// implementation returned the parent's LAST child in that case.
		if ($idx === false || $idx < 1) { return null; }

		return isset($siblings[$idx - 1]) ? $siblings[$idx - 1] : null;
	}
 490 | 
	/**
	 * Traverse ancestors to the first matching tag.
	 *
	 * The search starts with the current node itself and then follows the
	 * chain of parents. Tags are compared with `==`.
	 *
	 * @param string $tag Tag to find
	 * @return object|null First matching node in the DOM tree or null if no
	 * match was found.
	 *
	 * @todo The walk ends when ->parent yields null (as it does on the root
	 * node). This behaviour could change at any time, rendering this function
	 * invalid.
	 */
	function find_ancestor_tag($tag)
	{
		global $debug_object;
		$debugging = is_object($debug_object);

		if ($debugging) { $debug_object->debug_log_entry(1); }

		// Start by including ourselves in the comparison.
		for ($candidate = $this; $candidate !== null; $candidate = $candidate->parent) {
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
			}

			if ($candidate->tag == $tag) {
				return $candidate;
			}
		}

		// Ran past the top of the tree without a match
		return null;
	}
 523 | 
	/**
	 * Get node's inner text (everything inside the opening and closing tags)
	 *
	 * Sources are tried in this order: the stored inner info, the stored text
	 * info (passed through the DOM's restore_noise()), and finally the
	 * concatenated outer text of all entries in $nodes.
	 *
	 * @return string
	 */
	function innertext()
	{
		$info = $this->_;

		if (isset($info[HDOM_INFO_INNER])) {
			return $info[HDOM_INFO_INNER];
		}

		if (isset($info[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
		}

		// Nothing stored: assemble the text from the child nodes
		$assembled = '';

		foreach ($this->nodes as $child) {
			$assembled .= $child->outertext();
		}

		return $assembled;
	}
 547 | 
	/**
	 * Get node's outer text (everything including the opening and closing tags)
	 *
	 * The root node returns its inner text. For every other node the DOM's
	 * callback (if any) is invoked with this node first; afterwards a stored
	 * outer info or text info wins, otherwise the text is assembled from the
	 * begin tag, the inner content and the end tag.
	 *
	 * @return string
	 */
	function outertext()
	{
		global $debug_object;

		if (is_object($debug_object)) {
			$suffix = ($this->tag === 'text' && !empty($this->text))
				? ' with text: ' . $this->text
				: '';

			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
		}

		if ($this->tag === 'root') {
			return $this->innertext();
		}

		// trigger callback (it may modify this node, so nothing is cached
		// before this point)
		if ($this->dom && $this->dom->callback !== null) {
			call_user_func_array($this->dom->callback, array($this));
		}

		if (isset($this->_[HDOM_INFO_OUTER])) {
			return $this->_[HDOM_INFO_OUTER];
		}

		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		// render begin tag
		$rendered = '';
		if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
			$rendered = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
		}

		// render inner text
		if (!isset($this->_[HDOM_INFO_INNER])) {
			if ($this->nodes) {
				foreach ($this->nodes as $child) {
					$rendered .= $this->convert_text($child->outertext());
				}
			}
		} elseif ($this->tag !== 'br') {
			// If it's a br tag...  don't return the HDOM_INNER_INFO that we
			// may or may not have added.
			$rendered .= $this->_[HDOM_INFO_INNER];
		}

		// render end tag
		$has_end_tag = isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0;

		return $has_end_tag
			? $rendered . '</' . $this->tag . '>'
			: $rendered;
	}
 613 | 
	/**
	 * Get node's plain text (everything excluding all tags)
	 *
	 * A stored inner info is returned as is. Text nodes return their stored
	 * text (passed through the DOM's restore_noise()); comment and unknown
	 * nodes as well as script and style elements yield an empty string. For
	 * all other nodes the text of the child nodes is concatenated and trimmed.
	 *
	 * @return string
	 */
	function text()
	{
		if (isset($this->_[HDOM_INFO_INNER])) {
			return $this->_[HDOM_INFO_INNER];
		}

		$type = $this->nodetype;

		if ($type == HDOM_TYPE_TEXT) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
			return '';
		}

		$is_script = strcasecmp($this->tag, 'script') === 0;
		if ($is_script || strcasecmp($this->tag, 'style') === 0) {
			return '';
		}

		$collected = '';

		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
		// for some span tags, and some p tags) $this->nodes is set to NULL.
		// NOTE: This indicates that there is a problem where it's set to NULL
		// without a clear happening.
		if (is_null($this->nodes)) {
			return trim($collected);
		}

		foreach ($this->nodes as $child) {
			// Start paragraph after a blank line
			if ($child->tag === 'p') {
				$collected .= "\n\n";
			}

			$collected .= $this->convert_text($child->text());

			// If this node is a span... add a space at the end of it so
			// multiple spans don't run into each other.  This is plaintext
			// after all.
			if ($child->tag === 'span') {
				$collected .= $this->dom->default_span_text;
			}
		}

		return trim($collected);
	}
 660 | 
	/**
	 * Get node's xml text (inner text without its CDATA markers)
	 *
	 * Removes every `<![CDATA[` (matched case-insensitively) and every `]]>`
	 * from the inner text.
	 *
	 * @return string
	 */
	function xmltext()
	{
		$without_open = str_ireplace('<![CDATA[', '', $this->innertext());

		return str_replace(']]>', '', $without_open);
	}
 673 | 
	/**
	 * Build node's text with tag
	 *
	 * Nodes with stored text info (text, comment, unknown) return that text,
	 * passed through the DOM's restore_noise(). Otherwise the opening tag is
	 * rebuilt from the tag name and the attributes, using the whitespace and
	 * quote information recorded in the info array.
	 *
	 * @return string
	 */
	function makeup()
	{
		// text, comment, unknown
		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$markup = '<' . $this->tag;

		// Position of the attribute within the recorded space/quote info;
		// it advances for removed attributes as well.
		$position = -1;

		foreach ($this->attr as $name => $value) {
			$position++;

			// skip removed attribute
			if ($value === null || $value === false) {
				continue;
			}

			$markup .= $this->_[HDOM_INFO_SPACE][$position][0];

			//no value attr: nowrap, checked selected...
			if ($value === true) {
				$markup .= $name;
				continue;
			}

			$quote_type = $this->_[HDOM_INFO_QUOTE][$position];

			if ($quote_type == HDOM_QUOTE_DOUBLE) {
				$quote = '"';
			} elseif ($quote_type == HDOM_QUOTE_SINGLE) {
				$quote = '\'';
			} else {
				$quote = '';
			}

			$markup .= $name;
			$markup .= $this->_[HDOM_INFO_SPACE][$position][1];
			$markup .= '=';
			$markup .= $this->_[HDOM_INFO_SPACE][$position][2];
			$markup .= $quote . $value . $quote;
		}

		$markup = $this->dom->restore_noise($markup);

		return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
	}
 717 | 
	/**
	 * Find elements by CSS selector
	 *
	 * The selector is parsed into one or more selector groups. Each group is
	 * resolved level by level with seek(), starting at this node; the matches
	 * of all groups are merged and sorted by their node index.
	 *
	 * @param string $selector The CSS selector
	 * @param int|null $idx Index of the element to return from the list of
	 * matching elements; negative values count from the end of the list
	 * (default: `null` = return the whole list).
	 * @param bool $lowercase Matches tag names case insensitive (lowercase) if
	 * enabled (default: `false`)
	 * @return array|object|null A list of elements matching the specified CSS
	 * selector or a single element if $idx is specified or null if no element
	 * was found at that index.
	 */
	function find($selector, $idx = null, $lowercase = false)
	{
		$groups = $this->parse_selector($selector);
		$group_count = count($groups);

		if ($group_count === 0) {
			return array();
		}

		$matched_keys = array();

		// find each selector
		for ($g = 0; $g < $group_count; ++$g) {
			// The group (not always the first one) is checked here, see the
			// sourceforge code tracker id 2788009
			$depth = count($groups[$g]);

			if ($depth === 0 || !isset($this->_[HDOM_INFO_BEGIN])) {
				return array();
			}

			$frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
			$combinator = ' ';

			// handle descendant selectors, no recursive!
			for ($level = 0; $level < $depth; ++$level) {
				$next = array();

				foreach ($frontier as $key => $unused) {
					if ($key === -1) {
						$origin = $this->dom->root;
					} else {
						$origin = $this->dom->nodes[$key];
					}

					//PaperG - Pass this optional parameter on to the seek function.
					$origin->seek($groups[$g][$level], $next, $combinator, $lowercase);
				}

				$frontier = $next;
				$combinator = $groups[$g][$level][4]; // Next Combinator
			}

			// Remember every match once, keyed by node index
			foreach ($frontier as $key => $unused) {
				$matched_keys[$key] = 1;
			}
		}

		// sort keys
		ksort($matched_keys);

		$found = array();
		foreach ($matched_keys as $key => $unused) {
			$found[] = $this->dom->nodes[$key];
		}

		// return nth-element or array
		if (is_null($idx)) {
			return $found;
		}

		if ($idx < 0) {
			$idx = count($found) + $idx;
		}

		return isset($found[$idx]) ? $found[$idx] : null;
	}
 781 | 
	/**
	 * Seek DOM elements by selector
	 *
	 * Builds the list of candidate nodes that $parent_cmd selects relative to
	 * the current node, tests every candidate against $selector and records
	 * the matches in $ret.
	 *
	 * **Note**
	 * The selector element must be compatible to a selector from
	 * {@see simple_html_dom_node::parse_selector()}
	 *
	 * @param array $selector A selector element (tag, id, classes, attributes,
	 * combinator). The combinator of the element itself is not used here.
	 * @param array $ret An array of matches, passed by reference. For every
	 * matching node the key `$node->_[HDOM_INFO_BEGIN]` is set to `1`.
	 * @param string $parent_cmd Combinator that relates the current node to
	 * the candidates: `' '` (descendants), `'>'` (children), `'+'` (the next
	 * sibling) or `'~'` (all subsequent siblings). Any other value yields no
	 * candidates.
	 * @param bool $lowercase When enabled, the classes of a node are converted
	 * to lowercase before they are compared and attribute values are compared
	 * in lowercase (default: `false`)
	 * @return void
	 */
	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		list($tag, $id, $class, $attributes) = $selector;
		$nodes = array();

		if ($parent_cmd === ' ') { // Descendant Combinator
			// Find parent closing tag if the current element doesn't have a closing
			// tag (i.e. void element)
			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
			if ($end == 0) {
				$parent = $this->parent;
				// Check for null first so the walk stops cleanly at the root
				while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
					$end -= 1;
					$parent = $parent->parent;
				}
				// No ancestor with an end index: nothing to add
				if ($parent !== null) {
					$end += $parent->_[HDOM_INFO_END];
				}
			}

			// Get list of target nodes
			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
			$nodes_count = $end - $nodes_start;
			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
		} elseif ($parent_cmd === '>') { // Child Combinator
			$nodes = $this->children;
		} elseif ($parent_cmd === '+' && $this->parent) { // Next-Sibling Combinator
			// Strict search: compare object identity, not object contents
			$index = array_search($this, $this->parent->children, true);

			// The last child has no next sibling
			if ($index !== false && isset($this->parent->children[$index + 1])) {
				$nodes[] = $this->parent->children[$index + 1];
			}
		} elseif ($parent_cmd === '~' && $this->parent) { // Subsequent Sibling Combinator
			$index = array_search($this, $this->parent->children, true);

			// Only siblings AFTER the current element are candidates
			if ($index !== false) {
				$nodes = array_slice($this->parent->children, $index + 1);
			}
		}

		// Go through each candidate and test it against the selector element
		foreach ($nodes as $node) {
			// Skip root nodes
			if (!$node->parent) { continue; }

			// Skip if node isn't a child node (i.e. text nodes)
			if (!in_array($node, $node->parent->children, true)) { continue; }

			// Skip if tag doesn't match
			if ($tag !== '' && $tag !== '*' && $tag !== $node->tag) { continue; }

			// Skip if ID doesn't exist or doesn't match
			if ($id !== '') {
				if (!isset($node->attr['id'])) { continue; }

				// Note: Only consider the first ID (as browsers do)
				$node_id = explode(' ', trim($node->attr['id']))[0];

				if ($id !== $node_id) { continue; }
			}

			// Check if all class(es) exist
			if ($class !== '' && is_array($class) && !empty($class)) {
				if (!isset($node->attr['class'])) { continue; }

				$node_classes = explode(' ', $node->attr['class']);

				if ($lowercase) {
					$node_classes = array_map('strtolower', $node_classes);
				}

				foreach ($class as $c) {
					// Strict: class names are strings, never compare numerically
					if (!in_array($c, $node_classes, true)) { continue 2; }
				}
			}

			$pass = true;

			// Check attributes
			if ($attributes !== ''
				&& is_array($attributes)
				&& !empty($attributes)) {
					foreach ($attributes as $a) {
						list (
							$att_name,
							$att_expr,
							$att_val,
							$att_inv,
							$att_case_sensitivity
						) = $a;

						// Handle indexing attributes (i.e. "[2]")
						/**
						 * Note: This is not supported by the CSS Standard but adds
						 * the ability to select items compatible to XPath (i.e.
						 * the 3rd element within it's parent).
						 *
						 * Note: This doesn't conflict with the CSS Standard which
						 * doesn't work on numeric attributes anyway.
						 */
						if (is_numeric($att_name)
							&& $att_expr === ''
							&& $att_val === '') {
								$count = 0;

								// Find index of current element in parent
								foreach ($node->parent->children as $sibling) {
									if ($sibling->tag === $node->tag) ++$count;
									if ($sibling === $node) break;
								}

								// If this is the correct node, continue with next
								// attribute. Otherwise fall through: the checks
								// below treat the number as an attribute name.
								if ($count === (int)$att_name) continue;
						}

						// Check attribute availability
						if ($att_inv) { // Attribute should NOT be set
							if (isset($node->attr[$att_name])) {
								$pass = false;
								break;
							}
						} else { // Attribute should be set
							// todo: "plaintext" is not a valid CSS selector!
							if ($att_name !== 'plaintext'
								&& !isset($node->attr[$att_name])) {
									$pass = false;
									break;
							}
						}

						// Continue with next attribute if expression isn't defined
						if ($att_expr === '') continue;

						// If they have told us that this is a "plaintext"
						// search then we want the plaintext of the node - right?
						// todo "plaintext" is not a valid CSS selector!
						if ($att_name === 'plaintext') {
							$nodeKeyValue = $node->text();
						} else {
							$nodeKeyValue = $node->attr[$att_name];
						}

						if (is_object($debug_object)) {
							$debug_object->debug_log(2,
								'testing node: '
								. $node->tag
								. ' for attribute: '
								. $att_name
								. $att_expr
								. $att_val
								. ' where nodes value is: '
								. $nodeKeyValue
							);
						}

						// If lowercase is set, do a case insensitive test of
						// the value of the selector.
						if ($lowercase) {
							$check = $this->match(
								$att_expr,
								strtolower($att_val),
								strtolower($nodeKeyValue),
								$att_case_sensitivity
							);
						} else {
							$check = $this->match(
								$att_expr,
								$att_val,
								$nodeKeyValue,
								$att_case_sensitivity
							);
						}

						if (is_object($debug_object)) {
							$debug_object->debug_log(2,
								'after match: '
								. ($check ? 'true' : 'false')
							);
						}

						if (!$check) {
							$pass = false;
							break;
						}
					}
			}

			// Found a match. Add to list
			if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
		}

		// It's passed by reference so this is actually what this function returns.
		if (is_object($debug_object)) {
			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
		}
	}
1008 | 
1009 | 	/**
1010 | 	 * Match value and pattern for a given CSS expression
1011 | 	 *
1012 | 	 * **Supported Expressions**
1013 | 	 *
1014 | 	 * | Expression | Description
1015 | 	 * | ---------- | -----------
1016 | 	 * | `=`        | $value and $pattern must be equal
1017 | 	 * | `!=`       | $value and $pattern must not be equal
1018 | 	 * | `^=`       | $value must start with $pattern
1019 | 	 * | `$=`       | $value must end with $pattern
1020 | 	 * | `*=`       | $value must contain $pattern
1021 | 	 *
1022 | 	 * @param string $exp The expression.
1023 | 	 * @param string $pattern The pattern
1024 | 	 * @param string $value The value
1025 | 	 * @value bool True if $value matches $pattern
1026 | 	 */
1027 | 	protected function match($exp, $pattern, $value, $case_sensitivity)
1028 | 	{
1029 | 		global $debug_object;
1030 | 		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
1031 | 
1032 | 		if ($case_sensitivity === 'i') {
1033 | 			$pattern = strtolower($pattern);
1034 | 			$value = strtolower($value);
1035 | 		}
1036 | 
1037 | 		switch ($exp) {
1038 | 			case '=':
1039 | 				return ($value === $pattern);
1040 | 			case '!=':
1041 | 				return ($value !== $pattern);
1042 | 			case '^=':
1043 | 				return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
1044 | 			case '$=':
1045 | 				return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
1046 | 			case '*=':
1047 | 				return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
1048 | 			case '|=':
1049 | 				/**
1050 | 				 * [att|=val]
1051 | 				 *
1052 | 				 * Represents an element with the att attribute, its value
1053 | 				 * either being exactly "val" or beginning with "val"
1054 | 				 * immediately followed by "-" (U+002D).
1055 | 				 */
1056 | 				return strpos($value, $pattern) === 0;
1057 | 			case '~=':
1058 | 				/**
1059 | 				 * [att~=val]
1060 | 				 *
1061 | 				 * Represents an element with the att attribute whose value is a
1062 | 				 * whitespace-separated list of words, one of which is exactly
1063 | 				 * "val". If "val" contains whitespace, it will never represent
1064 | 				 * anything (since the words are separated by spaces). Also if
1065 | 				 * "val" is the empty string, it will never represent anything.
1066 | 				 */
1067 | 				return in_array($pattern, explode(' ', trim($value)), true);
1068 | 		}
1069 | 		return false;
1070 | 	}
1071 | 
1072 | 	/**
1073 | 	 * Parse CSS selector
1074 | 	 *
1075 | 	 * @param string $selector_string CSS selector string
1076 | 	 * @return array List of CSS selectors. The format depends on the type of
1077 | 	 * selector:
1078 | 	 *
1079 | 	 * ```php
1080 | 	 *
1081 | 	 * array( // list of selectors (each separated by a comma), i.e. 'img, p, div'
1082 | 	 *   array( // list of combinator selectors, i.e. 'img > p > div'
1083 | 	 *     array( // selector element
1084 | 	 *       [0], // (string) The element tag
1085 | 	 *       [1], // (string) The element id
1086 | 	 *       [2], // (array<string>) The element classes
1087 | 	 *       [3], // (array<array<string>>) The list of attributes, each
1088 | 	 *            // with four elements: name, expression, value, inverted
1089 | 	 *       [4]  // (string) The selector combinator (' ' | '>' | '+' | '~')
1090 | 	 *     )
1091 | 	 *   )
1092 | 	 * )
1093 | 	 * ```
1094 | 	 *
1095 | 	 * @link https://www.w3.org/TR/selectors/#compound Compound selector
1096 | 	 */
1097 | 	protected function parse_selector($selector_string)
1098 | 	{
1099 | 		global $debug_object;
1100 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1101 | 
1102 | 		/**
1103 | 		 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
1104 | 		 *
1105 | 		 * Paperg: Add the colon to the attribute, so that it properly finds
1106 | 		 * <tag attr:ibute="something" > like google does.
1107 | 		 *
1108 | 		 * Note: if you try to look at this attribute, you MUST use getAttribute
1109 | 		 * since $dom->x:y will fail the php syntax check.
1110 | 		 *
1111 | 		 * Notice the \[ starting the attribute? and the @? following? This
1112 | 		 * implies that an attribute can begin with an @ sign that is not
1113 | 		 * captured. This implies that an html attribute specifier may start
1114 | 		 * with an @ sign that is NOT captured by the expression. Farther study
1115 | 		 * is required to determine of this should be documented or removed.
1116 | 		 *
1117 | 		 * Matches selectors in this order:
1118 | 		 *
1119 | 		 * [0] - full match
1120 | 		 *
1121 | 		 * [1] - tag name
1122 | 		 *     ([\w:\*-]*)
1123 | 		 *     Matches the tag name consisting of zero or more words, colons,
1124 | 		 *     asterisks and hyphens.
1125 | 		 *
1126 | 		 * [2] - id name
1127 | 		 *     (?:\#([\w-]+))
1128 | 		 *     Optionally matches a id name, consisting of an "#" followed by
1129 | 		 *     the id name (one or more words and hyphens).
1130 | 		 *
1131 | 		 * [3] - class names (including dots)
1132 | 		 *     (?:\.([\w\.-]+))?
1133 | 		 *     Optionally matches a list of classs, consisting of an "."
1134 | 		 *     followed by the class name (one or more words and hyphens)
1135 | 		 *     where multiple classes can be chained (i.e. ".foo.bar.baz")
1136 | 		 *
1137 | 		 * [4] - attributes
1138 | 		 *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
1139 | 		 *     Optionally matches the attributes list
1140 | 		 *
1141 | 		 * [5] - separator
1142 | 		 *     ([\/, >+~]+)
1143 | 		 *     Matches the selector list separator
1144 | 		 */
1145 | 		// phpcs:ignore Generic.Files.LineLength
1146 | 		$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";
1147 | 
1148 | 		preg_match_all(
1149 | 			$pattern,
1150 | 			trim($selector_string) . ' ', // Add final ' ' as pseudo separator
1151 | 			$matches,
1152 | 			PREG_SET_ORDER
1153 | 		);
1154 | 
1155 | 		if (is_object($debug_object)) {
1156 | 			$debug_object->debug_log(2, 'Matches Array: ', $matches);
1157 | 		}
1158 | 
1159 | 		$selectors = array();
1160 | 		$result = array();
1161 | 
1162 | 		foreach ($matches as $m) {
1163 | 			$m[0] = trim($m[0]);
1164 | 
1165 | 			// Skip NoOps
1166 | 			if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }
1167 | 
1168 | 			// Convert to lowercase
1169 | 			if ($this->dom->lowercase) {
1170 | 				$m[1] = strtolower($m[1]);
1171 | 			}
1172 | 
1173 | 			// Extract classes
1174 | 			if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }
1175 | 
1176 | 			/* Extract attributes (pattern based on the pattern above!)
1177 | 
1178 | 			 * [0] - full match
1179 | 			 * [1] - attribute name
1180 | 			 * [2] - attribute expression
1181 | 			 * [3] - attribute value
1182 | 			 * [4] - case sensitivity
1183 | 			 *
1184 | 			 * Note: Attributes can be negated with a "!" prefix to their name
1185 | 			 */
1186 | 			if($m[4] !== '') {
1187 | 				preg_match_all(
1188 | 					"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s*?([iIsS])?)?\]/is",
1189 | 					trim($m[4]),
1190 | 					$attributes,
1191 | 					PREG_SET_ORDER
1192 | 				);
1193 | 
1194 | 				// Replace element by array
1195 | 				$m[4] = array();
1196 | 
1197 | 				foreach($attributes as $att) {
1198 | 					// Skip empty matches
1199 | 					if(trim($att[0]) === '') { continue; }
1200 | 
1201 | 					$inverted = (isset($att[1][0]) && $att[1][0] === '!');
1202 | 					$m[4][] = array(
1203 | 						$inverted ? substr($att[1], 1) : $att[1], // Name
1204 | 						(isset($att[2])) ? $att[2] : '', // Expression
1205 | 						(isset($att[3])) ? $att[3] : '', // Value
1206 | 						$inverted, // Inverted Flag
1207 | 						(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
1208 | 					);
1209 | 				}
1210 | 			}
1211 | 
1212 | 			// Sanitize Separator
1213 | 			if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
1214 | 				$m[5] = ' ';
1215 | 			} else { // Other Separator
1216 | 				$m[5] = trim($m[5]);
1217 | 			}
1218 | 
1219 | 			// Clear Separator if it's a Selector List
1220 | 			if ($is_list = ($m[5] === ',')) { $m[5] = ''; }
1221 | 
1222 | 			// Remove full match before adding to results
1223 | 			array_shift($m);
1224 | 			$result[] = $m;
1225 | 
1226 | 			if ($is_list) { // Selector List
1227 | 				$selectors[] = $result;
1228 | 				$result = array();
1229 | 			}
1230 | 		}
1231 | 
1232 | 		if (count($result) > 0) { $selectors[] = $result; }
1233 | 		return $selectors;
1234 | 	}
1235 | 
1236 | 	function __get($name)
1237 | 	{
1238 | 		if (isset($this->attr[$name])) {
1239 | 			return $this->convert_text($this->attr[$name]);
1240 | 		}
1241 | 		switch ($name) {
1242 | 			case 'outertext': return $this->outertext();
1243 | 			case 'innertext': return $this->innertext();
1244 | 			case 'plaintext': return $this->text();
1245 | 			case 'xmltext': return $this->xmltext();
1246 | 			default: return array_key_exists($name, $this->attr);
1247 | 		}
1248 | 	}
1249 | 
1250 | 	function __set($name, $value)
1251 | 	{
1252 | 		global $debug_object;
1253 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1254 | 
1255 | 		switch ($name) {
1256 | 			case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
1257 | 			case 'innertext':
1258 | 				if (isset($this->_[HDOM_INFO_TEXT])) {
1259 | 					return $this->_[HDOM_INFO_TEXT] = $value;
1260 | 				}
1261 | 				return $this->_[HDOM_INFO_INNER] = $value;
1262 | 		}
1263 | 
1264 | 		if (!isset($this->attr[$name])) {
1265 | 			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
1266 | 			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
1267 | 		}
1268 | 
1269 | 		$this->attr[$name] = $value;
1270 | 	}
1271 | 
1272 | 	function __isset($name)
1273 | 	{
1274 | 		switch ($name) {
1275 | 			case 'outertext': return true;
1276 | 			case 'innertext': return true;
1277 | 			case 'plaintext': return true;
1278 | 		}
1279 | 		//no value attr: nowrap, checked selected...
1280 | 		return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
1281 | 	}
1282 | 
1283 | 	function __unset($name)
1284 | 	{
1285 | 		if (isset($this->attr[$name])) { unset($this->attr[$name]); }
1286 | 	}
1287 | 
1288 | 	// PaperG - Function to convert the text from one character set to another
1289 | 	// if the two sets are not the same.
1290 | 	function convert_text($text)
1291 | 	{
1292 | 		global $debug_object;
1293 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1294 | 
1295 | 		$converted_text = $text;
1296 | 
1297 | 		$sourceCharset = '';
1298 | 		$targetCharset = '';
1299 | 
1300 | 		if ($this->dom) {
1301 | 			$sourceCharset = strtoupper($this->dom->_charset);
1302 | 			$targetCharset = strtoupper($this->dom->_target_charset);
1303 | 		}
1304 | 
1305 | 		if (is_object($debug_object)) {
1306 | 			$debug_object->debug_log(3,
1307 | 				'source charset: '
1308 | 				. $sourceCharset
1309 | 				. ' target charaset: '
1310 | 				. $targetCharset
1311 | 			);
1312 | 		}
1313 | 
1314 | 		if (!empty($sourceCharset)
1315 | 			&& !empty($targetCharset)
1316 | 			&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
1317 | 			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
1318 | 			if ((strcasecmp($targetCharset, 'UTF-8') == 0)
1319 | 				&& ($this->is_utf8($text))) {
1320 | 				$converted_text = $text;
1321 | 			} else {
1322 | 				$converted_text = iconv($sourceCharset, $targetCharset, $text);
1323 | 			}
1324 | 		}
1325 | 
1326 | 		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
1327 | 		if ($targetCharset === 'UTF-8') {
1328 | 			if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
1329 | 				$converted_text = substr($converted_text, 3);
1330 | 			}
1331 | 
1332 | 			if (substr($converted_text, -3) === "\xef\xbb\xbf") {
1333 | 				$converted_text = substr($converted_text, 0, -3);
1334 | 			}
1335 | 		}
1336 | 
1337 | 		return $converted_text;
1338 | 	}
1339 | 
1340 | 	/**
1341 | 	* Returns true if $string is valid UTF-8 and false otherwise.
1342 | 	*
1343 | 	* @param mixed $str String to be tested
1344 | 	* @return boolean
1345 | 	*/
1346 | 	static function is_utf8($str)
1347 | 	{
1348 | 		$c = 0; $b = 0;
1349 | 		$bits = 0;
1350 | 		$len = strlen($str);
1351 | 		for($i = 0; $i < $len; $i++) {
1352 | 			$c = ord($str[$i]);
1353 | 			if($c > 128) {
1354 | 				if(($c >= 254)) { return false; }
1355 | 				elseif($c >= 252) { $bits = 6; }
1356 | 				elseif($c >= 248) { $bits = 5; }
1357 | 				elseif($c >= 240) { $bits = 4; }
1358 | 				elseif($c >= 224) { $bits = 3; }
1359 | 				elseif($c >= 192) { $bits = 2; }
1360 | 				else { return false; }
1361 | 				if(($i + $bits) > $len) { return false; }
1362 | 				while($bits > 1) {
1363 | 					$i++;
1364 | 					$b = ord($str[$i]);
1365 | 					if($b < 128 || $b > 191) { return false; }
1366 | 					$bits--;
1367 | 				}
1368 | 			}
1369 | 		}
1370 | 		return true;
1371 | 	}
1372 | 
1373 | 	/**
1374 | 	 * Function to try a few tricks to determine the displayed size of an img on
1375 | 	 * the page. NOTE: This will ONLY work on an IMG tag. Returns FALSE on all
1376 | 	 * other tag types.
1377 | 	 *
1378 | 	 * @author John Schlick
1379 | 	 * @version April 19 2012
1380 | 	 * @return array an array containing the 'height' and 'width' of the image
1381 | 	 * on the page or -1 if we can't figure it out.
1382 | 	 */
1383 | 	function get_display_size()
1384 | 	{
1385 | 		global $debug_object;
1386 | 
1387 | 		$width = -1;
1388 | 		$height = -1;
1389 | 
1390 | 		if ($this->tag !== 'img') {
1391 | 			return false;
1392 | 		}
1393 | 
1394 | 		// See if there is aheight or width attribute in the tag itself.
1395 | 		if (isset($this->attr['width'])) {
1396 | 			$width = $this->attr['width'];
1397 | 		}
1398 | 
1399 | 		if (isset($this->attr['height'])) {
1400 | 			$height = $this->attr['height'];
1401 | 		}
1402 | 
1403 | 		// Now look for an inline style.
1404 | 		if (isset($this->attr['style'])) {
1405 | 			// Thanks to user gnarf from stackoverflow for this regular expression.
1406 | 			$attributes = array();
1407 | 
1408 | 			preg_match_all(
1409 | 				'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
1410 | 				$this->attr['style'],
1411 | 				$matches,
1412 | 				PREG_SET_ORDER
1413 | 			);
1414 | 
1415 | 			foreach ($matches as $match) {
1416 | 				$attributes[$match[1]] = $match[2];
1417 | 			}
1418 | 
1419 | 			// If there is a width in the style attributes:
1420 | 			if (isset($attributes['width']) && $width == -1) {
1421 | 				// check that the last two characters are px (pixels)
1422 | 				if (strtolower(substr($attributes['width'], -2)) === 'px') {
1423 | 					$proposed_width = substr($attributes['width'], 0, -2);
1424 | 					// Now make sure that it's an integer and not something stupid.
1425 | 					if (filter_var($proposed_width, FILTER_VALIDATE_INT)) {
1426 | 						$width = $proposed_width;
1427 | 					}
1428 | 				}
1429 | 			}
1430 | 
1431 | 			// If there is a width in the style attributes:
1432 | 			if (isset($attributes['height']) && $height == -1) {
1433 | 				// check that the last two characters are px (pixels)
1434 | 				if (strtolower(substr($attributes['height'], -2)) == 'px') {
1435 | 					$proposed_height = substr($attributes['height'], 0, -2);
1436 | 					// Now make sure that it's an integer and not something stupid.
1437 | 					if (filter_var($proposed_height, FILTER_VALIDATE_INT)) {
1438 | 						$height = $proposed_height;
1439 | 					}
1440 | 				}
1441 | 			}
1442 | 
1443 | 		}
1444 | 
1445 | 		// Future enhancement:
1446 | 		// Look in the tag to see if there is a class or id specified that has
1447 | 		// a height or width attribute to it.
1448 | 
1449 | 		// Far future enhancement
1450 | 		// Look at all the parent tags of this image to see if they specify a
1451 | 		// class or id that has an img selector that specifies a height or width
1452 | 		// Note that in this case, the class or id will have the img subselector
1453 | 		// for it to apply to the image.
1454 | 
1455 | 		// ridiculously far future development
1456 | 		// If the class or id is specified in a SEPARATE css file thats not on
1457 | 		// the page, go get it and do what we were just doing for the ones on
1458 | 		// the page.
1459 | 
1460 | 		$result = array(
1461 | 			'height' => $height,
1462 | 			'width' => $width
1463 | 		);
1464 | 
1465 | 		return $result;
1466 | 	}
1467 | 
1468 | 	// camel naming conventions
	/**
	 * Get the attribute list of the node
	 *
	 * @return array The raw attribute list ($this->attr); values are not
	 * charset converted
	 */
	function getAllAttributes()
	{
		return $this->attr;
	}
1473 | 
	/**
	 * Get an attribute by name (camelCase alias of the magic getter)
	 *
	 * @param string $name Attribute name
	 * @return mixed Whatever __get() returns for $name
	 */
	function getAttribute($name)
	{
		return $this->__get($name);
	}
1478 | 
	/**
	 * Set an attribute by name (camelCase alias of the magic setter)
	 *
	 * @param string $name Attribute name
	 * @param mixed $value The new value
	 * @return void
	 */
	function setAttribute($name, $value)
	{
		$this->__set($name, $value);
	}
1483 | 
	/**
	 * Check if an attribute exists (camelCase alias of the magic isset)
	 *
	 * @param string $name Attribute name
	 * @return bool Whatever __isset() returns for $name
	 */
	function hasAttribute($name)
	{
		return $this->__isset($name);
	}
1488 | 
	/**
	 * Remove an attribute by assigning null to it
	 *
	 * NOTE(review): The attribute is passed to __set() with null, it is not
	 * unset. The key stays in the attribute list, so hasAttribute() keeps
	 * returning true for it - confirm that this is intended.
	 *
	 * @param string $name Attribute name
	 * @return void
	 */
	function removeAttribute($name)
	{
		$this->__set($name, null);
	}
1493 | 
	/**
	 * Find an element by id
	 *
	 * Calls find() with the selector "#$id" and index 0.
	 *
	 * @param string $id The id, inserted into the selector as is
	 * @return mixed Result of find(); presumably the first match or null
	 */
	function getElementById($id)
	{
		return $this->find("#$id", 0);
	}
1498 | 
	/**
	 * Find elements by id
	 *
	 * Calls find() with the selector "#$id".
	 *
	 * @param string $id The id, inserted into the selector as is
	 * @param int|null $idx Index passed on to find() (default: null)
	 * @return mixed Result of find(); presumably all matches if $idx is null
	 */
	function getElementsById($id, $idx = null)
	{
		return $this->find("#$id", $idx);
	}
1503 | 
	/**
	 * Find an element by tag name
	 *
	 * Calls find() with $name as selector and index 0.
	 *
	 * @param string $name The tag name, used as selector as is
	 * @return mixed Result of find(); presumably the first match or null
	 */
	function getElementByTagName($name)
	{
		return $this->find($name, 0);
	}
1508 | 
	/**
	 * Find elements by tag name
	 *
	 * Calls find() with $name as selector.
	 *
	 * @param string $name The tag name, used as selector as is
	 * @param int|null $idx Index passed on to find() (default: null)
	 * @return mixed Result of find(); presumably all matches if $idx is null
	 */
	function getElementsByTagName($name, $idx = null)
	{
		return $this->find($name, $idx);
	}
1513 | 
	/**
	 * camelCase alias of parent(), called without arguments
	 *
	 * @return mixed Whatever parent() returns
	 */
	function parentNode()
	{
		return $this->parent();
	}
1518 | 
	/**
	 * camelCase alias of children()
	 *
	 * @param int $idx Index passed on to children() (default: -1)
	 * @return mixed Whatever children() returns for $idx
	 */
	function childNodes($idx = -1)
	{
		return $this->children($idx);
	}
1523 | 
	/**
	 * camelCase alias of first_child()
	 *
	 * @return mixed Whatever first_child() returns
	 */
	function firstChild()
	{
		return $this->first_child();
	}
1528 | 
	/**
	 * camelCase alias of last_child()
	 *
	 * @return mixed Whatever last_child() returns
	 */
	function lastChild()
	{
		return $this->last_child();
	}
1533 | 
	/**
	 * camelCase alias of next_sibling()
	 *
	 * @return mixed Whatever next_sibling() returns
	 */
	function nextSibling()
	{
		return $this->next_sibling();
	}
1538 | 
	/**
	 * camelCase alias of prev_sibling()
	 *
	 * @return mixed Whatever prev_sibling() returns
	 */
	function previousSibling()
	{
		return $this->prev_sibling();
	}
1543 | 
	/**
	 * camelCase alias of has_child()
	 *
	 * @return mixed Whatever has_child() returns
	 */
	function hasChildNodes()
	{
		return $this->has_child();
	}
1548 | 
	/**
	 * Get the tag name of the node
	 *
	 * @return string The value of $this->tag
	 */
	function nodeName()
	{
		return $this->tag;
	}
1553 | 
	/**
	 * Append a node to the current node
	 *
	 * Hands the current node to the parent() method of $node, which
	 * presumably registers $node as child of the current node.
	 *
	 * @param object $node The node to append
	 * @return object The node passed as $node
	 */
	function appendChild($node)
	{
		$node->parent($this);
		return $node;
	}
1559 | 
1560 | }
1561 | 
1562 | /**
1563 |  * simple html dom parser
1564 |  *
1565 |  * Paperg - in the find routine: allow us to specify that we want case
1566 |  * insensitive testing of the value of the selector.
1567 |  *
1568 |  * Paperg - change $size from protected to public so we can easily access it
1569 |  *
1570 |  * Paperg - added ForceTagsClosed in the constructor which tells us whether we
1571 |  * trust the html or not.  Default is to NOT trust it.
1572 |  *
1573 |  * @package PlaceLocalInclude
1574 |  */
1575 | class simple_html_dom
1576 | {
1577 | 	/**
1578 | 	 * The root node of the document
1579 | 	 *
1580 | 	 * @var object
1581 | 	 */
1582 | 	public $root = null;
1583 | 
1584 | 	/**
1585 | 	 * List of nodes in the current DOM
1586 | 	 *
1587 | 	 * @var array
1588 | 	 */
1589 | 	public $nodes = array();
1590 | 
1591 | 	/**
1592 | 	 * Callback function to run for each element in the DOM.
1593 | 	 *
1594 | 	 * @var callable|null
1595 | 	 */
1596 | 	public $callback = null;
1597 | 
1598 | 	/**
1599 | 	 * Indicates how tags and attributes are matched
1600 | 	 *
1601 | 	 * @var bool When set to **true** tags and attributes will be converted to
1602 | 	 * lowercase before matching.
1603 | 	 */
1604 | 	public $lowercase = false;
1605 | 
1606 | 	/**
1607 | 	 * Original document size
1608 | 	 *
1609 | 	 * Holds the original document size.
1610 | 	 *
1611 | 	 * @var int
1612 | 	 */
1613 | 	public $original_size;
1614 | 
1615 | 	/**
1616 | 	 * Current document size
1617 | 	 *
1618 | 	 * Holds the current document size. The document size is determined by the
1619 | 	 * string length of ({@see simple_html_dom::$doc}).
1620 | 	 *
1621 | 	 * _Note_: Using this variable is more efficient than calling `strlen($doc)`
1622 | 	 *
1623 | 	 * @var int
1624 | 	 * */
1625 | 	public $size;
1626 | 
1627 | 	/**
1628 | 	 * Current position in the document
1629 | 	 *
1630 | 	 * @var int
1631 | 	 */
1632 | 	protected $pos;
1633 | 
1634 | 	/**
1635 | 	 * The document
1636 | 	 *
1637 | 	 * @var string
1638 | 	 */
1639 | 	protected $doc;
1640 | 
1641 | 	/**
1642 | 	 * Current character
1643 | 	 *
1644 | 	 * Holds the current character at position {@see simple_html_dom::$pos} in
1645 | 	 * the document {@see simple_html_dom::$doc}
1646 | 	 *
1647 | 	 * _Note_: Using this variable is more efficient than calling
1648 | 	 * `substr($doc, $pos, 1)`
1649 | 	 *
1650 | 	 * @var string
1651 | 	 */
1652 | 	protected $char;
1653 | 
1654 | 	protected $cursor;
1655 | 
1656 | 	/**
1657 | 	 * Parent node of the next node detected by the parser
1658 | 	 *
1659 | 	 * @var object
1660 | 	 */
1661 | 	protected $parent;
1662 | 	protected $noise = array();
1663 | 
1664 | 	/**
1665 | 	 * Tokens considered blank in HTML
1666 | 	 *
1667 | 	 * @var string
1668 | 	 */
1669 | 	protected $token_blank = " \t\r\n";
1670 | 
1671 | 	/**
1672 | 	 * Tokens to identify the equal sign for attributes, stopping either at the
1673 | 	 * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
1674 | 	 * "<html>")
1675 | 	 *
1676 | 	 * @var string
1677 | 	 */
1678 | 	protected $token_equal = ' =/>';
1679 | 
1680 | 	/**
1681 | 	 * Tokens to identify the end of a tag name. A tag name either ends on the
1682 | 	 * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
1683 | 	 *
1684 | 	 * @var string
1685 | 	 */
1686 | 	protected $token_slash = " />\r\n\t";
1687 | 
1688 | 	/**
1689 | 	 * Tokens to identify the end of an attribute
1690 | 	 *
1691 | 	 * @var string
1692 | 	 */
1693 | 	protected $token_attr = ' >';
1694 | 
1695 | 	// Note that this is referenced by a child node, and so it needs to be
1696 | 	// public for that node to see this information.
1697 | 	public $_charset = '';
1698 | 	public $_target_charset = '';
1699 | 
1700 | 	/**
1701 | 	 * Innertext for <br> elements
1702 | 	 *
1703 | 	 * @var string
1704 | 	 */
1705 | 	protected $default_br_text = '';
1706 | 
1707 | 	/**
1708 | 	 * Suffix for <span> elements
1709 | 	 *
1710 | 	 * @var string
1711 | 	 */
1712 | 	public $default_span_text = '';
1713 | 
1714 | 	/**
1715 | 	 * Defines a list of self-closing tags (Void elements) according to the HTML
1716 | 	 * Specification
1717 | 	 *
1718 | 	 * _Remarks_:
1719 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1720 | 	 * performance about 30%
1721 | 	 * - Sort elements by name for better readability!
1722 | 	 *
1723 | 	 * @link https://www.w3.org/TR/html HTML Specification
1724 | 	 * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
1725 | 	 */
1726 | 	protected $self_closing_tags = array(
1727 | 		'area' => 1,
1728 | 		'base' => 1,
1729 | 		'br' => 1,
1730 | 		'col' => 1,
1731 | 		'embed' => 1,
1732 | 		'hr' => 1,
1733 | 		'img' => 1,
1734 | 		'input' => 1,
1735 | 		'link' => 1,
1736 | 		'meta' => 1,
1737 | 		'param' => 1,
1738 | 		'source' => 1,
1739 | 		'track' => 1,
1740 | 		'wbr' => 1
1741 | 	);
1742 | 
1743 | 	/**
1744 | 	 * Defines a list of tags which - if closed - close all optional closing
1745 | 	 * elements within if they haven't been closed yet. (So, an element where
1746 | 	 * neither opening nor closing tag is omissible consistently closes every
1747 | 	 * optional closing element within)
1748 | 	 *
1749 | 	 * _Remarks_:
1750 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1751 | 	 * performance about 30%
1752 | 	 * - Sort elements by name for better readability!
1753 | 	 */
1754 | 	protected $block_tags = array(
1755 | 		'body' => 1,
1756 | 		'div' => 1,
1757 | 		'form' => 1,
1758 | 		'root' => 1,
1759 | 		'span' => 1,
1760 | 		'table' => 1
1761 | 	);
1762 | 
1763 | 	/**
1764 | 	 * Defines elements whose end tag is omissible.
1765 | 	 *
1766 | 	 * * key = Name of an element whose end tag is omissible.
1767 | 	 * * value = Names of elements whose end tag is omissible, that are closed
1768 | 	 * by the current element.
1769 | 	 *
1770 | 	 * _Remarks_:
1771 | 	 * - Use `isset()` instead of `in_array()` on array elements to boost
1772 | 	 * performance about 30%
1773 | 	 * - Sort elements by name for better readability!
1774 | 	 *
1775 | 	 * **Example**
1776 | 	 *
1777 | 	 * An `li` element’s end tag may be omitted if the `li` element is immediately
1778 | 	 * followed by another `li` element. To do that, add following element to the
1779 | 	 * array:
1780 | 	 *
1781 | 	 * ```php
1782 | 	 * 'li' => array('li' => 1),
1783 | 	 * ```
1784 | 	 *
1785 | 	 * With this, the following two examples are considered equal. Note that the
1786 | 	 * second example is missing the closing tags on `li` elements.
1787 | 	 *
1788 | 	 * ```html
1789 | 	 * <ul><li>First Item</li><li>Second Item</li></ul>
1790 | 	 * ```
1791 | 	 *
1792 | 	 * <ul><li>First Item</li><li>Second Item</li></ul>
1793 | 	 *
1794 | 	 * ```html
1795 | 	 * <ul><li>First Item<li>Second Item</ul>
1796 | 	 * ```
1797 | 	 *
1798 | 	 * <ul><li>First Item<li>Second Item</ul>
1799 | 	 *
1800 | 	 * @var array A two-dimensional array where the key is the name of an
1801 | 	 * element whose end tag is omissible and the value is an array of elements
1802 | 	 * whose end tag is omissible, that are closed by the current element.
1803 | 	 *
1804 | 	 * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
1805 | 	 *
1806 | 	 * @todo The implementation of optional closing tags doesn't work in all cases
1807 | 	 * because it only considers elements that close other optional closing
1808 | 	 * tags, not taking into account that some (non-blocking) tags should close
1809 | 	 * these optional closing tags. For example, the end tag for "p" is omissible
1810 | 	 * and can be closed by an "address" element, whose end tag is NOT omissible.
1811 | 	 * Currently a "p" element without closing tag stops at the next "p" element
1812 | 	 * or blocking tag, even if it contains other elements.
1813 | 	 *
1814 | 	 * @todo Known sourceforge issue #2977341
1815 | 	 * B tags that are not closed cause us to return everything to the end of
1816 | 	 * the document.
1817 | 	 */
1818 | 	protected $optional_closing_tags = array(
1819 | 		// Not optional, see
1820 | 		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1821 | 		'b' => array('b' => 1),
1822 | 		'dd' => array('dd' => 1, 'dt' => 1),
1823 | 		// Not optional, see
1824 | 		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1825 | 		'dl' => array('dd' => 1, 'dt' => 1),
1826 | 		'dt' => array('dd' => 1, 'dt' => 1),
1827 | 		'li' => array('li' => 1),
1828 | 		'optgroup' => array('optgroup' => 1, 'option' => 1),
1829 | 		'option' => array('optgroup' => 1, 'option' => 1),
1830 | 		'p' => array('p' => 1),
1831 | 		'rp' => array('rp' => 1, 'rt' => 1),
1832 | 		'rt' => array('rp' => 1, 'rt' => 1),
1833 | 		'td' => array('td' => 1, 'th' => 1),
1834 | 		'th' => array('td' => 1, 'th' => 1),
1835 | 		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1836 | 	);
1837 | 
1838 | 	/**
1839 | 	 * Create a parser and optionally load a document right away.
1840 | 	 *
1841 | 	 * $str is handed to load_file() when it starts with http:// or https:// or
1842 | 	 * names an existing file; any other non-empty string is parsed as HTML by
1843 | 	 * load(), which also receives $lowercase, $stripRN, $defaultBRText,
1844 | 	 * $defaultSpanText and $options. Passing $forceTagsClosed = false empties
1845 | 	 * $optional_closing_tags before anything is parsed.
1846 | 	 */
1847 | 	function __construct(
1848 | 		$str = null, $lowercase = true, $forceTagsClosed = true,
1849 | 		$target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true,
1850 | 		$defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT,
1851 | 		$options = 0)
1852 | 	{
1853 | 		// Forcing tags to be closed implies that we don't trust the html, but
1854 | 		// it can lead to parsing errors if we SHOULD trust the html. This must
1855 | 		// run before loading, otherwise it cannot influence the parse.
1856 | 		if (!$forceTagsClosed) {
1857 | 			$this->optional_closing_tags = array();
1858 | 		}
1859 | 
1860 | 		$this->_target_charset = $target_charset;
1861 | 		if ($str) {
1862 | 			if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
1863 | 				$this->load_file($str);
1864 | 			} else {
1865 | 				$this->load($str, $lowercase, $stripRN,
1866 | 					$defaultBRText, $defaultSpanText, $options);
1867 | 			}
1868 | 		}
1869 | 	}
1870 | 
1871 | 	function __destruct() // on destruction, run the same cleanup as clear()
1872 | 	{
1873 | 		$this->clear();
1874 | 	}
1875 | 
1876 | 	/**
1877 | 	 * Load HTML from a string and parse it into a node tree.
1878 | 	 *
1879 | 	 * Before parse() runs, markup the parser must not interpret is swapped for
1880 | 	 * placeholders by remove_noise().
1881 | 	 *
1882 | 	 * @param string $str HTML to parse
1883 | 	 * @param bool $lowercase Passed on to prepare()
1884 | 	 * @param bool $stripRN True to replace every "\r" and "\n" with a space
1885 | 	 * @param string $defaultBRText Passed on to prepare()
1886 | 	 * @param string $defaultSpanText Passed on to prepare()
1887 | 	 * @param int $options Bit mask; HDOM_SMARTY_AS_TEXT also hides "{...}" blocks
1888 | 	 * @return simple_html_dom This instance, so calls can be chained
1889 | 	 */
1890 | 	function load($str, $lowercase = true, $stripRN = true,
1891 | 		$defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT,
1892 | 		$options = 0)
1893 | 	{
1894 | 		global $debug_object;
1895 | 
1896 | 		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
1897 | 
1898 | 		// Script tags go first, ahead of style tags (sourceforge #2949097).
1899 | 		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
1900 | 		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
1901 | 
1902 | 		if ($stripRN) {
1903 | 			// Line breaks become spaces; refresh the cached length afterwards.
1904 | 			$this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);
1905 | 			$this->size = strlen($this->doc);
1906 | 		}
1907 | 
1908 | 		// pattern => true to hide the whole match, false to hide only the first
1909 | 		// capture group. Patterns run in the order listed.
1910 | 		$noise = array(
1911 | 			"'<!\[CDATA\[(.*?)\]\]>'is" => true, // cdata
1912 | 			"'<!--(.*?)-->'is" => false, // comments
1913 | 			"'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is" => false, // <style ...>
1914 | 			"'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is" => false, // bare <style>
1915 | 			"'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is" => false, // preformatted
1916 | 			"'(<\?)(.*?)(\?>)'s" => true, // server side scripts
1917 | 		);
1918 | 		if ($options & HDOM_SMARTY_AS_TEXT) {
1919 | 			$noise["'(\{\w)(.*?)(\})'s"] = true; // Smarty scripts
1920 | 		}
1921 | 		foreach ($noise as $pattern => $whole_match) {
1922 | 			$this->remove_noise($pattern, $whole_match);
1923 | 		}
1924 | 
1925 | 		$this->parse();
1926 | 		$this->root->_[HDOM_INFO_END] = $this->cursor;
1927 | 		$this->parse_charset();
1928 | 		return $this;
1929 | 	}
1930 | 
1931 | 	// Load HTML from a file or URL. All arguments are forwarded unchanged to
1932 | 	// file_get_contents(). Returns this object (the result of load()) on
1933 | 	// success, or false if the contents could not be read.
1934 | 	function load_file()
1935 | 	{
1936 | 		$args = func_get_args();
1937 | 		$doc = call_user_func_array('file_get_contents', $args);
1938 | 
1939 | 		if ($doc === false) { return false; }
1940 | 		return $this->load($doc, true);
1941 | 	}
1942 | 
1943 | 	/**
1944 | 	 * Set the callback function
1945 | 	 *
1946 | 	 * @param callable $function_name Callback function to run for each element
1947 | 	 * in the DOM. Stored in $this->callback until remove_callback() is called.
1948 | 	 * @return void
1949 | 	 */
1950 | 	function set_callback($function_name)
1951 | 	{
1952 | 		$this->callback = $function_name;
1953 | 	}
1954 | 
1955 | 	/**
1956 | 	 * Remove callback function
1957 | 	 *
1958 | 	 * @return void
1959 | 	 */
1960 | 	function remove_callback()
1961 | 	{
1962 | 		$this->callback = null; // undoes set_callback()
1963 | 	}
1964 | 
1965 | 	// Render the DOM to a string; also write it to $filepath when one is given.
1966 | 	function save($filepath = '')
1967 | 	{
1968 | 		$html = $this->root->innertext();
1969 | 		if ('' !== $filepath) { file_put_contents($filepath, $html, LOCK_EX); }
1970 | 		return $html;
1971 | 	}
1972 | 
1973 | 	// Find DOM nodes by CSS selector; the search is delegated to the root node.
1974 | 	// Paperg - $lowercase allows case insensitive testing of the selector value.
1975 | 	function find($selector, $idx = null, $lowercase = false)
1976 | 	{
1977 | 		return $this->root->find($selector, $idx, $lowercase);
1978 | 	}
1979 | 
1980 | 	// Release memory: break the circular references between this object and
1981 | 	// its nodes (a php5 memory leak), then drop the document text and noise.
1982 | 	function clear()
1983 | 	{
1984 | 		foreach ($this->nodes as $node) {
1985 | 			$node->clear();
1986 | 			$node = null;
1987 | 		}
1988 | 
1989 | 		// Sourceforge #2977248: clearing the nodes alone still leaked memory,
1990 | 		// so children, parent and root are cleared explicitly as well.
1991 | 		$children = isset($this->children) ? $this->children : array();
1992 | 		foreach ($children as $child) {
1993 | 			$child->clear();
1994 | 			$child = null;
1995 | 		}
1996 | 
1997 | 		if (isset($this->parent)) {
1998 | 			$this->parent->clear();
1999 | 			unset($this->parent);
2000 | 		}
2001 | 
2002 | 		if (isset($this->root)) {
2003 | 			$this->root->clear();
2004 | 			unset($this->root);
2005 | 		}
2006 | 
2007 | 		unset($this->doc, $this->noise);
2008 | 	}
2009 | 
2010 | 	function dump($show_attr = true) // delegates to the root node's dump()
2011 | 	{
2012 | 		$this->root->dump($show_attr);
2013 | 	}
2014 | 
2015 | 	// Reset all parser state for a new document and create the root node.
2016 | 	protected function prepare($str, $lowercase = true,
2017 | 		$defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
2018 | 	{
2019 | 		$this->clear();
2020 | 
2021 | 		// Document text, its length (also kept as the original size) and scan state
2022 | 		$this->doc = trim($str);
2023 | 		$this->original_size = $this->size = strlen($this->doc);
2024 | 		$this->pos = 0;
2025 | 		$this->cursor = 1;
2026 | 		$this->noise = $this->nodes = array();
2027 | 		$this->lowercase = $lowercase;
2028 | 		$this->default_br_text = $defaultBRText;
2029 | 		$this->default_span_text = $defaultSpanText;
2030 | 
2031 | 		// Root node: tagged 'root', also the first parent for linked nodes
2032 | 		$root = $this->root = new simple_html_dom_node($this);
2033 | 		$root->tag = 'root';
2034 | 		$root->_[HDOM_INFO_BEGIN] = -1;
2035 | 		$root->nodetype = HDOM_TYPE_ROOT;
2036 | 		$this->parent = $root;
2037 | 
2038 | 		if ($this->size > 0) { $this->char = $this->doc[0]; }
2039 | 	}
2040 | 
2041 | 	/**
2042 | 	 * Parse the prepared document into nodes.
2043 | 	 *
2044 | 	 * Alternates between text runs (everything up to the next "<") and tags
2045 | 	 * until read_tag() reports that there is no tag to read.
2046 | 	 *
2047 | 	 * @return bool Always true
2048 | 	 */
2049 | 	protected function parse()
2050 | 	{
2051 | 		for (;;) {
2052 | 			$text = $this->copy_until_char('<');
2053 | 			if ($text !== '') {
2054 | 				// Text between tags becomes a text node of the current parent
2055 | 				$node = new simple_html_dom_node($this);
2056 | 				++$this->cursor;
2057 | 				$node->_[HDOM_INFO_TEXT] = $text;
2058 | 				$this->link_nodes($node, false);
2059 | 				continue;
2060 | 			}
2061 | 
2062 | 			// Nothing in front of the next tag: read it, or stop if there is none
2063 | 			if (!$this->read_tag()) { return true; }
2064 | 		}
2065 | 	}
2066 | 
2067 | 	/**
2068 | 	 * Work out the character set of the document that was just parsed.
2069 | 	 *
2070 | 	 * Sources, in order of precedence:
2071 | 	 * 1. The charset of the Content-Type header, when a global function
2072 | 	 *    get_last_retrieve_url_contents_content_type() exists and returns the
2073 | 	 *    header of the last transfer (e.g. CURLINFO_CONTENT_TYPE).
2074 | 	 * 2. The charset of a <meta http-equiv="Content-Type"> element; when its
2075 | 	 *    non-empty content names no charset, ISO-8859-1 is assumed.
2076 | 	 * 3. mb_detect_encoding() on the document text (UTF-8 or CP1252).
2077 | 	 * 4. UTF-8 as the last resort.
2078 | 	 *
2079 | 	 * ISO-8859-1 and its Latin1 aliases are reported as CP1252, a superset.
2080 | 	 *
2081 | 	 * PAPERG - dkchou - added this to try to identify the character set of the
2082 | 	 * page we have just parsed so we know better how to spit it out later.
2083 | 	 *
2084 | 	 * @return string The detected charset, also stored in $this->_charset
2085 | 	 */
2086 | 	protected function parse_charset()
2087 | 	{
2088 | 		global $debug_object;
2089 | 
2090 | 		// Parameter names are case-insensitive and the value may be quoted or
2091 | 		// followed by further parameters, so capture only the charset token.
2092 | 		$pattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';
2093 | 		$charset = null;
2094 | 
2095 | 		if (function_exists('get_last_retrieve_url_contents_content_type')) {
2096 | 			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
2097 | 			if (preg_match($pattern, $contentTypeHeader, $matches)) {
2098 | 				$charset = $matches[1];
2099 | 				if (is_object($debug_object)) {
2100 | 					$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);
2101 | 				}
2102 | 			}
2103 | 		}
2104 | 
2105 | 		if (empty($charset)) {
2106 | 			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
2107 | 
2108 | 			if (!empty($el)) {
2109 | 				$fullvalue = $el->content;
2110 | 				if (is_object($debug_object)) {
2111 | 					$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);
2112 | 				}
2113 | 
2114 | 				if (!empty($fullvalue)) {
2115 | 					if (preg_match($pattern, $fullvalue, $matches)) {
2116 | 						$charset = $matches[1];
2117 | 					} else {
2118 | 						// If there is a meta tag, and they don't specify the
2119 | 						// character set, research says that it's typically
2120 | 						// ISO-8859-1
2121 | 						if (is_object($debug_object)) {
2122 | 							$debug_object->debug_log(2,
2123 | 								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
2124 | 							);
2125 | 						}
2126 | 
2127 | 						$charset = 'ISO-8859-1';
2128 | 					}
2129 | 				}
2130 | 			}
2131 | 		}
2132 | 
2133 | 		// If we couldn't find a charset above, then lets try to detect one
2134 | 		// based on the text we got...
2135 | 		if (empty($charset)) {
2136 | 			// Use this in case mb_detect_encoding isn't installed/loaded on
2137 | 			// this machine.
2138 | 			$charset = false;
2139 | 			if (function_exists('mb_detect_encoding')) {
2140 | 				// Have php try to detect the encoding from the text given to us.
2141 | 				$charset = mb_detect_encoding(
2142 | 					$this->doc . 'ascii',
2143 | 					array('UTF-8', 'CP1252')
2144 | 				);
2145 | 
2146 | 				if (is_object($debug_object)) {
2147 | 					$debug_object->debug_log(2, 'mb_detect found: ' . $charset);
2148 | 				}
2149 | 			}
2150 | 
2151 | 			// and if this doesn't work...  then we need to just wrongheadedly
2152 | 			// assume it's UTF-8 so that we can move on - cause this will
2153 | 			// usually give us most of what we need...
2154 | 			if ($charset === false) {
2155 | 				if (is_object($debug_object)) {
2156 | 					$debug_object->debug_log(
2157 | 						2,
2158 | 						'since mb_detect failed - using default of utf-8'
2159 | 					);
2160 | 				}
2161 | 
2162 | 				$charset = 'UTF-8';
2163 | 			}
2164 | 		}
2165 | 
2166 | 		// Since CP1252 is a superset, if we get one of its subsets, we want
2167 | 		// it instead.
2168 | 		$lower = strtolower($charset);
2169 | 		if ($lower === 'iso-8859-1' || $lower === 'latin1' || $lower === 'latin-1') {
2170 | 			if (is_object($debug_object)) {
2171 | 				$debug_object->debug_log(
2172 | 					2,
2173 | 					'replacing ' . $charset . ' with CP1252 as its a superset'
2174 | 				);
2175 | 			}
2176 | 
2177 | 			$charset = 'CP1252';
2178 | 		}
2179 | 
2180 | 		if (is_object($debug_object)) {
2181 | 			$debug_object->debug_log(1, 'EXIT - ' . $charset);
2182 | 		}
2183 | 
2184 | 		return $this->_charset = $charset;
2185 | 	}
2186 | 
2187 | 	/**
2188 | 	 * Parse the tag at the current document position and link it into the tree.
2189 | 	 *
2190 | 	 * @return bool False if the current character is not "<", true otherwise
2191 | 	 */
2192 | 	protected function read_tag()
2193 | 	{
2194 | 		// Set end position if no further tags found
2195 | 		if ($this->char !== '<') {
2196 | 			$this->root->_[HDOM_INFO_END] = $this->cursor;
2197 | 			return false;
2198 | 		}
2199 | 
2200 | 		$begin_tag_pos = $this->pos; // position of the "<"
2201 | 		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2202 | 
2203 | 		// end tag
2204 | 		if ($this->char === '/') {
2205 | 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2206 | 
2207 | 			// Skip whitespace in end tags (i.e. in "</   html>")
2208 | 			$this->skip($this->token_blank);
2209 | 			$tag = $this->copy_until_char('>');
2210 | 
2211 | 			// Skip attributes in end tags
2212 | 			if (($pos = strpos($tag, ' ')) !== false) {
2213 | 				$tag = substr($tag, 0, $pos);
2214 | 			}
2215 | 
2216 | 			$parent_lower = strtolower($this->parent->tag);
2217 | 			$tag_lower = strtolower($tag);
2218 | 
2219 | 			// The end tag is supposed to close the parent tag. Handle situations
2220 | 			// when it doesn't
2221 | 			if ($parent_lower !== $tag_lower) {
2222 | 				// Parent tag does not have to be closed necessarily (optional closing tag)
2223 | 				// Current tag is a block tag, so it may close an ancestor
2224 | 				if (isset($this->optional_closing_tags[$parent_lower])
2225 | 					&& isset($this->block_tags[$tag_lower])) {
2226 | 
2227 | 					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
2228 | 					$org_parent = $this->parent;
2229 | 
2230 | 					// Traverse ancestors to find a matching opening tag
2231 | 					// Stop at root node
2232 | 					while (($this->parent->parent)
2233 | 						&& strtolower($this->parent->tag) !== $tag_lower
2234 | 					){
2235 | 						$this->parent = $this->parent->parent;
2236 | 					}
2237 | 
2238 | 					// If we don't have a match add current tag as text node
2239 | 					if (strtolower($this->parent->tag) !== $tag_lower) {
2240 | 						$this->parent = $org_parent; // restore original parent
2241 | 
2242 | 						if ($this->parent->parent) {
2243 | 							$this->parent = $this->parent->parent;
2244 | 						}
2245 | 
2246 | 						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2247 | 						return $this->as_text_node($tag);
2248 | 					}
2249 | 				} elseif (($this->parent->parent)
2250 | 					&& isset($this->block_tags[$tag_lower])
2251 | 				) {
2252 | 					// Grandparent exists and current tag is a block tag, so our
2253 | 					// parent doesn't have an end tag
2254 | 					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
2255 | 					$org_parent = $this->parent;
2256 | 
2257 | 					// Traverse ancestors to find a matching opening tag
2258 | 					// Stop at root node
2259 | 					while (($this->parent->parent)
2260 | 						&& strtolower($this->parent->tag) !== $tag_lower
2261 | 					) {
2262 | 						$this->parent = $this->parent->parent;
2263 | 					}
2264 | 
2265 | 					// If we don't have a match add current tag as text node
2266 | 					if (strtolower($this->parent->tag) !== $tag_lower) {
2267 | 						$this->parent = $org_parent; // restore original parent
2268 | 						$this->parent->_[HDOM_INFO_END] = $this->cursor;
2269 | 						return $this->as_text_node($tag);
2270 | 					}
2271 | 				} elseif (($this->parent->parent)
2272 | 					&& strtolower($this->parent->parent->tag) === $tag_lower
2273 | 				) { // Grandparent exists and current tag closes it
2274 | 					$this->parent->_[HDOM_INFO_END] = 0;
2275 | 					$this->parent = $this->parent->parent;
2276 | 				} else { // Random tag, add as text node
2277 | 					return $this->as_text_node($tag);
2278 | 				}
2279 | 			}
2280 | 
2281 | 			// Set end position of parent tag to current cursor position
2282 | 			$this->parent->_[HDOM_INFO_END] = $this->cursor;
2283 | 
2284 | 			if ($this->parent->parent) {
2285 | 				$this->parent = $this->parent->parent;
2286 | 			}
2287 | 
2288 | 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2289 | 			return true;
2290 | 		}
2291 | 
2292 | 		// start tag
2293 | 		$node = new simple_html_dom_node($this);
2294 | 		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
2295 | 		++$this->cursor;
2296 | 		$tag = $this->copy_until($this->token_slash); // Get tag name
2297 | 		$node->tag_start = $begin_tag_pos;
2298 | 
2299 | 		// doctype, cdata & comments...
2300 | 		// <!DOCTYPE html>
2301 | 		// <![CDATA[ ... ]]>
2302 | 		// <!-- Comment -->
2303 | 		if (isset($tag[0]) && $tag[0] === '!') {
2304 | 			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
2305 | 
2306 | 			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
2307 | 				$node->nodetype = HDOM_TYPE_COMMENT;
2308 | 				$node->tag = 'comment';
2309 | 			} else { // Could be doctype or CDATA but we don't care
2310 | 				$node->nodetype = HDOM_TYPE_UNKNOWN;
2311 | 				$node->tag = 'unknown';
2312 | 			}
2313 | 
2314 | 			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2315 | 
2316 | 			$this->link_nodes($node, true);
2317 | 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2318 | 			return true;
2319 | 		}
2320 | 
2321 | 		// The start tag cannot contain another start tag, if so add as text
2322 | 		// i.e. "<<html>"
2323 | 		if ($pos = strpos($tag, '<') !== false) { // "!==" binds tighter than "=": $pos is a bool and unused
2324 | 			$tag = '<' . substr($tag, 0, -1);
2325 | 			$node->_[HDOM_INFO_TEXT] = $tag;
2326 | 			$this->link_nodes($node, false);
2327 | 			$this->char = $this->doc[--$this->pos]; // prev
2328 | 			return true;
2329 | 		}
2330 | 
2331 | 		// Handle invalid tag names (i.e. "<html#doc>")
2332 | 		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
2333 | 			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
2334 | 
2335 | 			// Next char is the beginning of a new tag, don't touch it.
2336 | 			if ($this->char === '<') {
2337 | 				$this->link_nodes($node, false);
2338 | 				return true;
2339 | 			}
2340 | 
2341 | 			// Next char closes current tag, add and be done with it.
2342 | 			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
2343 | 			$this->link_nodes($node, false);
2344 | 			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2345 | 			return true;
2346 | 		}
2347 | 
2348 | 		// begin tag, add new node
2349 | 		$node->nodetype = HDOM_TYPE_ELEMENT;
2350 | 		$tag_lower = strtolower($tag);
2351 | 		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
2352 | 
2353 | 		// handle optional closing tags
2354 | 		if (isset($this->optional_closing_tags[$tag_lower])) {
2355 | 			// Traverse ancestors to close all optional closing tags
2356 | 			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
2357 | 				$this->parent->_[HDOM_INFO_END] = 0;
2358 | 				$this->parent = $this->parent->parent;
2359 | 			}
2360 | 			$node->parent = $this->parent;
2361 | 		}
2362 | 
2363 | 		$guard = 0; // prevent infinite loop
2364 | 
2365 | 		// [0] Space between tag and first attribute
2366 | 		$space = array($this->copy_skip($this->token_blank), '', '');
2367 | 
2368 | 		// attributes
2369 | 		do {
2370 | 			// Everything until the first equal sign should be the attribute name
2371 | 			$name = $this->copy_until($this->token_equal);
2372 | 
2373 | 			if ($name === '' && $this->char !== null && $space[0] === '') {
2374 | 				break;
2375 | 			}
2376 | 
2377 | 			if ($guard === $this->pos) { // Escape infinite loop
2378 | 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2379 | 				continue;
2380 | 			}
2381 | 
2382 | 			$guard = $this->pos;
2383 | 
2384 | 			// handle endless '<'
2385 | 			// Out of bounds before the tag ended
2386 | 			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
2387 | 				$node->nodetype = HDOM_TYPE_TEXT;
2388 | 				$node->_[HDOM_INFO_END] = 0;
2389 | 				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
2390 | 				$node->tag = 'text';
2391 | 				$this->link_nodes($node, false);
2392 | 				return true;
2393 | 			}
2394 | 
2395 | 			// handle mismatch '<'
2396 | 			// Attributes cannot start after opening tag
2397 | 			if ($this->doc[$this->pos - 1] == '<') {
2398 | 				$node->nodetype = HDOM_TYPE_TEXT;
2399 | 				$node->tag = 'text';
2400 | 				$node->attr = array();
2401 | 				$node->_[HDOM_INFO_END] = 0;
2402 | 				$node->_[HDOM_INFO_TEXT] = substr(
2403 | 					$this->doc,
2404 | 					$begin_tag_pos,
2405 | 					$this->pos - $begin_tag_pos - 1
2406 | 				);
2407 | 				$this->pos -= 2; // with the ++ on the next line: step back one character
2408 | 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2409 | 				$this->link_nodes($node, false);
2410 | 				return true;
2411 | 			}
2412 | 
2413 | 			if ($name !== '/' && $name !== '') { // this is an attribute name
2414 | 				// [1] Whitespace after attribute name
2415 | 				$space[1] = $this->copy_skip($this->token_blank);
2416 | 
2417 | 				$name = $this->restore_noise($name); // might be a noisy name
2418 | 
2419 | 				if ($this->lowercase) { $name = strtolower($name); }
2420 | 
2421 | 				if ($this->char === '=') { // attribute with value
2422 | 					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2423 | 					$this->parse_attr($node, $name, $space); // get attribute value
2424 | 				} else {
2425 | 					//no value attr: nowrap, checked selected...
2426 | 					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2427 | 					$node->attr[$name] = true;
2428 | 					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2429 | 				}
2430 | 
2431 | 				$node->_[HDOM_INFO_SPACE][] = $space;
2432 | 
2433 | 				// prepare for next attribute
2434 | 				$space = array(
2435 | 					$this->copy_skip($this->token_blank),
2436 | 					'',
2437 | 					''
2438 | 				);
2439 | 			} else { // no more attributes
2440 | 				break;
2441 | 			}
2442 | 		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2443 | 
2444 | 		$this->link_nodes($node, true);
2445 | 		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2446 | 
2447 | 		// handle empty tags (i.e. "<div/>")
2448 | 		if ($this->copy_until_char('>') === '/') {
2449 | 			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2450 | 			$node->_[HDOM_INFO_END] = 0;
2451 | 		} else {
2452 | 			// reset parent
2453 | 			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2454 | 				$this->parent = $node;
2455 | 			}
2456 | 		}
2457 | 
2458 | 		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2459 | 
2460 | 		// If it's a BR tag, we need to set its text to the default text.
2461 | 		// This way when we see it in plaintext, we can generate formatting that the user wants.
2462 | 		// since a br tag never has sub nodes, this works well.
2463 | 		if ($node->tag === 'br') {
2464 | 			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2465 | 		}
2466 | 
2467 | 		return true;
2468 | 	}
2469 | 
2470 | 	/**
2471 | 	 * Parse attribute value from current document position
2472 | 	 *
2473 | 	 * The value is always consumed, but it is stored only for the first
2474 | 	 * occurrence of an attribute name within a tag (sourceforge #3061408,
2475 | 	 * https://stackoverflow.com/a/26341866).
2476 | 	 *
2477 | 	 * @param object $node Node for the attributes
2478 | 	 * @param string $name Name of the current attribute
2479 | 	 * @param array $space Array for spacing information
2480 | 	 * @return void
2481 | 	 */
2482 | 	protected function parse_attr($node, $name, &$space)
2483 | 	{
2484 | 		// A repeated attribute must still be read past: returning before the
2485 | 		// value was consumed left it in the stream, where the caller parsed it
2486 | 		// as a bogus attribute name.
2487 | 		$is_duplicate = isset($node->attr[$name]);
2488 | 
2489 | 		// [2] Whitespace between "=" and the value
2490 | 		$space[2] = $this->copy_skip($this->token_blank);
2491 | 
2492 | 		switch ($this->char) {
2493 | 			case '"': // value is anything between double quotes
2494 | 			case '\'': // value is anything between single quotes
2495 | 				$quote = $this->char;
2496 | 				$quote_type = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
2497 | 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2498 | 				$value = $this->copy_until_char($quote);
2499 | 				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2500 | 				break;
2501 | 			default: // value is anything until the first space or end tag
2502 | 				$quote_type = HDOM_QUOTE_NO;
2503 | 				$value = $this->copy_until($this->token_attr);
2504 | 		}
2505 | 
2506 | 		if ($is_duplicate) { return; }
2507 | 
2508 | 		// PaperG: Attributes should not have \r or \n in them, that counts as
2509 | 		// html whitespace.
2510 | 		$value = str_replace(array("\r", "\n"), '', $this->restore_noise($value));
2511 | 		// PaperG: If this is a "class" selector, lets get rid of the preceding
2512 | 		// and trailing space since some people leave it in the multi class case.
2513 | 		if ($name === 'class') { $value = trim($value); }
2514 | 
2515 | 		$node->_[HDOM_INFO_QUOTE][] = $quote_type;
2516 | 		$node->attr[$name] = $value;
2517 | 	}
2518 | 
2519 | 	/**
2520 | 	 * Attach a node to the node currently acting as parent
2521 | 	 *
2522 | 	 * The node is always appended to the parent's node list; it is appended to
2523 | 	 * the parent's children as well only when $is_child is true.
2524 | 	 * @param object $node Node to link to parent
2525 | 	 * @param bool $is_child True if the node is a child of parent
2526 | 	 * @return void
2527 | 	 */
2528 | 	protected function link_nodes(&$node, $is_child)
2529 | 	{
2530 | 		$owner = $this->parent;
2531 | 		$node->parent = $owner;
2532 | 		$owner->nodes[] = $node;
2533 | 		if ($is_child) { $owner->children[] = $node; }
2534 | 	}
2535 | 
2536 | 	/**
2537 | 	 * Add an end tag as a text node to the current parent node
2538 | 	 * @param string $tag Tag name; stored as the text "</tag>"
2539 | 	 * @return bool Always true
2540 | 	 */
2541 | 	protected function as_text_node($tag)
2542 | 	{
2543 | 		$text_node = new simple_html_dom_node($this);
2544 | 		++$this->cursor;
2545 | 		$text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
2546 | 		$this->link_nodes($text_node, false);
2547 | 		++$this->pos; // next
2548 | 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;
2549 | 		return true;
2550 | 	}
2551 | 
2552 | 	/**
2553 | 	 * Advance the document position past every leading character that occurs
2554 | 	 * in $chars, then refresh the current character (null at end of document).
2555 | 	 *
2556 | 	 * @param string $chars A string containing every allowed character.
2557 | 	 * @return void
2558 | 	 */
2559 | 	protected function skip($chars)
2560 | 	{
2561 | 		$run = strspn($this->doc, $chars, $this->pos);
2562 | 		$this->pos += $run;
2563 | 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;
2564 | 	}
2565 | 
2566 | 	/**
2567 | 	 * Consume the run of characters at the current document position that all
2568 | 	 * occur in $chars and return that run.
2569 | 	 *
2570 | 	 * @param string $chars A string containing every allowed character.
2571 | 	 * @return string The consumed characters, or an empty string if the
2572 | 	 * character at the current position is not part of $chars.
2573 | 	 */
2574 | 	protected function copy_skip($chars)
2575 | 	{
2576 | 		$start = $this->pos;
2577 | 		$run = strspn($this->doc, $chars, $start);
2578 | 		$this->pos = $start + $run;
2579 | 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;
2580 | 
2581 | 		return ($run === 0) ? '' : substr($this->doc, $start, $run);
2582 | 	}
2583 | 
2584 | 	/**
2585 | 	 * Consume characters from the current document position up to, but not
2586 | 	 * including, the first character that occurs in $chars.
2587 | 	 *
2588 | 	 * @param string $chars A string containing every character to stop at.
2589 | 	 * @return string The consumed characters, i.e. whatever substr() yields
2590 | 	 * for that run.
2591 | 	 */
2592 | 	protected function copy_until($chars)
2593 | 	{
2594 | 		$start = $this->pos;
2595 | 		$run = strcspn($this->doc, $chars, $start);
2596 | 		$this->pos = $start + $run;
2597 | 		$this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;
2598 | 		return substr($this->doc, $start, $run);
2599 | 	}
2600 | 
2601 | 	/**
2602 | 	 * Consume characters from the current document position up to, but not
2603 | 	 * including, the first occurrence of the provided string.
2604 | 	 *
2605 | 	 * @param string $char The string to stop at.
2606 | 	 * @return string The consumed characters; the rest of the document when
2607 | 	 * $char does not occur any more.
2608 | 	 */
2609 | 	protected function copy_until_char($char)
2610 | 	{
2611 | 		if ($this->char === null) { return ''; }
2612 | 
2613 | 		$start = $this->pos;
2614 | 		$found = strpos($this->doc, $char, $start);
2615 | 		if ($found === $start) { return ''; }
2616 | 
2617 | 		if ($found === false) {
2618 | 			// No further occurrence: hand out the remainder and mark the end
2619 | 			$this->char = null;
2620 | 			$this->pos = $this->size;
2621 | 			return substr($this->doc, $start, $this->size - $start);
2622 | 		}
2623 | 		$this->char = $this->doc[$found];
2624 | 		$this->pos = $found;
2625 | 		return substr($this->doc, $start, $found - $start);
2626 | 	}
2627 | 
2628 | 	/**
2629 | 	 * Replace noise in the HTML content with placeholders
2630 | 	 *
2631 | 	 * Each match is stored in {@see simple_html_dom::$noise} under the
2632 | 	 * placeholder key that takes its place in the document.
2633 | 	 *
2634 | 	 * @param string $pattern The regex pattern used for finding noise
2635 | 	 * @param bool $remove_tag True to replace the entire match. Default is false
2636 | 	 * to only replace the first captured group.
2637 | 	 * @return void
2638 | 	 */
2639 | 	protected function remove_noise($pattern, $remove_tag = false)
2640 | 	{
2641 | 		global $debug_object;
2642 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2643 | 
2644 | 		$flags = PREG_SET_ORDER | PREG_OFFSET_CAPTURE;
2645 | 		$count = preg_match_all($pattern, $this->doc, $matches, $flags);
2646 | 		$group = $remove_tag ? 0 : 1; // 0 = entire match, 1 = submatch
2647 | 
2648 | 		// Walk the matches back to front so the recorded offsets stay valid
2649 | 		// while the earlier parts of the document are still untouched.
2650 | 		for ($i = $count - 1; $i > -1; --$i) {
2651 | 			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
2652 | 
2653 | 			if (is_object($debug_object)) {
2654 | 				$debug_object->debug_log(2, 'key is: ' . $key);
2655 | 			}
2656 | 
2657 | 			list($found, $offset) = $matches[$i][$group];
2658 | 			$this->noise[$key] = $found;
2659 | 			$this->doc = substr_replace($this->doc, $key, $offset, strlen($found));
2660 | 		}
2661 | 
2662 | 		// reset the length of content
2663 | 		$this->size = strlen($this->doc);
2664 | 		if ($this->size > 0) {
2665 | 			$this->char = $this->doc[0];
2666 | 		}
2667 | 	}
2668 | 
2669 | 	/**
2670 | 	 * Restore noise to HTML content
2671 | 	 *
2672 | 	 * Noise is restored from {@see simple_html_dom::$noise}
2673 | 	 *
2674 | 	 * @param string $text A subset of HTML containing noise
2675 | 	 * @return string The same content with noise restored
2676 | 	 */
2677 | 	function restore_noise($text)
2678 | 	{
2679 | 		global $debug_object;
2680 | 		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
2681 | 
2682 | 		$marker = '___noise___';
2683 | 		$marker_len = strlen($marker);
2684 | 		// A complete key is the marker followed by exactly five characters.
2685 | 		$key_len = $marker_len + 5;
2686 | 
2687 | 		// Position the marker search starts from. It only ever moves forward
2688 | 		// past the "UNDEFINED NOISE" notice written below: that notice echoes
2689 | 		// the unknown key, marker included, so searching from the start again
2690 | 		// would find it on every pass and grow $text without end (e.g. for a
2691 | 		// document that literally contains "___noise___1000" plus one more
2692 | 		// character).
2693 | 		$offset = 0;
2694 | 
2695 | 		while (($pos = strpos($text, $marker, $offset)) !== false) {
2696 | 			if (strlen($text) < $pos + $key_len) {
2697 | 				// Broken markup: the marker is not followed by a full key.
2698 | 				// Replace the bare marker so it cannot be found again; fewer
2699 | 				// than five characters follow it, so no new marker can form.
2700 | 				$text = substr($text, 0, $pos)
2701 | 				. 'NO NUMERIC NOISE KEY'
2702 | 				. substr($text, $pos + $marker_len);
2703 | 				continue;
2704 | 			}
2705 | 
2706 | 			$key = substr($text, $pos, $key_len);
2707 | 
2708 | 			if (is_object($debug_object)) {
2709 | 				$debug_object->debug_log(2, 'located key of: ' . $key);
2710 | 			}
2711 | 
2712 | 			if (isset($this->noise[$key])) {
2713 | 				// $offset is deliberately left alone: the restored text is
2714 | 				// searched again, so placeholders inside it are expanded too.
2715 | 				// NOTE(review): a noise entry whose text contains its own key
2716 | 				// would still expand forever; that case is not handled here.
2717 | 				$text = substr($text, 0, $pos)
2718 | 				. $this->noise[$key]
2719 | 				. substr($text, $pos + $key_len);
2720 | 			} else {
2721 | 				$notice = 'UNDEFINED NOISE FOR KEY: ' . $key;
2722 | 				$text = substr($text, 0, $pos)
2723 | 				. $notice
2724 | 				. substr($text, $pos + $key_len);
2725 | 
2726 | 				// Resume behind the notice so the echoed key is not matched
2727 | 				// again.
2728 | 				$offset = $pos + strlen($notice);
2729 | 			}
2730 | 		}
2731 | 
2732 | 		return $text;
2733 | 	}
2722 | 
2723 | 	/**
2724 | 	 * Find a stored noise fragment by its content.
2725 | 	 *
2726 | 	 * Sometimes a caller needs one of the noise elements itself rather than
2727 | 	 * the document with the noise restored.
2728 | 	 *
2729 | 	 * @param string $text Substring to look for inside the stored fragments
2730 | 	 * @return string|null The first fragment in {@see simple_html_dom::$noise}
2731 | 	 * that contains $text, or null when none does
2732 | 	 */
2733 | 	function search_noise($text)
2734 | 	{
2735 | 		global $debug_object;
2736 | 		if (is_object($debug_object)) {
2737 | 			$debug_object->debug_log_entry(1);
2738 | 		}
2739 | 
2740 | 		foreach ($this->noise as $fragment) {
2741 | 			if (strpos($fragment, $text) === false) {
2742 | 				continue;
2743 | 			}
2744 | 
2745 | 			return $fragment;
2746 | 		}
2747 | 
2748 | 		return null;
2749 | 	}
2735 | 
2736 | 	/**
2737 | 	 * Convert the document to a string.
2738 | 	 *
2739 | 	 * @return string Whatever innertext() of the root node returns
2740 | 	 */
2741 | 	function __toString()
2742 | 	{
2743 | 		$root = $this->root;
2744 | 
2745 | 		return $root->innertext();
2746 | 	}
2740 | 
2741 | 	/**
2742 | 	 * Magic getter for the read-only virtual properties of the document.
2743 | 	 *
2744 | 	 * Recognised names are outertext, innertext, plaintext, charset and
2745 | 	 * target_charset. Any other name yields null.
2746 | 	 *
2747 | 	 * @param string $name Name of the property being read
2748 | 	 * @return mixed The property value, or null for an unknown name
2749 | 	 */
2750 | 	function __get($name)
2751 | 	{
2752 | 		switch ($name) {
2753 | 			// Both names resolve to the inner text of the root node.
2754 | 			case 'outertext':
2755 | 			case 'innertext':
2756 | 				return $this->root->innertext();
2757 | 			case 'plaintext':
2758 | 				return $this->root->text();
2759 | 			case 'charset':
2760 | 				return $this->_charset;
2761 | 			case 'target_charset':
2762 | 				return $this->_target_charset;
2763 | 			default:
2764 | 				return null;
2765 | 		}
2766 | 	}
2756 | 
2757 | 	// camelCase convenience methods
2758 | 	/**
2759 | 	 * camelCase wrapper around childNodes() of the root node.
2760 | 	 *
2761 | 	 * @param int $idx Passed through unchanged to the root node (default -1)
2762 | 	 * @return mixed Whatever the root node's childNodes() returns
2763 | 	 */
2764 | 	function childNodes($idx = -1)
2765 | 	{
2766 | 		$root = $this->root;
2767 | 
2768 | 		return $root->childNodes($idx);
2769 | 	}
2762 | 
2763 | 	/**
2764 | 	 * camelCase wrapper around first_child() of the root node.
2765 | 	 *
2766 | 	 * @return mixed Whatever the root node's first_child() returns
2767 | 	 */
2768 | 	function firstChild()
2769 | 	{
2770 | 		$root = $this->root;
2771 | 
2772 | 		return $root->first_child();
2773 | 	}
2767 | 
2768 | 	/**
2769 | 	 * camelCase wrapper around last_child() of the root node.
2770 | 	 *
2771 | 	 * @return mixed Whatever the root node's last_child() returns
2772 | 	 */
2773 | 	function lastChild()
2774 | 	{
2775 | 		$root = $this->root;
2776 | 
2777 | 		return $root->last_child();
2778 | 	}
2772 | 
2773 | 	/**
2774 | 	 * Build an element by parsing "<name>value</name>" as a new document.
2775 | 	 *
2776 | 	 * $name and $value are placed into the markup verbatim; no escaping is
2777 | 	 * applied here. Diagnostics raised by the expression are silenced with @.
2778 | 	 *
2779 | 	 * @param string $name Tag name of the element
2780 | 	 * @param string|null $value Content placed between the tags
2781 | 	 * @return mixed first_child() of the document returned by str_get_html()
2782 | 	 */
2783 | 	function createElement($name, $value = null)
2784 | 	{
2785 | 		return @str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->first_child();
2786 | 	}
2777 | 
2778 | 	/**
2779 | 	 * Build a node by parsing $value as a new document.
2780 | 	 *
2781 | 	 * Diagnostics raised by the expression are silenced with @.
2782 | 	 *
2783 | 	 * @param string $value Content handed to str_get_html()
2784 | 	 * @return mixed The last entry (via end()) of the parsed document's
2785 | 	 * nodes list
2786 | 	 */
2787 | 	function createTextNode($value)
2788 | 	{
2789 | 		$last = @end(str_get_html($value)->nodes);
2790 | 
2791 | 		return $last;
2792 | 	}
2782 | 
2783 | 	/**
2784 | 	 * Look up an element by id: calls find() with "#" + $id and index 0.
2785 | 	 *
2786 | 	 * @param string $id Id value, used verbatim in the selector
2787 | 	 * @return mixed Whatever find() returns for that selector and index
2788 | 	 */
2789 | 	function getElementById($id)
2790 | 	{
2791 | 		$selector = '#' . $id;
2792 | 
2793 | 		return $this->find($selector, 0);
2794 | 	}
2787 | 
2788 | 	/**
2789 | 	 * Look up elements by id: calls find() with "#" + $id and the given index.
2790 | 	 *
2791 | 	 * @param string $id Id value, used verbatim in the selector
2792 | 	 * @param int|null $idx Passed through unchanged to find() (default null)
2793 | 	 * @return mixed Whatever find() returns for that selector and index
2794 | 	 */
2795 | 	function getElementsById($id, $idx = null)
2796 | 	{
2797 | 		$selector = '#' . $id;
2798 | 
2799 | 		return $this->find($selector, $idx);
2800 | 	}
2792 | 
2793 | 	/**
2794 | 	 * Look up an element by tag name: calls find() with $name and index 0.
2795 | 	 *
2796 | 	 * @param string $name Selector handed to find() unchanged
2797 | 	 * @return mixed Whatever find() returns for that selector and index
2798 | 	 */
2799 | 	function getElementByTagName($name)
2800 | 	{
2801 | 		$index = 0;
2802 | 
2803 | 		return $this->find($name, $index);
2804 | 	}
2797 | 
2798 | 	/**
2799 | 	 * Look up elements by tag name: calls find() with $name and the given
2800 | 	 * index.
2801 | 	 *
2802 | 	 * NOTE(review): the default index here is -1 while getElementsById()
2803 | 	 * defaults to null; find() is not visible in this excerpt, so confirm
2804 | 	 * that -1 really selects what callers of a plural lookup expect.
2805 | 	 *
2806 | 	 * @param string $name Selector handed to find() unchanged
2807 | 	 * @param int $idx Passed through unchanged to find() (default -1)
2808 | 	 * @return mixed Whatever find() returns for that selector and index
2809 | 	 */
2810 | 	function getElementsByTagName($name, $idx = -1)
2811 | 	{
2812 | 		$found = $this->find($name, $idx);
2813 | 
2814 | 		return $found;
2815 | 	}
2802 | 
2803 | 	/**
2804 | 	 * camelCase alias of load_file().
2805 | 	 *
2806 | 	 * Accepts any number of arguments and forwards each one positionally.
2807 | 	 * They are spread with call_user_func_array() rather than handed over as
2808 | 	 * one array, which would make load_file() receive a single array
2809 | 	 * parameter instead of the values the caller supplied.
2810 | 	 *
2811 | 	 * NOTE(review): load_file() is defined outside this excerpt; confirm that
2812 | 	 * it expects the individual arguments (presumably a file name or URL
2813 | 	 * first).
2814 | 	 */
2815 | 	function loadFile()
2816 | 	{
2817 | 		$args = func_get_args();
2818 | 		call_user_func_array(array($this, 'load_file'), $args);
2819 | 	}
2808 | }
2809 | 


--------------------------------------------------------------------------------
/tools/resolve_url.php:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env php
 2 | <?php
 3 | /*******************************************************************************
 4 | * Copyright (c) 2011-2019 by Matt Wright
 5 | * https://github.com/mattwright/URLResolver.php
 6 | *
 7 | * Licensed under The MIT License
 8 | * See URLResolver.php for full license text
 9 | *******************************************************************************/
10 | require_once(__DIR__.'/../URLResolver.php');
11 | 
12 | // Exactly one argument is expected and it must start with http:// or
13 | // https:// (checked case-insensitively).
14 | if (count($argv) != 2 || !preg_match('/^https?:\/\//i', $argv[1])) {
15 | 	// Report the usage error on STDERR and exit with a non-zero status so
16 | 	// that shell callers can tell the failure apart from a successful run.
17 | 	fwrite(STDERR, "You must supply a URL:\n  ./resolve_url.php http://goo.gl/0GMP1\n");
18 | 	exit(1);
19 | }
20 | 
21 | $resolver = new mattwright\URLResolver();
22 | $resolver->isDebugMode(true);
23 | $resolver->setUserAgent('Mozilla/5.0 (compatible; URLResolver.php/1.0; +https://github.com/mattwright/URLResolver.php)');
24 | // NOTE(review): the return value of resolveURL() is not used here; any
25 | // output presumably comes from the debug mode enabled above - confirm.
26 | $resolver->resolveURL($argv[1]);
21 | 


--------------------------------------------------------------------------------