├── .gitattributes ├── .gitignore ├── .scrutinizer.yml ├── composer.json ├── phpunit.xml.dist └── src └── PHPHtmlParser ├── Content.php ├── Curl.php ├── CurlInterface.php ├── Dom.php ├── Dom ├── AbstractNode.php ├── ArrayNode.php ├── Collection.php ├── HtmlNode.php ├── InnerNode.php ├── LeafNode.php ├── MockNode.php ├── Tag.php └── TextNode.php ├── Exceptions ├── ChildNotFoundException.php ├── CircularException.php ├── CurlException.php ├── EmptyCollectionException.php ├── NotLoadedException.php ├── ParentNotFoundException.php ├── StrictException.php └── UnknownChildTypeException.php ├── Options.php ├── Selector.php └── StaticDom.php /.gitattributes: -------------------------------------------------------------------------------- 1 | /tests export-ignore 2 | /.scrutinizer.yml export-ignore 3 | /.travis.yml export-ignore 4 | /CHANGELOG.md export-ignore 5 | /CONTRIBUTING.md export-ignore 6 | /LICENSE.md export-ignore 7 | /README.md export-ignore 8 | /phpunit.php export-ignore 9 | /phpunit.xml export-ignore 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | composer.lock 2 | vendor/ 3 | phpunit.xml 4 | -------------------------------------------------------------------------------- /.scrutinizer.yml: -------------------------------------------------------------------------------- 1 | filter: 2 | paths: [src/*] 3 | excluded_paths: [tests/*] 4 | checks: 5 | php: 6 | code_rating: true 7 | remove_extra_empty_lines: true 8 | remove_php_closing_tag: true 9 | remove_trailing_whitespace: true 10 | fix_use_statements: 11 | remove_unused: true 12 | preserve_multiple: false 13 | preserve_blanklines: true 14 | order_alphabetically: true 15 | fix_php_opening_tag: true 16 | fix_linefeed: true 17 | fix_line_ending: true 18 | fix_identation_4spaces: true 19 | fix_doc_comments: true 20 | tools: 21 | external_code_coverage: 22 | timeout: 600 23 | runs: 3 
24 | php_code_coverage: false 25 | php_code_sniffer: 26 | config: 27 | standard: PSR2 28 | filter: 29 | paths: ['src'] 30 | php_loc: 31 | enabled: true 32 | excluded_dirs: [vendor, test] 33 | php_cpd: 34 | enabled: true 35 | excluded_dirs: [vendor, test] 36 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "thesoftwarefanatics/php-html-parser", 3 | "type": "library", 4 | "description": "An HTML DOM parser. It allows you to manipulate HTML. Find tags on an HTML page with selectors just like jQuery.", 5 | "keywords": ["html", "dom", "parser"], 6 | "homepage": "https://github.com/thesoftwarefanatics/php-html-parser", 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "Gilles Paquette", 11 | "email": "paquettg@gmail.com", 12 | "homepage": "http://gillespaquette.ca" 13 | }, 14 | { 15 | "name": "The Software Fanatics GmbH", 16 | "email": "dev@thesoftwarefanatics.com", 17 | "homepage": "https://thesoftwarefanatics.com" 18 | } 19 | ], 20 | "require": { 21 | "php": "^7.1", 22 | "paquettg/string-encode": "^0.1.1" 23 | }, 24 | "require-dev": { 25 | "phpunit/phpunit": "^6.5", 26 | "php-coveralls/php-coveralls": "^2.0", 27 | "mockery/mockery": "^1.0" 28 | }, 29 | "replace": { 30 | "paquettg/php-html-parser": "self.version" 31 | }, 32 | "autoload": { 33 | "psr-0": { 34 | "PHPHtmlParser": "src/" 35 | } 36 | }, 37 | "extra": { 38 | "branch-alias": { 39 | "dev-master": "1.8.x-dev" 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 13 | 14 | 15 | ./tests/ 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/PHPHtmlParser/Content.php: -------------------------------------------------------------------------------- 1 | 
'; 40 | protected $slash = " />\r\n\t"; 41 | protected $attr = ' >'; 42 | 43 | /** 44 | * Content constructor. Stores the content, records its byte length and starts at position 0. 45 | * 46 | * @param string $content 47 | */ 48 | public function __construct($content) 49 | { 50 | $this->content = $content; 51 | $this->size = strlen($content); 52 | $this->pos = 0; 53 | } 54 | 55 | /** 56 | * Returns the current position of the content. 57 | * 58 | * @return int 59 | */ 60 | public function getPosition() 61 | { 62 | return $this->pos; 63 | } 64 | 65 | /** 66 | * Gets the character at the given position, or at the current position when none is given. 67 | * 68 | * @param int|null $char position to read; null means the current position 69 | * @return string the character, or '' when no character exists at that position 70 | */ 71 | public function char($char = null) 72 | { 73 | $pos = $this->pos; 74 | if ( ! is_null($char)) { 75 | $pos = $char; 76 | } 77 | 78 | if ( ! isset($this->content[$pos])) { 79 | return ''; 80 | } 81 | 82 | return $this->content[$pos]; 83 | } 84 | 85 | /** 86 | * Moves the current position forward. 87 | * 88 | * @param int $count 89 | * @return $this 90 | */ 91 | public function fastForward($count) 92 | { 93 | $this->pos += $count; 94 | 95 | return $this; 96 | } 97 | 98 | /** 99 | * Moves the current position backward, stopping at 0. 100 | * 101 | * @param int $count 102 | * @return $this 103 | */ 104 | public function rewind($count) 105 | { 106 | $this->pos -= $count; 107 | if ($this->pos < 0) { 108 | $this->pos = 0; 109 | } 110 | 111 | return $this; 112 | } 113 | 114 | /** 115 | * Copies the content from the current position up to (not including) the next match of the given string and moves the position there. With $escape, matches preceded by a backslash are skipped and $char is ignored; otherwise, with $char, $string is treated as a set of characters. Without a match the rest of the content is returned. 116 | * 117 | * @param string $string 118 | * @param bool $char 119 | * @param bool $escape 120 | * @return string 121 | */ 122 | public function copyUntil($string, $char = false, $escape = false) 123 | { 124 | if ($this->pos >= $this->size) { 125 | // nothing left 126 | return ''; 127 | } 128 | 129 | if ($escape) { 130 | $position = $this->pos; 131 | $found = false; 132 | while ( ! 
$found) { 133 | $position = strpos($this->content, $string, $position); 134 | if ($position === false) { 135 | // reached the end 136 | $found = true; 137 | continue; 138 | } 139 | 140 | if ($position > 0 && $this->char($position - 1) == '\\') { 141 | // this character is escaped; position 0 is excluded because offset -1 would read the last character of the content 142 | ++$position; 143 | continue; 144 | } 145 | 146 | $found = true; 147 | } 148 | } elseif ($char) { 149 | $position = strcspn($this->content, $string, $this->pos); 150 | $position += $this->pos; 151 | } else { 152 | $position = strpos($this->content, $string, $this->pos); 153 | } 154 | 155 | if ($position === false) { 156 | // could not find character, just return the remaining of the content 157 | $return = substr($this->content, $this->pos, $this->size - $this->pos); 158 | $this->pos = $this->size; 159 | 160 | return $return; 161 | } 162 | 163 | if ($position == $this->pos) { 164 | // we are at the right place 165 | return ''; 166 | } 167 | 168 | $return = substr($this->content, $this->pos, $position - $this->pos); 169 | // set the new position 170 | $this->pos = $position; 171 | 172 | return $return; 173 | } 174 | 175 | /** 176 | * Copies the content until the string is found and return it 177 | * unless the 'unless' is found in the substring. 
178 | * 179 | * @param string $string passed to copyUntil() with the $char and $escape flags set 180 | * @param string $unless characters that must not occur in the copied text 181 | * @return string $string followed by the copied text, or '' with the position restored when a character from $unless occurs in it 182 | */ 183 | public function copyUntilUnless($string, $unless) 184 | { 185 | $lastPos = $this->pos; 186 | $this->fastForward(1); 187 | $foundString = $this->copyUntil($string, true, true); 188 | 189 | $position = strcspn($foundString, $unless); 190 | if ($position == strlen($foundString)) { 191 | return $string.$foundString; 192 | } 193 | // rewind changes and return nothing 194 | $this->pos = $lastPos; 195 | 196 | return ''; 197 | } 198 | 199 | /** 200 | * Copies the content until it reaches the string stored in the property named by the token. 201 | * 202 | * @param string $token name of the property of this object that holds the string 203 | * @param bool $char 204 | * @param bool $escape 205 | * @return string 206 | * @uses $this->copyUntil() 207 | */ 208 | public function copyByToken($token, $char = false, $escape = false) 209 | { 210 | $string = $this->$token; 211 | 212 | return $this->copyUntil($string, $char, $escape); 213 | } 214 | 215 | /** 216 | * Skips over the run of characters from the given set that starts at the current position. 217 | * 218 | * @param string $string 219 | * @param bool $copy 220 | * @return $this|string the skipped characters when $copy is set, otherwise this object 221 | */ 222 | public function skip($string, $copy = false) 223 | { 224 | $len = strspn($this->content, $string, $this->pos); 225 | 226 | // make it chainable if they don't want a copy 227 | $return = $this; 228 | if ($copy) { 229 | $return = substr($this->content, $this->pos, $len); 230 | } 231 | 232 | // update the position 233 | $this->pos += $len; 234 | 235 | return $return; 236 | } 237 | 238 | /** 239 | * Skip a given token of pre-defined characters. 
240 | * 241 | * @param string $token name of the property of this object that holds the characters 242 | * @param bool $copy 243 | * @return $this|string 244 | * @uses $this->skip() 245 | */ 246 | public function skipByToken($token, $copy = false) 247 | { 248 | $string = $this->$token; 249 | 250 | return $this->skip($string, $copy); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/PHPHtmlParser/Curl.php: -------------------------------------------------------------------------------- 1 | root->innerHtml(); 101 | } 102 | 103 | /** 104 | * A simple wrapper around the root node: reads the property of the same name from it. 105 | * 106 | * @param string $name 107 | * @return mixed 108 | */ 109 | public function __get($name) 110 | { 111 | return $this->root->$name; 112 | } 113 | 114 | /** 115 | * Attempts to load the dom from any resource, string, file, or URL. A string without a line break that names an existing file is loaded as a file, a string starting with http:// or https:// is loaded as a url, anything else is parsed directly. 116 | * 117 | * @param string $str 118 | * @param array $options 119 | * @return $this 120 | */ 121 | public function load($str, $options = []) 122 | { 123 | // check if it's a file 124 | if (strpos($str, "\n") === false && is_file($str)) { 125 | return $this->loadFromFile($str, $options); 126 | } 127 | // check if it's a url 128 | if (preg_match("/^https?:\/\//i", $str)) { 129 | return $this->loadFromUrl($str, $options); 130 | } 131 | 132 | return $this->loadStr($str, $options); 133 | } 134 | 135 | /** 136 | * Loads the dom from a document file/url by reading it with file_get_contents(). 137 | * 138 | * @param string $file 139 | * @param array $options 140 | * @return $this 141 | */ 142 | public function loadFromFile($file, $options = []) 143 | { 144 | return $this->loadStr(file_get_contents($file), $options); 145 | } 146 | 147 | /** 148 | * Use a curl interface implementation to attempt to load 149 | * the content from a url. 
150 | * 151 | * @param string $url 152 | * @param array $options 153 | * @param CurlInterface|null $curl implementation used for the request; a new Curl is created when null 154 | * @return $this 155 | */ 156 | public function loadFromUrl($url, $options = [], CurlInterface $curl = null) 157 | { 158 | if (is_null($curl)) { 159 | // use the default curl interface 160 | $curl = new Curl; 161 | } 162 | $content = $curl->get($url); 163 | 164 | return $this->loadStr($content, $options); 165 | } 166 | 167 | /** 168 | * Parses the html of the given string. Used by load(), loadFromFile(), 169 | * and loadFromUrl(). Keeps the raw string and its length, then cleans the string, wraps the result in a Content object, parses it and detects the charset. 170 | * 171 | * @param string $str 172 | * @param array $option options for this call, set after the global options; defaults to none 173 | * @return $this 174 | */ 175 | public function loadStr($str, $option = []) 176 | { 177 | $this->options = new Options; 178 | $this->options->setOptions($this->globalOptions) 179 | ->setOptions($option); 180 | 181 | $this->rawSize = strlen($str); 182 | $this->raw = $str; 183 | 184 | $html = $this->clean($str); 185 | 186 | $this->size = strlen($html); // length of the cleaned html; rawSize already holds the length of the raw input 187 | $this->content = new Content($html); 188 | 189 | $this->parse(); 190 | $this->detectCharset(); 191 | 192 | return $this; 193 | } 194 | 195 | /** 196 | * Sets a global options array to be used by all load calls. 197 | * 198 | * @param array $options 199 | * @return $this 200 | */ 201 | public function setOptions(array $options) 202 | { 203 | $this->globalOptions = $options; 204 | 205 | return $this; 206 | } 207 | 208 | /** 209 | * Find elements by css selector on the root node. 210 | * 211 | * @param string $selector 212 | * @param int $nth 213 | * @return array 214 | */ 215 | public function find($selector, $nth = null) 216 | { 217 | $this->isLoaded(); 218 | 219 | return $this->root->find($selector, $nth); 220 | } 221 | 222 | /** 223 | * Adds the tag (or tags in an array) to the list of tags that will always 224 | * be self closing. 225 | * 226 | * @param string|array $tag 227 | * @return $this 228 | */ 229 | public function addSelfClosingTag($tag) 230 | { 231 | if ( ! 
is_array($tag)) { 232 | $tag = [$tag]; 233 | } 234 | foreach ($tag as $value) { 235 | $this->selfClosing[] = $value; 236 | } 237 | 238 | return $this; 239 | } 240 | 241 | /** 242 | * Removes the tag (or tags in an array) from the list of tags that will 243 | * always be self closing. The remaining entries keep their array keys (array_diff). 244 | * 245 | * @param string|array $tag 246 | * @return $this 247 | */ 248 | public function removeSelfClosingTag($tag) 249 | { 250 | if ( ! is_array($tag)) { 251 | $tag = [$tag]; 252 | } 253 | $this->selfClosing = array_diff($this->selfClosing, $tag); 254 | 255 | return $this; 256 | } 257 | 258 | /** 259 | * Sets the list of self closing tags to empty. 260 | * 261 | * @return $this 262 | */ 263 | public function clearSelfClosingTags() 264 | { 265 | $this->selfClosing = []; 266 | 267 | return $this; 268 | } 269 | 270 | /** 271 | * Simple wrapper function that returns the first child of the root node. 272 | * 273 | * @return \PHPHtmlParser\Dom\AbstractNode 274 | */ 275 | public function firstChild() 276 | { 277 | $this->isLoaded(); 278 | 279 | return $this->root->firstChild(); 280 | } 281 | 282 | /** 283 | * Simple wrapper function that returns the last child of the root node. 284 | * 285 | * @return \PHPHtmlParser\Dom\AbstractNode 286 | */ 287 | public function lastChild() 288 | { 289 | $this->isLoaded(); 290 | 291 | return $this->root->lastChild(); 292 | } 293 | 294 | /** 295 | * Simple wrapper function that returns an element by the 296 | * id, using find() with the selector '#' followed by the id and index 0. 297 | * 298 | * @param string $id 299 | * @return \PHPHtmlParser\Dom\AbstractNode 300 | */ 301 | public function getElementById($id) 302 | { 303 | $this->isLoaded(); 304 | 305 | return $this->find('#'.$id, 0); 306 | } 307 | 308 | /** 309 | * Simple wrapper function that returns all elements by 310 | * tag name. 
311 | * 312 | * @param string $name tag name, passed to find() unchanged as the selector 313 | * @return array 314 | */ 315 | public function getElementsByTag($name) 316 | { 317 | $this->isLoaded(); 318 | 319 | return $this->find($name); 320 | } 321 | 322 | /** 323 | * Simple wrapper function that returns all elements by 324 | * class name. 325 | * 326 | * @param string $class class name; find() is called with '.' prepended to it 327 | * @return array 328 | */ 329 | public function getElementsByClass($class) 330 | { 331 | $this->isLoaded(); 332 | 333 | return $this->find('.'.$class); 334 | } 335 | 336 | /** 337 | * Checks if the load methods have been called, i.e. that the content is no longer null. 338 | * 339 | * @throws NotLoadedException 340 | */ 341 | protected function isLoaded() 342 | { 343 | if (is_null($this->content)) { 344 | throw new NotLoadedException('Content is not loaded!'); 345 | } 346 | } 347 | 348 | /** 349 | * Cleans the html of any none-html information. 350 | * 351 | * @param string $str 352 | * @return string 353 | */ 354 | protected function clean($str) 355 | { 356 | if ($this->options->get('cleanupInput') != true) { 357 | // skip entire cleanup step 358 | return $str; 359 | } 360 | 361 | // remove white space before closing tags 362 | $str = preg_replace("#'\s+>#i", "'>", $str); 363 | $str = preg_replace('#"\s+>#i', '">', $str); 364 | 365 | // clean out the \n\r 366 | $replace = ' '; 367 | if ($this->options->get('preserveLineBreaks')) { 368 | $replace = ' '; 369 | } 370 | $str = str_replace(["\r\n", "\r", "\n"], $replace, $str); 371 | 372 | // strip the doctype 373 | $str = preg_replace("##i", '', $str); 374 | 375 | // strip out comments 376 | $str = preg_replace("##i", '', $str); 377 | 378 | // strip out cdata 379 | $str = preg_replace("##i", '', $str); 380 | 381 | // strip out