├── .gitignore ├── phpunit ├── bootstrap.php ├── test.html └── HtmlTest.php ├── .travis.yml ├── composer.json ├── test.php ├── LICENSE ├── phpunit.xml.dist ├── README.md └── src └── ParserDom.php /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /vendor 3 | 4 | composer.lock 5 | -------------------------------------------------------------------------------- /phpunit/bootstrap.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | test 6 | 7 | 8 |

p1

9 |

p2

10 |

p3

11 |

p_id

12 |

p_id_2

13 |

p4

14 |
测试1
15 | 16 | 17 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bupt1987/html-parser", 3 | "license": "MIT", 4 | "description": "Html Parser", 5 | "homepage": "https://github.com/bupt1987/html-parser", 6 | "authors": [ 7 | { 8 | "name": "俊杰jerry", 9 | "email": "bupt1987@gmail.com", 10 | "homepage": "http://bupt1987.github.io", 11 | "role": "Developer" 12 | } 13 | ], 14 | "require": { 15 | "php": ">=5.5" 16 | }, 17 | "require-dev": { 18 | "phpunit/phpunit": "^4.8" 19 | }, 20 | "autoload": { 21 | "psr-4": { 22 | "HtmlParser\\": "src/" 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test.php: -------------------------------------------------------------------------------- 1 | find('ul.uni-blk-list02', 0); 22 | $oDom->find('a'); 23 | $oDom->find('ul'); 24 | $oDom->find('p'); 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 俊杰Jerry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /phpunit/HtmlTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('p4', $oDom->find('p', -1)->getPlainText()); 9 | $this->assertEquals('p_id', $oDom->find('p[id]', 0)->getPlainText()); 10 | $this->assertEquals('p_id_2', $oDom->find('p[id=p_id_2]', 0)->getPlainText()); 11 | $this->assertEquals('p2', $oDom->find('p[!id]', 1)->getPlainText()); 12 | $this->assertEquals('测试1', $oDom->find('#test1', 0)->getPlainText()); 13 | 14 | $oPClass = $oDom->find('p.test_class1', 0); 15 | 16 | $this->assertEquals('p1', $oPClass->getPlainText()); 17 | $this->assertEquals('test_class test_class1', $oPClass->getAttr('class')); 18 | 19 | $lCheck = array( 20 | 'p1', 21 | 'p2', 22 | 'p3', 23 | 'p_id', 24 | 'p_id_2', 25 | ); 26 | $lPTag = $oDom->find('p.test_class'); 27 | $this->assertEquals(5, count($lPTag)); 28 | $lPText = array(); 29 | foreach ($lPTag as $oPTag) { 30 | $lPText[] = $oPTag->getPlainText(); 31 | } 32 | $this->assertEquals($lCheck, $lPText); 33 | 34 | $this->assertEquals($oDom->node instanceof \DOMNode, true); 35 | 36 | } 37 | 38 | private static function getHtml() { 39 | 
static $sHtml; 40 | if ($sHtml === null) { 41 | $sHtml = file_get_contents(__DIR__ . '/test.html'); 42 | } 43 | return $sHtml; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HtmlParser 2 | =============== 3 | [![Total Downloads](https://img.shields.io/badge/downloads-9.4k-green.svg)](https://packagist.org/packages/bupt1987/html-parser) 4 | [![Build Status](https://api.travis-ci.org/bupt1987/html-parser.svg)](https://travis-ci.org/bupt1987/html-parser) 5 | 6 | php html解析工具,类似于PHP Simple HTML DOM Parser。 7 | 由于基于php模块dom,所以解析html的速度比 PHP Simple HTML DOM Parser 快好几倍。 8 | 9 | 10 | 注意:html代码必须是utf-8编码字符,如果不是请转成utf-8 11 | 如果遇到乱码问题,请参考:http://www.fwolf.com/blog/post/314 12 | 13 | 现在支持composer 14 | 15 | "require": {"bupt1987/html-parser": "dev-master"} 16 | 17 | 加载composer 18 | require 'vendor/autoload.php'; 19 | 20 | ================================================================================ 21 | ##### *Example* 22 | ~~~ 23 | 27 | 28 | 29 | test 30 | 31 | 32 |

p1

33 |

p2

34 |

p3

35 |
测试1
36 | 37 | '; 38 | $html_dom = new \HtmlParser\ParserDom($html); 39 | $p_array = $html_dom->find('p.test_class'); 40 | $p1 = $html_dom->find('p.test_class1',0); 41 | $div = $html_dom->find('div#test1',0); 42 | foreach ($p_array as $p){ 43 | echo $p->getPlainText() . "\n"; 44 | } 45 | echo $div->getPlainText() . "\n"; 46 | echo $p1->getPlainText() . "\n"; 47 | echo $p1->getAttr('class') . "\n"; 48 | echo "show html:\n"; 49 | echo $div->innerHtml() . "\n"; 50 | echo $div->outerHtml() . "\n"; 51 | ?> 52 | ~~~ 53 | 54 | 基础用法 55 | ================================================================================ 56 | ~~~ 57 | // 查找所有a标签 58 | $ret = $html->find('a'); 59 | 60 | // 查找a标签的第一个元素 61 | $ret = $html->find('a', 0); 62 | 63 | // 查找a标签的倒数第一个元素 64 | $ret = $html->find('a', -1); 65 | 66 | // 查找所有含有id属性的div标签 67 | $ret = $html->find('div[id]'); 68 | 69 | // 查找所有id属性为foo的div标签 70 | $ret = $html->find('div[id=foo]'); 71 | ~~~ 72 | 73 | 高级用法 74 | ================================================================================ 75 | ~~~ 76 | // 查找所有id=foo的元素 77 | $ret = $html->find('#foo'); 78 | 79 | // 查找所有class=foo的元素 80 | $ret = $html->find('.foo'); 81 | 82 | // 查找所有拥有 id属性的元素 83 | $ret = $html->find('*[id]'); 84 | 85 | // 查找所有 anchors 和 images标记 86 | $ret = $html->find('a, img'); 87 | 88 | // 查找所有有"title"属性的 anchors 和 images标记 89 | $ret = $html->find('a[title], img[title]'); 90 | ~~~ 91 | 92 | 层级选择器 93 | ================================================================================ 94 | ~~~ 95 | // Find all
  • in