├── .gitignore ├── .travis.yml ├── AthosHun └── HTMLFilter │ ├── Configuration.php │ ├── ConfigurationTest.php │ ├── HTMLFilter.php │ └── HTMLFilterTest.php ├── LICENSE ├── Makefile ├── README.md ├── build ├── generate_build_docs.sh └── travis-publish.sh ├── composer.json ├── phpunit.xml.dist └── sami.config.php /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | tmp 3 | phpunit.xml 4 | composer.lock 5 | vendor 6 | docs 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - "5.3" 4 | - "5.4" 5 | - "5.5" 6 | - "5.6" 7 | branches: 8 | only: 9 | - master 10 | script: GHTOKEN= make all 11 | after_success: build/travis-publish.sh 12 | env: 13 | global: 14 | secure: "RgJfQTxHt5fu6C7f+R4IgPzMxMRFEQPIWj/qcKM8yCVr3mPWU5iOkGs3g5WHT9VDuBhjIRnv0BWFwXARmtgr55Wb07oxyKLYfRC47KPnWhPw0hP/ypQ4/W4h5LCAiJpAKjubpROJZvdWZfX4RndAZXrvdtXy3PX6Yxcdo8zXdwc=" 15 | -------------------------------------------------------------------------------- /AthosHun/HTMLFilter/Configuration.php: -------------------------------------------------------------------------------- 1 | allowed_tags_with_attributes = array(); 12 | } 13 | 14 | public function allowTag($tag_name) 15 | { 16 | if (!$this->isAllowedTag($tag_name)) { 17 | $this->allowed_tags_with_attributes[(string)$tag_name] = array(); 18 | } 19 | 20 | return $this; 21 | } 22 | 23 | public function allowAttribute( 24 | $tag_name, 25 | $attribute_name, 26 | $attribute_regexp = "/.*/" 27 | ) { 28 | $this->allowTag($tag_name); 29 | 30 | $tag = (string)$tag_name; 31 | $attr = (string)$attribute_name; 32 | $this->allowed_tags_with_attributes[$tag][$attr] = $attribute_regexp; 33 | 34 | return $this; 35 | } 36 | 37 | public function isAllowedTag($tag_name) 38 | { 39 | return array_key_exists((string)$tag_name, 40 | 
$this->allowed_tags_with_attributes); 41 | } 42 | 43 | public function isAllowedAttribute( 44 | $tag_name, 45 | $attribute_name, 46 | $attribute_value 47 | ) { 48 | if (!$this->isAllowedTag($tag_name)) { 49 | return false; 50 | } 51 | 52 | $tag = (string)$tag_name; 53 | $attr = (string)$attribute_name; 54 | 55 | $allowed_attributes = $this->allowed_tags_with_attributes[$tag]; 56 | if (!array_key_exists($attr, $allowed_attributes)) { 57 | return false; 58 | } 59 | 60 | $restriction = $allowed_attributes[$attr]; 61 | 62 | return 1 === preg_match($restriction, $attribute_value); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /AthosHun/HTMLFilter/ConfigurationTest.php: -------------------------------------------------------------------------------- 1 | filter_config = new Configuration(); 12 | } 13 | 14 | public function testTagsCanBeAllowedOrDisallowed() 15 | { 16 | $this->filter_config->allowTag("img"); 17 | 18 | $this->assertTagAllowed("img"); 19 | 20 | $this->assertTagDisallowed("script"); 21 | } 22 | 23 | private function assertTagAllowed($tag_name) 24 | { 25 | $this->assertTrue($this->filter_config->isAllowedTag($tag_name)); 26 | } 27 | 28 | private function assertTagDisallowed($tag_name) 29 | { 30 | $this->assertFalse($this->filter_config->isAllowedTag($tag_name)); 31 | } 32 | 33 | public function testAttributesCanBeRestrictedWithRegularExpression() 34 | { 35 | $this->filter_config->allowAttribute("img", "src", "/^hello\$/") ; 36 | 37 | $this->assertAttributeAllowed("img", "src", "hello"); 38 | 39 | $this->assertAttributeDisallowed("img", "src", "world"); 40 | $this->assertAttributeDisallowed("img", "alt", "hello"); 41 | $this->assertAttributeDisallowed("script", "src", "hello"); 42 | } 43 | 44 | private function assertAttributeAllowed($tag_name, $attr_name, $attr_value) 45 | { 46 | $this->assertTrue( 47 | $this->filter_config->isAllowedAttribute( 48 | $tag_name, 49 | $attr_name, 50 | $attr_value 51 | ) 52 | 
); 53 | } 54 | 55 | private function assertAttributeDisallowed( 56 | $tag_name, 57 | $attr_name, 58 | $attr_value 59 | ) { 60 | $this->assertFalse( 61 | $this->filter_config->isAllowedAttribute( 62 | $tag_name, 63 | $attr_name, 64 | $attr_value 65 | ) 66 | ); 67 | } 68 | 69 | public function testAllowedAttributeImpliesAllowedTag() 70 | { 71 | $this->filter_config->allowAttribute("img", "src") ; 72 | 73 | $this->assertTagAllowed("img"); 74 | } 75 | 76 | public function testAllowingAnAlreadyAllowedTagKeepsAttributeRestrictions() 77 | { 78 | $this->filter_config->allowAttribute("img", "src", "/^hello\$/") 79 | ->allowTag("img"); 80 | 81 | $this->assertAttributeAllowed("img", "src", "hello"); 82 | 83 | $this->assertAttributeDisallowed("img", "src", "world"); 84 | } 85 | 86 | public function testMultipleTagsCanBeAllowed() 87 | { 88 | $this->filter_config->allowTag("pre") 89 | ->allowTag("blockquote") 90 | ->allowTag("strong"); 91 | 92 | $this->assertTagAllowed("pre"); 93 | $this->assertTagAllowed("blockquote"); 94 | $this->assertTagAllowed("strong"); 95 | 96 | $this->assertTagDisallowed("table"); 97 | } 98 | 99 | public function testMultipleAttributesCanBeAllowed() 100 | { 101 | $this->filter_config->allowAttribute("a", "href", "/^hello\$/") 102 | ->allowAttribute("a", "title", "/^lorem\$/"); 103 | 104 | $this->assertAttributeAllowed("a", "href", "hello"); 105 | $this->assertAttributeAllowed("a", "title", "lorem"); 106 | 107 | $this->assertAttributeDisallowed("a", "title", "hello"); 108 | $this->assertAttributeDisallowed("a", "href", "lorem"); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /AthosHun/HTMLFilter/HTMLFilter.php: -------------------------------------------------------------------------------- 1 | libxml_used_internal_errors = libxml_use_internal_errors(true); 20 | 21 | try { 22 | $this->initialize($config, $html_text); 23 | $this->copyAllowedNodes(); 24 | $filtered_html = $this->fetchFilteredHTML(); 25 
| } catch (\Exception $error) { 26 | $this->cleanup(); 27 | throw $error; 28 | } 29 | 30 | $this->cleanup(); 31 | 32 | return $filtered_html; 33 | } 34 | 35 | private function initialize(Configuration $config, $html_text) 36 | { 37 | $this->config = $config; 38 | $html_text = mb_convert_encoding($html_text, "UTF-8", "UTF-8"); 39 | $this->original_dom = $this->createDOMDocument($html_text); 40 | $this->filtered_dom = $this->createDOMDocument(""); 41 | } 42 | 43 | private function createDOMDocument($html_text) 44 | { 45 | $dom_document = new \DOMDocument("1.0", "UTF-8"); 46 | $dom_document->loadHTML( 47 | "
$html_text" 48 | ); 49 | 50 | return $dom_document; 51 | } 52 | 53 | private function copyAllowedNodes() 54 | { 55 | $original_body = $this->findBodyNode($this->original_dom); 56 | $filtered_body = $this->findBodyNode($this->filtered_dom); 57 | $this->copyAllowedChildNodes($original_body, $filtered_body); 58 | } 59 | 60 | private function findBodyNode(\DOMDocument $dom_document) 61 | { 62 | return $dom_document->getElementsByTagName("body")->item(0); 63 | } 64 | 65 | private function copyAllowedChildNodes( 66 | \DOMNode $source, 67 | \DOMNode $destination 68 | ) { 69 | if (!$source->hasChildNodes()) { 70 | return; 71 | } 72 | 73 | for ($i = 0, $l = $source->childNodes->length; $i != $l; ++$i) { 74 | $node = $source->childNodes->item($i); 75 | 76 | if ($this->isAllowedNode($node)) { 77 | $this->copyNode($node, $destination); 78 | } else { 79 | $this->copyAllowedChildNodes($node, $destination); 80 | } 81 | } 82 | } 83 | 84 | private function isAllowedNode(\DOMNode $node) 85 | { 86 | return ($node instanceof \DOMText) 87 | || ($node instanceof \DOMComment) 88 | || ($this->config->isAllowedTag($node->nodeName)); 89 | } 90 | 91 | private function copyNode(\DOMNode $node, \DOMNode $destination) 92 | { 93 | if ($node instanceof \DOMText) { 94 | $this->copyTextNode($node, $destination); 95 | } else if ($node instanceof \DOMElement) { 96 | $this->copyDOMElement($node, $destination); 97 | } else if ($node instanceof \DOMComment) { 98 | $this->copyDOMComment($node, $destination); 99 | } 100 | } 101 | 102 | private function copyTextNode(\DOMText $text_node, \DOMNode $destination) 103 | { 104 | $destination->appendChild( 105 | $destination->ownerDocument->createTextNode($text_node->data) 106 | ); 107 | } 108 | 109 | private function copyDOMElement( 110 | \DOMElement $element, 111 | \DOMNode $destination 112 | ) { 113 | $copied_element = $destination->ownerDocument 114 | ->createElement($element->nodeName); 115 | $destination->appendChild($copied_element); 116 | 
$this->copyAllowedAttributes($element, $copied_element); 117 | $this->copyAllowedChildNodes($element, $copied_element); 118 | } 119 | 120 | private function copyAllowedAttributes( 121 | \DOMElement $source, 122 | \DOMElement $destination 123 | ) { 124 | for ($i = 0, $l = $source->attributes->length; $i != $l; ++$i) { 125 | $attribute = $source->attributes->item($i); 126 | 127 | if ($this->isAllowedAttribute($attribute)) { 128 | $this->copyAttribute($attribute, $destination); 129 | } 130 | } 131 | } 132 | 133 | private function isAllowedAttribute(\DOMAttr $attribute) 134 | { 135 | return $this->config->isAllowedAttribute( 136 | $attribute->ownerElement->nodeName, 137 | $attribute->name, 138 | $attribute->value 139 | ); 140 | } 141 | 142 | private function copyAttribute( 143 | \DOMAttr $attribute, 144 | \DOMElement $destination 145 | ) { 146 | $copied_attribute = $destination->ownerDocument 147 | ->createAttribute($attribute->name); 148 | $copied_attribute->value = htmlspecialchars( 149 | $attribute->value, 150 | ENT_QUOTES, 151 | "UTF-8" 152 | ); 153 | $destination->appendChild($copied_attribute); 154 | } 155 | 156 | private function copyDOMComment(\DOMComment $comment, \DOMNode $destination) 157 | { 158 | $destination->appendChild( 159 | $destination->ownerDocument->createComment($comment->data) 160 | ); 161 | } 162 | 163 | private function fetchFilteredHTML() 164 | { 165 | $filtered_html = $this->filtered_dom->saveXML( 166 | $this->findBodyNode($this->filtered_dom) 167 | ); 168 | 169 | return $this->trimBodyTags($filtered_html); 170 | } 171 | 172 | private function trimBodyTags($html_text) 173 | { 174 | if ($html_text === "") { 179 | $html_text = substr($html_text, 6); 180 | } 181 | 182 | if (substr($html_text, -7, 7) === "") { 183 | $html_text = substr($html_text, 0, strlen($html_text) - 7); 184 | } 185 | 186 | return $html_text; 187 | } 188 | 189 | private function cleanup() 190 | { 191 | $this->config = null; 192 | $this->original_dom = null; 193 | 
$this->filtered_dom = null; 194 | libxml_use_internal_errors($this->libxml_used_internal_errors); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /AthosHun/HTMLFilter/HTMLFilterTest.php: -------------------------------------------------------------------------------- 1 | filter_config = new Configuration(); 14 | $this->filter = new HTMLFilter(); 15 | $this->mbstring_substitute_character = ini_get( 16 | "mbstring.substitute_character" 17 | ); 18 | ini_set("mbstring.substitute_character", "none"); 19 | } 20 | 21 | public function tearDown() 22 | { 23 | ini_set( 24 | "mbstring.substitute_character", 25 | $this->mbstring_substitute_character 26 | ); 27 | } 28 | 29 | public function testPlainTextWithoutAnyHtmlRemainsUnchanged() 30 | { 31 | $this->assertFilteredHTML("", ""); 32 | $this->assertFilteredHTML("hello", "hello"); 33 | } 34 | 35 | private function assertFilteredHTML($expected_html, $html) 36 | { 37 | $this->assertSame( 38 | $expected_html, 39 | $this->filter->filter($this->filter_config, $html) 40 | ); 41 | } 42 | 43 | public function testDisallowedTagsAreRemoved() 44 | { 45 | $this->assertFilteredHTML("lorem ipsum", "lorem ipsum"); 46 | } 47 | 48 | public function testAllowedTagsAreKept() 49 | { 50 | $this->filter_config->allowTag("a"); 51 | $html = "lorem ipsum"; 52 | 53 | $this->assertFilteredHTML($html, $html); 54 | } 55 | 56 | public function testDisallowedAttributesAreRemoved() 57 | { 58 | $this->filter_config->allowTag("a"); 59 | 60 | $this->assertFilteredHTML( 61 | "lorem ipsum", 62 | "lorem ipsum" 63 | ); 64 | } 65 | 66 | public function testAllowedAttributesAreKept() 67 | { 68 | $this->filter_config->allowTag("a") 69 | ->allowAttribute("a", "href"); 70 | 71 | $html = "lorem ipsum"; 72 | $this->assertFilteredHTML($html, $html); 73 | } 74 | 75 | public function testAllowedAttributesNotMatchingARegexpAreRemoved() 76 | { 77 | $this->filter_config->allowAttribute("a", "href", "/^hello\$/"); 78 | 79 | 
$this->assertFilteredHTML( 80 | "lorem ipsum", 81 | "lorem ipsum" 82 | ); 83 | } 84 | 85 | public function testHTMLEntitiesAreLeftUnchanged() 86 | { 87 | $this->filter_config->allowAttribute("a", "href"); 88 | 89 | $quoted_special_chars_attr = "<"&>'"; 90 | $quoted_special_chars_text = "<\"&>'"; 91 | $quoted_html = "" 92 | . $quoted_special_chars_text 93 | . ""; 94 | 95 | $this->assertFilteredHTML($quoted_html, $quoted_html); 96 | } 97 | 98 | public function testNodesAreCopiedRecursively() 99 | { 100 | $this->filter_config->allowTag("p") 101 | ->allowTag("b") 102 | ->allowAttribute("a", "href", "/^hello\$/"); 103 | 104 | $this->assertFilteredHTML( 105 | "Loremipsum dolor sit amet
", 106 | "Loremipsum dolor sit amet
" 107 | ); 108 | } 109 | 110 | public function testHtmlCommentsArePreserved() 111 | { 112 | $html = "Lorem Ipsum"; 113 | 114 | $this->assertFilteredHTML($html, $html); 115 | } 116 | 117 | public function testInvalidMarkupIsIgnored() 118 | { 119 | $this->assertFilteredHTML( 120 | "hello world", 121 | "$zh
$jp
$ko
" 134 | ); 135 | } 136 | 137 | public function testIgnoresInvalidUtf8() 138 | { 139 | $this->filter_config->allowTag("p"); 140 | 141 | $this->assertFilteredHTML( 142 | "prefixonclick=alert(42)>infix\n\nsuffix
", 143 | "prefixonclick=alert(42)>infix\xe6\xff\n\nsuffix" 144 | ); 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Attila M. Magyar 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMPOSER = $(shell which composer) 2 | 3 | DOCS_DIR ?= docs 4 | REPORTS_DIR = $(DOCS_DIR)/reports 5 | APIDOCS_DIR = $(DOCS_DIR)/api 6 | 7 | 8 | .PHONY: all 9 | all: coverage docs 10 | build/generate_build_docs.sh >$(DOCS_DIR)/index.html 11 | 12 | 13 | .PHONY: bootstrap 14 | bootstrap: 15 | $(COMPOSER) install 16 | [ -d $(DOCS_DIR) ] || mkdir $(DOCS_DIR) 17 | [ -d $(REPORTS_DIR) ] || mkdir $(REPORTS_DIR) 18 | [ -d $(APIDOCS_DIR) ] || mkdir $(APIDOCS_DIR) 19 | 20 | .PHONY: phpunitconfig 21 | phpunitconfig: 22 | [ -f phpunit.xml ] || cp phpunit.xml.dist phpunit.xml 23 | 24 | .PHONY: check 25 | check: bootstrap phpunitconfig 26 | ./vendor/bin/phpunit --configuration=phpunit.xml \ 27 | --testdox-html $(REPORTS_DIR)/phpunit.html 28 | 29 | .PHONY: coverage 30 | coverage: bootstrap phpunitconfig 31 | ./vendor/bin/phpunit --configuration=phpunit.xml \ 32 | --coverage-clover $(REPORTS_DIR)/phpunit_clover.xml \ 33 | --coverage-html $(REPORTS_DIR)/phpunit_coverage \ 34 | --testdox-html $(REPORTS_DIR)/phpunit.html \ 35 | --coverage-text \ 36 | | grep -v '^\(\\\|\( Methods: .*Lines: \)\)' \ 37 | | tee $(REPORTS_DIR)/phpunit_coverage.txt 38 | 39 | .PHONY: docs 40 | docs: bootstrap 41 | php ./vendor/bin/sami.php update ./sami.config.php 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HTMLFilter 2 | ========== 3 | 4 | [](https://travis-ci.org/attilammagyar/html-filter) [Latest build report][latestbuild] 6 | 7 | [latestbuild]: http://attilammagyar.github.io/html-filter/ 8 | 9 | Remove tags or attributes based on a whitelist from a snippet of somewhat 10 | well-formatted HTML text using PHP's DOM library. 
11 | 12 | Example: 13 | 14 | ```php 15 | allowTag("p") 19 | ->allowAttribute("a", "title") 20 | ->allowAttribute("a", "href", "|^https?://.*\$|"); 21 | 22 | $filter = new AthosHun\HTMLFilter\HTMLFilter(); 23 | 24 | $html = <<dolor sit amet 26 |
27 | Consectetur adipisicing 28 | elit. 29 |
30 | HTML; 31 | 32 | print $filter->filter($config, $html); 33 | ``` 34 | 35 | Output: 36 | 37 | ```html 38 | Lorem ipsum dolor sit amet 39 |40 | Consectetur adipisicing 41 | elit. 42 |
43 | ``` 44 | 45 | Installation 46 | ------------ 47 | 48 | Installation is possible via [Composer][composer]. Create a file named 49 | `composer.json` in your project directory with the following contents: 50 | 51 | [composer]: https://getcomposer.org/ 52 | 53 | { 54 | "require": { 55 | "athoshun/html-filter": "2.0.*" 56 | } 57 | } 58 | 59 | Then as a normal user, issue the following commands: 60 | 61 | $ curl https://getcomposer.org/installer | php 62 | $ php composer.phar install 63 | -------------------------------------------------------------------------------- /build/generate_build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NOW="`date`" 4 | 5 | function cat_html() 6 | { 7 | local text_file="$1" 8 | 9 | cat "$text_file" \ 10 | | sed "s/&/\\&/g" \ 11 | | sed "s/\\</g" \ 12 | | sed "s/>/\\>/g" \ 13 | | sed "s/\"/\\"/g" \ 14 | | sed "s/'/\\'/g" 15 | } 16 | 17 | cat < 19 | 20 | 21 |Generated: $NOW
63 |`cat_html README.md`71 |
`cat_html docs/reports/phpunit_coverage.txt`73 |