├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── composer.json └── src ├── AttributeFinder.php ├── Filter ├── AttributeCleaner.php ├── AttributeContent │ ├── CompactExplodedWords.php │ ├── DecodeEntities.php │ └── DecodeUtf8.php ├── AttributeContentCleaner.php ├── EscapeTags.php ├── FilterRunner.php ├── MetaRefresh.php ├── RemoveAttributes.php └── RemoveBlocks.php ├── FilterInterface.php ├── FilterRunnerTrait.php ├── Sanitizer.php ├── TagFinder ├── ByAttribute.php └── ByTag.php └── TagFinderInterface.php /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](http://keepachangelog.com/) 5 | and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | ## [Unreleased] 8 | 9 | ## [2.1.0] - 2024-06-08 10 | ### Added 11 | - Optional constructor params to specify extra tags or attributes to be removed. 12 | HT @EmanuelAugusto 13 | 14 | ## [2.0.2] - 2023-08-01 15 | ### Removed 16 | - Remove test dependency asset files, and remove test files from dist package. 17 | 18 | ## [2.0.1] - 2022-06-08 19 | ### Fixed 20 | - Attribute cleaner error when filtered tag has empty content. 21 | 22 | ## [2.0.0] - 2021-08-17 23 | ### Added 24 | - Add specific support for PHP v8. 25 | - Type declarations have been added to all properties, method parameters and 26 | return types where possible. 27 | - Add explicit dependency for `mbstring` PHP extension. This has always been 28 | required, but not previously listed in the platform dependencies. 29 | ### Changed 30 | - Use SemVer for dependency versions. 31 | - **BC break**: Reduce visibility of internal methods and properties. These 32 | members are not part of the public API. No impact to standard use of this 33 | package. 
If an implementation has a use case which needs to override these 34 | members, please submit a pull request explaining the change. 35 | ### Removed 36 | - **BC break**: Removed support for PHP versions <= v7.3 as they are no longer 37 | [actively supported](https://php.net/supported-versions.php) by the PHP project. 38 | 39 | ## [1.1.1] - 2022-09-06 40 | ### Fixed 41 | - Attribute cleaner error when filtered tag has empty content. 42 | 43 | ## [1.1.0] - 2016-11-01 44 | - Adds a method for sanitizing an array of strings. 45 | 46 | ## [1.0.0] - 2016-03-18 47 | - Reduce regex backtracking. 48 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. 
The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. 
You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 
108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 
145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # phlib/xss-sanitizer 2 | 3 | [![Code Checks](https://img.shields.io/github/actions/workflow/status/phlib/xss-sanitizer/code-checks.yml?logo=github)](https://github.com/phlib/xss-sanitizer/actions/workflows/code-checks.yml) 4 | [![Codecov](https://img.shields.io/codecov/c/github/phlib/xss-sanitizer.svg?logo=codecov)](https://codecov.io/gh/phlib/xss-sanitizer) 5 | [![Latest Stable Version](https://img.shields.io/packagist/v/phlib/xss-sanitizer.svg?logo=packagist)](https://packagist.org/packages/phlib/xss-sanitizer) 6 | [![Total Downloads](https://img.shields.io/packagist/dt/phlib/xss-sanitizer.svg?logo=packagist)](https://packagist.org/packages/phlib/xss-sanitizer) 7 | ![Licence](https://img.shields.io/github/license/phlib/xss-sanitizer.svg) 8 | 9 | PHP XSS sanitizer tool for HTML 10 | 11 | ## Disclaimer 12 | 13 | Use [HTML Purifier](http://htmlpurifier.org/). 14 | 15 | This library was created to try to solve the problem of XSS sanitization without 16 | using a permissive list, since the HTML which is being sanitized may contain 17 | non-standard or unusual syntax (e.g. HTML for emails). 18 | 19 | This library is also intended for a limited use case whereby it is assumed that 20 | the sanitized HTML is only going to be displayed in a limited set of supported 21 | browsers (e.g. no need to strip 'vbscript:' code). 
22 | 23 | ## Install 24 | 25 | Via Composer 26 | 27 | ``` bash 28 | $ composer require phlib/xss-sanitizer 29 | ``` 30 | 31 | ## Usage 32 | 33 | Create a sanitizer and sanitize some input: 34 | 35 | ```php 36 | $sanitizer = new \Phlib\XssSanitizer\Sanitizer(); 37 | $sanitized = $sanitizer->sanitize($htmlInput); 38 | ``` 39 | 40 | Optionally, extra tags and/or attributes can be specified to be removed, 41 | in addition to the defaults: 42 | 43 | ```php 44 | $removeBlocks = ['xss']; 45 | $removeAttributes = ['onwebkittransitionend']; 46 | $sanitizer = new \Phlib\XssSanitizer\Sanitizer($removeBlocks, $removeAttributes); 47 | $sanitized = $sanitizer->sanitize($htmlInput); 48 | ``` 49 | 50 | ## Supported Browsers 51 | 52 | This library is intended to prevent XSS vulnerabilities when the resulting HTML is rendered by any of the following browsers: 53 | 54 | * Chrome (40+) 55 | * Firefox (40+) 56 | * Safari (8+) 57 | * IE (10, 11) 58 | * Edge 59 | 60 | ## License 61 | 62 | This package is free software: you can redistribute it and/or modify 63 | it under the terms of the GNU Lesser General Public License as published by 64 | the Free Software Foundation, either version 3 of the License, or 65 | (at your option) any later version. 66 | 67 | This program is distributed in the hope that it will be useful, 68 | but WITHOUT ANY WARRANTY; without even the implied warranty of 69 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 70 | GNU Lesser General Public License for more details. 71 | 72 | You should have received a copy of the GNU Lesser General Public License 73 | along with this program. If not, see <http://www.gnu.org/licenses/>.
74 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "phlib/xss-sanitizer", 3 | "type": "library", 4 | "description": "PHP XSS sanitizer tool for HTML", 5 | "license": "LGPL-3.0", 6 | "authors": [ 7 | { 8 | "name": "Martin Price" 9 | }, 10 | { 11 | "name": "Phlib Team & Contributors", 12 | "homepage": "https://github.com/phlib/xss-sanitizer/contributors" 13 | } 14 | ], 15 | "autoload": { 16 | "psr-4": { 17 | "Phlib\\XssSanitizer\\": "src/" 18 | } 19 | }, 20 | "require": { 21 | "php" : "^7.4 || ^8.0", 22 | "ext-mbstring": "*" 23 | }, 24 | "require-dev": { 25 | "phpunit/phpunit": "^9", 26 | "symplify/easy-coding-standard": "^12" 27 | }, 28 | "autoload-dev": { 29 | "psr-4": { 30 | "Phlib\\XssSanitizer\\Test\\": "tests/" 31 | } 32 | }, 33 | "scripts": { 34 | "check-cs": "vendor/bin/ecs check --ansi", 35 | "fix-cs": "vendor/bin/ecs check --fix --ansi" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/AttributeFinder.php: -------------------------------------------------------------------------------- 1 | attributes = $attributes; 27 | $this->optimisticSearchRegex = $this->initOptimisticSearchRegex(); 28 | $this->pessimisticSearchRegex = $this->initPessimisticSearchRegex($attributes); 29 | } 30 | 31 | /** 32 | * Given the attributes string of an element, finds the required attribute(s) and calls the callback, providing the 33 | * full attribute string and the content (value) of the attribute 34 | * 35 | * The return value is used to replace the full attribute string 36 | * 37 | * e.g. 
for an attribute finder which is looking for the 'href' attribute 38 | * for the string 39 | * 'a href="something" id="link"' 40 | * the callback will provide 41 | * $fullAttribute: 'href="something"' 42 | * $attributeContent: 'something' 43 | * and the return from the callback would replace the $fullAttribute in the original string 44 | */ 45 | public function findAttributes(string $attributes, callable $callback): string 46 | { 47 | $filtered = []; 48 | 49 | $this->findAttributesOptimistic($attributes, $callback, $filtered); 50 | 51 | $this->findAttributesPessimistic($attributes, $callback, $filtered); 52 | 53 | return implode('', $filtered); 54 | } 55 | 56 | /** 57 | * Find attributes hoping for well-formed and valid HTML 58 | * 59 | * This should prevent a certain number of false positives by nicely handling attributes which are 60 | * well formed and syntactically good 61 | * 62 | * This allows us to ignore cases where an attribute name appears in the context of an attribute value 63 | * when we know that the attribute is well formed 64 | */ 65 | private function findAttributesOptimistic(string &$attributes, callable $callback, array &$filtered): void 66 | { 67 | while (preg_match($this->optimisticSearchRegex, $attributes, $matches)) { 68 | $attributes = substr($attributes, strlen($matches[0])); 69 | if (in_array(strtolower($matches[3]), $this->attributes, true)) { 70 | $replacement = $callback($matches[2], $matches[5]); 71 | } else { 72 | $replacement = $matches[2]; 73 | } 74 | $filtered[] = $matches[1]; // whitespace 75 | $filtered[] = $replacement; 76 | } 77 | } 78 | 79 | private function initOptimisticSearchRegex(): string 80 | { 81 | return implode('', [ 82 | '#', 83 | '^(\s*)', // group 1 (whitespace) 84 | '(', // group 2 (full attribute) 85 | '([a-z]+)', // group 3 (attribute name) 86 | '=', 87 | '(["\'])', // group 4 (quote) 88 | '(.*?)', // group 5 (attribute value) 89 | '\4', 90 | ')', 91 | '#si', 92 | ]); 93 | } 94 | 95 | /** 96 | * Find attributes 
where the attribute syntax may not be well formed 97 | * 98 | * This acts as a fallback when the optimistic search is not able to parse the attributes 99 | * 100 | * Here, we aren't too bothered about false positives; we want to make sure we catch all and any possibilities 101 | * of the attribute appearing, which may include occurrences within an attribute value 102 | */ 103 | private function findAttributesPessimistic(string $attributes, callable $callback, array &$filtered): void 104 | { 105 | $filtered[] = preg_replace_callback( 106 | $this->pessimisticSearchRegex, 107 | function ($matches) use ($callback) { 108 | $attributeContents = ''; 109 | if (isset($matches[2]) && $matches[2]) { 110 | $attributeContents = $matches[2]; // quoted contents 111 | } elseif (isset($matches[3]) && $matches[3]) { 112 | $attributeContents = $matches[3]; // unquoted contents 113 | } 114 | return $callback($matches[0], $attributeContents); 115 | }, 116 | $attributes 117 | ); 118 | } 119 | 120 | private function initPessimisticSearchRegex(array $attributes): string 121 | { 122 | $attributes = '(?:' . implode('|', $attributes) . ')'; 123 | return implode('', [ 124 | '#', 125 | '(?])*)', // everything up to space or '>' 136 | ')', 137 | '#si', 138 | ]); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/Filter/AttributeCleaner.php: -------------------------------------------------------------------------------- 1 | tagFinder = $tags ? new TagFinder\ByTag($tags) : new TagFinder\ByAttribute($attribute); 31 | $this->attrFinder = new AttributeFinder($attribute); 32 | 33 | $this->contentRegex = $this->buildContentRegex(); 34 | 35 | $this->attributeContentCleaner = $attributeContentCleaner; 36 | } 37 | 38 | /** 39 | * Given the tags and attribute to look for, will search for tags with that attribute containing potential XSS 40 | * exploits, and remove the attribute if found 41 | * 42 | * e.g.
with $tags='a' and $attr='href' 43 | * 44 | * should become 45 | * 46 | */ 47 | public function filter(string $str): string 48 | { 49 | $str = $this->tagFinder->findTags($str, function ($fullTag, $attributes): string { 50 | return $this->cleanAttributes($fullTag, $attributes); 51 | }); 52 | 53 | return $str; 54 | } 55 | 56 | /** 57 | * Search for the attribute in the tags, and clean it if found 58 | * 59 | * @param string $fullTag (e.g. '') 60 | * @param string $attributes (e.g. 'a href="javascript:alert('XSS');"') 61 | */ 62 | private function cleanAttributes(string $fullTag, string $attributes): string 63 | { 64 | $replacement = $this->attrFinder->findAttributes($attributes, function ($fullAttribute, $attributeContents): string { 65 | return $this->cleanAttribute($fullAttribute, $attributeContents); 66 | }); 67 | 68 | return str_ireplace($attributes, $replacement, $fullTag); 69 | } 70 | 71 | /** 72 | * Search the attribute content for any potential exploits, and return empty string 73 | * 74 | * @param string $fullAttribute (e.g. 'href="javascript:alert('XSS');"') 75 | * @param string $attributeContents (e.g. 'javascript:alert('XSS');') 76 | */ 77 | private function cleanAttribute(string $fullAttribute, string $attributeContents): string 78 | { 79 | // decode entities, compact words etc. 
80 | $cleanedContents = $this->attributeContentCleaner->filter($attributeContents); 81 | 82 | if (preg_match($this->contentRegex, $cleanedContents)) { 83 | return ''; 84 | } 85 | 86 | return $fullAttribute; 87 | } 88 | 89 | private function buildContentRegex(): string 90 | { 91 | $dangerous = [ 92 | 'javascript:', 93 | ]; 94 | 95 | return implode('', [ 96 | '#', 97 | '(', implode('|', $dangerous), ')', 98 | '#i', 99 | ]); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/Filter/AttributeContent/CompactExplodedWords.php: -------------------------------------------------------------------------------- 1 | wordsRegex = $this->buildWordsRegex(); 19 | } 20 | 21 | /** 22 | * Compacts certain potentially dangerous words which have had whitespace added between the letters 23 | * 24 | * e.g. 25 | * j a v a s c r i p t 26 | * becomes 27 | * javascript 28 | */ 29 | public function filter(string $str): string 30 | { 31 | $str = preg_replace_callback( 32 | $this->wordsRegex, 33 | function ($matches): string { 34 | return preg_replace('/\s+/', '', $matches[1]) . $matches[2]; 35 | }, 36 | $str 37 | ); 38 | 39 | return $str; 40 | } 41 | 42 | private function buildWordsRegex(): string 43 | { 44 | $rawWords = [ 45 | 'javascript', 46 | 'refresh', /* @see Phlib\XssSanitizer\Filter\MetaRefresh */ 47 | ]; 48 | 49 | $words = []; 50 | foreach ($rawWords as $word) { 51 | $words[] = chunk_split($word, 1, '\s*'); 52 | } 53 | 54 | return implode('', [ 55 | '#(', implode('|', $words), ')(\W|$)#is', 56 | ]); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/Filter/AttributeContent/DecodeEntities.php: -------------------------------------------------------------------------------- 1 | entityRegex = $this->buildEntityRegex(); 19 | } 20 | 21 | /** 22 | * Decode HTML entities in an attribute content string 23 | * 24 | * e.g.
25 | * &#106;&#97;vascript:alert('XSS'); 26 | * becomes 27 | * javascript:alert('XSS'); 28 | * 28 | * Both decimal (&#106;) and hexadecimal (&#x6A;) numeric entities are matched, with or without the 28 | * trailing semicolon, and leading zero padding is ignored 28 | */ 29 | public function filter(string $str): string 30 | { 31 | $str = preg_replace_callback( 32 | $this->entityRegex, 33 | function ($matches): string { 34 | /* Group 1 is the decimal value; it is '' when the hexadecimal alternative matched. Compare strictly: an all-zero entity captures "0", which is falsy in PHP and would otherwise fall through to the hexadecimal branch with an empty or undefined group 2. */ 34 | if ($matches[1] !== '') { 35 | $entity = "&#{$matches[1]};"; 36 | } else { 37 | $entity = "&#x{$matches[2]};"; 38 | } 39 | return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES'); 40 | }, 41 | $str 42 | ); 43 | return $str; 44 | } 45 | 46 | private function buildEntityRegex(): string 47 | { 48 | return implode('', [ 49 | '/', 50 | '&#', 51 | '(?:', 52 | // decimal 53 | '(?:0*)', // ignore zero padding 54 | '([0-9]+)', 55 | '|', 56 | // hexadecimal 57 | 'x(?:0*)', // ignore zero padding 58 | '([0-9a-f]+)', 59 | ')', 60 | '(;)?', 61 | '/i', 62 | ]); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/Filter/AttributeContent/DecodeUtf8.php: -------------------------------------------------------------------------------- 1 | initFilters(); 25 | } 26 | 27 | /** 28 | * Filters the content of an attribute 29 | * This should be decoding UTF-8 and HTML entities, and compacting any exploded words which we're searching for 30 | * 31 | * e.g.
32 | * \u006A a v a s c r i p t:alert('XSS'); 33 | * should become 34 | * javascript:alert('XSS'); 35 | */ 36 | public function filter(string $str): string 37 | { 38 | return $this->runFilters($str, $this->filters); 39 | } 40 | 41 | private function initFilters(): void 42 | { 43 | $this->filters = [ 44 | new AttributeContent\DecodeUtf8(), 45 | new AttributeContent\DecodeEntities(), 46 | new AttributeContent\CompactExplodedWords(), 47 | ]; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/Filter/EscapeTags.php: -------------------------------------------------------------------------------- 1 | searchRegex = $this->initSearchRegex($tags); 22 | } 23 | 24 | /** 25 | * Filter tags by html encoding the opening angle bracket 26 | * 27 | * e.g. 28 | * 29 | * becomes 30 | * <script type="text/javascript">alert('XSS');</script> 31 | */ 32 | public function filter(string $str): string 33 | { 34 | $str = preg_replace($this->searchRegex, '<\1', $str); 35 | 36 | return $str; 37 | } 38 | 39 | /** 40 | * @param string|string[] $tags 41 | */ 42 | private function initSearchRegex($tags): string 43 | { 44 | if (is_array($tags)) { 45 | $tags = '(?:' . implode('|', $tags) . 
')'; 46 | } 47 | return implode('', [ 48 | '#', 49 | '<', 50 | '(/?', $tags, ')', 51 | '#si', 52 | ]); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/Filter/FilterRunner.php: -------------------------------------------------------------------------------- 1 | filters = $filters; 31 | } 32 | 33 | /** 34 | * Runs each of the filters against the string repeatedly 35 | */ 36 | public function filter(string $str): string 37 | { 38 | return $this->runFilters($str, $this->filters); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/Filter/MetaRefresh.php: -------------------------------------------------------------------------------- 1 | tagFinder = new TagFinder\ByTag('meta'); 25 | $this->attrFinder = new AttributeFinder('http-equiv'); 26 | 27 | $this->attributeContentCleaner = $attributeContentCleaner; 28 | } 29 | 30 | /** 31 | * Removes refresh meta tags 32 | * @see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#META 33 | * 34 | * e.g. 35 | * 36 | * would be removed 37 | */ 38 | public function filter(string $str): string 39 | { 40 | $str = $this->tagFinder->findTags($str, function ($fullTag, $attributes): string { 41 | return $this->cleanTag($fullTag, $attributes); 42 | }); 43 | return $str; 44 | } 45 | 46 | /** 47 | * Replaces the tag with an empty string if the 'http-equiv' is set to 'refresh' 48 | * 49 | * @param string $fullTag (e.g. '') 50 | * @param string $attributes (e.g. 
'meta http-equiv="refresh"') 51 | */ 52 | private function cleanTag(string $fullTag, string $attributes): string 53 | { 54 | $isRefreshTag = false; 55 | 56 | $this->attrFinder->findAttributes($attributes, function ($full, $contents) use (&$isRefreshTag) { 57 | $cleanedContents = $this->attributeContentCleaner->filter($contents); 58 | if (preg_match('/refresh/i', $cleanedContents)) { 59 | $isRefreshTag = true; 60 | } 61 | return $full; 62 | }); 63 | 64 | if ($isRefreshTag) { 65 | $fullTag = ''; 66 | } 67 | return $fullTag; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/Filter/RemoveAttributes.php: -------------------------------------------------------------------------------- 1 | tagFinder = new TagFinder\ByAttribute($attributes); 44 | $this->attributeFinder = new AttributeFinder($attributes); 45 | } 46 | 47 | /** 48 | * Filter unwanted attributes from tags 49 | * 50 | * This includes event handler attributes ('onload', 'onclick' etc.) 51 | * e.g. '' 52 | */ 53 | public function filter(string $str): string 54 | { 55 | $str = $this->tagFinder->findTags($str, function ($fullTag, $attributes): string { 56 | return $this->removeAttribute($fullTag, $attributes); 57 | }); 58 | 59 | return $str; 60 | } 61 | 62 | /** 63 | * Removes unwanted attributes from a particular tag 64 | * 65 | * @param string $fullTag (e.g. '') 66 | * @param string $attributes (e.g. 
'a onclick="alert('XSS');"') 67 | */ 68 | private function removeAttribute(string $fullTag, string $attributes): string 69 | { 70 | $replacement = $this->attributeFinder->findAttributes($attributes, function (): string { 71 | return ''; 72 | }); 73 | 74 | return str_ireplace($attributes, $replacement, $fullTag); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/Filter/RemoveBlocks.php: -------------------------------------------------------------------------------- 1 | searchRegex = $this->initSearchRegex($tags); 22 | } 23 | 24 | /** 25 | * Filter tags out of the HTML by removing whole blocks (from opening tag to closing tag) 26 | * 27 | * This filter should be used in conjunction with @see \Phlib\XssSanitizer\Filter\EscapeTags to ensure that any 28 | * tags which are not picked up will be escaped 29 | * 30 | * e.g. 31 | * 32 | * becomes 33 | * 34 | */ 35 | public function filter(string $str): string 36 | { 37 | $str = preg_replace($this->searchRegex, '', $str); 38 | 39 | return $str; 40 | } 41 | 42 | /** 43 | * @param string|string[] $tags 44 | */ 45 | private function initSearchRegex($tags): string 46 | { 47 | if (is_array($tags)) { 48 | $tags = '(?:' . implode('|', $tags) . 
')'; 49 | } 50 | return implode('', [ 51 | '#', 52 | // open tag 53 | '<', 54 | '(', $tags, ')', 55 | '([^>]*?)', 56 | '>', 57 | // content 58 | '.*?', 59 | // closing tag 60 | ']*?)', 63 | '(>|$)', 64 | '#si', 65 | ]); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/FilterInterface.php: -------------------------------------------------------------------------------- 1 | applyEachFilter($str, $filters); 22 | } while ($pre !== $str); 23 | 24 | return $str; 25 | } 26 | 27 | /** 28 | * Apply each filter in the filters array 29 | * 30 | * @param FilterInterface[] $filters 31 | */ 32 | private function applyEachFilter(string $str, array $filters): string 33 | { 34 | foreach ($filters as $filter) { 35 | $str = $filter->filter($str); 36 | } 37 | return $str; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/Sanitizer.php: -------------------------------------------------------------------------------- 1 | removeBlocks = ['script', 'iframe', 'object', ...$removeBlocks]; 26 | $this->removeAttributes = $removeAttributes; 27 | $this->initFilters(); 28 | } 29 | 30 | public function sanitize(string $str): string 31 | { 32 | $str = $this->runFilters($str, $this->filters); 33 | 34 | return $str; 35 | } 36 | 37 | /** 38 | * Sanitize an array of HTML strings 39 | * 40 | * @param string[] $strings 41 | * @return string[] 42 | */ 43 | public function sanitizeArray(array $strings): array 44 | { 45 | foreach ($strings as &$str) { 46 | $str = $this->sanitize($str); 47 | } 48 | 49 | return $strings; 50 | } 51 | 52 | private function initFilters(): void 53 | { 54 | $this->filters = []; 55 | 56 | $attributeContentCleaner = new Filter\AttributeContentCleaner(); 57 | $this->filters[] = new Filter\AttributeCleaner('href', $attributeContentCleaner, ['a', 'link']); 58 | $this->filters[] = new Filter\AttributeCleaner('src', $attributeContentCleaner, ['img', 'input', 'bgsound']); 59 | 
/**
 * Given a full html string, finds the required tags by attribute and calls the callback,
 * providing the full tag string and the attributes string
 *
 * The callback is invoked as $callback($fullTag, $attributes); whatever it returns replaces
 * $fullTag in the string, and the search then resumes directly after that replacement.
 * Attribute matches for which no enclosing tag start or tag end can be found are skipped.
 *
 * @throws \RuntimeException if the PCRE engine reports an error while scanning. Continuing (or
 *     silently stopping) at that point would hand back markup that was only partly processed.
 */
public function findTags(string $str, callable $callback): string
{
    $searchOffset = 0;
    while (true) {
        $found = preg_match($this->initialSearchRegex, $str, $matches, PREG_OFFSET_CAPTURE, $searchOffset);
        if ($found === false) {
            // preg_match() returns false on engine errors (e.g. backtrack limit), which is
            // not the same as "no more attributes" - fail closed instead of ending the scan
            throw $this->pcreFailure('searching for attributes');
        }
        if ($found === 0) {
            break;
        }

        // With PREG_OFFSET_CAPTURE each match is a [text, byte offset] pair
        $attr = $matches[0][0];
        $offset = $matches[0][1];
        $attrEnd = $offset + strlen($attr);

        // Default: carry on after this attribute. Overridden below once a tag is replaced.
        $searchOffset = $attrEnd;

        $startOfTag = $this->findStartOfTag(substr($str, 0, $offset));
        if ($startOfTag === null) {
            continue;
        }
        $endOfTag = $this->findEndOfTag(substr($str, $attrEnd));
        if ($endOfTag === null) {
            continue;
        }

        // Index 0 holds the full matched text on each side, index 1 the captured group
        $fullTag = $startOfTag[0] . $attr . $endOfTag[0];
        $attributes = $startOfTag[1] . $attr . $endOfTag[1];

        $replacement = $callback($fullTag, $attributes);

        $tagOffset = $offset - strlen($startOfTag[0]);
        $str = substr_replace($str, $replacement, $tagOffset, strlen($fullTag));

        // continue searching from after the end of the replaced tag
        $searchOffset = $tagOffset + strlen($replacement);
    }

    return $str;
}

/**
 * Finds the start of the tag from the attribute found
 *
 * The text before the attribute is reversed so that the search can be anchored at the
 * attribute and run backwards: it must reach a '<' followed by a letter without crossing a '>'.
 *
 * If the start of the tag is found, returns the matches array (un-reversed) with the full tag
 * start at index 0 and the captured group at index 1
 * If not found, returns null
 *
 * @throws \RuntimeException if the PCRE engine reports an error
 */
private function findStartOfTag(string $beforeStr): ?array
{
    // Searching backwards from the found attribute
    $startTag = preg_match('#^([^>]+)[a-z]<#si', strrev($beforeStr), $matches);
    if ($startTag === false) {
        throw $this->pcreFailure('locating the start of a tag');
    }
    if ($startTag === 0) {
        return null;
    }
    // reverse back again
    return array_map('strrev', $matches);
}

/**
 * Finds the end of the tag from the attribute found
 *
 * The text after the attribute must consist of at least one non-'>' character followed by '>'.
 *
 * If the end of the tag is found, returns the matches array with the full tag end at index 0
 * and the captured group (everything before the closing '>') at index 1
 * If not found, returns null
 *
 * @throws \RuntimeException if the PCRE engine reports an error
 */
private function findEndOfTag(string $afterStr): ?array
{
    $endTag = preg_match('#^([^>]+)>#si', $afterStr, $matches);
    if ($endTag === false) {
        throw $this->pcreFailure('locating the end of a tag');
    }
    if ($endTag === 0) {
        return null;
    }
    return $matches;
}

/**
 * Builds the exception used when a PCRE call fails rather than reporting "no match"
 *
 * Must be called straight after the failing call so preg_last_error() still refers to it.
 */
private function pcreFailure(string $operation): \RuntimeException
{
    return new \RuntimeException(
        sprintf('PCRE error %d while %s', preg_last_error(), $operation)
    );
}
/**
 * Given a full html string, finds the required tags by tag name and calls the callback,
 * providing the full tag string and the attributes string
 *
 * The return value of the callback is used to replace the full tag string
 *
 * e.g. for a tag finder which is looking for an img tag, for the string
 *     '<img src="something">'
 * the callback receives
 *     $fullTag:    '<img src="something">'
 *     $attributes: 'src="something"'
 * and the return from the callback would replace the $fullTag in the original string
 *
 * @throws \RuntimeException if the PCRE engine reports an error while scanning
 */
public function findTags(string $str, callable $callback): string
{
    $result = preg_replace_callback(
        $this->searchRegex,
        static function (array $matches) use ($callback) {
            // $matches[0] is the whole tag, $matches[1] the captured attributes portion
            return $callback($matches[0], $matches[1]);
        },
        $str
    );

    if ($result === null) {
        // preg_replace_callback() returns null on engine errors; without this check the
        // declared string return type would turn that into an unexplained TypeError
        throw new \RuntimeException(
            sprintf('PCRE error %d while searching for tags', preg_last_error())
        );
    }

    return $result;
}

/**
 * Builds the case-insensitive pattern used by findTags()
 *
 * An array of tag names becomes a non-capturing alternation; a string is used as given.
 * Names are inserted into the pattern verbatim (they are not passed through preg_quote()).
 *
 * @param string|string[] $tags
 */
private function initSearchRegex($tags): string
{
    if (is_array($tags)) {
        $tags = '(?:' . implode('|', $tags) . ')';
    }
    return implode('', [
        '#<',
        $tags,
        // at least one separator character after the tag name, then capture everything up to
        // the closing '>' - or up to the end of the string if the tag is never closed
        '[^a-z0-9>]+([^>]*)(?:>|$)',
        '#si',
    ]);
}