├── CHANGELOG.md
├── LICENSE.md
├── README.md
├── composer.json
└── src
├── AttributeFinder.php
├── Filter
├── AttributeCleaner.php
├── AttributeContent
│ ├── CompactExplodedWords.php
│ ├── DecodeEntities.php
│ └── DecodeUtf8.php
├── AttributeContentCleaner.php
├── EscapeTags.php
├── FilterRunner.php
├── MetaRefresh.php
├── RemoveAttributes.php
└── RemoveBlocks.php
├── FilterInterface.php
├── FilterRunnerTrait.php
├── Sanitizer.php
├── TagFinder
├── ByAttribute.php
└── ByTag.php
└── TagFinderInterface.php
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file.
3 |
4 | The format is based on [Keep a Changelog](http://keepachangelog.com/)
5 | and this project adheres to [Semantic Versioning](http://semver.org/).
6 |
7 | ## [Unreleased]
8 |
9 | ## [2.1.0] - 2024-06-08
10 | ### Added
11 | - Optional constructor params to specify extra tags or attributes to be removed.
12 | HT @EmanuelAugusto
13 |
14 | ## [2.0.2] - 2023-08-01
15 | ### Removed
16 | - Remove test dependency asset files, and remove test files from dist package.
17 |
18 | ## [2.0.1] - 2022-06-08
19 | ### Fixed
20 | - Attribute cleaner error when filtered tag has empty content.
21 |
22 | ## [2.0.0] - 2021-08-17
23 | ### Added
24 | - Add specific support for PHP v8.
25 | - Type declarations have been added to all properties, method parameters and
26 | return types where possible.
27 | - Add explicit dependency for `mbstring` PHP extension. This has always been
28 | required, but not previously listed in the platform dependencies.
29 | ### Changed
30 | - Use SemVer for dependency versions.
31 | - **BC break**: Reduce visibility of internal methods and properties. These
32 | members are not part of the public API. No impact to standard use of this
33 | package. If an implementation has a use case which needs to override these
34 | members, please submit a pull request explaining the change.
35 | ### Removed
36 | - **BC break**: Removed support for PHP versions <= v7.3 as they are no longer
37 | [actively supported](https://php.net/supported-versions.php) by the PHP project.
38 |
39 | ## [1.1.1] - 2022-09-06
40 | ### Fixed
41 | - Attribute cleaner error when filtered tag has empty content.
42 |
43 | ## [1.1.0] - 2016-11-01
44 | - Adds a method for sanitizing an array of strings.
45 |
46 | ## [1.0.0] - 2016-03-18
47 | - Reduce regex backtracking.
48 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # phlib/xss-sanitizer
2 |
3 | [Code Checks](https://github.com/phlib/xss-sanitizer/actions/workflows/code-checks.yml)
4 | [Codecov](https://codecov.io/gh/phlib/xss-sanitizer)
5 | [Latest Stable Version](https://packagist.org/packages/phlib/xss-sanitizer)
6 | [Total Downloads](https://packagist.org/packages/phlib/xss-sanitizer)
7 | 
8 |
9 | PHP XSS sanitizer tool for HTML
10 |
11 | ## Disclaimer
12 |
13 | Use [HTML Purifier](http://htmlpurifier.org/).
14 |
15 | This library was created to try to solve the problem of XSS sanitization without
16 | using a permissive list, since the HTML which is being sanitized may contain
17 | non-standard or unusual syntax (e.g. HTML for emails).
18 |
19 | This library is also intended for a limited use case whereby it is assumed that
20 | the sanitized HTML is only going to be displayed in a limited set of supported
21 | browsers (e.g. no need to strip 'vbscript:' code).
22 |
23 | ## Install
24 |
25 | Via Composer
26 |
27 | ``` bash
28 | $ composer require phlib/xss-sanitizer
29 | ```
30 |
31 | ## Usage
32 |
33 | Create a sanitizer and sanitize some input:
34 |
35 | ```php
36 | $sanitizer = new \Phlib\XssSanitizer\Sanitizer();
37 | $sanitized = $sanitizer->sanitize($htmlInput);
38 | ```
39 |
40 | Optionally, extra tags and/or attributes can be specified to be removed,
41 | in addition to the defaults:
42 |
43 | ```php
44 | $removeBlocks = ['xss'];
45 | $removeAttributes = ['onwebkittransitionend'];
46 | $sanitizer = new \Phlib\XssSanitizer\Sanitizer($removeBlocks, $removeAttributes);
47 | $sanitized = $sanitizer->sanitize($htmlInput);
48 | ```
49 |
50 | ## Supported Browsers
51 |
52 | This library is intended to prevent XSS vulnerabilities when the resulting HTML is rendered by any of the following browsers:
53 |
54 | * Chrome (40+)
55 | * Firefox (40+)
56 | * Safari (8+)
57 | * IE (10, 11)
58 | * Edge
59 |
60 | ## License
61 |
62 | This package is free software: you can redistribute it and/or modify
63 | it under the terms of the GNU Lesser General Public License as published by
64 | the Free Software Foundation, either version 3 of the License, or
65 | (at your option) any later version.
66 |
67 | This program is distributed in the hope that it will be useful,
68 | but WITHOUT ANY WARRANTY; without even the implied warranty of
69 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
70 | GNU Lesser General Public License for more details.
71 |
72 | You should have received a copy of the GNU Lesser General Public License
73 | along with this program. If not, see <https://www.gnu.org/licenses/>.
74 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "phlib/xss-sanitizer",
3 | "type": "library",
4 | "description": "PHP XSS sanitizer tool for HTML",
5 | "license": "LGPL-3.0",
6 | "authors": [
7 | {
8 | "name": "Martin Price"
9 | },
10 | {
11 | "name": "Phlib Team & Contributors",
12 | "homepage": "https://github.com/phlib/xss-sanitizer/contributors"
13 | }
14 | ],
15 | "autoload": {
16 | "psr-4": {
17 | "Phlib\\XssSanitizer\\": "src/"
18 | }
19 | },
20 | "require": {
21 | "php" : "^7.4 || ^8.0",
22 | "ext-mbstring": "*"
23 | },
24 | "require-dev": {
25 | "phpunit/phpunit": "^9",
26 | "symplify/easy-coding-standard": "^12"
27 | },
28 | "autoload-dev": {
29 | "psr-4": {
30 | "Phlib\\XssSanitizer\\Test\\": "tests/"
31 | }
32 | },
33 | "scripts": {
34 | "check-cs": "vendor/bin/ecs check --ansi",
35 | "fix-cs": "vendor/bin/ecs check --fix --ansi"
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/AttributeFinder.php:
--------------------------------------------------------------------------------
1 | attributes = $attributes;
27 | $this->optimisticSearchRegex = $this->initOptimisticSearchRegex();
28 | $this->pessimisticSearchRegex = $this->initPessimisticSearchRegex($attributes);
29 | }
30 |
/**
 * Locate the configured attribute(s) within an element's attribute string and let the callback rewrite them
 *
 * For each attribute found, the callback is given the full attribute text and the attribute's content (value);
 * the callback's return value takes the place of the full attribute text in the returned string.
 *
 * e.g. for an attribute finder which is looking for the 'href' attribute, given the string
 *     'a href="something" id="link"'
 * the callback receives
 *     $fullAttribute: 'href="something"'
 *     $attributeContent: 'something'
 *
 * @param string $attributes the attribute string of a single element
 * @param callable $callback invoked as $callback($fullAttribute, $attributeContent)
 * @return string the attribute string with the callback's replacements applied
 */
public function findAttributes(string $attributes, callable $callback): string
{
    $pieces = [];

    // The optimistic pass takes $attributes by reference and strips off whatever well-formed
    // attributes it manages to consume from the front of the string
    $this->findAttributesOptimistic($attributes, $callback, $pieces);

    // The remainder is handed to the catch-all pass
    $this->findAttributesPessimistic($attributes, $callback, $pieces);

    return implode('', $pieces);
}
55 |
/**
 * First pass: consume attributes from the front of the string for as long as they are well formed
 *
 * An attribute is consumed when the optimistic regex matches at the start of the string, i.e. optional
 * whitespace, a purely alphabetic name, '=' and a quoted value. Because the value is delimited by its quotes,
 * an attribute name appearing inside another attribute's value is not mistaken for an attribute here.
 *
 * @param string $attributes shortened in place by every attribute that is consumed
 * @param callable $callback invoked as $callback($fullAttribute, $attributeValue) for the configured attributes
 * @param array $filtered receives, per consumed attribute, its leading whitespace and then its replacement
 */
private function findAttributesOptimistic(string &$attributes, callable $callback, array &$filtered): void
{
    while (preg_match($this->optimisticSearchRegex, $attributes, $found) === 1) {
        [$consumed, $leadingWhitespace, $fullAttribute, $name, , $value] = $found;

        // Move past the text just matched so the next iteration looks at the following attribute
        $attributes = substr($attributes, strlen($consumed));

        // Attributes we are not looking for are passed through untouched
        $isWanted = in_array(strtolower($name), $this->attributes, true);
        $replacement = $isWanted ? $callback($fullAttribute, $value) : $fullAttribute;

        $filtered[] = $leadingWhitespace;
        $filtered[] = $replacement;
    }
}
78 |
/**
 * Build the regex used by the optimistic search
 *
 * Capture groups: 1 = leading whitespace, 2 = full attribute, 3 = attribute name, 4 = quote character,
 * 5 = attribute value. The pattern is anchored to the start of the subject.
 */
private function initOptimisticSearchRegex(): string
{
    $leadingWhitespace = '^(\s*)';
    $attributeName = '([a-z]+)';
    $openingQuote = '(["\'])';
    $attributeValue = '(.*?)';
    $closingQuote = '\4'; // must be the same character as the opening quote

    $fullAttribute = '(' . $attributeName . '=' . $openingQuote . $attributeValue . $closingQuote . ')';

    return '#' . $leadingWhitespace . $fullAttribute . '#si';
}
94 |
/**
 * Second pass: find the attribute(s) even where the attribute syntax may not be well formed
 *
 * This is the fallback for whatever the optimistic search could not parse. False positives are acceptable
 * here; the aim is to catch every possible occurrence of the attribute, which may include occurrences inside
 * another attribute's value.
 *
 * @param string $attributes the attribute text left over from the optimistic search
 * @param callable $callback invoked as $callback($fullMatch, $attributeContents)
 * @param array $filtered receives the rewritten text as a single new element
 */
private function findAttributesPessimistic(string $attributes, callable $callback, array &$filtered): void
{
    $rewrite = static function ($matches) use ($callback) {
        $quoted = $matches[2] ?? '';   // quoted contents
        $unquoted = $matches[3] ?? ''; // unquoted contents

        // Prefer the quoted contents, fall back to the unquoted contents, otherwise no contents at all
        $attributeContents = $quoted ?: ($unquoted ?: '');

        return $callback($matches[0], $attributeContents);
    };

    $filtered[] = preg_replace_callback($this->pessimisticSearchRegex, $rewrite, $attributes);
}
119 |
120 | private function initPessimisticSearchRegex(array $attributes): string
121 | {
122 | $attributes = '(?:' . implode('|', $attributes) . ')';
123 | return implode('', [
124 | '#',
125 | '(?])*)', // everything up to space or '>'
136 | ')',
137 | '#si',
138 | ]);
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
/src/Filter/AttributeCleaner.php:
--------------------------------------------------------------------------------
1 | tagFinder = $tags ? new TagFinder\ByTag($tags) : new TagFinder\ByAttribute($attribute);
31 | $this->attrFinder = new AttributeFinder($attribute);
32 |
33 | $this->contentRegex = $this->buildContentRegex();
34 |
35 | $this->attributeContentCleaner = $attributeContentCleaner;
36 | }
37 |
/**
 * Look for tags carrying the configured attribute and strip that attribute where it holds a potential XSS
 * exploit
 *
 * e.g. with $tags='a' and $attr='href', an 'a' tag whose href content, once cleaned, contains 'javascript:'
 * has its href attribute removed.
 *
 * @param string $str HTML to filter
 * @return string the HTML as returned by the tag finder after each tag was passed through cleanAttributes()
 */
public function filter(string $str): string
{
    return $this->tagFinder->findTags(
        $str,
        fn ($fullTag, $attributes): string => $this->cleanAttributes($fullTag, $attributes)
    );
}
55 |
/**
 * Run the attribute finder over one tag and splice the cleaned attribute string back into the tag
 *
 * @param string $fullTag the complete tag text
 * @param string $attributes the attribute portion of the tag (e.g. 'a href="javascript:alert('XSS');"')
 * @return string $fullTag with its attribute portion replaced by the cleaned version
 */
private function cleanAttributes(string $fullTag, string $attributes): string
{
    $cleanedAttributes = $this->attrFinder->findAttributes(
        $attributes,
        fn ($fullAttribute, $attributeContents): string => $this->cleanAttribute($fullAttribute, $attributeContents)
    );

    return str_ireplace($attributes, $cleanedAttributes, $fullTag);
}
70 |
/**
 * Decide whether a single attribute survives
 *
 * The attribute content is normalised by the content cleaner and then tested against the content regex.
 *
 * @param string $fullAttribute (e.g. 'href="javascript:alert('XSS');"')
 * @param string $attributeContents (e.g. 'javascript:alert('XSS');')
 * @return string empty string when the cleaned content matches the regex, otherwise $fullAttribute unchanged
 */
private function cleanAttribute(string $fullAttribute, string $attributeContents): string
{
    // decode entities, compact words etc.
    $normalised = $this->attributeContentCleaner->filter($attributeContents);

    $looksDangerous = preg_match($this->contentRegex, $normalised);

    return $looksDangerous ? '' : $fullAttribute;
}
88 |
/**
 * Build the case-insensitive regex which matches any of the dangerous strings in attribute content
 */
private function buildContentRegex(): string
{
    $dangerous = ['javascript:'];

    return '#(' . implode('|', $dangerous) . ')#i';
}
101 | }
102 |
--------------------------------------------------------------------------------
/src/Filter/AttributeContent/CompactExplodedWords.php:
--------------------------------------------------------------------------------
1 | wordsRegex = $this->buildWordsRegex();
19 | }
20 |
/**
 * Compacts certain potentially dangerous words which have had whitespace added between the letters
 *
 * e.g.
 *     j a v a s c r i p t
 * becomes
 *     javascript
 *
 * @param string $str attribute content
 * @return string the content with every matched word stripped of its whitespace
 */
public function filter(string $str): string
{
    $compact = static function ($matches): string {
        // Group 1 is the (possibly exploded) word, group 2 is whatever terminated it
        $word = preg_replace('/\s+/', '', $matches[1]);
        return $word . $matches[2];
    };

    return preg_replace_callback($this->wordsRegex, $compact, $str);
}
41 |
/**
 * Build the regex which matches each target word with optional whitespace after every letter
 *
 * Group 1 captures the word (including the whitespace), group 2 the non-word character or end of string
 * which follows it.
 */
private function buildWordsRegex(): string
{
    $targets = [
        'javascript',
        'refresh', /* @see Phlib\XssSanitizer\Filter\MetaRefresh */
    ];

    // 'abc' becomes 'a\s*b\s*c\s*'
    $patterns = array_map(
        static fn (string $word): string => implode('\s*', str_split($word)) . '\s*',
        $targets
    );

    return '#(' . implode('|', $patterns) . ')(\W|$)#is';
}
58 | }
59 |
--------------------------------------------------------------------------------
/src/Filter/AttributeContent/DecodeEntities.php:
--------------------------------------------------------------------------------
1 | entityRegex = $this->buildEntityRegex();
19 | }
20 |
/**
 * Decode numeric HTML entities in an attribute content string
 *
 * Both decimal ('&#106;') and hexadecimal ('&#x6A;') forms are handled, with or without zero padding and
 * with or without the trailing semicolon.
 *
 * e.g.
 *     &#106;avascript:alert('XSS');
 * becomes
 *     javascript:alert('XSS');
 *
 * @param string $str attribute content
 * @return string the content with numeric entities replaced by the characters they encode
 */
public function filter(string $str): string
{
    $str = preg_replace_callback(
        $this->entityRegex,
        static function (array $matches): string {
            // Group 1 holds decimal digits, group 2 hexadecimal digits. The comparison is against '' rather
            // than a truthiness test because the decimal value "0" is falsy in PHP and would otherwise be
            // sent down the hexadecimal branch, where group 2 may not even be present in $matches.
            if (($matches[1] ?? '') !== '') {
                $entity = '&#' . $matches[1] . ';';
            } else {
                $entity = '&#x' . ($matches[2] ?? '') . ';';
            }
            // NOTE(review): the 'HTML-ENTITIES' encoding is reported as deprecated for mbstring functions
            // from PHP 8.2; confirm and consider mb_decode_numericentity() once behaviour parity is tested.
            return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
        },
        $str
    );
    return $str;
}

/**
 * Build the regex which matches a numeric HTML entity
 *
 * Group 1 captures the decimal digits, group 2 the hexadecimal digits (zero padding excluded in both cases)
 * and group 3 the optional trailing semicolon.
 */
private function buildEntityRegex(): string
{
    return implode('', [
        '/',
        '&#', // without this prefix every bare number in the content would be treated as an entity
        '(?:',
        // decimal
        '(?:0*)', // ignore zero padding
        '([0-9]+)',
        '|',
        // hexadecimal
        'x(?:0*)', // ignore zero padding
        '([0-9a-f]+)',
        ')',
        '(;)?',
        '/i',
    ]);
}
64 | }
65 |
--------------------------------------------------------------------------------
/src/Filter/AttributeContent/DecodeUtf8.php:
--------------------------------------------------------------------------------
1 | initFilters();
25 | }
26 |
/**
 * Filters the content of an attribute by passing it through this cleaner's filters via runFilters()
 *
 * The intent is to decode UTF-8 and HTML entities, and to compact any exploded words which we're searching for
 *
 * e.g.
 *     \u006A a v a s c r i p t:alert('XSS');
 * should become
 *     javascript:alert('XSS');
 */
public function filter(string $str): string
{
    $cleaned = $this->runFilters($str, $this->filters);

    return $cleaned;
}
40 |
/**
 * Set up the attribute content filters in the order they are applied
 */
private function initFilters(): void
{
    $decodeUtf8 = new AttributeContent\DecodeUtf8();
    $decodeEntities = new AttributeContent\DecodeEntities();
    $compactWords = new AttributeContent\CompactExplodedWords();

    $this->filters = [$decodeUtf8, $decodeEntities, $compactWords];
}
49 | }
50 |
--------------------------------------------------------------------------------
/src/Filter/EscapeTags.php:
--------------------------------------------------------------------------------
1 | searchRegex = $this->initSearchRegex($tags);
22 | }
23 |
/**
 * Filter tags by html encoding the opening angle bracket
 *
 * Applies to both the opening and the closing form of each configured tag.
 *
 * e.g.
 *     <script type="text/javascript">alert('XSS');</script>
 * becomes
 *     &lt;script type="text/javascript">alert('XSS');&lt;/script>
 *
 * @param string $str HTML to filter
 * @return string the HTML with '<' of every matched tag replaced by '&lt;'
 */
public function filter(string $str): string
{
    // Group 1 is the optional '/' plus the tag name; only the bracket in front of it is encoded.
    // The replacement must emit the entity: writing a literal '<' back would leave the tag intact.
    $str = preg_replace($this->searchRegex, '&lt;\1', $str);

    return $str;
}
38 |
/**
 * Build the regex which matches the start of an opening or closing tag for any of the given tag names
 *
 * Group 1 captures the optional '/' together with the tag name.
 *
 * @param string|string[] $tags a list of tag names, or a single string which is inserted as given
 */
private function initSearchRegex($tags): string
{
    if (is_array($tags)) {
        // Tag names are literals: quote them so that regex metacharacters (and the '#' delimiter) in a
        // caller-supplied name can neither break nor alter the pattern
        $quoted = array_map(
            static fn ($tag): string => preg_quote((string) $tag, '#'),
            $tags
        );
        $tags = '(?:' . implode('|', $quoted) . ')';
    }
    // A plain string is deliberately left untouched, as before, in case a caller passes a pattern fragment

    return implode('', [
        '#',
        '<',
        '(/?', $tags, ')',
        '#si',
    ]);
}
54 | }
55 |
--------------------------------------------------------------------------------
/src/Filter/FilterRunner.php:
--------------------------------------------------------------------------------
1 | filters = $filters;
31 | }
32 |
/**
 * Passes the string, together with this runner's filters, to runFilters() and returns the outcome
 */
public function filter(string $str): string
{
    $filtered = $this->runFilters($str, $this->filters);

    return $filtered;
}
40 | }
41 |
--------------------------------------------------------------------------------
/src/Filter/MetaRefresh.php:
--------------------------------------------------------------------------------
1 | tagFinder = new TagFinder\ByTag('meta');
25 | $this->attrFinder = new AttributeFinder('http-equiv');
26 |
27 | $this->attributeContentCleaner = $attributeContentCleaner;
28 | }
29 |
/**
 * Removes refresh meta tags
 * @see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#META
 *
 * Every 'meta' tag located by the tag finder is passed to cleanTag(), which decides whether it is kept.
 *
 * @param string $str HTML to filter
 */
public function filter(string $str): string
{
    return $this->tagFinder->findTags(
        $str,
        fn ($fullTag, $attributes): string => $this->cleanTag($fullTag, $attributes)
    );
}
45 |
/**
 * Replaces the tag with an empty string if its 'http-equiv' is set to 'refresh'
 *
 * The attribute finder is used purely for inspection: every attribute it reports is handed back unchanged.
 *
 * @param string $fullTag the complete meta tag
 * @param string $attributes (e.g. 'meta http-equiv="refresh"')
 * @return string empty string for a refresh tag, otherwise $fullTag unchanged
 */
private function cleanTag(string $fullTag, string $attributes): string
{
    $refreshFound = false;

    $inspect = function ($full, $contents) use (&$refreshFound) {
        // Clean the content first so that encoded or exploded spellings of 'refresh' are still recognised
        $cleaned = $this->attributeContentCleaner->filter($contents);
        $refreshFound = $refreshFound || (bool) preg_match('/refresh/i', $cleaned);

        return $full;
    };
    $this->attrFinder->findAttributes($attributes, $inspect);

    return $refreshFound ? '' : $fullTag;
}
69 | }
70 |
--------------------------------------------------------------------------------
/src/Filter/RemoveAttributes.php:
--------------------------------------------------------------------------------
1 | tagFinder = new TagFinder\ByAttribute($attributes);
44 | $this->attributeFinder = new AttributeFinder($attributes);
45 | }
46 |
/**
 * Filter unwanted attributes from tags
 *
 * This includes event handler attributes ('onload', 'onclick' etc.): each tag the tag finder locates is
 * passed to removeAttribute().
 *
 * @param string $str HTML to filter
 */
public function filter(string $str): string
{
    return $this->tagFinder->findTags(
        $str,
        fn ($fullTag, $attributes): string => $this->removeAttribute($fullTag, $attributes)
    );
}
61 |
/**
 * Removes unwanted attributes from a particular tag
 *
 * Every attribute reported by the attribute finder is replaced with an empty string, and the resulting
 * attribute string is substituted back into the tag.
 *
 * @param string $fullTag the complete tag text
 * @param string $attributes (e.g. 'a onclick="alert('XSS');"')
 */
private function removeAttribute(string $fullTag, string $attributes): string
{
    $strippedAttributes = $this->attributeFinder->findAttributes(
        $attributes,
        static fn (): string => ''
    );

    return str_ireplace($attributes, $strippedAttributes, $fullTag);
}
76 | }
77 |
--------------------------------------------------------------------------------
/src/Filter/RemoveBlocks.php:
--------------------------------------------------------------------------------
1 | searchRegex = $this->initSearchRegex($tags);
22 | }
23 |
/**
 * Filter tags out of the HTML by removing whole blocks (from opening tag to closing tag)
 *
 * This filter should be used in conjunction with @see \Phlib\XssSanitizer\Filter\EscapeTags to ensure that any
 * tags which are not picked up will be escaped
 *
 * @param string $str HTML to filter
 * @return string the HTML with every match of the search regex removed
 */
public function filter(string $str): string
{
    return preg_replace($this->searchRegex, '', $str);
}
41 |
/**
 * Build the regex which matches a whole block: opening tag, content and the matching closing tag
 *
 * Group 1 captures the tag name so that the closing tag can be required to carry the same name. The closing
 * tag may be unterminated at the very end of the string.
 *
 * @param string|string[] $tags a list of tag names, or a single string which is inserted as given
 */
private function initSearchRegex($tags): string
{
    if (is_array($tags)) {
        // Tag names are literals: quote them so that regex metacharacters (and the '#' delimiter) in a
        // caller-supplied name can neither break nor alter the pattern
        $quoted = array_map(
            static fn ($tag): string => preg_quote((string) $tag, '#'),
            $tags
        );
        $tags = '(?:' . implode('|', $quoted) . ')';
    }
    // A plain string is deliberately left untouched, as before, in case a caller passes a pattern fragment

    return implode('', [
        '#',
        // open tag
        '<',
        '(', $tags, ')',
        '([^>]*?)',
        '>',
        // content
        '.*?',
        // closing tag: the '</' is required, otherwise any later occurrence of the tag name followed by
        // '>' would be taken for the end of the block
        '</',
        '\1',
        '([^>]*?)',
        '(>|$)',
        '#si',
    ]);
}
67 | }
68 |
--------------------------------------------------------------------------------
/src/FilterInterface.php:
--------------------------------------------------------------------------------
1 | applyEachFilter($str, $filters);
22 | } while ($pre !== $str);
23 |
24 | return $str;
25 | }
26 |
/**
 * Run the string through every filter once, in array order, feeding each filter the previous one's output
 *
 * @param FilterInterface[] $filters
 */
private function applyEachFilter(string $str, array $filters): string
{
    return array_reduce(
        $filters,
        static fn ($carry, $filter) => $filter->filter($carry),
        $str
    );
}
39 | }
40 |
--------------------------------------------------------------------------------
/src/Sanitizer.php:
--------------------------------------------------------------------------------
1 | removeBlocks = ['script', 'iframe', 'object', ...$removeBlocks];
26 | $this->removeAttributes = $removeAttributes;
27 | $this->initFilters();
28 | }
29 |
/**
 * Sanitize a single HTML string by passing it through the configured filters
 */
public function sanitize(string $str): string
{
    return $this->runFilters($str, $this->filters);
}
36 |
/**
 * Sanitize an array of HTML strings
 *
 * Keys are preserved; each value is replaced by its sanitized form.
 *
 * @param string[] $strings
 * @return string[]
 */
public function sanitizeArray(array $strings): array
{
    return array_map(
        fn ($html) => $this->sanitize($html),
        $strings
    );
}
51 |
/**
 * Set up the filter chain in the order the filters are applied
 *
 * A single attribute content cleaner instance is shared by all the filters which need one.
 */
private function initFilters(): void
{
    $contentCleaner = new Filter\AttributeContentCleaner();

    $this->filters = [
        new Filter\AttributeCleaner('href', $contentCleaner, ['a', 'link']),
        new Filter\AttributeCleaner('src', $contentCleaner, ['img', 'input', 'bgsound']),
        new Filter\AttributeCleaner('action', $contentCleaner, ['form']),
        new Filter\AttributeCleaner('background', $contentCleaner),
        // Keep trying to remove blocks before escaping the tags
        new Filter\FilterRunner(new Filter\RemoveBlocks($this->removeBlocks)),
        new Filter\EscapeTags($this->removeBlocks),
        new Filter\RemoveAttributes($this->removeAttributes),
        new Filter\MetaRefresh($contentCleaner),
    ];
}
69 | }
70 |
--------------------------------------------------------------------------------
/src/TagFinder/ByAttribute.php:
--------------------------------------------------------------------------------
1 | initialSearchRegex = $this->initInitialSearchRegex($attributes);
22 | }
23 |
/**
 * Given a full html string, finds the required tags by attribute and calls the callback,
 * providing the full tag string and the attributes string
 *
 * The search looks for the attribute first and then works outwards: the text before the match is searched
 * for the start of the tag and the text after it for the end. The callback's return value replaces the
 * full tag in the string. Matches for which no enclosing tag can be found are skipped.
 *
 * @param string $str HTML to search
 * @param callable $callback invoked as $callback($fullTag, $attributes)
 * @return string the HTML with every located tag replaced by the callback's result
 */
public function findTags(string $str, callable $callback): string
{
    $cursor = 0;

    while (preg_match($this->initialSearchRegex, $str, $hit, PREG_OFFSET_CAPTURE, $cursor) === 1) {
        [$attr, $attrPos] = $hit[0];
        $afterAttr = $attrPos + strlen($attr);

        // Unless a tag gets replaced below, the next search resumes right after this match
        $cursor = $afterAttr;

        $head = $this->findStartOfTag(substr($str, 0, $attrPos));
        if (!$head) {
            continue;
        }
        $tail = $this->findEndOfTag(substr($str, $afterAttr));
        if (!$tail) {
            continue;
        }

        // Index 0 of each part belongs to the full tag, index 1 to the attributes string
        $fullTag = $head[0] . $attr . $tail[0];
        $attributes = $head[1] . $attr . $tail[1];

        $replacement = $callback($fullTag, $attributes);

        $tagPos = $attrPos - strlen($head[0]);
        $str = substr_replace($str, $replacement, $tagPos, strlen($fullTag));

        // continue searching from after the end of the replaced tag
        $cursor = $tagPos + strlen($replacement);
    }

    return $str;
}
60 |
61 | /**
62 | * @param string|string[] $attributes
63 | */
64 | private function initInitialSearchRegex($attributes): string
65 | {
66 | if (is_array($attributes)) {
67 | $attributes = '(?:' . implode('|', $attributes) . ')';
68 | }
69 |
70 | return implode('', [
71 | '#',
72 | '(?]*', // https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Non-alpha-non-digit_XSS
75 | '=',
76 | '#si',
77 | ]);
78 | }
79 |
/**
 * Finds the start of the tag from the attribute found
 *
 * The text preceding the attribute is reversed so that a start-anchored
 * pattern can walk backwards from the attribute to the opening '<'.
 *
 * @return string[]|null index 0 is the text from '<' up to the attribute,
 *     index 1 is the captured text following the first character of the tag
 *     name; null when no tag start is found
 */
private function findStartOfTag(string $beforeStr): ?array
{
    // Searching backwards from the found attribute
    $reversed = strrev($beforeStr);
    if (preg_match('#^([^>]+)[a-z]<#si', $reversed, $found) !== 1) {
        return null;
    }

    // Both the whole match and the capture were taken from the reversed text
    return [strrev($found[0]), strrev($found[1])];
}
96 |
/**
 * Finds the end of the tag from the attribute found
 *
 * The text following the attribute must begin with at least one character
 * other than '>' and then a '>'.
 *
 * @return string[]|null index 0 is the text up to and including the closing
 *     '>', index 1 is the same text without the '>'; null when no tag end is
 *     found
 */
private function findEndOfTag(string $afterStr): ?array
{
    $found = [];

    return preg_match('#^([^>]+)>#si', $afterStr, $found) === 1 ? $found : null;
}
111 | }
112 |
--------------------------------------------------------------------------------
/src/TagFinder/ByTag.php:
--------------------------------------------------------------------------------
1 | searchRegex = $this->initSearchRegex($tags);
22 | }
23 |
/**
 * Given a full html string, finds the required tags by tag name and calls the
 * callback, providing the full tag string and the attributes string
 *
 * The return value of the callback is used to replace the full tag string.
 *
 * e.g. for a tag finder which is looking for an img tag,
 * for the string
 *     '<p><img src="something"></p>'
 * the callback would be called with
 *     $fullTag: '<img src="something">'
 *     $attributes: 'src="something"'
 * and the return from the callback would replace the $fullTag in the original string
 */
public function findTags(string $str, callable $callback): string
{
    // Index 0 is the whole matched tag, index 1 the captured attributes text
    $rewriteTag = static fn (array $match) => $callback($match[0], $match[1]);

    return preg_replace_callback($this->searchRegex, $rewriteTag, $str);
}
48 |
/**
 * Builds the pattern used to locate tags by name
 *
 * A list of tags is joined into one non-capturing alternation; a single
 * string is inserted into the pattern unchanged. The pattern matches '<', the
 * tag, at least one character that is not a letter, digit or '>', and then
 * captures everything up to the next '>' or the end of the string. It is
 * case-insensitive and lets '.' match newlines.
 *
 * @param string|string[] $tags
 */
private function initSearchRegex($tags): string
{
    $tagPattern = is_array($tags)
        ? '(?:' . implode('|', $tags) . ')'
        : $tags;

    return '#<' . $tagPattern . '[^a-z0-9>]+([^>]*)(?:>|$)' . '#si';
}
64 | }
65 |
--------------------------------------------------------------------------------
/src/TagFinderInterface.php:
--------------------------------------------------------------------------------
1 |
'
23 | * $attributes: ' src="something"'
24 | * and the return from the callback would replace the $fullTag in the original string
25 | */
26 | public function findTags(string $str, callable $callback): string;
27 | }
28 |
--------------------------------------------------------------------------------