├── .formatter.exs ├── .github └── workflows │ ├── bugfix-reproducer.yml │ └── ci-workflow.yml ├── .gitignore ├── .tool-versions ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── lib ├── html_sanitize_ex.ex └── html_sanitize_ex │ ├── parser.ex │ ├── scrubber.ex │ ├── scrubber │ ├── basic_html.ex │ ├── css.ex │ ├── html5.ex │ ├── markdown_html.ex │ ├── meta.ex │ ├── no_scrub.ex │ └── strip_tags.ex │ └── traverser.ex ├── mix.exs ├── mix.lock └── test ├── basic_html_test.exs ├── css_test.exs ├── custom_scrubber_test.exs ├── html5_test.exs ├── html5_test_data_uri ├── html_sanitize_ex_test.exs ├── markdown_html_test.exs ├── no_scrub_test.exs ├── strip_tags_test.exs ├── test_helper.exs ├── test_if_tests_fail_after_resetting_lib.sh └── traverser_test.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" and to export configuration. 2 | export_locals_without_parens = [] 3 | 4 | [ 5 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"], 6 | locals_without_parens: export_locals_without_parens, 7 | export: [locals_without_parens: export_locals_without_parens], 8 | line_length: 80 9 | ] 10 | -------------------------------------------------------------------------------- /.github/workflows/bugfix-reproducer.yml: -------------------------------------------------------------------------------- 1 | name: "Reproducing Test-Case Detector (experimental)" 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '**_test.exs' 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | name: "Test for lib/ changes" 12 | strategy: 13 | matrix: 14 | otp: [22.2] 15 | elixir: [1.10.4] 16 | steps: 17 | - uses: actions/checkout@v2.3.1 18 | 19 | - uses: erlef/setup-beam@v1 20 | with: 21 | otp-version: ${{matrix.otp}} 22 | elixir-version: ${{matrix.elixir}} 23 | 24 | - run: git fetch origin master:master 25 | 26 | - name: Check changes to lib/ 27 | id: check_changes 28 | run: echo "::set-output 
name=changes_to_lib::$(git diff --name-only master | grep "^lib")" 29 | 30 | - name: There are changes to lib/ 31 | if: "contains(steps.check_changes.outputs.changes_to_lib, 'lib')" 32 | run: | 33 | mix deps.get 34 | sh test/test_if_tests_fail_after_resetting_lib.sh 35 | 36 | - name: There are no changes to lib/ 37 | if: "!contains(steps.check_changes.outputs.changes_to_lib, 'lib')" 38 | run: echo "${{ toJSON(steps.check_changes.outputs.changes_to_lib) }}" 39 | -------------------------------------------------------------------------------- /.github/workflows/ci-workflow.yml: -------------------------------------------------------------------------------- 1 | name: "CI Tests" 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - release/* 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-18.04 14 | name: "[${{matrix.otp}}/${{matrix.elixir}}] CI Tests on Credo [OTP/Elixir]" 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | otp: [20.3, 21.3, 22.3, 23.3, 24.0, 25.1] 19 | elixir: [1.7.4, 1.8.2, 1.9.4, 1.10.4, 1.11.4, 1.12.2, 1.14.1] 20 | exclude: 21 | - otp: 25.1 22 | elixir: 1.7.4 23 | - otp: 25.1 24 | elixir: 1.8.2 25 | - otp: 25.1 26 | elixir: 1.9.4 27 | - otp: 25.1 28 | elixir: 1.10.4 29 | - otp: 25.1 30 | elixir: 1.11.4 31 | - otp: 25.1 32 | elixir: 1.12.2 33 | - otp: 24.0 34 | elixir: 1.7.4 35 | - otp: 24.0 36 | elixir: 1.8.2 37 | - otp: 24.0 38 | elixir: 1.9.4 39 | - otp: 24.0 40 | elixir: 1.10.4 41 | - otp: 23.3 42 | elixir: 1.7.4 43 | - otp: 23.3 44 | elixir: 1.8.2 45 | - otp: 23.3 46 | elixir: 1.9.4 47 | - otp: 22.3 48 | elixir: 1.14.1 49 | - otp: 21.3 50 | elixir: 1.12.2 51 | - otp: 21.3 52 | elixir: 1.14.1 53 | - otp: 20.3 54 | elixir: 1.10.4 55 | - otp: 20.3 56 | elixir: 1.11.4 57 | - otp: 20.3 58 | elixir: 1.12.2 59 | - otp: 20.3 60 | elixir: 1.14.1 61 | steps: 62 | - uses: actions/checkout@v2.3.1 63 | with: 64 | fetch-depth: 0 65 | - uses: erlef/setup-beam@v1 66 | with: 67 | otp-version: ${{matrix.otp}} 68 | 
elixir-version: ${{matrix.elixir}} 69 | - run: mix deps.get 70 | - run: mix deps.compile 71 | - run: mix compile --warnings-as-errors 72 | - run: mix test 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.elixir_ls 2 | /_build 3 | /deps 4 | /docs/all.json 5 | /doc 6 | test.json 7 | erl_crash.dump 8 | *.ez 9 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | erlang 24.2 2 | elixir 1.13.4-otp-24 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: elixir 3 | elixir: 4 | - 1.5.3 5 | - 1.6.6 6 | - 1.7.2 7 | - 1.8.2 8 | - 1.9.4 9 | otp_release: 10 | - 19.3 11 | - 20.3 12 | - 21.3 13 | - 22.0 14 | script: 15 | - mix deps.compile 16 | - mix compile --warnings-as-errors 17 | - mix test 18 | matrix: 19 | exclude: 20 | - elixir: 1.5.3 21 | otp_release: 21.3 22 | - elixir: 1.5.3 23 | otp_release: 22.0 24 | - elixir: 1.6.6 25 | otp_release: 22.0 26 | - elixir: 1.8.2 27 | otp_release: 19.3 28 | - elixir: 1.9.4 29 | otp_release: 19.3 30 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.4.3 4 | 5 | - Allow `mochiweb` dep to be `~> 2.15 or ~> 3.1` 6 | 7 | ## 1.4.2 8 | 9 | - Fix regression when parsing schemes from URIs 10 | - Fix compiler warnings 11 | - Add missing `` tag to `HTML5` scrubber 12 | 13 | ## 1.4.1 14 | 15 | - Add missing `
` tag to `BasicHTML` and `MarkdownHTML` scrubbers 16 | 17 | ## 1.4.0 18 | 19 | - Add more missing HTML5 attributes 20 | - Add "middle" to valid CSS keywords 21 | 22 | ## 1.3.0 23 | 24 | - Add valid scheme for links: `mailto` 25 | - Update white-space handling in order to keep more of it untouched 26 | 27 | ## 1.2.0 28 | 29 | - Update `mochiweb` version requirement 30 | - Fix missing elements in HTML5: div, caption 31 | 32 | ## 1.1.1 33 | 34 | - Fix missing element in HTML5: blockquote 35 | 36 | ## 1.1.0 37 | 38 | - Add new scrubber: MarkdownHTML 39 | 40 | It is meant to scrub HTML that resulted from converting Markdown to HTML. It 41 | supports GitHub flavored Markdown (GFM). 42 | 43 | ## 1.0.1 44 | 45 | - Fix Elixir 1.3 compiler warnings 46 | 47 | ## 1.0.0 48 | 49 | - First release 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 René Föhring 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HtmlSanitizeEx [![Build Status](https://travis-ci.org/rrrene/html_sanitize_ex.svg)](https://travis-ci.org/rrrene/html_sanitize_ex) [![Inline docs](http://inch-ci.org/github/rrrene/html_sanitize_ex.svg?branch=master)](http://inch-ci.org/github/rrrene/html_sanitize_ex) 2 | 3 | `html_sanitize_ex` provides a fast and straightforward HTML Sanitizer written in Elixir which lets you include HTML authored by third parties in your web application while protecting against XSS. 4 | 5 | It is the first Hex package to come out of the [elixirstatus.com](http://elixirstatus.com) project, where it will be used to sanitize user announcements from the Elixir community. 6 | 7 | 8 | 9 | ## What can it do? 10 | 11 | `html_sanitize_ex` parses a given HTML string and, based on the [Scrubber](https://github.com/rrrene/html_sanitize_ex/tree/master/lib/html_sanitize_ex/scrubber) used, either strips all HTML tags from it or sanitizes it by only allowing certain HTML elements and attributes to be present. 12 | 13 | **NOTE:** The one thing missing at this moment is ***support for styles***. To add this, we have to implement a Scrubber for CSS, to prevent nasty CSS hacks using `" 38 | assert input == full_html_sanitize(input) 39 | end 40 | 41 | test "handles bad css" do 42 | input = 43 | "" 44 | 45 | expected = "" 46 | assert expected == full_html_sanitize(input) 47 | end 48 | 49 | test "handles bad css in style attribute" do 50 | input = 51 | "

hello code!

" 52 | 53 | expected = 54 | "

hello code!

" 55 | 56 | assert expected == full_html_sanitize(input) 57 | end 58 | 59 | test "strips everything except the allowed tags (for multiple tags)" do 60 | input = 61 | "

hello

" 62 | 63 | expected = "
code!

hello code!

" 64 | assert expected == full_html_sanitize(input) 65 | end 66 | 67 | test "does not strip caption from tables" do 68 | input = 69 | "
This is a table
" 70 | 71 | expected = 72 | "
This is a table
" 73 | 74 | assert expected == full_html_sanitize(input) 75 | end 76 | 77 | test "does not strip divs" do 78 | input = ~s(
Hello
) 79 | expected = ~s(
Hello
) 80 | assert expected == full_html_sanitize(input) 81 | end 82 | 83 | test "does not strip the mailto URI scheme" do 84 | input = ~s(Email Us) 85 | expected = ~s(Email Us) 86 | assert expected == full_html_sanitize(input) 87 | end 88 | 89 | test "does encode script in textarea, but preserves white-space" do 90 | input = ~s() 91 | expected = ~s() 92 | assert expected == full_html_sanitize(input) 93 | end 94 | 95 | test "does not contain replacement characters in result" do 96 | input = ~s[

Hi

] 97 | expected = ~s[alert()

Hi

] 98 | assert expected == full_html_sanitize(input) 99 | end 100 | 101 | test "does not strip valid html5 attributes from " do 102 | input = 103 | ~s[] 104 | 105 | assert input == full_html_sanitize(input) 106 | end 107 | 108 | test "does not strip valid html5 attributes srcset and sizes from " do 109 | input = 110 | ~s[] 111 | 112 | assert input == full_html_sanitize(input) 113 | end 114 | 115 | test "does not strip any header tags" do 116 | input = """ 117 |

Header 1

118 |

Header 2

119 |

Header 3

120 |

Header 4

121 |
Header 5
122 |
Header 6
123 | """ 124 | 125 | assert input == full_html_sanitize(input) 126 | end 127 | 128 | test "make sure a very long URI is truncated before capturing URI scheme" do 129 | input = 130 | "" 131 | 132 | assert "" == full_html_sanitize(input) 133 | end 134 | end 135 | -------------------------------------------------------------------------------- /test/html_sanitize_ex_test.exs: -------------------------------------------------------------------------------- 1 | defmodule HtmlSanitizeExTest do 2 | use ExUnit.Case, async: true 3 | 4 | test "strips all the tags" do 5 | input = 6 | "hello!

hello

" 7 | 8 | assert "hello! code!hello code!" == HtmlSanitizeEx.strip_tags(input) 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /test/markdown_html_test.exs: -------------------------------------------------------------------------------- 1 | defmodule HtmlSanitizeExScrubberMarkdownHTMLTest do 2 | use ExUnit.Case, async: true 3 | 4 | defp sanitize(text) do 5 | text |> HtmlSanitizeEx.Scrubber.scrub(HtmlSanitizeEx.Scrubber.MarkdownHTML) 6 | end 7 | 8 | test "strips nothing" do 9 | input = "This is an example of space eating." 10 | expected = "This is an example of space eating." 11 | assert expected == sanitize(input) 12 | end 13 | 14 | test "does NOT strip language class from code tag" do 15 | input = "Something.new" 16 | assert input == sanitize(input) 17 | end 18 | 19 | test "strips everything except the allowed tags" do 20 | input = "

hello

" 21 | expected = "

hello code!

" 22 | assert expected == sanitize(input) 23 | end 24 | 25 | test "allows target=_blank inside " do 26 | input = 27 | ~S(hello world) 28 | 29 | expected = 30 | ~S(hello world) 31 | 32 | assert expected == sanitize(input) 33 | end 34 | 35 | test "disallows anything else for target= inside " do 36 | input = 37 | ~S(hello world) 38 | 39 | expected = ~S(hello world) 40 | assert expected == sanitize(input) 41 | end 42 | 43 | test "allows rel=noopener inside " do 44 | input = 45 | ~S(hello world) 46 | 47 | expected = 48 | ~S(hello world) 49 | 50 | assert expected == sanitize(input) 51 | end 52 | 53 | test "allows rel=noreferrer inside " do 54 | input = 55 | ~S(hello world) 56 | 57 | expected = 58 | ~S(hello world) 59 | 60 | assert expected == sanitize(input) 61 | end 62 | 63 | test "disallows anything else for rel= inside " do 64 | input = 65 | ~S(hello world) 66 | 67 | expected = 68 | ~S(hello world) 69 | 70 | assert expected == sanitize(input) 71 | end 72 | 73 | test "strips everything except the allowed tags (for multiple tags)" do 74 | input = 75 | "

hello

" 76 | 77 | expected = "code!

hello code!

" 78 | assert expected == sanitize(input) 79 | end 80 | 81 | test "strips everything for faulty allowed_tags: key" do 82 | input = "

hello

" 83 | expected = "hello" 84 | assert expected != sanitize(input) 85 | end 86 | 87 | test "strips invalid html" do 88 | input = "<< hi" 95 | assert "<\" hi" == sanitize(input) 96 | end 97 | 98 | test "strips nested tags" do 99 | input = "Wei<a onclick='alert(document.cookie);'/>rdos" 100 | expected = "Wei<a onclick='alert(document.cookie);'/>rdos" 101 | assert expected == sanitize(input) 102 | end 103 | 104 | test "strips certain tags in multi line strings" do 105 | input = 106 | "This is <b>a <a href=\"\" target=\"_top\">test</a></b>.\n\n\n\n

It no longer contains any HTML.

\n" 107 | 108 | expected = 109 | "This is a test.\n\n\n\n

It no longer contains any HTML.

\n" 110 | 111 | assert expected == sanitize(input) 112 | end 113 | 114 | test "strips blank string" do 115 | assert "" == sanitize("") 116 | assert "" == sanitize(" ") 117 | assert "" == sanitize(nil) 118 | end 119 | 120 | test "strips nothing from plain text" do 121 | input = "Dont touch me" 122 | expected = "Dont touch me" 123 | assert expected == sanitize(input) 124 | end 125 | 126 | test "strips nothing from a sentence" do 127 | input = "This is a test." 128 | expected = "This is a test." 129 | assert expected == sanitize(input) 130 | end 131 | 132 | test "strips tags with comment" do 133 | input = "This has a here." 134 | expected = "This has a here." 135 | assert expected == sanitize(input) 136 | end 137 | 138 | test "strip_tags escapes special characters" do 139 | assert "&amp;" == sanitize("&") 140 | end 141 | 142 | # link sanitizer 143 | 144 | test "test_strip_links_with_tags_in_tags" do 145 | input = "<a href='hello'>all day long</a>" 146 | expected = "<a href='hello'>all day long</a>" 147 | assert expected == sanitize(input) 148 | end 149 | 150 | test "test_strip_links_with_unclosed_tags" do 151 | assert "" == sanitize("on my mind all day long" == 163 | sanitize(input) 164 | end 165 | 166 | @tag href_scrubbing: true 167 | test "test_strip_links_leaves_nonlink_tags" do 168 | assert "My mind\nall day long" == 169 | sanitize( 170 | "My mind\nall day long" 171 | ) 172 | end 173 | 174 | @tag href_scrubbing: true 175 | test "strips tags with sanitize/1" do 176 | input = 177 | "

This is a test.

" 178 | 179 | assert "

This is a test.

" == 180 | sanitize(input) 181 | end 182 | 183 | @a_href_hacks [ 184 | "text here", 185 | "text here", 186 | "text here", 187 | "text here", 188 | "text here", 189 | "text here", 190 | "text here", 191 | "text here", 192 | "text here", 193 | "text here", 194 | "text here", 195 | "text here", 196 | "text here", 197 | "text here", 198 | "text here", 199 | "text here", 200 | "text here", 201 | "text here", 202 | "text here", 203 | "text here", 204 | "text here", 205 | "script:alert(\'foo\')\">text here", 206 | "text here", 207 | "text here", 208 | "text here", 209 | "text here" 210 | ] 211 | 212 | @tag href_scrubbing: true 213 | test "strips malicious protocol hacks from a href attribute" do 214 | expected = "text here" 215 | Enum.each(@a_href_hacks, fn x -> assert expected == sanitize(x) end) 216 | end 217 | 218 | @tag href_scrubbing: true 219 | test "does not strip x03a legitimate" do 220 | assert "" == 221 | sanitize("") 222 | 223 | assert "" == 224 | sanitize("") 225 | end 226 | 227 | test "test_strip links with links" do 228 | input = 229 | "0wn3d" 230 | 231 | assert "0wn3d" == 232 | sanitize(input) 233 | end 234 | 235 | test "test_strip_links_with_linkception" do 236 | assert "Magic" == 237 | sanitize( 238 | "Magic" 239 | ) 240 | end 241 | 242 | test "test_strip_links_with_a_tag_in_href" do 243 | assert "FrrFox" == sanitize("FrrFox") 244 | end 245 | 246 | test "normal scrubbing does only allow certain tags and attributes" do 247 | input = "<span data-foo=\"bar\">foo</span></plaintext>" 248 | expected = "<span>foo</span>" 249 | assert expected == sanitize(input) 250 | end 251 | 252 | test "strips not allowed attributes" do 253 | input = 254 | "start <a title=\"1\" onclick=\"foo\">foo <bad>bar</bad> baz</a> end" 255 | 256 | expected = "start <a title=\"1\">foo bar baz</a> end" 257 | assert expected == sanitize(input) 258 | end 259 | 260 | test "sanitize_script" do 261 | assert "a b cblah blah blahd e f" == 262 | sanitize( 263 | "a b c<script 
language=\"Javascript\">blah blah blah</script>d e f" 264 | ) 265 | end 266 | 267 | @tag href_scrubbing: true 268 | test "sanitize_js_handlers" do 269 | input = 270 | ~s(onthis="do that" <a href="#" onclick="hello" name="foo" onbogus="remove me">hello</a>) 271 | 272 | assert "onthis=\"do that\" <a href=\"#\" name=\"foo\">hello</a>" == 273 | sanitize(input) 274 | end 275 | 276 | test "sanitize_javascript_href" do 277 | raw = 278 | ~s(href="javascript:bang" <a href="javascript:bang" name="hello">foo</a>, <span href="javascript:bang">bar</span>) 279 | 280 | assert ~s(href="javascript:bang" <a name="hello">foo</a>, <span>bar</span>) == 281 | sanitize(raw) 282 | end 283 | 284 | test "sanitize_image_src" do 285 | raw = 286 | ~s(src="javascript:bang" <img src="javascript:bang" width="5">foo</img>, <span src="javascript:bang">bar</span>) 287 | 288 | assert "src=\"javascript:bang\" <img width=\"5\" />, <span>bar</span>" == 289 | sanitize(raw) 290 | end 291 | 292 | @tag href_scrubbing: true 293 | test "should only allow http/https protocols" do 294 | assert "<a href=\"foo\">baz</a>" == 295 | sanitize(~s(<a href="foo" onclick="bar"><script>baz</script></a>)) 296 | 297 | assert "<a href=\"http://example.com\">baz</a>" == 298 | sanitize( 299 | ~s(<a href="http://example.com" onclick="bar"><script>baz</script></a>) 300 | ) 301 | 302 | assert "<a href=\"https://example.com\">baz</a>" == 303 | sanitize( 304 | ~s(<a href="https://example.com" onclick="bar"><script>baz</script></a>) 305 | ) 306 | end 307 | 308 | # test "video_poster_sanitization" do 309 | # assert ~s(<video src="videofile.ogg" autoplay poster="posterimage.jpg"></video>) == ~s(<video src="videofile.ogg" poster="posterimage.jpg"></video>) 310 | # assert ~s(<video src="videofile.ogg"></video>) == sanitize("<video src=\"videofile.ogg\" poster=javascript:alert(1)></video>") 311 | # end 312 | 313 | test "strips not allowed tags " do 314 | input = "<form><u></u></form>" 315 | expected = "<u></u>" 316 | assert expected == 
sanitize(input) 317 | end 318 | 319 | test "strips not allowed attributes " do 320 | input = "<a foo=\"hello\" bar=\"world\"></a>" 321 | expected = "<a></a>" 322 | assert expected == sanitize(input) 323 | end 324 | 325 | @image_src_hacks [ 326 | "<IMG SRC=\"javascript:alert('XSS');\">", 327 | "<IMG SRC=javascript:alert('XSS')>", 328 | "<IMG SRC=JaVaScRiPt:alert('XSS')>", 329 | "<IMG SRC=javascript:alert(&quot;XSS&quot;)>", 330 | "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", 331 | "<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>", 332 | "<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>", 333 | "<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>", 334 | "<IMG SRC=\"jav\tascript:alert('XSS');\">", 335 | "<IMG SRC=\"jav&#x09;ascript:alert('XSS');\">", 336 | "<IMG SRC=\"jav&#x0A;ascript:alert('XSS');\">", 337 | "<IMG SRC=\"jav&#x0D;ascript:alert('XSS');\">", 338 | "<IMG SRC=\" &#14; javascript:alert('XSS');\">", 339 | "<IMG SRC=\"javascript&#x3a;alert('XSS');\">", 340 | "<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>" 341 | ] 342 | 343 | test "strips malicious protocol hacks from img src attribute" do 344 | expected = "<img />" 345 | Enum.each(@image_src_hacks, fn x -> assert expected == sanitize(x) end) 346 | end 347 | 348 | test "strips script tag" do 349 | input = "<SCRIPT\nSRC=http://ha.ckers.org/xss.js></SCRIPT>" 350 | expected = "" 351 | assert expected == sanitize(input) 352 | end 353 | 354 | test "strips xss image hack with uppercase tags" do 355 | input = "<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\">" 356 | expected = "<img />alert(\"XSS\")\"&gt;" 357 | assert expected == sanitize(input) 358 | end 359 | 360 | 
test "should_sanitize_tag_broken_up_by_null" do 361 | assert "alert(\"XSS\")" == sanitize("<SCR\0IPT>alert(\"XSS\")</SCR\0IPT>") 362 | end 363 | 364 | test "should_sanitize_invalid_script_tag" do 365 | input = "<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>" 366 | assert "" == sanitize(input) 367 | end 368 | 369 | test "should_sanitize_unclosed_script" do 370 | input = "<SCRIPT SRC=http://ha.ckers.org/xss.js?<B>" 371 | assert "" == sanitize(input) 372 | end 373 | 374 | test "sanitize half open scripts" do 375 | input = "<IMG SRC=\"javascript:alert('XSS')\"" 376 | assert "<img />" == sanitize(input) 377 | end 378 | 379 | test "should_not_fall_for_ridiculous_hack" do 380 | img_hack = """ 381 | <IMG\nSRC\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n>) 382 | """ 383 | 384 | assert "<img />)\n" == sanitize(img_hack) 385 | end 386 | 387 | test "should_sanitize_within attributes" do 388 | input = 389 | "<span title=\"&#39;&gt;&lt;script&gt;alert()&lt;/script&gt;\">blah</span>" 390 | 391 | assert "<span>blah</span>" == sanitize(input) 392 | end 393 | 394 | test "should_sanitize_invalid_tag_names" do 395 | end 396 | 397 | test "should_sanitize_non_alpha_and_non_digit_characters_in_tags" do 398 | assert "<a></a>foo" == 399 | sanitize( 400 | "<a onclick!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>" 401 | ) 402 | end 403 | 404 | test "should_sanitize_invalid_tag_names_in_single_tags" do 405 | assert "<img />" == sanitize("<img/src=\"http://ha.ckers.org/xss.js\"/>") 406 | end 407 | 408 | test "should_sanitize_img_dynsrc_lowsrc" do 409 | assert "<img />" == sanitize("<img lowsrc=\"javascript:alert('XSS')\" />") 410 | end 411 | 412 | test "should_sanitize_img_vbscript" do 413 | assert "<img />" == sanitize("<img src='vbscript:msgbox(\"XSS\")' />") 414 | end 415 | 416 | @tag cdata: true 417 | test "should_sanitize_cdata_section" do 418 | assert "<span>section</span>]]&gt;" == 419 | sanitize("<![CDATA[<span>section</span>]]>") 420 | 
end 421 | 422 | @tag cdata: true 423 | test "should_sanitize_cdata_section like any other" do 424 | assert "section]]&gt;" == sanitize("<![CDATA[<script>section</script>]]>") 425 | end 426 | 427 | @tag cdata: true 428 | test "should_sanitize_unterminated_cdata_section" do 429 | assert "<span>neverending...</span>" == 430 | sanitize("<![CDATA[<span>neverending...") 431 | end 432 | 433 | @tag cdata: true 434 | test "strips CDATA" do 435 | input = "This has a <![CDATA[<section>]]> here." 436 | expected = "This has a ]]&gt; here." 437 | assert expected == sanitize(input) 438 | end 439 | 440 | test "should_not_mangle_urls_with_ampersand" do 441 | input = "<a href=\"http://www.domain.com?var1=1&amp;var2=2\">my link</a>" 442 | assert input == sanitize(input) 443 | end 444 | 445 | test "should_sanitize_neverending_attribute" do 446 | assert "<span></span>" == sanitize("<span class=\"\\") 447 | end 448 | 449 | # test "this affects only NS4, but we're on a roll, right?" do 450 | # input = "<div size=\"&{alert('XSS')}\">foo</div>" 451 | # expected = "<div>foo</div>" 452 | # assert expected == sanitize(input) 453 | # end 454 | 455 | test "does not strip the mailto URI scheme" do 456 | input = ~s(<a href="mailto:someone@yoursite.com">Email Us</a>) 457 | expected = ~s(<a href="mailto:someone@yoursite.com">Email Us</a>) 458 | assert expected == sanitize(input) 459 | end 460 | 461 | test "does not strip any header tags" do 462 | input = """ 463 | <h1>Header 1</h1> 464 | <h2>Header 2</h2> 465 | <h3>Header 3</h3> 466 | <h4>Header 4</h4> 467 | <h5>Header 5</h5> 468 | <h6>Header 6</h6> 469 | """ 470 | 471 | assert input == sanitize(input) 472 | end 473 | end 474 | -------------------------------------------------------------------------------- /test/no_scrub_test.exs: -------------------------------------------------------------------------------- 1 | defmodule HtmlSanitizeExScrubberNoScrubTest do 2 | use ExUnit.Case, async: true 3 | 4 | defp no_scrub_sanitize(text) do 5 | 
HtmlSanitizeEx.noscrub(text) 6 | end 7 | 8 | test "strips nothing" do 9 | input = "This <b>is</b> <b>an</b> <i>example</i> of <u>space</u> eating." 10 | assert input == no_scrub_sanitize(input) 11 | end 12 | 13 | test "leaves white-space between nodes intact" do 14 | input = 15 | "This <b>is</b>\n<b>an</b> <i>example</i> of\n\n<u>space</u> eating." 16 | 17 | assert input == no_scrub_sanitize(input) 18 | end 19 | 20 | test "leaves white-space between nodes intact (CR)" do 21 | input = 22 | "This <b>is</b>\n<b>an</b> <i>example</i> of\r\n\r\n<u>space</u> eating." 23 | 24 | assert input == no_scrub_sanitize(input) 25 | end 26 | 27 | test "leaves white-space between nodes intact (tabs)" do 28 | input = 29 | "This <b>is</b>\t<b>an</b> <i>example</i> of\t\t<u>space</u> eating." 30 | 31 | assert input == no_scrub_sanitize(input) 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/strip_tags_test.exs: -------------------------------------------------------------------------------- 1 | defmodule HtmlSanitizeExScrubberStripTagsTest do 2 | use ExUnit.Case, async: true 3 | 4 | defp strip_tags(text) do 5 | HtmlSanitizeEx.strip_tags(text) 6 | end 7 | 8 | test "strips everything except the allowed tags (for multiple tags)" do 9 | input = 10 | "<section><header><script>code!</script></header><p>hello <script>code!</script></p></section>" 11 | 12 | expected = "code!hello code!" 
13 | assert expected == strip_tags(input) 14 | end 15 | 16 | test "strips everything" do 17 | input = "<h1>hello<h1>" 18 | expected = "hello" 19 | assert expected == strip_tags(input) 20 | end 21 | 22 | test "strips invalid html" do 23 | input = "<<<bad html" 24 | expected = "&lt;&lt;" 25 | assert expected == strip_tags(input) 26 | end 27 | 28 | test "strips tags with quote" do 29 | input = "<\" <img src=\"trollface.gif\" onload=\"alert(1)\"> hi" 30 | assert "&lt;\" hi" == strip_tags(input) 31 | end 32 | 33 | test "strips nested tags" do 34 | input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos" 35 | expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos" 36 | assert expected == strip_tags(input) 37 | end 38 | 39 | test "strips tags in multi line strings" do 40 | input = 41 | "<title>This is <b>a <a href=\"\" target=\"_blank\">test</a></b>.</title>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n" 42 | 43 | expected = "This is a test.\n\n\n\nIt no longer contains any HTML.\n" 44 | assert expected == strip_tags(input) 45 | end 46 | 47 | test "strips comments" do 48 | assert "This is &lt;-- not\n a comment here." == 49 | strip_tags("This is <-- not\n a comment here.") 50 | end 51 | 52 | test "strips blank string" do 53 | assert "" == strip_tags("") 54 | assert "" == strip_tags(" ") 55 | assert "" == strip_tags(nil) 56 | end 57 | 58 | test "strips nothing from plain text" do 59 | input = "Dont touch me" 60 | expected = "Dont touch me" 61 | assert expected == strip_tags(input) 62 | end 63 | 64 | test "strips tags with many open quotes" do 65 | assert "&lt;&lt;" == strip_tags("<<<bad html>") 66 | end 67 | 68 | test "strips nothing from a sentence" do 69 | input = "This is a test." 70 | expected = "This is a test." 71 | assert expected == strip_tags(input) 72 | end 73 | 74 | test "strips tags with comment" do 75 | input = "This has a <!-- comment --> here." 
76 | expected = "This has a here." 77 | assert expected == strip_tags(input) 78 | end 79 | 80 | test "strip_tags escapes special characters" do 81 | assert "&amp;" == strip_tags("&") 82 | end 83 | 84 | # link sanitizer 85 | 86 | test "test_strip_links_with_tags_in_tags" do 87 | input = "<<a>a href='hello'>all <b>day</b> long<</A>/a>" 88 | expected = "&lt;a href='hello'&gt;all day long&lt;/a&gt;" 89 | assert expected == strip_tags(input) 90 | end 91 | 92 | test "test_strip_links_with_unclosed_tags" do 93 | assert "" == strip_tags("<a<a") 94 | end 95 | 96 | test "test_strip_links_with_plaintext" do 97 | assert "Dont touch me" == strip_tags("Dont touch me") 98 | end 99 | 100 | @a_href_hacks [ 101 | "<a href=\"javascript:alert('XSS');\">text here</a>", 102 | "<a href=javascript:alert('XSS')>text here</a>", 103 | "<a href=JaVaScRiPt:alert('XSS')>text here</a>", 104 | "<a href=javascript:alert(&quot;XSS&quot;)>text here</a>", 105 | "<a href=javascript:alert(String.fromCharCode(88,83,83))>text here</a>", 106 | "<a href=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>text here</a>", 107 | "<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>text here</a>", 108 | "<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>text here</a>", 109 | "<a href=\"jav\tascript:alert('XSS');\">text here</a>", 110 | "<a href=\"jav&#x09;ascript:alert('XSS');\">text here</a>", 111 | "<a href=\"jav&#x0A;ascript:alert('XSS');\">text here</a>", 112 | "<a href=\"jav&#x0D;ascript:alert('XSS');\">text here</a>", 113 | "<a href=\" &#14; javascript:alert('XSS');\">text here</a>", 114 | "<a href=\"javascript&#x3a;alert('XSS');\">text here</a>", 115 | "<a href=`javascript:alert(\"RSnake says, 
'XSS'\")`>text here</a>", 116 | "<a href=\"javascript&#x3a;alert('XSS');\">text here</a>", 117 | "<a href=\"javascript&#x003a;alert('XSS');\">text here</a>", 118 | "<a href=\"javascript&#x3A;alert('XSS');\">text here</a>", 119 | "<a href=\"javascript&#x003A;alert('XSS');\">text here</a>", 120 | "<a href=\"&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;\">text here</a>", 121 | "<a href=\"JAVASCRIPT:alert(\'foo\')\">text here</a>", 122 | "<a href=\"java<!-- -->script:alert(\'foo\')\">text here</a>", 123 | "<a href=\"awesome.html#this:stuff\">text here</a>", 124 | "<a href=\"java\0&#14;\t\r\n script:alert(\'foo\')\">text here</a>", 125 | "<a href=\"java&#0000001script:alert(\'foo\')\">text here</a>", 126 | "<a href=\"java&#0000000script:alert(\'foo\')\">text here</a>" 127 | ] 128 | 129 | @tag href_scrubbing: true 130 | test "strips malicious protocol hacks from a href attribute" do 131 | expected = "text here" 132 | Enum.each(@a_href_hacks, fn x -> assert expected == strip_tags(x) end) 133 | end 134 | 135 | test "test_strip links with links" do 136 | input = 137 | "<a href='http://www.rubyonrails.com/'><a href='http://www.rubyonrails.com/' onlclick='steal()'>0wn3d</a></a>" 138 | 139 | assert "0wn3d" == strip_tags(input) 140 | end 141 | 142 | test "test_strip_links_with_a_tag_in_href" do 143 | assert "FrrFox" == strip_tags("<href onlclick='steal()'>FrrFox</a></href>") 144 | end 145 | 146 | test "normal scrubbing does only allow certain tags and attributes" do 147 | input = "<plaintext><span data-foo=\"bar\">foo</span></plaintext>" 148 | expected = "foo" 149 | assert expected == strip_tags(input) 150 | end 151 | 152 | @image_src_hacks [ 153 | "<IMG SRC=\"javascript:alert('XSS');\">", 154 | "<IMG SRC=javascript:alert('XSS')>", 155 | "<IMG SRC=JaVaScRiPt:alert('XSS')>", 156 | "<IMG SRC=javascript:alert(&quot;XSS&quot;)>", 157 | "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", 158 | 
"<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>", 159 | "<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>", 160 | "<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>", 161 | "<IMG SRC=\"jav\tascript:alert('XSS');\">", 162 | "<IMG SRC=\"jav&#x09;ascript:alert('XSS');\">", 163 | "<IMG SRC=\"jav&#x0A;ascript:alert('XSS');\">", 164 | "<IMG SRC=\"jav&#x0D;ascript:alert('XSS');\">", 165 | "<IMG SRC=\" &#14; javascript:alert('XSS');\">", 166 | "<IMG SRC=\"javascript&#x3a;alert('XSS');\">", 167 | "<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>" 168 | ] 169 | 170 | test "strips malicious protocol hacks from img src attribute" do 171 | expected = "" 172 | Enum.each(@image_src_hacks, fn x -> assert expected == strip_tags(x) end) 173 | end 174 | 175 | test "strips script tag" do 176 | input = "<SCRIPT\nSRC=http://ha.ckers.org/xss.js></SCRIPT>" 177 | expected = "" 178 | assert expected == strip_tags(input) 179 | end 180 | 181 | test "should_sanitize_tag_broken_up_by_null" do 182 | assert "alert(\"XSS\")" == strip_tags("<SCR\0IPT>alert(\"XSS\")</SCR\0IPT>") 183 | end 184 | 185 | test "should_sanitize_invalid_script_tag" do 186 | input = "<SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT>" 187 | assert "" == strip_tags(input) 188 | end 189 | 190 | test "should_sanitize_script_tag_with_multiple_open_brackets" do 191 | assert "&lt;alert(\"XSS\");//&lt;" == 192 | strip_tags("<<SCRIPT>alert(\"XSS\");//<</SCRIPT>") 193 | 194 | assert "" == 195 | strip_tags("<iframe src=http://ha.ckers.org/scriptlet.html\n<a") 196 | end 197 | 198 | test "should_sanitize_unclosed_script" do 199 | input = "<SCRIPT SRC=http://ha.ckers.org/xss.js?<B>" 200 | assert "" 
== strip_tags(input) 201 | end 202 | 203 | test "sanitize half open scripts" do 204 | input = "<IMG SRC=\"javascript:alert('XSS')\"" 205 | assert "" == strip_tags(input) 206 | end 207 | 208 | test "should_not_fall_for_ridiculous_hack" do 209 | img_hack = """ 210 | <IMG\nSRC\n=\n"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n"\n>) 211 | """ 212 | 213 | assert ")\n" == strip_tags(img_hack) 214 | end 215 | 216 | test "should_sanitize_within attributes" do 217 | input = 218 | "<span title=\"&#39;&gt;&lt;script&gt;alert()&lt;/script&gt;\">blah</span>" 219 | 220 | assert "blah" == strip_tags(input) 221 | end 222 | 223 | test "should_sanitize_invalid_tag_names" do 224 | assert "a b cd e f" == 225 | strip_tags( 226 | ~s(a b c<script/XSS src="http://ha.ckers.org/xss.js"></script>d e f) 227 | ) 228 | end 229 | 230 | test "should_sanitize_non_alpha_and_non_digit_characters_in_tags" do 231 | assert "foo" == 232 | strip_tags( 233 | "<a onclick!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>" 234 | ) 235 | end 236 | 237 | @tag cdata: true 238 | test "should_sanitize_cdata_section" do 239 | assert "section]]&gt;" == strip_tags("<![CDATA[<span>section</span>]]>") 240 | end 241 | 242 | @tag cdata: true 243 | test "should_sanitize_cdata_section like any other" do 244 | assert "section]]&gt;" == strip_tags("<![CDATA[<script>section</script>]]>") 245 | end 246 | 247 | @tag cdata: true 248 | test "should_sanitize_unterminated_cdata_section" do 249 | assert "neverending..." == strip_tags("<![CDATA[<span>neverending...") 250 | end 251 | 252 | @tag cdata: true 253 | test "strips CDATA" do 254 | input = "This has a <![CDATA[<section>]]> here." 255 | expected = "This has a ]]&gt; here." 
256 | assert expected == strip_tags(input) 257 | end 258 | 259 | test "should sanitize neverending attribute" do 260 | assert "" == strip_tags("<span class=\"\\") 261 | end 262 | 263 | test "should not destroy white-space" do 264 | assert "some\r\ntext" == strip_tags("some\r\ntext") 265 | end 266 | 267 | test "should not destroy white-space /2" do 268 | assert "sometext with break between tags\r\nwill remove break" == 269 | strip_tags( 270 | "some<b>text with break between tags</b>\r\n<i>will remove break</i>" 271 | ) 272 | end 273 | 274 | test "should not destroy white-space /3" do 275 | assert "some text\r\nbreak only from one side" == 276 | strip_tags("some text\r\n<b>break only from one side</b>") 277 | end 278 | end 279 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /test/test_if_tests_fail_after_resetting_lib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # common setup 4 | 5 | set -e 6 | 7 | DIRNAME=$( cd "$( dirname "$0" )" && pwd ) 8 | PROJECT_ROOT=$( cd "$DIRNAME/.." 
&& pwd ) 9 | 10 | # execution 11 | 12 | cd $PROJECT_ROOT 13 | 14 | git checkout master lib/ 15 | 16 | if mix test ; then 17 | echo "" 18 | echo "------------------------------------------------------------------" 19 | echo "" 20 | echo "There are changes to both lib/ and test/ which can indicate" 21 | echo "a bugfix with a corresponding test that reproduces the fixed bug" 22 | echo "" 23 | echo "(if this is not a bugfix PR, please ignore the following error)" 24 | echo "" 25 | echo "\e[31mAfter resetting changes in lib/, mix test should have failed" 26 | echo "" 27 | echo "------------------------------------------------------------------" 28 | echo "" 29 | exit 1 30 | else 31 | exit 0 32 | fi 33 | -------------------------------------------------------------------------------- /test/traverser_test.exs: -------------------------------------------------------------------------------- 1 | defmodule StripEverythingButB do 2 | def scrub({"b", attributes, children}), do: {"b", attributes, children} 3 | 4 | def scrub({_tag, _attributes, children}) do 5 | children 6 | end 7 | 8 | def scrub(text) do 9 | text 10 | end 11 | end 12 | 13 | defmodule HtmlSanitizeExTraverserTest do 14 | use ExUnit.Case, async: true 15 | 16 | def parse_to_tree(html) do 17 | html 18 | |> HtmlSanitizeEx.Parser.parse() 19 | |> HtmlSanitizeEx.Traverser.traverse(StripEverythingButB) 20 | end 21 | 22 | test "should return expected tree" do 23 | input = 24 | "hello! <section><b><script>code!</script></b><p>hello <script>code!</script></p></section>" 25 | 26 | expected = ["hello! 
", {"b", [], ["code!"]}, "hello ", "code!"] 27 | assert expected == parse_to_tree(input) 28 | end 29 | 30 | test "should return expected tree 2" do 31 | input = 32 | "<title>This is <b>the <a href=\"http://me@example.com\" target=\"_blank\">test</a></b>.</title>\n\n\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n" 33 | 34 | expected = [ 35 | "This is ", 36 | {"b", [], ["the ", "test"]}, 37 | ".", 38 | " _ \n\n\n\n", 39 | "It no ", 40 | {"b", [], ["longer ", "contains ", "any ", "HTML", "."]}, 41 | " _ \n" 42 | ] 43 | 44 | assert expected == parse_to_tree(input) 45 | end 46 | 47 | test "should return expected tree 3" do 48 | input = "This has a <!-- comment --> here." 49 | expected = ["This has a ", {:comment, " comment "}, " here."] 50 | assert expected == parse_to_tree(input) 51 | end 52 | 53 | test "should return expected tree 4" do 54 | input = "This has a <!-- comment here." 55 | expected = ["This has a ", {:comment, " comment here.</html_sanitize_ex>"}] 56 | assert expected == parse_to_tree(input) 57 | end 58 | 59 | test "should return expected tree 5" do 60 | input = "<<<bad html" 61 | expected = ["<<"] 62 | assert expected == parse_to_tree(input) 63 | end 64 | 65 | test "should return expected tree 6" do 66 | input = "<\" <img src=\"trollface.gif\" onload=\"alert(1)\"> hi" 67 | expected = ["<\" ", " hi"] 68 | assert expected == parse_to_tree(input) 69 | end 70 | 71 | test "should return expected tree 7" do 72 | input = "This has a <![CDATA[<section>]]> here." 73 | expected = "This has a <section> here." 74 | assert expected == parse_to_tree(input) 75 | end 76 | end 77 | --------------------------------------------------------------------------------