├── .github └── workflows │ └── tests.yml ├── .gitignore ├── ChangeLog.md ├── LICENSE ├── README.md ├── Setup.lhs ├── TODO ├── flake.lock ├── package.yaml ├── src └── Text │ └── HTML │ ├── SanitizeXSS.hs │ └── SanitizeXSS │ └── Css.hs ├── stack.yaml └── test └── main.hs /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | push: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | name: CI 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | resolver: [nightly, lts-21, lts-20, lts-19] 20 | # # Bugs in GHC make it crash too often to be worth running 21 | # exclude: 22 | # - os: macos-latest 23 | # resolver: lts-9 24 | # - os: windows-latest 25 | # resolver: lts-15 26 | # - os: windows-latest 27 | # resolver: nightly 28 | 29 | steps: 30 | - name: Clone project 31 | uses: actions/checkout@v4 32 | 33 | - name: Cache dependencies 34 | uses: actions/cache@v3 35 | with: 36 | path: ~/.stack 37 | key: ${{ runner.os }}-${{ matrix.resolver }}-${{ hashFiles('stack.yaml') }} 38 | restore-keys: | 39 | ${{ runner.os }}-${{ matrix.resolver }}- 40 | 41 | - name: Build and run tests 42 | shell: bash 43 | run: stack test --bench --no-run-benchmarks --haddock --no-haddock-deps --no-terminal --resolver=${{ matrix.resolver }} 44 | # run: | 45 | # set -ex 46 | # stack upgrade || curl -sSL https://get.haskellstack.org/ | sh -s - -f 47 | # stack test --bench --no-run-benchmarks --haddock --no-haddock-deps --no-terminal --resolver=${{ matrix.resolver }} 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.hi 2 | *.o 3 | dist 4 | .cabal-sandbox/ 5 | cabal.sandbox.config 6 | .stack-work/ 7 | xss-sanitize.cabal 8 | /dist-newstyle/ 9 | 
/stack.yaml.lock 10 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # 0.3.7.2 2 | 3 | Stops Tag Soup from escaping &"<> which breaks HTML entities 4 | 5 | # 0.3.7.1 6 | 7 | add max height and max width as valid style attributes 8 | 9 | # 0.3.7 10 | 11 | clear the contents of style and script tags instead of escaping them 12 | 13 | # 0.3.5.6 14 | 15 | expose safeTagName 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The following license covers this documentation, and the source code, except 2 | where otherwise indicated. 3 | 4 | Copyright 2010, Greg Weber. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY EXPRESS OR 17 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 18 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO 19 | EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, 20 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 22 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [![Tests](https://github.com/yesodweb/haskell-xss-sanitize/actions/workflows/tests.yml/badge.svg)](https://github.com/yesodweb/haskell-xss-sanitize/actions/workflows/tests.yml) 4 | 5 | xss-sanitize allows you to accept html from untrusted sources by first filtering it through a white list. 6 | The white list filtering is fairly comprehensive, including support for css in style attributes, but there are limitations enumerated below. 7 | 8 | Sanitizing allows a web application to safely use a rich text editor, allow html in comments, or otherwise display untrusted HTML. 9 | 10 | If you trust the HTML (you wrote it), you do not need to use this. 11 | If you don't trust the html you probably also do not trust that the tags are balanced and should use the sanitizeBalance function. 12 | 13 | # Usage 14 | 15 | provides 2 functions in the module Text.HTML.SanitizeXSS 16 | 17 | * sanitize - filters html to prevent XSS attacks. 18 | * sanitizeBalance - same as sanitize but makes sure there are no lone opening/closing tags - useful to protect against a user's html messing up your page 19 | 20 | 21 | # Details 22 | 23 | This is not escaping! Escaping html does prevent XSS attacks. 
Strings (that aren't meant to be HTML) should be HTML escaped to show up properly and to prevent XSS attacks. However, escaping will ruin the display of actual HTML. 24 | 25 | This function removes any HTML tags or attributes that are not in its white-list. This may sound picky, but most HTML should make it through unchanged, making the process unnoticeable to the user but giving us safe HTML. 26 | 27 | 28 | ## Integration 29 | 30 | It is recommended to integrate this so that it is automatically used whenever an application receives untrusted html data (instead of before it is displayed). See the Yesod web framework as an example. 31 | 32 | 33 | ## Limitations 34 | 35 | ### Lowercase 36 | 37 | All tag names and attribute names are converted to lower case as a matter of convenience. If you have a use case where this is undesirable let me know. 38 | 39 | ### Balancing - sanitizeBalance 40 | 41 | The goal of this function is to prevent your html from breaking when (unknown) html with unbalanced tags are placed inside it. I would expect it to work very well in practice and don't see a downside to using it unless you have an alternative approach. However, this function does not at all guarantee valid html. In fact, it is likely that the result of balancing will still be invalid HTML. There is no guarantee for how a browser will display invalid HTML, so there is no guarantee that this function will protect your HTML from being broken by a user's html. Other possible approaches would be to run the HTML through a library like libxml2 which understands HTML or to first render the HTML in a hidden iframe or hidden div at the bottom of the page so that it is isolated, and then use JavaScript to insert it into the page where you want it. 42 | 43 | ### TagSoup Parser 44 | 45 | TagSoup is used to parse the HTML, and it does a good job. However TagSoup does not maintain all white space. 
TagSoup does not distinguish between the following cases: 46 | 47 | , 48 | , 49 | , 50 | 51 | In the third case, img and br tags will be output as a single self-closing tag. Other self-closing tags will be output as an open and closing pair. So `<img /> or <img></img>` converts to `<img />`, and `<a /> or <a></a>` converts to `<a></a>`. There are future updates to TagSoup planned so that TagSoup will be able to render tags exactly the same as they were parsed. 52 | 53 | 54 | ## Security 55 | 56 | ### Where is the white list from? 57 | 58 | Ultimately this is where your security comes from. I would expect that a faulty white list would act as a strong deterrent, but this library strives for correctness. 59 | 60 | The [source code of html5lib](https://github.com/html5lib/html5lib-python/blob/master/html5lib/filters/sanitizer.py) is the source of the white list and my implementation reference. If you feel a tag is missing from the white list, check to see if it has been added there. 61 | 62 | If anyone knows of better sources or thinks a particular tag/attribute/value may be vulnerable, please let me know. 63 | [HTML Purifier](http://htmlpurifier.org/live/smoketests/printDefinition.php) does have a more permissive and configurable (yet safe) white list if you are looking to add anything. 64 | 65 | ### Where is the code from? 66 | 67 | Original code was taken from John MacFarlane's Pandoc (with permission), but modified by Greg Weber to be faster and with parsing redone using TagSoup, and to use html5lib's white list. 68 | Michael Snoyman added the balanced tags functionality and released css-text specifically to help with css parsing. 69 | html5lib's sanitizer.py is used as a reference implementation, and most of the code should look the same. The css parsing is different: as mentioned we use a css parser, not regexes like html5lib. 70 | 71 | ### style attribute 72 | 73 | style attributes are now parsed with the css-text and attoparsec dependencies. 
They are then run through a white list for properties and keywords. Whitespace is not preserved. This code was again translated from sanitizer.py, but uses attoparsec instead of regexes. If you don't care about stripping css you can avoid the attoparsec dependency by using the older < 0.3 version of this library. 74 | 75 | ### data attributes 76 | 77 | data attributes are not on the white list. 78 | The href and style attributes are white listed, but their values must pass through a white list also. This is how the data attributes could work also. 79 | 80 | ### svg and mathml 81 | 82 | A mathml white list is fully implemented. There is some support for svg styling. 83 | There is a full white list for svg elements and attributes. However, some elements are not included because they need further filtering (just like the data attributes) and this has not been done yet. 84 | -------------------------------------------------------------------------------- /Setup.lhs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env runhaskell 2 | 3 | > module Main where 4 | > import Distribution.Simple 5 | 6 | > main :: IO () 7 | > main = defaultMain 8 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | specific test cases: http://ha.ckers.org/xss.html 2 | expanded white-lists as mentioned in README 3 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "nix-filter": { 4 | "locked": { 5 | "lastModified": 1678109515, 6 | "narHash": "sha256-C2X+qC80K2C1TOYZT8nabgo05Dw2HST/pSn6s+n6BO8=", 7 | "owner": "numtide", 8 | "repo": "nix-filter", 9 | "rev": "aa9ff6ce4a7f19af6415fb3721eaa513ea6c763c", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "numtide", 14 | 
"ref": "main", 15 | "repo": "nix-filter", 16 | "type": "github" 17 | } 18 | }, 19 | "nixpkgs": { 20 | "locked": { 21 | "lastModified": 1679198465, 22 | "narHash": "sha256-VfXpHpniNWgg7pBzxb20pRX7kqn80LApPDQYTReiFCw=", 23 | "owner": "nixos", 24 | "repo": "nixpkgs", 25 | "rev": "5a05160f7671434e1c833b1b01284b876e04eca4", 26 | "type": "github" 27 | }, 28 | "original": { 29 | "owner": "nixos", 30 | "ref": "nixpkgs-unstable", 31 | "repo": "nixpkgs", 32 | "type": "github" 33 | } 34 | }, 35 | "root": { 36 | "inputs": { 37 | "nix-filter": "nix-filter", 38 | "nixpkgs": "nixpkgs" 39 | } 40 | } 41 | }, 42 | "root": "root", 43 | "version": 7 44 | } 45 | -------------------------------------------------------------------------------- /package.yaml: -------------------------------------------------------------------------------- 1 | name: xss-sanitize 2 | version: 0.3.7.2 3 | synopsis: sanitize untrusted HTML to prevent XSS attacks 4 | description: run untrusted HTML through Text.HTML.SanitizeXSS.sanitizeXSS to prevent 5 | XSS attacks. 
see README.md for 6 | more details 7 | category: Web 8 | author: Greg Weber 9 | maintainer: Michael Snoyman 10 | license: BSD2 11 | github: yesodweb/haskell-xss-sanitize 12 | stability: Stable 13 | 14 | extra-source-files: 15 | - README.md 16 | - ChangeLog.md 17 | 18 | dependencies: 19 | - base >= 4.9.1 && < 5 20 | - containers 21 | - tagsoup >=0.12.2 && <1 22 | - utf8-string >=0.3 && <1.1 23 | - css-text >=0.1.1 && <0.2 24 | - text >=0.11 && < 2.2 25 | - attoparsec >=0.10.0.3 && <1 26 | - network-uri >=2.6 27 | 28 | library: 29 | source-dirs: src 30 | exposed-modules: 31 | - Text.HTML.SanitizeXSS 32 | 33 | tests: 34 | test: 35 | main: main.hs 36 | source-dirs: 37 | - test 38 | - src 39 | cpp-options: -DTEST 40 | dependencies: 41 | - hspec >=1.3 42 | - HUnit >=1.2 43 | -------------------------------------------------------------------------------- /src/Text/HTML/SanitizeXSS.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | -- | Sanitize HTML to prevent XSS attacks. 3 | -- 4 | -- See README.md for more details. 5 | module Text.HTML.SanitizeXSS 6 | ( 7 | -- * Sanitize 8 | sanitize 9 | , sanitizeBalance 10 | , sanitizeXSS 11 | 12 | -- * Custom filtering 13 | , filterTags 14 | , safeTags 15 | , safeTagsCustom 16 | , clearTags 17 | , clearTagsCustom 18 | , balanceTags 19 | 20 | -- * Utilities 21 | , safeTagName 22 | , sanitizeAttribute 23 | , sanitaryURI 24 | ) where 25 | 26 | import Text.HTML.SanitizeXSS.Css 27 | 28 | import Text.HTML.TagSoup 29 | 30 | import Data.Set (Set(), member, notMember, (\\), fromList, fromAscList) 31 | import Data.Char ( toLower ) 32 | import Data.Text (Text) 33 | import qualified Data.Text as T 34 | 35 | import Network.URI ( parseURIReference, URI (..), 36 | isAllowedInURI, escapeURIString, uriScheme ) 37 | import Codec.Binary.UTF8.String ( encodeString ) 38 | 39 | import Data.Maybe (mapMaybe) 40 | 41 | 42 | -- | Sanitize HTML to prevent XSS attacks. 
This is equivalent to @filterTags (safeTags . clearTags)@. 43 | sanitize :: Text -> Text 44 | sanitize = sanitizeXSS 45 | 46 | -- | alias of sanitize function 47 | sanitizeXSS :: Text -> Text 48 | sanitizeXSS = filterTags (safeTags . clearTags) 49 | 50 | -- | Sanitize HTML to prevent XSS attacks and also make sure the tags are balanced. 51 | -- This is equivalent to @filterTags (balanceTags . safeTags . clearTags)@. 52 | sanitizeBalance :: Text -> Text 53 | sanitizeBalance = filterTags (balanceTags . safeTags . clearTags) 54 | 55 | -- | Filter which makes sure the tags are balanced. Use with 'filterTags' and 'safeTags' to create a custom filter. 56 | balanceTags :: [Tag Text] -> [Tag Text] 57 | balanceTags = balance [] 58 | 59 | -- | Parse the given text to a list of tags, apply the given filtering 60 | -- function, and render back to HTML. You can insert your own custom 61 | -- filtering, but make sure you compose your filtering function with 62 | -- 'safeTags' and 'clearTags' or 'safeTagsCustom' and 'clearTagsCustom'. 63 | filterTags :: ([Tag Text] -> [Tag Text]) -> Text -> Text 64 | filterTags f = renderTagsOptions renderOptions { 65 | optEscape = id -- stops &"<> from being escaped which breaks existing HTML entities 66 | , optMinimize = \x -> x `member` voidElems -- <img></img> converts to <img />, <a/> converts to <a></a> 67 | } . f . canonicalizeTags . 
parseTagsOptions (parseOptionsEntities (const Nothing)) 68 | 69 | voidElems :: Set T.Text 70 | voidElems = fromAscList $ T.words $ T.pack "area base br col command embed hr img input keygen link meta param source track wbr" 71 | 72 | balance :: [Text] -- ^ unclosed tags 73 | -> [Tag Text] -> [Tag Text] 74 | balance unclosed [] = map TagClose $ filter (`notMember` voidElems) unclosed 75 | balance (x:xs) tags'@(TagClose name:tags) 76 | | x == name = TagClose name : balance xs tags 77 | | x `member` voidElems = balance xs tags' 78 | | otherwise = TagOpen name [] : TagClose name : balance (x:xs) tags 79 | balance unclosed (TagOpen name as : tags) = 80 | TagOpen name as : balance (name : unclosed) tags 81 | balance unclosed (t:ts) = t : balance unclosed ts 82 | 83 | -- | Filters out unsafe tags and sanitizes attributes. Use with 84 | -- filterTags to create a custom filter. 85 | safeTags :: [Tag Text] -> [Tag Text] 86 | safeTags = safeTagsCustom safeTagName sanitizeAttribute 87 | 88 | -- | Filters out unsafe tags and sanitizes attributes, like 89 | -- 'safeTags', but uses custom functions for determining which tags 90 | -- are safe and for sanitizing attributes. This allows you to add or 91 | -- remove specific tags or attributes on the white list, or to use 92 | -- your own white list. 93 | -- 94 | -- @safeTagsCustom safeTagName sanitizeAttribute@ is equivalent to 95 | -- 'safeTags'. 
96 | -- 97 | -- @since 0.3.6 98 | safeTagsCustom :: 99 | (Text -> Bool) -- ^ Select safe tags, like 100 | -- 'safeTagName' 101 | -> ((Text, Text) -> Maybe (Text, Text)) -- ^ Sanitize attributes, 102 | -- like 'sanitizeAttribute' 103 | -> [Tag Text] -> [Tag Text] 104 | safeTagsCustom _ _ [] = [] 105 | safeTagsCustom safeName sanitizeAttr (t@(TagClose name):tags) 106 | | safeName name = t : safeTagsCustom safeName sanitizeAttr tags 107 | | otherwise = safeTagsCustom safeName sanitizeAttr tags 108 | safeTagsCustom safeName sanitizeAttr (TagOpen name attributes:tags) 109 | | safeName name = TagOpen name (mapMaybe sanitizeAttr attributes) : 110 | safeTagsCustom safeName sanitizeAttr tags 111 | | otherwise = safeTagsCustom safeName sanitizeAttr tags 112 | safeTagsCustom n a (t:tags) = t : safeTagsCustom n a tags 113 | 114 | -- | Directly removes tags even if they are not closed properly. 115 | -- This is important to clear out both the script and iframe tag 116 | -- in sequences like "" "" 45 | 46 | it "object hack" $ 47 | sanitized "" "" 48 | 49 | it "embed hack" $ 50 | sanitized "" "" 51 | 52 | it "ucase image hack" $ 53 | sanitized "" "" 54 | 55 | describe "allowedCssAttributeValue" $ do 56 | it "allows hex" $ do 57 | assert $ allowedCssAttributeValue "#abc" 58 | assert $ allowedCssAttributeValue "#123" 59 | assert $ not $ allowedCssAttributeValue "abc" 60 | assert $ not $ allowedCssAttributeValue "123abc" 61 | 62 | it "allows rgb" $ do 63 | assert $ allowedCssAttributeValue "rgb(1,3,3)" 64 | assert $ not $ allowedCssAttributeValue "rgb()" 65 | 66 | it "allows units" $ do 67 | assert $ allowedCssAttributeValue "10 px" 68 | assert $ not $ allowedCssAttributeValue "10 abc" 69 | 70 | describe "css sanitizing" $ do 71 | it "removes style when empty" $ 72 | sanitized "

" "

" 73 | 74 | it "allows any non-url value for white-listed properties" $ do 75 | let whiteCss = "

" 76 | sanitized whiteCss whiteCss 77 | 78 | it "rejects any url value" $ do 79 | let whiteCss = "

" 80 | sanitized whiteCss "

" 81 | 82 | it "rejects properties not on the white list" $ do 83 | let blackCss = "

" 84 | sanitized blackCss "

" 85 | 86 | it "rejects invalid units for grey-listed css" $ do 87 | let greyCss = "

" 88 | sanitized greyCss "

" 89 | 90 | it "allows valid units for grey-listed css" $ do 91 | let grey2Css = "

" 92 | sanitized grey2Css grey2Css 93 | 94 | describe "balancing" $ do 95 | it "adds missing elements" $ do 96 | sanitizedB "foo" "foo" 97 | it "doesn't add closing voids" $ do 98 | sanitizedB "
" "
" 99 | it "removes closing voids" $ do 100 | sanitizedB "" "" 101 | it "interleaved" $ 102 | sanitizedB "helloworld" "helloworld" 103 | 104 | describe "customized white list" $ do 105 | it "does not filter custom tags" $ do 106 | let custtag = "

" 107 | sanitizedC custtag custtag 108 | it "filters non-custom tags" $ do 109 | sanitizedC "

" "

" 110 | it "does not filter custom attributes" $ do 111 | let custattr = "

" 112 | sanitizedC custattr custattr 113 | it "filters non-custom attributes" $ do 114 | sanitizedC "

" "

" 115 | --------------------------------------------------------------------------------