├── .gitattributes ├── .github ├── dependabot.yml ├── funding.yml └── workflows │ ├── automerge.yml │ ├── lint.yml │ ├── tag_and_release.yml │ ├── test.yml │ └── test_build.yml ├── .gitignore ├── .rubocop.yml ├── .ruby-version ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console ├── rake └── setup ├── ext └── selma │ ├── Cargo.toml │ ├── extconf.rb │ └── src │ ├── html.rs │ ├── html │ ├── element.rs │ ├── end_tag.rs │ └── text_chunk.rs │ ├── lib.rs │ ├── native_ref_wrap.rs │ ├── rewriter.rs │ ├── sanitizer.rs │ ├── selector.rs │ └── tags.rs ├── lib ├── selma.rb └── selma │ ├── config.rb │ ├── extension.rb │ ├── html.rb │ ├── html │ └── element.rb │ ├── rewriter.rb │ ├── sanitizer.rb │ ├── sanitizer │ ├── config.rb │ └── config │ │ ├── basic.rb │ │ ├── default.rb │ │ ├── relaxed.rb │ │ └── restricted.rb │ ├── selector.rb │ └── version.rb ├── rakelib ├── benchmark.rake ├── compile.rake ├── extension.rake ├── lint.rake └── test.rake ├── script ├── bootstrap ├── docker_build └── valgrind ├── selma.gemspec └── test ├── benchmark.rb ├── benchmark ├── html │ ├── document-lg.html │ ├── document-md.html │ ├── document-sm.html │ ├── fragment-large.html │ └── fragment-small.html └── selma_config.rb ├── fixtures ├── deleting_content.html └── docs.html ├── memcheck ├── Dockerfile ├── entrypoint.sh └── tools │ ├── rust-wrapper.sh │ └── userhack.sh ├── selma_maliciousness_test.rb ├── selma_rewriter_match_attribute_test.rb ├── selma_rewriter_match_element_test.rb ├── selma_rewriter_test.rb ├── selma_rewriter_text_test.rb ├── selma_sanitizer_comments_test.rb ├── selma_sanitizer_config_test.rb ├── selma_sanitizer_doctype_test.rb ├── selma_sanitizer_elements_test.rb ├── selma_sanitizer_malicious_html_test.rb ├── selma_sanitizer_parser_test.rb ├── selma_sanitizer_test.rb ├── selma_selector_test.rb └── test_helper.rb /.gitattributes: 
-------------------------------------------------------------------------------- 1 | # exclude HTML files from stats 2 | test/benchmark/** linguist-vendored 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "11:00" 8 | open-pull-requests-limit: 10 9 | 10 | - package-ecosystem: bundler 11 | directory: "/" 12 | schedule: 13 | interval: daily 14 | time: "11:00" 15 | open-pull-requests-limit: 10 16 | 17 | - package-ecosystem: cargo 18 | directory: "/" 19 | schedule: 20 | interval: daily 21 | time: "11:00" 22 | open-pull-requests-limit: 10 23 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: gjtorikian 4 | -------------------------------------------------------------------------------- /.github/workflows/automerge.yml: -------------------------------------------------------------------------------- 1 | name: PR auto-{approve,merge} 2 | 3 | on: 4 | pull_request_target: 5 | 6 | permissions: 7 | pull-requests: write 8 | contents: write 9 | 10 | jobs: 11 | dependabot: 12 | uses: yettoapp/actions/.github/workflows/automerge_dependabot.yml@main 13 | secrets: inherit 14 | with: 15 | automerge: true 16 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | ruby: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Set up Ruby 13 | uses: yettoapp/actions/setup-languages@main 
14 | with: 15 | ruby: true 16 | 17 | - name: Rubocop 18 | run: bundle exec rake rubocop 19 | 20 | clippy_format: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | submodules: true 27 | 28 | - name: Set up Rust 29 | id: toolchain 30 | uses: dtolnay/rust-toolchain@stable 31 | with: 32 | components: clippy, rustfmt 33 | 34 | - uses: actions/cache@v4 35 | with: 36 | path: | 37 | ~/.cargo/bin/ 38 | ~/.cargo/registry/index/ 39 | ~/.cargo/registry/cache/ 40 | ~/.cargo/git/db/ 41 | tmp/ 42 | target/ 43 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-v1 44 | 45 | - name: Check clippy 46 | run: cargo clippy 47 | 48 | - name: Check formatting 49 | run: cargo fmt -- --check 50 | -------------------------------------------------------------------------------- /.github/workflows/tag_and_release.yml: -------------------------------------------------------------------------------- 1 | name: Tag and Release 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | paths: 9 | - "lib/selma/version.rb" 10 | pull_request_target: 11 | types: 12 | - closed 13 | 14 | jobs: 15 | ruby: 16 | uses: yettoapp/actions/.github/workflows/ruby_gem_release.yml@main 17 | secrets: 18 | rubygems_api_key: ${{ secrets.RUBYGEMS_API_BOT_KEY }} 19 | gh_token: ${{ secrets.GITHUB_TOKEN }} 20 | with: 21 | gem_name: selma 22 | version_filepath: lib/selma/version.rb 23 | oxidized: true 24 | prepare: ${{ github.event_name == 'push' }} 25 | release: ${{ github.event_name == 'workflow_dispatch' || ((github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'release'))) }} 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | ci: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | 
matrix: 12 | os: 13 | - ubuntu-latest 14 | - macos-latest 15 | - windows-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: oxidize-rb/actions/setup-ruby-and-rust@main 21 | with: 22 | rubygems: latest 23 | bundler-cache: true 24 | cargo-cache: true 25 | cache-version: v1 26 | 27 | - name: Compile dependencies 28 | run: bundle exec rake compile 29 | 30 | - name: Run Ruby tests 31 | run: bundle exec rake test 32 | -------------------------------------------------------------------------------- /.github/workflows/test_build.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | paths: 7 | - "lib/selma/version.rb" 8 | 9 | jobs: 10 | build: 11 | uses: yettoapp/actions/.github/workflows/ruby_rust_test_build.yml@main 12 | secrets: 13 | gh_token: ${{ secrets.GITHUB_TOKEN }} 14 | with: 15 | include_musl: false 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.rbc 3 | *.bundle 4 | /.bundle 5 | /.config 6 | /coverage/ 7 | /.idea 8 | /InstalledFiles 9 | /pkg/ 10 | /spec/reports/ 11 | /test/version_tmp/ 12 | tmp/ 13 | /vendor/gems 14 | /vendor/cache 15 | Gemfile.lock 16 | *.log 17 | ports/ 18 | target/ 19 | /gems/ 20 | 21 | ## Specific to RubyMotion: 22 | .dat* 23 | .repl_history 24 | build/ 25 | 26 | ## Documentation cache and generated files: 27 | /.yardoc/ 28 | /_yardoc/ 29 | /docs/ 30 | /rdoc/ 31 | 32 | ## Environment normalisation: 33 | /lib/bundler/man/ 34 | 35 | # for a library or gem, you might want to ignore these files since the code is 36 | # intended to run in multiple environments; otherwise, check them in: 37 | # .ruby-version 38 | # .ruby-gemset 39 | 40 | # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: 41 | .rvmrc 42 | 43 | # Vagrant 44 | /.vagrant/ 45 | 46 | actual.txt 47 
| test.txt 48 | test/progit 49 | test/benchinput.md 50 | 51 | *.orig 52 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | inherit_gem: 2 | rubocop-standard: 3 | - config/default.yml 4 | - config/minitest.yml 5 | 6 | inherit_mode: 7 | merge: 8 | - Exclude 9 | 10 | AllCops: 11 | Exclude: 12 | - test/progit/**/* 13 | - "pkg/**/*" 14 | - "ext/**/*" 15 | - "vendor/**/*" 16 | - "tmp/**/*" 17 | - "test/progit/**/*" 18 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 3.4.1 2 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "rust-lang.rust-analyzer", 4 | "shopify.ruby-lsp", 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "launch", 10 | "name": "Debug", 11 | "preLaunchTask": "rake: compile:debug", 12 | "program": "~/.rbenv/versions/3.1.1/bin/ruby", 13 | "args": ["-Itest", "test/selma_maliciousness_test.rb"], 14 | "cwd": "${workspaceFolder}", 15 | "sourceLanguages": ["rust"] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.checkOnSave.command": "clippy", 3 | "[ruby]": { 4 | "editor.defaultFormatter": "Shopify.ruby-lsp" 5 | }, 6 | "[markdown]": { 7 | "editor.defaultFormatter": "esbenp.prettier-vscode" 8 | }, 9 | "[html]": { 10 | "editor.defaultFormatter": "esbenp.prettier-vscode" 11 | }, 12 | } 13 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "rake: compile:debug", 6 | "type": "shell", 7 | "command": "bin/rake compile:debug", 8 | "problemMatcher": [ 9 | "$rustc" 10 | ], 11 | "group": "build" 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # [v0.4.12] - 08-01-2025 2 | ## What's Changed 3 | * Bump lol_html from 2.1.0 to 2.2.0 by @dependabot in https://github.com/gjtorikian/selma/pull/93 4 | * Bump rb-sys from 0.9.105 to 0.9.106 by @dependabot in https://github.com/gjtorikian/selma/pull/94 5 | * Bump rb-sys build system to correctly support 3.3+ by @gjtorikian in https://github.com/gjtorikian/selma/pull/95 6 | 7 | 8 | **Full Changelog**: 
https://github.com/gjtorikian/selma/compare/v0.4.11...v0.4.12 9 | # [v0.4.11] - 29-12-2024 10 | ## What's Changed 11 | * Bump lol_html from 2.0.0 to 2.1.0 by @dependabot in https://github.com/gjtorikian/selma/pull/88 12 | * Bump rb-sys from 0.9.103 to 0.9.104 by @dependabot in https://github.com/gjtorikian/selma/pull/89 13 | * Support Ruby 3.4 by @gjtorikian in https://github.com/gjtorikian/selma/pull/91 14 | 15 | 16 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.10...v0.4.11 17 | # [v0.4.10] - 26-11-2024 18 | ## What's Changed 19 | * Bump rb-sys from 0.9.102 to 0.9.103 by @dependabot in https://github.com/gjtorikian/selma/pull/84 20 | * should also ignore element's descendants by @gjtorikian in https://github.com/gjtorikian/selma/pull/86 21 | 22 | 23 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.9...v0.4.10 24 | # [v0.4.9] - 09-10-2024 25 | ## What's Changed 26 | * Bump lol_html from 1.2.1 to 2.0.0 by @dependabot in https://github.com/gjtorikian/selma/pull/79 27 | 28 | 29 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.8...v0.4.9 30 | # [v0.4.8] - 08-10-2024 31 | 32 | ## What's Changed 33 | 34 | - Bump rb-sys from 0.9.101 to 0.9.102 by @dependabot in https://github.com/gjtorikian/selma/pull/78 35 | - Add a spec for stress testing garbage collection by @jordandcarter in https://github.com/gjtorikian/selma/pull/80 36 | 37 | This release changed two APIs: 38 | 39 | - Provided configuration collections must be Arrays, not Arrays _or_ Sets 40 | - The `#elements` method from `Selma::Sanitizer` has been removed. Instead, access the configuration objects through `@config`. 
41 | 42 | ## New Contributors 43 | 44 | - @jordandcarter made their first contribution in https://github.com/gjtorikian/selma/pull/80 45 | 46 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.7...v0.4.8 47 | 48 | # [v0.4.7] - 17-08-2024 49 | 50 | ## What's Changed 51 | 52 | - Bump rb-sys from 0.9.100 to 0.9.101 by @dependabot in https://github.com/gjtorikian/selma/pull/75 53 | - Support stacking text chunk changes by @gjtorikian in https://github.com/gjtorikian/selma/pull/76 54 | 55 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.6.1...v0.4.7 56 | 57 | # [v0.4.6.1] - 04-08-2024 58 | 59 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.6...v0.4.6.1 60 | 61 | # [v0.4.6] - 04-08-2024 62 | 63 | ## What's Changed 64 | 65 | - Expose text chunk results by @gjtorikian in https://github.com/gjtorikian/selma/pull/72 66 | 67 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.5...v0.4.6 68 | 69 | # [v0.4.5] - 31-07-2024 70 | 71 | ## What's Changed 72 | 73 | - Fix potential for segmentation fault by @gjtorikian in https://github.com/gjtorikian/selma/pull/70 74 | 75 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.4...v0.4.5 76 | 77 | # [v0.4.4] - 29-07-2024 78 | 79 | ## What's Changed 80 | 81 | - Bump rb-sys from 0.9.98 to 0.9.99 by @dependabot in https://github.com/gjtorikian/selma/pull/67 82 | - Stop operating handlers on deleted elements by @gjtorikian in https://github.com/gjtorikian/selma/pull/68 83 | 84 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.3...v0.4.4 85 | 86 | # [v0.4.3] - 18-07-2024 87 | 88 | ## What's Changed 89 | 90 | - Stop assuming handlers are always defined by @gjtorikian in https://github.com/gjtorikian/selma/pull/65 91 | 92 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.2...v0.4.3 93 | 94 | # [v0.4.2] - 16-07-2024 95 | 96 | ## What's Changed 97 | 98 | - Revert sanitization reordering (restore old 
behavior) by @gjtorikian in https://github.com/gjtorikian/selma/pull/63 99 | 100 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.1...v0.4.2 101 | 102 | # [v0.4.1] - 15-07-2024 103 | 104 | ## What's Changed 105 | 106 | - Address regression issue between sanitization and rewriting text chunks by @gjtorikian in https://github.com/gjtorikian/selma/pull/61 107 | 108 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.4.0...v0.4.1 109 | 110 | # [v0.4.0] - 15-07-2024 111 | 112 | ## What's Changed 113 | 114 | - add testing for big strings by @gjtorikian in https://github.com/gjtorikian/selma/pull/58 115 | 116 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.3.0...v0.4.0 117 | 118 | # [v0.3.0] - 07-06-2024 119 | 120 | ## What's Changed 121 | 122 | - Bump enum-iterator from 1.4.1 to 1.5.0 by @dependabot in https://github.com/gjtorikian/selma/pull/41 123 | - Bump actions/cache from 3 to 4 by @dependabot in https://github.com/gjtorikian/selma/pull/42 124 | - Bump enum-iterator from 1.5.0 to 2.0.0 by @dependabot in https://github.com/gjtorikian/selma/pull/44 125 | - Bump lol_html from 1.2.0 to 1.2.1 by @dependabot in https://github.com/gjtorikian/selma/pull/45 126 | - Bump magnus from 0.6.2 to 0.6.3 by @dependabot in https://github.com/gjtorikian/selma/pull/46 127 | - Bump enum-iterator from 2.0.0 to 2.0.1 by @dependabot in https://github.com/gjtorikian/selma/pull/47 128 | - Bump enum-iterator from 2.0.1 to 2.1.0 by @dependabot in https://github.com/gjtorikian/selma/pull/48 129 | - Bump magnus from 0.6.3 to 0.6.4 by @dependabot in https://github.com/gjtorikian/selma/pull/49 130 | - Update bin/console to be able to run the gem by @digitalmoksha in https://github.com/gjtorikian/selma/pull/51 131 | - Add details tag to RELAXED config by @digitalmoksha in https://github.com/gjtorikian/selma/pull/52 132 | - Fix example in README by @digitalmoksha in https://github.com/gjtorikian/selma/pull/53 133 | - Fix and remove old 
benchmark code by @digitalmoksha in https://github.com/gjtorikian/selma/pull/50 134 | - Add support for :all protocols by @gjtorikian in https://github.com/gjtorikian/selma/pull/55 135 | 136 | ## New Contributors 137 | 138 | - @digitalmoksha made their first contribution in https://github.com/gjtorikian/selma/pull/51 139 | 140 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.2.2...v0.3.0 141 | 142 | # [v0.2.2] - 03-01-2024 143 | 144 | ## What's Changed 145 | 146 | - Updates for Ruby 3.3 / Rust 1.75 by @gjtorikian in https://github.com/gjtorikian/selma/pull/37 147 | 148 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.2.1...v0.2.2 149 | 150 | # [v0.2.1] - 12-10-2023 151 | 152 | **Full Changelog**: https://github.com/gjtorikian/selma/compare/v0.2.0...v0.2.1 153 | 154 | # Changelog 155 | 156 | ## [v0.1.6](https://github.com/gjtorikian/selma/tree/v0.1.6) (2023-06-05) 157 | 158 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.1.5...v0.1.6) 159 | 160 | ## [v0.1.5](https://github.com/gjtorikian/selma/tree/v0.1.5) (2023-06-05) 161 | 162 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.1.4...v0.1.5) 163 | 164 | ## [v0.1.4](https://github.com/gjtorikian/selma/tree/v0.1.4) (2023-06-05) 165 | 166 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.1.3...v0.1.4) 167 | 168 | ## [v0.1.3](https://github.com/gjtorikian/selma/tree/v0.1.3) (2023-06-05) 169 | 170 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.1.2...v0.1.3) 171 | 172 | ## [v0.1.2](https://github.com/gjtorikian/selma/tree/v0.1.2) (2023-06-05) 173 | 174 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.1.0...v0.1.2) 175 | 176 | **Merged pull requests:** 177 | 178 | - Bump lol_html from 0.4.0 to 1.0.0 [\#17](https://github.com/gjtorikian/selma/pull/17) ([dependabot[bot]](https://github.com/apps/dependabot)) 179 | - Update ruby-lsp requirement from ~\> 0.4.0 to ~\> 0.5.1 
[\#16](https://github.com/gjtorikian/selma/pull/16) ([dependabot[bot]](https://github.com/apps/dependabot)) 180 | - Bump lol_html from 0.3.3 to 0.4.0 [\#15](https://github.com/gjtorikian/selma/pull/15) ([dependabot[bot]](https://github.com/apps/dependabot)) 181 | 182 | ## [v0.1.0](https://github.com/gjtorikian/selma/tree/v0.1.0) (2023-03-29) 183 | 184 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.7...v0.1.0) 185 | 186 | **Merged pull requests:** 187 | 188 | - Bump lol_html from 0.3.2 to 0.3.3 [\#13](https://github.com/gjtorikian/selma/pull/13) ([dependabot[bot]](https://github.com/apps/dependabot)) 189 | - Bump enum-iterator from 1.3.0 to 1.4.0 [\#12](https://github.com/gjtorikian/selma/pull/12) ([dependabot[bot]](https://github.com/apps/dependabot)) 190 | - Update ruby-lsp requirement from ~\> 0.3.5 to ~\> 0.4.0 [\#11](https://github.com/gjtorikian/selma/pull/11) ([dependabot[bot]](https://github.com/apps/dependabot)) 191 | - Bump enum-iterator from 1.2.0 to 1.3.0 [\#10](https://github.com/gjtorikian/selma/pull/10) ([dependabot[bot]](https://github.com/apps/dependabot)) 192 | 193 | ## [v0.0.7](https://github.com/gjtorikian/selma/tree/v0.0.7) (2023-01-09) 194 | 195 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.6...v0.0.7) 196 | 197 | **Merged pull requests:** 198 | 199 | - Support ruby32 [\#9](https://github.com/gjtorikian/selma/pull/9) ([gjtorikian](https://github.com/gjtorikian)) 200 | 201 | ## [v0.0.6](https://github.com/gjtorikian/selma/tree/v0.0.6) (2022-12-28) 202 | 203 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.5...v0.0.6) 204 | 205 | **Merged pull requests:** 206 | 207 | - Update README [\#8](https://github.com/gjtorikian/selma/pull/8) ([gjtorikian](https://github.com/gjtorikian)) 208 | 209 | ## [v0.0.5](https://github.com/gjtorikian/selma/tree/v0.0.5) (2022-12-27) 210 | 211 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.4...v0.0.5) 212 | 213 | **Merged pull 
requests:** 214 | 215 | - Method signature changes [\#7](https://github.com/gjtorikian/selma/pull/7) ([gjtorikian](https://github.com/gjtorikian)) 216 | 217 | ## [v0.0.4](https://github.com/gjtorikian/selma/tree/v0.0.4) (2022-12-26) 218 | 219 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.3...v0.0.4) 220 | 221 | **Merged pull requests:** 222 | 223 | - Method signature changes [\#6](https://github.com/gjtorikian/selma/pull/6) ([gjtorikian](https://github.com/gjtorikian)) 224 | - Bump escapist from 0.0.1 to 0.0.2 [\#5](https://github.com/gjtorikian/selma/pull/5) ([dependabot[bot]](https://github.com/apps/dependabot)) 225 | - Update escapist / fix Result warnings [\#4](https://github.com/gjtorikian/selma/pull/4) ([gjtorikian](https://github.com/gjtorikian)) 226 | 227 | ## [v0.0.3](https://github.com/gjtorikian/selma/tree/v0.0.3) (2022-12-24) 228 | 229 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.2...v0.0.3) 230 | 231 | **Merged pull requests:** 232 | 233 | - Clean up various warnings [\#3](https://github.com/gjtorikian/selma/pull/3) ([gjtorikian](https://github.com/gjtorikian)) 234 | 235 | ## [v0.0.2](https://github.com/gjtorikian/selma/tree/v0.0.2) (2022-12-21) 236 | 237 | [Full Changelog](https://github.com/gjtorikian/selma/compare/v0.0.1...v0.0.2) 238 | 239 | **Merged pull requests:** 240 | 241 | - Test publish [\#2](https://github.com/gjtorikian/selma/pull/2) ([gjtorikian](https://github.com/gjtorikian)) 242 | 243 | ## [v0.0.1](https://github.com/gjtorikian/selma/tree/v0.0.1) (2022-12-21) 244 | 245 | [Full Changelog](https://github.com/gjtorikian/selma/compare/f2b6c847e33d8341aec72c070c09fe6d6c226224...v0.0.1) 246 | 247 | **Merged pull requests:** 248 | 249 | - fix segfaulting [\#1](https://github.com/gjtorikian/selma/pull/1) ([gjtorikian](https://github.com/gjtorikian)) 250 | 251 | \* _This Changelog was automatically generated by 
[github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)_ 252 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "allocator-api2" 16 | version = "0.2.21" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" 19 | 20 | [[package]] 21 | name = "bindgen" 22 | version = "0.69.5" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" 25 | dependencies = [ 26 | "bitflags 2.6.0", 27 | "cexpr", 28 | "clang-sys", 29 | "itertools", 30 | "lazy_static", 31 | "lazycell", 32 | "proc-macro2", 33 | "quote", 34 | "regex", 35 | "rustc-hash", 36 | "shlex", 37 | "syn 2.0.93", 38 | ] 39 | 40 | [[package]] 41 | name = "bitflags" 42 | version = "1.3.2" 43 | source = "registry+https://github.com/rust-lang/crates.io-index" 44 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 45 | 46 | [[package]] 47 | name = "bitflags" 48 | version = "2.6.0" 49 | source = "registry+https://github.com/rust-lang/crates.io-index" 50 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 51 | 52 | [[package]] 53 | name = "byteorder" 54 | version = "1.5.0" 55 | source = "registry+https://github.com/rust-lang/crates.io-index" 56 | checksum = 
"1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 57 | 58 | [[package]] 59 | name = "cexpr" 60 | version = "0.6.0" 61 | source = "registry+https://github.com/rust-lang/crates.io-index" 62 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 63 | dependencies = [ 64 | "nom", 65 | ] 66 | 67 | [[package]] 68 | name = "cfg-if" 69 | version = "1.0.0" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 72 | 73 | [[package]] 74 | name = "clang-sys" 75 | version = "1.8.1" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" 78 | dependencies = [ 79 | "glob", 80 | "libc", 81 | "libloading", 82 | ] 83 | 84 | [[package]] 85 | name = "convert_case" 86 | version = "0.4.0" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" 89 | 90 | [[package]] 91 | name = "cssparser" 92 | version = "0.29.6" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa" 95 | dependencies = [ 96 | "cssparser-macros", 97 | "dtoa-short", 98 | "itoa", 99 | "matches", 100 | "phf", 101 | "proc-macro2", 102 | "quote", 103 | "smallvec", 104 | "syn 1.0.109", 105 | ] 106 | 107 | [[package]] 108 | name = "cssparser-macros" 109 | version = "0.6.1" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" 112 | dependencies = [ 113 | "quote", 114 | "syn 2.0.93", 115 | ] 116 | 117 | [[package]] 118 | name = "derive_more" 119 | version = "0.99.18" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = 
"5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" 122 | dependencies = [ 123 | "convert_case", 124 | "proc-macro2", 125 | "quote", 126 | "rustc_version", 127 | "syn 2.0.93", 128 | ] 129 | 130 | [[package]] 131 | name = "dtoa" 132 | version = "1.0.9" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" 135 | 136 | [[package]] 137 | name = "dtoa-short" 138 | version = "0.3.5" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" 141 | dependencies = [ 142 | "dtoa", 143 | ] 144 | 145 | [[package]] 146 | name = "either" 147 | version = "1.13.0" 148 | source = "registry+https://github.com/rust-lang/crates.io-index" 149 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 150 | 151 | [[package]] 152 | name = "encoding_rs" 153 | version = "0.8.35" 154 | source = "registry+https://github.com/rust-lang/crates.io-index" 155 | checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" 156 | dependencies = [ 157 | "cfg-if", 158 | ] 159 | 160 | [[package]] 161 | name = "entities" 162 | version = "1.0.1" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" 165 | 166 | [[package]] 167 | name = "enum-iterator" 168 | version = "2.1.0" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635" 171 | dependencies = [ 172 | "enum-iterator-derive", 173 | ] 174 | 175 | [[package]] 176 | name = "enum-iterator-derive" 177 | version = "1.4.0" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" 180 | dependencies = [ 181 | 
"proc-macro2", 182 | "quote", 183 | "syn 2.0.93", 184 | ] 185 | 186 | [[package]] 187 | name = "equivalent" 188 | version = "1.0.1" 189 | source = "registry+https://github.com/rust-lang/crates.io-index" 190 | checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" 191 | 192 | [[package]] 193 | name = "escapist" 194 | version = "0.0.2" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = "a91c7e59d7e5afa56d8be4dc7e27ea123dcd23d38b4b6a6497a05cddd95844a2" 197 | dependencies = [ 198 | "entities", 199 | ] 200 | 201 | [[package]] 202 | name = "foldhash" 203 | version = "0.1.4" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" 206 | 207 | [[package]] 208 | name = "fxhash" 209 | version = "0.2.1" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 212 | dependencies = [ 213 | "byteorder", 214 | ] 215 | 216 | [[package]] 217 | name = "getrandom" 218 | version = "0.1.16" 219 | source = "registry+https://github.com/rust-lang/crates.io-index" 220 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 221 | dependencies = [ 222 | "cfg-if", 223 | "libc", 224 | "wasi", 225 | ] 226 | 227 | [[package]] 228 | name = "glob" 229 | version = "0.3.2" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" 232 | 233 | [[package]] 234 | name = "hashbrown" 235 | version = "0.15.2" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" 238 | dependencies = [ 239 | "allocator-api2", 240 | "equivalent", 241 | "foldhash", 242 | ] 243 | 244 | [[package]] 245 | name = "itertools" 246 | version = "0.12.1" 247 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" 249 | dependencies = [ 250 | "either", 251 | ] 252 | 253 | [[package]] 254 | name = "itoa" 255 | version = "1.0.14" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" 258 | 259 | [[package]] 260 | name = "lazy_static" 261 | version = "1.5.0" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 264 | 265 | [[package]] 266 | name = "lazycell" 267 | version = "1.3.0" 268 | source = "registry+https://github.com/rust-lang/crates.io-index" 269 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 270 | 271 | [[package]] 272 | name = "libc" 273 | version = "0.2.169" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 276 | 277 | [[package]] 278 | name = "libloading" 279 | version = "0.8.6" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" 282 | dependencies = [ 283 | "cfg-if", 284 | "windows-targets", 285 | ] 286 | 287 | [[package]] 288 | name = "log" 289 | version = "0.4.22" 290 | source = "registry+https://github.com/rust-lang/crates.io-index" 291 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" 292 | 293 | [[package]] 294 | name = "lol_html" 295 | version = "2.3.0" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "1e24940eb633a7240c1155e61595e18ec6c72b8571837531933f19de3a8c3786" 298 | dependencies = [ 299 | "bitflags 2.6.0", 300 | "cfg-if", 301 | "cssparser", 302 | "encoding_rs", 303 | "hashbrown", 304 | "memchr", 305 | 
"mime", 306 | "selectors", 307 | "thiserror", 308 | ] 309 | 310 | [[package]] 311 | name = "magnus" 312 | version = "0.7.1" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab" 315 | dependencies = [ 316 | "magnus-macros", 317 | "rb-sys", 318 | "rb-sys-env", 319 | "seq-macro", 320 | ] 321 | 322 | [[package]] 323 | name = "magnus-macros" 324 | version = "0.6.0" 325 | source = "registry+https://github.com/rust-lang/crates.io-index" 326 | checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3" 327 | dependencies = [ 328 | "proc-macro2", 329 | "quote", 330 | "syn 2.0.93", 331 | ] 332 | 333 | [[package]] 334 | name = "matches" 335 | version = "0.1.10" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" 338 | 339 | [[package]] 340 | name = "memchr" 341 | version = "2.7.4" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 344 | 345 | [[package]] 346 | name = "mime" 347 | version = "0.3.17" 348 | source = "registry+https://github.com/rust-lang/crates.io-index" 349 | checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" 350 | 351 | [[package]] 352 | name = "minimal-lexical" 353 | version = "0.2.1" 354 | source = "registry+https://github.com/rust-lang/crates.io-index" 355 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 356 | 357 | [[package]] 358 | name = "nodrop" 359 | version = "0.1.14" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" 362 | 363 | [[package]] 364 | name = "nom" 365 | version = "7.1.3" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = 
"d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 368 | dependencies = [ 369 | "memchr", 370 | "minimal-lexical", 371 | ] 372 | 373 | [[package]] 374 | name = "phf" 375 | version = "0.8.0" 376 | source = "registry+https://github.com/rust-lang/crates.io-index" 377 | checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" 378 | dependencies = [ 379 | "phf_macros", 380 | "phf_shared", 381 | "proc-macro-hack", 382 | ] 383 | 384 | [[package]] 385 | name = "phf_codegen" 386 | version = "0.8.0" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" 389 | dependencies = [ 390 | "phf_generator", 391 | "phf_shared", 392 | ] 393 | 394 | [[package]] 395 | name = "phf_generator" 396 | version = "0.8.0" 397 | source = "registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" 399 | dependencies = [ 400 | "phf_shared", 401 | "rand", 402 | ] 403 | 404 | [[package]] 405 | name = "phf_macros" 406 | version = "0.8.0" 407 | source = "registry+https://github.com/rust-lang/crates.io-index" 408 | checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" 409 | dependencies = [ 410 | "phf_generator", 411 | "phf_shared", 412 | "proc-macro-hack", 413 | "proc-macro2", 414 | "quote", 415 | "syn 1.0.109", 416 | ] 417 | 418 | [[package]] 419 | name = "phf_shared" 420 | version = "0.8.0" 421 | source = "registry+https://github.com/rust-lang/crates.io-index" 422 | checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" 423 | dependencies = [ 424 | "siphasher", 425 | ] 426 | 427 | [[package]] 428 | name = "ppv-lite86" 429 | version = "0.2.20" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" 432 | dependencies = [ 433 | "zerocopy", 434 | 
] 435 | 436 | [[package]] 437 | name = "precomputed-hash" 438 | version = "0.1.1" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 441 | 442 | [[package]] 443 | name = "proc-macro-hack" 444 | version = "0.5.20+deprecated" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" 447 | 448 | [[package]] 449 | name = "proc-macro2" 450 | version = "1.0.92" 451 | source = "registry+https://github.com/rust-lang/crates.io-index" 452 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" 453 | dependencies = [ 454 | "unicode-ident", 455 | ] 456 | 457 | [[package]] 458 | name = "quote" 459 | version = "1.0.38" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" 462 | dependencies = [ 463 | "proc-macro2", 464 | ] 465 | 466 | [[package]] 467 | name = "rand" 468 | version = "0.7.3" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 471 | dependencies = [ 472 | "getrandom", 473 | "libc", 474 | "rand_chacha", 475 | "rand_core", 476 | "rand_hc", 477 | "rand_pcg", 478 | ] 479 | 480 | [[package]] 481 | name = "rand_chacha" 482 | version = "0.2.2" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 485 | dependencies = [ 486 | "ppv-lite86", 487 | "rand_core", 488 | ] 489 | 490 | [[package]] 491 | name = "rand_core" 492 | version = "0.5.1" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 495 | dependencies = [ 496 | "getrandom", 497 | ] 498 | 499 | 
[[package]] 500 | name = "rand_hc" 501 | version = "0.2.0" 502 | source = "registry+https://github.com/rust-lang/crates.io-index" 503 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 504 | dependencies = [ 505 | "rand_core", 506 | ] 507 | 508 | [[package]] 509 | name = "rand_pcg" 510 | version = "0.2.1" 511 | source = "registry+https://github.com/rust-lang/crates.io-index" 512 | checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" 513 | dependencies = [ 514 | "rand_core", 515 | ] 516 | 517 | [[package]] 518 | name = "rb-sys" 519 | version = "0.9.114" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "b41b4e5d871203c3ce7c7bd0f27390fb6eee494ef9c4822a151854610e562f05" 522 | dependencies = [ 523 | "rb-sys-build", 524 | ] 525 | 526 | [[package]] 527 | name = "rb-sys-build" 528 | version = "0.9.114" 529 | source = "registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "de20c3cc2868958bdf8eae7431e6797cb3ce29fcdd5bada95231c4c3c167e701" 531 | dependencies = [ 532 | "bindgen", 533 | "lazy_static", 534 | "proc-macro2", 535 | "quote", 536 | "regex", 537 | "shell-words", 538 | "syn 2.0.93", 539 | ] 540 | 541 | [[package]] 542 | name = "rb-sys-env" 543 | version = "0.1.2" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb" 546 | 547 | [[package]] 548 | name = "regex" 549 | version = "1.11.1" 550 | source = "registry+https://github.com/rust-lang/crates.io-index" 551 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 552 | dependencies = [ 553 | "aho-corasick", 554 | "memchr", 555 | "regex-automata", 556 | "regex-syntax", 557 | ] 558 | 559 | [[package]] 560 | name = "regex-automata" 561 | version = "0.4.9" 562 | source = "registry+https://github.com/rust-lang/crates.io-index" 563 | checksum = 
"809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 564 | dependencies = [ 565 | "aho-corasick", 566 | "memchr", 567 | "regex-syntax", 568 | ] 569 | 570 | [[package]] 571 | name = "regex-syntax" 572 | version = "0.8.5" 573 | source = "registry+https://github.com/rust-lang/crates.io-index" 574 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 575 | 576 | [[package]] 577 | name = "rustc-hash" 578 | version = "1.1.0" 579 | source = "registry+https://github.com/rust-lang/crates.io-index" 580 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 581 | 582 | [[package]] 583 | name = "rustc_version" 584 | version = "0.4.1" 585 | source = "registry+https://github.com/rust-lang/crates.io-index" 586 | checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" 587 | dependencies = [ 588 | "semver", 589 | ] 590 | 591 | [[package]] 592 | name = "selectors" 593 | version = "0.24.0" 594 | source = "registry+https://github.com/rust-lang/crates.io-index" 595 | checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416" 596 | dependencies = [ 597 | "bitflags 1.3.2", 598 | "cssparser", 599 | "derive_more", 600 | "fxhash", 601 | "log", 602 | "phf", 603 | "phf_codegen", 604 | "precomputed-hash", 605 | "servo_arc", 606 | "smallvec", 607 | ] 608 | 609 | [[package]] 610 | name = "selma" 611 | version = "1.0.0" 612 | dependencies = [ 613 | "enum-iterator", 614 | "escapist", 615 | "lol_html", 616 | "magnus", 617 | "rb-sys", 618 | ] 619 | 620 | [[package]] 621 | name = "semver" 622 | version = "1.0.24" 623 | source = "registry+https://github.com/rust-lang/crates.io-index" 624 | checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" 625 | 626 | [[package]] 627 | name = "seq-macro" 628 | version = "0.3.5" 629 | source = "registry+https://github.com/rust-lang/crates.io-index" 630 | checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" 631 | 
632 | [[package]] 633 | name = "servo_arc" 634 | version = "0.2.0" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741" 637 | dependencies = [ 638 | "nodrop", 639 | "stable_deref_trait", 640 | ] 641 | 642 | [[package]] 643 | name = "shell-words" 644 | version = "1.1.0" 645 | source = "registry+https://github.com/rust-lang/crates.io-index" 646 | checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" 647 | 648 | [[package]] 649 | name = "shlex" 650 | version = "1.3.0" 651 | source = "registry+https://github.com/rust-lang/crates.io-index" 652 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 653 | 654 | [[package]] 655 | name = "siphasher" 656 | version = "0.3.11" 657 | source = "registry+https://github.com/rust-lang/crates.io-index" 658 | checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" 659 | 660 | [[package]] 661 | name = "smallvec" 662 | version = "1.13.2" 663 | source = "registry+https://github.com/rust-lang/crates.io-index" 664 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 665 | 666 | [[package]] 667 | name = "stable_deref_trait" 668 | version = "1.2.0" 669 | source = "registry+https://github.com/rust-lang/crates.io-index" 670 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 671 | 672 | [[package]] 673 | name = "syn" 674 | version = "1.0.109" 675 | source = "registry+https://github.com/rust-lang/crates.io-index" 676 | checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" 677 | dependencies = [ 678 | "proc-macro2", 679 | "quote", 680 | "unicode-ident", 681 | ] 682 | 683 | [[package]] 684 | name = "syn" 685 | version = "2.0.93" 686 | source = "registry+https://github.com/rust-lang/crates.io-index" 687 | checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058" 688 | 
dependencies = [ 689 | "proc-macro2", 690 | "quote", 691 | "unicode-ident", 692 | ] 693 | 694 | [[package]] 695 | name = "thiserror" 696 | version = "2.0.9" 697 | source = "registry+https://github.com/rust-lang/crates.io-index" 698 | checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" 699 | dependencies = [ 700 | "thiserror-impl", 701 | ] 702 | 703 | [[package]] 704 | name = "thiserror-impl" 705 | version = "2.0.9" 706 | source = "registry+https://github.com/rust-lang/crates.io-index" 707 | checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" 708 | dependencies = [ 709 | "proc-macro2", 710 | "quote", 711 | "syn 2.0.93", 712 | ] 713 | 714 | [[package]] 715 | name = "unicode-ident" 716 | version = "1.0.14" 717 | source = "registry+https://github.com/rust-lang/crates.io-index" 718 | checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" 719 | 720 | [[package]] 721 | name = "wasi" 722 | version = "0.9.0+wasi-snapshot-preview1" 723 | source = "registry+https://github.com/rust-lang/crates.io-index" 724 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 725 | 726 | [[package]] 727 | name = "windows-targets" 728 | version = "0.52.6" 729 | source = "registry+https://github.com/rust-lang/crates.io-index" 730 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 731 | dependencies = [ 732 | "windows_aarch64_gnullvm", 733 | "windows_aarch64_msvc", 734 | "windows_i686_gnu", 735 | "windows_i686_gnullvm", 736 | "windows_i686_msvc", 737 | "windows_x86_64_gnu", 738 | "windows_x86_64_gnullvm", 739 | "windows_x86_64_msvc", 740 | ] 741 | 742 | [[package]] 743 | name = "windows_aarch64_gnullvm" 744 | version = "0.52.6" 745 | source = "registry+https://github.com/rust-lang/crates.io-index" 746 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 747 | 748 | [[package]] 749 | name = "windows_aarch64_msvc" 750 | version = "0.52.6" 751 
| source = "registry+https://github.com/rust-lang/crates.io-index" 752 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 753 | 754 | [[package]] 755 | name = "windows_i686_gnu" 756 | version = "0.52.6" 757 | source = "registry+https://github.com/rust-lang/crates.io-index" 758 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 759 | 760 | [[package]] 761 | name = "windows_i686_gnullvm" 762 | version = "0.52.6" 763 | source = "registry+https://github.com/rust-lang/crates.io-index" 764 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 765 | 766 | [[package]] 767 | name = "windows_i686_msvc" 768 | version = "0.52.6" 769 | source = "registry+https://github.com/rust-lang/crates.io-index" 770 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 771 | 772 | [[package]] 773 | name = "windows_x86_64_gnu" 774 | version = "0.52.6" 775 | source = "registry+https://github.com/rust-lang/crates.io-index" 776 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 777 | 778 | [[package]] 779 | name = "windows_x86_64_gnullvm" 780 | version = "0.52.6" 781 | source = "registry+https://github.com/rust-lang/crates.io-index" 782 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 783 | 784 | [[package]] 785 | name = "windows_x86_64_msvc" 786 | version = "0.52.6" 787 | source = "registry+https://github.com/rust-lang/crates.io-index" 788 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 789 | 790 | [[package]] 791 | name = "zerocopy" 792 | version = "0.7.35" 793 | source = "registry+https://github.com/rust-lang/crates.io-index" 794 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 795 | dependencies = [ 796 | "byteorder", 797 | "zerocopy-derive", 798 | ] 799 | 800 | [[package]] 801 | name = "zerocopy-derive" 802 | version = "0.7.35" 803 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 804 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 805 | dependencies = [ 806 | "proc-macro2", 807 | "quote", 808 | "syn 2.0.93", 809 | ] 810 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # This Cargo.toml is here to let external tools (IDEs, etc.) know that this is 2 | # a Rust project. Your extension's dependencies should be added to the Cargo.toml 3 | # in the ext/ directory. 4 | 5 | [workspace] 6 | members = ["ext/selma"] 7 | resolver = "2" 8 | 9 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | # Specified gem's dependencies in selma.gemspec 6 | gemspec 7 | 8 | group :debug do 9 | gem "amazing_print" 10 | gem "debug" 11 | end 12 | 13 | group :development, :test do 14 | gem "ruby_memcheck" 15 | end 16 | 17 | group :test do 18 | gem "gemojione", "~> 4.3", require: false 19 | gem "minitest", "~> 5.0" 20 | gem "minitest-focus", "~> 1.2" 21 | gem "minitest-spec-context", "~> 0.0.4" 22 | end 23 | 24 | group :lint do 25 | gem "rubocop-standard" 26 | end 27 | 28 | group :benchmark do 29 | gem "benchmark-ips" 30 | gem "nokolexbor" 31 | gem "sanitize" 32 | end 33 | 34 | gem "ruby-lsp", "~> 0.11", group: :development 35 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Garen J. 
Torikian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Selma 2 | 3 | Selma **sel**ects and **ma**tches HTML nodes using CSS rules. (It can also reject/delete nodes, but then the name isn't as cool.) It's mostly an idiomatic wrapper around Cloudflare's [lol-html](https://github.com/cloudflare/lol-html) project. 4 | 5 | ![Principal Skinner asking Selma after their date: 'Isn't it nice we hate the same things?'](https://user-images.githubusercontent.com/64050/207155384-14e8bd40-780c-466f-bfff-31a8a8fc3d25.jpg) 6 | 7 | Selma's strength (aside from being backed by Rust) is that HTML content is parsed _once_ and can be manipulated multiple times. 
8 | 9 | ## Installation 10 | 11 | Add this line to your application's Gemfile: 12 | 13 | ```ruby 14 | gem 'selma' 15 | ``` 16 | 17 | And then execute: 18 | 19 | $ bundle install 20 | 21 | Or install it yourself as: 22 | 23 | $ gem install selma 24 | 25 | ## Usage 26 | 27 | Selma can perform two different actions, either independently or together: 28 | 29 | - Sanitize HTML, through a [Sanitize](https://github.com/rgrove/sanitize)-like allowlist syntax; and 30 | - Select HTML using CSS rules, and manipulate elements and text nodes along the way. 31 | 32 | It does this through two kwargs: `sanitizer` and `handlers`. The basic API for Selma looks like this: 33 | 34 | ```ruby 35 | sanitizer_config = { 36 | elements: ["b", "em", "i", "strong", "u"], 37 | } 38 | sanitizer = Selma::Sanitizer.new(sanitizer_config) 39 | rewriter = Selma::Rewriter.new(sanitizer: sanitizer, handlers: [MatchElementRewrite.new, MatchTextRewrite.new]) 40 | # removes any element that is not ["b", "em", "i", "strong", "u"]; 41 | # then calls `MatchElementRewrite` and `MatchTextRewrite` on matching HTML elements 42 | rewriter.rewrite(html) 43 | ``` 44 | 45 | Here's a look at each individual part. 46 | 47 | ### Sanitization config 48 | 49 | Selma sanitizes by default. That is, even if the `sanitizer` kwarg is not passed in, sanitization occurs. If you truly want to disable HTML sanitization (for some reason), pass `nil`: 50 | 51 | ```ruby 52 | Selma::Rewriter.new(sanitizer: nil) # dangerous and ill-advised 53 | ``` 54 | 55 | The configuration for the sanitization process is based on the following key-value hash allowlist: 56 | 57 | ```ruby 58 | # Whether or not to allow HTML comments. 59 | allow_comments: false, 60 | 61 | # Whether or not to allow well-formed HTML doctype declarations such as 62 | # "<!DOCTYPE html>" when sanitizing a document. 63 | allow_doctype: false, 64 | 65 | # HTML elements to allow. By default, no elements are allowed (which means 66 | # that all HTML will be stripped). 
67 | elements: ["a", "b", "img", ], 68 | 69 | # HTML attributes to allow in specific elements. The key is the name of the element, 70 | # and the value is an array of allowed attributes. By default, no attributes 71 | # are allowed. 72 | attributes: { 73 | "a" => ["href"], 74 | "img" => ["src"], 75 | }, 76 | 77 | # URL handling protocols to allow in specific attributes. By default, no 78 | # protocols are allowed. Use :relative in place of a protocol if you want 79 | # to allow relative URLs sans protocol. Set to `:all` to allow any protocol. 80 | protocols: { 81 | "a" => { "href" => ["http", "https", "mailto", :relative] }, 82 | "img" => { "src" => ["http", "https"] }, 83 | }, 84 | 85 | # An Array of element names whose contents will be removed. The contents 86 | # of all other filtered elements will be left behind. 87 | remove_contents: ["iframe", "math", "noembed", "noframes", "noscript"], 88 | 89 | # Elements which, when removed, should have their contents surrounded by 90 | # whitespace. 91 | whitespace_elements: ["blockquote", "h1", "h2", "h3", "h4", "h5", "h6", ] 92 | ``` 93 | 94 | ### Defining handlers 95 | 96 | The real power in Selma comes in its use of handlers. A handler is simply an object with various methods defined: 97 | 98 | - `selector`, a method which MUST return an instance of `Selma::Selector`, defining the CSS rules to match 99 | - `handle_element`, a method that's called on each matched element 100 | - `handle_text_chunk`, a method that's called on each matched text node 101 | 102 | Here's an example which rewrites the `href` attribute on `a` and the `src` attribute on `img` to be `https` rather than `http`. 
103 | 104 | ```ruby 105 | class MatchAttribute 106 | SELECTOR = Selma::Selector.new(match_element: %(a[href^="http:"], img[src^="http:"])) 107 | 108 | def selector 109 | SELECTOR 110 | end 111 | 112 | def handle_element(element) 113 | if element.tag_name == "a" 114 | element["href"] = rename_http(element["href"]) 115 | elsif element.tag_name == "img" 116 | element["src"] = rename_http(element["src"]) 117 | end 118 | end 119 | 120 | private def rename_http(link) 121 | link.sub("http", "https") 122 | end 123 | end 124 | 125 | rewriter = Selma::Rewriter.new(handlers: [MatchAttribute.new]) 126 | ``` 127 | 128 | The `Selma::Selector` object has three possible kwargs: 129 | 130 | - `match_element`: any element which matches this CSS rule will be passed on to `handle_element` 131 | - `match_text_within`: any text_chunk which matches this CSS rule will be passed on to `handle_text_chunk` 132 | - `ignore_text_within`: this is an array of element names whose text contents will be ignored 133 | 134 | Here's an example for `handle_text_chunk` which changes strings in various elements which are _not_ `pre` or `code`: 135 | 136 | ```ruby 137 | class MatchText 138 | SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: ["pre", "code"]) 139 | 140 | def selector 141 | SELECTOR 142 | end 143 | 144 | def handle_text_chunk(text) 145 | text.replace(text.to_s, text.sub(/@.+/, "")) 146 | end 147 | end 148 | 149 | rewriter = Selma::Rewriter.new(handlers: [MatchText.new]) 150 | ``` 151 | 152 | #### `element` methods 153 | 154 | The `element` argument in `handle_element` has the following methods: 155 | 156 | - `tag_name`: Gets the element's name 157 | - `tag_name=`: Sets the element's name 158 | - `self_closing?`: A bool which identifies whether or not the element is self-closing 159 | - `[]`: Get an attribute 160 | - `[]=`: Set an attribute 161 | - `remove_attribute`: Remove an attribute 162 | - `has_attribute?`: A bool which identifies whether or not the element has 
an attribute 163 | - `attributes`: List all the attributes 164 | - `ancestors`: List all of an element's ancestors as an array of strings 165 | - `before(content, as: content_type)`: Inserts `content` before the element. `content_type` is either `:text` or `:html` and determines how the content will be applied. 166 | - `after(content, as: content_type)`: Inserts `content` after the element. `content_type` is either `:text` or `:html` and determines how the content will be applied. 167 | - `prepend(content, as: content_type)`: prepends `content` to the element's inner content, i.e. inserts content right after the element's start tag. `content_type` is either `:text` or `:html` and determines how the content will be applied. 168 | - `append(content, as: content_type)`: appends `content` to the element's inner content, i.e. inserts content right before the element's end tag. `content_type` is either `:text` or `:html` and determines how the content will be applied. 169 | - `set_inner_content(content, as: content_type)`: Replaces inner content of the element with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied. 170 | - `remove`: Removes the element and its inner content. 171 | - `remove_and_keep_content`: Removes the element, but keeps its content, i.e. removes the start and end tags of the element. 172 | - `removed?`: A bool which identifies if the element has been removed or replaced with some content. 173 | 174 | #### `text_chunk` methods 175 | 176 | - `to_s` / `.content`: Gets the text node's content 177 | - `text_type`: identifies the type of text in the text node 178 | - `before(content, as: content_type)`: Inserts `content` before the text. `content_type` is either `:text` or `:html` and determines how the content will be applied. 179 | - `after(content, as: content_type)`: Inserts `content` after the text. `content_type` is either `:text` or `:html` and determines how the content will be applied. 
180 | - `replace(content, as: content_type)`: Replaces the text node with `content`. `content_type` is either `:text` or `:html` and determines how the content will be applied. 181 | 182 | ## Security 183 | 184 | Theoretically, a malicious user can provide a very large document for processing, which can exhaust the memory of the host machine. To set a limit on how much string content is processed at once, you can provide `memory` options: 185 | 186 | ```ruby 187 | Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 1_000_000 } }) # ~1MB 188 | ``` 189 | 190 | The structure of the `memory` options looks like this: 191 | 192 | ```ruby 193 | { 194 | memory: { 195 | max_allowed_memory_usage: 1000, 196 | preallocated_parsing_buffer_size: 100, 197 | } 198 | } 199 | ``` 200 | 201 | Note that `preallocated_parsing_buffer_size` must always be less than `max_allowed_memory_usage`. See [the `lol_html` project documentation](https://docs.rs/lol_html/1.2.1/lol_html/struct.MemorySettings.html) to learn more about the default values. 202 | 203 | ## Benchmarks 204 | 205 | When you run `bundle exec rake benchmark`, two different benchmarks are calculated. Here are those results on my machine. 206 | 207 | ### Benchmarks for just the sanitization process 208 | 209 | Comparing Selma against popular Ruby sanitization gems: 210 | 211 | 212 |
213 |
214 | input size = 25309 bytes, 0.03 MB
215 | 
216 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
217 | Warming up --------------------------------------
218 | sanitize-sm 15.000 i/100ms
219 | selma-sm 127.000 i/100ms
220 | Calculating -------------------------------------
221 | sanitize-sm 157.643 (± 1.9%) i/s - 4.740k in 30.077172s
222 | selma-sm 1.278k (± 1.5%) i/s - 38.354k in 30.019722s
223 | 
224 | Comparison:
225 | selma-sm: 1277.9 i/s
226 | sanitize-sm: 157.6 i/s - 8.11x slower
227 | 
228 | input size = 86686 bytes, 0.09 MB
229 | 
230 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
231 | Warming up --------------------------------------
232 | sanitize-md 4.000 i/100ms
233 | selma-md 33.000 i/100ms
234 | Calculating -------------------------------------
235 | sanitize-md 40.034 (± 5.0%) i/s - 1.200k in 30.043322s
236 | selma-md 332.959 (± 2.1%) i/s - 9.999k in 30.045733s
237 | 
238 | Comparison:
239 | selma-md: 333.0 i/s
240 | sanitize-md: 40.0 i/s - 8.32x slower
241 | 
242 | input size = 7172510 bytes, 7.17 MB
243 | 
244 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
245 | Warming up --------------------------------------
246 | sanitize-lg 1.000 i/100ms
247 | selma-lg 1.000 i/100ms
248 | Calculating -------------------------------------
249 | sanitize-lg 0.141 (± 0.0%) i/s - 5.000 in 35.426127s
250 | selma-lg 3.963 (± 0.0%) i/s - 119.000 in 30.037386s
251 | 
252 | Comparison:
253 | selma-lg: 4.0 i/s
254 | sanitize-lg: 0.1 i/s - 28.03x slower
255 | 
256 | 
257 |
258 | 259 | 260 | ### Benchmarks for just the rewriting process 261 | 262 | Comparing Selma against popular Ruby HTML parsing gems: 263 | 264 | 265 |
266 |
267 | input size = 25309 bytes, 0.03 MB
268 | 
269 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
270 | Warming up --------------------------------------
271 | nokogiri-sm 79.000 i/100ms
272 | nokolexbor-sm 295.000 i/100ms
273 | selma-sm 237.000 i/100ms
274 | Calculating -------------------------------------
275 | nokogiri-sm 800.531 (± 2.2%) i/s - 24.016k in 30.016056s
276 | nokolexbor-sm 3.033k (± 3.6%) i/s - 91.155k in 30.094884s
277 | selma-sm 2.386k (± 1.6%) i/s - 71.574k in 30.001701s
278 | 
279 | Comparison:
280 | nokolexbor-sm: 3033.1 i/s
281 | selma-sm: 2386.3 i/s - 1.27x slower
282 | nokogiri-sm: 800.5 i/s - 3.79x slower
283 | 
284 | input size = 86686 bytes, 0.09 MB
285 | 
286 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
287 | Warming up --------------------------------------
288 | nokogiri-md 8.000 i/100ms
289 | nokolexbor-md 43.000 i/100ms
290 | selma-md 38.000 i/100ms
291 | Calculating -------------------------------------
292 | nokogiri-md 85.013 (± 8.2%) i/s - 2.024k in 52.257472s
293 | nokolexbor-md 416.074 (±11.1%) i/s - 12.341k in 30.111613s
294 | selma-md 361.471 (± 4.7%) i/s - 10.830k in 30.033997s
295 | 
296 | Comparison:
297 | nokolexbor-md: 416.1 i/s
298 | selma-md: 361.5 i/s - same-ish: difference falls within error
299 | nokogiri-md: 85.0 i/s - 4.89x slower
300 | 
301 | input size = 7172510 bytes, 7.17 MB
302 | 
303 | ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin23]
304 | Warming up --------------------------------------
305 | nokogiri-lg 1.000 i/100ms
306 | nokolexbor-lg 1.000 i/100ms
307 | selma-lg 1.000 i/100ms
308 | Calculating -------------------------------------
309 | nokogiri-lg 0.805 (± 0.0%) i/s - 25.000 in 31.148730s
310 | nokolexbor-lg 2.194 (± 0.0%) i/s - 66.000 in 30.278108s
311 | selma-lg 5.541 (± 0.0%) i/s - 166.000 in 30.037197s
312 | 
313 | Comparison:
314 | selma-lg: 5.5 i/s
315 | nokolexbor-lg: 2.2 i/s - 2.53x slower
316 | nokogiri-lg: 0.8 i/s - 6.88x slower
317 | 
318 | 
319 |
320 | 321 | 322 | ## Contributing 323 | 324 | Bug reports and pull requests are welcome on GitHub at https://github.com/gjtorikian/selma. This project is a safe, welcoming space for collaboration. 325 | 326 | ## Acknowledgements 327 | 328 | - https://github.com/flavorjones/ruby-c-extensions-explained#strategy-3-precompiled and [Nokogiri](https://github.com/sparklemotion/nokogiri) for hints on how to ship precompiled cross-platform gems 329 | - @vmg for his work at GitHub on goomba, from which some design patterns were learned 330 | - [sanitize](https://github.com/rgrove/sanitize) for a comprehensive configuration API and test suite 331 | 332 | ## License 333 | 334 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 335 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | if ENV.fetch("DEBUG", false) 4 | require "amazing_print" 5 | require "debug" 6 | end 7 | 8 | # Gem Spec 9 | require "bundler/gem_tasks" 10 | SELMA_SPEC = Gem::Specification.load("selma.gemspec") 11 | 12 | # Packaging 13 | require "rubygems/package_task" 14 | gem_path = Gem::PackageTask.new(SELMA_SPEC).define 15 | desc "Package the Ruby gem" 16 | task "package" => [gem_path] 17 | 18 | task default: [:test, :rubocop] 19 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'bundler/setup' 5 | require 'selma' 6 | require 'irb' 7 | 8 | puts <<~TEXT 9 | ------------------- 10 | Example Usage: 11 | 12 | html = 'Test of the emergency system' 13 | sanitizer = Selma::Sanitizer.new(Selma::Sanitizer::Config::DEFAULT) # or try RELAXED 14 | rewriter = Selma::Rewriter.new(sanitizer: sanitizer) 15 
| rewriter.rewrite(html) 16 | ------------------- 17 | 18 | TEXT 19 | 20 | IRB.start(__FILE__) 21 | -------------------------------------------------------------------------------- /bin/rake: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # 5 | # This file was generated by Bundler. 6 | # 7 | # The application 'rake' is installed as part of a gem, and 8 | # this file is here to facilitate running it. 9 | # 10 | 11 | ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../Gemfile", __dir__) 12 | 13 | bundle_binstub = File.expand_path("bundle", __dir__) 14 | 15 | if File.file?(bundle_binstub) 16 | if File.read(bundle_binstub, 300).include?("This file was generated by Bundler") 17 | load(bundle_binstub) 18 | else 19 | abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run. 20 | Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.") 21 | end 22 | end 23 | 24 | require "rubygems" 25 | require "bundler/setup" 26 | 27 | load Gem.bin_path("rake", "rake") 28 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /ext/selma/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "selma" 3 | version = "1.0.0" 4 | edition = "2021" 5 | rust-version = "1.75.0" 6 | publish = false 7 | 8 | [dependencies] 9 | enum-iterator = "2.1" 10 | escapist = "0.0.2" 11 | magnus = { version = "0.7", features = ["rb-sys"] } 12 | rb-sys = { version = "*", default-features = false, features = [ 13 | 
"stable-api-compiled-fallback", 14 | ] } 15 | lol_html = "2.3" 16 | 17 | [lib] 18 | name = "selma" 19 | crate-type = ["cdylib"] 20 | -------------------------------------------------------------------------------- /ext/selma/extconf.rb: -------------------------------------------------------------------------------- 1 | require "mkmf" 2 | require "rb_sys/mkmf" 3 | 4 | create_rust_makefile("selma/selma") 5 | -------------------------------------------------------------------------------- /ext/selma/src/html.rs: -------------------------------------------------------------------------------- 1 | use magnus::{Error, Module, RModule}; 2 | 3 | #[derive(Clone, Debug)] 4 | #[magnus::wrap(class = "Selma::HTML")] 5 | pub(crate) struct SelmaHTML {} 6 | 7 | pub fn init(m_selma: RModule) -> Result<(), Error> { 8 | let c_html = m_selma 9 | .define_class("HTML", magnus::class::object()) 10 | .expect("cannot define class Selma::HTML"); 11 | 12 | element::init(c_html).expect("cannot define Selma::HTML::Element class"); 13 | end_tag::init(c_html).expect("cannot define Selma::HTML::EndTag class"); 14 | text_chunk::init(c_html).expect("cannot define Selma::HTML::TextChunk class"); 15 | 16 | Ok(()) 17 | } 18 | 19 | pub mod element; 20 | pub mod end_tag; 21 | pub mod text_chunk; 22 | -------------------------------------------------------------------------------- /ext/selma/src/html/element.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | 3 | use crate::native_ref_wrap::NativeRefWrap; 4 | use lol_html::html_content::Element; 5 | use magnus::{exception, method, Error, Module, RArray, RClass, RHash, RString, Value}; 6 | 7 | struct HTMLElement { 8 | element: NativeRefWrap>, 9 | ancestors: Vec, 10 | } 11 | 12 | #[magnus::wrap(class = "Selma::HTML::Element")] 13 | pub struct SelmaHTMLElement(RefCell); 14 | 15 | /// SAFETY: This is safe because we only access this data when the GVL is held. 
16 | unsafe impl Send for SelmaHTMLElement {} 17 | 18 | impl SelmaHTMLElement { 19 | pub fn new(ref_wrap: NativeRefWrap>, ancestors: &[String]) -> Self { 20 | Self(RefCell::new(HTMLElement { 21 | element: ref_wrap, 22 | ancestors: ancestors.to_owned(), 23 | })) 24 | } 25 | 26 | fn tag_name(&self) -> Result { 27 | let binding = self.0.borrow(); 28 | 29 | match binding.element.get() { 30 | Ok(e) => Ok(e.tag_name().to_string()), 31 | Err(_) => Err(Error::new( 32 | exception::runtime_error(), 33 | "`tag_name` is not available", 34 | )), 35 | } 36 | } 37 | 38 | fn set_tag_name(&self, name: String) -> Result<(), Error> { 39 | let mut binding = self.0.borrow_mut(); 40 | 41 | if let Ok(element) = binding.element.get_mut() { 42 | match element.set_tag_name(&name) { 43 | Ok(_) => Ok(()), 44 | Err(err) => Err(Error::new(exception::runtime_error(), format!("{err:?}"))), 45 | } 46 | } else { 47 | Err(Error::new( 48 | exception::runtime_error(), 49 | "`set_tag_name` is not available", 50 | )) 51 | } 52 | } 53 | 54 | fn is_self_closing(&self) -> Result { 55 | let binding = self.0.borrow(); 56 | 57 | if let Ok(e) = binding.element.get() { 58 | Ok(e.is_self_closing()) 59 | } else { 60 | Err(Error::new( 61 | exception::runtime_error(), 62 | "`is_self_closing` is not available", 63 | )) 64 | } 65 | } 66 | 67 | fn has_attribute(&self, attr: String) -> Result { 68 | let binding = self.0.borrow(); 69 | 70 | if let Ok(e) = binding.element.get() { 71 | Ok(e.has_attribute(&attr)) 72 | } else { 73 | Err(Error::new( 74 | exception::runtime_error(), 75 | "`is_self_closing` is not available", 76 | )) 77 | } 78 | } 79 | 80 | fn get_attribute(&self, attr: String) -> Option { 81 | let binding = self.0.borrow(); 82 | let element = binding.element.get(); 83 | element.unwrap().get_attribute(&attr) 84 | } 85 | 86 | fn set_attribute(&self, attr: String, value: String) -> Result { 87 | let mut binding = self.0.borrow_mut(); 88 | if let Ok(element) = binding.element.get_mut() { 89 | match 
element.set_attribute(&attr, &value) { 90 | Ok(_) => Ok(value), 91 | Err(err) => Err(Error::new( 92 | exception::runtime_error(), 93 | format!("AttributeNameError: {err:?}"), 94 | )), 95 | } 96 | } else { 97 | Err(Error::new( 98 | exception::runtime_error(), 99 | "`set_attribute` is not available", 100 | )) 101 | } 102 | } 103 | 104 | fn remove_attribute(&self, attr: String) { 105 | let mut binding = self.0.borrow_mut(); 106 | 107 | if let Ok(e) = binding.element.get_mut() { 108 | e.remove_attribute(&attr) 109 | } 110 | } 111 | 112 | fn get_attributes(&self) -> Result { 113 | let binding = self.0.borrow(); 114 | let hash = RHash::new(); 115 | 116 | if let Ok(e) = binding.element.get() { 117 | e.attributes() 118 | .iter() 119 | .for_each(|attr| match hash.aset(attr.name(), attr.value()) { 120 | Ok(_) => {} 121 | Err(err) => panic!( 122 | "{:?}", 123 | Error::new( 124 | exception::runtime_error(), 125 | format!("AttributeNameError: {err:?}"), 126 | ) 127 | ), 128 | }); 129 | } 130 | Ok(hash) 131 | } 132 | 133 | fn get_ancestors(&self) -> Result { 134 | let binding = self.0.borrow(); 135 | let array = RArray::new(); 136 | 137 | binding 138 | .ancestors 139 | .iter() 140 | .for_each(|ancestor| match array.push(RString::new(ancestor)) { 141 | Ok(_) => {} 142 | Err(err) => { 143 | panic!( 144 | "{:?}", 145 | Error::new(exception::runtime_error(), format!("{err:?}")) 146 | ) 147 | } 148 | }); 149 | 150 | Ok(array) 151 | } 152 | 153 | fn before(&self, args: &[Value]) -> Result<(), Error> { 154 | let mut binding = self.0.borrow_mut(); 155 | let element = binding.element.get_mut().unwrap(); 156 | 157 | let (text_str, content_type) = match crate::scan_text_args(args) { 158 | Ok((text_str, content_type)) => (text_str, content_type), 159 | Err(err) => return Err(err), 160 | }; 161 | 162 | element.before(&text_str, content_type); 163 | 164 | Ok(()) 165 | } 166 | 167 | fn after(&self, args: &[Value]) -> Result<(), Error> { 168 | let mut binding = self.0.borrow_mut(); 169 | let element
= binding.element.get_mut().unwrap(); 170 | 171 | let (text_str, content_type) = match crate::scan_text_args(args) { 172 | Ok((text_str, content_type)) => (text_str, content_type), 173 | Err(err) => return Err(err), 174 | }; 175 | 176 | element.after(&text_str, content_type); 177 | 178 | Ok(()) 179 | } 180 | 181 | fn prepend(&self, args: &[Value]) -> Result<(), Error> { 182 | let mut binding = self.0.borrow_mut(); 183 | let element = binding.element.get_mut().unwrap(); 184 | 185 | let (text_str, content_type) = match crate::scan_text_args(args) { 186 | Ok((text_str, content_type)) => (text_str, content_type), 187 | Err(err) => return Err(err), 188 | }; 189 | 190 | element.prepend(&text_str, content_type); 191 | 192 | Ok(()) 193 | } 194 | 195 | fn append(&self, args: &[Value]) -> Result<(), Error> { 196 | let mut binding = self.0.borrow_mut(); 197 | let element = binding.element.get_mut().unwrap(); 198 | 199 | let (text_str, content_type) = match crate::scan_text_args(args) { 200 | Ok((text_str, content_type)) => (text_str, content_type), 201 | Err(err) => return Err(err), 202 | }; 203 | 204 | element.append(&text_str, content_type); 205 | 206 | Ok(()) 207 | } 208 | 209 | fn set_inner_content(&self, args: &[Value]) -> Result<(), Error> { 210 | let mut binding = self.0.borrow_mut(); 211 | let element = binding.element.get_mut().unwrap(); 212 | 213 | let (inner_content, content_type) = match crate::scan_text_args(args) { 214 | Ok((inner_content, content_type)) => (inner_content, content_type), 215 | Err(err) => return Err(err), 216 | }; 217 | 218 | element.set_inner_content(&inner_content, content_type); 219 | 220 | Ok(()) 221 | } 222 | 223 | fn remove(&self) { 224 | let mut binding = self.0.borrow_mut(); 225 | 226 | if let Ok(e) = binding.element.get_mut() { 227 | e.remove() 228 | } 229 | } 230 | 231 | fn remove_and_keep_content(&self) -> Result<(), Error> { 232 | self.0 233 | .borrow_mut() 234 | .element 235 | .get_mut() 236 | .unwrap() 237 | 
.remove_and_keep_content(); 238 | Ok(()) 239 | } 240 | 241 | fn is_removed(&self) -> Result { 242 | let binding = self.0.borrow(); 243 | 244 | match binding.element.get() { 245 | Ok(e) => Ok(e.removed()), 246 | Err(_) => Err(Error::new( 247 | exception::runtime_error(), 248 | "`is_removed` is not available", 249 | )), 250 | } 251 | } 252 | } 253 | 254 | pub fn init(c_html: RClass) -> Result<(), Error> { 255 | let c_element = c_html 256 | .define_class("Element", magnus::class::object()) 257 | .expect("cannot define class Selma::HTML::Element"); 258 | 259 | c_element.define_method("tag_name", method!(SelmaHTMLElement::tag_name, 0))?; 260 | c_element.define_method("tag_name=", method!(SelmaHTMLElement::set_tag_name, 1))?; 261 | c_element.define_method( 262 | "self_closing?", 263 | method!(SelmaHTMLElement::is_self_closing, 0), 264 | )?; 265 | c_element.define_method("[]", method!(SelmaHTMLElement::get_attribute, 1))?; 266 | c_element.define_method("[]=", method!(SelmaHTMLElement::set_attribute, 2))?; 267 | c_element.define_method( 268 | "remove_attribute", 269 | method!(SelmaHTMLElement::remove_attribute, 1), 270 | )?; 271 | c_element.define_method( 272 | "has_attribute?", 273 | method!(SelmaHTMLElement::has_attribute, 1), 274 | )?; 275 | c_element.define_method("attributes", method!(SelmaHTMLElement::get_attributes, 0))?; 276 | c_element.define_method("ancestors", method!(SelmaHTMLElement::get_ancestors, 0))?; 277 | 278 | c_element.define_method("before", method!(SelmaHTMLElement::before, -1))?; 279 | c_element.define_method("after", method!(SelmaHTMLElement::after, -1))?; 280 | c_element.define_method("prepend", method!(SelmaHTMLElement::prepend, -1))?; 281 | c_element.define_method("append", method!(SelmaHTMLElement::append, -1))?; 282 | c_element.define_method( 283 | "set_inner_content", 284 | method!(SelmaHTMLElement::set_inner_content, -1), 285 | )?; 286 | 287 | c_element.define_method("remove", method!(SelmaHTMLElement::remove, 0))?; 288 | 
c_element.define_method( 289 | "remove_and_keep_content", 290 | method!(SelmaHTMLElement::remove_and_keep_content, 0), 291 | )?; 292 | c_element.define_method("removed?", method!(SelmaHTMLElement::is_removed, 0))?; 293 | 294 | Ok(()) 295 | } 296 | -------------------------------------------------------------------------------- /ext/selma/src/html/end_tag.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | 3 | use crate::native_ref_wrap::NativeRefWrap; 4 | use lol_html::html_content::EndTag; 5 | use magnus::{method, Error, Module, RClass}; 6 | 7 | struct HTMLEndTag { 8 | end_tag: NativeRefWrap>, 9 | } 10 | 11 | #[magnus::wrap(class = "Selma::HTML::EndTag")] 12 | pub struct SelmaHTMLEndTag(RefCell); 13 | 14 | /// SAFETY: This is safe because we only access this data when the GVL is held. 15 | unsafe impl Send for SelmaHTMLEndTag {} 16 | 17 | impl SelmaHTMLEndTag { 18 | pub fn new(ref_wrap: NativeRefWrap>) -> Self { 19 | Self(RefCell::new(HTMLEndTag { end_tag: ref_wrap })) 20 | } 21 | 22 | fn tag_name(&self) -> String { 23 | self.0.borrow().end_tag.get().unwrap().name() 24 | } 25 | } 26 | 27 | pub fn init(c_html: RClass) -> Result<(), Error> { 28 | let c_end_tag = c_html 29 | .define_class("EndTag", magnus::class::object()) 30 | .expect("cannot define class Selma::HTML::EndTag"); 31 | 32 | c_end_tag.define_method("tag_name", method!(SelmaHTMLEndTag::tag_name, 0))?; 33 | 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /ext/selma/src/html/text_chunk.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | 3 | use crate::native_ref_wrap::NativeRefWrap; 4 | use lol_html::html_content::{TextChunk, TextType}; 5 | use magnus::{exception, method, Error, Module, RClass, Symbol, Value}; 6 | 7 | struct HTMLTextChunk { 8 | text_chunk: NativeRefWrap>, 9 | buffer: String, 10 | } 11 | 12 | macro_rules! 
clone_buffer_if_not_empty { 13 | ($binding:expr, $buffer:expr) => { 14 | if !$binding.buffer.is_empty() { 15 | $buffer.clone_from(&$binding.buffer); 16 | } 17 | }; 18 | } 19 | 20 | // if this is the first time we're processing this text chunk (buffer is empty), 21 | // we carry on. Otherwise, we need to use the buffer text, not the text chunk, 22 | // because lol-html is not designed in such a way to keep track of text chunks. 23 | macro_rules! set_text_chunk_to_buffer { 24 | ($text_chunk:expr, $buffer:expr) => { 25 | if !$buffer.is_empty() { 26 | $text_chunk.set_str($buffer); 27 | } 28 | }; 29 | } 30 | 31 | #[magnus::wrap(class = "Selma::HTML::TextChunk")] 32 | pub struct SelmaHTMLTextChunk(RefCell); 33 | 34 | /// SAFETY: This is safe because we only access this data when the GVL is held. 35 | unsafe impl Send for SelmaHTMLTextChunk {} 36 | 37 | impl SelmaHTMLTextChunk { 38 | pub fn new(ref_wrap: NativeRefWrap>) -> Self { 39 | Self(RefCell::new(HTMLTextChunk { 40 | text_chunk: ref_wrap, 41 | buffer: String::new(), 42 | })) 43 | } 44 | 45 | fn to_s(&self) -> Result { 46 | let binding = self.0.borrow(); 47 | 48 | if let Ok(tc) = binding.text_chunk.get() { 49 | Ok(tc.as_str().to_string()) 50 | } else { 51 | Err(Error::new( 52 | exception::runtime_error(), 53 | "`to_s` is not available", 54 | )) 55 | } 56 | } 57 | 58 | fn text_type(&self) -> Result { 59 | let binding = self.0.borrow(); 60 | 61 | if let Ok(tc) = binding.text_chunk.get() { 62 | match tc.text_type() { 63 | TextType::Data => Ok(Symbol::new("data")), 64 | TextType::PlainText => Ok(Symbol::new("plain_text")), 65 | TextType::RawText => Ok(Symbol::new("raw_text")), 66 | TextType::ScriptData => Ok(Symbol::new("script")), 67 | TextType::RCData => Ok(Symbol::new("rc_data")), 68 | TextType::CDataSection => Ok(Symbol::new("cdata_section")), 69 | } 70 | } else { 71 | Err(Error::new( 72 | exception::runtime_error(), 73 | "`text_type` is not available", 74 | )) 75 | } 76 | } 77 | 78 | fn is_removed(&self) -> Result { 
79 | let binding = self.0.borrow(); 80 | 81 | match binding.text_chunk.get() { 82 | Ok(tc) => Ok(tc.removed()), 83 | Err(_) => Err(Error::new( 84 | exception::runtime_error(), 85 | "`is_removed` is not available", 86 | )), 87 | } 88 | } 89 | 90 | fn before(&self, args: &[Value]) -> Result { 91 | let mut binding = self.0.borrow_mut(); 92 | let text_chunk = binding.text_chunk.get_mut().unwrap(); 93 | 94 | let (text_str, content_type) = match crate::scan_text_args(args) { 95 | Ok((text_str, content_type)) => (text_str, content_type), 96 | Err(err) => return Err(err), 97 | }; 98 | 99 | text_chunk.before(&text_str, content_type); 100 | 101 | Ok(text_chunk.as_str().to_string()) 102 | } 103 | 104 | fn after(&self, args: &[Value]) -> Result { 105 | let mut binding = self.0.borrow_mut(); 106 | let text_chunk = binding.text_chunk.get_mut().unwrap(); 107 | 108 | let (text_str, content_type) = match crate::scan_text_args(args) { 109 | Ok((text_str, content_type)) => (text_str, content_type), 110 | Err(err) => return Err(err), 111 | }; 112 | 113 | text_chunk.after(&text_str, content_type); 114 | 115 | Ok(text_chunk.as_str().to_string()) 116 | } 117 | 118 | fn replace(&self, args: &[Value]) -> Result { 119 | let mut binding = self.0.borrow_mut(); 120 | let mut buffer = String::new(); 121 | 122 | clone_buffer_if_not_empty!(binding, buffer); 123 | 124 | let text_chunk = binding.text_chunk.get_mut().unwrap(); 125 | 126 | set_text_chunk_to_buffer!(text_chunk, buffer); 127 | 128 | let (text_str, content_type) = match crate::scan_text_args(args) { 129 | Ok((text_str, content_type)) => (text_str, content_type), 130 | Err(err) => return Err(err), 131 | }; 132 | text_chunk.replace(&text_str, content_type); 133 | 134 | text_chunk.set_str(text_str.clone()); 135 | 136 | binding.buffer = text_chunk.as_str().to_string(); 137 | 138 | Ok(text_str) 139 | } 140 | } 141 | 142 | pub fn init(c_html: RClass) -> Result<(), Error> { 143 | let c_text_chunk = c_html 144 | .define_class("TextChunk", 
magnus::class::object()) 145 | .expect("cannot define class Selma::HTML::TextChunk"); 146 | 147 | c_text_chunk.define_method("to_s", method!(SelmaHTMLTextChunk::to_s, 0))?; 148 | c_text_chunk.define_method("content", method!(SelmaHTMLTextChunk::to_s, 0))?; 149 | c_text_chunk.define_method("text_type", method!(SelmaHTMLTextChunk::text_type, 0))?; 150 | c_text_chunk.define_method("before", method!(SelmaHTMLTextChunk::before, -1))?; 151 | c_text_chunk.define_method("after", method!(SelmaHTMLTextChunk::after, -1))?; 152 | c_text_chunk.define_method("replace", method!(SelmaHTMLTextChunk::replace, -1))?; 153 | c_text_chunk.define_method("removed?", method!(SelmaHTMLTextChunk::is_removed, 0))?; 154 | 155 | Ok(()) 156 | } 157 | -------------------------------------------------------------------------------- /ext/selma/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate core; 2 | 3 | use lol_html::html_content::ContentType; 4 | use magnus::{define_module, exception, scan_args, Error, Symbol, Value}; 5 | 6 | pub mod html; 7 | pub mod native_ref_wrap; 8 | pub mod rewriter; 9 | pub mod sanitizer; 10 | pub mod selector; 11 | pub mod tags; 12 | 13 | #[allow(clippy::let_unit_value)] 14 | fn scan_text_args(args: &[Value]) -> Result<(String, ContentType), magnus::Error> { 15 | let args = scan_args::scan_args(args)?; 16 | let (text,): (String,) = args.required; 17 | let _: () = args.optional; 18 | let _: () = args.splat; 19 | let _: () = args.trailing; 20 | let _: () = args.block; 21 | 22 | let kwargs = scan_args::get_kwargs::<_, (Symbol,), (), ()>(args.keywords, &["as"], &[])?; 23 | let as_sym = kwargs.required.0; 24 | let as_sym_str = as_sym.name().unwrap(); 25 | let content_type = if as_sym_str == "text" { 26 | ContentType::Text 27 | } else if as_sym_str == "html" { 28 | ContentType::Html 29 | } else { 30 | return Err(Error::new( 31 | exception::runtime_error(), 32 | format!("unknown symbol `{as_sym_str:?}`"), 33 | )); 34 
| }; 35 | 36 | Ok((text, content_type)) 37 | } 38 | 39 | #[magnus::init] 40 | fn init() -> Result<(), Error> { 41 | let m_selma = define_module("Selma").expect("cannot define ::Selma module"); 42 | 43 | sanitizer::init(m_selma).expect("cannot define Selma::Sanitizer class"); 44 | rewriter::init(m_selma).expect("cannot define Selma::Rewriter class"); 45 | html::init(m_selma).expect("cannot define Selma::HTML class"); 46 | selector::init(m_selma).expect("cannot define Selma::Selector class"); 47 | 48 | Ok(()) 49 | } 50 | -------------------------------------------------------------------------------- /ext/selma/src/native_ref_wrap.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | marker::PhantomData, 3 | sync::{Arc, Mutex}, 4 | }; 5 | 6 | // NOTE: this was inspired from 7 | // https://github.com/worker-tools/html-rewriter-wasm/blob/92bafdfa34c809c37036f57cb282184cada3bbc9/src/handlers.rs 8 | 9 | pub struct Anchor<'r> { 10 | poisoned: Arc>, 11 | lifetime: PhantomData<&'r mut ()>, 12 | } 13 | 14 | impl<'r> Anchor<'r> { 15 | pub fn new(poisoned: Arc>) -> Self { 16 | Anchor { 17 | poisoned, 18 | lifetime: PhantomData, 19 | } 20 | } 21 | } 22 | 23 | impl Drop for Anchor<'_> { 24 | fn drop(&mut self) { 25 | *self.poisoned.lock().unwrap() = true; 26 | } 27 | } 28 | 29 | // NOTE: So far as I understand it, there's no great way to work between lol_html's lifetimes and FFI. 30 | // To work around that, we create a wrapper that erases all the lifetime information from the inner reference 31 | // and provides an anchor object that keeps track of the lifetime in the runtime. 32 | // 33 | // When anchor goes out of scope, wrapper becomes poisoned and any attempt to get inner 34 | // object results in exception. 
35 | #[derive(Clone)] 36 | pub struct NativeRefWrap { 37 | inner_ptr: *mut R, 38 | poisoned: Arc>, 39 | } 40 | 41 | impl NativeRefWrap { 42 | pub fn wrap(inner: &mut I) -> (Self, Anchor) { 43 | let wrap = NativeRefWrap { 44 | inner_ptr: inner as *mut I as *mut R, 45 | poisoned: Arc::new(Mutex::new(false)), 46 | }; 47 | 48 | let anchor = Anchor::new(Arc::clone(&wrap.poisoned)); 49 | 50 | (wrap, anchor) 51 | } 52 | 53 | fn assert_not_poisoned(&self) -> Result<(), &'static str> { 54 | if self.is_poisoned() { 55 | Err("The object has been freed and can't be used anymore.") 56 | } else { 57 | Ok(()) 58 | } 59 | } 60 | 61 | pub fn is_poisoned(&self) -> bool { 62 | *self.poisoned.lock().unwrap() 63 | } 64 | 65 | pub fn get(&self) -> Result<&R, &'static str> { 66 | self.assert_not_poisoned()?; 67 | 68 | Ok(unsafe { self.inner_ptr.as_ref() }.unwrap()) 69 | } 70 | 71 | pub fn get_mut(&mut self) -> Result<&mut R, &'static str> { 72 | self.assert_not_poisoned()?; 73 | 74 | Ok(unsafe { self.inner_ptr.as_mut() }.unwrap()) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /ext/selma/src/selector.rs: -------------------------------------------------------------------------------- 1 | use magnus::{exception, function, scan_args, Error, Module, Object, RModule, Value}; 2 | 3 | #[derive(Clone, Debug)] 4 | #[magnus::wrap(class = "Selma::Selector")] 5 | pub struct SelmaSelector { 6 | match_element: Option, 7 | match_text_within: Option, 8 | ignore_text_within: Option>, 9 | } 10 | 11 | type SelectorMatches = (Option, Option, Option>); 12 | 13 | impl SelmaSelector { 14 | fn new(args: &[Value]) -> Result { 15 | let (match_element, match_text_within, rb_ignore_text_within) = 16 | Self::scan_parse_args(args)?; 17 | 18 | if match_element.is_none() && match_text_within.is_none() { 19 | return Err(Error::new( 20 | exception::arg_error(), 21 | "Neither `match_element` nor `match_text_within` option given", 22 | )); 23 | } 24 | 25 | // FIXME: not 
excited about this double parse work (`element!` does it too), 26 | // but at least we can bail ASAP if the CSS is invalid 27 | if match_element.is_some() { 28 | let css = match_element.as_ref().unwrap(); 29 | if css.parse::().is_err() { 30 | return Err(Error::new( 31 | exception::arg_error(), 32 | format!("Could not parse `match_element` (`{css:?}`) as valid CSS"), 33 | )); 34 | } 35 | } 36 | 37 | if match_text_within.is_some() { 38 | let css = match_text_within.as_ref().unwrap(); 39 | if css.parse::().is_err() { 40 | return Err(Error::new( 41 | exception::arg_error(), 42 | format!("Could not parse `match_text_within` (`{css:?}`) as valid CSS",), 43 | )); 44 | } 45 | } 46 | 47 | let ignore_text_within = match rb_ignore_text_within { 48 | None => None, 49 | Some(rb_ignore_text_within) => { 50 | let mut ignore_text_within = vec![]; 51 | rb_ignore_text_within.iter().for_each(|i| { 52 | // TODO: test this against malice 53 | let ignore_text_within_tag_name = i.to_string(); 54 | ignore_text_within.push(ignore_text_within_tag_name); 55 | }); 56 | Some(ignore_text_within) 57 | } 58 | }; 59 | 60 | Ok(Self { 61 | match_element, 62 | match_text_within, 63 | ignore_text_within, 64 | }) 65 | } 66 | 67 | #[allow(clippy::let_unit_value)] 68 | fn scan_parse_args(args: &[Value]) -> Result { 69 | let args = scan_args::scan_args(args)?; 70 | let _: () = args.required; 71 | let _: () = args.optional; 72 | let _: () = args.splat; 73 | let _: () = args.trailing; 74 | let _: () = args.block; 75 | 76 | let kw = scan_args::get_kwargs::< 77 | _, 78 | (), 79 | (Option, Option, Option>), 80 | (), 81 | >( 82 | args.keywords, 83 | &[], 84 | &["match_element", "match_text_within", "ignore_text_within"], 85 | )?; 86 | let (match_element, match_text_within, rb_ignore_text_within) = kw.optional; 87 | 88 | Ok((match_element, match_text_within, rb_ignore_text_within)) 89 | } 90 | 91 | pub fn match_element(&self) -> Option { 92 | self.match_element.clone() 93 | } 94 | 95 | pub fn 
match_text_within(&self) -> Option { 96 | self.match_text_within.clone() 97 | } 98 | 99 | pub fn ignore_text_within(&self) -> Option> { 100 | self.ignore_text_within.clone() 101 | } 102 | } 103 | 104 | pub fn init(m_selma: RModule) -> Result<(), Error> { 105 | let c_selector = m_selma 106 | .define_class("Selector", magnus::class::object()) 107 | .expect("cannot define class Selma::Selector"); 108 | 109 | c_selector.define_singleton_method("new", function!(SelmaSelector::new, -1))?; 110 | 111 | Ok(()) 112 | } 113 | -------------------------------------------------------------------------------- /lib/selma.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | if ENV.fetch("DEBUG", false) 4 | require "amazing_print" 5 | require "debug" 6 | end 7 | 8 | require_relative "selma/extension" 9 | 10 | require_relative "selma/sanitizer" 11 | require_relative "selma/html" 12 | require_relative "selma/rewriter" 13 | require_relative "selma/selector" 14 | -------------------------------------------------------------------------------- /lib/selma/config.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | module Config 5 | OPTIONS = { 6 | memory: { 7 | max_allowed_memory_usage: nil, 8 | preallocated_parsing_buffer_size: nil, 9 | }, 10 | } 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /lib/selma/extension.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | begin 4 | # native precompiled gems package shared libraries in /lib/selma/ 5 | # load the precompiled extension file 6 | ruby_version = /\d+\.\d+/.match(RUBY_VERSION) 7 | require_relative "#{ruby_version}/selma" 8 | rescue LoadError 9 | # fall back to the extension compiled upon installation. 
10 | # use "require" instead of "require_relative" because non-native gems will place C extension files 11 | # in Gem::BasicSpecification#extension_dir after compilation (during normal installation), which 12 | # is in $LOAD_PATH but not necessarily relative to this file (see nokogiri#2300) 13 | require "selma/selma" 14 | end 15 | -------------------------------------------------------------------------------- /lib/selma/html.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require_relative "html/element" 4 | 5 | module Selma 6 | class HTML 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /lib/selma/html/element.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class HTML 5 | class Element 6 | def available? 7 | !removed? 8 | end 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/selma/rewriter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Rewriter 5 | end 6 | end 7 | -------------------------------------------------------------------------------- /lib/selma/sanitizer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "selma/sanitizer/config" 4 | 5 | module Selma 6 | class Sanitizer 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /lib/selma/sanitizer/config.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "set" 4 | 5 | module Selma 6 | class Sanitizer 7 | module Config 8 | class << self 9 | # Deeply freezes and returns the given configuration Hash. 
10 | def freeze_config(config) 11 | case config 12 | when Hash 13 | config.each_value { |c| freeze_config(c) } 14 | when Array, Set 15 | config.each { |c| freeze_config(c) } 16 | end 17 | 18 | config.freeze 19 | end 20 | 21 | # Returns a new Hash containing the result of deeply merging *other_config* 22 | # into *config*. Does not modify *config* or *other_config*. 23 | # 24 | # This is the safest way to use a built-in config as the basis for 25 | # your own custom config. 26 | def merge(config, other_config = {}) 27 | raise ArgumentError, "config must be a Hash" unless config.is_a?(Hash) 28 | raise ArgumentError, "other_config must be a Hash" unless other_config.is_a?(Hash) 29 | 30 | merged = {} 31 | keys = Set.new(config.keys + other_config.keys).to_a 32 | 33 | keys.each do |key| 34 | oldval = config[key] 35 | 36 | if other_config.key?(key) 37 | newval = other_config[key] 38 | 39 | merged[key] = if oldval.is_a?(Hash) && newval.is_a?(Hash) 40 | oldval.empty? ? newval.dup : merge(oldval, newval) 41 | elsif newval.is_a?(Array) && key != :transformers 42 | Set.new(newval).to_a 43 | else 44 | can_dupe?(newval) ? newval.dup : newval 45 | end 46 | else 47 | merged[key] = can_dupe?(oldval) ? oldval.dup : oldval 48 | end 49 | end 50 | 51 | merged 52 | end 53 | 54 | # Returns `true` if `dup` may be safely called on _value_, `false` 55 | # otherwise. 56 | def can_dupe?(value) 57 | !(value == true || value == false || value.nil? 
|| value.is_a?(Method) || value.is_a?(Numeric) || value.is_a?(Symbol)) 58 | end 59 | end 60 | end 61 | end 62 | end 63 | 64 | require "selma/sanitizer/config/basic" 65 | require "selma/sanitizer/config/default" 66 | require "selma/sanitizer/config/relaxed" 67 | require "selma/sanitizer/config/restricted" 68 | -------------------------------------------------------------------------------- /lib/selma/sanitizer/config/basic.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Sanitizer 5 | module Config 6 | BASIC = freeze_config( 7 | elements: [ 8 | "a", 9 | "abbr", 10 | "blockquote", 11 | "b", 12 | "br", 13 | "cite", 14 | "code", 15 | "dd", 16 | "dfn", 17 | "dl", 18 | "dt", 19 | "em", 20 | "i", 21 | "kbd", 22 | "li", 23 | "mark", 24 | "ol", 25 | "p", 26 | "pre", 27 | "q", 28 | "s", 29 | "samp", 30 | "small", 31 | "strike", 32 | "strong", 33 | "sub", 34 | "sup", 35 | "time", 36 | "u", 37 | "ul", 38 | "var", 39 | ], 40 | 41 | attributes: { 42 | "a" => ["href"], 43 | "abbr" => ["title"], 44 | "blockquote" => ["cite"], 45 | "dfn" => ["title"], 46 | "q" => ["cite"], 47 | "time" => ["datetime", "pubdate"], 48 | }, 49 | 50 | protocols: { 51 | "a" => { "href" => ["ftp", "http", "https", "mailto", :relative] }, 52 | "blockquote" => { "cite" => ["http", "https", :relative] }, 53 | "q" => { "cite" => ["http", "https", :relative] }, 54 | }, 55 | ) 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /lib/selma/sanitizer/config/default.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Sanitizer 5 | module Config 6 | # although there are many more protocol types, eg., ftp, xmpp, etc., 7 | # these are the only ones that are allowed by default 8 | VALID_PROTOCOLS = ["http", "https", "mailto", :relative] 9 | 10 | DEFAULT = 
freeze_config( 11 | # Whether or not to allow HTML comments. Allowing comments is strongly 12 | # discouraged, since IE allows script execution within conditional 13 | # comments. 14 | allow_comments: false, 15 | 16 | # Whether or not to allow well-formed HTML doctype declarations such as 17 | # "<!DOCTYPE html>" when sanitizing a document. 18 | allow_doctype: false, 19 | 20 | # HTML attributes to allow in specific elements. By default, no attributes 21 | # are allowed. Use the symbol :data to indicate that arbitrary HTML5 22 | # data-* attributes should be allowed. 23 | attributes: {}, 24 | 25 | # HTML elements to allow. By default, no elements are allowed (which means 26 | # that all HTML will be stripped). 27 | elements: [], 28 | 29 | # URL handling protocols to allow in specific attributes. By default, no 30 | # protocols are allowed. Use :relative in place of a protocol if you want 31 | # to allow relative URLs sans protocol. Set to `:all` to allow any protocol. 32 | protocols: {}, 33 | 34 | # An Array of element names whose contents will be removed. The contents 35 | # of all other filtered elements will be left behind. 36 | remove_contents: [ 37 | "iframe", 38 | "math", 39 | "noembed", 40 | "noframes", 41 | "noscript", 42 | "plaintext", 43 | "script", 44 | "style", 45 | "svg", 46 | "xmp", 47 | ], 48 | 49 | # Elements which, when removed, should have their contents surrounded by 50 | # whitespace. 
51 | whitespace_elements: [ 52 | "address", 53 | "article", 54 | "aside", 55 | "blockquote", 56 | "br", 57 | "dd", 58 | "div", 59 | "dl", 60 | "dt", 61 | "footer", 62 | "h1", 63 | "h2", 64 | "h3", 65 | "h4", 66 | "h5", 67 | "h6", 68 | "header", 69 | "hgroup", 70 | "hr", 71 | "li", 72 | "nav", 73 | "ol", 74 | "p", 75 | "pre", 76 | "section", 77 | "ul", 78 | ], 79 | ) 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /lib/selma/sanitizer/config/relaxed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Sanitizer 5 | module Config 6 | RELAXED = freeze_config( 7 | elements: BASIC[:elements] + [ 8 | "address", 9 | "article", 10 | "aside", 11 | "bdi", 12 | "bdo", 13 | "body", 14 | "caption", 15 | "col", 16 | "colgroup", 17 | "data", 18 | "del", 19 | "details", 20 | "div", 21 | "figcaption", 22 | "figure", 23 | "footer", 24 | "h1", 25 | "h2", 26 | "h3", 27 | "h4", 28 | "h5", 29 | "h6", 30 | "head", 31 | "header", 32 | "hgroup", 33 | "hr", 34 | "html", 35 | "img", 36 | "ins", 37 | "main", 38 | "nav", 39 | "rp", 40 | "rt", 41 | "ruby", 42 | "section", 43 | "span", 44 | "style", 45 | "summary", 46 | "sup", 47 | "table", 48 | "tbody", 49 | "td", 50 | "tfoot", 51 | "th", 52 | "thead", 53 | "title", 54 | "tr", 55 | "wbr", 56 | ], 57 | 58 | allow_doctype: true, 59 | 60 | attributes: merge( 61 | BASIC[:attributes], 62 | :all => ["class", "dir", "hidden", "id", "lang", "style", "tabindex", "title", "translate"], 63 | "a" => ["href", "hreflang", "name", "rel"], 64 | "col" => ["span", "width"], 65 | "colgroup" => ["span", "width"], 66 | "data" => ["value"], 67 | "del" => ["cite", "datetime"], 68 | "img" => ["align", "alt", "border", "height", "src", "srcset", "width"], 69 | "ins" => ["cite", "datetime"], 70 | "li" => ["value"], 71 | "ol" => ["reversed", "start", "type"], 72 | "style" => ["media", "scoped", "type"], 73 | 
"table" => [ 74 | "align", 75 | "bgcolor", 76 | "border", 77 | "cellpadding", 78 | "cellspacing", 79 | "frame", 80 | "rules", 81 | "sortable", 82 | "summary", 83 | "width", 84 | ], 85 | "td" => ["abbr", "align", "axis", "colspan", "headers", "rowspan", "valign", "width"], 86 | "th" => ["abbr", "align", "axis", "colspan", "headers", "rowspan", "scope", "sorted", "valign", "width"], 87 | "ul" => ["type"], 88 | ), 89 | 90 | protocols: merge( 91 | BASIC[:protocols], 92 | "del" => { "cite" => ["http", "https", :relative] }, 93 | "img" => { "src" => ["http", "https", :relative] }, 94 | "ins" => { "cite" => ["http", "https", :relative] }, 95 | ), 96 | ) 97 | end 98 | end 99 | end 100 | -------------------------------------------------------------------------------- /lib/selma/sanitizer/config/restricted.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Sanitizer 5 | module Config 6 | RESTRICTED = freeze_config( 7 | elements: ["b", "em", "i", "strong", "u"], 8 | 9 | whitespace_elements: DEFAULT[:whitespace_elements], 10 | ) 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /lib/selma/selector.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | class Selector 5 | end 6 | end 7 | -------------------------------------------------------------------------------- /lib/selma/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Selma 4 | VERSION = "0.4.12" 5 | end 6 | -------------------------------------------------------------------------------- /rakelib/benchmark.rake: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | desc "Run benchmarks" 4 | task :benchmark do 5 | if 
ENV["FETCH_PROGIT"] 6 | %x(rm -rf test/progit) 7 | %x(git clone https://github.com/progit/progit.git test/progit) 8 | langs = [ 9 | "ar", 10 | "az", 11 | "be", 12 | "ca", 13 | "cs", 14 | "de", 15 | "en", 16 | "eo", 17 | "es", 18 | "es-ni", 19 | "fa", 20 | "fi", 21 | "fr", 22 | "hi", 23 | "hu", 24 | "id", 25 | "it", 26 | "ja", 27 | "ko", 28 | "mk", 29 | "nl", 30 | "no-nb", 31 | "pl", 32 | "pt-br", 33 | "ro", 34 | "ru", 35 | "sr", 36 | "th", 37 | "tr", 38 | "uk", 39 | "vi", 40 | "zh", 41 | "zh-tw", 42 | ] 43 | langs.each do |lang| 44 | %x(cat test/progit/#{lang}/*/*.markdown >> test/benchmark/rewrite_benchmark_input.md) 45 | end 46 | end 47 | $LOAD_PATH.unshift("lib") 48 | load "test/benchmark.rb" 49 | end 50 | -------------------------------------------------------------------------------- /rakelib/compile.rake: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | desc "Compile the extension with debug symbols" 4 | task "compile:debug" do 5 | ENV["RB_SYS_CARGO_PROFILE"] = "dev" 6 | Rake::Task["compile"].invoke 7 | end 8 | -------------------------------------------------------------------------------- /rakelib/extension.rake: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "rb_sys/extensiontask" 4 | 5 | RbSys::ExtensionTask.new("selma", SELMA_SPEC) do |ext| 6 | ext.lib_dir = File.join("lib", "selma") 7 | end 8 | 9 | desc "Build native extension for a given platform (i.e. `rake 'native[x86_64-linux]'`)" 10 | task :native, [:platform] do |_t, platform:| 11 | sh "bundle", "exec", "rb-sys-dock", "--platform", platform, "--build" 12 | end 13 | 14 | desc "Compile and build native extension for a given platform (i.e. 
`rake 'native[x86_64-linux]'`)" 15 | task build: :compile 16 | -------------------------------------------------------------------------------- /rakelib/lint.rake: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | begin 4 | require "rubocop/rake_task" 5 | 6 | RuboCop::RakeTask.new(:rubocop) 7 | rescue LoadError => e 8 | warn("WARNING: rubocop is not available in this environment: #{e}") 9 | end 10 | -------------------------------------------------------------------------------- /rakelib/test.rake: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "rake/testtask" 4 | 5 | require "rake/testtask" 6 | require "ruby_memcheck" 7 | 8 | class ValgrindTestTask < Rake::TestTask 9 | DEFAULT_DIRECTORY_NAME = "suppressions" 10 | ERROR_EXITCODE = 42 # the answer to life, the universe, and segfaulting. 11 | VALGRIND_OPTIONS = [ 12 | "--num-callers=50", 13 | "--error-limit=no", 14 | "--partial-loads-ok=yes", 15 | "--undef-value-errors=no", 16 | "--error-exitcode=#{ERROR_EXITCODE}", 17 | "--gen-suppressions=all", 18 | ].freeze 19 | 20 | RubyMemcheck.config( 21 | binary_name: "selma", 22 | valgrind_generate_suppressions: true, 23 | ) 24 | 25 | def ruby(*args, **options, &block) 26 | valgrind_options = check_for_suppression_file(VALGRIND_OPTIONS) 27 | command = "valgrind #{valgrind_options.join(" ")} #{RUBY} #{args.join(" ")}" 28 | sh(command, **options, &block) 29 | end 30 | 31 | def formatted_ruby_version 32 | engine = if defined?(RUBY_DESCRIPTION) && RUBY_DESCRIPTION.include?("Ruby Enterprise Edition") 33 | "ree" 34 | else 35 | defined?(RUBY_ENGINE) ? 
RUBY_ENGINE : "ruby" 36 | end 37 | %(#{engine}-#{RUBY_VERSION}.#{RUBY_PATCHLEVEL}) 38 | end 39 | 40 | def check_for_suppression_file(options) 41 | options = options.dup 42 | suppression_files = matching_suppression_files 43 | suppression_files.each do |suppression_file| 44 | puts "NOTICE: using valgrind suppressions in #{suppression_file.inspect}" 45 | options << "--suppressions=#{suppression_file}" 46 | end 47 | options 48 | end 49 | 50 | def matching_suppression_files 51 | matching_files = [] 52 | version_matches.each do |version_string| 53 | matching_files += Dir[File.join(DEFAULT_DIRECTORY_NAME, "selma_#{version_string}.supp")] 54 | matching_files += Dir[File.join(DEFAULT_DIRECTORY_NAME, "selma_#{version_string}_*.supp")] 55 | end 56 | matching_files 57 | end 58 | 59 | def version_matches 60 | matches = [formatted_ruby_version] # e.g. "ruby-2.5.1.57" 61 | matches << formatted_ruby_version.split(".")[0, 3].join(".") # e.g. "ruby-2.5.1" 62 | matches << formatted_ruby_version.split(".")[0, 2].join(".") # e.g. "ruby-2.5" 63 | matches << formatted_ruby_version.split(".")[0, 1].join(".") # e.g. "ruby-2" 64 | matches << formatted_ruby_version.split("-").first # e.g. 
"ruby" 65 | matches 66 | end 67 | end 68 | 69 | def selma_test_task_configuration(test_config) 70 | test_config.libs << "test" 71 | test_config.verbose = true 72 | test_config.options = "-v" if ENV["CI"] 73 | end 74 | 75 | def selma_test_case_configuration(test_config) 76 | selma_test_task_configuration(test_config) 77 | test_config.test_files = FileList["test/**/*_test.rb"] 78 | end 79 | 80 | def selma_test_bench_configuration(test_config) 81 | selma_test_task_configuration(test_config) 82 | test_config.test_files = FileList["test/**/benchmark.rb"] 83 | end 84 | 85 | Rake::TestTask.new do |t| 86 | selma_test_case_configuration(t) 87 | end 88 | 89 | namespace "test" do 90 | ValgrindTestTask.new("valgrind") do |test_config| 91 | selma_test_case_configuration(test_config) 92 | end 93 | 94 | RubyMemcheck::TestTask.new("memcheck") do |test_config| 95 | selma_test_task_configuration(test_config) 96 | test_config.test_files = FileList["test/selma_rewriter_text_test.rb"] 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /script/bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | echo "==> Installing gem dependencies…" 6 | 7 | bundle install 8 | -------------------------------------------------------------------------------- /script/docker_build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sha=$(git rev-parse HEAD | cut -c 1-8) 4 | date=$(date +%x_%H:%M:%S) 5 | 6 | docker build -t gjtorikian/selma:latest --build-arg CACHEBUST=$(date +%s) -f test/memcheck/Dockerfile . 
7 | 8 | docker rmi $(docker images -qa -f 'dangling=true') --force 9 | -------------------------------------------------------------------------------- /script/valgrind: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script/docker_build 4 | 5 | docker run gjtorikian/selma:latest 6 | -------------------------------------------------------------------------------- /selma.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | lib = File.expand_path("lib", __dir__) 4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 5 | require "selma/version" 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = "selma" 9 | spec.version = Selma::VERSION 10 | spec.authors = ["Garen J. Torikian"] 11 | spec.email = ["gjtorikian@gmail.com"] 12 | 13 | spec.summary = "Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html parser." 14 | spec.license = "MIT" 15 | 16 | spec.required_ruby_version = "~> 3.1" 17 | spec.required_rubygems_version = "~> 3.4" 18 | 19 | spec.files = ["LICENSE.txt", "README.md", "Cargo.lock", "Cargo.toml"] 20 | spec.files += Dir.glob("lib/**/*.rb") 21 | spec.files += Dir.glob("ext/**/*.{rs,toml,lock,rb}") 22 | spec.bindir = "exe" 23 | spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) } 24 | 25 | spec.require_paths = ["lib"] 26 | spec.extensions = ["ext/selma/extconf.rb"] 27 | 28 | spec.metadata = { 29 | "allowed_push_host" => "https://rubygems.org", 30 | "funding_uri" => "https://github.com/sponsors/gjtorikian/", 31 | "source_code_uri" => "https://github.com/gjtorikian/selma", 32 | "rubygems_mfa_required" => "true", 33 | } 34 | 35 | spec.add_dependency("rb_sys", "~> 0.9") 36 | 37 | spec.add_development_dependency("rake", "~> 13.0") 38 | spec.add_development_dependency("rake-compiler", "~> 1.2") 39 | end 40 | 
-------------------------------------------------------------------------------- /test/benchmark.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "benchmark/ips" 4 | require "selma" 5 | require_relative "benchmark/selma_config" 6 | 7 | require "sanitize" 8 | require "nokogiri" 9 | require "nokolexbor" 10 | 11 | DIR = File.expand_path(File.dirname(__FILE__)) 12 | 13 | DOCUMENT_SMALL = File.read("#{DIR}/benchmark/html/document-sm.html").encode("UTF-8", invalid: :replace, undef: :replace) 14 | DOCUMENT_MEDIUM = File.read("#{DIR}/benchmark/html/document-md.html").encode("UTF-8", invalid: :replace, undef: :replace) 15 | DOCUMENT_HUGE = File.read("#{DIR}/benchmark/html/document-lg.html").encode("UTF-8", invalid: :replace, undef: :replace) 16 | 17 | DOCUMENTS = [ 18 | [DOCUMENT_SMALL, "sm"], 19 | [DOCUMENT_MEDIUM, "md"], 20 | [DOCUMENT_HUGE, "lg"], 21 | ] 22 | 23 | IPS_ARGS = { time: 30, warmup: 10 } 24 | 25 | def bytes_to_megabytes(bytes) 26 | (bytes.to_f / 1_000_000).round(2) 27 | end 28 | 29 | def print_size(html) 30 | bytes = html.bytesize 31 | mbes = bytes_to_megabytes(bytes) 32 | puts("input size = #{bytes} bytes, #{mbes} MB\n\n") 33 | end 34 | 35 | def compare_sanitize 36 | DOCUMENTS.each do |(html, label)| 37 | print_size(html) 38 | Benchmark.ips do |x| 39 | x.config(IPS_ARGS) 40 | 41 | x.report("sanitize-#{label}") do 42 | Sanitize.document(html, Sanitize::Config::RELAXED) 43 | end 44 | 45 | x.report("selma-#{label}") do 46 | sanitizer = Selma::Sanitizer.new(Selma::Sanitizer::Config::RELAXED) 47 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite(html) 48 | end 49 | 50 | x.compare! 51 | end 52 | end 53 | end 54 | 55 | def compare_rewriting 56 | nokogiri_compat = ->(doc) do 57 | doc.css(%(a[href])).each do |node| 58 | node["href"] = node["href"].sub(/^https?:/, "gopher:") 59 | end 60 | 61 | doc.css("span").each do |node| 62 | node.parent.add_child("
#{node.text}
") 63 | end 64 | 65 | doc.css("img").each(&:remove) 66 | 67 | doc.to_html 68 | end 69 | 70 | DOCUMENTS.each do |(html, label)| 71 | print_size(html) 72 | Benchmark.ips do |x| 73 | x.config(IPS_ARGS) 74 | 75 | x.report("nokogiri-#{label}") do 76 | doc = Nokogiri::HTML.parse(html) 77 | 78 | nokogiri_compat.call(doc) 79 | end 80 | 81 | x.report("nokolexbor-#{label}") do 82 | doc = Nokolexbor::HTML(html) 83 | 84 | nokogiri_compat.call(doc) 85 | end 86 | 87 | x.report("selma-#{label}") do 88 | Selma::Rewriter.new(sanitizer: nil, handlers: [ 89 | SelmaConfig::HrefHandler.new, 90 | SelmaConfig::SpanHandler.new, 91 | SelmaConfig::ImgHandler.new, 92 | ]).rewrite(html) 93 | end 94 | 95 | x.compare! 96 | end 97 | end 98 | end 99 | 100 | puts "Compare sanitize" 101 | compare_sanitize 102 | 103 | puts "Compare rewriting" 104 | compare_rewriting 105 | -------------------------------------------------------------------------------- /test/benchmark/html/fragment-small.html: -------------------------------------------------------------------------------- 1 |

Sanitize is a whitelist-based HTML sanitizer. Given a list of acceptable 2 | elements and attributes, Sanitize will remove all unacceptable HTML from a 3 | string.

4 | 5 |

Using a simple configuration syntax, you can tell Sanitize to allow certain 6 | elements, certain attributes within those elements, and even certain URL 7 | protocols within attributes that contain URLs. Any HTML elements or attributes 8 | that you don't explicitly allow will be removed.

9 | 10 |
14 | -------------------------------------------------------------------------------- /test/benchmark/selma_config.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module SelmaConfig 4 | class HrefHandler 5 | SELECTOR = Selma::Selector.new(match_element: "href") 6 | 7 | def selector 8 | SELECTOR 9 | end 10 | 11 | def handle_element(element) 12 | element["href"] = element["href"].sub(/^https?:/, "gopher:") 13 | end 14 | end 15 | 16 | class SpanHandler 17 | SELECTOR = Selma::Selector.new(match_text_within: "span") 18 | 19 | def selector 20 | SELECTOR 21 | end 22 | 23 | def handle_text_chunk(text_chunk) 24 | text_chunk.after("
#{text_chunk}
", as: :html) unless text_chunk.to_s.strip.empty? 25 | end 26 | end 27 | 28 | class ImgHandler 29 | SELECTOR = Selma::Selector.new(match_element: "img") 30 | 31 | def selector 32 | SELECTOR 33 | end 34 | 35 | def handle_element(element) 36 | element.remove 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /test/memcheck/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM arm64v8/ruby:3.1-slim 2 | 3 | # Non-interactive frontend for debian stuff to reduce error noise 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | # Install basic essentials 7 | RUN apt -y update && \ 8 | apt -y install openssh-client apt-utils curl wget zip git make build-essential libclang-dev 9 | 10 | RUN apt-get install -y valgrind 11 | 12 | # clean apt cache 13 | RUN rm -rf /var/cache/apt/* 14 | 15 | ENV RUSTUP_HOME=/opt/rust \ 16 | CARGO_HOME=/opt/rust 17 | 18 | RUN ( curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path ) && \ 19 | find /opt/rust -exec chmod 777 {} + 20 | 21 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/cargo 22 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/cargo-clippy 23 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/cargo-fmt 24 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rls 25 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rust-gdb 26 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rust-lldb 27 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rustc 28 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rustdoc 29 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rustfmt 30 | COPY test/memcheck/tools/rust-wrapper.sh /usr/local/bin/rustup 31 | 32 | ENV APP_HOME /selma 33 | RUN mkdir $APP_HOME 34 | WORKDIR $APP_HOME 35 | 36 | COPY Gemfile* $APP_HOME/ 37 | COPY selma.gemspec $APP_HOME/ 38 | RUN mkdir -p $APP_HOME/lib/selma 39 | COPY lib/selma/version.rb 
$APP_HOME/lib/selma 40 | 41 | ENV BUNDLE_GEMFILE=$APP_HOME/Gemfile \ 42 | BUNDLE_JOBS=2 \ 43 | BUNDLE_PATH=/bundle 44 | 45 | RUN bundle install 46 | 47 | WORKDIR $APP_HOME 48 | COPY . $APP_HOME 49 | 50 | COPY test/memcheck/entrypoint.sh /usr/bin/ 51 | RUN chmod +x /usr/bin/entrypoint.sh 52 | 53 | ENTRYPOINT entrypoint.sh 54 | -------------------------------------------------------------------------------- /test/memcheck/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | export DEBUG=1 5 | 6 | bundle exec rake compile 7 | 8 | # LD_PRELOAD=$(gcc -print-file-name=libasan.so) bundle exec rake test:asan 9 | 10 | bundle exec rake test:memcheck 11 | -------------------------------------------------------------------------------- /test/memcheck/tools/rust-wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | RUSTUP_HOME=/opt/rust exec /opt/rust/bin/${0##*/} "$@" 4 | -------------------------------------------------------------------------------- /test/memcheck/tools/userhack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for id in `seq 100 199`; do 4 | groupadd -g $id group$id 5 | useradd -M -d /tmp -u $id user$id 6 | done 7 | 8 | exit 0 9 | -------------------------------------------------------------------------------- /test/selma_maliciousness_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | class SelmaMaliciousnessTest < Minitest::Test 6 | class NoSelector 7 | def initialize 8 | end 9 | 10 | def handle_element(element) 11 | element["class"] = "boldy" 12 | end 13 | end 14 | 15 | def test_that_it_does_hate_missing_selector 16 | frag = "Wow!" 
17 | assert_raises(NoMethodError) do 18 | Selma::Rewriter.new(sanitizer: nil, handlers: [NoSelector.new]).rewrite(frag) 19 | end 20 | end 21 | 22 | class NoHandleElement 23 | SELECTOR = Selma::Selector.new(match_element: "b") 24 | 25 | def selector 26 | SELECTOR 27 | end 28 | end 29 | 30 | def test_that_it_does_not_hate_missing_handle_element 31 | frag = "Wow!" 32 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [NoHandleElement.new]).rewrite(frag) 33 | 34 | assert_equal(frag, modified_doc) 35 | end 36 | 37 | class NoHandleText 38 | SELECTOR = Selma::Selector.new(match_text_within: "strong") 39 | 40 | def selector 41 | SELECTOR 42 | end 43 | end 44 | 45 | def test_that_it_does_hate_missing_match_text_within 46 | frag = "Wow!" 47 | assert_raises(RuntimeError) do 48 | Selma::Rewriter.new(sanitizer: nil, handlers: [NoHandleText.new]).rewrite(frag) 49 | end 50 | end 51 | 52 | def test_that_it_does_hate_nil_sanitizer_and_blank_handlers 53 | frag = "Wow!" 54 | assert_raises(ArgumentError) do 55 | Selma::Rewriter.new(sanitizer: nil, handlers: []).rewrite(frag) 56 | end 57 | end 58 | 59 | def test_that_it_raises_on_non_array_handlers 60 | frag = "Wow!" 61 | assert_raises(TypeError) do 62 | Selma::Rewriter.new(sanitizer: nil, handlers: 818).rewrite(frag) 63 | end 64 | end 65 | 66 | def test_that_it_raises_on_array_handler_with_wrong_type 67 | frag = "Wow!" 68 | assert_raises(NoMethodError) do 69 | Selma::Rewriter.new(sanitizer: nil, handlers: [562]).rewrite(frag) 70 | end 71 | end 72 | 73 | class WrongSelectorArgument 74 | def selector 75 | Selma::Selector.new(55) 76 | end 77 | end 78 | 79 | def test_that_it_raises_on_wrong_selector_arg 80 | frag = "Wow!" 
81 | assert_raises(TypeError) do 82 | Selma::Rewriter.new(sanitizer: nil, handlers: [WrongSelectorArgument.new]).rewrite(frag) 83 | end 84 | end 85 | 86 | class IncorrectSelectorType 87 | SELECTOR = Selma::Selector.new(match_element: "strong") 88 | 89 | def selector 90 | 3 91 | end 92 | end 93 | 94 | def test_that_it_raises_on_incorrect_selector_type 95 | frag = "Wow!" 96 | assert_raises(TypeError) do 97 | Selma::Rewriter.new(sanitizer: nil, handlers: [IncorrectSelectorType.new]).rewrite(frag) 98 | end 99 | end 100 | 101 | class IncorrectMatchType 102 | def selector 103 | Selma::Selector.new(match_element: 42) 104 | end 105 | end 106 | 107 | def test_that_it_raises_on_incorrect_match_type 108 | frag = "Wow!" 109 | assert_raises(TypeError) do 110 | Selma::Rewriter.new(sanitizer: nil, handlers: [IncorrectMatchType.new]).rewrite(frag) 111 | end 112 | end 113 | 114 | class IncorrectTextType 115 | def selector 116 | Selma::Selector.new(match_text_within: 42) 117 | end 118 | end 119 | 120 | def test_that_it_raises_on_incorrect_text_type 121 | frag = "Wow!" 122 | assert_raises(TypeError) do 123 | Selma::Rewriter.new(sanitizer: nil, handlers: [IncorrectTextType.new]).rewrite(frag) 124 | end 125 | end 126 | 127 | class NilOptions 128 | def selector 129 | Selma::Selector.new(match_element: nil, match_text_within: nil) 130 | end 131 | end 132 | 133 | def test_that_it_raises_on_both_options_being_nil 134 | frag = "Wow!" 
135 | assert_raises(NoMethodError) do 136 | Selma::Rewriter.new(sanitizer: nil, handlers: [NilOptions]).rewrite(frag) 137 | end 138 | end 139 | 140 | class GarbageTextOptions 141 | def selector 142 | Selma::Selector.new(match_text_within: "time") 143 | end 144 | 145 | def handle_text_chunk(text) 146 | text.replace(text.sub("Wow!", as: :boop)) 147 | end 148 | end 149 | 150 | def test_that_it_raises_on_handle_text_returning_non_string 151 | frag = "" 152 | assert_raises(RuntimeError) do 153 | Selma::Rewriter.new(sanitizer: nil, handlers: [GarbageTextOptions.new]).rewrite(frag) 154 | end 155 | end 156 | 157 | def test_sanitizer_expects_all_as_symbol 158 | assert_raises(ArgumentError) do 159 | Selma::Sanitizer.new({ 160 | elements: ["a"], 161 | attributes: { "a" => ["href"] }, 162 | protocols: { "a" => { "href" => [:all] } }, 163 | }) 164 | end 165 | end 166 | 167 | class ContentExtractor 168 | SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title") 169 | 170 | attr_reader :title, :meta 171 | 172 | def initialize 173 | super 174 | @title = "" 175 | @meta = {} 176 | @within_title = false 177 | end 178 | 179 | def selector 180 | SELECTOR 181 | end 182 | 183 | def handle_element(element) 184 | if element.tag_name == "pre" || 185 | element.tag_name == "code" || 186 | element.tag_name == "form" || 187 | element.tag_name == "style" || 188 | element.tag_name == "noscript" || 189 | element.tag_name == "script" || 190 | element.tag_name == "svg" 191 | element.remove 192 | elsif element.tag_name == "title" 193 | @within_title = true 194 | element.remove 195 | elsif element.tag_name == "meta" 196 | return if element.attributes["name"].nil? 
197 | 198 | @meta[element.attributes["name"]] = element.attributes["content"] 199 | else 200 | element.remove_and_keep_content 201 | end 202 | end 203 | 204 | def handle_text_chunk(text) 205 | if @within_title 206 | @within_title = false 207 | @title = text.to_s 208 | end 209 | end 210 | end 211 | 212 | def test_rewriter_does_not_halt_on_malformed_html 213 | html = load_fixture("docs.html") 214 | 215 | sanitizer_config = Selma::Sanitizer::Config::RELAXED.dup.merge({ 216 | allow_doctype: false, 217 | }) 218 | sanitizer = Selma::Sanitizer.new(sanitizer_config) 219 | 220 | Selma::Rewriter.new(sanitizer: sanitizer, handlers: [ContentExtractor.new]).rewrite(html) 221 | end 222 | 223 | class TagRemover 224 | SELECTOR = Selma::Selector.new(match_element: "*") 225 | 226 | def selector 227 | SELECTOR 228 | end 229 | 230 | UNNECESSARY_TAGS = [ 231 | "pre", 232 | ] 233 | 234 | CONTENT_TO_KEEP = [ 235 | "html", 236 | "body", 237 | ] 238 | 239 | def handle_element(element) 240 | if UNNECESSARY_TAGS.include?(element.tag_name) 241 | element.remove 242 | elsif CONTENT_TO_KEEP.include?(element.tag_name) 243 | element.remove_and_keep_content 244 | end 245 | end 246 | end 247 | 248 | class ContentBreaker 249 | SELECTOR = Selma::Selector.new(match_element: "*") 250 | 251 | def selector 252 | SELECTOR 253 | end 254 | 255 | def handle_element(element) 256 | if Selma::Sanitizer::Config::DEFAULT[:whitespace_elements].include?(element.tag_name) && !element.removed? 
257 | element.append("\n", as: :text) 258 | end 259 | element.remove_and_keep_content 260 | end 261 | end 262 | 263 | def test_deleted_content_does_not_segfault 264 | html = load_fixture("deleting_content.html") 265 | 266 | sanitizer_config = Selma::Sanitizer::Config::RELAXED.dup.merge({ 267 | allow_comments: false, 268 | allow_doctype: false, 269 | }) 270 | sanitizer = Selma::Sanitizer.new(sanitizer_config) 271 | 272 | selma = Selma::Rewriter.new(sanitizer: sanitizer, handlers: [TagRemover.new, ContentBreaker.new]) 273 | 1000.times do 274 | selma.rewrite(html) 275 | end 276 | end if ci? 277 | end 278 | -------------------------------------------------------------------------------- /test/selma_rewriter_match_attribute_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | class SelmaRewriterMatchAttributeTest < Minitest::Test 6 | class RemoveAttr 7 | SELECTOR = Selma::Selector.new(match_element: "a") 8 | 9 | def selector 10 | SELECTOR 11 | end 12 | 13 | def handle_element(element) 14 | element.remove_attribute("foo") 15 | end 16 | end 17 | 18 | def test_that_it_removes_attributes 19 | frag = "Wow!" 20 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [RemoveAttr.new]).rewrite(frag) 21 | 22 | assert_equal("Wow!", modified_doc) 23 | end 24 | 25 | class GetAttrs < Minitest::Test 26 | SELECTOR = Selma::Selector.new(match_element: "div") 27 | 28 | # rubocop:disable Lint/MissingSuper 29 | def initialize 30 | @assertions = 0 31 | end 32 | # rubocop:enable Lint/MissingSuper 33 | 34 | def selector 35 | SELECTOR 36 | end 37 | 38 | def handle_element(element) 39 | hash = { 40 | "class" => "a b c 1 2 3", 41 | "data-foo" => "baz", 42 | } 43 | 44 | assert_equal(hash, element.attributes) 45 | end 46 | end 47 | 48 | def test_that_it_gets_attributes 49 | frag = "
Wow!
" 50 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetAttrs.new]).rewrite(frag) 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /test/selma_rewriter_match_element_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | class SelmaRewriterMatchElementTest < Minitest::Test 6 | class Handler 7 | SELECTOR = Selma::Selector.new(match_element: "strong") 8 | 9 | def selector 10 | SELECTOR 11 | end 12 | 13 | def handle_element(element) 14 | element["class"] = "boldy" 15 | end 16 | end 17 | 18 | def test_that_it_works 19 | frag = "Wow!" 20 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [Handler.new]).rewrite(frag) 21 | 22 | assert_equal('Wow!', modified_doc) 23 | end 24 | 25 | def test_that_it_works_with_sanitizer 26 | config = { 27 | elements: ["strong"], 28 | } 29 | sanitizer = Selma::Sanitizer.new(config) 30 | frag = "Wow!" 31 | modified_doc = Selma::Rewriter.new(sanitizer: sanitizer, handlers: [Handler.new]).rewrite(frag) 32 | 33 | # note that sanitization does not affect rewriting 34 | assert_equal("Wow!", modified_doc) 35 | end 36 | 37 | class FirstRewrite 38 | SELECTOR = Selma::Selector.new(match_element: "div") 39 | 40 | def selector 41 | SELECTOR 42 | end 43 | 44 | def handle_element(element) 45 | element["class"] = "boldy" 46 | end 47 | end 48 | 49 | class SecondRewrite 50 | SELECTOR = Selma::Selector.new(match_element: "div") 51 | 52 | def selector 53 | SELECTOR 54 | end 55 | 56 | def handle_element(element) 57 | if element["class"] == "boldy" 58 | element["class"] += " boldy2" 59 | end 60 | end 61 | end 62 | 63 | def test_that_it_performs_handlers_in_order 64 | frag = "
Wow!
" 65 | modified_doc = Selma::Rewriter.new(sanitizer: @sanitizer, handlers: [FirstRewrite.new]).rewrite(frag) 66 | 67 | assert_equal('
Wow!
', modified_doc) 68 | 69 | modified_doc = Selma::Rewriter.new(sanitizer: @sanitizer, handlers: [SecondRewrite.new]).rewrite(frag) 70 | 71 | assert_equal(frag, modified_doc) 72 | 73 | modified_doc = Selma::Rewriter.new(sanitizer: @sanitizer, handlers: [FirstRewrite.new, SecondRewrite.new]).rewrite(frag) 74 | 75 | assert_equal('
Wow!
', modified_doc) 76 | end 77 | 78 | class GetAncestors < Minitest::Test 79 | SELECTOR = Selma::Selector.new(match_element: "strong") 80 | 81 | # rubocop:disable Lint/MissingSuper 82 | def initialize 83 | @assertions = 0 84 | end 85 | # rubocop:enable Lint/MissingSuper 86 | 87 | def selector 88 | SELECTOR 89 | end 90 | 91 | def handle_element(element) 92 | ancestors = ["div", "p", "foo"] 93 | 94 | assert_equal(ancestors, element.ancestors) 95 | end 96 | end 97 | 98 | def test_that_it_knows_ancestors 99 | frag = "

Wow!

" 100 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetAncestors.new]).rewrite(frag) 101 | end 102 | 103 | class GetEmptyAncestors < Minitest::Test 104 | SELECTOR = Selma::Selector.new(match_element: "strong") 105 | 106 | # rubocop:disable Lint/MissingSuper 107 | def initialize 108 | @assertions = 0 109 | end 110 | # rubocop:enable Lint/MissingSuper 111 | 112 | def selector 113 | SELECTOR 114 | end 115 | 116 | def handle_element(element) 117 | ancestors = [] 118 | 119 | assert_equal("strong", element.tag_name) 120 | assert_equal(ancestors, element.ancestors) 121 | end 122 | end 123 | 124 | def test_that_it_knows_empty_ancestors 125 | frag = "Wow!" 126 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetEmptyAncestors.new]).rewrite(frag) 127 | end 128 | 129 | class AppendHtml 130 | SELECTOR = Selma::Selector.new(match_element: "strong") 131 | 132 | def selector 133 | SELECTOR 134 | end 135 | 136 | def handle_element(element) 137 | element.append("Gee!", as: :html) 138 | end 139 | end 140 | 141 | def test_that_it_appends_as_html 142 | frag = "Wow!" 143 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [AppendHtml.new]).rewrite(frag) 144 | 145 | assert_equal("Wow!Gee!", modified_doc) 146 | end 147 | 148 | class AppendText 149 | SELECTOR = Selma::Selector.new(match_element: "strong") 150 | 151 | def selector 152 | SELECTOR 153 | end 154 | 155 | def handle_element(element) 156 | element.append("Gee!", as: :text) 157 | end 158 | end 159 | 160 | def test_that_it_appends_as_text 161 | frag = "Wow!" 
162 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [AppendText.new]).rewrite(frag) 163 | 164 | assert_equal("Wow!<em>Gee!</em>", modified_doc) 165 | end 166 | 167 | class BeforeText 168 | SELECTOR = Selma::Selector.new(match_element: "strong") 169 | 170 | def selector 171 | SELECTOR 172 | end 173 | 174 | def handle_element(element) 175 | element.before("wow?", as: :html) 176 | end 177 | end 178 | 179 | def test_that_it_adds_text_before_as_html 180 | frag = "Wow!" 181 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [BeforeText.new]).rewrite(frag) 182 | 183 | assert_equal(%(wow?Wow!), modified_doc) 184 | end 185 | 186 | class AfterText 187 | SELECTOR = Selma::Selector.new(match_element: "strong") 188 | 189 | def selector 190 | SELECTOR 191 | end 192 | 193 | def handle_element(element) 194 | element.after("ok?", as: :html) 195 | end 196 | end 197 | 198 | def test_that_it_adds_text_after_as_html 199 | frag = "Wow!" 200 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [AfterText.new]).rewrite(frag) 201 | 202 | assert_equal(%(Wow!ok?), modified_doc) 203 | end 204 | 205 | class SetInnerContent 206 | SELECTOR = Selma::Selector.new(match_element: "strong") 207 | 208 | def selector 209 | SELECTOR 210 | end 211 | 212 | def handle_element(element) 213 | element.set_inner_content("Gee!", as: :text) 214 | end 215 | end 216 | 217 | def test_that_it_sets_inner_content 218 | frag = "Wow!" 219 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [SetInnerContent.new]).rewrite(frag) 220 | 221 | assert_equal(%(Gee!), modified_doc) 222 | end 223 | 224 | class RaiseError 225 | SELECTOR = Selma::Selector.new(match_element: "strong") 226 | 227 | def selector 228 | SELECTOR 229 | end 230 | 231 | def handle_element(element) 232 | raise NoMethodError, "boom!" 
233 | end 234 | end 235 | 236 | # TODO: note that this error does not match, because 237 | # it's difficult to pluck from magnus 238 | def test_that_it_can_raise_errors 239 | frag = "Wow!" 240 | assert_raises(RuntimeError) do 241 | Selma::Rewriter.new(sanitizer: nil, handlers: [RaiseError.new]).rewrite(frag) 242 | end 243 | end 244 | 245 | class SetTagName 246 | SELECTOR = Selma::Selector.new(match_element: "strong") 247 | 248 | def selector 249 | SELECTOR 250 | end 251 | 252 | def handle_element(element) 253 | element.tag_name = "bold" 254 | end 255 | end 256 | 257 | def test_that_it_sets_tag_name 258 | frag = "Wow!" 259 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [SetTagName.new]).rewrite(frag) 260 | 261 | assert_equal(%(Wow!), modified_doc) 262 | end 263 | 264 | class GetIsSelfClosing < Minitest::Test 265 | SELECTOR = Selma::Selector.new(match_element: "strong") 266 | 267 | # rubocop:disable Lint/MissingSuper 268 | def initialize 269 | @assertions = 0 270 | end 271 | # rubocop:enable Lint/MissingSuper 272 | 273 | def selector 274 | SELECTOR 275 | end 276 | 277 | def handle_element(element) 278 | assert_equal("strong", element.self_closing?) 279 | end 280 | end 281 | 282 | def test_that_it_gets_self_closing 283 | frag = "Wow!" 284 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetEmptyAncestors.new]).rewrite(frag) 285 | end 286 | 287 | class GetHasAttr < Minitest::Test 288 | SELECTOR = Selma::Selector.new(match_element: "strong") 289 | 290 | # rubocop:disable Lint/MissingSuper 291 | def initialize 292 | @assertions = 0 293 | end 294 | # rubocop:enable Lint/MissingSuper 295 | 296 | def selector 297 | SELECTOR 298 | end 299 | 300 | def handle_element(element) 301 | assert(element.has_attribute?("class")) 302 | end 303 | end 304 | 305 | def test_that_it_has_attr 306 | frag = %(Wow!) 
307 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetHasAttr.new]).rewrite(frag) 308 | end 309 | 310 | class RemoveElement < Minitest::Test 311 | SELECTOR = Selma::Selector.new(match_element: "strong") 312 | 313 | # rubocop:disable Lint/MissingSuper 314 | def initialize 315 | @assertions = 0 316 | end 317 | # rubocop:enable Lint/MissingSuper 318 | 319 | def selector 320 | SELECTOR 321 | end 322 | 323 | def handle_element(element) 324 | element.remove 325 | 326 | assert_predicate(element, :removed?) 327 | end 328 | end 329 | 330 | def test_that_it_can_remove 331 | frag = "
Wow!
" 332 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [RemoveElement.new]).rewrite(frag) 333 | 334 | assert_equal(%(
Wow
), modified_doc) 335 | end 336 | 337 | class RemoveElementAndKeepContent < Minitest::Test 338 | SELECTOR = Selma::Selector.new(match_element: "strong") 339 | 340 | # rubocop:disable Lint/MissingSuper 341 | def initialize 342 | @assertions = 0 343 | end 344 | # rubocop:enable Lint/MissingSuper 345 | 346 | def selector 347 | SELECTOR 348 | end 349 | 350 | def handle_element(element) 351 | element.remove_and_keep_content 352 | 353 | assert_predicate(element, :removed?) 354 | end 355 | end 356 | 357 | def test_that_it_can_remove_and_keep_content 358 | frag = "
Wow!
" 359 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [RemoveElementAndKeepContent.new]).rewrite(frag) 360 | 361 | assert_equal(%(
Wow!
), modified_doc) 362 | end 363 | end 364 | -------------------------------------------------------------------------------- /test/selma_rewriter_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | class SelmaRewriterTest < Minitest::Test 6 | def test_max_memory_settings_must_be_correctly_set 7 | fragment = "12345" 8 | assert_raises(ArgumentError) do # missing preallocated_parsing_buffer_size 9 | Selma::Rewriter.new(options: { memory: { max_allowed_memory_usage: 4 } }).rewrite(fragment) 10 | end 11 | end 12 | 13 | class RemoveLinkClass 14 | SELECTOR = Selma::Selector.new(match_element: %(a:not([class="anchor"]))) 15 | 16 | def selector 17 | SELECTOR 18 | end 19 | 20 | def handle_element(element) 21 | element.remove_attribute("class") 22 | end 23 | end 24 | 25 | class RemoveIdAttributes 26 | SELECTOR = Selma::Selector.new(match_element: %(a[id], li[id])) 27 | 28 | def selector 29 | SELECTOR 30 | end 31 | 32 | def handle_element(element) 33 | # footnote ids should not be removed 34 | return if element.tag_name == "li" 35 | return if element.tag_name == "a" # NOTE(review): returns for every "a", so the anchor-class guard below can never fire 36 | 37 | # links with generated header anchors should not be removed 38 | return if element.tag_name == "a" && element["class"] == "anchor" 39 | 40 | element.remove_attribute("id") # NOTE(review): presumably never reached, since SELECTOR only names a[id] and li[id] - confirm intent 41 | end 42 | end 43 | 44 | class BaseRemoveRel 45 | SELECTOR = Selma::Selector.new(match_element: %(a)) 46 | 47 | def selector 48 | SELECTOR 49 | end 50 | 51 | def handle_element(element) 52 | # we allow rel="license" to support the Rel-license microformat 53 | # http://microformats.org/wiki/rel-license 54 | unless element["rel"] == "license" 55 | element.remove_attribute("rel") 56 | end 57 | end 58 | end 59 | 60 | def test_max_memory_settings_work 61 | base_text = ->(itr) { 62 | %|

#{itr}

| 63 | } 64 | 65 | str = [] 66 | 10.times do |itr| 67 | str << base_text.call(itr) 68 | end 69 | html = str.join("\n") 70 | 71 | sanitizer_config = Selma::Sanitizer.new(Selma::Sanitizer::Config::RELAXED) 72 | rewriter = Selma::Rewriter.new(sanitizer: sanitizer_config, handlers: [RemoveLinkClass.new, RemoveIdAttributes.new, BaseRemoveRel.new], options: { memory: { max_allowed_memory_usage: html.length / 2, preallocated_parsing_buffer_size: html.length / 4 } }) 73 | assert_raises(RuntimeError) do 74 | rewriter.rewrite(html) 75 | end 76 | end 77 | 78 | class ElementRewriter 79 | SELECTOR = Selma::Selector.new(match_text_within: "*") 80 | 81 | def selector 82 | SELECTOR 83 | end 84 | 85 | def handle_text_chunk(text) 86 | content = text.to_s 87 | return unless content.include?("@") 88 | 89 | html = content.gsub(/@(\w+)/, "@\\1") 90 | 91 | text.replace(html, as: :html) 92 | end 93 | end 94 | 95 | def test_rewritten_text_chunk_is_not_sanitized 96 | initial_html = "

Hey there, @gjtorikian is here.

" 97 | 98 | sanitizer_config = Selma::Sanitizer.new({ 99 | elements: ["a", "p"], 100 | attributes: { 101 | "a" => ["href"], 102 | }, 103 | protocols: { 104 | "a" => { "href" => ["https"] }, 105 | }, 106 | }) 107 | rewriter = Selma::Rewriter.new(sanitizer: sanitizer_config, handlers: [ElementRewriter.new]) 108 | result = rewriter.rewrite(initial_html) 109 | 110 | # `class` is not sanitized out 111 | assert_equal("

Hey there, @gjtorikian is here.

", result) 112 | end 113 | 114 | def test_stress_garbage_collection 115 | initial_html = File.read(File.join(__dir__, "benchmark", "html", "document-sm.html")).encode("UTF-8", invalid: :replace, undef: :replace) 116 | 117 | sanitizer_config = Selma::Sanitizer.new({ 118 | elements: ["a", "p"], 119 | attributes: { 120 | "a" => ["href"], 121 | }, 122 | protocols: { 123 | "a" => { "href" => ["https"] }, 124 | }, 125 | }) 126 | 127 | GC.stress = true 128 | # If this segfaults, then it failed the test 129 | rewriter = Selma::Rewriter.new(sanitizer: sanitizer_config, handlers: [ElementRewriter.new]) 130 | rewriter.rewrite(initial_html) 131 | GC.stress = false 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /test/selma_rewriter_text_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | class SelmaRewriterTextTest < Minitest::Test 6 | class TextRewriteAll 7 | SELECTOR = Selma::Selector.new(match_text_within: "*") 8 | 9 | def selector 10 | SELECTOR 11 | end 12 | 13 | def handle_text_chunk(text) 14 | text.replace(text.to_s.sub("Wow", "MEOW!"), as: :text) 15 | end 16 | end 17 | 18 | def test_that_it_works_for_all 19 | frag = "
Wow!
Wow!Wow!" 20 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextRewriteAll.new]).rewrite(frag) 21 | 22 | assert_equal("
MEOW!!
MEOW!!MEOW!!", modified_doc) 23 | end 24 | 25 | class GetTextContent < Minitest::Test 26 | SELECTOR = Selma::Selector.new(match_text_within: "*") 27 | 28 | # rubocop:disable Lint/MissingSuper 29 | def initialize 30 | @assertions = 0 31 | end 32 | # rubocop:enable Lint/MissingSuper 33 | 34 | def selector 35 | SELECTOR 36 | end 37 | 38 | def handle_text_chunk(text_chunk) 39 | assert_equal(:rc_data, text_chunk.text_type) 40 | end 41 | end 42 | 43 | def test_that_it_gets_text_content 44 | frag = "Howdy" 45 | Selma::Rewriter.new(sanitizer: nil, handlers: [GetTextContent.new]).rewrite(frag) 46 | end 47 | 48 | class TextRewriteElements 49 | SELECTOR = Selma::Selector.new(match_text_within: "a, div") 50 | 51 | def selector 52 | SELECTOR 53 | end 54 | 55 | def handle_text_chunk(text) 56 | text.replace(text.content.sub("Wow", "MEOW!"), as: :text) 57 | end 58 | end 59 | 60 | def test_that_it_works_for_multiple_elements 61 | frag = "
Wow!
Wow!Wow!" 62 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextRewriteElements.new]).rewrite(frag) 63 | 64 | assert_equal("
MEOW!!
Wow!MEOW!!", modified_doc) 65 | end 66 | 67 | class AddTextBefore 68 | SELECTOR = Selma::Selector.new(match_text_within: "div") 69 | 70 | def selector 71 | SELECTOR 72 | end 73 | 74 | def handle_text_chunk(text) 75 | text.before("MEOW! ", as: :text) 76 | end 77 | end 78 | 79 | def test_that_it_adds_text_before 80 | frag = "
Wow!
" 81 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [AddTextBefore.new]).rewrite(frag) 82 | 83 | assert_equal("
MEOW! Wow!
", modified_doc) 84 | end 85 | 86 | class AddTextAfter 87 | SELECTOR = Selma::Selector.new(match_text_within: "div") 88 | 89 | def selector 90 | SELECTOR 91 | end 92 | 93 | def handle_text_chunk(text) 94 | text.after(" MEOW!", as: :text) 95 | end 96 | end 97 | 98 | def test_that_it_adds_text_after 99 | frag = "
Wow!
" 100 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [AddTextAfter.new]).rewrite(frag) 101 | 102 | assert_equal("
Wow! MEOW!
", modified_doc) 103 | end 104 | 105 | class TextRewriteAndMatchElements 106 | SELECTOR = Selma::Selector.new(match_element: "div", match_text_within: "div, p, a") 107 | 108 | def selector 109 | SELECTOR 110 | end 111 | 112 | def handle_element(element) 113 | element["class"] = "neato" 114 | end 115 | 116 | def handle_text_chunk(text) 117 | text.replace(text.to_s.sub("you", "y'all"), as: :html) 118 | end 119 | end 120 | 121 | def test_that_it_works_for_multiple_match_and_text_elements 122 | frag = "

Could you visit this link and tell me what you think? Thank you!

" 123 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextRewriteAndMatchElements.new]).rewrite(frag) 124 | 125 | assert_equal("

Could y'all visit this link and tell me what y'all think? Thank y'all!

", modified_doc) 126 | end 127 | 128 | class TextMatchAndRejectElements 129 | SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: ["code", "pre"]) 130 | 131 | def selector 132 | SELECTOR 133 | end 134 | 135 | def handle_text_chunk(text) 136 | text.replace(text.to_s.sub("@gjtorik", "@gjtorikian"), as: :text) 137 | end 138 | end 139 | 140 | def test_that_it_works_for_text_reject 141 | frag = "

Hello @gjtorik: @gjtorik


@gjtorik
" 142 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextMatchAndRejectElements.new]).rewrite(frag) 143 | 144 | assert_equal("

Hello @gjtorikian: @gjtorik


@gjtorik
", modified_doc) 145 | end 146 | 147 | class RejectIndirectAncestorText 148 | SELECTOR = Selma::Selector.new(match_text_within: "*", ignore_text_within: ["code"]) 149 | 150 | def selector 151 | SELECTOR 152 | end 153 | 154 | def handle_text_chunk(text) 155 | text.replace(text.to_s.sub("foo", "bar"), as: :html) 156 | end 157 | end 158 | 159 | def test_that_text_reject_considers_ancestors 160 | frag = "

foo

foofoo" 161 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [RejectIndirectAncestorText.new]).rewrite(frag) 162 | 163 | assert_equal("

bar

foofoo", modified_doc) 164 | end 165 | 166 | class TextRewriteOne 167 | SELECTOR = Selma::Selector.new(match_text_within: "*") 168 | 169 | def selector 170 | SELECTOR 171 | end 172 | 173 | def handle_text_chunk(text) 174 | text.replace(text.to_s.tr("1", "2"), as: :text) 175 | end 176 | end 177 | 178 | class TextRewriteTwo 179 | SELECTOR = Selma::Selector.new(match_text_within: "*") 180 | 181 | def selector 182 | SELECTOR 183 | end 184 | 185 | def handle_text_chunk(text) 186 | text.replace(text.to_s.tr("2", "3"), as: :text) 187 | end 188 | end 189 | 190 | def test_that_it_stacks_two_text_changes 191 | frag = "
1 + 2 = 6
" 192 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextRewriteOne.new, TextRewriteTwo.new]).rewrite(frag) 193 | 194 | assert_equal("
3 + 3 = 6
", modified_doc) 195 | end 196 | 197 | class HTMLRewriteOne 198 | SELECTOR = Selma::Selector.new(match_text_within: "*") 199 | 200 | def selector 201 | SELECTOR 202 | end 203 | 204 | def handle_text_chunk(text) 205 | text.replace(text.to_s.sub("1", "1"), as: :html) 206 | end 207 | end 208 | 209 | class HTMLRewriteTwo 210 | SELECTOR = Selma::Selector.new(match_text_within: "*") 211 | 212 | def selector 213 | SELECTOR 214 | end 215 | 216 | def handle_text_chunk(text) 217 | text.replace(text.to_s.sub("2", "2"), as: :html) 218 | end 219 | end 220 | 221 | def test_that_it_stacks_two_html_changes 222 | frag = "
1 + 2 = 3
" 223 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [HTMLRewriteOne.new, HTMLRewriteTwo.new]).rewrite(frag) 224 | 225 | assert_equal("
1 + 2 = 3
", modified_doc) 226 | end 227 | 228 | class TextStringResizeHandler 229 | DEFAULT_IGNORED_ANCESTOR_TAGS = ["pre", "code", "tt"].freeze 230 | 231 | def selector 232 | Selma::Selector.new(match_text_within: "*", ignore_text_within: DEFAULT_IGNORED_ANCESTOR_TAGS) 233 | end 234 | 235 | def handle_text_chunk(text) 236 | return text unless text.to_s.include?(":") 237 | 238 | text.replace(emoji_image_filter(text.to_s), as: :html) 239 | end 240 | 241 | def emoji_image_filter(text) 242 | text.gsub(emoji_pattern) do 243 | emoji_image_tag(Regexp.last_match(1)) 244 | end 245 | end 246 | 247 | def emoji_pattern 248 | @emoji_pattern ||= /:(#{emoji_names.map { |name| Regexp.escape(name) }.join("|")}):/ 249 | end 250 | 251 | def emoji_names 252 | Gemojione::Index.new.all.map { |i| i[1]["name"] }.flatten.sort 253 | end 254 | 255 | # Default attributes for img tag 256 | private def default_img_attrs(name) 257 | { 258 | "class" => "emoji", 259 | "title" => ":#{name}:", 260 | "alt" => ":#{name}:", 261 | "src" => emoji_url(name).to_s, 262 | "height" => "20", 263 | "width" => "20", 264 | "align" => "absmiddle", 265 | } 266 | end 267 | 268 | private def emoji_url(name) 269 | File.join("emoji", emoji_filename(name)) 270 | end 271 | 272 | private def emoji_filename(name) 273 | Gemojione.image_url_for_name(name).sub(Gemojione.asset_host, "") 274 | end 275 | 276 | # Build an emoji image tag 277 | private def emoji_image_tag(name) 278 | html_attrs = default_img_attrs(name).transform_keys(&:to_sym) 279 | .merge!({}).transform_keys(&:to_sym) 280 | .each_with_object([]) do |(attr, value), arr| 281 | next if value.nil? 
282 | 283 | value = value.respond_to?(:call) && value.call(name) || value 284 | arr << %(#{attr}="#{value}") 285 | end.compact.join(" ") 286 | 287 | "" 288 | end 289 | end 290 | 291 | def test_that_it_can_handle_text_chunk_with_emoji 292 | require "gemojione" 293 | 294 | frag = ":flag_ar:" 295 | modified_doc = Selma::Rewriter.new(sanitizer: nil, handlers: [TextStringResizeHandler.new]).rewrite(frag) 296 | 297 | assert_equal(%(:flag_ar:), modified_doc) 298 | end unless ENV["CI"] # TODO: why doesn't this work in CI? 299 | end 300 | -------------------------------------------------------------------------------- /test/selma_sanitizer_comments_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerCommentsTest < Minitest::Test 7 | describe "sanitization" do 8 | context "when :allow_comments is false" do 9 | def setup 10 | @sanitizer = Selma::Sanitizer.new({ allow_comments: false, elements: ["div"] }) 11 | end 12 | 13 | def test_it_removes_comments 14 | assert_equal( 15 | "foo bar", 16 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo bar"), 17 | ) 18 | assert_equal("foo ", Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo bar"), 23 | ) 24 | assert_equal( 25 | "foo --> -->bar", 26 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo --> -->bar"), 27 | ) 28 | assert_equal( 29 | "foo ", 30 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo
>bar
"), 31 | ) 32 | 33 | # Special case: the comment markup is inside a "), 38 | ) 39 | 40 | sanitizer = Selma::Sanitizer.new({ allow_comments: false, elements: ["script"] }) 41 | 42 | assert_equal("", Selma::Rewriter.new(sanitizer: sanitizer).rewrite("")) 43 | end 44 | end 45 | 46 | context "when :allow_comments is true" do 47 | def setup 48 | @sanitizer = Selma::Sanitizer.new({ allow_comments: true, elements: ["div"] }) 49 | end 50 | 51 | def test_it_keeps_comments 52 | assert_equal( 53 | "foo bar", 54 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo bar"), 55 | ) 56 | assert_equal("foo bar", 63 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo bar"), 64 | ) 65 | assert_equal( 66 | "foo --> -->bar", 67 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo --> -->bar"), 68 | ) 69 | 70 | assert_equal( 71 | "foo ", 72 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo
>bar
"), 73 | ) 74 | 75 | sanitizer = Selma::Sanitizer.new({ allow_comments: true, elements: ["script"] }) 76 | 77 | assert_equal("", Selma::Rewriter.new(sanitizer: sanitizer).rewrite("")) 78 | end 79 | end 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /test/selma_sanitizer_config_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerConfigTest < Minitest::Test 7 | def test_built_in_configs_should_be_deeply_frozen 8 | verify_deeply_frozen(Selma::Sanitizer::Config::DEFAULT) 9 | verify_deeply_frozen(Selma::Sanitizer::Config::BASIC) 10 | verify_deeply_frozen(Selma::Sanitizer::Config::RELAXED) 11 | verify_deeply_frozen(Selma::Sanitizer::Config::RESTRICTED) 12 | end 13 | 14 | def test_should_deeply_freeze_and_return_a_configuration_hash 15 | a = { one: { one_one: [0, "1", :a], one_two: false, one_three: Set.new([:a, :b, :c]) } } 16 | b = Selma::Sanitizer::Config.freeze_config(a) 17 | 18 | assert_equal(a, b) 19 | verify_deeply_frozen(a) 20 | end 21 | 22 | def test_should_deeply_merge_a_configuration_hash 23 | # Freeze to ensure that we get an error if either Hash is modified. 
24 | a = Selma::Sanitizer::Config.freeze_config({ 25 | one: { 26 | one_one: [0, "1", :a], 27 | one_two: false, 28 | one_three: Set.new([:a, :b, :c]), 29 | }, 30 | }) 31 | b = Selma::Sanitizer::Config.freeze_config({ one: { one_two: true, one_three: 3 }, two: 2 }) 32 | 33 | c = Selma::Sanitizer::Config.merge(a, b) 34 | 35 | refute_equal(c, a) 36 | refute_equal(c, b) 37 | 38 | assert_equal( 39 | { 40 | one: { 41 | one_one: [0, "1", :a], 42 | one_two: true, 43 | one_three: 3, 44 | }, 45 | 46 | two: 2, 47 | }, 48 | c, 49 | ) 50 | end 51 | 52 | def test_should_raise_an_argumenterror_if_either_argument_is_not_a_hash 53 | assert_raises(ArgumentError) { Selma::Sanitizer::Config.merge("foo", {}) } 54 | assert_raises(ArgumentError) { Selma::Sanitizer::Config.merge({}, "foo") } 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /test/selma_sanitizer_doctype_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerDoctypeTest < Minitest::Test 7 | describe "sanitization" do 8 | context "when :allow_doctype is false" do 9 | def setup 10 | @sanitizer = Selma::Sanitizer.new({ allow_doctype: false, elements: ["html"] }) 11 | end 12 | 13 | def test_it_removes_doctype 14 | assert_equal( 15 | "foo", 16 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 17 | ) 18 | assert_equal("foo", Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo")) 19 | end 20 | end 21 | 22 | def test_blocks_invalid_doctypes_in_documents 23 | skip("non-essential feature") 24 | @sanitizer = Selma::Sanitizer.new({ allow_doctype: true, elements: ["html"] }) 25 | 26 | assert_equal( 27 | "foo", 28 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 29 | ) 30 | assert_equal( 31 | "foo", 32 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 33 | ) 34 | assert_equal( 35 | "foo", 36 | 
Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('foo'), 37 | ) 38 | assert_equal( 39 | "foo", 40 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 41 | ) 42 | end 43 | 44 | context "when :allow_doctype is true" do 45 | def setup 46 | @sanitizer = Selma::Sanitizer.new({ allow_doctype: true, elements: ["html"] }) 47 | end 48 | 49 | def test_it_allows_doctypes_in_documents 50 | assert_equal( 51 | "foo", 52 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 53 | ) 54 | assert_equal( 55 | 'foo', 56 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('foo'), 57 | ) 58 | assert_equal( 59 | 'foo', 60 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite( 61 | 'foo', 62 | ), 63 | ) 64 | end 65 | 66 | def test_blocks_invalid_doctypes_in_documents 67 | skip("non-essential feature") 68 | 69 | assert_equal( 70 | "foo", 71 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo" 72 | .rewrite(sanitizer: @sanitizer)), 73 | ) 74 | assert_equal( 75 | "foo", 76 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 77 | ) 78 | assert_equal( 79 | "foo", 80 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('foo'), 81 | ) 82 | assert_equal( 83 | "foo", 84 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo"), 85 | ) 86 | end 87 | end 88 | end 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /test/selma_sanitizer_elements_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerTest < Minitest::Test 7 | describe "sanitize" do 8 | context "Default config" do 9 | def test_remove_non_allowlisted_elements_leaving_safe_contents_behind 10 | assert_equal( 11 | "foo bar baz quux", 12 | Selma::Rewriter.new.rewrite('foo bar baz quux'), 13 | ) 14 | assert_equal("", Selma::Rewriter.new.rewrite('')) 15 | assert_equal("", Selma::Rewriter.new.rewrite('<', 18 | 
Selma::Rewriter.new.rewrite('< script <>> alert("");'), 19 | ) 20 | end 21 | 22 | def test_should_surround_the_contents_of_whitespace_elements_with_space_characters_when_removing_the_element 23 | assert_equal("foo bar baz", Selma::Rewriter.new.rewrite("foo
bar
baz")) 24 | assert_equal("foo bar baz", Selma::Rewriter.new.rewrite("foo
bar
baz")) 25 | assert_equal("foo bar baz", Selma::Rewriter.new.rewrite("foo
bar
baz")) 26 | end 27 | 28 | def test_should_not_choke_on_several_instances_of_the_same_element_in_a_row 29 | assert_equal( 30 | "", 31 | Selma::Rewriter.new.rewrite(''), 32 | ) 33 | end 34 | 35 | def test_should_not_preserve_the_content_of_removed_iframe_elements 36 | assert_equal("", Selma::Rewriter.new.rewrite("")) 37 | end 38 | 39 | def test_should_not_preserve_the_content_of_removed_math_elements 40 | assert_equal("", Selma::Rewriter.new.rewrite("hello! ")) 41 | end 42 | 43 | def test_should_not_preserve_the_content_of_removed_noembed_elements 44 | assert_equal("", Selma::Rewriter.new.rewrite("hello! <script>alert(0)</script>")) 45 | end 46 | 47 | def test_should_not_preserve_the_content_of_removed_noframes_elements 48 | assert_equal( 49 | "", 50 | Selma::Rewriter.new.rewrite("hello! <script>alert(0)</script>"), 51 | ) 52 | end 53 | 54 | def test_should_not_preserve_the_content_of_removed_noscript_elements 55 | assert_equal( 56 | "", 57 | Selma::Rewriter.new.rewrite(""), 58 | ) 59 | end 60 | 61 | def test_should_not_preserve_the_content_of_removed_plaintext_elements 62 | assert_equal("", Selma::Rewriter.new.rewrite("hello! <script>alert(0)</script>")) 63 | end 64 | 65 | def test_should_not_preserve_the_content_of_removed_script_elements 66 | # NOTE: this gets confused by the embedding 67 | assert_equal("</script>", Selma::Rewriter.new.rewrite("<script>hello! <script>alert(0)</script></script>")) 68 | end 69 | 70 | def test_should_not_preserve_the_content_of_removed_style_elements 71 | assert_equal("", Selma::Rewriter.new.rewrite("<style>hello! <script>alert(0)</script></style>")) 72 | end 73 | 74 | def test_should_not_preserve_the_content_of_removed_svg_elements 75 | assert_equal("", Selma::Rewriter.new.rewrite("<svg>hello! <script>alert(0)</script></svg>")) 76 | end 77 | 78 | def test_should_not_preserve_the_content_of_removed_xmp_elements 79 | assert_equal("", Selma::Rewriter.new.rewrite("<xmp>hello! 
<script>alert(0)</script></xmp>")) 80 | end 81 | 82 | STRINGS.each do |name, data| 83 | define_method :"test_should_clean_#{name}_HTML" do 84 | assert_equal(data[:default], Selma::Rewriter.new.rewrite(data[:html])) 85 | end 86 | end 87 | 88 | PROTOCOLS.each do |name, data| 89 | define_method :"test_should_not_allow_#{name}" do 90 | assert_equal(data[:default], Selma::Rewriter.new.rewrite(data[:html])) 91 | end 92 | end 93 | end 94 | 95 | context "Restricted config" do 96 | def setup 97 | @sanitizer = Selma::Sanitizer.new(Selma::Sanitizer::Config::RESTRICTED) 98 | end 99 | 100 | STRINGS.each do |name, data| 101 | define_method :"test_should_clean_#{name}_HTML" do 102 | assert_equal(data[:restricted], Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(data[:html])) 103 | end 104 | end 105 | 106 | PROTOCOLS.each do |name, data| 107 | define_method :"test_should_not_allow_#{name}" do 108 | assert_equal(data[:restricted], Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(data[:html])) 109 | end 110 | end 111 | end 112 | 113 | context "Basic config" do 114 | def setup 115 | @sanitizer = Selma::Sanitizer.new(Selma::Sanitizer::Config::BASIC) 116 | end 117 | 118 | def test_should_not_choke_on_valueless_attributes 119 | assert_equal( 120 | "foo <a href>foo</a> bar", 121 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("foo <a href>foo</a> bar"), 122 | ) 123 | end 124 | 125 | def test_should_downcase_attribute_names_when_checking 126 | assert_equal( 127 | "<a>bar</a>", 128 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<a HREF="javascript:alert(\'foo\')">bar</a>'), 129 | ) 130 | end 131 | 132 | STRINGS.each do |name, data| 133 | define_method :"test_should_clean_#{name}_HTML" do 134 | assert_equal(data[:basic], Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(data[:html])) 135 | end 136 | end 137 | 138 | PROTOCOLS.each do |name, data| 139 | define_method :"test_should_not_allow_#{name}" do 140 | assert_equal(data[:basic], Selma::Rewriter.new(sanitizer: 
@sanitizer).rewrite(data[:html])) 141 | end 142 | end 143 | end 144 | 145 | context "Relaxed config" do 146 | def setup 147 | @sanitizer = Selma::Sanitizer.new(Selma::Sanitizer::Config::RELAXED) 148 | end 149 | 150 | def test_should_encode_special_chars_in_attribute_values 151 | assert_equal( 152 | '<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>', 153 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>'), 154 | ) 155 | end 156 | 157 | STRINGS.each do |name, data| 158 | define_method :"test_should_clean_#{name}_HTML" do 159 | assert_equal(data[:relaxed], Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(data[:html])) 160 | end 161 | end 162 | 163 | PROTOCOLS.each do |name, data| 164 | define_method :"test_should_not_allow_#{name}" do 165 | assert_equal(data[:relaxed], Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(data[:html])) 166 | end 167 | end 168 | end 169 | 170 | context "Custom config" do 171 | def test_should_allow_attributes_on_all_elements_if_allowlisted_under_all 172 | input = "<p>bar</p>" 173 | Selma::Rewriter.new.rewrite(input) 174 | 175 | assert_equal(" bar ", Selma::Rewriter.new.rewrite(input)) 176 | 177 | sanitizer = Selma::Sanitizer.new({ 178 | elements: ["p"], 179 | attributes: { all: ["class"] }, 180 | }) 181 | 182 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 183 | 184 | sanitizer = Selma::Sanitizer.new({ 185 | elements: ["p"], 186 | attributes: { "div" => ["class"] }, 187 | }) 188 | 189 | assert_equal("<p>bar</p>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 190 | 191 | sanitizer = Selma::Sanitizer.new({ 192 | elements: ["p"], 193 | attributes: { "p" => ["title"], :all => ["class"] }, 194 | }) 195 | 196 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 197 | end 198 | 199 | def 
test_should_not_allow_relative_urls_when_relative_urls_arent_allowlisted 200 | input = '<a href="/foo/bar">Link</a>' 201 | 202 | sanitizer = Selma::Sanitizer.new({ 203 | elements: ["a"], 204 | attributes: { "a" => ["href"] }, 205 | protocols: { "a" => { "href" => ["http"] } }, 206 | }) 207 | 208 | assert_equal("<a>Link</a>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 209 | end 210 | 211 | def test_should_allow_relative_urls_containing_colons_when_the_colon_is_not_in_the_first_path_segment 212 | input = '<a href="/wiki/Special:Random">Random Page</a>' 213 | 214 | sanitizer = Selma::Sanitizer.new({ 215 | elements: ["a"], 216 | attributes: { "a" => ["href"] }, 217 | protocols: { "a" => { "href" => [:relative] } }, 218 | }) 219 | 220 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 221 | end 222 | 223 | def test_should_allow_relative_urls_containing_colons_when_the_colon_is_part_of_an_anchor 224 | input = '<a href="#fn:1">Footnote 1</a>' 225 | 226 | sanitizer = Selma::Sanitizer.new({ 227 | elements: ["a"], 228 | attributes: { "a" => ["href"] }, 229 | protocols: { "a" => { "href" => [:relative] } }, 230 | }) 231 | 232 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 233 | 234 | input = '<a href="somepage#fn:1">Footnote 1</a>' 235 | 236 | sanitizer = Selma::Sanitizer.new({ 237 | elements: ["a"], 238 | attributes: { "a" => ["href"] }, 239 | protocols: { "a" => { "href" => [:relative] } }, 240 | }) 241 | 242 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 243 | 244 | input = '<a href="fn:1">Footnote 1</a>' 245 | 246 | sanitizer = Selma::Sanitizer.new({ 247 | elements: ["a"], 248 | attributes: { "a" => ["href"] }, 249 | protocols: { "a" => { "href" => [:relative] } }, 250 | }) 251 | 252 | assert_equal("<a>Footnote 1</a>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 253 | end 254 | 255 | def test_should_allow_all_protocols_if_asked 256 | input = <<~HTML 
257 | <a href="/foo/bar">Link</a> 258 | <a href="http://wow.com/foo/bar">Link</a> 259 | <a href="https://wow.com/foo/bar">Link</a> 260 | <a href="ftp://wow.com/foo/bar">Link</a> 261 | <a href="ssh://127.0.0.1">Link</a> 262 | HTML 263 | 264 | sanitizer = Selma::Sanitizer.new({ 265 | elements: ["a"], 266 | attributes: { "a" => ["href"] }, 267 | protocols: { "a" => { "href" => :all } }, 268 | }) 269 | 270 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 271 | end 272 | 273 | def test_should_remove_the_contents_of_filtered_nodes_when_remove_contents_is_true 274 | sanitizer = Selma::Sanitizer.new({ remove_contents: true }) 275 | 276 | assert_equal( 277 | "foo bar ", 278 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite("foo bar <div>baz<span>quux</span></div>"), 279 | ) 280 | end 281 | 282 | def test_remove_the_contents_of_specified_nodes_when_remove_contents_is_an_array_of_element_names_as_strings 283 | sanitizer = Selma::Sanitizer.new({ remove_contents: ["script", "span"] }) 284 | 285 | assert_equal( 286 | "foo bar baz hi", 287 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>'), 288 | ) 289 | end 290 | 291 | def test_raises_when_remove_contents_is_a_set_of_element_names_as_strings 292 | assert_raises(ArgumentError) do 293 | Selma::Sanitizer.new({ remove_contents: Set.new(["script", "span"]) }) 294 | end 295 | end 296 | 297 | def test_should_remove_the_contents_of_allowlisted_iframes 298 | sanitizer = Selma::Sanitizer.new({ elements: ["iframe"] }) 299 | 300 | assert_equal( 301 | "<iframe> </iframe>", 302 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite("<iframe>hi <script>hello</script></iframe>"), 303 | ) 304 | end 305 | 306 | def test_should_not_allow_arbitrary_html5_data_attributes_by_default 307 | sanitizer = Selma::Sanitizer.new({ elements: ["b"] }) 308 | 309 | assert_equal("<b></b>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b 
data-foo="bar"></b>')) 310 | 311 | sanitizer = Selma::Sanitizer.new({ 312 | attributes: { "b" => ["class"] }, 313 | elements: ["b"], 314 | }) 315 | 316 | assert_equal( 317 | '<b class="foo"></b>', 318 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b class="foo" data-foo="bar"></b>'), 319 | ) 320 | end 321 | 322 | def test_should_allow_arbitrary_html5_data_attributes 323 | sanitizer = Selma::Sanitizer.new( 324 | attributes: { "b" => ["data-foo", "data-bar"] }, 325 | elements: ["b"], 326 | ) 327 | 328 | str = '<b data-foo="valid" data-bar="valid"></b>' 329 | 330 | assert_equal(str, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(str)) 331 | 332 | assert_equal( 333 | "<b></b>", 334 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-="invalid"></b>'), 335 | ) 336 | 337 | assert_equal( 338 | "<b></b>", 339 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-xml="invalid"></b>'), 340 | ) 341 | 342 | assert_equal( 343 | "<b></b>", 344 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-xmlfoo="invalid"></b>'), 345 | ) 346 | 347 | assert_equal( 348 | "<b></b>", 349 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-f:oo="valid"></b>'), 350 | ) 351 | 352 | assert_equal( 353 | "<b></b>", 354 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-f:oo="valid"></b>'), 355 | ) 356 | 357 | assert_equal( 358 | "<b></b>", 359 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-f/oo="partial"></b>'), 360 | ) 361 | 362 | assert_equal( 363 | "<b></b>", 364 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<b data-éfoo="valid"></b>'), 365 | ) 366 | end 367 | 368 | def test_should_handle_protocols_correctly_regardless_of_case 369 | input = '<a href="hTTpS://foo.com/">Text</a>' 370 | 371 | sanitizer = Selma::Sanitizer.new( 372 | elements: ["a"], 373 | attributes: { "a" => ["href"] }, 374 | protocols: { "a" => { "href" => ["https"] } }, 375 | ) 376 | 377 | assert_equal(input, Selma::Rewriter.new(sanitizer: 
sanitizer).rewrite(input)) 378 | 379 | input = '<a href="mailto:someone@example.com?Subject=Hello">Text</a>' 380 | 381 | assert_equal("<a>Text</a>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 382 | end 383 | 384 | def test_should_sanitize_protocols_in_data_attributes_even_if_data_attributes_are_generically_allowed 385 | input = '<a data-url="mailto:someone@example.com">Text</a>' 386 | 387 | sanitizer = Selma::Sanitizer.new( 388 | elements: ["a"], 389 | attributes: { "a" => ["data-url"] }, 390 | protocols: { "a" => { "data-url" => ["https"] } }, 391 | ) 392 | 393 | assert_equal("<a>Text</a>", Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 394 | 395 | sanitizer = Selma::Sanitizer.new( 396 | elements: ["a"], 397 | attributes: { "a" => ["data-url"] }, 398 | protocols: { "a" => { "data-url" => ["mailto"] } }, 399 | ) 400 | 401 | assert_equal(input, Selma::Rewriter.new(sanitizer: sanitizer).rewrite(input)) 402 | end 403 | 404 | def test_should_prevent_meta_tags_from_being_used_to_set_a_non_utf8_charset 405 | sanitizer = Selma::Sanitizer.new( 406 | elements: ["html", "head", "meta", "body"], 407 | attributes: { "meta" => ["charset"] }, 408 | ) 409 | 410 | assert_equal( 411 | "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>", 412 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>'), 413 | ) 414 | 415 | sanitizer = Selma::Sanitizer.new( 416 | elements: ["html", "meta"], 417 | attributes: { "meta" => ["charset"] }, 418 | ) 419 | 420 | assert_equal( 421 | "<html><meta charset=\"utf-8\">Howdy!</html>", 422 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite('<html><meta charset="utf-8">Howdy!</html>'), 423 | ) 424 | 425 | sanitizer = Selma::Sanitizer.new( 426 | elements: ["html", "meta"], 427 | attributes: { "meta" => ["charset"] }, 428 | ) 429 | 430 | assert_equal( 431 | "<html><meta charset=\"utf-8\">Howdy!</html>", 432 | Selma::Rewriter.new(sanitizer: 
sanitizer).rewrite('<html><meta charset="us-ascii">Howdy!</html>'), 433 | ) 434 | end 435 | 436 | def test_should_prevent_meta_tags_from_being_used_to_set_a_non_utf8_charset_when_charset_other_values 437 | skip("non-essential feature") 438 | 439 | sanitizer = Selma::Sanitizer.new( 440 | elements: ["html", "meta"], 441 | attributes: { "meta" => ["content", "http-equiv"] }, 442 | ) 443 | 444 | result = "<!DOCTYPE html><html><meta http-equiv=\"content-type\" content=\" text/html;charset=utf-8\">Howdy!</html>" 445 | 446 | assert_equal(result, Selma::Sanitizer.new(sanitizer: sanitizer).rewrite( 447 | '<html><meta http-equiv="content-type" content=" text/html; charset=us-ascii">Howdy!</html>', 448 | )) 449 | 450 | sanitizer = Selma::Sanitizer.new( 451 | elements: ["html", "meta"], 452 | attributes: { "meta" => ["content", "http-equiv"] }, 453 | ) 454 | 455 | result = '<html><meta http-equiv=\"Content-Type\" content=\"text/plain;charset=utf-8\">Howdy!</html>' 456 | 457 | assert_equal( 458 | result, Selma::Rewriter.new(sanitizer: sanitizer).rewrite( 459 | '<html><meta http-equiv="Content-Type" content="text/plain;charset = us-ascii">Howdy!</html>', 460 | ) 461 | ) 462 | end 463 | 464 | def test_should_not_modify_meta_tags_that_already_set_a_utf8_charset 465 | skip("non-essential feature") 466 | 467 | sanitizer = Selma::Sanitizer.new( 468 | elements: ["html", "head", "meta", "body"], 469 | attributes: { "meta" => ["content", "http-equiv"] }, 470 | ) 471 | 472 | result = '<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>' 473 | 474 | assert_equal( 475 | result, Selma::Rewriter.new(sanitizer: sanitizer).rewrite( 476 | '<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>', sanitizer: sanitizer 477 | ) 478 | ) 479 | end 480 | end 481 | end 482 | end 483 | end 484 | -------------------------------------------------------------------------------- 
/test/selma_sanitizer_malicious_html_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerMaliciousHtmlTest < Minitest::Test 7 | def setup 8 | @sanitizer = Selma::Sanitizer.new(Sanitizer::Config::RELAXED) 9 | end 10 | 11 | def test_should_not_allow_script_injection_via_conditional_comments 12 | assert_equal( 13 | "", 14 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->]), 15 | ) 16 | end 17 | 18 | def test_should_escape_erb_style_tags 19 | skip("non-essential feature") 20 | 21 | assert_equal( 22 | "&lt;% naughty_ruby_code %&gt;", 23 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("<% naughty_ruby_code %>"), 24 | ) 25 | 26 | assert_equal( 27 | "&lt;%= naughty_ruby_code %&gt;", 28 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("<%= naughty_ruby_code %>"), 29 | ) 30 | end 31 | 32 | def test_should_remove_php_style_tags 33 | skip("non-essential feature") 34 | 35 | assert_equal("", Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("<? 
naughtyPHPCode(); ?>")) 36 | 37 | assert_equal("", Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("<?= naughtyPHPCode(); ?>")) 38 | end 39 | 40 | def test_should_not_be_possible_to_inject_js_via_a_malformed_event_attribute 41 | assert_equal( 42 | "<html><head></head><body></body></html>", 43 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>'), 44 | ) 45 | end 46 | 47 | def test_should_not_be_possible_to_inject_an_iframe_using_an_improperly_closed_tag 48 | assert_equal( 49 | "", 50 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%(<iframe src=http://ha.ckers.org/scriptlet.html <)), 51 | ) 52 | end 53 | 54 | def test_should_not_be_possible_to_inject_js_via_an_unquoted_img_src_attribute 55 | assert_equal( 56 | "<img>", 57 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<img src=javascript:alert("XSS")>'), 58 | ) 59 | end 60 | 61 | def test_should_not_be_possible_to_inject_js_using_grave_accents_as_img_src_delimiters 62 | assert_equal( 63 | "<img>", 64 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<img src=`javascript:alert("XSS")`>'), 65 | ) 66 | end 67 | 68 | def test_should_not_be_possible_to_inject_script_via_a_malformed_img_tag 69 | assert_equal( 70 | '<img>">', 71 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<img """><script>alert("XSS")</script>">'), 72 | ) 73 | end 74 | 75 | def test_should_not_be_possible_to_inject_protocol_based_js 76 | assert_equal( 77 | "<img>", 78 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite( 79 | "<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>", 80 | ), 81 | ) 82 | 83 | assert_equal( 84 | "<img>", 85 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite( 86 | "<img 
src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>", 87 | ), 88 | ) 89 | 90 | assert_equal( 91 | "<img>", 92 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite( 93 | "<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>", 94 | ), 95 | ) 96 | 97 | # Encoded tab character. 98 | assert_equal( 99 | "<img>", 100 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="jav&#x09;ascript:alert('XSS');">]), 101 | ) 102 | 103 | # Encoded newline. 104 | assert_equal( 105 | "<img>", 106 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="jav&#x0A;ascript:alert('XSS');">]), 107 | ) 108 | 109 | # Encoded carriage return. 110 | assert_equal( 111 | "<img>", 112 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="jav&#x0D;ascript:alert('XSS');">]), 113 | ) 114 | 115 | # Null byte. 116 | assert_equal( 117 | "<img>", 118 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src=java\0script:alert("XSS")>]), 119 | ) 120 | 121 | # Spaces plus meta char. 122 | assert_equal( 123 | "<img>", 124 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src=" &#14; javascript:alert('XSS');">]), 125 | ) 126 | 127 | # Mixed spaces and tabs. 128 | assert_equal( 129 | "<img>", 130 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="j\na v\tascript://alert('XSS');">]), 131 | ) 132 | end 133 | 134 | def test_should_not_be_possible_to_inject_protocol_based_js_via_whitespace 135 | assert_equal( 136 | "<img>", 137 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="jav\tascript:alert('XSS');">]), 138 | ) 139 | end 140 | 141 | # tag never resolves the way it might in eg. 
Gumbo 142 | def test_should_not_be_possible_to_inject_js_using_a_half_open_img_tag 143 | assert_equal( 144 | "", 145 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<img src="javascript:alert('XSS')"]), 146 | ) 147 | end 148 | 149 | def test_should_not_be_possible_to_inject_script_using_a_malformed_non_alphanumeric_tag_name 150 | assert_equal( 151 | "", 152 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]), 153 | ) 154 | end 155 | 156 | def test_should_not_be_possible_to_inject_script_via_extraneous_open_brackets 157 | assert_equal( 158 | "", 159 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<<script>alert("XSS");//<</script>]), 160 | ) 161 | end 162 | 163 | # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m 164 | 165 | def test_prevents_a_sanitization_bypass_via_carefully_crafted_foreign_content 166 | ["iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "xmp"].each do |tag_name| 167 | assert_equal( 168 | "", 169 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]), 170 | ) 171 | 172 | assert_equal( 173 | "", 174 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]), 175 | ) 176 | end 177 | end 178 | end 179 | end 180 | -------------------------------------------------------------------------------- /test/selma_sanitizer_parser_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerParserTest < Minitest::Test 7 | def test_should_leave_valid_entities_alone 8 | assert_equal("&apos;&eacute;&amp;", Selma::Rewriter.new.rewrite("&apos;&eacute;&amp;")) 9 | end 10 | 11 | def test_should_leave_translate_orphaned_ampersands_alone 12 | assert_equal("at&t", 
Selma::Rewriter.new.rewrite("at&t")) 13 | end 14 | 15 | def test_should_not_add_newlines_after_tags_when_serializing_a_fragment 16 | sanitizer = Selma::Sanitizer.new({ 17 | elements: ["div", "p"], 18 | }) 19 | 20 | assert_equal( 21 | "<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", 22 | Selma::Rewriter.new(sanitizer: sanitizer).rewrite("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>"), 23 | ) 24 | end 25 | 26 | def test_should_not_have_the_nokogiri_1_4_2_unterminated_script_style_element_bug 27 | assert_equal("foo ", Selma::Rewriter.new.rewrite("foo <script>bar")) 28 | 29 | assert_equal("foo ", Selma::Rewriter.new.rewrite("foo <style>bar")) 30 | end 31 | 32 | def test_ambiguous_non_tag_brackets_should_be_parsed_correctly 33 | assert_equal("1 > 2 and 2 < 1", Selma::Rewriter.new.rewrite("1 > 2 and 2 < 1")) 34 | 35 | assert_equal("OMG HAPPY BIRTHDAY! *<:-D", Selma::Rewriter.new.rewrite("OMG HAPPY BIRTHDAY! *<:-D")) 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /test/selma_sanitizer_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "test_helper" 4 | 5 | module Selma 6 | class SanitizerTest < Minitest::Test 7 | def test_it_sanitizes_by_default 8 | html = "<a href='https://google.com'>here is a neat site!</a>" 9 | rewritten = Selma::Rewriter.new.rewrite(html) 10 | 11 | assert_equal("here is a neat site!", rewritten) 12 | end 13 | 14 | def test_it_can_retrieve_elements 15 | hash = { 16 | elements: ["a"], 17 | } 18 | sanitizer = Selma::Sanitizer.new(hash) 19 | 20 | assert_equal(["a"], sanitizer.config[:elements]) 21 | end 22 | 23 | def test_it_can_keep_attributes 24 | hash = { 25 | elements: ["a"], 26 | 27 | attributes: { 28 | "a" => ["href"], 29 | }, 30 | 31 | protocols: { 32 | "a" => { "href" => ["ftp", "http", "https", "mailto", :relative] }, 33 | }, 34 | } 35 | 36 | sanitizer = 
Selma::Sanitizer.new(hash) 37 | html = "<a href='https://google.com'>wow!</a>" 38 | result = Selma::Rewriter.new(sanitizer: sanitizer).rewrite(html) 39 | 40 | assert_equal("<a href=\"https://google.com\">wow!</a>", result) 41 | end 42 | 43 | def test_it_can_remove_attributes 44 | hash = { 45 | elements: ["a"], 46 | 47 | attributes: { 48 | "a" => ["href"], 49 | }, 50 | 51 | protocols: { 52 | "a" => { "href" => ["ftp", "http", "https", "mailto", :relative] }, 53 | }, 54 | } 55 | 56 | sanitizer = Selma::Sanitizer.new(hash) 57 | html = "<a href='https://google.com' class='very'>wow!</a>" 58 | result = Selma::Rewriter.new(sanitizer: sanitizer).rewrite(html) 59 | 60 | assert_equal("<a href=\"https://google.com\">wow!</a>", result) 61 | end 62 | 63 | def test_it_can_be_turned_off 64 | html = '<a href="https://google.com">wow!</a>' 65 | assert_raises(ArgumentError) do 66 | Selma::Rewriter.new(sanitizer: nil).rewrite(html) 67 | end 68 | end 69 | 70 | def test_can_handle_non_standard_elements 71 | frag = <<~FRAG 72 | <svg height="100" width="100"> 73 | <circle cx="50" cy="50" r="40" stroke="black" stroke-width="3" fill="red" /> 74 | </svg> 75 | FRAG 76 | 77 | hash = { 78 | elements: ["svg", "circle"], 79 | attributes: { 80 | "svg" => ["width"], 81 | "circle" => ["cx", "cy", "r"], 82 | }, 83 | } 84 | sanitizer = Selma::Sanitizer.new(hash) 85 | result = Selma::Rewriter.new(sanitizer: sanitizer).rewrite(frag) 86 | 87 | assert_equal(%(<svg width="100">\n<circle cx="50" cy="50" r="40" />\n</svg>\n), result) 88 | end 89 | 90 | describe "#fragment" do 91 | def setup 92 | @sanitizer = Selma::Sanitizer.new(elements: ["html"]) 93 | end 94 | 95 | def test_should_sanitize_an_html_fragment 96 | assert_equal( 97 | "Lorem ipsum dolor sitamet ", 98 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'), 99 | ) 100 | end 
101 | 102 | def test_should_not_modify_the_input_string 103 | input = "<b>foo</b>" 104 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("<b>foo</b>") 105 | 106 | assert_equal("<b>foo</b>", input) 107 | end 108 | 109 | def test_should_not_choke_on_fragments_containing_html_or_body 110 | assert_equal("foo", Selma::Rewriter.new.rewrite("<html><b>foo</b></html>")) 111 | assert_equal("foo", Selma::Rewriter.new.rewrite("<body><b>foo</b></body>")) 112 | assert_equal("foo", Selma::Rewriter.new.rewrite("<html><body><b>foo</b></body></html>")) 113 | assert_equal( 114 | "foo", 115 | Selma::Rewriter.new.rewrite("<!DOCTYPE html><html><body><b>foo</b></body></html>"), 116 | ) 117 | end 118 | 119 | def test_should_not_choke_on_frozen_fragments 120 | assert_equal("foo", Selma::Rewriter.new.rewrite("<b>foo</b>")) 121 | end 122 | 123 | def test_should_normalize_newlines 124 | skip("non-essential feature") 125 | 126 | assert_equal( 127 | "a\n\n\n\n\nz", 128 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("a\r\n\n\r\r\r\nz"), 129 | ) 130 | end 131 | 132 | def test_should_strip_control_characters_except_ascii_whitespace 133 | skip("non-essential feature") 134 | 135 | sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f" 136 | whitespace = "\t\n\f\u0020" 137 | 138 | assert_equal( 139 | "a#{whitespace}z", 140 | Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("a#{sample_control_chars}#{whitespace}z"), 141 | ) 142 | end 143 | 144 | def test_should_strip_non_characters 145 | skip("non-essential feature") 146 | 147 | sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}" 148 | 149 | assert_equal("az", Selma::Rewriter.new(sanitizer: @sanitizer).rewrite("a#{sample_non_chars}z")) 150 | end 151 
| 152 | def test_should_remove_the_contents_of_dangerous_elements 153 | assert_equal("", Selma::Rewriter.new.rewrite(%(<iframe src="https://www.youtube.com/embed/dXBohfjc4WA" width="680" height="480" allowfullscreen>hackerman</iframe>))) 154 | end 155 | end 156 | end 157 | end 158 | -------------------------------------------------------------------------------- /test/selma_selector_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | class SelmaSelectorTest < Minitest::Test 4 | def test_that_it_raise_against_invalid_css 5 | assert_raises(ArgumentError) do 6 | Selma::Selector.new(match_element: %(a[href=])) 7 | end 8 | end 9 | 10 | def test_that_it_raises_against_empty_css 11 | assert_raises(ArgumentError) do 12 | Selma::Selector.new(match_element: "") 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $LOAD_PATH.unshift(File.expand_path("../lib", __dir__)) 4 | require "selma" 5 | 6 | require "minitest/autorun" 7 | require "minitest/focus" 8 | require "minitest-spec-context" 9 | require "minitest/pride" 10 | 11 | require "amazing_print" 12 | 13 | def verify_deeply_frozen(config) 14 | assert_predicate(config, :frozen?) 15 | 16 | case config 17 | when Hash 18 | config.each_value { |v| verify_deeply_frozen(v) } 19 | when Set, Array 20 | config.each { |v| verify_deeply_frozen(v) } 21 | end 22 | end 23 | 24 | def nest_html_content(html_content, depth) 25 | "#{"<span>" * depth}#{html_content}#{"</span>" * depth}" 26 | end 27 | 28 | def ci? 
29 | ENV["CI"] == "true" 30 | end 31 | 32 | FIXTURES_DIR = "test/fixtures" 33 | 34 | def load_fixture(file) 35 | File.read(File.join(FIXTURES_DIR, file)) 36 | end 37 | 38 | STRINGS = { 39 | basic: { 40 | html: '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>', 41 | default: "Lorem ipsum dolor sit amet ", 42 | restricted: "<b>Lorem</b> ipsum <strong>dolor</strong> sit amet ", 43 | basic: '<b>Lorem</b> <a>ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet ', 44 | relaxed: '<b>Lorem</b> <a title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> ', 45 | }, 46 | 47 | malformed: { 48 | html: 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");', 49 | default: "Lorem</b> dolor</strong> sit amet ", 50 | restricted: "Lorem</b> <strong>dolor</strong> sit amet ", 51 | basic: "Lorem</b> <a><strong>dolor</a></strong> sit<br/>amet ", 52 | relaxed: 'Lorem</b> <a title="foo&gt;ipsum &lt;a href="><strong>dolor</a></strong> sit<br/>amet ', 53 | }, 54 | 55 | unclosed: { 56 | html: "<p>a</p><blockquote>b", 57 | default: " a b", 58 | restricted: " a b", 59 | basic: "<p>a</p><blockquote>b", 60 | relaxed: "<p>a</p><blockquote>b", 61 | }, 62 | 63 | malicious: { 64 | html: '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>', 65 | default: "Lorem ipsum dolor sit amet ", 66 | restricted: "<b>Lorem</b> ipsum <strong>dolor</strong> sit amet ", 67 | basic: '<b>Lorem</b> <a>ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> 
sit<br/>amet ', 68 | relaxed: '<b>Lorem</b> <a title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet ', 69 | }, 70 | }.freeze 71 | 72 | PROTOCOLS = { 73 | protocol_based_js_injection_simple_no_spaces: { 74 | html: '<a href="javascript:alert(\'XSS\');">foo</a>', 75 | default: "foo", 76 | restricted: "foo", 77 | basic: "<a>foo</a>", 78 | relaxed: "<a>foo</a>", 79 | }, 80 | 81 | protocol_based_js_injection_simple_spaces_before: { 82 | html: '<a href="javascript :alert(\'XSS\');">foo</a>', 83 | default: "foo", 84 | restricted: "foo", 85 | basic: "<a>foo</a>", 86 | relaxed: "<a>foo</a>", 87 | }, 88 | 89 | protocol_based_js_injection_simple_spaces_after: { 90 | html: '<a href="javascript: alert(\'XSS\');">foo</a>', 91 | default: "foo", 92 | restricted: "foo", 93 | basic: "<a>foo</a>", 94 | relaxed: "<a>foo</a>", 95 | }, 96 | 97 | protocol_based_js_injection_simple_spaces_before_and_after: { 98 | html: '<a href="javascript : alert(\'XSS\');">foo</a>', 99 | default: "foo", 100 | restricted: "foo", 101 | basic: "<a>foo</a>", 102 | relaxed: "<a>foo</a>", 103 | }, 104 | 105 | protocol_based_js_injection_preceding_colon: { 106 | html: '<a href=":javascript:alert(\'XSS\');">foo</a>', 107 | default: "foo", 108 | restricted: "foo", 109 | basic: "<a>foo</a>", 110 | relaxed: "<a>foo</a>", 111 | }, 112 | 113 | protocol_based_js_injection_UTF8_encoding: { 114 | html: '<a href="javascript&#58;">foo</a>', 115 | default: "foo", 116 | restricted: "foo", 117 | basic: "<a>foo</a>", 118 | relaxed: "<a>foo</a>", 119 | }, 120 | 121 | protocol_based_js_injection_long_UTF8_encoding: { 122 | html: '<a href="javascript&#0058;">foo</a>', 123 | default: "foo", 124 | restricted: "foo", 125 | basic: "<a>foo</a>", 126 | relaxed: "<a>foo</a>", 127 | }, 128 | 129 | protocol_based_js_injection_long_UTF8_encoding_without_semicolons: { 130 | html: "<a 
href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>", 131 | default: "foo", 132 | restricted: "foo", 133 | basic: "<a>foo</a>", 134 | relaxed: "<a>foo</a>", 135 | }, 136 | 137 | protocol_based_js_injection_hex_encoding: { 138 | html: '<a href="javascript&#x3A;">foo</a>', 139 | default: "foo", 140 | restricted: "foo", 141 | basic: "<a>foo</a>", 142 | relaxed: "<a>foo</a>", 143 | }, 144 | 145 | protocol_based_js_injection_long_hex_encoding: { 146 | html: '<a href="javascript&#x003A;">foo</a>', 147 | default: "foo", 148 | restricted: "foo", 149 | basic: "<a>foo</a>", 150 | relaxed: "<a>foo</a>", 151 | }, 152 | 153 | protocol_based_js_injection_hex_encoding_without_semicolons: { 154 | html: "<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>", 155 | default: "foo", 156 | restricted: "foo", 157 | basic: "<a>foo</a>", 158 | relaxed: "<a>foo</a>", 159 | }, 160 | 161 | protocol_based_js_injection_null_char: { 162 | html: "<img src=java\0script:alert(\"XSS\")>", 163 | default: "", 164 | restricted: "", 165 | basic: "", 166 | relaxed: "<img>", 167 | }, 168 | 169 | protocol_based_js_injection_invalid_URL_char: { 170 | html: '<img src=java\script:alert("XSS")>', 171 | default: "", 172 | restricted: "", 173 | basic: "", 174 | relaxed: "<img>", 175 | }, 176 | 177 | protocol_based_js_injection_spaces_and_entities: { 178 | html: '<img src=" &#14; javascript:alert(\'XSS\');">', 179 | default: "", 180 | restricted: "", 181 | basic: "", 182 | relaxed: "<img>", 183 | }, 184 | 185 | protocol_whitespace: { 186 | html: '<a href=" http://example.com/"></a>', 187 | default: "", 188 | restricted: "", 189 | basic: '<a href="http://example.com/"></a>', 190 | relaxed: '<a href="http://example.com/"></a>', 191 | }, 192 | }.freeze 193 | 
--------------------------------------------------------------------------------

Sanitize is based on Google's Gumbo HTML5 parser, which parses HTML 11 | exactly the same way modern browsers do. As long as your whitelist config only 12 | allows safe markup, even the most malformed or malicious input will be 13 | transformed into safe output.