├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── Gemfile ├── Rakefile ├── .gitignore ├── .editorconfig ├── rack-utf8_sanitizer.gemspec ├── LICENSE.txt ├── CHANGELOG.md ├── README.md ├── lib └── rack │ └── utf8_sanitizer.rb └── test └── test_utf8_sanitizer.rb /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | # Specify your gem's dependencies in rack-utf8_sanitizer.gemspec 6 | gemspec 7 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "bundler/gem_tasks" 4 | 5 | task :default => :spec 6 | 7 | desc "Run tests" 8 | task :spec do 9 | sh 'bacon -a' 10 | end 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.md] 12 | indent_style = 
space 13 | indent_size = 2 14 | 15 | [*.y{a,}ml] 16 | indent_style = space 17 | indent_size = 2 18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ${{ matrix.os }} 9 | 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | os: [ubuntu-latest] 14 | ruby: 15 | - 2.5 16 | - 2.6 17 | - 2.7 18 | - "3.0" 19 | - 3.1 20 | - 3.2 21 | - 3.3 22 | - 3.4 23 | - ruby-head 24 | - jruby-9.3 25 | - jruby-9.4 26 | - jruby-10.0 27 | - jruby-head 28 | include: 29 | - os: ubuntu-22.04 30 | ruby: jruby-9.2 31 | 32 | steps: 33 | - uses: actions/checkout@v6 34 | - name: Set up Ruby 35 | uses: ruby/setup-ruby@v1 36 | with: 37 | bundler-cache: true # 'bundle install' and cache gems 38 | ruby-version: ${{ matrix.ruby }} 39 | - name: Run tests 40 | run: bundle exec rake 41 | -------------------------------------------------------------------------------- /rack-utf8_sanitizer.gemspec: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # frozen_string_literal: true 3 | 4 | Gem::Specification.new do |gem| 5 | gem.name = "rack-utf8_sanitizer" 6 | gem.version = '1.11.1' 7 | gem.authors = ["Catherine"] 8 | gem.license = "MIT" 9 | gem.email = ["whitequark@whitequark.org"] 10 | gem.description = "Rack::UTF8Sanitizer is a Rack middleware which cleans up " \ 11 | "invalid UTF8 characters in request URI and headers." 
12 | gem.summary = gem.description 13 | gem.homepage = "https://github.com/whitequark/rack-utf8_sanitizer" 14 | 15 | gem.files = `git ls-files`.split($/) 16 | gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) 17 | gem.require_paths = ["lib"] 18 | 19 | gem.required_ruby_version = '>= 2.3' 20 | 21 | gem.add_dependency "rack", '>= 1.0', '< 4.0' 22 | 23 | gem.add_development_dependency "bacon" 24 | gem.add_development_dependency "bacon-colored_output" 25 | gem.add_development_dependency "rake" 26 | end 27 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Peter Zotov 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | v1.11.0 (2025-12-04) 5 | ------------------------- 6 | 7 | Bugs fixed: 8 | 9 | * Return HTTP 400 when Content-Length is too large (Benjamin Quorning, #103) 10 | 11 | v1.10.1 (2025-01-10) 12 | ------------------------- 13 | 14 | Bugs fixed: 15 | 16 | * Fix `URI::RFC2396_PARSER` issue with older Rubies (Tekin Süleyman, #94) 17 | 18 | 19 | v1.10.0 (2025-01-08) 20 | ------------------------- 21 | 22 | Changes: 23 | 24 | * Require Ruby 2.3.0+. (Jean Boussier, #80) 25 | 26 | Bugs fixed: 27 | 28 | * Skip sanitizing the request body if the charset is non-utf-8 (#84) 29 | * Don't use a mutable constant as Rack response (Jean Boussier, #86) 30 | 31 | Chores: 32 | 33 | * Add the `frozen_string_literal` header (Benjamin Quorning, #90) 34 | * Avoid deprecation warning by switching from `URI::DEFAULT_PARSER` to `URI::RFC2396_PARSER` (Roman Gaufman, #92) 35 | 36 | Performance: 37 | 38 | * Use Content-Length to read the request body if available (Jean Boussier, #80) 39 | * Avoid 2nd degree polynomial regexp for sanitizing content type (Jean Boussier, #82) 40 | * Use `Regexp#match?` over `String#=~` when testing for null bytes (Geoff Harcourt, #85) 41 | 42 | v1.9.1 (2023-08-31) 43 | ------------------------- 44 | 45 | Bugs fixed: 46 | 47 | * Fix null byte sanitisation (Szymon Madeja, #78) 48 | 49 | v1.9.0 (2023-07-06) 50 | ------------------------- 51 | 52 | * Optionally sanitize null bytes (James Coleman, #75) 53 | * CI: add Ruby 3.2 (Peter Goldstein, #71) 54 | 55 | v1.8.0 (2022-10-25) 56 | ------------------------- 57 | 58 | Bugs fixed: 59 | 60 | * Handle EOFError (Kir Shatrov, #57) 61 | 62 | Features implemented: 63 | 64 | * Allow Rack version 3 (Alexander Popov, #66) 65 | * Various CI chores (Olle Jonsson) 66 | * Move to GitHub Actions, configure Dependabot (Peter
Goldstein, #62, #64) 67 | 68 | v1.7.0 (2020-05-05) 69 | ------------------------- 70 | 71 | * Resolve Ruby warnings about `URI.escape` (Alexander Popov, #53) 72 | * README: better reflect that this also can sanitize text bodies (Zach McCormick, #47) 73 | * Update documentation on exception strategy handler (Josh Frankel, #52) 74 | 75 | v1.6.0 (2018-06-06) 76 | ------------------------- 77 | 78 | Bugs fixed: 79 | 80 | * Add sanitation of cookie header (John Hager, #45) 81 | 82 | v1.5.0 (2018-02-16) 83 | ------------------------- 84 | 85 | Bugs fixed: 86 | 87 | * Sanitize `nil` in `sanitize_uri_encoded_string` (David Čepelík, #44) 88 | 89 | Features implemented: 90 | 91 | * Add `:only` and `:except` options (John Hager, #43) 92 | * Add strategies to rack-utf8_sanitizer (John Hager, #41) 93 | 94 | ```rb 95 | # Example usage in Rails config/application.rb: 96 | config.middleware.insert(0, Rack::UTF8Sanitizer, strategy: :exception) 97 | ``` 98 | 99 | v1.4.0 (2016-03-07) 100 | ------------------------- 101 | 102 | Performance: 103 | 104 | * Use more performant `%char` decoding `.hex.chr` (Martin Emde, #36) 105 | * Make `HTTP_` a constant to avoid creating the string every loop (Martin Emde, #35) 106 | 107 | Features implemented: 108 | 109 | * Add SERVER_NAME to list of sanitization (Denis Lysenko, 9644371) 110 | 111 | Chores: 112 | 113 | * Add license to gemspec (Robert Reiz, #38) 114 | 115 | 116 | v1.3.2 (2015-12-23) 117 | ------------------------- 118 | 119 | API modifications: 120 | 121 | Features implemented: 122 | 123 | Bugs fixed: 124 | 125 | * Strip UTF-8 Byte Order Mark from the request body (Jean Boussier, #29) 126 | * Add options to #initialize to allow configurable sanitizable content types (Shelby Switzer, #30) 127 | 128 | v1.3.1 (2015-07-09) 129 | ------------------------- 130 | 131 | Bugs fixed: 132 | * Make sure Content-Length is adjusted. 
(Samuel Cochran, #26) 133 | 134 | v1.3.0 (2015-01-26) 135 | ------------------------- 136 | 137 | v1.2.4 (2014-11-29) 138 | ------------------------- 139 | 140 | v1.2.3 (2014-10-08) 141 | ------------------------- 142 | 143 | v1.2.2 (2014-07-10) 144 | ------------------------- 145 | 146 | Features implemented: 147 | * Sanitize request body for all HTTP verbs. (Nathaniel Talbott, #15) 148 | * Add `application/json` and `text/javascript` as sanitizable content types. (Benjamin Fleischer, #12) 149 | 150 | Bugs fixed: 151 | * Ensure Rack::UTF8 Sanitizer is first middleware. (Aaron Renner, #13) 152 | 153 | v1.2.1 (2014-05-27) 154 | ------------------------- 155 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rack::UTF8Sanitizer 2 | 3 | Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters in request URI and headers. Additionally, 4 | it cleans up invalid UTF8 characters in the request body (depending on the configurable content type filters) by reading 5 | the input into a string, sanitizing the string, then replacing the Rack input stream with a rewindable input stream backed 6 | by the sanitized string. 7 | 8 | ## Installation 9 | 10 | Add this line to your application's Gemfile: 11 | 12 | gem 'rack-utf8_sanitizer' 13 | 14 | And then execute: 15 | 16 | $ bundle 17 | 18 | Or install it yourself as: 19 | 20 | $ gem install rack-utf8_sanitizer 21 | 22 | For Rails, add this to your `application.rb`: 23 | 24 | ``` ruby 25 | config.middleware.insert 0, Rack::UTF8Sanitizer 26 | ``` 27 | 28 | For Rack apps, add this to `config.ru`: 29 | 30 | ``` ruby 31 | use Rack::UTF8Sanitizer 32 | ``` 33 | 34 | ## Usage 35 | 36 | Rack::UTF8Sanitizer divides all keys in the [Rack environment](http://rack.rubyforge.org/doc/SPEC.html) in two distinct groups: keys which contain raw data and the ones with percent-encoded data. 
The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`, `ORIGINAL_FULLPATH`, `ORIGINAL_SCRIPT_NAME`, `SERVER_NAME`. 37 | 38 | The generic sanitization algorithm is as follows: 39 | 40 | 1. Force the encoding to UTF-8. 41 | 2. If the result contains invalid characters: 42 | 1. Force the encoding to ASCII-8BIT. 43 | 2. Re-encode it as UTF-8, replacing invalid and undefined characters with U+FFFD. 44 | 45 | For fields with "raw data", the algorithm is applied once and the (UTF-8 encoded) result is left in the environment. 46 | 47 | For fields with "percent-encoded data", the algorithm is applied twice to catch both invalid characters appearing as-is and invalid characters appearing in the percent encoding. The percent encoded, ASCII-8BIT encoded result is left in the environment. 48 | 49 | ### Sanitizable content types 50 | 51 | The default content types to be sanitized are 'text/plain', 'application/x-www-form-urlencoded', 'application/json', 'text/javascript'. You may wish to modify this, for example if your app accepts specific or custom media types in the CONTENT_TYPE header. If you want to change the sanitizable content types, you can pass options when using Rack::UTF8Sanitizer. 52 | 53 | To add sanitizable content types to the list of defaults, pass the `additional_content_types` option when using Rack::UTF8Sanitizer, e.g. 54 | 55 | ``` ruby 56 | config.middleware.insert 0, Rack::UTF8Sanitizer, additional_content_types: ['application/vnd.api+json'] 57 | ``` 58 | 59 | To explicitly set sanitizable content types and override the defaults, use the `sanitizable_content_types` option: 60 | 61 | ``` ruby 62 | config.middleware.insert 0, Rack::UTF8Sanitizer, sanitizable_content_types: ['application/vnd.api+json'] 63 | ``` 64 | 65 | ### Whitelist/Blacklist Rack Env Keys 66 | 67 | Using the `:only` and `:except` keys you can skip sanitation of values in the Rack Env.
`:only` and `:except` are arrays that can contain strings or regular expressions. 68 | 69 | Only sanitize the body, query string, and URL of a request. 70 | 71 | ```ruby 72 | config.middleware.insert 0, Rack::UTF8Sanitizer, only: ['rack.input', 'PATH_INFO', 'QUERY_STRING'] 73 | ``` 74 | 75 | Sanitize everything except HTTP headers. 76 | 77 | ```ruby 78 | config.middleware.insert 0, Rack::UTF8Sanitizer, except: [/HTTP_.+/] 79 | ``` 80 | 81 | ### Strategies 82 | 83 | There are two built-in strategies for handling invalid characters. The default strategy is `:replace`, which will cause any invalid characters to be replaced with the Unicode replacement character (�). The second built-in strategy is `:exception` which will cause an `EncodingError` exception to be raised if invalid characters are found (the exception can then be handled by another Rack middleware). 84 | 85 | This is an example of handling the `:exception` strategy with additional middleware: 86 | 87 | ```ruby 88 | require "./your/middleware/directory/utf8_sanitizer_exception_handler.rb" 89 | 90 | config.middleware.insert 0, Rack::UTF8SanitizerExceptionHandler 91 | config.middleware.insert_after Rack::UTF8SanitizerExceptionHandler, Rack::UTF8Sanitizer, strategy: :exception 92 | ``` 93 | 94 | Note: The exception handling middleware must be inserted before `Rack::UTF8Sanitizer` 95 | 96 | ```ruby 97 | module Rack 98 | class UTF8SanitizerExceptionHandler 99 | def initialize(app) 100 | @app = app 101 | end 102 | 103 | def call(env) 104 | @app.call(env) 105 | rescue EncodingError => exception 106 | # OPTIONAL: Add error logging service of your choice here 107 | return [400, {}, ["Bad Request"]] 108 | end 109 | end 110 | end 111 | ``` 112 | 113 | An object that responds to `#call` and accepts the offending string with invalid characters as an argument can also be passed as a `:strategy`. This is how you can define custom strategies.
114 | 115 | ```ruby 116 | config.middleware.insert 0, Rack::UTF8Sanitizer, strategy: :exception 117 | ``` 118 | 119 | ```ruby 120 | replace_string = lambda do |_invalid, sanitize_null_bytes: false| 121 | Rails.logger.warn('Replacing invalid string') 122 | 123 | ''.freeze 124 | end 125 | 126 | config.middleware.insert 0, Rack::UTF8Sanitizer, strategy: replace_string 127 | ``` 128 | 129 | ### Sanitizing Null Bytes 130 | 131 | While null bytes are valid UTF-8, it can be useful to further restrict the valid character set to exclude null bytes. For example, PostgreSQL text columns do not allow storing null bytes. Passing `sanitize_null_bytes: true` in the configuration hash enables sanitizing null bytes, and the two built-in strategies both support this feature. Custom strategies should accept a keyword argument `sanitize_null_bytes` containing this configuration value. Note that with the `:exception` strategy a null byte raises `Rack::UTF8Sanitizer::NullByteInString`, which is a `StandardError` and not an `EncodingError`. 132 | 133 | ## Contributing 134 | 135 | 1. Fork it 136 | 2. Create your feature branch (`git checkout -b my-new-feature`) 137 | 3. Commit your changes (`git commit -am 'Add some feature'`) 138 | 4. Push to the branch (`git push origin my-new-feature`) 139 | 5. Create new Pull Request 140 | 141 | To run the tests, run `rake spec` in the project directory.
142 | -------------------------------------------------------------------------------- /lib/rack/utf8_sanitizer.rb: -------------------------------------------------------------------------------- 1 | # encoding: ascii-8bit 2 | # frozen_string_literal: true 3 | 4 | require 'uri' 5 | require 'stringio' 6 | require 'rack/request' 7 | 8 | module Rack 9 | class UTF8Sanitizer 10 | StringIO = ::StringIO 11 | NULL_BYTE_REGEX = /\x00/.freeze 12 | 13 | class InvalidStream < IOError; end 14 | class NullByteInString < StandardError; end 15 | 16 | # options[:sanitizable_content_types] Array - replaces the default SANITIZABLE_CONTENT_TYPES list 17 | # options[:additional_content_types] Array - appended to SANITIZABLE_CONTENT_TYPES when the option above is not given; options[:strategy], options[:only], options[:except] and options[:sanitize_null_bytes] (default false) are read as well 18 | def initialize(app, options={}) 19 | @app = app 20 | @strategy = build_strategy(options) 21 | @sanitizable_content_types = options[:sanitizable_content_types] 22 | @sanitizable_content_types ||= SANITIZABLE_CONTENT_TYPES + (options[:additional_content_types] || []) 23 | @only = Array(options[:only]).flatten 24 | @except = Array(options[:except]).flatten 25 | @sanitize_null_bytes = options.fetch(:sanitize_null_bytes, false) 26 | end 27 | 28 | def call(env) # Rack entry point: responds 400 "Bad Request" when sanitize raises EOFError or InvalidStream, otherwise hands the sanitized env to the wrapped app 29 | begin 30 | env = sanitize(env) 31 | rescue EOFError, InvalidStream 32 | return [400, { "Content-Type" => "text/plain" }, ["Bad Request"]] # NOTE(review): the gemspec in this dump allows Rack 3, whose SPEC asks for lowercase response header names - confirm "Content-Type" is acceptable here 33 | end 34 | @app.call(env) 35 | end 36 | 37 | DEFAULT_STRATEGIES = { # built-in strategies, looked up by build_strategy from options[:strategy] 38 | replace: lambda do |input, sanitize_null_bytes: false| 39 | input. 40 | force_encoding(Encoding::ASCII_8BIT). 41 | encode!(Encoding::UTF_8, 42 | invalid: :replace, 43 | undef: :replace) 44 | if sanitize_null_bytes 45 | input = input.gsub(NULL_BYTE_REGEX, "") 46 | end 47 | input 48 | end, 49 | exception: lambda do |input, sanitize_null_bytes: false| # NOTE(review): the null-byte check in this lambda uses Regexp#match? (added in Ruby 2.4) while the gemspec in this dump declares required_ruby_version '>= 2.3' - confirm 50 | input. 51 | force_encoding(Encoding::ASCII_8BIT).
52 | encode!(Encoding::UTF_8) 53 | if sanitize_null_bytes && NULL_BYTE_REGEX.match?(input) 54 | raise NullByteInString 55 | end 56 | input 57 | end 58 | }.freeze 59 | 60 | # https://github.com/rack/rack/blob/main/SPEC.rdoc 61 | URI_FIELDS = %w( 62 | SCRIPT_NAME 63 | REQUEST_PATH REQUEST_URI PATH_INFO 64 | QUERY_STRING 65 | HTTP_REFERER 66 | ORIGINAL_FULLPATH 67 | ORIGINAL_SCRIPT_NAME 68 | SERVER_NAME 69 | ).freeze 70 | 71 | SANITIZABLE_CONTENT_TYPES = %w( 72 | text/plain 73 | application/x-www-form-urlencoded 74 | application/json 75 | text/javascript 76 | ).freeze 77 | 78 | URI_ENCODED_CONTENT_TYPES = %w( 79 | application/x-www-form-urlencoded 80 | ).freeze 81 | 82 | HTTP_ = 'HTTP_' 83 | 84 | def sanitize(env) # sanitizes rack.input, the cookie header, URI_FIELDS and HTTP_* values in place; returns env (the value of env.each) 85 | sanitize_rack_input(env) 86 | sanitize_cookies(env) 87 | env.each do |key, value| 88 | next if skip?(key) 89 | 90 | if URI_FIELDS.include?(key) 91 | env[key] = transfer_frozen(value, 92 | sanitize_uri_encoded_string(value)) 93 | elsif key.to_s.start_with?(HTTP_) 94 | # Just sanitize the headers and leave them in UTF-8. There is 95 | # no reason to have UTF-8 in headers, but if it's valid, let it be. 96 | env[key] = transfer_frozen(value, 97 | sanitize_string(value)) 98 | end 99 | end 100 | end 101 | 102 | protected 103 | 104 | def skip?(rack_env_key) # true when an :except matcher matches the key, or when :only is non-empty and none of its matchers match; matchers are applied as rack_env_key[matcher] 105 | return true if !@except.empty? && @except.any? { |matcher| rack_env_key[matcher] } 106 | return true if !@only.empty? && @only.none? { |matcher| rack_env_key[matcher] } 107 | 108 | false 109 | end 110 | 111 | def build_strategy(options) # resolves options[:strategy] (default :replace) through DEFAULT_STRATEGIES; any other value is returned unchanged 112 | strategy = options.fetch(:strategy) { :replace } 113 | 114 | return strategy unless DEFAULT_STRATEGIES.key?(strategy) 115 | 116 | DEFAULT_STRATEGIES[strategy] 117 | end 118 | 119 | def sanitize_rack_input(env) # swaps env['rack.input'] for a sanitized copy when the media type is sanitizable and the charset is absent or utf-8; CONTENT_LENGTH is rewritten only if it was already set 120 | request = Rack::Request.new(env) 121 | content_type = request.media_type 122 | return unless @sanitizable_content_types.any?
{|type| content_type == type } 123 | 124 | charset = request.content_charset 125 | return if charset && charset.downcase != 'utf-8' 126 | 127 | uri_encoded = URI_ENCODED_CONTENT_TYPES.any? {|type| content_type == type} 128 | 129 | if env['rack.input'] 130 | sanitized_input = sanitize_io(env['rack.input'], uri_encoded, env['CONTENT_LENGTH']&.to_i) 131 | 132 | env['rack.input'] = sanitized_input 133 | env['CONTENT_LENGTH'] &&= sanitized_input.size.to_s 134 | end 135 | end 136 | 137 | # Modeled after Rack::RewindableInput 138 | # TODO: Should this delegate any methods to the original io? 139 | class SanitizedRackInput 140 | def initialize(original_io, sanitized_io) 141 | @original_io = original_io 142 | @sanitized_io = sanitized_io 143 | end 144 | 145 | def gets 146 | @sanitized_io.gets 147 | end 148 | 149 | def read(*args) 150 | @sanitized_io.read(*args) 151 | end 152 | 153 | def each(&block) 154 | @sanitized_io.each(&block) 155 | end 156 | 157 | def rewind 158 | @sanitized_io.rewind 159 | end 160 | 161 | def size 162 | # StringIO#size is bytesize 163 | @sanitized_io.size 164 | end 165 | 166 | def close 167 | @sanitized_io.close 168 | @original_io.close if @original_io.respond_to?(:close) 169 | end 170 | end 171 | 172 | def sanitize_io(io, uri_encoded = false, content_length = nil) # reads content_length bytes when it is given and >= 0, otherwise the whole stream; raises InvalidStream when read returns nil; returns a SanitizedRackInput 173 | input = if content_length && content_length >= 0 174 | io.read(content_length) 175 | else 176 | io.read 177 | end 178 | raise InvalidStream if input.nil? 179 | sanitized_input = sanitize_string(strip_byte_order_mark(input)) 180 | if uri_encoded 181 | sanitized_input = sanitize_uri_encoded_string(sanitized_input).
182 | force_encoding(Encoding::UTF_8) 183 | end 184 | sanitized_input = transfer_frozen(input, sanitized_input) 185 | SanitizedRackInput.new(io, StringIO.new(sanitized_input)) 186 | end 187 | 188 | # Cookies need to be split and then sanitized as url encoded strings 189 | # since the cookie string itself is not url encoded (separated by `;`), 190 | # and the normal method of `sanitize_uri_encoded_string` would break 191 | # later cookie parsing in the case that a cookie value contained an 192 | # encoded `;`. Note that `,` is also treated as a separator and the pieces are re-joined with `; `. 193 | def sanitize_cookies(env) 194 | return unless env['HTTP_COOKIE'] 195 | 196 | env['HTTP_COOKIE'] = env['HTTP_COOKIE'] 197 | .split(/[;,] */n) 198 | .map { |cookie| sanitize_uri_encoded_string(cookie) } 199 | .join('; ') 200 | end 201 | 202 | # URI.encode/decode expect the input to be in ASCII-8BIT. 203 | # However, there could be invalid UTF-8 characters both in 204 | # raw and percent-encoded form. 205 | # 206 | # So, first sanitize the value, then percent-decode it while 207 | # treating as UTF-8, then sanitize the result and encode it back. 208 | # 209 | # The result is guaranteed to be UTF-8-safe. 210 | def sanitize_uri_encoded_string(input) 211 | return input if input.nil? 212 | decoded_value = decode_string(input) 213 | reencode_string(decoded_value) 214 | end 215 | 216 | def reencode_string(decoded_value) 217 | escape_unreserved( 218 | sanitize_string(decoded_value)) 219 | end 220 | 221 | def decode_string(input) 222 | unescape_unreserved( 223 | sanitize_string(input). 224 | force_encoding(Encoding::ASCII_8BIT)) 225 | end 226 | 227 | # This regexp matches all 'unreserved' characters from RFC3986 (2.3), 228 | # plus all multibyte UTF-8 characters. The _OR_NULL variant also matches \x00 and is the one unescape_unreserved uses when @sanitize_null_bytes is set. 229 | UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/.freeze 230 | UNRESERVED_OR_UTF8_OR_NULL = /[A-Za-z0-9\-._~\x00\x80-\xFF]/.freeze 231 | 232 | # RFC3986, 2.2 states that the characters from 'reserved' group must be 233 | # protected during normalization (which is what UTF8Sanitizer does).
234 | # 235 | # However, the regexp approach used by URI.unescape is not sophisticated 236 | # enough for our task. 237 | def unescape_unreserved(input) 238 | input.gsub(/%([a-f\d]{2})/i) do |encoded| 239 | decoded = $1.hex.chr 240 | 241 | decodable_regex = @sanitize_null_bytes ? UNRESERVED_OR_UTF8_OR_NULL : UNRESERVED_OR_UTF8 242 | if decoded =~ decodable_regex 243 | decoded 244 | else 245 | encoded 246 | end 247 | end 248 | end 249 | 250 | # This regexp matches unsafe characters, i.e. everything except 'reserved' 251 | # and 'unreserved' characters from RFC3986 (2.3), and additionally '%', 252 | # as percent-encoded unreserved characters could be left over from the 253 | # `unescape_unreserved` invocation. 254 | # 255 | # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}. 256 | UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/ 257 | 258 | # Performs the reverse function of `unescape_unreserved`. Unlike 259 | # the previous function, we can reuse the URI parser's `escape` (URI::RFC2396_PARSER when that constant is defined, otherwise URI::DEFAULT_PARSER) 260 | def escape_unreserved(input) 261 | if Object.const_defined?("URI::RFC2396_PARSER") 262 | URI::RFC2396_PARSER.escape(input, UNSAFE) 263 | else 264 | URI::DEFAULT_PARSER.escape(input, UNSAFE) 265 | end 266 | end 267 | 268 | def sanitize_string(input) # non-String input is returned untouched; a String is dup'ed, tagged as UTF-8 and handed to the strategy only when it is invalid (or contains a null byte while @sanitize_null_bytes is set) 269 | if input.is_a? String 270 | input = input.dup.force_encoding(Encoding::UTF_8) 271 | 272 | if input.valid_encoding? && !(@sanitize_null_bytes && input =~ NULL_BYTE_REGEX) 273 | input 274 | else 275 | @strategy.call(input, sanitize_null_bytes: @sanitize_null_bytes) 276 | end 277 | else 278 | input 279 | end 280 | end 281 | 282 | def transfer_frozen(from, to) # freezes `to` when `from` is frozen and returns `to` either way 283 | if from.frozen?
284 | to.freeze 285 | else 286 | to 287 | end 288 | end 289 | 290 | UTF8_BOM = "\xef\xbb\xbf".dup.force_encoding(Encoding::BINARY).freeze 291 | UTF8_BOM_SIZE = UTF8_BOM.bytesize 292 | 293 | def strip_byte_order_mark(input) 294 | return input unless input.start_with?(UTF8_BOM) 295 | input.byteslice(UTF8_BOM_SIZE..-1) 296 | end 297 | end 298 | end 299 | -------------------------------------------------------------------------------- /test/test_utf8_sanitizer.rb: -------------------------------------------------------------------------------- 1 | # encoding:ascii-8bit 2 | # frozen_string_literal: true 3 | 4 | require 'bacon/colored_output' 5 | require 'cgi' 6 | require 'rack/utf8_sanitizer' 7 | 8 | describe Rack::UTF8Sanitizer do 9 | before do 10 | @app = Rack::UTF8Sanitizer.new(-> env { env }) 11 | end 12 | 13 | shared :does_sanitize_plain do 14 | it "sanitizes plaintext entity (HTTP_USER_AGENT)" do 15 | env = @app.({ "HTTP_USER_AGENT" => @plain_input }) 16 | result = env["HTTP_USER_AGENT"] 17 | 18 | result.encoding.should == Encoding::UTF_8 19 | result.should.be.valid_encoding 20 | end 21 | end 22 | 23 | shared :does_sanitize_uri do 24 | it "sanitizes URI-like entity (REQUEST_PATH)" do 25 | env = @app.({ "REQUEST_PATH" => @uri_input }) 26 | result = env["REQUEST_PATH"] 27 | 28 | result.encoding.should == Encoding::US_ASCII 29 | result.should.be.valid_encoding 30 | end 31 | end 32 | 33 | describe "with invalid host input" do 34 | it "sanitizes host entity (SERVER_NAME)" do 35 | host = "host\xD0".dup.force_encoding('UTF-8') 36 | env = @app.({ "SERVER_NAME" => host }) 37 | result = env["SERVER_NAME"] 38 | 39 | result.encoding.should == Encoding::US_ASCII 40 | result.should.be.valid_encoding 41 | end 42 | end 43 | 44 | describe "with invalid UTF-8 input" do 45 | before do 46 | @plain_input = "foo\xe0".dup.force_encoding('UTF-8') 47 | @uri_input = "http://bar/foo%E0".dup.force_encoding('UTF-8') 48 | end 49 | 50 | behaves_like :does_sanitize_plain 51 | behaves_like
:does_sanitize_uri 52 | end 53 | 54 | describe "with invalid, incorrectly percent-encoded UTF-8 URI input" do 55 | before do 56 | @uri_input = "http://bar/foo%E0\xe0".dup.force_encoding('UTF-8') 57 | end 58 | 59 | behaves_like :does_sanitize_uri 60 | end 61 | 62 | describe "with invalid ASCII-8BIT input" do 63 | before do 64 | @plain_input = "foo\xe0" 65 | @uri_input = "http://bar/foo%E0" 66 | end 67 | 68 | behaves_like :does_sanitize_plain 69 | behaves_like :does_sanitize_uri 70 | end 71 | 72 | describe "with invalid, incorrectly percent-encoded ASCII-8BIT URI input" do 73 | before do 74 | @uri_input = "http://bar/foo%E0\xe0" 75 | end 76 | 77 | behaves_like :does_sanitize_uri 78 | end 79 | 80 | shared :identity_plain do 81 | it "does not change plaintext entity (HTTP_USER_AGENT)" do 82 | env = @app.({ "HTTP_USER_AGENT" => @plain_input }) 83 | result = env["HTTP_USER_AGENT"] 84 | 85 | result.encoding.should == Encoding::UTF_8 86 | result.should.be.valid_encoding 87 | result.should == @plain_input 88 | end 89 | end 90 | 91 | shared :identity_uri do 92 | it "does not change URI-like entity (REQUEST_PATH)" do 93 | env = @app.({ "REQUEST_PATH" => @uri_input }) 94 | result = env["REQUEST_PATH"] 95 | 96 | result.encoding.should == Encoding::US_ASCII 97 | result.should.be.valid_encoding 98 | result.should == @uri_input 99 | end 100 | end 101 | 102 | describe "with valid UTF-8 input" do 103 | before do 104 | @plain_input = "foo bar лол".dup.force_encoding('UTF-8') 105 | @uri_input = "http://bar/foo+bar+%D0%BB%D0%BE%D0%BB".dup.force_encoding('UTF-8') 106 | end 107 | 108 | behaves_like :identity_plain 109 | behaves_like :identity_uri 110 | 111 | describe "with URI characters from reserved range" do 112 | before do 113 | @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".dup.force_encoding('UTF-8') 114 | end 115 | 116 | behaves_like :identity_uri 117 | end 118 | end 119 | 120 | describe "with valid, not percent-encoded UTF-8 URI input" do 121 | before do 122 | 
@uri_input = "http://bar/foo+bar+лол".dup.force_encoding('UTF-8') 123 | @encoded = "http://bar/foo+bar+#{CGI.escape("лол")}" 124 | end 125 | 126 | it "does not change URI-like entity (REQUEST_PATH)" do 127 | env = @app.({ "REQUEST_PATH" => @uri_input }) 128 | result = env["REQUEST_PATH"] 129 | 130 | result.encoding.should == Encoding::US_ASCII 131 | result.should.be.valid_encoding 132 | result.should == @encoded 133 | end 134 | end 135 | 136 | describe "with valid ASCII-8BIT input" do 137 | before do 138 | @plain_input = "bar baz" 139 | @uri_input = "http://bar/bar+baz" 140 | end 141 | 142 | behaves_like :identity_plain 143 | behaves_like :identity_uri 144 | 145 | describe "with URI characters from reserved range" do 146 | before do 147 | @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB" 148 | end 149 | 150 | behaves_like :identity_uri 151 | end 152 | end 153 | 154 | describe "with frozen strings" do 155 | before do 156 | @plain_input = "bar baz" 157 | @uri_input = "http://bar/bar+baz" 158 | end 159 | 160 | it "preserves the frozen? status of input" do 161 | env = @app.({ "HTTP_USER_AGENT" => @plain_input, 162 | "REQUEST_PATH" => @uri_input }) 163 | 164 | env["HTTP_USER_AGENT"].should.be.frozen 165 | env["REQUEST_PATH"].should.be.frozen 166 | end 167 | end 168 | 169 | describe "with mutable strings" do 170 | before do 171 | @plain_input = "bar baz".dup 172 | @uri_input = "http://bar/bar+baz".dup 173 | end 174 | 175 | it "preserves the frozen? 
status of input" do 176 | env = @app.({ "HTTP_USER_AGENT" => @plain_input, 177 | "REQUEST_PATH" => @uri_input }) 178 | 179 | env["HTTP_USER_AGENT"].should.not.be.frozen 180 | env["REQUEST_PATH"].should.not.be.frozen 181 | end 182 | end 183 | 184 | describe "with symbols in the env" do 185 | before do 186 | @uri_input = "http://bar/foo%E0\xe0".dup.force_encoding('UTF-8') 187 | end 188 | 189 | it "sanitizes REQUEST_PATH with invalid UTF-8 URI input" do 190 | env = @app.({ :requested_at => "2014-07-22", 191 | "REQUEST_PATH" => @uri_input }) 192 | 193 | result = env["REQUEST_PATH"] 194 | 195 | result.encoding.should == Encoding::US_ASCII 196 | result.should.be.valid_encoding 197 | end 198 | end 199 | 200 | describe "with form data" do 201 | def request_env 202 | @plain_input = "foo bar лол".dup.force_encoding('UTF-8') 203 | { 204 | "REQUEST_METHOD" => "POST", 205 | "CONTENT_TYPE" => "application/x-www-form-urlencoded;foo=bar", 206 | "HTTP_USER_AGENT" => @plain_input, 207 | "rack.input" => @rack_input, 208 | } 209 | end 210 | 211 | def sanitize_form_data(request_env = request_env()) 212 | @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".dup.force_encoding('UTF-8') 213 | @response_env = @app.(request_env) 214 | sanitized_input = @response_env['rack.input'].read 215 | 216 | yield sanitized_input if block_given? 
217 | 218 | @response_env['rack.input'].rewind 219 | behaves_like :does_sanitize_plain 220 | behaves_like :does_sanitize_uri 221 | behaves_like :identity_plain 222 | behaves_like :identity_uri 223 | @response_env['rack.input'].close 224 | end 225 | 226 | class BrokenIO < StringIO 227 | def read 228 | raise EOFError 229 | end 230 | end 231 | 232 | it "returns HTTP 400 on EOF" do 233 | @rack_input = BrokenIO.new 234 | @response_env = @app.(request_env) 235 | @response_env.should == [400, {"Content-Type"=>"text/plain"}, ["Bad Request"]] 236 | end 237 | 238 | it "Bad Request response can safely be mutated" do 239 | @rack_input = BrokenIO.new 240 | response_env = @app.(request_env) 241 | response_env.should == [400, {"Content-Type"=>"text/plain"}, ["Bad Request"]] 242 | response_env[1]["Set-Cookie"] = "you_are_admin" 243 | 244 | response_env = @app.(request_env) 245 | response_env[1]["Set-Cookie"].should == nil 246 | end 247 | 248 | it "returns HTTP 400 if CONTENT_LENGTH is larger than actual length of rack.input" do 249 | @rack_input = StringIO.new("") 250 | response_env = @app.(request_env.merge("CONTENT_LENGTH" => (@rack_input.length + 1).to_s)) 251 | response_env.should == [400, {"Content-Type"=>"text/plain"}, ["Bad Request"]] 252 | end 253 | 254 | it "sanitizes StringIO rack.input" do 255 | input = "foo=bla&quux=bar" 256 | @rack_input = StringIO.new input 257 | 258 | sanitize_form_data do |sanitized_input| 259 | sanitized_input.encoding.should == Encoding::UTF_8 260 | sanitized_input.should.be.valid_encoding 261 | sanitized_input.should == input 262 | end 263 | end 264 | 265 | it "sanitizes StringIO rack.input on GET" do 266 | input = "foo=bla&quux=bar" 267 | @rack_input = StringIO.new input 268 | 269 | sanitize_form_data(request_env.merge("REQUEST_METHOD" => "GET")) do |sanitized_input| 270 | sanitized_input.encoding.should == Encoding::UTF_8 271 | sanitized_input.should.be.valid_encoding 272 | sanitized_input.should == input 273 | end 274 | end 275 | 276 | it
"sanitizes StringIO rack.input with bad encoding" do 277 | input = "foo=bla&quux=bar\xED" 278 | @rack_input = StringIO.new input 279 | 280 | sanitize_form_data do |sanitized_input| 281 | sanitized_input.encoding.should == Encoding::UTF_8 282 | sanitized_input.should.be.valid_encoding 283 | sanitized_input.should != input 284 | end 285 | end 286 | 287 | it "sanitizes the rack body if the charset is present and utf-8" do 288 | input = "name=#{CGI.escape("まつもと")}" 289 | @rack_input = StringIO.new input 290 | 291 | env = request_env.update('CONTENT_TYPE' => "application/x-www-form-urlencoded; charset=utf-8") 292 | sanitize_form_data(env) do |sanitized_input| 293 | sanitized_input.encoding.should == Encoding::UTF_8 294 | sanitized_input.should.be.valid_encoding 295 | sanitized_input.should == input 296 | end 297 | end 298 | 299 | it "strip UTF-8 BOM from StringIO rack.input" do 300 | input = %(\xef\xbb\xbf{"Hello": "World"}) 301 | @rack_input = StringIO.new input 302 | 303 | sanitize_form_data(request_env.merge("CONTENT_TYPE" => "application/json")) do |sanitized_input| 304 | sanitized_input.encoding.should == Encoding::UTF_8 305 | sanitized_input.should.be.valid_encoding 306 | sanitized_input.should == '{"Hello": "World"}' 307 | end 308 | end 309 | 310 | it "sanitizes StringIO rack.input with form encoded bad encoding" do 311 | input = "foo=bla&foo=baz&quux%ED=bar%ED" 312 | @rack_input = StringIO.new input 313 | 314 | sanitize_form_data do |sanitized_input| 315 | # URI.decode_www_form does some encoding magic 316 | sanitized_input.split("&").each do |pair| 317 | pair.split("=", 2).each do |component| 318 | decoded = URI.decode_www_form_component(component) 319 | decoded.should.be.valid_encoding 320 | end 321 | end 322 | sanitized_input.should != input 323 | end 324 | end 325 | 326 | it "sanitizes non-StringIO rack.input" do 327 | require 'rack/rewindable_input' 328 | input = "foo=bla&quux=bar" 329 | @rack_input = Rack::RewindableInput.new(StringIO.new(input)) 330 | 331 
| sanitize_form_data do |sanitized_input| 332 | sanitized_input.encoding.should == Encoding::UTF_8 333 | sanitized_input.should.be.valid_encoding 334 | sanitized_input.should == input 335 | end 336 | end 337 | 338 | it "sanitizes non-StringIO rack.input with bad encoding" do 339 | require 'rack/rewindable_input' 340 | input = "foo=bla&quux=bar\xED" 341 | @rack_input = Rack::RewindableInput.new(StringIO.new(input)) 342 | 343 | sanitize_form_data do |sanitized_input| 344 | sanitized_input.encoding.should == Encoding::UTF_8 345 | sanitized_input.should.be.valid_encoding 346 | sanitized_input.should != input 347 | end 348 | end 349 | 350 | it "does not sanitize the rack body if there is no CONTENT_TYPE" do 351 | input = "foo=bla&quux=bar\xED" 352 | @rack_input = StringIO.new input 353 | 354 | env = request_env.update('CONTENT_TYPE' => nil) 355 | sanitize_form_data(env) do |sanitized_input| 356 | sanitized_input.encoding.should == Encoding::ASCII_8BIT 357 | sanitized_input.should.be.valid_encoding 358 | sanitized_input.should == input 359 | end 360 | end 361 | 362 | it "does not sanitize the rack body if there is empty CONTENT_TYPE" do 363 | input = "foo=bla&quux=bar\xED" 364 | @rack_input = StringIO.new input 365 | 366 | env = request_env.update('CONTENT_TYPE' => '') 367 | sanitize_form_data(env) do |sanitized_input| 368 | sanitized_input.encoding.should == Encoding::ASCII_8BIT 369 | sanitized_input.should.be.valid_encoding 370 | sanitized_input.should == input 371 | end 372 | end 373 | 374 | it "does not sanitize the rack body if the charset is present and not utf-8" do 375 | input = "name=".encode("Shift_JIS") + CGI.escape("まつもと".encode("Shift_JIS", "UTF-8")) 376 | @rack_input = StringIO.new input 377 | 378 | env = request_env.update('CONTENT_TYPE' => "application/x-www-form-urlencoded; charset=Shift_JIS") 379 | sanitize_form_data(env) do |sanitized_input| 380 | sanitized_input.encoding.should == Encoding::SHIFT_JIS 381 | sanitized_input.should.be.valid_encoding 382 
| sanitized_input.should == input 383 | end 384 | end 385 | 386 | it "adjusts content-length when replacing input" do 387 | input = "foo=bla&quux=bar\xED" 388 | @rack_input = StringIO.new input 389 | 390 | env = request_env.update("CONTENT_LENGTH" => input.bytesize) 391 | sanitize_form_data(env) do |sanitized_input| 392 | sanitized_input.bytesize.should != input.bytesize 393 | @response_env["CONTENT_LENGTH"].should == sanitized_input.bytesize.to_s 394 | end 395 | end 396 | 397 | it "does not sanitize null bytes by default" do 398 | input = "foo=bla&quux=bar%00" 399 | @rack_input = StringIO.new input 400 | 401 | sanitize_form_data do |sanitized_input| 402 | sanitized_input.encoding.should == Encoding::UTF_8 403 | sanitized_input.should.be.valid_encoding 404 | sanitized_input.should == input 405 | end 406 | end 407 | 408 | it "optionally sanitizes null bytes with the replace strategy" do 409 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true) 410 | input = "foo=bla\xED&quux=bar\x00" 411 | @rack_input = StringIO.new input 412 | 413 | sanitize_form_data do |sanitized_input| 414 | sanitized_input.encoding.should == Encoding::UTF_8 415 | sanitized_input.should.be.valid_encoding 416 | sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar" 417 | end 418 | end 419 | 420 | it "optionally sanitizes encoded null bytes with the replace strategy" do 421 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true) 422 | input = "foo=bla%ED&quux=bar%00" 423 | @rack_input = StringIO.new input 424 | 425 | sanitize_form_data do |sanitized_input| 426 | sanitized_input.encoding.should == Encoding::UTF_8 427 | sanitized_input.should.be.valid_encoding 428 | sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar" 429 | end 430 | end 431 | 432 | it "optionally raises on null bytes with the exception strategy" do 433 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception) 434 | input = "foo=bla&quux=bar\x00" 435 | 
@rack_input = StringIO.new input 436 | 437 | should.raise(Rack::UTF8Sanitizer::NullByteInString) do 438 | sanitize_form_data 439 | end 440 | end 441 | 442 | it "optionally raises on encoded null bytes with the exception strategy" do 443 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception) 444 | input = "foo=bla&quux=bar%00" 445 | @rack_input = StringIO.new input 446 | 447 | should.raise(Rack::UTF8Sanitizer::NullByteInString) do 448 | sanitize_form_data 449 | end 450 | end 451 | 452 | it "gives precedence to encoding errors with the exception strategy and null byte sanitisation" do 453 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception) 454 | input = "foo=bla\x00&quux=bar\xED" 455 | @rack_input = StringIO.new input 456 | 457 | should.raise(EncodingError) do 458 | sanitize_form_data 459 | end 460 | end 461 | end 462 | 463 | describe "with HTTP cookies" do 464 | def request_env 465 | { 466 | "REQUEST_METHOD" => "GET", 467 | "CONTENT_TYPE" => "application/json", 468 | "HTTP_COOKIE" => @cookie, 469 | "rack.input" => StringIO.new, 470 | } 471 | end 472 | 473 | it "sanitizes bad http cookie" do 474 | @cookie = "foo=bla; quux=bar\xED" 475 | response_env = @app.(request_env) 476 | response_env['HTTP_COOKIE'].should != @cookie 477 | response_env['HTTP_COOKIE'].should == 'foo=bla; quux=bar%EF%BF%BD' 478 | end 479 | 480 | it "does not change ok http cookie" do 481 | @cookie = "foo=bla; quux=bar" 482 | response_env = @app.(request_env) 483 | response_env['HTTP_COOKIE'].should == @cookie 484 | 485 | @cookie = "foo=b%3bla; quux=b%20a%20r" 486 | response_env = @app.(request_env) 487 | response_env['HTTP_COOKIE'].should == @cookie 488 | end 489 | end 490 | 491 | describe "with custom content-type" do 492 | def request_env 493 | @plain_input = "foo bar лол".dup.force_encoding('UTF-8') 494 | { 495 | "REQUEST_METHOD" => "POST", 496 | "CONTENT_TYPE" => "application/vnd.api+json", 497 |
"HTTP_USER_AGENT" => @plain_input, 498 | "rack.input" => @rack_input, 499 | } 500 | end 501 | 502 | def sanitize_data(request_env = request_env()) 503 | @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".dup.force_encoding('UTF-8') 504 | @response_env = @app.(request_env) 505 | sanitized_input = @response_env['rack.input'].read 506 | 507 | yield sanitized_input if block_given? 508 | end 509 | 510 | it "does not sanitize custom content-type by default" do 511 | input = "foo=bla&quux=bar\xED" 512 | @rack_input = StringIO.new input 513 | 514 | env = request_env 515 | sanitize_data(env) do |sanitized_input| 516 | sanitized_input.encoding.should == Encoding::ASCII_8BIT 517 | sanitized_input.should.be.valid_encoding 518 | sanitized_input.should == input 519 | end 520 | end 521 | 522 | it "sanitizes custom content-type if additional_content_types given" do 523 | @app = Rack::UTF8Sanitizer.new(-> env { env }, additional_content_types: ["application/vnd.api+json"]) 524 | input = "foo=bla&quux=bar\xED" 525 | @rack_input = StringIO.new input 526 | 527 | env = request_env 528 | sanitize_data(env) do |sanitized_input| 529 | sanitized_input.encoding.should == Encoding::UTF_8 530 | sanitized_input.should.be.valid_encoding 531 | sanitized_input.should != input 532 | end 533 | end 534 | 535 | it "sanitizes default content-type if additional_content_types given" do 536 | @app = Rack::UTF8Sanitizer.new(-> env { env }, additional_content_types: ["application/vnd.api+json"]) 537 | input = "foo=bla&quux=bar\xED" 538 | @rack_input = StringIO.new input 539 | 540 | env = request_env.update('CONTENT_TYPE' => 'application/json') 541 | sanitize_data(env) do |sanitized_input| 542 | sanitized_input.encoding.should == Encoding::UTF_8 543 | sanitized_input.should.be.valid_encoding 544 | sanitized_input.should != input 545 | end 546 | end 547 | 548 | it "sanitizes custom content-type if sanitizable_content_types given" do 549 | @app = Rack::UTF8Sanitizer.new(-> env { env }, 
sanitizable_content_types: ["application/vnd.api+json"]) 550 | input = "foo=bla&quux=bar\xED" 551 | @rack_input = StringIO.new input 552 | 553 | env = request_env 554 | sanitize_data(env) do |sanitized_input| 555 | sanitized_input.encoding.should == Encoding::UTF_8 556 | sanitized_input.should.be.valid_encoding 557 | sanitized_input.should != input 558 | end 559 | end 560 | 561 | it "does not sanitize default content-type if sanitizable_content_types does not include it" do 562 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitizable_content_types: ["application/vnd.api+json"]) 563 | input = "foo=bla&quux=bar\xED" 564 | @rack_input = StringIO.new input 565 | 566 | env = request_env.update('CONTENT_TYPE' => 'application/json') 567 | sanitize_data(env) do |sanitized_input| 568 | sanitized_input.encoding.should == Encoding::ASCII_8BIT 569 | sanitized_input.should.be.valid_encoding 570 | sanitized_input.should == input 571 | end 572 | end 573 | end 574 | 575 | describe "with only and/or except options" do 576 | before do 577 | @plain_input = "foo\xe0".dup.force_encoding('UTF-8') 578 | end 579 | 580 | def request_env 581 | { 582 | "REQUEST_METHOD" => "POST", 583 | "CONTENT_TYPE" => "application/json", 584 | "HTTP_USER_AGENT" => @plain_input, 585 | "HTTP_CUSTOM_HEADER" => @plain_input, 586 | "rack.input" => @rack_input, 587 | } 588 | end 589 | 590 | def sanitize_data(request_env = request_env()) 591 | @response_env = @app.(request_env) 592 | end 593 | 594 | it 'skips unless in only' do 595 | @app = Rack::UTF8Sanitizer.new( 596 | -> env { env }, 597 | only: ['HTTP_CUSTOM_HEADER'] 598 | ) 599 | @rack_input = StringIO.new('{}') 600 | 601 | sanitize_data 602 | @response_env['HTTP_CUSTOM_HEADER'].should != @plain_input 603 | @response_env['HTTP_USER_AGENT'].should == @plain_input 604 | end 605 | 606 | it 'skips if in except' do 607 | @app = Rack::UTF8Sanitizer.new( 608 | -> env { env }, 609 | except: ['HTTP_CUSTOM_HEADER'] 610 | ) 611 | @rack_input = StringIO.new('{}') 
612 | 613 | sanitize_data 614 | @response_env['HTTP_CUSTOM_HEADER'].should == @plain_input 615 | @response_env['HTTP_USER_AGENT'].should != @plain_input 616 | end 617 | 618 | it 'works with regular expressions' do 619 | @app = Rack::UTF8Sanitizer.new( 620 | -> env { env }, 621 | only: ['HTTP_CUSTOM_HEADER', /(agent|input)/i] 622 | ) 623 | @rack_input = StringIO.new(@plain_input.force_encoding(Encoding::ASCII_8BIT)) 624 | 625 | sanitize_data 626 | @response_env['HTTP_CUSTOM_HEADER'].should != @plain_input 627 | @response_env['HTTP_USER_AGENT'].should != @plain_input 628 | @response_env['rack.input'].read.should != @plain_input 629 | end 630 | end 631 | 632 | describe "with custom strategy" do 633 | def request_env 634 | @plain_input = "foo bar лол".dup.force_encoding('UTF-8') 635 | { 636 | "REQUEST_METHOD" => "POST", 637 | "CONTENT_TYPE" => "application/json", 638 | "HTTP_USER_AGENT" => @plain_input, 639 | "rack.input" => @rack_input, 640 | } 641 | end 642 | 643 | def sanitize_data(request_env = request_env()) 644 | @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".dup.force_encoding('UTF-8') 645 | @response_env = @app.(request_env) 646 | sanitized_input = @response_env['rack.input'].read 647 | 648 | yield sanitized_input if block_given? 
649 | end 650 | 651 | it "calls a default strategy (replace)" do 652 | @app = Rack::UTF8Sanitizer.new(-> env { env }) 653 | 654 | input = "foo=bla&quux=bar\xED" 655 | @rack_input = StringIO.new input 656 | 657 | env = request_env 658 | sanitize_data(env) do |sanitized_input| 659 | sanitized_input.encoding.should == Encoding::UTF_8 660 | sanitized_input.should.be.valid_encoding 661 | sanitized_input.should != input 662 | end 663 | end 664 | 665 | it "calls the exception strategy" do 666 | @app = Rack::UTF8Sanitizer.new(-> env { env }, strategy: :exception) 667 | 668 | input = "foo=bla&quux=bar\xED" 669 | @rack_input = StringIO.new input 670 | 671 | env = request_env 672 | should.raise(EncodingError) { sanitize_data(env) } 673 | end 674 | 675 | it "accepts a proc as a strategy" do 676 | truncate = -> (input, sanitize_null_bytes:) do 677 | sanitize_null_bytes.should == false 678 | "replace".dup.force_encoding(Encoding::UTF_8) 679 | end 680 | 681 | @app = Rack::UTF8Sanitizer.new(-> env { env }, strategy: truncate) 682 | 683 | input = "foo=bla&quux=bar\xED" 684 | @rack_input = StringIO.new input 685 | 686 | env = request_env 687 | sanitize_data(env) do |sanitized_input| 688 | sanitized_input.encoding.should == Encoding::UTF_8 689 | sanitized_input.should.be.valid_encoding 690 | sanitized_input.should == 'replace' 691 | end 692 | end 693 | 694 | it "accepts a proc as a strategy and passes along sanitize_null_bytes" do 695 | truncate = -> (input, sanitize_null_bytes:) do 696 | sanitize_null_bytes.should == true 697 | "replace".dup.force_encoding(Encoding::UTF_8) 698 | end 699 | 700 | @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: truncate) 701 | input = "foo=bla&quux=bar\x00" 702 | 703 | @rack_input = StringIO.new input 704 | 705 | env = request_env 706 | sanitize_data(env) do |sanitized_input| 707 | sanitized_input.encoding.should == Encoding::UTF_8 708 | sanitized_input.should.be.valid_encoding 709 | sanitized_input.should == 
'replace' 710 | end 711 | end 712 | end 713 | end 714 | --------------------------------------------------------------------------------