├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile.dev ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── decontaminator.gemspec ├── docker-compose.yml ├── docker └── entrypoint.sh ├── lib ├── decontaminator.rb └── decontaminator │ ├── fragment.rb │ └── version.rb └── spec └── decontaminator └── fragment_spec.rb /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI for Decontaminator 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | matrix: 14 | ruby-version: ['2.7', '3.0', '3.1', '3.2', '3.3', 'jruby-9.4'] 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up Ruby 21 | uses: ruby/setup-ruby@v1 22 | with: 23 | ruby-version: ${{ matrix.ruby-version }} 24 | bundler-cache: true 25 | 26 | - name: Install dependencies 27 | run: bundle install 28 | 29 | - name: Run tests 30 | run: bundle exec rake 31 | 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | *.bundle 11 | *.so 12 | *.o 13 | *.a 14 | mkmf.log 15 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. This 3 | project adheres to [Semantic Versioning](http://semver.org/). 
4 | 5 | ## [1.0.2] - 2018-06-20 6 | ### Changed 7 | - Loosen dependency on Oga so we can use Oga 1.3 and 2.0 8 | 9 | ## [1.0.1] - 2018-02-01 10 | ### Fixed 11 | - Fixed decontaminating HTML with comments 12 | 13 | ## [1.0.0] - 2015-09-16 14 | ### Added 15 | - First stable version of Decontaminator 16 | 17 | [1.0.2]: https://github.com/altmetric/decontaminator/releases/tag/v1.0.2 18 | [1.0.1]: https://github.com/altmetric/decontaminator/releases/tag/v1.0.1 19 | [1.0.0]: https://github.com/altmetric/decontaminator/releases/tag/v1.0.0 20 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM ruby:2.7.8 2 | 3 | WORKDIR /app 4 | COPY . /app 5 | 6 | RUN gem install bundler -v 2.4.22 7 | RUN bundle install 8 | 9 | CMD ["docker/entrypoint.sh"] 10 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2024 Altmetric LLP 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Decontaminator 2 | 3 | Ruby HTML sanitizer based on a lightweight Oga parser. 4 | 5 | **Current version:** 1.0.2 6 | 7 | **Supported Ruby versions:** >= 2.7 8 | 9 | ## Installation 10 | 11 | Add this line to your application's Gemfile: 12 | 13 | ```ruby 14 | gem 'decontaminator' 15 | ``` 16 | 17 | And then execute: 18 | 19 | $ bundle 20 | 21 | Or install it yourself as: 22 | 23 | $ gem install decontaminator 24 | 25 | ## Usage 26 | 27 | ```ruby 28 | require 'decontaminator' 29 | 30 | input = '
<h1>Heading</h1><p>Lorem ipsum...</p>
' 31 | fragment = Decontaminator::Fragment.new(input) 32 | puts fragment.decontaminate.inspect 33 | " Heading Lorem ipsum... " 34 | ``` 35 | 36 | ## Contributing 37 | 38 | 1. Fork it ( https://github.com/altmetric/decontaminator/fork ) 39 | 2. Create your feature branch (`git checkout -b my-new-feature`) 40 | 3. Commit your changes (`git commit -am 'Add some feature'`) 41 | 4. Push to the branch (`git push origin my-new-feature`) 42 | 5. Create a new Pull Request 43 | 44 | ## License 45 | 46 | Copyright © 2015-2024 Altmetric LLP 47 | 48 | Distributed under the [MIT license](https://github.com/altmetric/decontaminator/blob/master/LICENSE.txt). 49 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rspec/core/rake_task' 2 | 3 | RSpec::Core::RakeTask.new(:spec) 4 | task default: :spec 5 | -------------------------------------------------------------------------------- /decontaminator.gemspec: -------------------------------------------------------------------------------- 1 | require File.expand_path('../lib/decontaminator/version', __FILE__) 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = 'decontaminator' 5 | spec.version = Decontaminator::VERSION 6 | spec.authors = ['Matthew MacLeod', 'Paul Mucur', 'Jakub Pawlowicz', 'Anna Klimas'] 7 | spec.email = 'support@altmetric.com' 8 | spec.homepage = 'https://github.com/altmetric/decontaminator' 9 | spec.summary = %q{HTML sanitizer using lightweight Oga HTML parser.} 10 | spec.license = 'MIT' 11 | spec.description = 'Ruby HTML sanitizer based on a lightweight Oga parser.' 
12 | 13 | spec.files = %w(README.md LICENSE.txt) + Dir['lib/**/*.rb'] 14 | spec.test_files = Dir['spec/**/*.rb'] 15 | 16 | spec.add_dependency('oga', '>= 1.3', '< 3.0') 17 | 18 | spec.add_development_dependency 'rake' 19 | spec.add_development_dependency 'rspec' 20 | end 21 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | app: 3 | build: 4 | context: . 5 | dockerfile: Dockerfile.dev 6 | stdin_open: true 7 | tty: true 8 | volumes: 9 | - .:/app 10 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Start an interactive shell to keep the container running 4 | exec bash 5 | -------------------------------------------------------------------------------- /lib/decontaminator.rb: -------------------------------------------------------------------------------- 1 | require 'decontaminator/fragment' 2 | -------------------------------------------------------------------------------- /lib/decontaminator/fragment.rb: -------------------------------------------------------------------------------- 1 | require 'oga' 2 | 3 | module Decontaminator 4 | class Fragment 5 | def initialize(html_fragment) 6 | @html_fragment = html_fragment 7 | end 8 | 9 | def decontaminate(options = {}) 10 | blacklisted_tags = NON_CONTENT_TAGS + options.fetch(:blacklist, []) 11 | 12 | sanitize(Oga.parse_html(html_fragment).children, blacklisted_tags) 13 | end 14 | 15 | private 16 | 17 | attr_reader :html_fragment 18 | 19 | NON_CONTENT_TAGS = %w( 20 | script 21 | style 22 | ) 23 | 24 | WHITESPACE_CONTENT_TAGS = %w( 25 | address 26 | article 27 | aside 28 | blockquote 29 | br 30 | dd 31 | div 32 | dl 33 | dt 34 | footer 35 | h1 36 | h2 37 | h3 38 | h4 39 | h5 40 | h6 41 | header 42 | 
hgroup 43 | hr 44 | li 45 | nav 46 | ol 47 | p 48 | pre 49 | section 50 | ul 51 | ) 52 | 53 | def sanitize(node_set, blacklisted_tags) 54 | node_set 55 | .reject { |node| comment?(node) || (!text?(node) && blacklisted_tags.include?(node.name)) } 56 | .flat_map { |node| [whitespace(node, :prefix), text(node, blacklisted_tags), whitespace(node, :suffix)] } 57 | .join 58 | end 59 | 60 | def text?(node) 61 | node.is_a?(Oga::XML::Text) 62 | end 63 | 64 | def comment?(node) 65 | node.is_a?(Oga::XML::Comment) 66 | end 67 | 68 | def whitespace(node, _position) 69 | if !text?(node) && WHITESPACE_CONTENT_TAGS.include?(node.name) 70 | ' ' 71 | else 72 | '' 73 | end 74 | end 75 | 76 | def text(node, blacklisted_tags) 77 | if text?(node) 78 | node.text 79 | else 80 | sanitize(node.children, blacklisted_tags) 81 | end 82 | end 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /lib/decontaminator/version.rb: -------------------------------------------------------------------------------- 1 | module Decontaminator 2 | VERSION = '1.0.2' 3 | end 4 | -------------------------------------------------------------------------------- /spec/decontaminator/fragment_spec.rb: -------------------------------------------------------------------------------- 1 | require 'decontaminator' 2 | 3 | RSpec.describe Decontaminator::Fragment do 4 | describe '#decontaminate' do 5 | it 'sanitizes an empty string' do 6 | expect(described_class.new('').decontaminate).to eq('') 7 | end 8 | 9 | it 'sanitizes an empty paragraph' do 10 | expect(described_class.new('').decontaminate).to eq(' ') 11 | end 12 | 13 | it 'sanitizes a paragraph' do 14 | expect(described_class.new('Text
').decontaminate).to eq(' Text ') 15 | end 16 | 17 | it 'sanitizes a formatted paragraph' do 18 | expect(described_class.new('Some text
').decontaminate).to eq(' Some text ') 19 | end 20 | 21 | it 'sanitizes a formatted paragraph with attributes' do 22 | expect(described_class.new('Some text
').decontaminate).to eq(' Some text ') 23 | end 24 | 25 | it 'sanitizes two formatted paragraphs' do 26 | expect(described_class.new('Paragraph one.
Paragraph two.
').decontaminate).to eq(' Paragraph one. Paragraph two. ') 27 | end 28 | 29 | it 'sanitizes a link' do 30 | expect(described_class.new('link').decontaminate).to eq('link') 31 | end 32 | 33 | it 'sanitizes a script' do 34 | expect(described_class.new('a{color:red}').decontaminate).to eq('') 39 | end 40 | 41 | it 'sanitizes multiple tags' do 42 | expect(described_class.new('Section.
but not that
').decontaminate(blacklist: %w(figcaption))).to eq(' but not that ') 47 | end 48 | 49 | it 'sanitizes content with comments' do 50 | expect(described_class.new('