├── .rspec ├── spec ├── spec_helper.rb └── confidential_info_redactor │ ├── hyperlink_spec.rb │ ├── extractor_spec.rb │ ├── date_spec.rb │ └── redactor_spec.rb ├── .travis.yml ├── lib ├── confidential_info_redactor │ ├── version.rb │ ├── hyperlink.rb │ ├── extractor.rb │ ├── redactor.rb │ └── date.rb └── confidential_info_redactor.rb ├── Gemfile ├── Rakefile ├── .gitignore ├── LICENSE.txt ├── confidential_info_redactor.gemspec └── README.md /.rspec: -------------------------------------------------------------------------------- 1 | --color -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'confidential_info_redactor' -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - "2.1.0" 4 | - "2.1.5" 5 | - "2.2.0" 6 | -------------------------------------------------------------------------------- /lib/confidential_info_redactor/version.rb: -------------------------------------------------------------------------------- 1 | module ConfidentialInfoRedactor 2 | VERSION = "1.0.1" 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in confidential_info_redactor.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | require 'rspec/core/rake_task' 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | task :default => :spec 6 | -------------------------------------------------------------------------------- 
# lib/confidential_info_redactor/hyperlink.rb
require 'uri'

module ConfidentialInfoRedactor
  # Detects hyperlinks in a text and swaps each one for a placeholder.
  class Hyperlink
    # A bare "scheme:" token such as "date:" or "http:" is not a hyperlink.
    # Anchored with \A ... \z so the WHOLE string must have that shape; the
    # previous `$` only anchored at the end of the first line, which made a
    # multi-line text starting with e.g. "note:" look like a non-hyperlink
    # even when a real link followed.
    NON_HYPERLINK_REGEX = /\A\w+:\z/

    # Rubular: http://rubular.com/r/fXa4lp0gfS
    HYPERLINK_REGEX = /(http|https|www)(\.|:)/

    # Text written in place of every hyperlink token.
    # NOTE(review): this literal is a single space in the listing this file
    # was taken from; it looks like an angle-bracket placeholder was lost in
    # extraction. Confirm against the upstream gem before relying on it.
    PLACEHOLDER = ' '.freeze

    # Tells whether +text+ contains something that looks like a hyperlink.
    #
    # @param text [String] a token or a longer piece of text
    # @return [Boolean] true when +text+ matches both URI.regexp and
    #   HYPERLINK_REGEX and is not just a bare "word:" token
    def hyperlink?(text)
      return false unless text !~ NON_HYPERLINK_REGEX

      !(text !~ HYPERLINK_REGEX) && !(text !~ URI.regexp)
    end

    # Replaces every whitespace-separated token that looks like a hyperlink
    # with PLACEHOLDER. One trailing period is treated as sentence
    # punctuation and is left in the text.
    #
    # @param text [String] the text to scan (never mutated)
    # @return [String] the text with hyperlink tokens replaced
    def replace(text)
      text.split(/\s+/).each do |token|
        next if token !~ HYPERLINK_REGEX

        # A String pattern is matched literally, so no Regexp escaping needed.
        text = text.gsub(token.sub(/\.\z/, ''), PLACEHOLDER)
      end
      text
    end
  end
end
Dias 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /confidential_info_redactor.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'confidential_info_redactor/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "confidential_info_redactor" 8 | spec.version = ConfidentialInfoRedactor::VERSION 9 | spec.authors = ["Kevin S. 
Dias"] 10 | spec.email = ["diasks2@gmail.com"] 11 | spec.summary = %q{Semi-automatically redact confidential information from a text} 12 | spec.description = %q{A Ruby gem to semi-automatically redact confidential information from a text} 13 | spec.homepage = "https://github.com/diasks2/confidential_info_redactor" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | spec.required_ruby_version = '>= 2.1.0' 21 | 22 | spec.add_development_dependency "bundler", "~> 1.6" 23 | spec.add_development_dependency "rake", "~> 10.0" 24 | spec.add_development_dependency "rspec" 25 | spec.add_runtime_dependency "pragmatic_segmenter" 26 | end 27 | -------------------------------------------------------------------------------- /spec/confidential_info_redactor/hyperlink_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe ConfidentialInfoRedactor::Hyperlink do 4 | context '#hyperlink?' 
do 5 | it 'returns true if the string is a hyperlink #001' do 6 | string = "http://www.example.com/this-IS-a_test/hello.html" 7 | ws = described_class.new 8 | expect(ws.hyperlink?(string)).to eq(true) 9 | end 10 | 11 | it 'returns true if the string is a hyperlink #002' do 12 | string = "http://www.google.co.uk" 13 | ws = described_class.new 14 | expect(ws.hyperlink?(string)).to eq(true) 15 | end 16 | 17 | it 'returns true if the string is a hyperlink #003' do 18 | string = "https://google.co.uk" 19 | ws = described_class.new 20 | expect(ws.hyperlink?(string)).to eq(true) 21 | end 22 | 23 | it 'returns false if the string is not a hyperlink #004' do 24 | string = "hello" 25 | ws = described_class.new 26 | expect(ws.hyperlink?(string)).to eq(false) 27 | end 28 | 29 | it 'returns false if the string is not a hyperlink #005' do 30 | string = "john@gmail.com" 31 | ws = described_class.new 32 | expect(ws.hyperlink?(string)).to eq(false) 33 | end 34 | 35 | it 'returns false if the string is not a hyperlink #006' do 36 | string = "date:" 37 | ws = described_class.new 38 | expect(ws.hyperlink?(string)).to eq(false) 39 | end 40 | 41 | it 'returns false if the string is not a hyperlink #007' do 42 | string = 'The file location is c:\Users\johndoe.' 43 | ws = described_class.new 44 | expect(ws.hyperlink?(string)).to eq(false) 45 | end 46 | end 47 | 48 | context '#replace' do 49 | it 'replaces the hyperlinks in a string with regular tokens #001' do 50 | string = "Today the date is: Jan 1. Visit https://www.example.com/hello or http://www.google.co.uk" 51 | ws = described_class.new 52 | expect(ws.replace(string)).to eq("Today the date is: Jan 1. 
# lib/confidential_info_redactor/extractor.rb
require 'confidential_info_redactor/word_lists'

module ConfidentialInfoRedactor
  # This class extracts proper nouns from a text
  class Extractor
    # Rubular: http://rubular.com/r/qE0g4r9zR7
    EXTRACT_REGEX = /(?<=\s|^|\s\"|\s\“|\s\«|\s\‹|\s\”|\s\»|\s\›)([A-Z]\S*\s)*[A-Z]\S*(?=(\s|\.|\z))|(?<=\s|^|\s\"|\s\”|\s\»|\s\›|\s\“|\s\«|\s\‹)[i][A-Z][a-z]+/

    PUNCTUATION_REGEX = /[\?\)\(\!\\\/\"\:\;\,\”\“\«\»\‹\›]/

    attr_reader :language, :corpus

    # @param args [Hash] accepts :language ('en' or 'de'); any other value,
    #   or none, selects the English word list. Unknown keys are ignored.
    def initialize(**args)
      @language = args[:language] || 'en'
      @corpus =
        if @language == 'de'
          ConfidentialInfoRedactor::WordLists::DE_WORDS
        else
          ConfidentialInfoRedactor::WordLists::EN_WORDS
        end
    end

    # Collects capitalised terms that are not in the word list for the
    # configured language.
    #
    # @param text [String] the text to scan
    # @return [Array<String>] unique candidate terms, longer than one
    #   character, in order of first appearance
    def extract(text)
      extracted_terms = []
      normalized = text.gsub(/[’‘]/, "'")
      PragmaticSegmenter::Segmenter.new(text: normalized, language: language).segment.each do |segment|
        search_ngrams(extract_preliminary_terms(segment), extracted_terms)
      end
      extracted_terms.map { |term| term.gsub(/\{\}/, '') }
                     .reject { |term| term.length <= 1 }
                     .uniq
    end

    private

    # Every EXTRACT_REGEX match in the segment whose normalised form is not
    # itself a corpus word. (`gsub` without a block or replacement returns an
    # enumerator over the matched strings.)
    def extract_preliminary_terms(segment)
      segment.to_s.gsub(EXTRACT_REGEX).reject do |match|
        corpus.include?(match.downcase.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, ''))
      end
    end

    # Strips punctuation, a trailing apostrophe, a trailing period and
    # surrounding whitespace.
    def clean_token(token)
      token.gsub(PUNCTUATION_REGEX, '').gsub(/\'$/, '').gsub(/\.\z/, '').strip
    end

    def non_confidential_token?(token, includes_confidential)
      corpus.include?(token) || !includes_confidential || singular_in_corpus?(token)
    end

    # True when dropping a plural/inflection ending (-s, -n, -en, -es, -er)
    # leaves a corpus word.
    def singular_in_corpus?(token)
      (token.end_with?('s', 'n') && corpus.include?(token[0...-1])) ||
        (token.end_with?('en', 'es', 'er') && corpus.include?(token[0...-2]))
    end

    # False as soon as any word of the token is a corpus word, except that a
    # two-word token ending in "bank" is always treated as confidential.
    def includes_confidential?(token)
      words = token.split(' ')
      return true if words.length.eql?(2) && words[1].downcase.eql?('bank')

      words.none? { |word| corpus.include?(word.downcase) }
    end

    def matching_first_token?(tokens)
      corpus.include?(tokens[0]) &&
        tokens[0] != 'the' &&
        tokens[0] != 'deutsche' &&
        tokens.length.eql?(2)
    end

    # Appends the confidential part of +string+ (if any) to +extracted_terms+
    # and returns that same array.
    def find_extracted_terms(string, extracted_terms)
      cleaned_token = clean_token(string)
      cleaned_token_downcased = clean_token(string.downcase)
      tokens = cleaned_token_downcased.split(' ')
      if matching_first_token?(tokens)
        # Common word followed by one other word: keep only the second word.
        extracted_terms << cleaned_token.split(' ')[1] unless corpus.include?(tokens[1])
      else
        unless non_confidential_token?(cleaned_token_downcased, includes_confidential?(cleaned_token))
          extracted_terms << cleaned_token
        end
      end
      extracted_terms
    end

    # Splits each n-gram on punctuation and collects the confidential pieces
    # into +extracted_terms+ (mutated in place). Pieces containing a digit
    # are skipped.
    def search_ngrams(tokens, extracted_terms)
      tokens.each do |ngram|
        ngram.split(PUNCTUATION_REGEX).each do |piece|
          next if piece =~ /\d/

          find_extracted_terms(piece, extracted_terms)
        end
      end
    end
  end
end

# lib/confidential_info_redactor/redactor.rb
require 'confidential_info_redactor/date'
require 'confidential_info_redactor/hyperlink'

module ConfidentialInfoRedactor
  # This class redacts various tokens from a text
  #
  # NOTE(review): in the listing this file was taken from, the default
  # redaction texts are empty strings and two `gsub(//, ...)` calls have an
  # empty pattern. Both look like angle-bracket placeholders lost in
  # extraction. They are reproduced unchanged here; confirm against the
  # upstream gem.
  class Redactor
    # Rubular: http://rubular.com/r/OI2wQZ0KSl
    NUMBER_REGEX = /(?<=\A|\A\()[^(]?\d+((,|\.|\/)*\d)*(\D?\s|\s|[[:cntrl:]]|[[:space:]]|\.?\s|\.$|$)|(?<=[[:cntrl:]]|[[:space:]]|\s|\s\(|\s'|\s‘)[^('‘]?\d+((,|\.|\/)*\d)*\"*(?=(\D?\s|\s|[[:cntrl:]]|[[:space:]]|\.?\s|\.$|$))|(?<=\s)\d+(nd|th|st)|(?<=\s)\d+\/\d+\"*(?=\s)|(?<=\()\S{1}\d+(?=\))|(?<=\s{1})\S{1}\d+\z|^\d+$|(?<=\A|\A\(|\s|[[:cntrl:]]|[[:space:]]|\s\()[^(]?\d+((,|\.|\/)*\d)*\D{2}(?=($|\s+))|(?<=\A|[[:cntrl:]]|[[:space:]]|\s|\A\(|\s\()[^\(\s]*\d+[^\.\s\)]*(?=\z|$|\s|\.$|\.\s|\))/

    # Rubular: http://rubular.com/r/mxcj2G0Jfa
    EMAIL_REGEX = /(?<=\A|\s|\()[\w+\-.]+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+(?=\z|\s|\.|\))/i

    attr_reader :language, :number_text, :date_text, :token_text, :tokens,
                :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks

    # @param args [Hash] :language, :tokens, :number_text, :date_text,
    #   :token_text and the :ignore_emails / :ignore_dates / :ignore_numbers /
    #   :ignore_hyperlinks switches. All are optional.
    def initialize(**args)
      @language = args[:language] || 'en'
      @tokens = args[:tokens]
      @number_text = args[:number_text] || ''
      @date_text = args[:date_text] || ''
      @token_text = args[:token_text] || ''
      @ignore_emails = args[:ignore_emails]
      @ignore_dates = args[:ignore_dates]
      @ignore_numbers = args[:ignore_numbers]
      @ignore_hyperlinks = args[:ignore_hyperlinks]
    end

    # Redacts only dates.
    def dates(text)
      redact_dates(text)
    end

    # Redacts only numbers.
    def numbers(text)
      redact_numbers(text)
    end

    # Redacts only email addresses.
    def emails(text)
      redact_emails(text)
    end

    # Redacts only hyperlinks.
    def hyperlinks(text)
      redact_hyperlinks(text)
    end

    # Redacts only the configured tokens.
    def proper_nouns(text)
      redact_tokens(text)
    end

    # Runs every redactor that has not been switched off, in the order
    # emails, hyperlinks, dates, numbers, tokens.
    #
    # @param text [String] the text to redact (never mutated)
    # @return [String] the redacted, stripped text
    def redact(text)
      redacted_text = ignore_emails ? text : redact_emails(text)
      redacted_text = redact_hyperlinks(redacted_text) unless ignore_hyperlinks
      redacted_text = redact_dates(redacted_text) unless ignore_dates
      redacted_text = redact_numbers(redacted_text) unless ignore_numbers
      redact_tokens(redacted_text)
    end

    private

    def redact_hyperlinks(txt)
      mark = Regexp.escape(token_text)
      ConfidentialInfoRedactor::Hyperlink.new.replace(txt)
                                         .gsub(//, "#{token_text}")
                                         .gsub(/\s*#{mark}\s*/, " #{token_text} ")
                                         .gsub(/#{mark}\s{1}\.{1}/, "#{token_text}.")
                                         .gsub(/#{mark}\s{1}\,{1}/, "#{token_text},")
    end

    def redact_dates(txt)
      mark = Regexp.escape(date_text)
      ConfidentialInfoRedactor::Date.new(language: language).replace(txt)
                                    .gsub(//, "#{date_text}")
                                    .gsub(/\s*#{mark}\s*/, " #{date_text} ")
                                    .gsub(/\A\s*#{mark}\s*/, "#{date_text} ")
                                    .gsub(/#{mark}\s{1}\.{1}/, "#{date_text}.")
    end

    def redact_numbers(txt)
      mark = Regexp.escape(number_text)
      txt.gsub(NUMBER_REGEX, " #{number_text} ")
         .gsub(/\s*#{mark}\s*/, " #{number_text} ")
         .gsub(/\A\s*#{mark}\s*/, "#{number_text} ")
         .gsub(/#{mark}\s{1}\.{1}/, "#{number_text}.")
         .gsub(/#{mark}\s{1}\,{1}/, "#{number_text},")
         .gsub(/#{mark}\s{1}\){1}/, "#{number_text})")
         .gsub(/\(\s{1}#{mark}/, "(#{number_text}")
         .gsub(/#{mark}\s\z/, "#{number_text}")
    end

    def redact_emails(txt)
      txt.gsub(EMAIL_REGEX, "#{token_text}")
    end

    # Replaces each configured token, those with the most words first so that
    # "John Smith" is redacted as a unit before "John" alone could match.
    #
    # `tokens` is optional, so nil is treated as "no tokens" instead of
    # raising NoMethodError. The non-bang gsub keeps the caller's string
    # untouched; the previous gsub! modified the argument in place and raised
    # on frozen strings.
    def redact_tokens(txt)
      ordered = Array(tokens).sort_by { |token| token.split.count }.reverse
      redacted = ordered.inject(txt) do |current, token|
        current.gsub(/(?<=\s|^|\")#{Regexp.escape(token)}(?=\W|$)/, "#{token_text}")
      end
      redacted.strip
    end
  end
end
# lib/confidential_info_redactor/date.rb
module ConfidentialInfoRedactor
  # Finds dates written in English or German (long form, abbreviated form or
  # digits only) and replaces each one with a placeholder.
  class Date
    EN_DOW = %w(monday tuesday wednesday thursday friday saturday sunday).freeze
    EN_DOW_ABBR = %w(mon tu tue tues wed th thu thur thurs fri sat sun).freeze
    EN_MONTHS = %w(january february march april may june july august september october november december).freeze
    EN_MONTH_ABBR = %w(jan feb mar apr jun jul aug sep sept oct nov dec).freeze

    DE_DOW = %w(montag dienstag mittwoch donnerstag freitag samstag sonntag sonnabend).freeze
    DE_DOW_ABBR = %w(mo di mi do fr sa so).freeze
    DE_MONTHS = %w(januar februar märz april mai juni juli august september oktober november dezember).freeze
    DE_MONTH_ABBR = %w(jan jän feb märz apr mai juni juli aug sep sept okt nov dez).freeze

    # Rubular: http://rubular.com/r/73CZ2HU0q6
    DMY_MDY_REGEX = /(\d{1,2}(\/|\.|-)){2}\d{4}/

    # Rubular: http://rubular.com/r/GWbuWXw4t0
    YMD_YDM_REGEX = /\d{4}(\/|\.|-)(\d{1,2}(\/|\.|-)){2}/

    # Rubular: http://rubular.com/r/SRZ27XNlvR
    DIGIT_ONLY_YEAR_FIRST_REGEX = /[12]\d{7}\D/

    # Rubular: http://rubular.com/r/mpVSeaKwdY
    DIGIT_ONLY_YEAR_LAST_REGEX = /\d{4}[12]\d{3}\D/

    # Optional English ordinal suffix after a day number. The earlier
    # `(rd|th|st)` group left out "nd", so "December 2nd, 2020" was only
    # matched up to "December 2" and the year stayed in the text.
    ORDINAL = '(st|nd|rd|th)*'.freeze

    # Text written in place of every date.
    # NOTE(review): this literal is a single space in the listing this file
    # was taken from; it looks like an angle-bracket placeholder was lost in
    # extraction. Confirm against the upstream gem before relying on it.
    PLACEHOLDER = ' '.freeze

    attr_reader :language, :dow, :dow_abbr, :months, :months_abbr

    # @param language [String] 'en' or 'de'; anything else falls back to the
    #   English vocabulary
    def initialize(language:)
      @language = language
      @dow, @dow_abbr, @months, @months_abbr =
        if language == 'de'
          [DE_DOW, DE_DOW_ABBR, DE_MONTHS, DE_MONTH_ABBR]
        else
          [EN_DOW, EN_DOW_ABBR, EN_MONTHS, EN_MONTH_ABBR]
        end
    end

    # @param text [String]
    # @return [Boolean] whether the text contains a long or digits-only date
    def includes_date?(text)
      includes_long_date?(text) || includes_number_only_date?(text)
    end

    # Replaces every recognised date with PLACEHOLDER.
    #
    # @param text [String] the text to scan (never mutated)
    # @return [String] the text with dates replaced
    def replace(text)
      return text unless is_an_array?

      # Texts containing the substring "day" get the abbreviated-weekday pass
      # first; all others get the full-weekday pass first.
      counter = text.include?('day') ? 1 : 0
      redact_regex(redact_dates(counter, text))
    end

    # NOTE(review): the empty scan pattern is reproduced from the listing and
    # is probably a stripped placeholder; as written it counts every position
    # in the replaced text. Confirm against the upstream gem.
    def occurences(text)
      replace(text).scan(//).size
    end

    # Replaces digits-only dates (DMY/MDY, YMD/YDM and eight-digit forms).
    #
    # @param text [String]
    # @return [String]
    def replace_number_only_date(text)
      text.gsub(DMY_MDY_REGEX, PLACEHOLDER)
          .gsub(YMD_YDM_REGEX, PLACEHOLDER)
          .gsub(DIGIT_ONLY_YEAR_FIRST_REGEX, PLACEHOLDER)
          .gsub(DIGIT_ONLY_YEAR_LAST_REGEX, PLACEHOLDER)
    end

    private

    # Boolean form of `text =~ pattern`.
    def matches?(text, pattern)
      !(text !~ pattern)
    end

    def is_an_array?
      [dow, dow_abbr, months, months_abbr].all? { |list| list.kind_of?(Array) }
    end

    # Runs both weekday passes; a positive counter puts the abbreviated pass
    # first.
    def redact_dates(counter, text)
      if counter > 0
        redact_dow(redact_dow_abbr(text))
      else
        redact_dow_abbr(redact_dow(text))
      end
    end

    def redact_regex(text)
      replace_number_only_date(text)
    end

    # Full weekday names combined with full, then abbreviated, month names.
    def redact_dow(text)
      dow.each do |day|
        (months + months_abbr).each { |month| text = redact_date(text, day, month) }
      end
      text
    end

    # Abbreviated weekday names; only abbreviated months may carry a period.
    def redact_dow_abbr(text)
      dow_abbr.each do |day|
        d = Regexp.escape(day)
        months.each do |month|
          text = text.gsub(/#{d}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+#{ORDINAL}(,)*\s\d{4}/i, PLACEHOLDER)
        end
        months_abbr.each do |month|
          text = text.gsub(/#{d}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i, PLACEHOLDER)
        end
      end
      text
    end

    # Applies every long-date pattern for one weekday/month pair, most
    # specific first so the longest form wins.
    def redact_date(text, day, month)
      d = Regexp.escape(day)
      m = Regexp.escape(month)
      text.gsub(/#{d}(,)*\s#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i, PLACEHOLDER)
          .gsub(/\d+\s+de\s+#{m}\s\d{4}/i, PLACEHOLDER)
          .gsub(/\d{2}(\.|-|\/)*\s?#{m}(\.|-|\/)*\s?(\d{4}|\d{2})/i, PLACEHOLDER)
          .gsub(/#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i, PLACEHOLDER)
          .gsub(/\d{4}\.*\s#{m}\s\d+#{ORDINAL}/i, PLACEHOLDER)
          .gsub(/\d{4}(\.|-|\/)*#{m}(\.|-|\/)*\d+/i, PLACEHOLDER)
          .gsub(/#{m}(\.)*\s\d+#{ORDINAL}/i, PLACEHOLDER)
          .gsub(/#{m}\sde\s\d+#{ORDINAL}/i, PLACEHOLDER)
    end

    def includes_long_date?(text)
      includes_long_date_1?(text) || includes_long_date_2?(text)
    end

    def includes_long_date_1?(text)
      dow.any? do |day|
        (months + months_abbr).any? { |month| check_for_matches(day, month, text) }
      end
    end

    def includes_long_date_2?(text)
      dow_abbr.any? do |day|
        d = Regexp.escape(day)
        months.any? { |month| matches?(text, /#{d}(\.)*(,)*\s#{Regexp.escape(month)}\s\d+#{ORDINAL}(,)*\s\d{4}/i) } ||
          months_abbr.any? { |month| matches?(text, /#{d}(\.)*(,)*\s#{Regexp.escape(month)}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i) }
      end
    end

    def includes_number_only_date?(text)
      [DMY_MDY_REGEX, YMD_YDM_REGEX, DIGIT_ONLY_YEAR_FIRST_REGEX, DIGIT_ONLY_YEAR_LAST_REGEX].any? do |pattern|
        matches?(text, pattern)
      end
    end

    def check_for_matches(day, month, text)
      d = Regexp.escape(day)
      m = Regexp.escape(month)
      [
        /#{d}(,)*\s#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i,
        /#{m}(\.)*\s\d+#{ORDINAL}(,)*\s\d{4}/i,
        /\d{4}\.*\s#{m}\s\d+#{ORDINAL}/i,
        /\d{4}(\.|-|\/)*#{m}(\.|-|\/)*\d+/i,
        /#{m}(\.)*\s\d+#{ORDINAL}/i,
        /\d{2}(\.|-|\/)*#{m}(\.|-|\/)*(\d{4}|\d{2})/i
      ].any? { |pattern| matches?(text, pattern) }
    end
  end
end
156 | end 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Confidential Info Redactor 2 | 3 | [![Gem Version](https://badge.fury.io/rb/confidential_info_redactor.svg)](http://badge.fury.io/rb/confidential_info_redactor) [![Build Status](https://travis-ci.org/diasks2/confidential_info_redactor.png)](https://travis-ci.org/diasks2/confidential_info_redactor) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/confidential_info_redactor/blob/master/LICENSE.txt) 4 | 5 | Confidential Info Redactor is a Ruby gem to semi-automatically redact confidential information from a text. 6 | 7 | This gem is a poor man's named-entity recognition (NER) library built to extract (and later redact) information in a text (such as proper nouns) that may be confidential. 8 | 9 | It differs from typical NER as it makes no attempt to identify whether a token is a person, company, location, etc. It only attempts to extract tokens that might fit into one of those categories. 10 | 11 | Your use case may vary, but the gem was written to first extract potential sensitive tokens from a text and then show the user the extracted tokens and let the user decide which ones should be redacted (or add missing tokens to the list). 12 | 13 | The way the gem works is rather simple. It uses regular expressions to search for capitalized tokens (1-grams, 2-grams, 3-grams etc.) and then checks whether those tokens match a list of the common vocabulary for that language (e.g. the x most frequent words - the size of x depending on what is available for that language). If the token is not in the list of words for that language it is added to an array of tokens that should be checked by the user as potential "confidential information". 14 | 15 | In the sentence "Pepsi and Coca-Cola battled for position in the market." 
the gem would extract "Pepsi" and "Coca-Cola" as potential tokens to redact. 16 | 17 | In addition to searching for proper nouns, the gem also has the functionality to redact numbers, dates, emails and hyperlinks. 18 | 19 | This gem comes with built-in language support for English and German. If you are interested in other language support, check out [Confidential Info Redactor Lite](https://github.com/diasks2/confidential_info_redactor_lite) where you can supply your own language vocabulary files. 20 | 21 | ## Install 22 | 23 | **Ruby** 24 | *Supports Ruby 2.1.0 and above* 25 | ``` 26 | gem install confidential_info_redactor 27 | ``` 28 | 29 | **Ruby on Rails** 30 | Add this line to your application’s Gemfile: 31 | ```ruby 32 | gem 'confidential_info_redactor' 33 | ``` 34 | 35 | ## Usage 36 | 37 | * If no language is specified, the library will default to English. 38 | * To specify a language use its two character [ISO 639-1 code](https://www.tm-town.com/languages). 39 | 40 | ```ruby 41 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 42 | 43 | tokens = ConfidentialInfoRedactor::Extractor.new.extract(text) 44 | # => ["Coca-Cola", "Pepsi", "John Smith"] 45 | 46 | ConfidentialInfoRedactor::Redactor.new(tokens: tokens).redact(text) 47 | # => ' announced a merger with that will happen on for . Please contact at or visit .' 48 | 49 | # You can also just use a specific redactor 50 | ConfidentialInfoRedactor::Redactor.new.dates(text) 51 | # => 'Coca-Cola announced a merger with Pepsi that will happen on for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 52 | 53 | ConfidentialInfoRedactor::Redactor.new.numbers(text) 54 | # => 'Coca-Cola announced a merger with Pepsi that will happen on December , for . 
Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 55 | 56 | ConfidentialInfoRedactor::Redactor.new.emails(text) 57 | # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at or visit http://www.super-fake-merger.com.' 58 | 59 | ConfidentialInfoRedactor::Redactor.new.hyperlinks(text) 60 | # => 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit .' 61 | 62 | ConfidentialInfoRedactor::Redactor.new(tokens: tokens).proper_nouns(text) 63 | # => ' announced a merger with that will happen on December 15th, 2020 for $200,000,000,000. Please contact at j.smith@example.com or visit http://www.super-fake-merger.com.' 64 | 65 | # It is possible to 'turn off' any of the specific redactors 66 | ConfidentialInfoRedactor::Redactor.new(tokens: tokens, ignore_numbers: true).redact(text) 67 | # => ' announced a merger with that will happen on for $200,000,000,000. Please contact at or visit .' 68 | 69 | # German Example 70 | text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.' 71 | 72 | tokens = ConfidentialInfoRedactor::Extractor.new(language: 'de').extract(text) 73 | # => ['Deutschen Bank'] 74 | 75 | ConfidentialInfoRedactor::Redactor.new(language: 'de', tokens: tokens).redact(text) 76 | # => 'Viele Mitarbeiter der suchen eine andere Arbeitsstelle.' 77 | 78 | # It is also possible to change the redaction text 79 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 
80 | tokens = ['Coca-Cola', 'Pepsi', 'John Smith'] 81 | ConfidentialInfoRedactor::Redactor.new(tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text) 82 | # => '***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.' 83 | ``` 84 | 85 | #### Redactor class options 86 | * `language` *(optional - defaults to 'en' if not specified)* 87 | * `tokens` *(optional - any tokens to redact from the text)* 88 | * `number_text` *(optional - change the text for redacted numbers; the standard is ``)* 89 | * `date_text` *(optional - change the text for redacted dates; the standard is ``)* 90 | * `token_text` *(optional - change the text for redacted tokens, emails and hyperlinks; the standard is ``)* 91 | * `ignore_emails` *(optional - set to true if you do not want to redact emails)* 92 | * `ignore_dates` *(optional - set to true if you do not want to redact dates)* 93 | * `ignore_numbers` *(optional - set to true if you do not want to redact numbers)* 94 | * `ignore_hyperlinks` *(optional - set to true if you do not want to redact hyperlinks)* 95 | 96 | #### Languages Supported 97 | * English ('en') 98 | * German ('de') 99 | 100 | ## Contributing 101 | 102 | 1. Fork it ( https://github.com/diasks2/confidential_info_redactor/fork ) 103 | 2. Create your feature branch (`git checkout -b my-new-feature`) 104 | 3. Commit your changes (`git commit -am 'Add some feature'`) 105 | 4. Push to the branch (`git push origin my-new-feature`) 106 | 5. Create a new Pull Request 107 | 108 | ## License 109 | 110 | The MIT License (MIT) 111 | 112 | Copyright (c) 2015 Kevin S. 
Dias 113 | 114 | Permission is hereby granted, free of charge, to any person obtaining a copy 115 | of this software and associated documentation files (the "Software"), to deal 116 | in the Software without restriction, including without limitation the rights 117 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 118 | copies of the Software, and to permit persons to whom the Software is 119 | furnished to do so, subject to the following conditions: 120 | 121 | The above copyright notice and this permission notice shall be included in 122 | all copies or substantial portions of the Software. 123 | 124 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 125 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 126 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 127 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 128 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 129 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 130 | THE SOFTWARE. -------------------------------------------------------------------------------- /spec/confidential_info_redactor/extractor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe ConfidentialInfoRedactor::Extractor do 4 | describe '#extract' do 5 | context 'English (en)' do 6 | it 'extracts the proper nouns from a text #001' do 7 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.' 8 | expect(described_class.new(language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi']) 9 | end 10 | 11 | it 'extracts the proper nouns from a text #002' do 12 | text = 'Coca-Cola announced a merger with Pepsi.' 
13 | expect(described_class.new(language: 'en').extract(text)).to eq(['Coca-Cola', 'Pepsi']) 14 | end 15 | 16 | it 'extracts the proper nouns from a text #003' do 17 | text = 'Many employees of Deutsche Bank are looking for another job.' 18 | expect(described_class.new(language: 'en').extract(text)).to eq(['Deutsche Bank']) 19 | end 20 | 21 | it 'extracts the proper nouns from a text #004' do 22 | text = 'Many employees of Deutsche Bank are looking for another job while those from Pepsi are not.' 23 | expect(described_class.new(language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi']) 24 | end 25 | 26 | it 'extracts the proper nouns from a text #005' do 27 | text = 'There are many employees at Deutsche Bank. Some are thinking about drinking Pepsi, Coke, or Sprite.' 28 | expect(described_class.new(language: 'en').extract(text)).to eq(['Deutsche Bank', 'Pepsi', 'Coke', 'Sprite']) 29 | end 30 | 31 | it 'extracts the proper nouns from a text #006' do 32 | text = <<-EOF 33 | Putter King Miniature Golf Scavenger Hunt 34 | 35 | Putter King is hosting the 1st Annual Miniature Golf Scavenger Hunt. So get out your putter and your camera and see if you have what it takes. Are you a King? 36 | 37 | The Official List: 38 | 39 | #1) Autographs of 2 professional miniature golfers, each from a different country. (45 points; 5 bonus points if the professional miniature golfers are also from 2 different continents) 40 | 41 | #2) Picture of yourself next to each obstacle in our list of the Top 10 Nostalgic Miniature Golf Obstacles. (120 points; 20 bonus points for each obstacle that exactly matches the one pictured in the article) 42 | 43 | #3) Build your own full-size miniature golf hole. (75 points; up to 100 bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) 44 | 45 | #4) Video of yourself making a hole-in-one on two consecutive miniature golf holes. The video must be one continuous shot with no editing. 
(60 points) 46 | 47 | #5) Picture of yourself with the Putter King mascot. (50 points; 15 bonus points if you are wearing a Putter King t-shirt) 48 | 49 | #6) Picture of yourself with the completed Putter King wobblehead. (15 points; 15 bonus points if the picture is taken at a miniature golf course) 50 | 51 | #7) Picture of a completed scorecard from a round of miniature golf. The round of golf must have taken place after the start of this scavenger hunt. (10 points) 52 | 53 | #8) Picture of completed scorecards from 5 different miniature golf courses. Each round of golf must have taken place after the start of this scavenger hunt. (35 points) 54 | 55 | #9) Submit an entry to the 2011 Putter King Hole Design Contest. (60 points; 40 bonus points if your entry gets more than 100 votes) 56 | 57 | #10) Screenshot from the Putter King app showing a 9-hole score below par. (10 points) 58 | 59 | #11) Screenshot from the Putter King app showing that you have successfully unlocked all of the holes in the game. (45 points) 60 | 61 | #12) Picture of the Putter King wobblehead at a World Heritage Site. (55 points) 62 | 63 | #13) Complete and submit the Putter King ‘Practice Activity’ and ‘Final Project’ for any one of the Putter King math or physics lessons. (40 points; 20 bonus points if you complete two lessons) 64 | 65 | #14) Picture of yourself with at least 6 different colored miniature golf balls. (10 points; 2 bonus points for each additional color {limit of 10 bonus points}) 66 | 67 | #15) Picture of yourself with a famous golfer or miniature golfer. (15 points; 150 bonus points if the golfer is on the PGA tour AND you are wearing a Putter King t-shirt in the picture) 68 | 69 | #16) Video of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (30 points) 70 | 71 | #17) Video of yourself successfully making a trick miniature golf shot. 
(40 points; up to 100 bonus points available depending on the difficulty and complexity of the trick shot) 72 | 73 | 74 | Prizes: 75 | 76 | $100 iTunes Gift Card 77 | 78 | Putter King Scavenger Hunt Trophy 79 | (6 3/4" Engraved Crystal Trophy - Picture Coming Soon) 80 | 81 | The Putter King team will judge the scavenger hunt and all decisions will be final. The U.S. Government is sponsoring it. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on Dec. 31st, 2011. 82 | 83 | To enter the scavenger hunt, send an email to info AT putterking DOT com with the subject line: "Putter King Scavenger Hunt Submission". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as YouTube, Flickr, Picasa, Photobucket, etc. for your submissions. 84 | 85 | By entering the Putter King Miniature Golf Scavenger Hunt, you allow Putter King to use or link to any of the pictures or videos you submit for advertisements and promotions. 86 | 87 | Don’t forget to use your imagination and creativity! 88 | EOF 89 | expect(described_class.new(text: text).extract(text)).to eq(["PGA", "iTunes", "YouTube", "Flickr", "Picasa", "Photobucket"]) 90 | end 91 | 92 | it 'extracts the proper nouns from a text #007' do 93 | text = 'I learned that Apple has plans to release a new iPhone, iPad and iWatch.' 94 | expect(described_class.new(language: 'en').extract(text)).to eq(['Apple', 'iPhone', 'iPad', 'iWatch']) 95 | end 96 | 97 | it 'extracts the proper nouns from a text #008' do 98 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 99 | expect(described_class.new(language: 'en').extract(text)).to eq(["Coca-Cola", "Pepsi", "John Smith"]) 100 | end 101 | 102 | it 'extracts the proper nouns from a text #009' do 103 | text = 'Then Peter went to the store.' 
104 | expect(described_class.new(language: 'en').extract(text)).to eq(["Peter"]) 105 | end 106 | 107 | it 'extracts the proper nouns from a text #010' do 108 | text = 'HOW TO COOK VEGETABLES' 109 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 110 | end 111 | 112 | it 'extracts the proper nouns from a text #011' do 113 | text = 'All Natural Peanut Butter' 114 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 115 | end 116 | 117 | it 'extracts the proper nouns from a text #012' do 118 | text = 'GOOD CARBS VS. BAD CARBS' 119 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 120 | end 121 | 122 | it 'extracts the proper nouns from a text #013' do 123 | text = 'Reducing”' 124 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 125 | end 126 | 127 | it 'extracts the proper nouns from a text #014' do 128 | text = '”' 129 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 130 | end 131 | 132 | it 'extracts the proper nouns from a text #015' do 133 | text = '“Reducing' 134 | expect(described_class.new(language: 'en').extract(text)).to eq([]) 135 | end 136 | 137 | it 'extracts the proper nouns from a text #016' do 138 | text = 'Corrigendum to Council Regulation (EC) No 85/2009 of 19 January 2009 amending Regulation (EC) No 1083/2006 laying down general provisions on the European Regional Development Fund, the European Social Fund and the Cohesion Fund concerning certain provisions relating to financial management' 139 | expect(described_class.new(language: 'en').extract(text)).to eq(["Corrigendum"]) 140 | end 141 | 142 | it 'extracts the proper nouns from a text #017' do 143 | text = 'John' 144 | expect(described_class.new(language: 'en').extract(text)).to eq(['John']) 145 | end 146 | end 147 | 148 | context 'German (de)' do 149 | it 'extracts the proper nouns from a text #001' do 150 | text = 'Viele Mitarbeiter der Deutschen Bank suchen eine andere Arbeitsstelle.' 
151 | expect(described_class.new(language: 'de').extract(text)).to eq(['Deutschen Bank']) 152 | end 153 | 154 | it 'extracts the proper nouns from a text #002' do 155 | text = 'Viele Mitarbeiter der Deutsche Bank suchen eine andere Arbeitsstelle.' 156 | expect(described_class.new(language: 'de').extract(text)).to eq(['Deutsche Bank']) 157 | end 158 | 159 | it 'extracts the proper nouns from a text #003' do 160 | text = 'Ich behielt diese Routine während und sogar während des Studiums an der Uni bei, und ich war damals froh, wenn ich pro Tag zwei ganze Mahlzeiten zu mir nahm.' 161 | expect(described_class.new(language: 'de').extract(text)).to eq([]) 162 | end 163 | end 164 | end 165 | end -------------------------------------------------------------------------------- /spec/confidential_info_redactor/date_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe ConfidentialInfoRedactor::Date do 4 | context '#includes_date?' do 5 | it 'returns true if the string includes a date #001' do 6 | string = 'Today is Monday, April 4th, 2011, aka 04/04/2011.' 7 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 8 | expect(ws.includes_date?(string)).to eq(true) 9 | end 10 | 11 | it 'returns true if the string includes a date #002' do 12 | string = 'Today is Monday April 4th 2011.' 13 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 14 | expect(ws.includes_date?(string)).to eq(true) 15 | end 16 | 17 | it 'returns true if the string includes a date #003' do 18 | string = 'Today is April 4th, 2011.' 19 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 20 | expect(ws.includes_date?(string)).to eq(true) 21 | end 22 | 23 | it 'returns true if the string includes a date #004' do 24 | string = 'Today is Mon., Apr. 4, 2011.' 
25 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 26 | expect(ws.includes_date?(string)).to eq(true) 27 | end 28 | 29 | it 'returns true if the string includes a date #005' do 30 | string = 'Today is 04/04/2011.' 31 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 32 | expect(ws.includes_date?(string)).to eq(true) 33 | end 34 | 35 | it 'returns true if the string includes a date #006' do 36 | string = 'Today is 04.04.2011.' 37 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 38 | expect(ws.includes_date?(string)).to eq(true) 39 | end 40 | 41 | it 'returns true if the string includes a date #007' do 42 | string = 'Today is 2011.04.04.' 43 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 44 | expect(ws.includes_date?(string)).to eq(true) 45 | end 46 | 47 | it 'returns true if the string includes a date #008' do 48 | string = 'Today is 2011/04/04.' 49 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 50 | expect(ws.includes_date?(string)).to eq(true) 51 | end 52 | 53 | it 'returns true if the string includes a date #009' do 54 | string = 'Today is 2011-04-04.' 55 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 56 | expect(ws.includes_date?(string)).to eq(true) 57 | end 58 | 59 | it 'returns true if the string includes a date #010' do 60 | string = 'Today is 04-04-2011.' 61 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 62 | expect(ws.includes_date?(string)).to eq(true) 63 | end 64 | 65 | it 'returns true if the string includes a date #011' do 66 | string = 'Today is 2003 November 9.' 67 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 68 | expect(ws.includes_date?(string)).to eq(true) 69 | end 70 | 71 | it 'returns true if the string includes a date #012' do 72 | string = 'Today is 2003Nov9.' 
73 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 74 | expect(ws.includes_date?(string)).to eq(true) 75 | end 76 | 77 | it 'returns true if the string includes a date #013' do 78 | string = 'Today is 2003Nov09.' 79 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 80 | expect(ws.includes_date?(string)).to eq(true) 81 | end 82 | 83 | it 'returns true if the string includes a date #014' do 84 | string = 'Today is 2003-Nov-9.' 85 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 86 | expect(ws.includes_date?(string)).to eq(true) 87 | end 88 | 89 | it 'returns true if the string includes a date #015' do 90 | string = 'Today is 2003-Nov-09.' 91 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 92 | expect(ws.includes_date?(string)).to eq(true) 93 | end 94 | 95 | it 'returns true if the string includes a date #016' do 96 | string = 'Today is 2003-Nov-9, Sunday.' 97 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 98 | expect(ws.includes_date?(string)).to eq(true) 99 | end 100 | 101 | it 'returns true if the string includes a date #017' do 102 | string = 'Today is 2003. november 9.' 103 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 104 | expect(ws.includes_date?(string)).to eq(true) 105 | end 106 | 107 | it 'returns true if the string includes a date #018' do 108 | string = 'Today is 2003.11.9.' 109 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 110 | expect(ws.includes_date?(string)).to eq(true) 111 | end 112 | 113 | it 'returns true if the string includes a date #019' do 114 | string = 'Today is Monday, Apr. 4, 2011.' 115 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 116 | expect(ws.includes_date?(string)).to eq(true) 117 | end 118 | 119 | it 'returns true if the string includes a date #020' do 120 | string = 'Today is 2003/11/09.' 
121 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 122 | expect(ws.includes_date?(string)).to eq(true) 123 | end 124 | 125 | it 'returns true if the string includes a date #021' do 126 | string = 'Today is 20030109.' 127 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 128 | expect(ws.includes_date?(string)).to eq(true) 129 | end 130 | 131 | it 'returns true if the string includes a date #022' do 132 | string = 'Today is 01092003.' 133 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 134 | expect(ws.includes_date?(string)).to eq(true) 135 | end 136 | 137 | it 'returns true if the string includes a date #023' do 138 | string = 'Today is Sunday, November 9, 2014.' 139 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 140 | expect(ws.includes_date?(string)).to eq(true) 141 | end 142 | 143 | it 'returns true if the string includes a date #024' do 144 | string = 'Today is November 9, 2014.' 145 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 146 | expect(ws.includes_date?(string)).to eq(true) 147 | end 148 | 149 | it 'returns true if the string includes a date #025' do 150 | string = 'Today is Nov. 9, 2014.' 151 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 152 | expect(ws.includes_date?(string)).to eq(true) 153 | end 154 | 155 | it 'returns true if the string includes a date #026' do 156 | string = 'Today is july 1st.' 157 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 158 | expect(ws.includes_date?(string)).to eq(true) 159 | end 160 | 161 | it 'returns true if the string includes a date #027' do 162 | string = 'Today is jul. 1st.' 163 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 164 | expect(ws.includes_date?(string)).to eq(true) 165 | end 166 | 167 | it 'returns true if the string includes a date #028' do 168 | string = 'Today is 8 November 2014.' 
169 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 170 | expect(ws.includes_date?(string)).to eq(true) 171 | end 172 | 173 | it 'returns true if the string includes a date #029' do 174 | string = 'Today is 8. November 2014.' 175 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 176 | expect(ws.includes_date?(string)).to eq(true) 177 | end 178 | 179 | it 'returns true if the string includes a date #030' do 180 | string = 'Today is 08-Nov-2014.' 181 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 182 | expect(ws.includes_date?(string)).to eq(true) 183 | end 184 | 185 | it 'returns true if the string includes a date #031' do 186 | string = 'Today is 08Nov14.' 187 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 188 | expect(ws.includes_date?(string)).to eq(true) 189 | end 190 | 191 | it 'returns true if the string includes a date #032' do 192 | string = 'Today is 8th November 2014.' 193 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 194 | expect(ws.includes_date?(string)).to eq(true) 195 | end 196 | 197 | it 'returns true if the string includes a date #033' do 198 | string = 'Today is the 8th of November 2014.' 199 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 200 | expect(ws.includes_date?(string)).to eq(true) 201 | end 202 | 203 | it 'returns true if the string includes a date #034' do 204 | string = 'Today is 08/Nov/2014.' 205 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 206 | expect(ws.includes_date?(string)).to eq(true) 207 | end 208 | 209 | it 'returns true if the string includes a date #035' do 210 | string = 'Today is Sunday, 8 November 2014.' 211 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 212 | expect(ws.includes_date?(string)).to eq(true) 213 | end 214 | 215 | it 'returns true if the string includes a date #036' do 216 | string = 'Today is 8 November 2014.' 
217 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 218 | expect(ws.includes_date?(string)).to eq(true) 219 | end 220 | 221 | it 'returns false if the string does not include a date #037' do 222 | string = 'Hello world. There is no date here - $50,000. The sun is hot.' 223 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 224 | expect(ws.includes_date?(string)).to eq(false) 225 | end 226 | end 227 | 228 | context '#occurences' do 229 | it 'counts the date occurences in a string #001' do 230 | string = 'Today is Sunday, 8 November 2014.' 231 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 232 | expect(ws.occurences(string)).to eq(1) 233 | end 234 | 235 | it 'counts the date occurences in a string #002' do 236 | string = 'Today is Sunday, 8 November 2014. Yesterday was 07/Nov/2014.' 237 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 238 | expect(ws.occurences(string)).to eq(2) 239 | end 240 | end 241 | 242 | context '#replace' do 243 | context 'English (en)' do 244 | it 'replaces the date occurences in a string #001' do 245 | string = 'Today is Tues. March 3rd, 2011.' 246 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 247 | expect(ws.replace(string)).to eq('Today is .') 248 | end 249 | 250 | it 'replaces the date occurences in a string #002' do 251 | string = 'The scavenger hunt ends on Dec. 31st, 2011.' 252 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 253 | expect(ws.replace(string)).to eq('The scavenger hunt ends on .') 254 | end 255 | end 256 | 257 | context 'German (de)' do 258 | it 'replaces the date occurences in a string #001' do 259 | string = '15. 
Oktober 2015' 260 | ws = ConfidentialInfoRedactor::Date.new(language: 'de') 261 | expect(ws.replace(string)).to eq(' ') 262 | end 263 | 264 | it 'replaces the date occurences in a string #002' do 265 | string = 'Oktober de 15' 266 | ws = ConfidentialInfoRedactor::Date.new(language: 'de') 267 | expect(ws.replace(string)).to eq(' ') 268 | end 269 | end 270 | end 271 | 272 | context '#replace_number_only_date' do 273 | it 'replaces only the number date occurences in a string' do 274 | string = 'Today is Tues. March 3rd, 2011. 4/28/2013' 275 | ws = ConfidentialInfoRedactor::Date.new(language: 'en') 276 | expect(ws.replace_number_only_date(string)).to eq("Today is Tues. March 3rd, 2011. ") 277 | end 278 | end 279 | end 280 | -------------------------------------------------------------------------------- /spec/confidential_info_redactor/redactor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe ConfidentialInfoRedactor::Redactor do 4 | describe '#dates' do 5 | it 'redacts dates from a text #001' do 6 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000.' 7 | expect(described_class.new(language: 'en').dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on for $200,000,000,000.') 8 | end 9 | 10 | it 'redacts dates from a text #002' do 11 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020.' 12 | expect(described_class.new(language: 'en').dates(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on .') 13 | end 14 | 15 | it 'redacts dates from a text #003' do 16 | text = 'December 5, 2010 - Coca-Cola announced a merger with Pepsi.' 17 | expect(described_class.new(language: 'en').dates(text)).to eq(' - Coca-Cola announced a merger with Pepsi.') 18 | end 19 | 20 | it 'redacts dates from a text #004' do 21 | text = 'The scavenger hunt ends on Dec. 
31st, 2011.' 22 | expect(described_class.new(language: 'en').dates(text)).to eq('The scavenger hunt ends on .') 23 | end 24 | end 25 | 26 | describe '#numbers' do 27 | it 'redacts numbers from a text #001' do 28 | text = 'Coca-Cola announced a merger with Pepsi that will happen on for $200,000,000,000.' 29 | expect(described_class.new(language: 'en').numbers(text)).to eq('Coca-Cola announced a merger with Pepsi that will happen on for .') 30 | end 31 | 32 | it 'redacts numbers from a text #002' do 33 | text = '200 years ago.' 34 | expect(described_class.new(language: 'en').numbers(text)).to eq(' years ago.') 35 | end 36 | 37 | it 'redacts numbers from a text #003' do 38 | text = 'It was his 1st time, not yet his 10th, not even his 2nd. The wood was 3/4" thick.' 39 | expect(described_class.new(language: 'en').numbers(text)).to eq('It was his time, not yet his , not even his . The wood was thick.') 40 | end 41 | 42 | it 'redacts numbers from a text #004' do 43 | text = 'Checking file of %2' 44 | expect(described_class.new(language: 'en').numbers(text)).to eq('Checking file of ') 45 | end 46 | 47 | it 'redacts numbers from a text #005' do 48 | text = 'zawiera pliki skompresowane (%2).' 49 | expect(described_class.new(language: 'en').numbers(text)).to eq('zawiera pliki skompresowane ().') 50 | end 51 | end 52 | 53 | describe '#emails' do 54 | it 'redacts email addresses from a text #001' do 55 | text = 'His email is john@gmail.com or you can try k.light@tuv.eu.us.' 56 | expect(described_class.new(language: 'en').emails(text)).to eq('His email is or you can try .') 57 | end 58 | 59 | it 'redacts email addresses from a text #002' do 60 | text = 'His email is (john@gmail.com) or you can try (k.light@tuv.eu.us).' 
61 | expect(described_class.new(language: 'en').emails(text)).to eq('His email is () or you can try ().') 62 | end 63 | end 64 | 65 | describe '#hyperlinks' do 66 | it 'redacts hyperlinks from a text #001' do 67 | text = 'Visit https://www.tm-town.com for more info.' 68 | expect(described_class.new(language: 'en').hyperlinks(text)).to eq('Visit for more info.') 69 | end 70 | end 71 | 72 | describe '#proper_nouns' do 73 | it 'redacts tokens from a text #001' do 74 | tokens = ['Coca-Cola', 'Pepsi'] 75 | text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.' 76 | expect(described_class.new(language: 'en', tokens: tokens).proper_nouns(text)).to eq(' announced a merger with that will happen on on December 15th, 2020 for $200,000,000,000.') 77 | end 78 | 79 | it 'redacts tokens from a text #002' do 80 | tokens = ['Coca-Cola', 'Pepsi'] 81 | text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.' 82 | expect(described_class.new(language: 'en', tokens: tokens, token_text: '*****').proper_nouns(text)).to eq('***** announced a merger with ***** that will happen on on December 15th, 2020 for $200,000,000,000.') 83 | end 84 | end 85 | 86 | describe '#redact' do 87 | it 'redacts all confidential information from a text #001' do 88 | tokens = ['Coca-Cola', 'Pepsi'] 89 | text = 'Coca-Cola announced a merger with Pepsi that will happen on on December 15th, 2020 for $200,000,000,000.' 90 | expect(described_class.new(language: 'en', tokens: tokens).redact(text)).to eq(' announced a merger with that will happen on on for .') 91 | end 92 | 93 | it 'redacts all confidential information from a text #002' do 94 | text = <<-EOF 95 | Putter King Miniature Golf Scavenger Hunt 96 | 97 | Putter King is hosting the 1st Annual Miniature Golf Scavenger Hunt. So get out your putter and your camera and see if you have what it takes. Are you a King? 
98 | 99 | The Official List: 100 | 101 | #1) Autographs of 2 professional miniature golfers, each from a different country. (45 points; 5 bonus points if the professional miniature golfers are also from 2 different continents) 102 | 103 | #2) Picture of yourself next to each obstacle in our list of the Top 10 Nostalgic Miniature Golf Obstacles. (120 points; 20 bonus points for each obstacle that exactly matches the one pictured in the article) 104 | 105 | #3) Build your own full-size miniature golf hole. (75 points; up to 100 bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) 106 | 107 | #4) Video of yourself making a hole-in-one on two consecutive miniature golf holes. The video must be one continuous shot with no editing. (60 points) 108 | 109 | #5) Picture of yourself with the Putter King mascot. (50 points; 15 bonus points if you are wearing a Putter King t-shirt) 110 | 111 | #6) Picture of yourself with the completed Putter King wobblehead. (15 points; 15 bonus points if the picture is taken at a miniature golf course) 112 | 113 | #7) Picture of a completed scorecard from a round of miniature golf. The round of golf must have taken place after the start of this scavenger hunt. (10 points) 114 | 115 | #8) Picture of completed scorecards from 5 different miniature golf courses. Each round of golf must have taken place after the start of this scavenger hunt. (35 points) 116 | 117 | #9) Submit an entry to the 2011 Putter King Hole Design Contest. (60 points; 40 bonus points if your entry gets more than 100 votes) 118 | 119 | #10) Screenshot from the Putter King app showing a 9-hole score below par. (10 points) 120 | 121 | #11) Screenshot from the Putter King app showing that you have successfully unlocked all of the holes in the game. (45 points) 122 | 123 | #12) Picture of the Putter King wobblehead at a World Heritage Site. 
(55 points) 124 | 125 | #13) Complete and submit the Putter King ‘Practice Activity’ and ‘Final Project’ for any one of the Putter King math or physics lessons. (40 points; 20 bonus points if you complete two lessons) 126 | 127 | #14) Picture of yourself with at least 6 different colored miniature golf balls. (10 points; 2 bonus points for each additional color {limit of 10 bonus points}) 128 | 129 | #15) Picture of yourself with a famous golfer or miniature golfer. (15 points; 150 bonus points if the golfer is on the PGA tour AND you are wearing a Putter King t-shirt in the picture) 130 | 131 | #16) Video of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. (30 points) 132 | 133 | #17) Video of yourself successfully making a trick miniature golf shot. (40 points; up to 100 bonus points available depending on the difficulty and complexity of the trick shot) 134 | 135 | 136 | Prizes: 137 | 138 | $100 iTunes Gift Card 139 | 140 | Putter King Scavenger Hunt Trophy 141 | (6 3/4" Engraved Crystal Trophy - Picture Coming Soon) 142 | 143 | The Putter King team will judge the scavenger hunt and all decisions will be final. The U.S. Government is sponsoring it. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on Dec. 31st, 2011. 144 | 145 | To enter the scavenger hunt, send an email to info AT putterking DOT com with the subject line: "Putter King Scavenger Hunt Submission". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as YouTube, Flickr, Picasa, Photobucket, etc. for your submissions. 146 | 147 | By entering the Putter King Miniature Golf Scavenger Hunt, you allow Putter King to use or link to any of the pictures or videos you submit for advertisements and promotions. 148 | 149 | Don’t forget to use your imagination and creativity! 
150 | EOF 151 | tokens = ConfidentialInfoRedactor::Extractor.new.extract(text) 152 | expect(described_class.new(language: 'en', tokens: tokens).redact(text)).to eq("Putter King Miniature Golf Scavenger Hunt\n\n Putter King is hosting the Annual Miniature Golf Scavenger Hunt. So get out your putter and your camera and see if you have what it takes. Are you a King?\n\n The Official List: ) Autographs of professional miniature golfers, each from a different country. ( points; bonus points if the professional miniature golfers are also from different continents) ) Picture of yourself next to each obstacle in our list of the Top Nostalgic Miniature Golf Obstacles. ( points; bonus points for each obstacle that exactly matches the one pictured in the article) ) Build your own full-size miniature golf hole. ( points; up to bonus points available depending on the craftsmanship, playability, creativity and fun factor of your hole) ) Video of yourself making a hole-in-one on two consecutive miniature golf holes. The video must be one continuous shot with no editing. ( points) ) Picture of yourself with the Putter King mascot. ( points; bonus points if you are wearing a Putter King t-shirt) ) Picture of yourself with the completed Putter King wobblehead. ( points; bonus points if the picture is taken at a miniature golf course) ) Picture of a completed scorecard from a round of miniature golf. The round of golf must have taken place after the start of this scavenger hunt. ( points) ) Picture of completed scorecards from different miniature golf courses. Each round of golf must have taken place after the start of this scavenger hunt. ( points) ) Submit an entry to the Putter King Hole Design Contest. ( points; bonus points if your entry gets more than votes) ) Screenshot from the Putter King app showing a score below par. ( points) ) Screenshot from the Putter King app showing that you have successfully unlocked all of the holes in the game. 
( points) ) Picture of the Putter King wobblehead at a World Heritage Site. ( points) ) Complete and submit the Putter King ‘Practice Activity’ and ‘Final Project’ for any one of the Putter King math or physics lessons. ( points; bonus points if you complete two lessons) ) Picture of yourself with at least different colored miniature golf balls. ( points; bonus points for each additional color {limit of bonus points}) ) Picture of yourself with a famous golfer or miniature golfer. ( points; bonus points if the golfer is on the tour AND you are wearing a Putter King t-shirt in the picture) ) Video of yourself making a hole-in-one on a miniature golf hole with a loop-de-loop obstacle. ( points) ) Video of yourself successfully making a trick miniature golf shot. ( points; up to bonus points available depending on the difficulty and complexity of the trick shot)\n\n\n Prizes: Gift Card\n\n Putter King Scavenger Hunt Trophy\n ( Engraved Crystal Trophy - Picture Coming Soon)\n\n The Putter King team will judge the scavenger hunt and all decisions will be final. The U.S. Government is sponsoring it. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on .\n\n To enter the scavenger hunt, send an email to info AT putterking DOT com with the subject line: \"Putter King Scavenger Hunt Submission\". In the email please include links to the pictures and videos you are submitting. You can utilize free photo and video hosting sites such as , , , , etc. for your submissions.\n\n By entering the Putter King Miniature Golf Scavenger Hunt, you allow Putter King to use or link to any of the pictures or videos you submit for advertisements and promotions.\n\n Don’t forget to use your imagination and creativity!") 153 | end 154 | 155 | it 'redacts all confidential information from a text #003' do 156 | tokens = ['Coca-Cola', 'Pepsi', 'John Smith'] 157 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. 
Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 158 | expect(described_class.new(language: 'en', tokens: tokens).redact(text)).to eq(' announced a merger with that will happen on for . Please contact at or visit .') 159 | end 160 | 161 | it 'redacts all confidential information from a text #004' do 162 | tokens = ['Coca-Cola', 'Pepsi', 'John Smith'] 163 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 164 | expect(described_class.new(language: 'en', tokens: tokens, ignore_numbers: true).redact(text)).to eq(' announced a merger with that will happen on for $200,000,000,000. Please contact at or visit .') 165 | end 166 | 167 | it 'redacts all confidential information from a text #005' do 168 | tokens = ['Coca-Cola', 'Pepsi', 'John Smith'] 169 | text = 'Coca-Cola announced a merger with Pepsi that will happen on December 15th, 2020 for $200,000,000,000. Please contact John Smith at j.smith@example.com or visit http://www.super-fake-merger.com.' 170 | expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)).to eq('***** announced a merger with ***** that will happen on ^^redacted date^^ for **redacted number**. Please contact ***** at ***** or visit *****.') 171 | end 172 | 173 | it 'redacts all confidential information from a text #006' do 174 | tokens = ['CLA'] 175 | text = 'LEGAL DISCLAIMER - CLA will not be held reponsible for changes.' 
176 | expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****').redact(text)).to eq("LEGAL DISCLAIMER - ***** will not be held reponsible for changes.") 177 | end 178 | 179 | it 'redacts all confidential information from a text #007' do 180 | tokens = ['Trans'] 181 | text = 'My Transformation - avoid Trans.' 182 | expect(described_class.new(language: 'en', tokens: tokens, number_text: '**redacted number**', date_text: '^^redacted date^^', token_text: '*****', hyperlink_text: '*****', email_text: '*****').redact(text)).to eq('My Transformation - avoid *****.') 183 | end 184 | end 185 | end --------------------------------------------------------------------------------