├── scrubadub ├── filth │ ├── en_GB │ │ ├── __init__.py │ │ ├── tax_reference_number.py │ │ └── national_insurance_number.py │ ├── en_US │ │ ├── __init__.py │ │ └── social_security_number.py │ ├── drivers_licence.py │ ├── tagged.py │ ├── email.py │ ├── name.py │ ├── location.py │ ├── organization.py │ ├── vehicle_licence_plate.py │ ├── twitter.py │ ├── skype.py │ ├── postalcode.py │ ├── credit_card.py │ ├── __init__.py │ ├── url.py │ ├── credential.py │ ├── phone.py │ ├── date_of_birth.py │ └── address.py ├── .gitattributes ├── detectors │ ├── en_US │ │ ├── __init__.py │ │ └── social_security_number.py │ ├── en_GB │ │ ├── __init__.py │ │ ├── tax_reference_number.py │ │ └── national_insurance_number.py │ ├── drivers_licence.py │ ├── twitter.py │ ├── __init__.py │ ├── credential.py │ ├── vehicle_licence_plate.py │ ├── url.py │ ├── credit_card.py │ ├── phone.py │ ├── email.py │ ├── postalcode.py │ ├── catalogue.py │ ├── text_blob.py │ ├── user_supplied.py │ └── skype.py ├── post_processors │ ├── __init__.py │ ├── base.py │ ├── remover.py │ ├── catalogue.py │ ├── prefix_suffix.py │ └── filth_replacer.py ├── exceptions.py ├── utils.py └── __init__.py ├── .coveragerc ├── tests ├── example_real_data │ ├── known_pii.csv │ └── document.txt ├── test_filth_location.py ├── test_filth_organization.py ├── test_exceptions.py ├── test_detector_user_supplied.py ├── colors.py ├── benchmark_time.py ├── test_unicode.py ├── test_postprocessor.py ├── test_postprocessor_prefix_postfix_replacer.py ├── test_detector_configuration.py ├── test_detector_emails.py ├── test_detector_en_US_social_security_number.py ├── test_detector_text_blob.py ├── test_detector_en_GB_trn.py ├── run.py ├── test_postprocessor_configuration.py ├── test_detector_drivers_licence.py ├── test_detector_twitter.py ├── test_detector_en_GB_nino.py ├── test_api_advanced.py ├── test_detector_phone_numbers.py ├── test_detector_credentials.py ├── base.py ├── test_detector_urls.py ├── test_utils_canonical_string_set.py ├── 
test_api.py ├── test_detector_skype.py ├── test_detector_credit_card.py ├── test_locale.py ├── test_detector_postal_codes.py ├── test_detector.py ├── test_api_older.py ├── test_postprocessor_filth_replacer.py ├── test_filth.py ├── test_filth_address.py └── test_detector_date_of_birth.py ├── MANIFEST.in ├── requirements ├── python-readthedocs ├── python └── python-dev ├── setup.cfg ├── docs ├── post_processors.rst ├── api_scrubadub_filth.rst ├── api_scrubadub_comparison.rst ├── api_scrubadub.rst ├── api_scrubadub_post.rst ├── addresses.rst ├── index.rst ├── contributing.rst ├── names.rst └── localization.rst ├── .readthedocs.yml ├── azure-pipelines.yml ├── .github └── workflows │ ├── python-publish.yml │ └── unittests.yml ├── bin └── download_data.sh ├── .gitignore ├── design ├── basic_usage.py ├── customize_filth_detection.py ├── customize_replacement_strings.py └── customize_via_training.py ├── tox.ini ├── setup.py ├── README.rst └── CONTRIBUTING.md /scrubadub/filth/en_GB/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrubadub/filth/en_US/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrubadub/.gitattributes: -------------------------------------------------------------------------------- 1 | *.pickle filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /scrubadub/detectors/en_US/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .social_security_number import SocialSecurityNumberDetector 3 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 
| omit = 3 | */python?.?/* 4 | */site-packages/nose/* 5 | scrubadub/colors.py 6 | -------------------------------------------------------------------------------- /scrubadub/filth/drivers_licence.py: -------------------------------------------------------------------------------- 1 | from .base import Filth 2 | 3 | 4 | class DriversLicenceFilth(Filth): 5 | type = 'drivers_licence' 6 | -------------------------------------------------------------------------------- /scrubadub/filth/en_GB/tax_reference_number.py: -------------------------------------------------------------------------------- 1 | from scrubadub.filth.base import Filth 2 | 3 | 4 | class TaxReferenceNumberFilth(Filth): 5 | type = 'tax_reference_number' 6 | -------------------------------------------------------------------------------- /scrubadub/detectors/en_GB/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .national_insurance_number import NationalInsuranceNumberDetector 3 | from .tax_reference_number import TaxReferenceNumberDetector 4 | -------------------------------------------------------------------------------- /tests/example_real_data/known_pii.csv: -------------------------------------------------------------------------------- 1 | filth_type,match,match_end,limit 2 | address,123 The Street,England, 3 | phone,0775 2212 211,, 4 | email,mike@example.com,, 5 | name,Mike Johnson,, 6 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements/* 2 | include MANIFEST.in 3 | include README.rst 4 | include LICENSE 5 | recursive-include scrubadub/detectors/models *.json 6 | recursive-exclude * *.py[co] 7 | recursive-exclude * *~ 8 | recursive-exclude * *.orig 9 | -------------------------------------------------------------------------------- /requirements/python-readthedocs: 
-------------------------------------------------------------------------------- 1 | # install everything in the python requirements too. 2 | -r python 3 | 4 | # for documentation 5 | sphinx>=3 6 | sphinx_rtd_theme>=0.5 7 | 8 | # Needed for the docs 9 | scrubadub_address 10 | scrubadub_spacy 11 | scrubadub_stanford 12 | -------------------------------------------------------------------------------- /scrubadub/post_processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .catalogue import post_processor_catalogue, register_post_processor, remove_post_processor 2 | 3 | from .base import PostProcessor 4 | from .filth_replacer import FilthReplacer 5 | from .prefix_suffix import PrefixSuffixReplacer 6 | from .remover import FilthRemover 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | 4 | [pycodestyle] 5 | max-line-length = 120 6 | statistics = True 7 | 8 | [flake8] 9 | max-line-length = 120 10 | statistics = True 11 | per-file-ignores = 12 | # imported but unused 13 | __init__.py: F401 14 | 15 | [nosetests] 16 | with-coverage = 1 17 | cover-package = scrubadub 18 | -------------------------------------------------------------------------------- /scrubadub/filth/tagged.py: -------------------------------------------------------------------------------- 1 | from .base import Filth 2 | 3 | import typing 4 | 5 | 6 | class TaggedEvaluationFilth(Filth): 7 | type = 'tagged' 8 | 9 | def __init__(self, *args, comparison_type: typing.Optional[str] = None, **kwargs): 10 | super(TaggedEvaluationFilth, self).__init__(*args, **kwargs) 11 | self.comparison_type = comparison_type 12 | -------------------------------------------------------------------------------- /docs/post_processors.rst: 
-------------------------------------------------------------------------------- 1 | 2 | Post Processors 3 | =============== 4 | 5 | Post processors run in a certain order and do something to the detected ``Filth``\ s. 6 | You could use them to validate your filth, to save your filth into a lookup file, to record statistics on your found filth, to combine filth together, to remove the filth from the text or anything else you want really. 7 | 8 | -------------------------------------------------------------------------------- /tests/test_filth_location.py: -------------------------------------------------------------------------------- 1 | import faker 2 | import unittest 3 | 4 | from scrubadub.filth import LocationFilth 5 | 6 | class LocationFilthTestCase(unittest.TestCase): 7 | 8 | def test_generate(self): 9 | class Faker: 10 | def city(self): 11 | return 'Brianland' 12 | 13 | self.assertEqual( 14 | 'Brianland', 15 | LocationFilth.generate(faker=Faker()), 16 | ) 17 | -------------------------------------------------------------------------------- /tests/test_filth_organization.py: -------------------------------------------------------------------------------- 1 | import faker 2 | import unittest 3 | 4 | from scrubadub.filth import OrganizationFilth 5 | 6 | class OrganizationFilthTestCase(unittest.TestCase): 7 | 8 | def test_generate(self): 9 | class Faker: 10 | def company(self): 11 | return 'Brown-Lindsey' 12 | 13 | self.assertEqual( 14 | 'Brown-Lindsey', 15 | OrganizationFilth.generate(faker=Faker()), 16 | ) 17 | -------------------------------------------------------------------------------- /docs/api_scrubadub_filth.rst: -------------------------------------------------------------------------------- 1 | .. _api_scrubadub_filth: 2 | 3 | 4 | scrubadub.filth 5 | =============== 6 | 7 | Filth objects are responsible for marking particular sections of text as 8 | containing that type of filth. It is also responsible for knowing how it should 9 | be cleaned.
Every type of ``Filth`` inherits from ``scrubadub.filth.base.Filth``. 10 | 11 | .. autoclass:: scrubadub.filth.Filth 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally set the version of Python and requirements required to build your docs 13 | python: 14 | version: 3.8 15 | install: 16 | - requirements: requirements/python-dev 17 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scrubadub import exceptions 4 | 5 | 6 | class ExceptionsTestCase(unittest.TestCase): 7 | def test_render(self): 8 | exception = exceptions.ScrubadubException() 9 | exception.var = 'there' 10 | 11 | self.assertEquals(exception.render('test'), 'test') 12 | self.assertEquals(exception.render('url %(issues_url)s'), 'url ' + exception.issues_url) 13 | self.assertEquals(exception.render('hello %(var)s'), 'hello there') 14 | -------------------------------------------------------------------------------- /tests/example_real_data/document.txt: -------------------------------------------------------------------------------- 1 | 2 | This is is an example document that has been labelled in known_pii.csv. 
This document contains filthy personal 3 | infomation that we want to remove, such as an address for Mike Johnson: 4 | 5 | 123 The Street, 6 | London, 7 | E2 2AA, 8 | England 9 | 10 | or an example phone number 0775 2212 211 and email address mike@example.com, even if it is wildly capitalised MiKe@ExAmPlE.Com. 11 | 12 | benchmark_accuracy_real_data.py checks to see if the personal information can be found in this file. 13 | -------------------------------------------------------------------------------- /scrubadub/filth/email.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class EmailFilth(Filth): 7 | type = 'email' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.email() 19 | -------------------------------------------------------------------------------- /scrubadub/filth/name.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class NameFilth(Filth): 7 | type = 'name' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.name() 19 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Starter pipeline 2 | # Start with a minimal pipeline that you can customize to build and deploy your code. 3 | # Add steps that build, run tests, deploy, and more: 4 | # https://aka.ms/yaml 5 | 6 | trigger: 7 | - main 8 | 9 | pool: 10 | vmImage: 'ubuntu-latest' 11 | 12 | steps: 13 | - script: echo Hello, world! 14 | displayName: 'Run a one-line script' 15 | 16 | - script: | 17 | echo Add other tasks to build, test, and deploy your project. 18 | echo See https://aka.ms/yaml 19 | displayName: 'Run a multi-line script' 20 | -------------------------------------------------------------------------------- /scrubadub/filth/location.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class LocationFilth(Filth): 7 | type = 'location' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.city() 19 | -------------------------------------------------------------------------------- /scrubadub/post_processors/base.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | from ..filth import Filth 4 | 5 | 6 | class PostProcessor(object): 7 | name = 'post_processor' # type: str 8 | autoload = False # type: bool 9 | index = 10000 # type: int 10 | 11 | def __init__(self, name: Optional[str] = None): 12 | if name is not None: 13 | self.name = name 14 | 15 | def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]: 16 | raise NotImplementedError('must be overridden by base classes') 17 | -------------------------------------------------------------------------------- /scrubadub/filth/organization.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class OrganizationFilth(Filth): 7 | type = 'organization' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.company() 19 | -------------------------------------------------------------------------------- /docs/api_scrubadub_comparison.rst: -------------------------------------------------------------------------------- 1 | .. _api_scrubadub_comparison: 2 | 3 | 4 | scrubadub.comparison 5 | ==================== 6 | 7 | Filth objects are responsible for marking particular sections of text as 8 | containing that type of filth. 
It is also responsible for knowing how it should 9 | be cleaned. Every type of ``Filth`` inherits from ``scrubadub.filth.base.Filth``. 10 | 11 | .. autofunction:: scrubadub.comparison.get_filth_classification_report 12 | 13 | .. autofunction:: scrubadub.comparison.get_filth_dataframe 14 | 15 | .. autofunction:: scrubadub.comparison.make_fake_document 16 | -------------------------------------------------------------------------------- /scrubadub/filth/vehicle_licence_plate.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class VehicleLicencePlateFilth(Filth): 7 | type = 'vehicle_licence_plate' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.license_plate() 19 | -------------------------------------------------------------------------------- /requirements/python: -------------------------------------------------------------------------------- 1 | # For the TextBlobNameDetector 2 | textblob==0.15.3 3 | 4 | # For the PhoneDetector 5 | phonenumbers 6 | 7 | # For SSN, credit cards and TRN 8 | python-stdnum 9 | 10 | # For the DateOfBirthDetector, master version due to an unfixed bug... but which one?
11 | # Can't upload to PyPi with a dependency on a GH repo, so removed link to dateparser repo 12 | dateparser 13 | # @ git+https://github.com/scrapinghub/dateparser.git 14 | 15 | # For the detector/post-processor catalogues 16 | catalogue 17 | 18 | # For scrubadub.comparison 19 | scikit-learn 20 | 21 | typing_extensions 22 | faker 23 | -------------------------------------------------------------------------------- /scrubadub/filth/twitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from faker import Faker 4 | 5 | from .base import Filth 6 | 7 | 8 | class TwitterFilth(Filth): 9 | type = 'twitter' 10 | 11 | @staticmethod 12 | def generate(faker: Faker) -> str: 13 | """Generates an example of this ``Filth`` type, usually using the faker python library. 14 | 15 | :param faker: The ``Faker`` class from the ``faker`` library 16 | :type faker: Faker 17 | :return: An example of this ``Filth`` 18 | :rtype: str 19 | """ 20 | return '@' + re.sub(r'[^a-zA-Z0-9_]', '', faker.user_name())[:15] 21 | -------------------------------------------------------------------------------- /scrubadub/filth/en_GB/national_insurance_number.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from scrubadub.filth.base import Filth 4 | 5 | 6 | class NationalInsuranceNumberFilth(Filth): 7 | type = 'national_insurance_number' 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | return faker.ssn() 19 | -------------------------------------------------------------------------------- /requirements/python-dev: -------------------------------------------------------------------------------- 1 | # install everything in the python requirements too. 2 | -r python 3 | 4 | # needed to run the tests 5 | flake8 6 | coveralls 7 | nose 8 | mypy 9 | tox 10 | 11 | # for documentation 12 | sphinx>=3 13 | sphinx_rtd_theme>=0.5 14 | 15 | # This is for the tests/benchmark_accuracy_real_data.py script 16 | cchardet 17 | pandas 18 | click 19 | python-magic 20 | python-dotenv 21 | azure-storage-blob 22 | openpyxl 23 | tabulate 24 | pandas 25 | 26 | types-dateparser 27 | types-requests 28 | 29 | # needed for the tests/run.py script 30 | wasabi 31 | 32 | # Needed for the docs 33 | postal 34 | scrubadub_address 35 | scrubadub_spacy 36 | scrubadub_stanford 37 | -------------------------------------------------------------------------------- /scrubadub/filth/skype.py: -------------------------------------------------------------------------------- 1 | import re 2 | from faker import Faker 3 | 4 | from .base import Filth 5 | 6 | 7 | class SkypeFilth(Filth): 8 | type = 'skype' 9 | 10 | @staticmethod 11 | def generate(faker: Faker) -> str: 12 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
13 | 14 | :param faker: The ``Faker`` class from the ``faker`` library 15 | :type faker: Faker 16 | :return: An example of this ``Filth`` 17 | :rtype: str 18 | """ 19 | username = '' 20 | while len(username) < 5: 21 | username = re.sub(r'(^[^a-zA-Z])|[^a-zA-Z0-9_\-\,\.]', '', faker.user_name())[:31] 22 | return username 23 | -------------------------------------------------------------------------------- /scrubadub/filth/postalcode.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class PostalCodeFilth(Filth): 7 | type = "postalcode" 8 | 9 | @staticmethod 10 | def generate(faker: Faker) -> str: 11 | """Generates an example of this ``Filth`` type, usually using the faker python library. 12 | 13 | :param faker: The ``Faker`` class from the ``faker`` library 14 | :type faker: Faker 15 | :return: An example of this ``Filth`` 16 | :rtype: str 17 | """ 18 | # for en_US I expect we should pick between .zipcode() and .zipcode_plus4() 19 | # as postcode() for en_US only returns the 5 number zip code 20 | return faker.postcode() 21 | -------------------------------------------------------------------------------- /scrubadub/filth/credit_card.py: -------------------------------------------------------------------------------- 1 | import string 2 | import stdnum.luhn 3 | from faker import Faker 4 | 5 | from .base import Filth 6 | 7 | 8 | class CreditCardFilth(Filth): 9 | type = 'credit_card' 10 | 11 | @staticmethod 12 | def generate(faker: Faker) -> str: 13 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
14 | 15 | :param faker: The ``Faker`` class from the ``faker`` library 16 | :type faker: Faker 17 | :return: An example of this ``Filth`` 18 | :rtype: str 19 | """ 20 | return faker.credit_card_number() 21 | 22 | def is_valid(self) -> bool: 23 | return stdnum.luhn.is_valid(''.join(char for char in self.text if char in string.digits)) 24 | -------------------------------------------------------------------------------- /scrubadub/detectors/drivers_licence.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from .base import RegionLocalisedRegexDetector 5 | from ..filth import DriversLicenceFilth 6 | 7 | 8 | @register_detector 9 | class DriversLicenceDetector(RegionLocalisedRegexDetector): 10 | """Use regular expressions to detect UK driving licence numbers, 11 | Simple pattern matching, no checksum solution. 12 | """ 13 | 14 | name = 'drivers_licence' 15 | autoload = True 16 | filth_cls = DriversLicenceFilth 17 | 18 | region_regex = { 19 | # this regex is looking for UK driving licence numbers that follow a pattern, no checksum 20 | 'GB': re.compile(r'''([a-zA-Z9]{5}\s?)((?:\s*\d\s*){6}[a-zA-Z9]{2}\w{3})\s?(\d{2})''', re.IGNORECASE) 21 | } 22 | -------------------------------------------------------------------------------- /scrubadub/detectors/en_GB/tax_reference_number.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from scrubadub.detectors.base import RegionLocalisedRegexDetector 5 | from scrubadub.filth import TaxReferenceNumberFilth 6 | 7 | 8 | @register_detector 9 | class TaxReferenceNumberDetector(RegionLocalisedRegexDetector): 10 | """Use regular expressions to detect the UK PAYE temporary reference number (TRN), 11 | Simple pattern matching, no checksum solution. 
12 | """ 13 | 14 | name = 'tax_reference_number' 15 | autoload = True 16 | filth_cls = TaxReferenceNumberFilth 17 | # this regex is looking for NINO that does not begin with certain letters 18 | region_regex = { 19 | 'GB': re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE), 20 | } 21 | -------------------------------------------------------------------------------- /scrubadub/detectors/twitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from .base import RegexDetector 5 | from ..filth import TwitterFilth 6 | 7 | 8 | @register_detector 9 | class TwitterDetector(RegexDetector): 10 | """Use regular expression magic to remove twitter usernames from dirty 11 | dirty ``text``. 12 | """ 13 | filth_cls = TwitterFilth 14 | name = 'twitter' 15 | autoload = True 16 | 17 | # https://help.twitter.com/en/managing-your-account/twitter-username-rules#error 18 | # Twitter user names must be 15 or less charachtors and only contain a-zA-Z0-9_ 19 | # Twitter and admin are not allowed in user names 20 | # (? str: 12 | """Generates an example of this ``Filth`` type, usually using the faker python library. 13 | 14 | :param faker: The ``Faker`` class from the ``faker`` library 15 | :type faker: Faker 16 | :return: An example of this ``Filth`` 17 | :rtype: str 18 | """ 19 | ssn = '' 20 | if faker.locales == ['en_US']: 21 | while not stdnum.us.ssn.is_valid(ssn): 22 | ssn = faker.ssn() 23 | return faker.ssn() 24 | 25 | def is_valid(self) -> bool: 26 | return stdnum.us.ssn.is_valid(''.join(char for char in self.text if char not in '. 
-')) 27 | -------------------------------------------------------------------------------- /tests/test_detector_user_supplied.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import scrubadub 4 | 5 | class UserDefinedTestCase(unittest.TestCase): 6 | 7 | def test_simple(self): 8 | """test a simple matching""" 9 | 10 | test_str = 'this is a test string' 11 | detector = scrubadub.detectors.UserSuppliedFilthDetector([ 12 | {'match': 'test', 'filth_type': 'name'}, 13 | ]) 14 | 15 | matches = list(detector.iter_filth(test_str)) 16 | self.assertEqual(matches[0].beg, 10) 17 | self.assertEqual(matches[0].end, 14) 18 | 19 | def test_bad_filth(self): 20 | """test a simple matching""" 21 | 22 | test_str = 'this is a test string' 23 | detector = scrubadub.detectors.UserSuppliedFilthDetector([ 24 | {'match': 'test', 'filth_type': 'invalid_filth'}, 25 | ]) 26 | 27 | with self.assertRaises(KeyError): 28 | list(detector.iter_filth(test_str)) 29 | -------------------------------------------------------------------------------- /scrubadub/detectors/en_GB/national_insurance_number.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from scrubadub.detectors.base import RegionLocalisedRegexDetector 5 | from scrubadub.filth import NationalInsuranceNumberFilth 6 | 7 | 8 | @register_detector 9 | class NationalInsuranceNumberDetector(RegionLocalisedRegexDetector): 10 | """Use regular expressions to remove the GB National Insurance number (NINO), 11 | Simple pattern matching, no checksum solution. 
12 | """ 13 | name = 'national_insurance_number' 14 | autoload = True 15 | filth_cls = NationalInsuranceNumberFilth 16 | # this regex is looking for NINO that does not begin with certain letters 17 | region_regex = { 18 | 'GB': re.compile( 19 | r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]', 20 | re.IGNORECASE | re.VERBOSE 21 | ), 22 | } 23 | -------------------------------------------------------------------------------- /scrubadub/filth/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Filth, MergedFilth, RegexFilth 2 | from .address import AddressFilth 3 | from .credential import CredentialFilth 4 | from .credit_card import CreditCardFilth 5 | from .drivers_licence import DriversLicenceFilth 6 | from .email import EmailFilth 7 | from .tagged import TaggedEvaluationFilth 8 | from .location import LocationFilth 9 | from .name import NameFilth 10 | from .organization import OrganizationFilth 11 | from .phone import PhoneFilth 12 | from .postalcode import PostalCodeFilth 13 | from .skype import SkypeFilth 14 | from .twitter import TwitterFilth 15 | from .url import UrlFilth 16 | from .vehicle_licence_plate import VehicleLicencePlateFilth 17 | from .date_of_birth import DateOfBirthFilth 18 | from .en_GB.national_insurance_number import NationalInsuranceNumberFilth 19 | from .en_GB.tax_reference_number import TaxReferenceNumberFilth 20 | from .en_US.social_security_number import SocialSecurityNumberFilth 21 | -------------------------------------------------------------------------------- /scrubadub/filth/url.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | 3 | from .base import Filth 4 | 5 | 6 | class UrlFilth(Filth): 7 | type = 'url' 8 | 9 | # This allows you to keep the domain 10 | keep_domain = False 11 | 12 | # this can be used to customize the output, particularly when 13 | 
# keep_domain=True 14 | url_placeholder = type.upper() 15 | 16 | @property 17 | def placeholder(self): 18 | if self.keep_domain: 19 | return self.match.group('domain') + self.url_placeholder 20 | return self.url_placeholder 21 | 22 | @staticmethod 23 | def generate(faker: Faker) -> str: 24 | """Generates an example of this ``Filth`` type, usually using the faker python library. 25 | 26 | :param faker: The ``Faker`` class from the ``faker`` library 27 | :type faker: Faker 28 | :return: An example of this ``Filth`` 29 | :rtype: str 30 | """ 31 | return faker.url() 32 | -------------------------------------------------------------------------------- /tests/colors.py: -------------------------------------------------------------------------------- 1 | """Inspiration from 2 | https://github.com/fabric/fabric/blob/master/fabric/colors.py 3 | """ 4 | import re 5 | 6 | 7 | def _wrap_with(code, bold=False): 8 | def inner(text): 9 | c = code 10 | if bold: 11 | c = "1;%s" % c 12 | return "\033[%sm%s\033[0m" % (c, text) 13 | return inner 14 | 15 | red = _wrap_with('31') 16 | green = _wrap_with('32') 17 | yellow = _wrap_with('33') 18 | blue = _wrap_with('34') 19 | magenta = _wrap_with('35') 20 | cyan = _wrap_with('36') 21 | white = _wrap_with('37') 22 | 23 | bold_red = _wrap_with('31', True) 24 | bold_green = _wrap_with('32', True) 25 | bold_yellow = _wrap_with('33', True) 26 | bold_blue = _wrap_with('34', True) 27 | bold_magenta = _wrap_with('35', True) 28 | bold_cyan = _wrap_with('36', True) 29 | bold_white = _wrap_with('37', True) 30 | 31 | 32 | # regular expression to omit colorcodes 33 | def colorless(text): 34 | """Remove color from the text""" 35 | return re.sub("\033\[(1;)?[\d]+m", '', text) -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is 
def main():
    """Benchmark ``Scrubber.clean`` on a generated document and exit.

    A fake document (20 paragraphs, fixed seed) is cleaned 50 times with a
    default ``scrubadub.Scrubber``. The process exits with status 1 when the
    timed statement raises or when the average time per iteration is above
    0.1 seconds, and with status 0 otherwise.
    """
    statement = 'scrubber.clean(doc)'
    iterations = 50

    document, _ = make_fake_document(paragraphs=20, seed=1234)

    print("Timing '{}':".format(statement))
    benchmark = timeit.Timer(
        statement,
        setup='import scrubadub; scrubber = scrubadub.Scrubber()',
        globals={'doc': document},
    )

    try:
        elapsed = benchmark.timeit(number=iterations)
    except Exception:
        # Timer.print_exc shows the traceback relative to the timed snippet
        benchmark.print_exc()
        sys.exit(1)

    per_iteration = elapsed/iterations
    print("{: >8.4f}s total runtime".format(elapsed))
    print("{: >8.4f}s per iteration".format(per_iteration))

    if per_iteration > 0.1:
        print("Usual runtimes for the default set of detectors is 0.02s per iteration.")
        sys.exit(1)

    sys.exit(0)
#!/bin/bash

# This script downloads some test datasets and puts them into a layout that is
# convenient for testing the effectiveness of scrubadub.

# Stop at the first failing command (or unset variable) so that a failed
# download is never extracted and then deleted as if it had succeeded.
set -euo pipefail

# all of the data is unpacked in data/raw
bin_dir=$(dirname "$0")
project_root="${bin_dir}/.."
raw_dir="${project_root}/data/raw"
mkdir -p "${raw_dir}"

# enron
enron_archive="${project_root}/enron_mail_20150507.tgz"
echo 'downloading enron data...'
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the archive; --location follows redirects.
curl --fail --location --output "${enron_archive}" \
    https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz
echo 'extracting enron data...'
mkdir -p "${raw_dir}/enron"
tar xzf "${enron_archive}" -C "${raw_dir}/enron" --strip-components=1
rm "${enron_archive}"

# sms
sms_archive="${project_root}/smsspamcollection.zip"
echo 'downloading sms data...'
curl --fail --location --output "${sms_archive}" \
    https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
echo 'extracting sms data...'
unzip "${sms_archive}" -d "${raw_dir}/sms"
rm "${sms_archive}"
class CredentialFilth(Filth):
    """Filth for a username/password pair found by a single regex match.

    The placeholder keeps the text surrounding the credentials (e.g. the
    ``username:`` / ``password:`` labels) and swaps only the two captured
    values for their placeholders.
    """
    type = 'credential'

    # replacement text for the captured username and password respectively
    username_placeholder = 'USERNAME'
    password_placeholder = 'PASSWORD'

    @property
    def placeholder(self):
        """Matched text with the username and password groups replaced."""
        matched = self.match
        user_start, user_end = matched.span('username')
        password_start, _ = matched.span('password')
        source = matched.string

        pieces = [
            # everything from the start of the match up to the username
            source[matched.start():user_start],
            self.prefix, self.username_placeholder, self.suffix,
            # the text between the username and the password
            source[user_end:password_start],
            self.prefix, self.password_placeholder, self.suffix,
        ]
        return ''.join(pieces)

    # The prefix/suffix are already embedded in the placeholder above, which
    # is why the generic replace_with behaviour is overridden here.
    def replace_with(self, replace_with='placeholder', **kwargs):
        """Return the placeholder; any other ``replace_with`` is rejected.

        :raises exceptions.InvalidReplaceWith: if ``replace_with`` is anything
            other than ``'placeholder'``
        """
        if replace_with != 'placeholder':
            raise exceptions.InvalidReplaceWith(replace_with)
        return self.placeholder
.installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | /data 60 | .mypy_cache 61 | .env 62 | .idea/ 63 | ..bfg-report 64 | libpostal/ 65 | tests/code_point_uk_post_codes.zip 66 | .ipynb_checkpoints/ 67 | tests/output* 68 | -------------------------------------------------------------------------------- /design/basic_usage.py: -------------------------------------------------------------------------------- 1 | """This is the basic usage of the scrubadub module. It exposes three different 2 | methods for obfuscating personally identifiable information and uses high 3 | recall methods for identifying filth. Precision can be improved by further 4 | customization. 5 | """ 6 | 7 | import scrubadub 8 | 9 | # this should have very smart defaults, with high recall and relatively low 10 | # precision. the placeholder method is default and uses {{}} notation to 11 | # signify when text has been obfuscated 12 | clean_text = scrubadub.clean(text) 13 | clean_text = scrubadub.clean(text, replace_with="placeholder") 14 | 15 | # the surrogate replacement method makes it easy to replace phone numbers with 16 | # fake phone numbers, for example. 
@register_detector
class CredentialDetector(RegexDetector):
    """Remove username/password combinations from dirty dirty ``text``.

    The regular expression exposes two named groups, ``username`` and
    ``password``; ``CredentialFilth.placeholder`` looks the match spans up by
    exactly those names, so they must not be renamed.
    """
    filth_cls = CredentialFilth
    name = 'credential'
    autoload = True

    # this regular expression searches for patterns like
    #    "username: root password: root"
    # that tend to occur very frequently in text. This does not currently catch
    # things like "username / password is root / root"
    regex = re.compile(r'''
        (username|login|u:)\s*:?\s*    # username might have : and whitespace
        (?P<username>[\w\-\.@+]*)      # capture the username for replacement
        \s+                            # some whitespace between
        (password|pw|p:)\s*:?\s*       # password might have : and whitespace
        (?P<password>.*)               # password can be anything until EOL
    ''', re.MULTILINE | re.VERBOSE | re.IGNORECASE)
class PhoneFilth(Filth):
    """Filth type for phone numbers."""
    type = 'phone'

    @staticmethod
    def generate(faker: Faker) -> str:
        """Generates an example of this ``Filth`` type, usually using the faker python library.

        Candidates are drawn from ``faker`` until the ``phonenumbers``
        package finds at least one phone number in the candidate for the
        region of the faker's first locale.

        :param faker: The ``Faker`` class from the ``faker`` library
        :type faker: Faker
        :return: An example of this ``Filth``
        :rtype: str
        """
        _, region = utils.locale_split(faker._locales[0])
        while True:
            # Faker only fills in a format such as (###)###-####; anything
            # from an "x" onwards (an extension) is dropped.
            candidate = re.sub(r'x.*$', '', faker.phone_number())
            # phonenumbers applies the real numbering rules (area codes etc.),
            # so keep drawing until a candidate passes.
            matches = list(phonenumbers.PhoneNumberMatcher(candidate, region))  # type: List[phonenumbers.PhoneNumberMatch]
            if len(matches) >= 1:
                return candidate
@register_detector
class UrlDetector(RegexDetector):
    """Use regular expressions to remove URLs that begin with ``http://``,
    ``https://`` or ``www.`` from dirty dirty ``text``.

    With ``keep_domain=True``, this detector only obfuscates the path on a
    URL, not its domain. For example,
    ``http://twitter.com/someone/status/234978haoin`` becomes
    ``http://twitter.com/{{replacement}}``.
    """
    filth_cls = UrlFilth
    name = 'url'
    autoload = True

    # This regular expression captures the domain name and the path
    # separately, which is useful for keeping the domain name but sanitizing
    # the path altogether. ``UrlFilth.placeholder`` reads the ``domain`` group
    # by name, so that name must stay in sync with the filth class.
    # NOTE(review): the second group is named ``path`` to match the comments
    # here; confirm no other consumer expects a different name.
    regex = re.compile(r'''
        (?P<domain>
            (https?:\/\/(www\.)?|www\.)          # protocol http://, etc
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
            /?                                   # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
        )
    ''', re.VERBOSE)
5 | 6 | [tox] 7 | envlist = py39, py38, py37, py36 8 | 9 | [testenv] 10 | allowlist_externals = bash 11 | skip_install = False 12 | setenv = 13 | PIP_INDEX_URL = {env:PIP_INDEX_URL} 14 | LD_LIBRARY_PATH = {env:LD_LIBRARY_PATH} 15 | LIBRARY_PATH = {env:LIBRARY_PATH} 16 | C_INCLUDE_PATH = {env:C_INCLUDE_PATH} 17 | CPP_INCLUDE_PATH = {env:CPP_INCLUDE_PATH} 18 | commands = 19 | pip install --upgrade pip wheel setuptools 20 | pip install -r requirements/python-dev 21 | python3 -c "import nltk; nltk.download('punkt')" 22 | bash -c "python3 -m spacy info | grep Pipelines | grep -qv en_core_web_trf && python -m spacy download en_core_web_trf || exit 0" 23 | bash -c "python3 -m spacy info | grep Pipelines | grep -qv en_core_web_sm && python -m spacy download en_core_web_sm || exit 0" 24 | bash -c "python3 -m spacy info | grep Pipelines | grep -qv fr_core_news_lg && python -m spacy download fr_core_news_lg || exit 0" 25 | bash -c "python3 -m spacy info | grep Pipelines | grep -qv de_core_news_sm && python -m spacy download de_core_news_sm || exit 0" 26 | python tests/run.py 27 | -------------------------------------------------------------------------------- /design/customize_filth_detection.py: -------------------------------------------------------------------------------- 1 | """scrubadub has some very conservative defaults (high recall) for identifying 2 | filth. One of the key ways in which scrubadub can be customized is in improving 3 | the precision of filth detection. 4 | 5 | For example, if a user knows that the word 'iPhone' is not a person's name, but 6 | a product, then a user should be able to easily adapt how scrubadub identifies 7 | names. 
# fine-tune how scrubadub detects names and omit product names
# https://github.com/deanmalmgren/scrubadub/issues/6
class MyNameDetector(scrubadub.detectors.TextBlobNameDetector):
    """Name detector that does not report the product name 'iPhone'."""

    def iter_filth(self, text):
        """Yield the parent detector's filth, skipping the text 'iPhone'.

        :param text: The document to search for names
        :return: Generator over the filth found by the parent detector whose
            matched text is not ``"iPhone"``
        """
        for filth in super(MyNameDetector, self).iter_filth(text):
            # Compare the matched text, not the Filth object itself: an
            # object-to-string comparison would let every filth through.
            if filth.text != "iPhone":
                yield filth
autofunction:: scrubadub.list_filth 26 | 27 | scrubadub.list_filth_documents 28 | ------------------------------ 29 | 30 | .. autofunction:: scrubadub.list_filth_documents 31 | 32 | 33 | scrubadub.Scrubber 34 | ------------------ 35 | 36 | All of the ``Detector``'s are managed by the ``Scrubber``. The main job of the 37 | ``Scrubber`` is to handle situations in which the same section of text contains 38 | different types of ``Filth``. 39 | 40 | .. autoclass:: scrubadub.scrubbers.Scrubber 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | -------------------------------------------------------------------------------- /tests/test_detector_emails.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from base import BaseTestCase 4 | 5 | 6 | class EmailTestCase(unittest.TestCase, BaseTestCase): 7 | 8 | def test_john_gmail(self): 9 | """ 10 | BEFORE: My email is john@gmail.com 11 | AFTER: My email is {{EMAIL}} 12 | """ 13 | self.compare_before_after() 14 | 15 | def test_John_gmail(self): 16 | """ 17 | BEFORE: My email is John@gmail.com 18 | AFTER: My email is {{EMAIL}} 19 | """ 20 | self.compare_before_after() 21 | 22 | def test_John1_example_com(self): 23 | """ 24 | BEFORE: My email is John1@example.com 25 | AFTER: My email is {{EMAIL}} 26 | """ 27 | self.compare_before_after() 28 | 29 | def test_adam_example_info(self): 30 | """ 31 | BEFORE: My email is adam80@example.info 32 | AFTER: My email is {{EMAIL}} 33 | """ 34 | self.compare_before_after() 35 | 36 | def test_uppercase(self): 37 | """ 38 | BEFORE: My email is HELLO@EXAMPLE.COM 39 | AFTER: My email is {{EMAIL}} 40 | """ 41 | self.compare_before_after() 42 | 43 | def test_fancy_john_gmail(self): 44 | """ 45 | BEFORE: My email is john at gmail.com 46 | AFTER: My email is {{EMAIL}} 47 | """ 48 | self.compare_before_after() 49 | -------------------------------------------------------------------------------- 
/tests/test_detector_en_US_social_security_number.py: -------------------------------------------------------------------------------- 1 | import faker 2 | import unittest 3 | from scrubadub.filth import SocialSecurityNumberFilth 4 | 5 | from base import BaseTestCase 6 | 7 | 8 | class SSNTestCase(unittest.TestCase, BaseTestCase): 9 | 10 | def test_example(self): 11 | """ 12 | BEFORE: My social security number is 726-60-2033 13 | AFTER: My social security number is {{SOCIAL_SECURITY_NUMBER}} 14 | """ 15 | self.compare_before_after() 16 | 17 | def test_hyphens(self): 18 | """ 19 | BEFORE: My social security number is 109-99-6000 20 | AFTER: My social security number is {{SOCIAL_SECURITY_NUMBER}} 21 | """ 22 | self.compare_before_after() 23 | 24 | def test_dots(self): 25 | """ 26 | BEFORE: My social security number is 109.99.6000 27 | AFTER: My social security number is {{SOCIAL_SECURITY_NUMBER}} 28 | """ 29 | self.compare_before_after() 30 | 31 | def test_spaces(self): 32 | """ 33 | BEFORE: My social security number is 109 99 6000 34 | AFTER: My social security number is {{SOCIAL_SECURITY_NUMBER}} 35 | """ 36 | self.compare_before_after() 37 | 38 | def test_generate(self): 39 | fake = faker.Faker('en_US') 40 | faker.Faker.seed(4321) 41 | 42 | self.assertEqual( 43 | '818-09-2900', 44 | SocialSecurityNumberFilth.generate(faker=fake), 45 | ) 46 | -------------------------------------------------------------------------------- /docs/api_scrubadub_post.rst: -------------------------------------------------------------------------------- 1 | .. _api_scrubadub_post: 2 | 3 | scrubadub.post_processors 4 | ========================= 5 | 6 | ``PostProcessor``\ s generally can be used to process the detected ``Filth`` 7 | objects and make changes to them. 8 | 9 | These are a new addition to scrubadub and at the moment only simple ones 10 | exist that alter the replacement string. 11 | 12 | .. 
autoclass:: scrubadub.post_processors.base.PostProcessor 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | .. autoclass:: scrubadub.post_processors.filth_replacer.FilthReplacer 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | .. autoclass:: scrubadub.post_processors.prefix_suffix.PrefixSuffixReplacer 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | .. autoclass:: scrubadub.post_processors.remover.FilthRemover 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | 33 | Catalogue functions 34 | ------------------- 35 | 36 | .. _scrubadub.post_processors.register_post_processor: 37 | 38 | scrubadub.post_processors.register_post_processor 39 | ------------------------------------------------- 40 | 41 | .. autofunction:: scrubadub.post_processors.register_post_processor 42 | 43 | .. _scrubadub.post_processors.remove_post_processor: 44 | 45 | scrubadub.post_processors.remove_post_processor 46 | ----------------------------------------------- 47 | 48 | .. autofunction:: scrubadub.post_processors.remove_post_processor 49 | 50 | -------------------------------------------------------------------------------- /scrubadub/filth/date_of_birth.py: -------------------------------------------------------------------------------- 1 | import random 2 | import datetime 3 | import dateparser 4 | from faker import Faker 5 | 6 | from .base import Filth 7 | 8 | 9 | class DateOfBirthFilth(Filth): 10 | type = 'date_of_birth' 11 | min_age_years = 18 12 | max_age_years = 100 13 | 14 | @staticmethod 15 | def generate(faker: Faker) -> str: 16 | """Generates an example of this ``Filth`` type, usually using the faker python library. 
# Root of the scrubadub exception hierarchy: catching this one class is
# enough to suppress every exception that scrubadub itself raises.
class ScrubadubException(Exception):
    """Base class for all scrubadub exceptions."""

    def __init__(self, *args, **kwargs):
        # Stored on the instance so that render() can interpolate it.
        self.issues_url = 'http://github.com/LeapBeyond/scrubadub/issues'

    def render(self, msg):
        """Fill ``%(name)s`` fields in ``msg`` from the instance attributes."""
        attributes = vars(self)
        return msg % attributes


class UnicodeRequired(ScrubadubException):
    """Raised when scrubadub is handed something other than unicode text.

    The message is meant to point users towards working with unicode.
    """

    def __str__(self):
        message = (
            'scrubadub works best with unicode.\n'
            'Frustrated by unicode?\n'
            'Yeah, me too.\n'
            'But unicode sandwiches are awesome.\n'
            'http://bit.ly/unipain @nedbat\n'
        )
        return self.render(message)


class UnexpectedFilth(ScrubadubException):
    pass


class FilthMergeError(ScrubadubException):
    pass


class InvalidReplaceWith(ScrubadubException):
    """Raised for a ``replace_with`` value that is not supported."""

    def __init__(self, replace_with):
        super().__init__()
        # Kept as an attribute so that __str__ can show the offending value.
        self.replace_with = replace_with

    def __str__(self):
        message = (
            'Invalid replace_with parameter %(replace_with)s. Can only use '
            '`placeholder` for the time being. If you have other ideas for '
            'replace_with functionality, please make a suggestion at '
            '%(issues_url)s'
        )
        return self.render(message)
@register_detector
class CreditCardDetector(RegexDetector):
    """Remove credit-card numbers from dirty dirty ``text``.

    Supports Visa, MasterCard, American Express, Diners Club, Discover and
    JCB.

    Only unbroken runs of digits are recognised, and the number has to be
    preceded by a whitespace character, so a number at the very start of the
    text is not matched.
    """
    name = 'credit_card'
    filth_cls = CreditCardFilth
    autoload = True

    # Regexes from:
    # http://www.regular-expressions.info/creditcard.html

    # Fake card numbers from:
    # https://www.paypalobjects.com/en_US/vhelp/paypalmanager_help/credit_card_numbers.htm

    # taken from the alphagov fork of scrubadub: https://github.com/alphagov/scrubadub

    # Looking at wikipedia, there are probably more numbers to detect:
    # https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_.28IIN.29

    # TODO: regex doesn't match if the credit card number has spaces/dashes in

    # The pattern is assembled from adjacent raw strings, one issuer per
    # alternative. The lookbehind demands whitespace immediately before the
    # number; nothing is asserted about what follows the last digit.
    regex = re.compile((
        r"(?<=\s)"
        r"(?:4[0-9]{12}(?:[0-9]{3})?"  # Visa
        r"|(?:5[1-5][0-9]{2}"  # MasterCard
        r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}"
        r"|3[47][0-9]{13}"  # American Express
        r"|3(?:0[0-5]|[68][0-9])[0-9]{11}"  # Diners Club
        r"|6(?:011|5[0-9]{2})[0-9]{12}"  # Discover
        r"|(?:2131|1800|35\d{3})\d{11})"  # JCB
    ), re.VERBOSE)
def run_test(command, directory):
    """Execute a shell command that runs a test and report the outcome.

    :param command: Shell snippet to execute (it may use shell syntax such
        as ``if ... ; then ... ; fi``, hence ``shell=True``).
    :param directory: Working directory in which the command is run.
    :return: The exit status of the command; ``0`` means the test passed.
    """
    msg.text("RUNNING " + command)
    # Use ``cwd`` rather than prefixing the command with ``cd <dir> &&`` so
    # that directories containing spaces or shell metacharacters work.
    completed = subprocess.run(command, shell=True, cwd=directory)
    if completed.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    msg.text('')
    return completed.returncode
self.assertEqual(7927, NewPostProcessor.index) 27 | self.assertEqual(scrubadub.post_processors.catalogue.post_processor_catalogue.get(NewPostProcessor.name), NewPostProcessor) 28 | 29 | scrubadub.post_processors.catalogue.remove_post_processor(NewPostProcessor) 30 | -------------------------------------------------------------------------------- /tests/test_detector_drivers_licence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from base import BaseTestCase 4 | 5 | 6 | class GBDriversTestCase(unittest.TestCase, BaseTestCase): 7 | 8 | def test_gbdrivers_1(self): 9 | """ 10 | BEFORE: The driving licence number of the claimant is MORGA753116SM91J 01, and a copy of the licence is attached. 11 | AFTER: The driving licence number of the claimant is {{DRIVERS_LICENCE}}, and a copy of the licence is attached. 12 | """ 13 | self.compare_before_after(locale='en_GB') 14 | 15 | def test_gbdrivers_2(self): 16 | """ 17 | BEFORE: My DVLA NO is MORGA 753116SM91J 01 could you please check. 18 | AFTER: My DVLA NO is {{DRIVERS_LICENCE}} could you please check. 19 | """ 20 | self.compare_before_after(locale='en_GB') 21 | 22 | def test_gbdrivers_3(self): 23 | """ 24 | BEFORE: My DVLA NO is MORGA753116SM91J01 could you please check. 25 | AFTER: My DVLA NO is {{DRIVERS_LICENCE}} could you please check. 26 | """ 27 | self.compare_before_after(locale='en_GB') 28 | 29 | def test_gbdrivers_4(self): 30 | """ 31 | BEFORE: My DVLA NO is MORGA 753 116 SM91J 01 could you please check. 32 | AFTER: My DVLA NO is {{DRIVERS_LICENCE}} could you please check. 33 | """ 34 | self.compare_before_after(locale='en_GB') 35 | 36 | def test_gbdrivers_5(self): 37 | """ 38 | BEFORE: My DVLA NO is MORGA 753116 SM91J01 could you please check. 39 | AFTER: My DVLA NO is {{DRIVERS_LICENCE}} could you please check. 
class EmailTestCase(unittest.TestCase, BaseTestCase):
    # NOTE(review): despite its name, every case in this class expects
    # Twitter handles to be handled; the name looks like a leftover from
    # the email tests - confirm before renaming.
    #
    # The docstring of each test is its fixture: compare_before_after()
    # reads the two labelled lines from it, so the docstrings must not be
    # reworded.

    def test_email_and_twitter(self):
        """
        BEFORE: My email is john@gmail.com and i tweet at @john_gmail
        AFTER: My email is {{EMAIL}} and i tweet at {{TWITTER}}
        """
        self.compare_before_after()

    def test_capitalise(self):
        """
        BEFORE: My tweeter is @John_gmail
        AFTER: My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    # The next two fixtures expect the text to come back unchanged, i.e.
    # these handles are expected not to be reported as filth.
    def test_twitter(self):
        """
        BEFORE: This is an invalid handle @TwitterInfo
        AFTER: This is an invalid handle @TwitterInfo
        """
        self.compare_before_after()

    def test_admin(self):
        """
        BEFORE: This is an invalid handle @XYZAdminInfo
        AFTER: This is an invalid handle @XYZAdminInfo
        """
        self.compare_before_after()

    def test_uppercase(self):
        """
        BEFORE: My tweeter is @JOHN_JOHN123
        AFTER: My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    # Leading and trailing underscores are expected to be part of the handle.
    def test_underscore(self):
        """
        BEFORE: My tweeter is @_JOHN_JOHN123
        AFTER: My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    def test_underscores(self):
        """
        BEFORE: My tweeter is @_JOHN_JOHN123_
        AFTER: My tweeter is {{TWITTER}}
        """
        self.compare_before_after()
6 | We use the `pyap `_ package to detect addresses and `libpostal `_ to verify them. 7 | This is implemented in ``scrubadub_address.address.AddressDetector``, which is in a separate package and not enabled by default due to its dependencies on these two libraries. 8 | We currently support British, American and Canadian addresses. 9 | 10 | Installation 11 | ------------ 12 | 13 | First libpostal needs to be installed. 14 | Full instructions can be found in the `libpostal documentation `_, but a summary is given below for linux installation: 15 | 16 | .. code-block:: console 17 | 18 | $ sudo apt-get install curl autoconf automake libtool pkg-config 19 | $ git clone https://github.com/openvenues/libpostal 20 | $ cd libpostal 21 | $ ./bootstrap.sh 22 | $ ./configure --prefix=/usr/local/ 23 | $ make -j4 24 | $ sudo make install 25 | 26 | Once you have installed libpostal, the remaining python dependencies can be installed: 27 | 28 | .. code-block:: console 29 | 30 | $ pip install pypostal scrubadub_address 31 | 32 | Usage 33 | ----- 34 | 35 | Once the dependencies are installed you can import the detector and add it to your ``Scrubber`` as shown below: 36 | 37 | .. code-block:: pycon 38 | 39 | >>> import scrubadub, scrubadub_address 40 | >>> scrubber = scrubadub.Scrubber() 41 | >>> scrubber.add_detector(scrubadub_address.detectors.AddressDetector) 42 | >>> scrubber.clean("I live at 6919 Bell Drives, East Jessicastad, MO 76908") 43 | 'I live at {{ADDRESS}}' 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _quick_start: 2 | 3 | .. include:: ../README.rst 4 | 5 | Related work 6 | ------------ 7 | 8 | ``scrubadub`` isn't the first package to attempt to remove personally 9 | identifiable information from free text. 
There are a handful of other 10 | projects out there that have very similar aims and which provide some 11 | inspiration for how ``scrubadub`` should work. 12 | 13 | - `MITRE `__ gives the ability to 14 | replace names with a placeholder like ``[NAME]`` or alternatively 15 | replace names with fake names. last release in 8/2014. not on github. 16 | unclear what language although it looks like python. it is clear that 17 | the documentation sucks and is primarily intended for academic 18 | audiences (docs are in papers). 19 | 20 | - `physionet has a few deidentification 21 | packages `__ 22 | that look pretty decent but are both written in perl and require 23 | advance knowledge of what you are trying to replace. Intended for 24 | HIPAA regulations. In particular, 25 | `deid `__ has some good 26 | lists of names that might be useful in spite of the fact it has 5k+ 27 | lines of gross perl. 28 | 29 | 30 | Contents 31 | -------- 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: Documentation 36 | 37 | Introduction 38 | usage 39 | accuracy 40 | names 41 | addresses 42 | creating_detectors 43 | localization 44 | contributing 45 | changelog 46 | 47 | .. toctree:: 48 | :maxdepth: 2 49 | :name: api_toc 50 | :caption: API Reference 51 | 52 | api_scrubadub 53 | api_scrubadub_detectors 54 | api_scrubadub_filth 55 | api_scrubadub_post 56 | api_scrubadub_comparison 57 | 58 | 59 | Indices and tables 60 | ------------------ 61 | 62 | * :ref:`genindex` 63 | * :ref:`modindex` 64 | * :ref:`search` 65 | -------------------------------------------------------------------------------- /design/customize_replacement_strings.py: -------------------------------------------------------------------------------- 1 | """scrubadub uses {{}} notation by default to identify filth, but a user may 2 | prefer to fine-tune how the filth is removed. 
3 | 4 | For example, if the input text is html, then a user may want the filth to be 5 | included in a tag that has a particular class on it to make it easy to 6 | style these things. 7 | 8 | Another example is a situation when a user wants to retain the domain name on a 9 | URL but not the path. 10 | """ 11 | 12 | import scrubadub 13 | 14 | # fine tune the prefix and suffix for all scrubadub objects. because this is 15 | # changing a class attribute on the base class, this should propagate to all 16 | # filth 17 | scrubadub.filth.Filth.prefix = '' 18 | scrubadub.filth.Filth.suffix = '' 19 | 20 | # these methods should now all have that prefix and suffix 21 | clean_text = scrubadub.clean(text) 22 | clean_text = scrubadub.clean(text, replace_with="placeholder") 23 | clean_text = scrubadub.clean(text, replace_with="surrogate") 24 | clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup) 25 | 26 | # and so should these 27 | scrubber = scrubadub.Scrubber() 28 | clean_text = scrubber.clean(text) 29 | clean_text = scrubber.clean(text, replace_with="placeholder") 30 | clean_text = scrubber.clean(text, replace_with="surrogate") 31 | clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup) 32 | 33 | 34 | # reconfigure back to the old prefix and suffix combination and now keep the 35 | # domain on UrlFilth 36 | scrubadub.filth.Filth.prefix = '{{' 37 | scrubadub.filth.Filth.suffix = '}}' 38 | scrubadub.filth.UrlFilth.keep_domain = True 39 | 40 | # these methods should now all have that prefix and suffix 41 | clean_text = scrubadub.clean(text) 42 | clean_text = scrubadub.clean(text, replace_with="placeholder") 43 | clean_text = scrubadub.clean(text, replace_with="surrogate") 44 | clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup) 45 | -------------------------------------------------------------------------------- /tests/test_detector_en_GB_nino.py: 
class GBNinoTestCase(unittest.TestCase, BaseTestCase):
    # Each docstring below is a fixture read by compare_before_after(); all
    # of them are cleaned with the 'en_GB' locale and must not be reworded.

    def test_nino_1(self):
        """
        BEFORE: My NI number is AZ 12 34 56 A
        AFTER: My NI number is {{NATIONAL_INSURANCE_NUMBER}}
        """
        self.compare_before_after(locale='en_GB')

    def test_nino_2(self):
        """
        BEFORE: Enter a National Insurance number that is 2 letters, 6 numbers, then A, B, C or D, like AZ123456A.
        AFTER: Enter a National Insurance number that is 2 letters, 6 numbers, then A, B, C or D, like {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    def test_nino_3(self):
        """
        BEFORE: It’s on your National Insurance card, benefit letter, payslip or P60. For example, AZ 12 34 56 A.
        AFTER: It’s on your National Insurance card, benefit letter, payslip or P60. For example, {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    def test_nino_4(self):
        """
        BEFORE: Please verify the NI AZ 123456 A.
        AFTER: Please verify the NI {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    def test_nino_5(self):
        """
        BEFORE: The number is AZ 123 456 A.
        AFTER: The number is {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    def test_generate(self):
        # Minimal stand-in for a faker instance: only ``ssn`` is provided
        # and it returns a fixed value, so the assertion below checks that
        # generate() hands back exactly what ``ssn()`` produced.
        class Faker:
            def ssn(self):
                return 'ZZ061251T'

        self.assertEqual(
            'ZZ061251T',
            NationalInsuranceNumberFilth.generate(faker=Faker()),
        )
class PhoneNumberTestCase(unittest.TestCase, BaseTestCase):

    def create_docstring(self, phone_number):
        # Build a fixture string in the same labelled two-line format that
        # the other tests keep in their docstrings, with ``phone_number``
        # substituted into the input line.
        return """
        BEFORE: My phone number is %s
        AFTER: My phone number is {{PHONE}}
        """ % phone_number

    def check_phone_numbers(self, *phone_numbers):
        # Run one comparison per number; the fixture is passed explicitly,
        # so no docstring lookup on the call stack is needed.
        for phone_number in phone_numbers:
            self.compare_before_after(
                docstring=self.create_docstring(phone_number),
            )

    def test_american_phone_number(self):
        """test american-style phone numbers"""
        self.check_phone_numbers(
            '1-312-515-2239',
            '+1-312-515-2239',
            '1 (312) 515-2239',
            '312-515-2239',
            '(312) 515-2239',
            '(312)515-2239',
        )

    def test_extension_phone_numbers(self):
        """test phone numbers with extensions"""
        self.check_phone_numbers(
            '312-515-2239 x12',
            '312-515-2239 ext. 12',
            '312-515-2239 ext.12',
        )

    def test_international_phone_numbers(self):
        """test international phone numbers"""
        self.check_phone_numbers(
            '+47 21 30 85 99',
            '+45 69 19 88 56',
            '+46 852 503 499',
            '+31 619 837 236',
            '+86 135 3727 4136',
            '+61267881324',
        )

    def test_multiple_phone_numbers(self):
        # Two phone numbers in one sentence must both be replaced.
        # NOTE(review): the previous comment here said that cleaning
        # 'reached at 312.714.8142' produced '{{EMAIL}}' and pointed to an
        # unnamed issue; that phrase is not part of this test's input, so
        # the remark appears stale - confirm and link the issue if it is
        # still relevant.
        result = self.clean(
            u'Call me on my cell 312.714.8142 or in my office 773.415.7432'
        )
        self.assertEqual(
            result,
            u'Call me on my cell {{PHONE}} or in my office {{PHONE}}',
            'problem with multiple phone numbers: \n %s' % result,
        )
class BaseTestCase(object):
    """Mixin that centralizes the core helpers of the test suite.

    It is meant to be combined with ``unittest.TestCase`` (``check_equal``
    relies on ``assertEqual``). Tests describe their input and expected
    output in their docstring using two labelled sections, and
    ``compare_before_after`` does the rest.
    """

    def clean(self, text, locale='en_US', **kwargs):
        """Clean ``text`` with ``scrubadub.clean`` and return the result.

        When ``replace_with`` is given, the shared ``Filth.lookup`` is reset
        first so that identifiers start from a fresh state in every test.
        """
        if 'replace_with' in kwargs:
            scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
        return scrubadub.clean(text, locale=locale, **kwargs)

    def get_before_after(self, docstring=None):
        """Return the ``(before, after)`` pair described by a fixture.

        If ``docstring`` is not given, walk up the call stack and use the
        docstring of the first calling method that contains both section
        labels.

        :param docstring: Optional fixture text to parse instead of
            searching the call stack.
        :return: Tuple of the stripped input text and stripped expected text.
        :raises ValueError: If no fixture could be found on the call stack.
        """
        if docstring is None:
            for frame in inspect.stack():
                calling_function_name = frame[3]
                # Frames may belong to functions that are not attributes of
                # this object (e.g. the test runner), and methods may have
                # no docstring at all; skip both instead of crashing.
                caller = getattr(self, calling_function_name, None)
                _docstring = getattr(caller, '__doc__', None)
                if not _docstring:
                    continue
                if "BEFORE:" in _docstring and "AFTER:" in _docstring:
                    docstring = _docstring
                    break
        if docstring is None:
            raise ValueError(
                'Could not find a docstring with BEFORE and AFTER sections '
                'in any calling method'
            )
        before, after = docstring.split("BEFORE:")[1].split("AFTER:")
        return str(before.strip()), str(after.strip())

    def check_equal(self, expected, actual):
        """Assert ``actual == expected`` with a readable failure message."""
        self.assertEqual(
            actual,
            expected,
            '\nEXPECTED:\n"%s"\n\nBUT GOT THIS:\n"%s"'%(expected, actual),
        )

    def compare_before_after(self, docstring=None, locale='en_US', **clean_kwargs):
        """Clean the fixture's input text and compare it with the expected
        text.

        The fixture is taken from ``docstring`` or, if omitted, from the
        docstring of the calling test method. Extra keyword arguments are
        forwarded to ``clean``.
        """
        before, after = self.get_before_after(docstring=docstring)
        self.check_equal(after, self.clean(before, locale=locale, **clean_kwargs))
    def iter_filth(self, text: str, document_name: Optional[str] = None):
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        # The matcher is run over the whole text for ``self.region``; every
        # match it reports becomes one PhoneFilth, positioned by the match's
        # start/end offsets and carrying the matched raw string.
        for match in phonenumbers.PhoneNumberMatcher(text, self.region):
            yield PhoneFilth(
                beg=match.start,
                end=match.end,
                text=match.raw_string,
                detector_name=self.name,
                document_name=document_name,
                locale=self.locale,
            )
def read_packages_from_file(filename):
    """Yield the requirement specifiers listed in a requirements file.

    Everything from the first ``#`` on a line is treated as a comment.
    Blank lines and comment-only lines are skipped.

    :param filename: Path of the requirements file to read.
    :return: Generator of requirement strings without surrounding
        whitespace.
    """
    with open(filename, 'r') as stream:
        for line in stream:
            # Remove the comment first and strip afterwards, otherwise the
            # whitespace between a requirement and a trailing comment would
            # be kept in the yielded value.
            package = line.split('#', 1)[0].strip()
            if package:
                yield package
37 | ), 38 | long_description=long_description, 39 | url=github_url, 40 | download_url="%s/archives/master" % github_url, 41 | author='Dean Malmgren', 42 | author_email='dean.malmgren@datascopeanalytics.com', 43 | license='MIT', 44 | packages=find_packages(exclude=["tests", "tests.*"]), 45 | classifiers=[ 46 | 'Intended Audience :: Developers', 47 | 'Development Status :: 5 - Production/Stable', 48 | 'License :: OSI Approved :: Apache Software License', 49 | 'Natural Language :: English', 50 | 'Programming Language :: Python', 51 | 'Programming Language :: Python :: 3', 52 | 'Programming Language :: Python :: 3.6', 53 | 'Programming Language :: Python :: 3.7', 54 | 'Programming Language :: Python :: 3.8', 55 | 'Programming Language :: Python :: 3.9', 56 | 'Topic :: Software Development :: Libraries', 57 | 'Topic :: Scientific/Engineering :: Information Analysis', 58 | 'Topic :: Text Processing', 59 | 'Topic :: Utilities', 60 | ], 61 | install_requires=get_package_list('python'), 62 | include_package_data=True, 63 | package_data={'': ['scrubadub/detectors/models/sklearn_address/*.json']}, 64 | zip_safe=False, 65 | ) 66 | -------------------------------------------------------------------------------- /design/customize_via_training.py: -------------------------------------------------------------------------------- 1 | """scrubadub currently removes personally identifiable information with some 2 | regular expression and natural language processing techniques. These techniques 3 | work very well in a wide range of circumstances, but they also tend to make 4 | mistakes. 5 | 6 | For example, the first sentence should obfuscate the name 'April' and 7 | the second sentence should not obfuscate the month 'April'. 8 | 9 | April is a good friend of mine. I hope to see her in April. 10 | 11 | To make this possible, scrubadub needs to be able to incorporate some 12 | techniques for training a classifier to identify filth. 
The training interface 13 | is important and probably not something that is best done in a terminal, but it 14 | is important that the technical infrastructure is there for it to work. 15 | """ 16 | 17 | import scrubadub 18 | 19 | # a TrainedScrubber can be taught what is dirty about a particular document. 20 | scrubber = scrubadub.TrainedScrubber() 21 | for document in training_documents: 22 | 23 | # TrainedScrubber.detect_filth just returns a list of filth objects that 24 | # are returned by Scrubber.iter_filth. This is used to help make 25 | # classification easy for end users. 26 | filth_list = scrubber.detect_filth(document) 27 | 28 | # The filth_list is then refined by human input. It is very difficult to 29 | # imagine doing this in a terminal in an effective way (although `git add 30 | # -i` might be a decent example). I imagine that person_identifies_filth is 31 | # a web interface where users can easily brush text to improve recall and 32 | # adjust the preliminary filth_list to improve precision. 33 | filth_list = person_identifies_filth(document, filth_list) 34 | 35 | # The TrainedScrubber.train method should incorporate the filth_list into 36 | # its classifier and further return a cleaned document with the filth 37 | # removed in an appropriate way. 38 | cleaned_document = scrubber.train(document, filth_list) 39 | 40 | # the TrainedScrubber.predict (or maybe just TrainedScrubber.clean?) method is 41 | # then used to use the classifier to selectively clean filth based on the human 42 | # input. This way, you might only have to train ~1000 documents to do a good 43 | # job of scrubbing the rest (imagine having to do this for 1mm documents) 44 | for document in test_documents: 45 | clean_document = scrubber.predict(document) 46 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
_contributing: 2 | 3 | Contributing 4 | ============ 5 | 6 | The overarching goal of this project is to remove personally identifiable 7 | information from raw text as reliably as possible. In practice, this means that 8 | this project, by default, will preferentially be overly conservative in removing 9 | information that might be personally identifiable. As this project matures, I 10 | fully expect the project to become ever smarter about how it interprets and 11 | anonymizes raw text. 12 | 13 | Regardless of which personal information is identified, this project is committed 14 | to being agnostic about the manner in which the text is anonymized, so long 15 | as it is done with rigor and does not inadvertently lead to `improper 16 | anonymization `_. 17 | Replacing with placeholders? Replacing with anonymous (but consistent) IDs? 18 | Replacing with random metadata? Other ideas? All should be supported to make 19 | this project as useful as possible to the people that need it. 20 | 21 | Another important aspect of this project is that we want to have extremely good 22 | documentation and source code that is easy to read. If you notice a typo, 23 | error, confusing statement, etc., please fix it! 24 | 25 | 26 | .. _contributing-quick-start: 27 | 28 | Quick start 29 | ----------- 30 | 31 | 1. `Fork `_ and clone the 32 | project: 33 | 34 | .. code-block:: bash 35 | 36 | git clone https://github.com/YOUR-USERNAME/scrubadub.git 37 | 38 | 2. Create a python virtual environment and install the requirements 39 | 40 | .. code-block:: bash 41 | 42 | mkvirtualenv scrubadub 43 | pip install -r requirements/python-dev 44 | 45 | 3. Contribute! There are several `open issues 46 | `_ that provide 47 | good places to dig in. Check out the `contribution guidelines 48 | `_ 49 | and send pull requests; your help is greatly appreciated! 50 | 51 | 4. Run the test suite that is defined in ``.travis.yml`` to make sure 52 | everything is working properly 53 | 54 | .. 
code-block:: bash 55 | 56 | ./tests/run.py 57 | 58 | Current build status: |Build Status| 59 | 60 | .. |Build Status| image:: https://travis-ci.org/LeapBeyond/scrubadub.png 61 | :target: https://travis-ci.org/LeapBeyond/scrubadub 62 | -------------------------------------------------------------------------------- /tests/test_detector_urls.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import scrubadub 4 | 5 | from base import BaseTestCase 6 | 7 | 8 | class UrlTestCase(unittest.TestCase, BaseTestCase): 9 | 10 | def test_http(self): 11 | """ 12 | BEFORE: http://bit.ly/aser is neat 13 | AFTER: {{URL}} is neat 14 | """ 15 | self.compare_before_after() 16 | 17 | def test_https(self): 18 | """ 19 | BEFORE: https://bit.ly/aser is neat 20 | AFTER: {{URL}} is neat 21 | """ 22 | self.compare_before_after() 23 | 24 | def test_www(self): 25 | """ 26 | BEFORE: www.bit.ly/aser is neat 27 | AFTER: {{URL}} is neat 28 | """ 29 | self.compare_before_after() 30 | 31 | 32 | def test_long_url(self): 33 | """ 34 | BEFORE: https://this.is/a/long?url=very#url is good 35 | AFTER: {{URL}} is good 36 | """ 37 | self.compare_before_after() 38 | 39 | def test_two_urls(self): 40 | """ 41 | BEFORE: http://bit.ly/number-one http://www.google.com/two 42 | AFTER: {{URL}} {{URL}} 43 | """ 44 | self.compare_before_after() 45 | 46 | 47 | class UrlKeepDomainTestCase(unittest.TestCase, BaseTestCase): 48 | 49 | def setUp(self): 50 | scrubadub.filth.UrlFilth.keep_domain = True 51 | scrubadub.filth.UrlFilth.url_placeholder = 'path/to/something' 52 | scrubadub.filth.UrlFilth.prefix = '' 53 | scrubadub.filth.UrlFilth.suffix = '' 54 | super(UrlKeepDomainTestCase, self).setUp() 55 | 56 | def tearDown(self): 57 | scrubadub.filth.UrlFilth.keep_domain = False 58 | scrubadub.filth.UrlFilth.url_placeholder = 'URL' 59 | scrubadub.filth.UrlFilth.prefix = '{{' 60 | scrubadub.filth.UrlFilth.suffix = '}}' 61 | 62 | def test_path_word_in_sentence(self): 
63 | """ 64 | BEFORE: Find jobs at http://facebook.com/jobs 65 | AFTER: Find jobs at http://facebook.com/path/to/something 66 | """ 67 | self.compare_before_after() 68 | 69 | def test_keep_domain(self): 70 | """ 71 | BEFORE: http://public.com/this/is/very/private 72 | AFTER: http://public.com/path/to/something 73 | """ 74 | self.compare_before_after() 75 | 76 | def test_keep_domain_empty_path(self): 77 | """ 78 | BEFORE: http://public.com/ 79 | AFTER: http://public.com/path/to/something 80 | """ 81 | self.compare_before_after() 82 | -------------------------------------------------------------------------------- /scrubadub/post_processors/catalogue.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import catalogue 3 | 4 | from typing import Type, Optional, Union, TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from scrubadub.post_processors import PostProcessor 8 | 9 | post_processor_catalogue = catalogue.create('scrubadub', 'post_processors', entry_points=True) 10 | 11 | 12 | def register_post_processor(post_processor: Type['PostProcessor'], autoload: Optional[bool] = None, 13 | index: Optional[int] = None) -> None: 14 | """Register a PostProcessor for use with the ``Scrubber`` class. 15 | 16 | You can use ``register_post_processor(NewPostProcessor)`` after your post-processor definition to automatically 17 | register it with the ``Scrubber`` class so that it can be used to process Filth. 18 | 19 | The argument ``autoload`` sets if a new ``Scrubber()`` instance should load this ``PostProcessor`` by default. 20 | 21 | :param post_processor: The ``PostProcessor`` to register with the scrubadub post-processor configuration. 22 | :type post_processor: PostProcessor class 23 | :param autoload: Whether to automatically load this ``Detector`` on ``Scrubber`` initialisation. 24 | :type autoload: bool 25 | :param index: The location/index in which this ``PostProcessor`` should be added. 
26 | :type index: int 27 | """ 28 | if not inspect.isclass(post_processor): 29 | raise ValueError("post_processor should be a class, not an instance.") 30 | 31 | if autoload is not None: 32 | post_processor.autoload = autoload 33 | 34 | if index is not None: 35 | post_processor.index = index 36 | 37 | post_processor_catalogue.register(post_processor.name, func=post_processor) 38 | 39 | 40 | def remove_post_processor(post_processor: Union[Type['PostProcessor'], str]) -> None: 41 | """Remove an already registered post-processor. 42 | 43 | :param post_processor: The ``PostProcessor`` to register with the scrubadub post-processor configuration. 44 | :type post_processor: Union[Type['PostProcessor'], str] 45 | """ 46 | if isinstance(post_processor, str): 47 | if post_processor in post_processor_catalogue: 48 | catalogue._remove((*post_processor_catalogue.namespace, post_processor)) 49 | 50 | elif inspect.isclass(post_processor): 51 | if post_processor.name in post_processor_catalogue: 52 | catalogue._remove((*post_processor_catalogue.namespace, post_processor.name)) 53 | 54 | else: 55 | raise ValueError("post-processor should be a class (not an instance) or a string.") 56 | -------------------------------------------------------------------------------- /scrubadub/detectors/email.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from typing import Optional, Generator 4 | 5 | from scrubadub.detectors.catalogue import register_detector 6 | from .base import RegexDetector 7 | from ..filth import EmailFilth, Filth 8 | 9 | 10 | @register_detector 11 | class EmailDetector(RegexDetector): 12 | """Use regular expression magic to remove email addresses from dirty 13 | dirty ``text``. This method also catches email addresses like ``john at 14 | gmail.com``. 
15 | """ 16 | filth_cls = EmailFilth 17 | name = 'email' 18 | autoload = True 19 | 20 | # there may be better solutions than this out there and this certainly 21 | # doesn't do that great of a job with people that spell out the 22 | # hyphenation of their email address, but its a pretty solid start. 23 | # 24 | # adapted from https://gist.github.com/dideler/5219706 25 | regex = re.compile(( 26 | r"\b[a-z0-9!#$%&'*+\/=?^_`{|}~-]" # start with this character 27 | r"(?:" 28 | r" [\.a-z0-9!#$%&'*+\/=?^_`{|}~-]{0,62}" # valid next characters (max length 64 chars before @) 29 | r" [a-z0-9!#$%&'*+\/=?^_`{|}~-]" # end with this character 30 | r")?" 31 | r"(?:@|\sat\s)" # @ or the word 'at' instead 32 | r"[a-z0-9]" # domain starts like this 33 | r"(?:" 34 | r" (?=[a-z0-9-]*(\.|\sdot\s))" # A lookahead to ensure there is a dot in the domain 35 | r" (?:\.|\sdot\s|[a-z0-9-]){0,251}" # might have a '.' or the word 'dot' instead 36 | r" [a-z0-9]" # domain has max 253 chars, ends with one of these 37 | r")+\b" 38 | ), re.VERBOSE | re.IGNORECASE) 39 | 40 | at_matcher = re.compile(r"@|\sat\s", re.IGNORECASE) 41 | dot_matcher = re.compile(r"\.|\sdot\s", re.IGNORECASE) 42 | 43 | def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: 44 | """Yields discovered filth in the provided ``text``. 45 | 46 | :param text: The dirty text to clean. 47 | :type text: str 48 | :param document_name: The name of the document to clean. 
49 | :type document_name: str, optional 50 | :return: An iterator to the discovered :class:`Filth` 51 | :rtype: Iterator[:class:`Filth`] 52 | """ 53 | 54 | if re.search(self.at_matcher, text) and re.search(self.dot_matcher, text): 55 | yield from super().iter_filth(text=text, document_name=document_name) 56 | -------------------------------------------------------------------------------- /tests/test_utils_canonical_string_set.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scrubadub.utils import CanonicalStringSet 4 | 5 | class CanonicalStringSetTestCase(unittest.TestCase): 6 | 7 | def test_init(self): 8 | """make sure that lower case casting works in __init__""" 9 | s = CanonicalStringSet(['TKTK', 'tKtK', 'Tktk']) 10 | self.assertTrue('tktk' in s) 11 | self.assertEqual(len(s), 1) 12 | 13 | def test_add(self): 14 | """make sure that lower case casting works in add""" 15 | s = CanonicalStringSet() 16 | s.add('TKTK') 17 | s.add('tKtK') 18 | s.add('Tktk') 19 | self.assertTrue('tktk' in s) 20 | self.assertEqual(len(s), 1) 21 | 22 | def test_update(self): 23 | """make sure lower case casting works in update""" 24 | s = CanonicalStringSet() 25 | s.update(['TKTK', 'tKtK', 'Tktk']) 26 | self.assertTrue('tktk' in s) 27 | self.assertEqual(len(s), 1) 28 | 29 | def test_update_again(self): 30 | """make sure udpate works properly""" 31 | s = CanonicalStringSet(['tktk']) 32 | s.update(set(['KtKt'])) 33 | self.assertTrue('tktk' in s) 34 | self.assertTrue('ktkt' in s) 35 | self.assertIsInstance(s, CanonicalStringSet) 36 | 37 | def test_contains(self): 38 | """make sure __contains__ casts things properly""" 39 | s = CanonicalStringSet(['tktk']) 40 | self.assertTrue('TKTK' in s) 41 | self.assertTrue('Tktk' in s) 42 | self.assertTrue('tKtK' in s) 43 | 44 | def test_pop(self): 45 | """make sure pop deals with capitalized things properly""" 46 | s = CanonicalStringSet(['TKTK']) 47 | self.assertEqual(s.pop(), 'tktk') 
48 | 49 | def test_remove(self): 50 | """make sure remove works properly""" 51 | s = CanonicalStringSet(['tktk']) 52 | s.remove('TKTK') 53 | self.assertFalse('tktk' in s) 54 | 55 | def test_discard(self): 56 | """make sure discard works properly""" 57 | s = CanonicalStringSet(['tktk']) 58 | s.discard('TKTK') 59 | s.discard('TkTk') 60 | s.discard('Tktk') 61 | self.assertFalse('tktk' in s) 62 | 63 | def test_non_string(self): 64 | """ensure error is thrown when non string is added""" 65 | s = CanonicalStringSet(['tktk']) 66 | s.add('123') 67 | with self.assertRaises(TypeError): 68 | s.add(123) 69 | with self.assertRaises(TypeError): 70 | s.add(None) 71 | 72 | # TODO: add more tests for all of the other set operations to make sure 73 | # people get what they expect 74 | -------------------------------------------------------------------------------- /scrubadub/detectors/postalcode.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from .base import RegionLocalisedRegexDetector 5 | from ..filth.postalcode import PostalCodeFilth 6 | 7 | 8 | @register_detector 9 | class PostalCodeDetector(RegionLocalisedRegexDetector): 10 | """Detects postal codes, currently only British post codes are supported.""" 11 | filth_cls = PostalCodeFilth 12 | name = 'postalcode' 13 | autoload = True 14 | region_regex = { 15 | # Informed by https://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom#Validation 16 | # and validated against https://osdatahub.os.uk/downloads/open/CodePointOpen 17 | 'GB': re.compile(r""" 18 | ( 19 | # Girobank postcode 20 | (?:[gG][iI][rR] {0,}0[aA]{2})| 21 | (?: # British Overseas Territories in usual format 22 | (?: 23 | [aA][sS][cC][nN]| 24 | [sS][tT][hH][lL]| 25 | [tT][dD][cC][uU]| 26 | [bB][bB][nN][dD]| 27 | [bB][iI][qQ][qQ]| 28 | [fF][iI][qQ][qQ]| 29 | [pP][cC][rR][nN]| 30 | [sS][iI][qQ][qQ]| 31 | [iT][kK][cC][aA] 32 | ) 33 | \ 
{0,}1[zZ]{2} 34 | )| 35 | (?: # British Overseas Territories in zip-code format 36 | (KY[0-9]|MSR|VG|AI)[ -]{0,}[0-9]{4} 37 | )| 38 | # (?: # Bermuda including this causes too many false positives, so excluded for now 39 | # [a-zA-Z]{2}\ {0,}[0-9]{2} 40 | # )| 41 | (?: # British Forces Post Office 42 | [Bb][Ff][Pp][Oo]\ {0,}[0-9]{1,4} 43 | )| 44 | (?: # Mainland British postcodes 45 | (?: 46 | (?:[Ww][Cc][0-9][abehmnprvwxyABEHMNPRVWXY])| 47 | (?:[Ee][Cc][1-4][abehmnprvwxyABEHMNPRVWXY])| 48 | (?:[Nn][Ww]1[Ww])| 49 | (?:[Ss][Ee]1[Pp])| 50 | (?:[Ss][Ww]1[abehmnprvwxyABEHMNPRVWXY])| 51 | (?:[EeNnWw]1[a-hjkpstuwA-HJKPSTUW])| 52 | (?:[BbEeGgLlMmNnSsWw][0-9][0-9]?)| 53 | (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yxA-HK-XY][0-9][0-9]?) 54 | ) 55 | \ {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2} 56 | ) 57 | ) 58 | """, re.VERBOSE), 59 | } 60 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import scrubadub 3 | 4 | 5 | class APITestCase(unittest.TestCase): 6 | 7 | def test_clean(self): 8 | """Test the top level clean api""" 9 | self.assertEqual( 10 | "This is a test message for {{EMAIL}}", 11 | scrubadub.clean("This is a test message for example@exampe.com"), 12 | ) 13 | 14 | def test_clean_documents(self): 15 | """Test the top level clean_documents api""" 16 | self.assertEqual( 17 | { 18 | "first.txt": "This is a test message for {{EMAIL}}", 19 | "second.txt": "Hello {{TWITTER}} call me on {{PHONE}}.", 20 | }, 21 | scrubadub.clean_documents( 22 | { 23 | "first.txt": "This is a test message for example@exampe.com", 24 | "second.txt": "Hello @Jane call me on +33 4 41 26 62 36.", 25 | }, 26 | ), 27 | ) 28 | 29 | def test_list_filth(self): 30 | """Test the top level list_filth api""" 31 | filths = scrubadub.list_filth("This is a test message for example@example.com") 32 | self.assertEqual( 33 | 
[scrubadub.filth.EmailFilth(text='example@example.com', detector_name='email', beg=27, end=46)], 34 | filths, 35 | ) 36 | 37 | def test_list_filth_docuemnts(self): 38 | """Test the top level list_filth_documents api""" 39 | filths = scrubadub.list_filth_documents( 40 | { 41 | "first.txt": "This is a test message for example@example.com", 42 | "second.txt": "Hello @Jane call me on +33 4 41 26 62 36.", 43 | } 44 | ) 45 | self.assertEqual( 46 | scrubadub.Scrubber._sort_filths([ 47 | scrubadub.filth.EmailFilth( 48 | text='example@example.com', document_name='first.txt', detector_name='email', beg=27, end=46 49 | ), 50 | scrubadub.filth.TwitterFilth( 51 | text='@Jane', document_name='second.txt', detector_name='twitter', beg=6, end=11 52 | ), 53 | scrubadub.filth.PhoneFilth( 54 | text='+33 4 41 26 62 36', document_name='second.txt', detector_name='phone', beg=23, end=40 55 | ), 56 | ]), 57 | scrubadub.Scrubber._sort_filths(filths), 58 | ) 59 | 60 | def test_quickstart(self): 61 | """Test the example given in the quick start docs""" 62 | text = "My cat can be contacted on example@example.com, or 1800 555-5555" 63 | self.assertEqual( 64 | 'My cat can be contacted on {{EMAIL}}, or {{PHONE}}', 65 | scrubadub.clean(text), 66 | ) 67 | -------------------------------------------------------------------------------- /scrubadub/post_processors/prefix_suffix.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence 2 | 3 | from scrubadub.filth import Filth 4 | from scrubadub.post_processors.catalogue import register_post_processor 5 | from scrubadub.post_processors.base import PostProcessor 6 | 7 | 8 | class PrefixSuffixReplacer(PostProcessor): 9 | """Add a prefix and/or suffix to the Filth's replacement string. 10 | 11 | >>> import scrubadub 12 | >>> scrubber = scrubadub.Scrubber(post_processor_list=[ 13 | ... scrubadub.post_processors.FilthReplacer(), 14 | ... 
]) 15 | >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com") 16 | 'Contact me at PHONE or EMAIL' 17 | >>> scrubber = scrubadub.Scrubber(post_processor_list=[ 18 | ... scrubadub.post_processors.FilthReplacer(), 19 | ... scrubadub.post_processors.PrefixSuffixReplacer(prefix='{{', suffix='}}'), 20 | ... ]) 21 | >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com") 22 | 'Contact me at {{PHONE}} or {{EMAIL}}' 23 | >>> scrubber = scrubadub.Scrubber(post_processor_list=[ 24 | ... scrubadub.post_processors.FilthReplacer(), 25 | ... scrubadub.post_processors.PrefixSuffixReplacer(prefix='', suffix=''), 26 | ... ]) 27 | >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com") 28 | 'Contact me at PHONE or EMAIL' 29 | 30 | """ 31 | name = 'prefix_suffix_replacer' # type: str 32 | autoload = False 33 | index = 1 34 | 35 | def __init__(self, prefix: Optional[str] = '{{', suffix: Optional[str] = '}}', name: Optional[str] = None): 36 | super(PrefixSuffixReplacer, self).__init__(name=name) 37 | 38 | self.prefix = prefix 39 | self.suffix = suffix 40 | 41 | def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]: 42 | """Processes the filth to add prefixes and suffixes to the replacement text 43 | 44 | :param filth_list: The text to be hashed 45 | :type filth_list: Sequence[Filth] 46 | :return: The processed filths 47 | :rtype: Sequence[Filth] 48 | """ 49 | for filth_item in filth_list: 50 | if filth_item.replacement_string is None: 51 | filth_item.replacement_string = filth_item.type.upper() 52 | 53 | if self.prefix is not None and self.suffix is not None: 54 | filth_item.replacement_string = self.prefix + filth_item.replacement_string + self.suffix 55 | elif self.prefix is not None: 56 | filth_item.replacement_string = self.prefix + filth_item.replacement_string 57 | elif self.suffix is not None: 58 | filth_item.replacement_string = filth_item.replacement_string + self.suffix 59 | 60 | return 
filth_list 61 | 62 | 63 | register_post_processor(PrefixSuffixReplacer) 64 | 65 | __all__ = ['PrefixSuffixReplacer'] 66 | -------------------------------------------------------------------------------- /scrubadub/detectors/catalogue.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import catalogue 3 | 4 | from typing import Type, Optional, Union, TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from scrubadub.detectors import Detector 8 | 9 | detector_catalogue = catalogue.create('scrubadub', 'detectors', entry_points=True) 10 | 11 | 12 | def register_detector(detector: Type['Detector'], *, autoload: Optional[bool] = None) -> Type['Detector']: 13 | """Register a detector for use with the ``Scrubber`` class. 14 | 15 | You can use ``register_detector(NewDetector, autoload=True)`` after your detector definition to automatically 16 | register it with the ``Scrubber`` class so that it can be used to remove Filth. 17 | 18 | The argument ``autoload``decides whether a new ``Scrubber()`` instance should load this ``detector`` by default. 19 | 20 | .. code:: pycon 21 | 22 | >>> import scrubadub 23 | >>> class NewDetector(scrubadub.detectors.Detector): 24 | ... pass 25 | >>> scrubadub.detectors.register_detector(NewDetector, autoload=False) 26 | 27 | 28 | :param detector: The ``Detector`` to register with the scrubadub detector configuration. 29 | :type detector: Detector class 30 | :param autoload: Whether to automatically load this ``Detector`` on ``Scrubber`` initialisation. 31 | :type autoload: Optional[bool] 32 | """ 33 | if not inspect.isclass(detector): 34 | raise ValueError("detector should be a class, not an instance.") 35 | 36 | if autoload is not None: 37 | detector.autoload = autoload 38 | 39 | detector_catalogue.register(detector.name, func=detector) 40 | 41 | return detector 42 | 43 | 44 | def remove_detector(detector: Union[Type['Detector'], str]): 45 | """Remove an already registered detector. 46 | 47 | .. 
code:: pycon 48 | 49 | >>> import scrubadub 50 | >>> class NewDetector(scrubadub.detectors.Detector): 51 | ... pass 52 | >>> scrubadub.detectors.catalogue.register_detector(NewDetector, autoload=False) 53 | 54 | >>> scrubadub.detectors.catalogue.remove_detector(NewDetector) 55 | 56 | :param detector: The ``Detector`` to register with the scrubadub detector configuration. 57 | :type detector: Union[Type['PostProcessor'], str] 58 | :param autoload: Whether to automatically load this ``Detector`` on ``Scrubber`` initialisation. 59 | :type autoload: bool 60 | """ 61 | if isinstance(detector, str): 62 | if detector in detector_catalogue: 63 | catalogue._remove((*detector_catalogue.namespace, detector)) 64 | 65 | elif inspect.isclass(detector): 66 | if detector.name in detector_catalogue: 67 | catalogue._remove((*detector_catalogue.namespace, detector.name)) 68 | 69 | else: 70 | raise ValueError("detector should be a class (not an instance) or a string.") 71 | -------------------------------------------------------------------------------- /tests/test_detector_skype.py: -------------------------------------------------------------------------------- 1 | import faker 2 | import unittest 3 | 4 | import scrubadub.detectors.catalogue 5 | from scrubadub.filth import SkypeFilth 6 | 7 | from base import BaseTestCase 8 | 9 | import scrubadub 10 | 11 | class SkypeTestCase(unittest.TestCase, BaseTestCase): 12 | 13 | def setUp(self): 14 | from scrubadub.detectors.skype import SkypeDetector 15 | scrubadub.detectors.catalogue.register_detector(SkypeDetector, autoload=True) 16 | 17 | def test_inline_skype_name(self): 18 | """ 19 | BEFORE: contact me on skype (dean.malmgren) to chat 20 | AFTER: contact me on skype ({{SKYPE}}) to chat 21 | """ 22 | self.compare_before_after() 23 | 24 | def test_pre_inline_skype_name(self): 25 | """ 26 | BEFORE: i'm dean.malmgren on skype 27 | AFTER: i'm {{SKYPE}} on skype 28 | """ 29 | self.compare_before_after() 30 | 31 | def 
test_parenthetical_skype(self): 32 | """ 33 | BEFORE: i'm on skype (dean.malmgren) or can be reached on my cell 34 | AFTER: i'm on skype ({{SKYPE}}) or can be reached on my cell 35 | """ 36 | self.compare_before_after() 37 | 38 | def test_skype_signature(self): 39 | """ 40 | BEFORE: skype: dean.malmgren\nnerd 41 | AFTER: skype: {{SKYPE}}\nnerd 42 | """ 43 | self.compare_before_after() 44 | 45 | def test_skype_addition(self): 46 | """ 47 | BEFORE: I have added you on Skype. My ID is dean.malmgren 48 | AFTER: I have added you on Skype. My ID is {{SKYPE}} 49 | """ 50 | self.compare_before_after() 51 | 52 | def test_skype_usernames(self): 53 | """test different skype username formats""" 54 | usernames = ( 55 | "joecool", 56 | "joe,cool", 57 | "joe.cool", 58 | "joe-cool", 59 | ) 60 | docstring_template =""" 61 | BEFORE: My Skype is %s 62 | AFTER: My Skype is {{SKYPE}} 63 | """ 64 | for username in usernames: 65 | self.compare_before_after(docstring_template % username) 66 | 67 | def test_all_caps_words_nearby(self): 68 | """ 69 | BEFORE: SCREAM to get my attention on Skype (dean.malmgren) 70 | AFTER: SCREAM to get my attention on Skype ({{SKYPE}}) 71 | """ 72 | self.compare_before_after() 73 | 74 | def test_no_triggers(self): 75 | """ 76 | BEFORE: SCREAM to get my attention because Im not on instant messengers 77 | AFTER: SCREAM to get my attention because Im not on instant messengers 78 | """ 79 | self.compare_before_after() 80 | 81 | def test_generate(self): 82 | class Faker: 83 | def user_name(self): 84 | return 'brian12' 85 | 86 | self.assertEqual( 87 | 'brian12', 88 | SkypeFilth.generate(faker=Faker()), 89 | ) 90 | 91 | def tearDown(self) -> None: 92 | from scrubadub.detectors.skype import SkypeDetector 93 | scrubadub.detectors.catalogue.register_detector(SkypeDetector, autoload=False) 94 | -------------------------------------------------------------------------------- /tests/test_detector_credit_card.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from base import BaseTestCase 4 | 5 | 6 | class CreditCardTestCase(unittest.TestCase, BaseTestCase): 7 | """ 8 | Test cases for Credit Card number removal removal. 9 | All these will clash with PASSPORT filth. 10 | """ 11 | 12 | def test_american_express(self): 13 | """ 14 | BEFORE: My credit card is 378282246310005. 15 | AFTER: My credit card is {{CREDIT_CARD}}. 16 | """ 17 | self.compare_before_after() 18 | 19 | def test_american_express2(self): 20 | """ 21 | BEFORE: My credit card is 371449635398431. 22 | AFTER: My credit card is {{CREDIT_CARD}}. 23 | """ 24 | self.compare_before_after() 25 | 26 | def test_american_corporate(self): 27 | """ 28 | BEFORE: My credit card is 378734493671000. 29 | AFTER: My credit card is {{CREDIT_CARD}}. 30 | """ 31 | self.compare_before_after() 32 | 33 | def test_diners_club(self): 34 | """ 35 | BEFORE: My credit card is 30569309025904. 36 | AFTER: My credit card is {{CREDIT_CARD}}. 37 | """ 38 | self.compare_before_after() 39 | 40 | def test_diners_club2(self): 41 | """ 42 | BEFORE: My credit card is 38520000023237. 43 | AFTER: My credit card is {{CREDIT_CARD}}. 44 | """ 45 | self.compare_before_after() 46 | 47 | def test_discover(self): 48 | """ 49 | BEFORE: My credit card is 6011111111111117. 50 | AFTER: My credit card is {{CREDIT_CARD}}. 51 | """ 52 | self.compare_before_after() 53 | 54 | def test_discover2(self): 55 | """ 56 | BEFORE: My credit card is 6011000990139424. 57 | AFTER: My credit card is {{CREDIT_CARD}}. 58 | """ 59 | self.compare_before_after() 60 | 61 | def test_jcb(self): 62 | """ 63 | BEFORE: My credit card is 3530111333300000. 64 | AFTER: My credit card is {{CREDIT_CARD}}. 65 | """ 66 | self.compare_before_after() 67 | 68 | def test_jcb2(self): 69 | """ 70 | BEFORE: My credit card is 3566002020360505. 71 | AFTER: My credit card is {{CREDIT_CARD}}. 
72 | """ 73 | self.compare_before_after() 74 | 75 | def test_mastercard(self): 76 | """ 77 | BEFORE: My credit card is 5555555555554444. 78 | AFTER: My credit card is {{CREDIT_CARD}}. 79 | """ 80 | self.compare_before_after() 81 | 82 | def test_mastercard2(self): 83 | """ 84 | BEFORE: My credit card is 5105105105105100. 85 | AFTER: My credit card is {{CREDIT_CARD}}. 86 | """ 87 | self.compare_before_after() 88 | 89 | def test_visa(self): 90 | """ 91 | BEFORE: My credit card is 4111111111111111. 92 | AFTER: My credit card is {{CREDIT_CARD}}. 93 | """ 94 | self.compare_before_after() 95 | 96 | def test_visa2(self): 97 | """ 98 | BEFORE: My credit card is 4012888888881881. 99 | AFTER: My credit card is {{CREDIT_CARD}}. 100 | """ 101 | self.compare_before_after() 102 | -------------------------------------------------------------------------------- /scrubadub/detectors/text_blob.py: -------------------------------------------------------------------------------- 1 | import re 2 | import textblob 3 | 4 | from textblob.blob import BaseBlob 5 | from textblob.en.taggers import PatternTagger 6 | 7 | from typing import Optional, Generator 8 | 9 | from scrubadub.detectors.catalogue import register_detector 10 | from .base import RegexDetector 11 | from ..filth import NameFilth, Filth 12 | from ..utils import CanonicalStringSet 13 | 14 | # BaseBlob uses NLTKTagger as a pos_tagger, but it works wrong 15 | BaseBlob.pos_tagger = PatternTagger() 16 | 17 | 18 | @register_detector 19 | class TextBlobNameDetector(RegexDetector): 20 | """Use part of speech tagging from textblob to clean proper nouns out of the dirty dirty 21 | ``text``. Disallow particular nouns by adding them to the ``NameDetector.disallowed_nouns`` set. 
22 | """ 23 | filth_cls = NameFilth 24 | name = 'text_blob_name' 25 | autoload = False 26 | 27 | disallowed_nouns = CanonicalStringSet(["skype"]) 28 | 29 | def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]: 30 | """Yields discovered filth in the provided ``text``. 31 | 32 | :param text: The dirty text to clean. 33 | :type text: str 34 | :param document_name: The name of the document to clean. 35 | :type document_name: str, optional 36 | :return: An iterator to the discovered :class:`Filth` 37 | :rtype: Iterator[:class:`Filth`] 38 | """ 39 | 40 | if not isinstance(self.disallowed_nouns, CanonicalStringSet): 41 | raise TypeError( 42 | 'NameDetector.disallowed_nouns must be CanonicalStringSet' 43 | ) 44 | 45 | # find the set of proper nouns using textblob. 46 | proper_nouns = set() 47 | blob = textblob.TextBlob(text) 48 | for word, part_of_speech in blob.tags: 49 | is_proper_noun = part_of_speech in ("NNP", "NNPS") 50 | if is_proper_noun and word.lower() not in self.disallowed_nouns: 51 | proper_nouns.add(word) 52 | 53 | # use a regex to replace the proper nouns by first escaping any 54 | # lingering punctuation in the regex 55 | # http://stackoverflow.com/a/4202559/564709 56 | if proper_nouns: 57 | re_list = [] 58 | for proper_noun in proper_nouns: 59 | re_list.append(r'\b' + re.escape(str(proper_noun)) + r'\b') 60 | self.regex = re.compile('|'.join(re_list)) 61 | yield from super(TextBlobNameDetector, self).iter_filth(text, document_name=document_name) 62 | return 63 | 64 | @classmethod 65 | def supported_locale(cls, locale: str) -> bool: 66 | """Returns true if this ``Detector`` supports the given locale. 67 | 68 | :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an 69 | underscore and the two letter upper-case country code, eg "en_GB" or "de_CH". 
class LocaleTestCase(unittest.TestCase):
    """Exercise locale handling in the top-level API and in ``scrubadub.utils``."""

    def test_top_level(self):
        """Test that locales work at the top level"""
        french_number = "Localisation is important for phone numbers '06 87 49 77 56'"
        british_number = "Localisation is important for phone numbers '(0121) 496 0852'"
        scrubbed = "Localisation is important for phone numbers '{{PHONE}}'"

        # (text, locale, expected output): a number is only replaced when the
        # locale matches the country whose format it follows
        cases = [
            (french_number, 'en_GB', french_number),
            (french_number, 'fr_FR', scrubbed),
            (british_number, 'en_GB', scrubbed),
            (british_number, 'fr_FR', british_number),
        ]
        for text, locale, expected in cases:
            self.assertEqual(scrubadub.clean(text, locale=locale), expected)

    def test_bad_locale(self):
        """An unknown locale passed to ``clean`` raises ``ValueError``"""
        text = "Localisation is important for phone numbers '(0121) 496 0852'"
        with self.assertRaises(ValueError):
            scrubadub.clean(text, locale='non_existant')

    def test_locale_in_filth(self):
        """The locale given to ``list_filth`` is stored on the returned filth"""
        text = "Localisation is important for phone numbers '(0121) 496 0852'"
        found = scrubadub.list_filth(text, locale='en_GB')
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].locale, 'en_GB')

    def test_locale_split(self):
        """``locale_split`` returns the (language, region) pair of a locale string"""
        expected_splits = [
            ('en_US', ('en', 'US')),
            ('de_DE', ('de', 'DE')),
            ('en_GB', ('en', 'GB')),
            ('en', ('en', 'US')),
            ('en_GB.ISO8859-1', ('en', 'GB')),
            ('ru_RU.UTF-8', ('ru', 'RU')),
            ('tt_RU.UTF-8@iqtelif', ('tt', 'RU')),
        ]
        for locale, expected in expected_splits:
            self.assertEqual(scrubadub.utils.locale_split(locale), expected)

        with self.assertRaises(ValueError):
            scrubadub.utils.locale_split('non_existant')

    def test_locale_transform(self):
        """``locale_transform`` normalises known locales and rejects unknown ones"""
        with self.assertRaises(ValueError):
            scrubadub.utils.locale_transform('not_exist')

        expected_transforms = [
            ('en', 'en_US.ISO8859-1'),
            ('fr', 'fr_FR.ISO8859-1'),
            ('fr_CA', 'fr_CA.ISO8859-1'),
            ('zh', 'zh_CN.eucCN'),
        ]
        for locale, expected in expected_transforms:
            self.assertEqual(scrubadub.utils.locale_transform(locale), expected)
`TextBlob `_ detector 12 | * Has a very high false positive rate, use with caution 13 | 14 | All of these detectors are optional and so are not enabled by default. 15 | To enable them you must install any dependencies, import them and finally add them to your ``Scrubber``. 16 | In the following sections examples are given for this. 17 | 18 | Stanford NER detector 19 | --------------------- 20 | 21 | To run the Stanford NER detector you will need both java and the nltk python package. 22 | On debian linux, java can be installed with: 23 | 24 | .. code-block:: console 25 | 26 | $ apt-get install openjdk-14-jre 27 | 28 | And then the python dependencies can be installed with: 29 | 30 | .. code-block:: console 31 | 32 | $ pip install scrubadub_stanford 33 | 34 | Once this has been done, the ``StanfordEntityDetector`` can be used with the following: 35 | 36 | .. code-block:: pycon 37 | 38 | >>> import scrubadub, scrubadub_stanford 39 | >>> scrubber = scrubadub.Scrubber() 40 | >>> scrubber.add_detector(scrubadub_stanford.detectors.StanfordEntityDetector) 41 | >>> scrubber.clean("My name is John") 42 | 'My name is {{NAME}}' 43 | 44 | Spacy 45 | ----- 46 | 47 | This is the suggested named detector, since its easy to install and works pretty well. 48 | Spacy v3 requires python version >= 3.6 and < 3.9, as python 3.9 is not yet supported by spacy. 49 | 50 | To install all dependencies for the Spacy detector you can do: 51 | 52 | .. code-block:: console 53 | 54 | $ pip install scrubadub_spacy 55 | 56 | Then to run it you can add it to your ``Scrubber``, like so: 57 | 58 | .. code-block:: pycon 59 | 60 | >>> import scrubadub, scrubadub_spacy 61 | >>> scrubber = scrubadub.Scrubber() 62 | >>> scrubber.add_detector(scrubadub_spacy.detectors.SpacyEntityDetector) 63 | >>> scrubber.clean("My name is John") 64 | 'My name is {{NAME}}' 65 | 66 | It is also possible to enable other tags from the Spacy Entity tagger, such Location and Organisation. 
67 | This can be done with the ``enable_*`` parameters in the initialiser: 68 | 69 | .. code-block:: pycon 70 | 71 | >>> import scrubadub, scrubadub_stanford 72 | >>> scrubber = scrubadub.Scrubber() 73 | >>> scrubber.add_detector(scrubadub_stanford.detectors.StanfordEntityDetector( 74 | ... enable_person=True, enable_organization=True, enable_location=True 75 | ... )) 76 | >>> scrubber.clean("My name is John and I work at the United Nations in Geneva") 77 | 'My name is {{NAME}} and I work at the {{ORGANIZATION}} in {{LOCATION}}' 78 | 79 | TextBlob 80 | -------- 81 | 82 | It is suggested not to use this detector due to its high false positive rate; however, it is useful in some situations. 83 | Please test it on your data to ensure it works well. 84 | This detector is already installed in the base scrubadub package and so you only need scrubadub installed to run it. 85 | 86 | .. code-block:: console 87 | 88 | $ pip install scrubadub 89 | 90 | Then to run it you can add it to your ``Scrubber``, like so: 91 | 92 | .. 
class PostalCodesTestCase(unittest.TestCase):
    """Tests for ``PostalCodeDetector``: locale validation and en_GB postcode matching."""

    def test_bad_locale(self):
        """test a non existant region"""
        with self.assertRaises(ValueError):
            scrubadub.detectors.PostalCodeDetector(locale='non_existant')

    def test_not_implemented_locale(self):
        """test a valid locale for which the detector warns when it is added"""
        scrubber = scrubadub.Scrubber(locale='fr_FR')
        with warnings.catch_warnings():
            # escalate warnings to exceptions so that assertRaises can see them
            warnings.simplefilter("error")
            with self.assertRaises(UserWarning):
                scrubber.add_detector(scrubadub.detectors.PostalCodeDetector)

    def test_gb(self):
        """test a simple matching"""

        to_test = [
            # positive assertions
            ("BX1 1LT", True),
            ("sw1A 0AA", True),
            ("EC2V 7hh", True),
            ("M25DB", True),
            ("eh12ng", True),
            ("BT1 5GS", True),
            ("EC1A 1BB", True),
            ("W1A 0AX", True),
            ("M1 1AE", True),
            ("B33 8TH", True),
            ("CR2 6XH", True),
            ("DN55 1PT", True),
            ("CM2 0PP", True),
            ("EC3M 5AD", True),
            # negative assertions
            ("1", False),
            ("23", False),
            ("456", False),
            ("4567", False),
            ("750621", False),
            ("95130-642", False),
            ("95130-64212", False),
        ]

        test_str = 'this is a {} test string'
        detector = scrubadub.detectors.PostalCodeDetector(locale='en_GB')

        for postal_code, result in to_test:
            matches = list(detector.iter_filth(test_str.format(postal_code)))
            # assertEqual, not the assertEquals alias that was removed in Python 3.12
            if result:
                self.assertEqual(len(matches), 1)
                self.assertEqual(matches[0].text, postal_code)
            else:
                self.assertEqual(matches, [])

    def test_extensive(self):
        """check a random 10% sample of the Code-Point Open postcodes are all detected"""
        zip_location = pathlib.Path(__file__).parent / 'code_point_uk_post_codes.zip'

        # Download an extensive list of all postcodes
        if not zip_location.exists():
            url = 'https://api.os.uk/downloads/v1/products/CodePointOpen/downloads?area=GB&format=CSV&redirect'
            r = requests.get(url, allow_redirects=True)
            # fail here rather than caching an error page as a corrupt zip file
            r.raise_for_status()
            with open(zip_location.absolute(), 'wb') as f:
                f.write(r.content)

        detector = scrubadub.detectors.PostalCodeDetector(locale='en_GB')

        # Run the detector against this list to ensure we pickup all post codes
        with zipfile.ZipFile(zip_location.absolute()) as archive:
            data_file_names = [
                name for name in archive.namelist()
                if name.lower().endswith('.csv') and name.startswith('Data/CSV')
            ]
            for data_file_name in data_file_names:
                with archive.open(data_file_name) as data_file:
                    df = pd.read_csv(data_file, header=None)
                post_codes = df.loc[:, 0].sample(frac=.1).values.tolist()
                for post_code in post_codes:
                    filth_list = list(detector.iter_filth(post_code))
                    error_message = "Unable to match postcode {} from {}".format(post_code, data_file_name)
                    self.assertEqual(1, len(filth_list), error_message)
                    self.assertEqual(post_code, filth_list[0].text)
| from scrubadub.exceptions import UnexpectedFilth 10 | import scrubadub 11 | 12 | 13 | class DetectorTestCase(unittest.TestCase): 14 | # TODO: test detector names 15 | 16 | def test_detector_names(self): 17 | """make sure detector names appear in Filth""" 18 | detector = UrlDetector(name='example_name') 19 | filths = list(detector.iter_filth('www.google.com')) 20 | self.assertEqual(len(filths), 1) 21 | self.assertEqual(filths[0].detector_name, 'example_name') 22 | 23 | detector = EmailDetector(name='example_name') 24 | filths = list(detector.iter_filth('example@example.com')) 25 | self.assertEqual(len(filths), 1) 26 | self.assertEqual(filths[0].detector_name, 'example_name') 27 | 28 | def test_name_from_filth_cls(self): 29 | class OldFilth(Filth): 30 | type = 'old_filth' 31 | class OldDetector(Detector): 32 | filth_cls = OldFilth 33 | 34 | old_detector = OldDetector() 35 | self.assertEqual(old_detector.name, 'old_filth') 36 | 37 | detector = Detector() 38 | self.assertEqual(detector.name, 'detector') 39 | 40 | def test_abstract_detector_raises_error(self): 41 | """Test that the Detector abstract class raises an error when iter_filth is not implemented""" 42 | detector = Detector() 43 | with self.assertRaises(NotImplementedError): 44 | detector.iter_filth_documents(['text'], ['text.txt']) 45 | with self.assertRaises(NotImplementedError): 46 | detector.iter_filth('text') 47 | 48 | def test_abstract_regex_filth_raises_error(self): 49 | """Test that the RegexDetector abstract class raises an error when the filth_cls is incorrectly set""" 50 | class BadRegexDetector(RegexDetector): 51 | filth_cls = str 52 | 53 | detector = BadRegexDetector() 54 | with self.assertRaises(TypeError): 55 | list(detector.iter_filth('text')) 56 | 57 | def test_abstract_regex_raises_error(self): 58 | """Test that the RegexDetector abstract class raises an error when there is no regex set""" 59 | detector = RegexDetector() 60 | with self.assertRaises(ValueError): 61 | 
import re
import locale as locale_module

from typing import Optional, Tuple, List

try:
    unicode  # type: ignore  # tell mypy to ignore the fact that this doesnt exist in python3
except NameError:
    basestring = str  # Compatibility for Python 2 and 3


class CanonicalStringSet(set):
    """Just like a set, except it makes sure that all elements are lower case
    strings.

    Elements are lower-cased when they are added, removed or tested for
    membership; anything that is not a string raises ``TypeError``.
    """

    def _cast_as_lower(self, x):
        """Return ``x`` lower-cased, raising ``TypeError`` if it is not a string."""
        if not isinstance(x, basestring):
            raise TypeError('CanonicalStringSet only works with strings')
        return x.lower()

    def __init__(self, *elements):
        """Create the set from zero or more iterables of strings."""
        super(CanonicalStringSet, self).__init__()
        if elements:
            self.update(*elements)

    def __contains__(self, element):
        return super(CanonicalStringSet, self).__contains__(
            self._cast_as_lower(element)
        )

    def add(self, element):
        return super(CanonicalStringSet, self).add(
            self._cast_as_lower(element)
        )

    def update(self, elements=(), *others):
        """Add every string from ``elements`` and from any further iterables.

        Like ``set.update`` this accepts several iterables; ``__init__``
        forwards all of its positional arguments here, so it must too.
        """
        for iterable in (elements,) + others:
            for element in iterable:
                self.add(element)

    def remove(self, element):
        return super(CanonicalStringSet, self).remove(
            self._cast_as_lower(element)
        )

    def discard(self, element):
        return super(CanonicalStringSet, self).discard(
            self._cast_as_lower(element)
        )


class Lookup(object):
    """The Lookup object is used to create an in-memory reference table to
    create unique identifiers for ``Filth`` that is encountered.
    """

    def __init__(self):
        self.table = {}

    def __getitem__(self, key):
        """Return the identifier for ``key``, assigning the next integer if it is new."""
        # the default is the current table size, so identifiers count up from 0
        return self.table.setdefault(key, len(self.table))


def locale_transform(locale: str) -> str:
    """Normalise the locale string, e.g. 'fr' -> 'fr_FR'.

    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
        underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
    :type locale: str
    :return: The normalised locale string
    :rtype: str
    :raises ValueError: if the normalised locale is not a value of ``locale.locale_alias``
    """
    normalised = locale_module.normalize(locale.lower())
    if normalised not in locale_module.locale_alias.values():
        raise ValueError("Unknown locale '{}', not in locale.locale_alias".format(locale))
    return normalised


def locale_split(locale: str) -> Tuple[Optional[str], Optional[str]]:
    """Split the locale string into the language and region.

    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
        underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
    :type locale: str
    :return: The lower-case language code and the upper-case region code in a tuple. The region is ``None``
        when the normalised locale has no region part.
    :rtype: tuple, (str, str)
    :raises ValueError: if the locale is unknown or does not match the expected format
    """
    locale = locale_transform(locale)

    # language[_region][.encoding[@modifier]]; only language and region are used
    regex = r'(?P<language>[0-9a-zA-Z]+)(_(?P<region>[0-9a-zA-Z]+))?' \
            r'(\.(?P<encoding>[0-9a-zA-Z-]+)(@(?P<modifier>[0-9a-zA-Z]+))?)?'
    match = re.match(regex, locale)
    if match is None:
        raise ValueError('Locale does not match expected format.')

    region = match.group('region')
    # the region group is optional in the pattern, so it can be None
    return match.group('language').lower(), region.upper() if region is not None else None


class ToStringMixin(object):
    """Mixin that provides a helper for building ``<ClassName attr=value ...>`` strings."""

    def _to_string(self, attributes: List[str]) -> str:
        """Return a description of this object listing the given attributes.

        Attributes that are missing or set to ``None`` are left out.

        :param attributes: The names of the attributes to include, in order.
        :type attributes: list of str
        :return: A string of the form ``<ClassName name=repr(value) ...>``
        :rtype: str
        """
        item_attributes = []
        for item in attributes:
            value = getattr(self, item, None)
            if value is not None:
                item_attributes.append("{}={}".format(item, repr(value)))
        return "<{} {}>".format(self.__class__.__name__, " ".join(item_attributes))
package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.9, 3.8, 3.7, 3.6] 19 | 20 | env: 21 | PREFIX: /home/runner/prefix 22 | LIBPOSTAL: /home/runner/libpostal 23 | LIBRARY_PATH: /home/runner/prefix/lib 24 | LD_LIBRARY_PATH: /home/runner/prefix/lib 25 | C_INCLUDE_PATH: /home/runner/prefix/include 26 | CPP_INCLUDE_PATH: /home/runner/prefix/include 27 | 28 | steps: 29 | - uses: actions/checkout@v2 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v2 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Install apt dependencies 36 | run: | 37 | sudo apt-get update && sudo apt-get install -y curl autoconf automake libtool pkg-config default-jre 38 | 39 | - name: Cache restore libpostal 40 | id: cache-libpostal 41 | uses: actions/cache@v2 42 | with: 43 | path: | 44 | ${{ env.PREFIX }} 45 | ${{ env.LIBPOSTAL }} 46 | key: v1-libpostal-${{ runner.os }} 47 | 48 | - name: Install libpostal 49 | if: steps.cache-libpostal.outputs.cache-hit != 'true' 50 | run: | 51 | if test ! 
-f ${{ env.PREFIX }}/lib/libpostal.so ; then mkdir -p ${{ env.PREFIX }} ${{ env.LIBPOSTAL }} && 52 | git clone https://github.com/openvenues/libpostal ${{ env.LIBPOSTAL }} && cd ${{ env.LIBPOSTAL }} && 53 | ./bootstrap.sh && ./configure --prefix=${{ env.PREFIX }} && sudo make -j4 && sudo make install && cd - ; fi 54 | 55 | - name: Cache restore pip 56 | id: cache-pip 57 | uses: actions/cache@v2 58 | with: 59 | path: ~/.cache/pip 60 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements/python*') }} 61 | restore-keys: | 62 | ${{ runner.os }}-pip- 63 | 64 | - name: Install pip dependencies 65 | run: | 66 | export LIBRARY_PATH=${{ env.LIBRARY_PATH }} 67 | export LD_LIBRARY_PATH=${{ env.LD_LIBRARY_PATH }} 68 | export C_INCLUDE_PATH=${{ env.C_INCLUDE_PATH }} 69 | export CPP_INCLUDE_PATH=${{ env.CPP_INCLUDE_PATH }} 70 | python -m pip install --upgrade pip wheel setuptools 71 | pip install -r requirements/python-dev 72 | 73 | - name: Cache restore nltk data 74 | id: cache-models 75 | uses: actions/cache@v2 76 | with: 77 | path: ~/nltk_data 78 | key: v1-nltk-data 79 | 80 | - name: Download models and NLTK data 81 | run: | 82 | # Needed for stanford model 83 | python3 -c "import nltk; nltk.download('punkt')" 84 | # Needed for the TextBlob model 85 | python -m textblob.download_corpora 86 | # One of the possible spacy models, should 87 | ( python3 -c 'import spacy' && python -m spacy download en_core_web_sm ) || bash -c 'exit 0' 88 | ( python3 -c 'import spacy' && python -m spacy download en_core_web_trf ) || bash -c 'exit 0' 89 | ( python3 -c 'import spacy' && python -m spacy download de_core_news_sm ) || bash -c 'exit 0' 90 | ( python3 -c 'import spacy' && python -m spacy download fr_core_news_lg ) || bash -c 'exit 0' 91 | 92 | - name: Install package 93 | run: | 94 | echo "Installing package" 95 | pip install -e . 
96 | 97 | - name: Run tests 98 | run: | 99 | python3 tests/run.py 100 | 101 | - name: Coveralls 102 | env: 103 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 104 | COVERALLS_SERVICE_NAME: github-actions 105 | COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} 106 | run: | 107 | if python3 --version | grep -q "Python 3.9." ; then coveralls ; fi 108 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | .. NOTES FOR CREATING A RELEASE: 3 | .. 4 | .. * bump the version number in scrubadub/__init__.py 5 | .. * update docs/changelog.rst 6 | .. * git push 7 | .. * create a release https://github.com/LeapBeyond/scrubadub/releases 8 | .. * This should trigger a github action to upload to pypi 9 | .. * ReadTheDocs.io should see any changes and also rebuild the docs 10 | 11 | 12 | ********* 13 | scrubadub 14 | ********* 15 | 16 | Remove personally identifiable information from free text. Sometimes we have 17 | additional metadata about the people we wish to anonymize. Other times we don't. 18 | This package makes it easy to seamlessly scrub personal information from free 19 | text, without compromising the privacy of the people we are trying to protect. 20 | 21 | ``scrubadub`` currently supports removing: 22 | 23 | * Names 24 | * Email addresses 25 | * Addresses/Postal codes (US, GB, CA) 26 | * Credit card numbers 27 | * Dates of birth 28 | * URLs 29 | * Phone numbers 30 | * Username and password combinations 31 | * Skype/twitter usernames 32 | * Social security numbers (US and GB national insurance numbers) 33 | * Tax numbers (GB) 34 | * Driving licence numbers (GB) 35 | 36 | .. image:: https://img.shields.io/github/workflow/status/LeapBeyond/scrubadub/Python%20package/master 37 | :target: https://github.com/LeapBeyond/scrubadub/actions?query=workflow%3A%22Python+package%22+branch%3Amaster 38 | :alt: Build Status 39 | .. 
image:: https://img.shields.io/pypi/v/scrubadub.svg 40 | :target: https://pypi.org/project/scrubadub/ 41 | :alt: Version 42 | .. image:: https://img.shields.io/pypi/dm/scrubadub.svg 43 | :target: https://pypi.org/project/scrubadub/ 44 | :alt: Downloads 45 | .. image:: https://coveralls.io/repos/github/LeapBeyond/scrubadub/badge.svg?branch=master 46 | :target: https://coveralls.io/r/LeapBeyond/scrubadub 47 | :alt: Test Coverage 48 | .. image:: https://readthedocs.org/projects/scrubadub/badge/?version=latest 49 | :target: https://readthedocs.org/projects/scrubadub/?badge=latest 50 | :alt: Documentation Status 51 | 52 | 53 | Quick start 54 | ----------- 55 | 56 | Getting started with ``scrubadub`` is as easy as ``pip install scrubadub`` and 57 | incorporating it into your python scripts like this: 58 | 59 | .. code:: pycon 60 | 61 | >>> import scrubadub 62 | 63 | # My cat may be more tech-savvy than most, but he doesn't want other people to know it. 64 | >>> text = "My cat can be contacted on example@example.com, or 1800 555-5555" 65 | 66 | # Replaces the phone number and email address with anonymous IDs. 67 | >>> scrubadub.clean(text) 68 | 'My cat can be contacted on {{EMAIL}}, or {{PHONE}}' 69 | 70 | 71 | There are many ways to tailor the behavior of ``scrubadub`` using 72 | `different Detectors and PostProcessors `_. 73 | Scrubadub is highly configurable and supports localisation for different languages and regions. 74 | 75 | Installation 76 | ------------ 77 | 78 | To install scrubadub using pip, simply type:: 79 | 80 | pip install scrubadub 81 | 82 | There are several other packages that can optionally be installed to enable extra detectors. 83 | These are `scrubadub_address `_, `scrubadub_spacy `_ and `scrubadub_stanford `_; see the relevant documentation (`address detector documentation `_ and `name detector documentation `_) for more info on these as they require additional dependencies. 84 | This package requires at least python 3.6. 
85 | For python 2.7 or 3.5 support use v1.2.2 which is the last version with support for these versions. 86 | 87 | New maintainers 88 | --------------- 89 | 90 | `LeapBeyond `_ are excited to be supporting scrubadub with ongoing maintenance and development. 91 | Thanks to all of the contributors who made this package a success, but especially `@deanmalmgren `_, `IDEO `_ and `Datascope `_. 92 | -------------------------------------------------------------------------------- /scrubadub/detectors/user_supplied.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from scrubadub.detectors.catalogue import register_detector 4 | from .. import filth as filth_module 5 | from ..filth.base import Filth 6 | from .tagged import TaggedEvaluationFilthDetector 7 | 8 | 9 | @register_detector 10 | class UserSuppliedFilthDetector(TaggedEvaluationFilthDetector): 11 | """Use this ``Detector`` to find some known filth in the text. An example might be if you have a list of employee 12 | numbers that you wish to remove from a document, as shown below: 13 | 14 | >>> import scrubadub 15 | >>> scrubber = scrubadub.Scrubber(detector_list=[ 16 | ... scrubadub.detectors.UserSuppliedFilthDetector([ 17 | ... {'match': 'Anika', 'filth_type': 'name'}, 18 | ... {'match': 'Larry', 'filth_type': 'name'}, 19 | ... ]), 20 | ... ]) 21 | >>> scrubber.clean("Anika is my favourite employee.") 22 | '{{NAME}} is my favourite employee.' 23 | 24 | This detector takes a list of dictonaires (reffered to as known filth items). These specify what to look for in 25 | the text to label as tagged filth. The dictionary should contain the following keys: 26 | 27 | * ``match`` (`str`) - a string value that will be searched for in the text 28 | * ``filth_type`` (`str`) - a string value that indicates the type of Filth, should be set to ``Filth.name``. 29 | An example of these could be 'name' or 'phone' for name and phone filths respectively. 
30 | 31 | The known filth item dictionary may also optionally contain: 32 | 33 | * ``match_end`` (`str`) - if specified will search for Filth starting with the value of match and ending with 34 | the value of ``match_end`` 35 | * ``limit`` (`int`) - an integer describing the maximum number of characters between match and match_end, 36 | defaults to 150 37 | * ``ignore_case`` (`bool`) - Ignore case when searching for the tagged filth 38 | * ``ignore_whitespace`` (`bool`) - Ignore whitespace when matching ("asd qwe" can also match "asd\\\\nqwe") 39 | * ``ignore_partial_word_matches`` (`bool`) - Ignore matches that are only partial words (if you're looking 40 | for "Eve", this flag ensure it wont match "Evening") 41 | 42 | Examples of this: 43 | 44 | * ``{'match': 'aaa', 'filth_type': 'name'}`` - will search for an exact match to aaa and return it as a 45 | ``NameFilth`` 46 | * ``{'match': 'aaa', 'match_end': 'zzz', 'filth_type': 'name'}`` - will search for `aaa` followed by up to 150 47 | characters followed by `zzz`, which would match both `aaabbbzzz` and `aaazzz`. 48 | * ``{'match': '012345', 'filth_type': 'phone', 'ignore_partial_word_matches': True}`` - will search for an 49 | exact match to 012345, ignoring any partial matches and return it as a ``PhoneFilth`` 50 | 51 | This detector is not enabled by default (since you need to supply a list of known filths) and so you must always 52 | add it to your scrubber with a ``scrubber.add_detector(detector)`` call or by adding it to the ``detector_list`` 53 | inialising a ``Scrubber``. 
@register_detector
class SkypeDetector(RegexDetector):
    """Skype usernames tend to be used inline in dirty dirty text quite
    often but also appear as ``skype: {{SKYPE}}`` quite a bit. This method
    looks at words within ``word_radius`` words of "skype" for things that
    appear to be misspelled or have punctuation in them as a means to
    identify skype usernames.

    Default ``word_radius`` is 10, corresponding with the rough scale of
    half of a sentence before or after the word "skype" is used. Increasing
    the ``word_radius`` will increase the false positive rate and
    decreasing the ``word_radius`` will increase the false negative rate.
    """
    filth_cls = SkypeFilth
    name = 'skype'
    autoload = False

    word_radius = 10

    # these two regular expressions are used to validate a skype usernames.
    # _TOKEN is the core regular expression that is used to chunk text into
    # tokens to make sure all valid skype usernames are considered the same
    # token. Importantly, the word "skype" must pass the _SKYPE regex.
    # SKYPE_TOKEN is used to tokenize text and SKYPE_USERNAME is the same thing
    # but with the 6-32 character limit imposed on the username. adapted from
    # http://bit.ly/1FQs1hD
    _SKYPE = r'[a-zA-Z][a-zA-Z0-9_\-\,\.]'
    SKYPE_TOKEN = _SKYPE + '+'
    SKYPE_USERNAME = re.compile(_SKYPE+'{5,31}')

    def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """

        # find 'skype' in the text using a customized tokenizer. this makes
        # sure that all valid skype usernames are kept as tokens and not split
        # into different words
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(
            self.SKYPE_TOKEN
        )
        blob = textblob.TextBlob(text, tokenizer=tokenizer)
        tokens = list(blob.tokens)
        skype_indices = [
            i for i, token in enumerate(tokens) if 'skype' in token.lower()
        ]

        # go through the words before and after skype words to identify
        # potential skype usernames.
        skype_usernames = set()
        for i in skype_indices:
            jmin = max(i - self.word_radius, 0)
            jmax = min(i + self.word_radius + 1, len(tokens))
            for j in range(jmin, jmax):
                if j == i:
                    continue
                token = tokens[j]
                if token in skype_usernames or not self.SKYPE_USERNAME.match(token):
                    continue

                # this token is a valid skype username. Most skype
                # usernames appear to be misspelled words. Word.spellcheck
                # does not handle the situation of an all caps word very
                # well, so we spellcheck a lower case copy of those. The
                # token itself is stored unchanged, because the regex built
                # from it has to match the text exactly as it was written.
                candidate = token.lower() if token.isupper() else token
                suggestions = textblob.Word(candidate).spellcheck()
                corrected_word, score = suggestions[0]
                if score < 0.5:
                    skype_usernames.add(token)

        # replace all skype usernames
        if skype_usernames:
            # usernames may contain '.', so escape them to be matched
            # literally; longest first so that a username which is a prefix
            # of another cannot win the alternation
            ordered = sorted(skype_usernames, key=lambda username: (-len(username), username))
            self.regex = re.compile('|'.join(re.escape(str(username)) for username in ordered))
            yield from super(SkypeDetector, self).iter_filth(text, document_name=document_name)

        return
TextBlobNameDetector 12 | scrubadub.detectors.catalogue.register_detector(TextBlobNameDetector, autoload=True) 13 | 14 | def test_scrubadub_clean(self): 15 | """test old scrubadub API""" 16 | text = u"John is a cat" 17 | self.assertEqual( 18 | scrubadub.clean(text), 19 | "{{NAME}} is a cat", 20 | ) 21 | 22 | scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup() 23 | with warnings.catch_warnings(record=True) as warning_context: 24 | warnings.simplefilter("always") 25 | try: 26 | self.assertEqual( 27 | scrubadub.clean(text, replace_with='identifier'), 28 | "{{NAME-0}} is a cat", 29 | ) 30 | finally: 31 | warnings.simplefilter("default") 32 | self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0) 33 | 34 | 35 | scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup() 36 | with warnings.catch_warnings(record=True) as warning_context: 37 | warnings.simplefilter("always") 38 | try: 39 | self.assertEqual( 40 | scrubadub.clean("John spoke with Doug.", replace_with='identifier'), 41 | "{{NAME-0}} spoke with {{NAME-1}}.", 42 | ) 43 | finally: 44 | warnings.simplefilter("default") 45 | self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0) 46 | 47 | scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup() 48 | 49 | def test_scrubber_clean(self): 50 | """test older scrubber API""" 51 | scrubber = scrubadub.Scrubber() 52 | scrubber.remove_detector('email') 53 | text = "contact Joe Duffy at joe@example.com" 54 | self.assertEqual( 55 | scrubadub.clean(text), 56 | "contact {{NAME}} {{NAME}} at {{EMAIL}}", 57 | ) 58 | 59 | def test_filth_class(self): 60 | class MyFilth(scrubadub.filth.Filth): 61 | type = 'mine' 62 | 63 | class MyDetector(scrubadub.detectors.Detector): 64 | filth_cls = MyFilth 65 | 66 | def iter_filth(self, text, **kwargs): 67 | yield MyFilth(beg=0, end=8, text='My stuff', **kwargs) 68 | 69 | scrubber = scrubadub.Scrubber() 70 | # TODO: Add depreciation warning 71 | 
scrubber.add_detector(MyDetector) 72 | text = "My stuff can be found there." 73 | 74 | self.assertEqual( 75 | scrubber.clean(text), 76 | "{{MINE}} can be found there.", 77 | ) 78 | 79 | def test_filth_markers(self): 80 | prefix = scrubadub.filth.base.Filth.prefix 81 | suffix = scrubadub.filth.base.Filth.suffix 82 | scrubadub.filth.base.Filth.prefix = '' 83 | scrubadub.filth.base.Filth.suffix = '' 84 | 85 | scrubber = scrubadub.Scrubber() 86 | 87 | with warnings.catch_warnings(record=True) as warning_context: 88 | warnings.simplefilter("always") 89 | try: 90 | self.assertEqual( 91 | scrubber.clean("contact Joe Duffy at joe@example.com"), 92 | "contact NAME NAME at EMAIL", 93 | ) 94 | finally: 95 | warnings.simplefilter("default") 96 | # Ensure that this is reset, no matter what happens above 97 | scrubadub.filth.base.Filth.prefix = prefix 98 | scrubadub.filth.base.Filth.suffix = suffix 99 | self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0) 100 | 101 | def test_regex_filth(self): 102 | """Test for a DeprecationWarning when using RegexFilth.""" 103 | with warnings.catch_warnings(record=True) as warning_context: 104 | warnings.simplefilter("always") 105 | try: 106 | scrubadub.filth.RegexFilth(0, 2, 'ab') 107 | finally: 108 | warnings.simplefilter("default") 109 | self.assertEqual(sum(issubclass(w.category, DeprecationWarning) for w in warning_context), 1) 110 | 111 | def tearDown(self) -> None: 112 | from scrubadub.detectors.text_blob import TextBlobNameDetector 113 | scrubadub.detectors.catalogue.remove_detector(TextBlobNameDetector) 114 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Any and all contributions are welcome and appreciated. 
To make it easy 2 | to keep things organized, this project uses the 3 | [general guidelines](https://help.github.com/articles/using-pull-requests) 4 | for the fork-branch-pull request model for github. Briefly, this means: 5 | 6 | 1. Make sure your fork's `master` branch is up to date: 7 | 8 | git remote add LeapBeyond https://github.com/LeapBeyond/scrubadub.git 9 | git checkout master 10 | git pull LeapBeyond master 11 | 12 | 2. Start a feature branch with a descriptive name about what you're trying 13 | to accomplish: 14 | 15 | git checkout -b italian-name-fix 16 | 17 | 3. Make commits to this feature branch (`italian-name-fix`, in this case) 18 | in a way that other people can understand with good commit messages 19 | to explain the changes you've made: 20 | 21 | emacs scrubadub/__init__.py 22 | git add scrubadub/__init__.py 23 | git commit -m 'added italian name fix' 24 | 25 | 4. If an issue already exists for the code you're contributing, use 26 | [issue2pr](http://issue2pr.herokuapp.com/) to attach your code to that issue: 27 | 28 | git push origin italian-name-fix 29 | chrome http://issue2pr.herokuapp.com 30 | # enter the issue URL, HEAD=yourusername:italian-name-fix, Base=master 31 | 32 | If the issue doesn't already exist, just send a pull request in the 33 | usual way: 34 | 35 | git push origin italian-name-fix 36 | chrome http://github.com/LeapBeyond/scrubadub/compare 37 | 38 | 39 | Style guidelines 40 | ---------------- 41 | 42 | As a general rule of thumb, the goal of this package is to be as 43 | readable as possible to make it easy for novices and experts alike to 44 | contribute to the source code in meaningful ways. Pull requests that 45 | favor cleverness or optimization over readability are less likely to be 46 | incorporated.
47 | 48 | To make this notion of "readability" more concrete, here are a few 49 | stylistic guidelines that are inspired by other projects and we 50 | generally recommend: 51 | 52 | - write functions and methods that can fit on a screen or two of a 53 | standard 54 | terminal 55 | --- no more than approximately 40 lines. 56 | 57 | - unless it makes code less readable, adhere to 58 | [PEP 8](https://peps.python.org/pep-0008/) style 59 | recommendations --- use an appropriate amount of whitespace. This 60 | is enforced in the test suite. 61 | 62 | - code comments should be about *what* and *why* is being done, not *how* it is 63 | being done --- 64 | that should be self-evident from the code itself. 65 | 66 | 67 | Common contributions: Removing a new type of filth 68 | -------------------------------------------------- 69 | 70 | This project has really taken off, much more so than I would have thought 71 | (thanks everybody!). One very common contribution is adding a new type of filth 72 | that should be removed by `scrubadub`. To make it as easy as possible to add 73 | these types of contributions, I thought I'd jot down a few notes about how to 74 | add a new type of filth, for example, addresses. 75 | 76 | * Create an appropriately named python file in `scrubadub/filth/` and write a 77 | new `Filth` class that inherits from `scrubadub.filth.base.Filth`. In this 78 | case, perhaps you'd create an `AddressFilth` class in 79 | `scrubadub/filth/address.py` 80 | 81 | * Add your new type of `Filth` to the `scrubadub.filth` namespace by importing 82 | it in `scrubadub/filth/__init__.py` 83 | 84 | * Create an appropriately named python file in `scrubadub/detectors/` and write 85 | a new `Detector` class that inherits from 86 | `scrubadub.detectors.base.Detector`. In this case, perhaps you'd create an 87 | `AddressDetector` class in `scrubadub/detectors/address.py`.
class AddressFilth(Filth):
    """Filth type for addresses, with helpers that vary generated examples."""
    type = 'address'

    @staticmethod
    def _randomise_seperators(address: str) -> str:
        """Randomly rewrite the ``', '`` and newline separators in ``address``.

        One of five strategies is picked: all commas, all newlines, all
        spaces, an independent random pick per separator, or no change.
        """
        mode = random.choice(["comma", "newline", "mixed", "spaces", "no_change"])
        if mode == "comma":
            return address.replace('\n', ', ')
        if mode == "newline":
            return address.replace(', ', '\n')
        if mode == "spaces":
            return address.replace(', ', ' ').replace('\n', ' ')
        if mode == "mixed":
            separator_for = {"comma": ', ', "newline": '\n', "spaces": ' '}
            # Normalise every separator to a placeholder, then swap the
            # placeholders out one at a time, left to right.
            address = address.replace(', ', '{{SEP}}').replace('\n', '{{SEP}}')
            while '{{SEP}}' in address:
                picked = random.choice(["comma", "newline", "spaces"])
                address = address.replace('{{SEP}}', separator_for[picked], 1)
            return address
        return address

    @staticmethod
    def _randomise_street_number(address: str) -> str:
        """Occasionally drop the leading street number or append a letter to it.

        The address is returned untouched when the first word of the first
        line is not an integer.
        """
        action = random.choice(["remove", "add_letter", "no_change", "no_change", "no_change", "no_change"])
        if action not in ("remove", "add_letter"):
            return address

        lines = address.split('\n')
        words = lines[0].split(' ')
        try:
            int(words[0])
        except ValueError:
            return address

        if action == "remove":
            first_line = " ".join(words[1:])
        else:
            lettered_number = words[0] + random.choice(string.ascii_letters)
            first_line = " ".join([lettered_number] + words[1:])
        return "\n".join([first_line] + lines[1:])

    @staticmethod
    def _randomise_postcode(address: str) -> str:
        """Occasionally remove or lower-case the last line of ``address``."""
        action = random.choice(["remove", "lower", "no_change", "no_change", "no_change"])
        lines = address.split('\n')
        if action == "remove":
            return "\n".join(lines[:-1])
        if action == "lower":
            return "\n".join(lines[:-1] + [lines[-1].lower()])
        return address

    @staticmethod
    def _randomise_country(address: str) -> str:
        """Occasionally append a British country name, sometimes upper-cased."""
        action = random.choice(["country", "upper_country", "no_change", "no_change", "no_change"])
        if "country" not in action:
            return address
        country = random.choice(['United Kingdom', 'Britain', 'England', 'Scotland', 'Wales', 'Cymru', 'GB'])
        if "upper" in action:
            country = country.upper()
        return address + "\n" + country

    @staticmethod
    def _randomise_building(address: str, faker: Faker) -> str:
        """Occasionally prepend a building line made from a fake last name."""
        action = random.choice(["add_building", "no_change", "no_change", "no_change"])
        if action != "add_building":
            return address
        # A coin flip decides whether the name precedes or follows the
        # building kind ("Smith House" versus "House Smith").
        if bool(random.getrandbits(1)):
            building = " ".join([faker.last_name(), random.choice(["Building", "House", "Block"])])
        else:
            building = " ".join([random.choice(["Building", "House", "Block"]), faker.last_name()])
        return building + "\n" + address

    @staticmethod
    def _randomise_case(address: str) -> str:
        """Upper-case ``address`` for draws >= 0.9 and lower-case it for draws in [0.8, 0.9)."""
        draw = random.random()
        if draw >= 0.9:
            return address.upper()
        if draw >= 0.8:
            return address.lower()
        return address

    @staticmethod
    def generate(faker: Faker) -> str:
        """Generates an example of this ``Filth`` type, usually using the faker python library.

        The street number and country variations are only applied when
        ``faker.locales`` is exactly ``['en_GB']``.

        :param faker: The ``Faker`` class from the ``faker`` library
        :type faker: Faker
        :return: An example of this ``Filth``
        :rtype: str
        """
        address = faker.address()
        is_british = faker.locales == ['en_GB']

        if is_british:
            address = AddressFilth._randomise_street_number(address)
        address = AddressFilth._randomise_building(address, faker)
        address = AddressFilth._randomise_postcode(address)
        if is_british:
            address = AddressFilth._randomise_country(address)
        address = AddressFilth._randomise_seperators(address)
        return AddressFilth._randomise_case(address)
11 | This means that a ``Detector`` that needs to know how ``Filth`` (such as a phone number) is formatted in your 12 | region will be able to look for ``Filth`` in that specific format. 13 | Other detectors that use machine learning models to identify entities in the text will be able to use models 14 | corresponding to the correct language or location. 15 | 16 | To set your locale you can use the standard format ``xx_YY``, where ``xx`` is a 17 | lower-case `language code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_ 18 | and ``YY`` is an upper-case `country code <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_. 19 | Examples of this include ``en_CA`` (Canadian English), ``fr_CA`` (Canadian French) and ``de_AT`` (Austrian German). 20 | These locales can be set by passing them directly to one of the functions in the ``scrubadub`` module or to a ``Scrubber`` instance: 21 | 22 | .. code:: pycon 23 | 24 | >>> import scrubadub 25 | >>> scrubadub.clean('My US number is 731-938-1630', locale='en_US') 26 | 'My US number is {{PHONE}}' 27 | >>> scrubadub.clean('My US number is 731-938-1630', locale='en_GB') 28 | 'My US number is 731-938-1630' 29 | >>> scrubadub.clean('My GB number is 0121 496 0112', locale='en_GB') 30 | 'My GB number is {{PHONE}}' 31 | >>> scrubadub.clean('My GB number is 0121 496 0112', locale='en_US') 32 | 'My GB number is 0121 496 0112' 33 | >>> scrubber = scrubadub.Scrubber(locale='de_DE') 34 | >>> scrubber.clean('Meine Telefonnummer ist 05086 63680') 35 | 'Meine Telefonnummer ist {{PHONE}}' 36 | 37 | Below is a summary of the supported countries and regions of the various detectors in scrubadub. 38 | 39 | * `AddressDetector`: supports Canadian, American and British addresses 40 | * `PhoneDetector`: supports most regions via `libphonenumber <https://github.com/google/libphonenumber>`_ 41 | * `PostalCodeDetector`: only supports British postcodes 42 | * `SpacyEntityDetector`: supports a wide range of languages; check the `spacy documentation <https://spacy.io/usage/models>`_ for the full list of supported languages.
43 | * `StanfordEntityDetector`: only supports English in scrubadub, but the models support more languages (es, fr, de, zh). 44 | 45 | This is just the start of the localisation, so if you want to add more languages or features we're keen to hear from you! 46 | Other detectors are location/language independent (e.g. email addresses or twitter usernames) or do not support localisation. 47 | 48 | Creating a localized detector 49 | ----------------------------- 50 | 51 | To create a detector that is localised, the process is identical to creating a normal detector 52 | (as shown in :ref:`create-detector`), but with one addition: a ``supported_locale()`` function. 53 | If this function is not defined it is assumed that this ``Detector`` does not need 54 | localization. 55 | An example of a ``Detector`` that does not need localization is the email detector, 56 | as emails follow the same format no matter where you live and what language you speak. 57 | On the other hand, the format of a phone number can vary significantly depending on the region. 58 | 59 | Below is an example of a detector that detects employee names for a very small, but international company. 60 | There is one German employee, `Walther`, and one US employee, `Georgina`. 61 | When the document is German we will remove `Walther` and when the document is American we will remove `Georgina`. 62 | 63 | The ``supported_locale()`` function should return ``True`` if the passed locale is supported and ``False`` if it is not supported. 64 | If ``supported_locale()`` returns ``False`` then the ``Scrubber`` will emit a warning and not add or run that ``Detector`` on the documents passed to it. 65 | The ``Detector.locale_split(locale)`` function can be used to split the locale into the language and region. 66 | 67 | Below is the full example: 68 | 69 | .. code:: pycon 70 | 71 | >>> import scrubadub, re 72 | 73 | >>> class EmployeeNameFilth(scrubadub.filth.Filth): 74 | ...
type = 'employee_name' 75 | 76 | >>> class EmployeeDetector(scrubadub.detectors.Detector): 77 | ... name = 'employee_detector' 78 | ... 79 | ... def __init__(self, *args, **kwargs): 80 | ... super(EmployeeDetector, self).__init__(*args, **kwargs) 81 | ... self.employees = {'DE': ['Walther'], 'US': ['Georgina'] } 82 | ... self.regex = re.compile('|'.join(self.employees[self.region])) 83 | ... 84 | ... @classmethod 85 | ... def supported_locale(cls, locale): 86 | ... language, region = cls.locale_split(locale) 87 | ... return region in ['DE', 'US'] 88 | ... 89 | ... def iter_filth(self, text, document_name=None): 90 | ... for match in self.regex.finditer(text): 91 | ... yield EmployeeNameFilth(match=match, detector_name=self.name, document_name=document_name, locale=self.locale) 92 | ... 93 | >>> us_scrubber = scrubadub.Scrubber(detector_list=[EmployeeDetector], locale='en_US') 94 | >>> us_scrubber.clean('Jane spoke with Georgina') 95 | 'Jane spoke with {{EMPLOYEE_NAME}}' 96 | >>> de_scrubber = scrubadub.Scrubber(detector_list=[EmployeeDetector], locale='de_DE') 97 | >>> de_scrubber.clean('Jane spoke with Georgina') 98 | 'Jane spoke with Georgina' 99 | >>> de_scrubber.clean('Luigi spoke with Walther') 100 | 'Luigi spoke with {{EMPLOYEE_NAME}}' 101 | -------------------------------------------------------------------------------- /scrubadub/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Union, List, Dict, Sequence, Optional 3 | 4 | # convenient imports 5 | from .scrubbers import Scrubber 6 | from . import filth 7 | from . import detectors 8 | from . 
__version__ = VERSION = "2.0.0"
__all__ = [
    'Scrubber', 'filth', 'detectors', 'post_processors', 'clean', 'clean_documents', 'list_filth',
    'list_filth_documents',
]


def clean(text: str, locale: Optional[str] = None, **kwargs) -> str:
    """Search ``text`` for ``Filth`` and replace it with placeholders.

    A new ``Scrubber`` is created for ``locale`` on every call; ``kwargs`` are
    forwarded to ``Scrubber.clean``.

    .. code:: pycon

        >>> import scrubadub
        >>> scrubadub.clean(u"contact me at joe@example.com")
        'contact me at {{EMAIL}}'

    :param text: The text containing possible PII that needs to be redacted
    :type text: `str`
    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                   underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
    :type locale: str
    :return: Text with all :class:``Filth`` replaced.
    :rtype: `str`

    """
    return Scrubber(locale=locale).clean(text, **kwargs)


def clean_documents(documents: Union[Sequence[str], Dict[Optional[str], str]], locale: Optional[str] = None, **kwargs
                    ) -> Union[Sequence[str], Dict[Optional[str], str]]:
    """Search ``documents`` for ``Filth`` and replace it with placeholders.

    ``documents`` is either a dict in the format ``{'document_name': 'document'}`` or a list of strings
    (each a separate document). This can be useful when processing many documents. ``kwargs`` are forwarded
    to ``Scrubber.clean_documents``.

    .. code:: pycon

        >>> import scrubadub
        >>> scrubadub.clean_documents({'contact.txt': "contact me at joe@example.com",
        ...     'hello.txt': 'hello world!'})
        {'contact.txt': 'contact me at {{EMAIL}}', 'hello.txt': 'hello world!'}

        >>> scrubadub.clean_documents(["contact me at joe@example.com", 'hello world!'])
        ['contact me at {{EMAIL}}', 'hello world!']

    :param documents: Documents containing possible PII that needs to be redacted in the form of a list of documents
        or a dictionary with the key as the document name and the value as the document text
    :type documents: `list` of `str` objects, `dict` of `str` objects
    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                   underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
    :type locale: str
    :return: Documents in the same format as input, but with `Filth` redacted
    :rtype: `list` of `str` objects, `dict` of `str` objects; same as input
    """
    return Scrubber(locale=locale).clean_documents(documents, **kwargs)


def list_filth(text: str, locale: Optional[str] = None, **kwargs) -> List[Filth]:
    """Return a list of the ``Filth`` that was detected in the string ``text``.

    ``kwargs`` are forwarded to ``Scrubber.iter_filth`` and its results are collected into a list.

    .. code:: pycon

        >>> import scrubadub
        >>> [filth.text for filth in scrubadub.list_filth(u"contact me at joe@example.com")]
        ['joe@example.com']

    :param text: The text containing possible PII that needs to be found
    :type text: `str`
    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                   underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
    :type locale: str
    :return: A list of all the :class:``Filth`` objects that were found
    :rtype: `list` of :class:``Filth`` objects

    """
    found = Scrubber(locale=locale).iter_filth(text, **kwargs)
    return list(found)


def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], locale: Optional[str] = None,
                         **kwargs) -> List[Filth]:
    """Return a list of the ``Filth`` that was detected in ``documents``.

    ``documents`` is either a dict in the format ``{'document_name': 'document'}`` or a list of strings
    (each a separate document). This can be useful when processing many documents. ``kwargs`` are forwarded
    to ``Scrubber.iter_filth_documents`` and its results are collected into a list.

    .. code:: pycon

        >>> import scrubadub
        >>> filths = scrubadub.list_filth_documents(
        ...     {'contact.txt': "contact me at joe@example.com", 'hello.txt': 'hello world!'}
        ... )
        >>> [filth.text for filth in filths]
        ['joe@example.com']

        >>> filths = scrubadub.list_filth_documents(["contact me at joe@example.com", 'hello world!'])
        >>> [filth.text for filth in filths]
        ['joe@example.com']

    :param documents: Documents containing possible PII that needs to be found in the form of a list of documents
        or a dictionary with the key as the document name and the value as the document text
    :type documents: `list` of `str` objects, `dict` of `str` objects
    :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                   underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
    :type locale: str
    :return: A list of all the :class:``Filth`` objects that were found
    :rtype: `list` of :class:``Filth`` objects

    """
    found = Scrubber(locale=locale).iter_filth_documents(documents, **kwargs)
    return list(found)
FilthReplacer(separator='::') 35 | self.assertEqual( 36 | filth_replacer.filth_label(merged), 37 | 'EMAIL::TEST_TYPE' 38 | ) 39 | 40 | filth_replacer = FilthReplacer() 41 | TestFilth.type = "other_test_type" 42 | 43 | self.assertEqual( 44 | filth_replacer.filth_label(TestFilth(0, 1, 'a')), 45 | 'OTHER_TEST_TYPE' 46 | ) 47 | 48 | self.assertEqual( 49 | filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 50 | 'EMAIL' 51 | ) 52 | 53 | filth_replacer = FilthReplacer(include_count=True) 54 | filth_replacer.reset_lookup() 55 | self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'EMAIL-0') 56 | self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'b')), 'EMAIL-1') 57 | self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'EMAIL-0') 58 | self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'c')), 'EMAIL-2') 59 | 60 | filth_replacer = FilthReplacer(uppercase=False) 61 | self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'email') 62 | 63 | def test_process_filths(self): 64 | """Test that the process_filths behaves as expected""" 65 | class TestFilth(Filth): 66 | type = 'test_type' 67 | 68 | filths = [ 69 | MergedFilth(EmailFilth(0, 2, 'ab'), TestFilth(1, 2, 'b')), 70 | EmailFilth(5, 6, 'c') 71 | ] 72 | 73 | post_processor = FilthReplacer() 74 | filths = post_processor.process_filth(filths) 75 | 76 | self.assertEqual(filths[0].replacement_string, 'EMAIL+TEST_TYPE') 77 | self.assertEqual(filths[1].replacement_string, 'EMAIL') 78 | 79 | def test_hashing(self): 80 | post_proc = FilthReplacer() 81 | self.assertTrue(post_proc.hash_salt is not None) 82 | self.assertIsInstance(post_proc.hash_salt, bytes) 83 | self.assertGreater(len(post_proc.hash_salt), 0) 84 | 85 | filths = [EmailFilth(0, 19, 'example@example.com')] 86 | self.assertEqual(filths[0].replacement_string, None) 87 | 88 | post_proc = FilthReplacer(hash_salt='example', include_type=True, include_hash=True) 89 | filths = post_proc.process_filth(filths) 90 | 
self.assertEqual(filths[0].replacement_string, 'EMAIL-42FFCB267F8C5E6D') 91 | 92 | post_proc = FilthReplacer(hash_salt='example', include_type=True, include_count=True, include_hash=True) 93 | post_proc.reset_lookup() 94 | filths = post_proc.process_filth(filths) 95 | self.assertEqual(filths[0].replacement_string, 'EMAIL-0-42FFCB267F8C5E6D') 96 | 97 | post_proc = FilthReplacer(hash_salt='example', include_type=False, include_hash=True) 98 | filths = post_proc.process_filth(filths) 99 | self.assertEqual(filths[0].replacement_string, '42FFCB267F8C5E6D') 100 | 101 | post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, include_hash=True) 102 | filths = post_proc.process_filth(filths) 103 | self.assertEqual(filths[0].replacement_string, '87BB6F7ED5FE49C4') 104 | 105 | post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, hash_length=10, include_hash=True) 106 | filths = post_proc.process_filth(filths) 107 | self.assertEqual(filths[0].replacement_string, '87BB6F7ED5') 108 | self.assertEqual(len(filths[0].replacement_string), 10) 109 | 110 | post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, hash_length=50, include_hash=True) 111 | filths = post_proc.process_filth(filths) 112 | self.assertEqual(filths[0].replacement_string, '87BB6F7ED5FE49C4EA43D95A41F843D4FBB66D15C5AA41A7F7') 113 | self.assertEqual(len(filths[0].replacement_string), 50) 114 | 115 | def test_bad_filth(self): 116 | """Test making labels from a filth without a type""" 117 | class TestFilth(Filth): 118 | type = None 119 | 120 | filth_replacer = FilthReplacer() 121 | self.assertEqual( 122 | filth_replacer.filth_label(TestFilth(0, 1, 'a')), 123 | '' 124 | ) 125 | 126 | def test_tagged_filth(self): 127 | """Test making labels from a tagged filth""" 128 | filth_replacer = FilthReplacer() 129 | self.assertEqual( 130 | filth_replacer.filth_label(scrubadub.filth.TaggedEvaluationFilth(0, 1, 'a', comparison_type='phone')), 131 | 'TAGGED_PHONE' 132 | ) 133 | 
134 | def test_all_disabled(self): 135 | """Test making labels when everything is disabled""" 136 | filth_replacer = FilthReplacer(include_type=False, include_hash=False, include_count=False) 137 | self.assertEqual( 138 | filth_replacer.filth_label(scrubadub.filth.TaggedEvaluationFilth(0, 1, 'a', comparison_type='phone')), 139 | 'FILTH' 140 | ) 141 | 142 | def tearDown(self) -> None: 143 | FilthReplacer.reset_lookup() -------------------------------------------------------------------------------- /tests/test_filth.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unittest 3 | 4 | from scrubadub.filth import Filth, MergedFilth 5 | from scrubadub.exceptions import InvalidReplaceWith, FilthMergeError 6 | 7 | class FilthTestCase(unittest.TestCase): 8 | 9 | def test_disallowed_replace_with(self): 10 | """replace_with should fail gracefully""" 11 | filth = Filth(beg=0, end=3, text='asd') 12 | with self.assertRaises(InvalidReplaceWith): 13 | filth.replace_with('surrogate') 14 | with self.assertRaises(InvalidReplaceWith): 15 | filth.replace_with('something_invalid') 16 | 17 | def test_nonoverlapping_filth(self): 18 | """can't merge non-overlapping filth""" 19 | a_filth = Filth(beg=0, end=3, text="the") 20 | b_filth = Filth(beg=4, end=7, text="end") 21 | with self.assertRaises(FilthMergeError): 22 | a_filth.merge(b_filth) 23 | with self.assertRaises(FilthMergeError): 24 | b_filth.merge(a_filth) 25 | 26 | def test_text_merge(self): 27 | """make sure text length is correct""" 28 | class SomeFilth(Filth): 29 | type = 'something' 30 | 31 | text = "the end" 32 | a_filth = SomeFilth(beg=0, end=3, text=text[:3]) 33 | b_filth = SomeFilth(beg=1, end=7, text=text[1:]) 34 | 35 | c_filth = a_filth.merge(b_filth) 36 | self.assertEqual(c_filth.text, text) 37 | 38 | c_filth = b_filth.merge(a_filth) 39 | self.assertEqual(c_filth.text, text) 40 | 41 | d_filth = c_filth.merge(a_filth) 42 | self.assertEqual(d_filth.text, text) 43 | 
44 | b_filth.end = 2 45 | with self.assertRaises(FilthMergeError): 46 | b_filth.merge(a_filth) 47 | 48 | def test_invalid_merge_documents(self): 49 | """Ensure Filth in two different documents cant be merged""" 50 | filth_a = Filth(0, 2, text='aa', document_name='one') 51 | filth_b = Filth(1, 2, text='a', document_name='two') 52 | 53 | with self.assertRaises(FilthMergeError): 54 | filth_a.merge(filth_b) 55 | 56 | with self.assertRaises(FilthMergeError): 57 | filth_b.merge(filth_a) 58 | 59 | def test_filth_string(self): 60 | """Test the Filth to string function""" 61 | 62 | filth = Filth(beg=0, end=5) 63 | self.assertEqual(str(filth), "") 64 | 65 | filth = Filth(beg=0, end=5) 66 | self.assertEqual(filth.__repr__(), "") 67 | 68 | filth = Filth(beg=0, end=5) 69 | self.assertEqual(filth._to_string(), "") 70 | 71 | filth = Filth(beg=0, end=5, text='hello') 72 | self.assertEqual(str(filth), "") 73 | 74 | filth = Filth(beg=0, end=5, text='hello', document_name='hello.txt') 75 | self.assertEqual(str(filth), "") 76 | 77 | filth = Filth(beg=0, end=5, text='hello', document_name='hello.txt') 78 | self.assertEqual(filth._to_string(attributes=['text']), "") 79 | self.assertEqual(filth._to_string(attributes=['beg', 'end', 'text']), "") 80 | self.assertEqual( 81 | filth._to_string(attributes=['text', 'document_name']), 82 | "" 83 | ) 84 | 85 | def test_merged_to_string(self): 86 | """Test the MergedFilth to string""" 87 | class TestFilth(Filth): 88 | type = 'test_filth' 89 | 90 | merged = MergedFilth(TestFilth(0, 2, 'ab'), Filth(1, 2, 'b')) 91 | self.assertEqual(merged.__repr__(), ", ]>") 92 | 93 | def test_equality(self): 94 | """Test the filth equality function""" 95 | self.assertTrue( 96 | Filth(beg=0, end=5, text='hello') == 97 | Filth(beg=0, end=5, text='hello') 98 | ) 99 | self.assertTrue( 100 | Filth(beg=0, end=5, text='hello') == 101 | Filth(beg=0, end=5, text='hello', match=re.match('123', '1234')) 102 | ) 103 | 104 | self.assertTrue( 105 | Filth(beg=0, end=5, 
text='hello') != 106 | Filth(beg=1, end=5, text='hello') 107 | ) 108 | self.assertTrue( 109 | Filth(beg=0, end=5, text='hello') != 110 | Filth(beg=0, end=6, text='hello') 111 | ) 112 | self.assertTrue( 113 | Filth(beg=0, end=5, text='hello') != 114 | Filth(beg=0, end=5, text='hellou') 115 | ) 116 | 117 | self.assertTrue( 118 | Filth(beg=0, end=5, text='hello', document_name='test') == 119 | Filth(beg=0, end=5, text='hello', document_name='test') 120 | ) 121 | self.assertTrue( 122 | Filth(beg=0, end=5, text='hello') != 123 | Filth(beg=0, end=5, text='hello', document_name='test') 124 | ) 125 | self.assertTrue( 126 | Filth(beg=0, end=5, text='hello', document_name='test') != 127 | Filth(beg=0, end=5, text='hello') 128 | ) 129 | self.assertTrue( 130 | Filth(beg=0, end=5, text='hello', document_name='test') != 131 | Filth(beg=0, end=5, text='hello', document_name='another_test') 132 | ) 133 | 134 | self.assertTrue( 135 | Filth(beg=0, end=5, text='hello', detector_name='tester') == 136 | Filth(beg=0, end=5, text='hello', detector_name='tester') 137 | ) 138 | self.assertTrue( 139 | Filth(beg=0, end=5, text='hello', detector_name='tester') != 140 | Filth(beg=0, end=5, text='hello', detector_name='another_tester') 141 | ) 142 | self.assertTrue( 143 | Filth(beg=0, end=5, text='hello', detector_name='tester') != 144 | Filth(beg=0, end=5, text='hello') 145 | ) 146 | self.assertTrue( 147 | Filth(beg=0, end=5, text='hello') != 148 | Filth(beg=0, end=5, text='hello', detector_name='tester') 149 | ) 150 | 151 | self.assertTrue( 152 | Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') == 153 | Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') 154 | ) 155 | self.assertTrue( 156 | Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') != 157 | Filth(beg=0, end=5, text='hello', document_name='test', detector_name='another_tester') 158 | ) 159 | self.assertTrue( 160 | Filth(beg=0, end=5, text='hello', 
document_name='test', detector_name='tester') != 161 | Filth(beg=0, end=5, text='hello', document_name='another_test', detector_name='tester') 162 | ) -------------------------------------------------------------------------------- /tests/test_filth_address.py: -------------------------------------------------------------------------------- 1 | 2 | import random 3 | import unittest 4 | 5 | from scrubadub.filth import AddressFilth 6 | 7 | 8 | class AddressFilthTestCase(unittest.TestCase): 9 | 10 | def test_generate(self): 11 | class Faker: 12 | locales = ['en_GB'] 13 | def address(self): 14 | return '4 Paula views\nLake Howardburgh\nN7U 2FQ' 15 | def last_name(self): 16 | return 'Smith' 17 | 18 | random.seed(1234) 19 | self.assertEqual( 20 | 'Building Smith, 4 Paula views, Lake Howardburgh, N7U 2FQ, Cymru', 21 | AddressFilth.generate(faker=Faker()), 22 | ) 23 | 24 | def test_seperators(self): 25 | addresses = [ 26 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views Lake Howardburgh N7U 2FQ'), 27 | ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch, Jordantown, W1F 3LB'), 28 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys, East Patricktown, EN6 2SD'), 29 | ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass, Nashbury, TA2W 9XP'), 30 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'), 31 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue, Elliottville, SY18 2YP'), 32 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall, Junetown, IM20 2PG'), 33 | ] 34 | random.seed(1234) 35 | for input_value, output_value in addresses: 36 | self.assertEqual( 37 | output_value, 38 | AddressFilth._randomise_seperators(input_value), 39 | ) 40 | 41 | def test_street_number(self): 42 | addresses = [ 43 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'), 44 | ('79 Miller branch\nJordantown\nW1F 3LB', 'Miller branch\nJordantown\nW1F 
3LB'), 45 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', 'Joseph keys\nEast Patricktown\nEN6 2SD'), 46 | ('93 Hall overpass\nNashbury\nTA2W 9XP', 'Hall overpass\nNashbury\nTA2W 9XP'), 47 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'), 48 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', 'Roberts stravenue\nElliottville\nSY18 2YP'), 49 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'), 50 | ] 51 | random.seed(1234) 52 | for input_value, output_value in addresses: 53 | self.assertEqual( 54 | output_value, 55 | AddressFilth._randomise_street_number(input_value), 56 | ) 57 | 58 | def test_postcode(self): 59 | addresses = [ 60 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'), 61 | ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown'), 62 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown'), 63 | ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass\nNashbury'), 64 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'), 65 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville'), 66 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown'), 67 | ] 68 | random.seed(1234) 69 | for input_value, output_value in addresses: 70 | self.assertEqual( 71 | output_value, 72 | AddressFilth._randomise_postcode(input_value), 73 | ) 74 | 75 | def test_country(self): 76 | addresses = [ 77 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'), 78 | ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown\nW1F 3LB\nUnited Kingdom'), 79 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown\nEN6 2SD\nGB'), 80 | ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass\nNashbury\nTA2W 9XP'), 81 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 
98R\nNatasha fall\nLake Rosie\nB73 8PJ\nCymru'), 82 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville\nSY18 2YP\nUnited Kingdom'), 83 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'), 84 | ] 85 | random.seed(1234) 86 | for input_value, output_value in addresses: 87 | self.assertEqual( 88 | output_value, 89 | AddressFilth._randomise_country(input_value), 90 | ) 91 | 92 | def test_building(self): 93 | class Faker: 94 | def last_name(self): 95 | return 'Smith' 96 | 97 | addresses = [ 98 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'), 99 | ('79 Miller branch\nJordantown\nW1F 3LB', 'Building Smith\n79 Miller branch\nJordantown\nW1F 3LB'), 100 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', 'Smith Block\n78 Joseph keys\nEast Patricktown\nEN6 2SD'), 101 | ('93 Hall overpass\nNashbury\nTA2W 9XP', 'House Smith\n93 Hall overpass\nNashbury\nTA2W 9XP'), 102 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'), 103 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', 'Building Smith\n8 Roberts stravenue\nElliottville\nSY18 2YP'), 104 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'), 105 | ] 106 | random.seed(1234) 107 | for input_value, output_value in addresses: 108 | self.assertEqual( 109 | output_value, 110 | AddressFilth._randomise_building(input_value, faker=Faker()), 111 | ) 112 | 113 | def test_case(self): 114 | addresses = [ 115 | ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 PAULA VIEWS\nLAKE HOWARDBURGH\nN7U 2FQ'), 116 | ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown\nW1F 3LB'), 117 | ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown\nEN6 2SD'), 118 | ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 HALL OVERPASS\nNASHBURY\nTA2W 9XP'), 119 | ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'FLAT 98R\nNATASHA FALL\nLAKE ROSIE\nB73 
8PJ'), 120 | ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville\nSY18 2YP'), 121 | ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'), 122 | ] 123 | random.seed(1234) 124 | for input_value, output_value in addresses: 125 | self.assertEqual( 126 | output_value, 127 | AddressFilth._randomise_case(input_value), 128 | ) 129 | -------------------------------------------------------------------------------- /tests/test_detector_date_of_birth.py: -------------------------------------------------------------------------------- 1 | import faker 2 | import random 3 | import unittest 4 | import scrubadub 5 | import scrubadub.detectors.catalogue 6 | from scrubadub.filth import DateOfBirthFilth 7 | 8 | import datetime 9 | from base import BaseTestCase 10 | 11 | 12 | class DoBTestCase(unittest.TestCase, BaseTestCase): 13 | 14 | 15 | def setUp(self): 16 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 17 | scrubadub.detectors.catalogue.register_detector(DateOfBirthDetector, autoload=True) 18 | 19 | def tearDown(self) -> None: 20 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 21 | scrubadub.detectors.catalogue.remove_detector(DateOfBirthDetector) 22 | 23 | def test_DoB_1(self): 24 | """ 25 | BEFORE: My date of birth is 17/06/1976. 26 | AFTER: My date of birth is {{DATE_OF_BIRTH}}. 
27 | """ 28 | self.compare_before_after() 29 | 30 | def test_DoB_2(self): 31 | """ 32 | BEFORE: I was born 15th June 1991 33 | AFTER: I was born {{DATE_OF_BIRTH}} 34 | """ 35 | self.compare_before_after() 36 | 37 | def test_DoB_3(self): 38 | """ 39 | BEFORE: DOB: 02.12.1979 40 | AFTER: DOB: 02.12.{{DATE_OF_BIRTH}} 41 | """ 42 | # TODO: this is a known limitation of the dateparser search util, 43 | # need to improve the search to include the full date 44 | self.compare_before_after() 45 | 46 | def test_DoB_4(self): 47 | """ 48 | BEFORE: My name is Mike and I was born in a land far away on 22/11/1972 49 | AFTER: My name is Mike and I was born in a land far away {{DATE_OF_BIRTH}} 50 | """ 51 | # TODO: dateparser is a little greedy, consuming the "on " as well as the date 52 | self.compare_before_after() 53 | 54 | def test_DoB_5(self): 55 | """ 56 | BEFORE: my name is Jane and I was born on 11/22/1972 57 | AFTER: my name is Jane and I was born {{DATE_OF_BIRTH}} 58 | """ 59 | # TODO: dateparser is a little greedy, consuming the "on " as well as the date 60 | self.compare_before_after() 61 | 62 | def test_DoB_6(self): 63 | """ 64 | BEFORE: my date of birth is 22-nov-1972 65 | AFTER: my date of birth is {{DATE_OF_BIRTH}} 66 | """ 67 | self.compare_before_after() 68 | 69 | def test_DoB_7(self): 70 | """ 71 | BEFORE: My dob is 22-11-1972 72 | AFTER: My dob is {{DATE_OF_BIRTH}} 73 | """ 74 | self.compare_before_after() 75 | 76 | def test_DoB_8(self): 77 | """ 78 | BEFORE: The claimant's, d.o.b. is 4 June 1976 79 | AFTER: The claimant's, d.o.b. is {{DATE_OF_BIRTH}} 80 | """ 81 | self.compare_before_after() 82 | 83 | def test_DoB_9(self): 84 | """ 85 | BEFORE: 1985-01-01 is my birthday. 86 | AFTER: {{DATE_OF_BIRTH}} is my birthday. 
87 | """ 88 | self.compare_before_after() 89 | 90 | def test_generate(self): 91 | fake = faker.Faker() 92 | faker.Faker.seed(4321) 93 | random.seed(4321) 94 | 95 | # I think this could fail just after midnight, because the generated date it relative to today's date and the 96 | # generated timedelta will unlikly be an integer number of days. 97 | # Will test and possibly remove/change this test further. 98 | self.assertIn( 99 | DateOfBirthFilth.generate(faker=fake), 100 | [ 101 | (datetime.date.today() - datetime.timedelta(days=29729)).strftime('%a %d %b %Y'), 102 | (datetime.date.today() - datetime.timedelta(days=29729 + 1)).strftime('%a %d %b %Y'), 103 | ] 104 | ) 105 | 106 | def test_init(self): 107 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 108 | with self.assertRaises(ValueError): 109 | DateOfBirthDetector(locale='zz_GB') 110 | 111 | def test_custom_words(self): 112 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 113 | detector = DateOfBirthDetector(context_words=['big day']) 114 | filths = list(detector.iter_filth('the big day is may 14th 1983\nsee you then')) 115 | 116 | self.assertEqual(1, len(filths)) 117 | self.assertEqual(15, filths[0].beg) 118 | self.assertEqual(28, filths[0].end) 119 | self.assertEqual('may 14th 1983', filths[0].text) 120 | 121 | def test_young(self): 122 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 123 | detector = DateOfBirthDetector() 124 | filths = list(detector.iter_filth('my birthday is not may 14th 2020\nor may 15th 2020\nor +14-05-2020 23')) 125 | 126 | self.assertEqual(0, len(filths)) 127 | 128 | def test_context(self): 129 | from scrubadub.detectors.date_of_birth import DateOfBirthDetector 130 | text = """ 131 | CONTEXTB2 132 | CONTEXTB1 133 | 10-Nov-2000 134 | CONTEXTA1 135 | CONTEXTA2 136 | """ 137 | 138 | detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=10, context_after=10) 139 | self.assertEqual(1, 
len(list(detector.iter_filth(text)))) 140 | detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=1, context_after=10) 141 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 142 | detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=0, context_after=10) 143 | self.assertEqual(0, len(list(detector.iter_filth(text)))) 144 | 145 | detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=10, context_after=0) 146 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 147 | detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=2, context_after=0) 148 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 149 | detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=1, context_after=0) 150 | self.assertEqual(0, len(list(detector.iter_filth(text)))) 151 | 152 | detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=10, context_after=10) 153 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 154 | detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=0, context_after=1) 155 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 156 | detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=1, context_after=0) 157 | self.assertEqual(0, len(list(detector.iter_filth(text)))) 158 | 159 | detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=0, context_after=10) 160 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 161 | detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=10, context_after=2) 162 | self.assertEqual(1, len(list(detector.iter_filth(text)))) 163 | detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=3, context_after=0) 164 | self.assertEqual(0, len(list(detector.iter_filth(text)))) 165 | -------------------------------------------------------------------------------- 
# /scrubadub/post_processors/filth_replacer.py

import os
import math
import hashlib

from typing import Sequence, Optional, Union, Dict
from collections import defaultdict

from scrubadub.filth import Filth, MergedFilth, TaggedEvaluationFilth
from scrubadub.post_processors.base import PostProcessor
from scrubadub.post_processors.catalogue import register_post_processor
from scrubadub import utils


class FilthReplacer(PostProcessor):
    """Creates tokens that are used to replace the Filth found in the text of a document.

    This can be configured to include the filth type (eg phone, name, email, ...), a unique number for each piece of
    Filth, and a hash of the Filth.

    >>> import scrubadub
    >>> scrubber = scrubadub.Scrubber(post_processor_list=[
    ...     scrubadub.post_processors.FilthReplacer(),
    ... ])
    >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
    'Contact me at PHONE or EMAIL'
    >>> scrubber = scrubadub.Scrubber(post_processor_list=[
    ...     scrubadub.post_processors.FilthReplacer(include_hash=True, hash_salt='example', hash_length=8),
    ... ])
    >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
    'Contact me at PHONE-7358BF44 or EMAIL-AC0B8AC3'
    >>> scrubber = scrubadub.Scrubber(post_processor_list=[
    ...     scrubadub.post_processors.FilthReplacer(include_count=True),
    ... ])
    >>> scrubber.clean("Contact me at taylordaniel@example.com or hernandezjenna@example.com, "
    ...                "but taylordaniel@example.com is probably better.")
    'Contact me at EMAIL-0 or EMAIL-1, but EMAIL-0 is probably better.'
    """
    name = 'filth_replacer'  # type: str
    autoload = False
    index = 0

    # NOTE: this is not an efficient way to store this in memory. could
    # alternatively hash the type and text and do away with the overhead
    # bits of storing the tuple in the lookup
    #
    # This is a class attribute, so the numbering is shared by every FilthReplacer instance until
    # ``reset_lookup()`` is called.
    typed_lookup = defaultdict(lambda: utils.Lookup(), {})  # type: Dict[str, utils.Lookup]

    def __init__(self, include_type: bool = True, include_count: bool = False, include_hash: bool = False,
                 uppercase: bool = True, separator: Optional[str] = None, hash_length: Optional[int] = None,
                 hash_salt: Optional[Union[str, bytes]] = None, **kwargs):
        """Initialise the FilthReplacer.

        :param include_type: Include the filth type (eg ``phone``, ``email``) in the label
        :type include_type: bool, default True
        :param include_count: Include a number in the label that identifies the (lowercased) filth text
        :type include_count: bool, default False
        :param include_hash: Include a salted hash of the (lowercased) filth text in the label
        :type include_hash: bool, default False
        :param uppercase: Make the label uppercase
        :type uppercase: bool, default True
        :param separator: Used to separate labels if a merged filth is being replaced, ``'+'`` if not given
        :type separator: Optional[str], default None
        :param hash_length: The length of the hexadecimal hash, ``16`` if not given (or zero)
        :type hash_length: Optional[int], default None
        :param hash_salt: The salt used in the hashing process. A ``str`` is encoded as UTF-8, ``bytes`` are used as
            given, and if no salt is supplied a random 128 byte salt is generated for this instance.
        :type hash_salt: Optional[Union[str, bytes]], default None
        """
        super(FilthReplacer, self).__init__(**kwargs)
        self.include_type = include_type
        self.include_count = include_count
        self.include_hash = include_hash
        self.uppercase = uppercase
        self.separator = separator or '+'
        self.hash_length = hash_length or 16

        if isinstance(hash_salt, str):
            self.hash_salt = hash_salt.encode('utf8')  # type: bytes
        elif isinstance(hash_salt, bytes):
            # A bytes salt is part of the advertised interface; honour it so that hashes are reproducible
            # instead of silently swapping it for a random salt.
            self.hash_salt = hash_salt
        else:
            self.hash_salt = os.urandom(128)

    @classmethod
    def reset_lookup(cls):
        """Reset the lookups that maintain a map of filth to a numeric ID."""
        cls.typed_lookup = defaultdict(lambda: utils.Lookup(), {})

    def filth_label(self, filth: Filth) -> str:
        """This function takes a filth and creates a label that can be used to replace the original text.

        For a ``MergedFilth`` one label is built per contained filth; duplicates are dropped and the remaining labels
        are sorted and joined with ``self.separator``. Within a label, the enabled pieces (type, count, hash) are
        joined with ``'-'``. If no piece is enabled the label is ``'filth'``.

        :param filth: The filth (or merged filth) that a replacement label should be created for
        :type filth: Filth
        :return: The replacement label that should be used for this `Filth`.
        :rtype: str

        """
        filths = [filth]
        if isinstance(filth, MergedFilth):
            filths = filth.filths

        replacements = set()
        for f in filths:
            replacement_pieces = []
            # Bucket used for numbering. When the type is not part of the label a single shared bucket is used,
            # so that the number on its own still identifies the text. Binding it here (rather than only inside
            # the ``include_type`` branch) avoids an UnboundLocalError when ``include_type`` is False.
            count_key = ''

            if self.include_type:
                filth_type = getattr(f, 'type', None)
                if filth_type is None:
                    # Without a type there is nothing to label this filth with, so it is skipped
                    continue
                if filth_type == TaggedEvaluationFilth.type:
                    # Tagged filth carries the type it is compared against; make it part of the label
                    filth_comparison_type = getattr(f, 'comparison_type', None)
                    if filth_comparison_type is not None:
                        filth_type += '_' + filth_comparison_type
                filth_type = filth_type.replace(' ', '_')

                replacement_pieces.append(filth_type)
                count_key = filth_type

            if self.include_count:
                replacement_pieces.append(str(FilthReplacer.typed_lookup[count_key][f.text.lower()]))

            if self.include_hash:
                replacement_pieces.append(FilthReplacer.get_hash(f.text.lower(), self.hash_salt, self.hash_length))

            if len(replacement_pieces) == 0:
                replacement_pieces = ['filth']

            replacements.add('-'.join(replacement_pieces))

        label = self.separator.join(sorted(replacements))
        if self.uppercase:
            label = label.upper()
        return label

    @staticmethod
    def get_hash(text: str, salt: bytes, length: int) -> str:
        """Get a hash of some text, that has been salted and truncated.

        The hash is derived with PBKDF2-HMAC-SHA256 (100000 iterations) and returned as lowercase hexadecimal.

        :param text: The text to be hashed
        :type text: str
        :param salt: The salt that should be used in this hashing
        :type salt: bytes
        :param length: The number of characters long that the hexadecimal hash should be
        :type length: int
        :return: The hash of the text
        :rtype: str
        """
        return hashlib.pbkdf2_hmac(
            hash_name='sha256',
            password=text.encode('utf8'),
            salt=salt,
            iterations=100000,
            # Each derived byte gives two hex characters; round up and trim so odd lengths work too
            dklen=math.ceil(length / 2),
        ).hex()[:length]

    def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]:
        """Processes the filth to replace the original text

        Sets ``replacement_string`` on each filth to the label created by ``filth_label``; the filths are modified
        in place and the same sequence is returned.

        :param filth_list: The filths that should have their replacement strings set
        :type filth_list: Sequence[Filth]
        :return: The processed filths
        :rtype: Sequence[Filth]
        """
        for filth_item in filth_list:
            filth_item.replacement_string = self.filth_label(filth=filth_item)

        return filth_list


register_post_processor(FilthReplacer)

__all__ = ['FilthReplacer']