├── scrubadub
    ├── filth
    │   ├── en_GB
    │   │   ├── __init__.py
    │   │   ├── tax_reference_number.py
    │   │   └── national_insurance_number.py
    │   ├── en_US
    │   │   ├── __init__.py
    │   │   └── social_security_number.py
    │   ├── drivers_licence.py
    │   ├── tagged.py
    │   ├── email.py
    │   ├── name.py
    │   ├── location.py
    │   ├── organization.py
    │   ├── vehicle_licence_plate.py
    │   ├── twitter.py
    │   ├── skype.py
    │   ├── postalcode.py
    │   ├── credit_card.py
    │   ├── __init__.py
    │   ├── url.py
    │   ├── credential.py
    │   ├── phone.py
    │   ├── date_of_birth.py
    │   └── address.py
    ├── .gitattributes
    ├── detectors
    │   ├── en_US
    │   │   ├── __init__.py
    │   │   └── social_security_number.py
    │   ├── en_GB
    │   │   ├── __init__.py
    │   │   ├── tax_reference_number.py
    │   │   └── national_insurance_number.py
    │   ├── drivers_licence.py
    │   ├── twitter.py
    │   ├── __init__.py
    │   ├── credential.py
    │   ├── vehicle_licence_plate.py
    │   ├── url.py
    │   ├── credit_card.py
    │   ├── phone.py
    │   ├── email.py
    │   ├── postalcode.py
    │   ├── catalogue.py
    │   ├── text_blob.py
    │   ├── user_supplied.py
    │   └── skype.py
    ├── post_processors
    │   ├── __init__.py
    │   ├── base.py
    │   ├── remover.py
    │   ├── catalogue.py
    │   ├── prefix_suffix.py
    │   └── filth_replacer.py
    ├── exceptions.py
    ├── utils.py
    └── __init__.py
├── .coveragerc
├── tests
    ├── example_real_data
    │   ├── known_pii.csv
    │   └── document.txt
    ├── test_filth_location.py
    ├── test_filth_organization.py
    ├── test_exceptions.py
    ├── test_detector_user_supplied.py
    ├── colors.py
    ├── benchmark_time.py
    ├── test_unicode.py
    ├── test_postprocessor.py
    ├── test_postprocessor_prefix_postfix_replacer.py
    ├── test_detector_configuration.py
    ├── test_detector_emails.py
    ├── test_detector_en_US_social_security_number.py
    ├── test_detector_text_blob.py
    ├── test_detector_en_GB_trn.py
    ├── run.py
    ├── test_postprocessor_configuration.py
    ├── test_detector_drivers_licence.py
    ├── test_detector_twitter.py
    ├── test_detector_en_GB_nino.py
    ├── test_api_advanced.py
    ├── test_detector_phone_numbers.py
    ├── test_detector_credentials.py
    ├── base.py
    ├── test_detector_urls.py
    ├── test_utils_canonical_string_set.py
    ├── test_api.py
    ├── test_detector_skype.py
    ├── test_detector_credit_card.py
    ├── test_locale.py
    ├── test_detector_postal_codes.py
    ├── test_detector.py
    ├── test_api_older.py
    ├── test_postprocessor_filth_replacer.py
    ├── test_filth.py
    ├── test_filth_address.py
    └── test_detector_date_of_birth.py
├── MANIFEST.in
├── requirements
    ├── python-readthedocs
    ├── python
    └── python-dev
├── setup.cfg
├── docs
    ├── post_processors.rst
    ├── api_scrubadub_filth.rst
    ├── api_scrubadub_comparison.rst
    ├── api_scrubadub.rst
    ├── api_scrubadub_post.rst
    ├── addresses.rst
    ├── index.rst
    ├── contributing.rst
    ├── names.rst
    └── localization.rst
├── .readthedocs.yml
├── azure-pipelines.yml
├── .github
    └── workflows
    │   ├── python-publish.yml
    │   └── unittests.yml
├── bin
    └── download_data.sh
├── .gitignore
├── design
    ├── basic_usage.py
    ├── customize_filth_detection.py
    ├── customize_replacement_strings.py
    └── customize_via_training.py
├── tox.ini
├── setup.py
├── README.rst
└── CONTRIBUTING.md


/scrubadub/filth/en_GB/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scrubadub/filth/en_US/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/scrubadub/.gitattributes:
--------------------------------------------------------------------------------
1 | *.pickle filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/en_US/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .social_security_number import SocialSecurityNumberDetector
3 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | omit =
3 |     */python?.?/*
4 |     */site-packages/nose/*
5 |     scrubadub/colors.py
6 | 


--------------------------------------------------------------------------------
/scrubadub/filth/drivers_licence.py:
--------------------------------------------------------------------------------
1 | from .base import Filth
2 | 
3 | 
class DriversLicenceFilth(Filth):
    """Filth representing a driving licence number."""

    # String label identifying this kind of filth.
    type = 'drivers_licence'
6 | 


--------------------------------------------------------------------------------
/scrubadub/filth/en_GB/tax_reference_number.py:
--------------------------------------------------------------------------------
1 | from scrubadub.filth.base import Filth
2 | 
3 | 
class TaxReferenceNumberFilth(Filth):
    """Filth representing a tax reference number."""

    # String label identifying this kind of filth.
    type = 'tax_reference_number'
6 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/en_GB/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .national_insurance_number import NationalInsuranceNumberDetector
3 | from .tax_reference_number import TaxReferenceNumberDetector
4 | 


--------------------------------------------------------------------------------
/tests/example_real_data/known_pii.csv:
--------------------------------------------------------------------------------
1 | filth_type,match,match_end,limit
2 | address,123 The Street,England,
3 | phone,0775 2212 211,,
4 | email,mike@example.com,,
5 | name,Mike Johnson,,
6 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements/*
2 | include MANIFEST.in
3 | include README.rst
4 | include LICENSE
5 | recursive-include scrubadub/detectors/models *.json
6 | recursive-exclude * *.py[co]
7 | recursive-exclude * *~
8 | recursive-exclude * *.orig
9 | 


--------------------------------------------------------------------------------
/requirements/python-readthedocs:
--------------------------------------------------------------------------------
 1 | # install everything in the python requirements too.
 2 | -r python
 3 | 
 4 | # for documentation
 5 | sphinx>=3
 6 | sphinx_rtd_theme>=0.5
 7 | 
 8 | # Needed for the docs
 9 | scrubadub_address
10 | scrubadub_spacy
11 | scrubadub_stanford
12 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/__init__.py:
--------------------------------------------------------------------------------
1 | from .catalogue import post_processor_catalogue, register_post_processor, remove_post_processor
2 | 
3 | from .base import PostProcessor
4 | from .filth_replacer import FilthReplacer
5 | from .prefix_suffix import PrefixSuffixReplacer
6 | from .remover import FilthRemover
7 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | ignore_missing_imports = True
 3 | 
 4 | [pycodestyle]
 5 | max-line-length = 120
 6 | statistics = True
 7 | 
 8 | [flake8]
 9 | max-line-length = 120
10 | statistics = True
11 | per-file-ignores =
12 |     # imported but unused
13 |     __init__.py: F401
14 | 
15 | [nosetests]
16 | with-coverage = 1
17 | cover-package = scrubadub
18 | 


--------------------------------------------------------------------------------
/scrubadub/filth/tagged.py:
--------------------------------------------------------------------------------
 1 | from .base import Filth
 2 | 
 3 | import typing
 4 | 
 5 | 
 6 | class TaggedEvaluationFilth(Filth):
 7 |     type = 'tagged'
 8 | 
 9 |     def __init__(self, *args, comparison_type: typing.Optional[str] = None, **kwargs):
10 |         super(TaggedEvaluationFilth, self).__init__(*args, **kwargs)
11 |         self.comparison_type = comparison_type
12 | 


--------------------------------------------------------------------------------
/docs/post_processors.rst:
--------------------------------------------------------------------------------
1 | 
2 | Post Processors
3 | ===============
4 | 
5 | Post processors run in a certain order and do something to the detected ``Filth``\ s.
6 | You could use them to validate your filth, to save your filth into a lookup file, to record statistics on your found filth, to combine filth together, to remove the filth from the text or anything else you want really.
7 | 
8 | 


--------------------------------------------------------------------------------
/tests/test_filth_location.py:
--------------------------------------------------------------------------------
 1 | import faker
 2 | import unittest
 3 | 
 4 | from scrubadub.filth import LocationFilth
 5 | 
 6 | class LocationFilthTestCase(unittest.TestCase):
 7 | 
 8 |     def test_generate(self):
 9 |         class Faker:
10 |             def city(self):
11 |                 return 'Brianland'
12 | 
13 |         self.assertEqual(
14 |             'Brianland',
15 |             LocationFilth.generate(faker=Faker()),
16 |         )
17 | 


--------------------------------------------------------------------------------
/tests/test_filth_organization.py:
--------------------------------------------------------------------------------
 1 | import faker
 2 | import unittest
 3 | 
 4 | from scrubadub.filth import OrganizationFilth
 5 | 
 6 | class OrganizationFilthTestCase(unittest.TestCase):
 7 | 
 8 |     def test_generate(self):
 9 |         class Faker:
10 |             def company(self):
11 |                 return 'Brown-Lindsey'
12 | 
13 |         self.assertEqual(
14 |             'Brown-Lindsey',
15 |             OrganizationFilth.generate(faker=Faker()),
16 |         )
17 | 


--------------------------------------------------------------------------------
/docs/api_scrubadub_filth.rst:
--------------------------------------------------------------------------------
 1 | .. _api_scrubadub_filth:
 2 | 
 3 | 
 4 | scrubadub.filth
 5 | ===============
 6 | 
 7 | Filth objects are responsible for marking particular sections of text as
 8 | containing that type of filth. It is also responsible for knowing how it should
 9 | be cleaned. Every type of ``Filth`` inherits from ``scrubadub.filth.base.Filth``.
10 | 
11 | .. autoclass:: scrubadub.filth.Filth
12 |     :members:
13 |     :undoc-members:
14 |     :show-inheritance:
15 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/conf.py
11 | 
12 | # Optionally set the version of Python and requirements required to build your docs
13 | python:
14 |   version: 3.8
15 |   install:
16 |     - requirements: requirements/python-dev
17 | 


--------------------------------------------------------------------------------
/tests/test_exceptions.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from scrubadub import exceptions
 4 | 
 5 | 
 6 | class ExceptionsTestCase(unittest.TestCase):
 7 |     def test_render(self):
 8 |         exception = exceptions.ScrubadubException()
 9 |         exception.var = 'there'
10 | 
11 |         self.assertEquals(exception.render('test'), 'test')
12 |         self.assertEquals(exception.render('url %(issues_url)s'), 'url ' + exception.issues_url)
13 |         self.assertEquals(exception.render('hello %(var)s'), 'hello there')
14 | 


--------------------------------------------------------------------------------
/tests/example_real_data/document.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | This is is an example document that has been labelled in known_pii.csv. This document contains filthy personal
 3 | infomation that we want to remove, such as an address for Mike Johnson:
 4 | 
 5 | 123 The Street,
 6 | London,
 7 | E2 2AA,
 8 | England
 9 | 
10 | or an example phone number 0775 2212 211 and email address mike@example.com, even if it is wildly capitalised MiKe@ExAmPlE.Com.
11 | 
12 | benchmark_accuracy_real_data.py checks to see if the personal information can be found in this file.
13 | 


--------------------------------------------------------------------------------
/scrubadub/filth/email.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class EmailFilth(Filth):
 7 |     type = 'email'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.email()
19 | 


--------------------------------------------------------------------------------
/scrubadub/filth/name.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class NameFilth(Filth):
 7 |     type = 'name'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.name()
19 | 


--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
 1 | # Starter pipeline
 2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
 3 | # Add steps that build, run tests, deploy, and more:
 4 | # https://aka.ms/yaml
 5 | 
 6 | trigger:
 7 | - main
 8 | 
 9 | pool:
10 |   vmImage: 'ubuntu-latest'
11 | 
12 | steps:
13 | - script: echo Hello, world!
14 |   displayName: 'Run a one-line script'
15 | 
16 | - script: |
17 |     echo Add other tasks to build, test, and deploy your project.
18 |     echo See https://aka.ms/yaml
19 |   displayName: 'Run a multi-line script'
20 | 


--------------------------------------------------------------------------------
/scrubadub/filth/location.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class LocationFilth(Filth):
 7 |     type = 'location'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.city()
19 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/base.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Sequence
 2 | 
 3 | from ..filth import Filth
 4 | 
 5 | 
 6 | class PostProcessor(object):
 7 |     name = 'post_processor'  # type: str
 8 |     autoload = False  # type: bool
 9 |     index = 10000  # type: int
10 | 
11 |     def __init__(self, name: Optional[str] = None):
12 |         if name is not None:
13 |             self.name = name
14 | 
15 |     def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]:
16 |         raise NotImplementedError('must be overridden by base classes')
17 | 


--------------------------------------------------------------------------------
/scrubadub/filth/organization.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class OrganizationFilth(Filth):
 7 |     type = 'organization'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.company()
19 | 


--------------------------------------------------------------------------------
/docs/api_scrubadub_comparison.rst:
--------------------------------------------------------------------------------
 1 | .. _api_scrubadub_comparison:
 2 | 
 3 | 
 4 | scrubadub.comparison
 5 | ====================
 6 | 
 7 | The functions in ``scrubadub.comparison`` are helpers for evaluating how well
 8 | filth detection performs, for example by producing a classification report or a
 9 | dataframe of the ``Filth`` that was found, and for generating fake documents.
10 | 
11 | .. autofunction:: scrubadub.comparison.get_filth_classification_report
12 | 
13 | .. autofunction:: scrubadub.comparison.get_filth_dataframe
14 | 
15 | .. autofunction:: scrubadub.comparison.make_fake_document
16 | 


--------------------------------------------------------------------------------
/scrubadub/filth/vehicle_licence_plate.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class VehicleLicencePlateFilth(Filth):
 7 |     type = 'vehicle_licence_plate'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.license_plate()
19 | 


--------------------------------------------------------------------------------
/requirements/python:
--------------------------------------------------------------------------------
 1 | # For the TextBlobNameDetector
 2 | textblob==0.15.3
 3 | 
 4 | # For the PhoneDetector
 5 | phonenumbers
 6 | 
 7 | # For SSN, credit cards and TRN
 8 | python-stdnum
 9 | 
10 | # For the DateOfBirthDetector, master version due to an unfixed bug... but which one?
11 | # Can't upload to PyPi with a dependency on a GH repo, so removed link to dateparser repo
12 | dateparser
13 | # @ git+https://github.com/scrapinghub/dateparser.git
14 | 
15 | # For the detector/post-processor catalogues
16 | catalogue
17 | 
18 | # For scrubadub.comparison
19 | scikit-learn
20 | 
21 | typing_extensions
22 | faker
23 | 


--------------------------------------------------------------------------------
/scrubadub/filth/twitter.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from faker import Faker
 4 | 
 5 | from .base import Filth
 6 | 
 7 | 
 8 | class TwitterFilth(Filth):
 9 |     type = 'twitter'
10 | 
11 |     @staticmethod
12 |     def generate(faker: Faker) -> str:
13 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
14 | 
15 |         :param faker: The ``Faker`` class from the ``faker`` library
16 |         :type faker: Faker
17 |         :return: An example of this ``Filth``
18 |         :rtype: str
19 |         """
20 |         return '@' + re.sub(r'[^a-zA-Z0-9_]', '', faker.user_name())[:15]
21 | 


--------------------------------------------------------------------------------
/scrubadub/filth/en_GB/national_insurance_number.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from scrubadub.filth.base import Filth
 4 | 
 5 | 
 6 | class NationalInsuranceNumberFilth(Filth):
 7 |     type = 'national_insurance_number'
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         return faker.ssn()
19 | 


--------------------------------------------------------------------------------
/requirements/python-dev:
--------------------------------------------------------------------------------
 1 | # install everything in the python requirements too.
 2 | -r python
 3 | 
 4 | # needed to run the tests
 5 | flake8
 6 | coveralls
 7 | nose
 8 | mypy
 9 | tox
10 | 
11 | # for documentation
12 | sphinx>=3
13 | sphinx_rtd_theme>=0.5
14 | 
15 | # This is for the tests/benchmark_accuracy_real_data.py script
16 | cchardet
17 | pandas
18 | click
19 | python-magic
20 | python-dotenv
21 | azure-storage-blob
22 | openpyxl
23 | tabulate
24 | pandas
25 | 
26 | types-dateparser
27 | types-requests
28 | 
29 | # needed for the tests/run.py script
30 | wasabi
31 | 
32 | # Needed for the docs
33 | postal
34 | scrubadub_address
35 | scrubadub_spacy
36 | scrubadub_stanford
37 | 


--------------------------------------------------------------------------------
/scrubadub/filth/skype.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from faker import Faker
 3 | 
 4 | from .base import Filth
 5 | 
 6 | 
 7 | class SkypeFilth(Filth):
 8 |     type = 'skype'
 9 | 
10 |     @staticmethod
11 |     def generate(faker: Faker) -> str:
12 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
13 | 
14 |         :param faker: The ``Faker`` class from the ``faker`` library
15 |         :type faker: Faker
16 |         :return: An example of this ``Filth``
17 |         :rtype: str
18 |         """
19 |         username = ''
20 |         while len(username) < 5:
21 |             username = re.sub(r'(^[^a-zA-Z])|[^a-zA-Z0-9_\-\,\.]', '', faker.user_name())[:31]
22 |         return username
23 | 


--------------------------------------------------------------------------------
/scrubadub/filth/postalcode.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class PostalCodeFilth(Filth):
 7 |     type = "postalcode"
 8 | 
 9 |     @staticmethod
10 |     def generate(faker: Faker) -> str:
11 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
12 | 
13 |         :param faker: The ``Faker`` class from the ``faker`` library
14 |         :type faker: Faker
15 |         :return: An example of this ``Filth``
16 |         :rtype: str
17 |         """
18 |         # for en_US I expect we should pick between .zipcode() and .zipcode_plus4()
19 |         # as postcode() for en_US only returns the 5 number zip code
20 |         return faker.postcode()
21 | 


--------------------------------------------------------------------------------
/scrubadub/filth/credit_card.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | import stdnum.luhn
 3 | from faker import Faker
 4 | 
 5 | from .base import Filth
 6 | 
 7 | 
 8 | class CreditCardFilth(Filth):
 9 |     type = 'credit_card'
10 | 
11 |     @staticmethod
12 |     def generate(faker: Faker) -> str:
13 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
14 | 
15 |         :param faker: The ``Faker`` class from the ``faker`` library
16 |         :type faker: Faker
17 |         :return: An example of this ``Filth``
18 |         :rtype: str
19 |         """
20 |         return faker.credit_card_number()
21 | 
22 |     def is_valid(self) -> bool:
23 |         return stdnum.luhn.is_valid(''.join(char for char in self.text if char in string.digits))
24 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/drivers_licence.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .base import RegionLocalisedRegexDetector
 5 | from ..filth import DriversLicenceFilth
 6 | 
 7 | 
@register_detector
class DriversLicenceDetector(RegionLocalisedRegexDetector):
    """Use regular expressions to detect UK driving licence numbers.

    This is simple pattern matching only; no checksum is verified.
    """

    name = 'drivers_licence'
    autoload = True
    filth_cls = DriversLicenceFilth

    # Maps a region code to the compiled pattern used for that region.
    region_regex = {
        # this regex is looking for UK driving licence numbers that follow a pattern, no checksum:
        #   group 1: five characters, each a letter or the digit 9, then optional whitespace
        #   group 2: six digits (each optionally surrounded by whitespace), two characters that
        #            are a letter or 9, then three word characters
        #   group 3: two digits, optionally preceded by one whitespace character
        'GB': re.compile(r'''([a-zA-Z9]{5}\s?)((?:\s*\d\s*){6}[a-zA-Z9]{2}\w{3})\s?(\d{2})''', re.IGNORECASE)
    }
22 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/en_GB/tax_reference_number.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from scrubadub.detectors.base import RegionLocalisedRegexDetector
 5 | from scrubadub.filth import TaxReferenceNumberFilth
 6 | 
 7 | 
@register_detector
class TaxReferenceNumberDetector(RegionLocalisedRegexDetector):
    """Use regular expressions to detect the UK PAYE temporary reference number (TRN).

    This is simple pattern matching only; no checksum is verified.
    """

    name = 'tax_reference_number'
    autoload = True
    filth_cls = TaxReferenceNumberFilth
    # this regex is looking for a TRN: two digits, an optional whitespace character,
    # a single letter, then five digits that may each be surrounded by whitespace
    region_regex = {
        'GB': re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE),
    }
21 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/twitter.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .base import RegexDetector
 5 | from ..filth import TwitterFilth
 6 | 
 7 | 
@register_detector
class TwitterDetector(RegexDetector):
    """Use regular expression magic to remove twitter usernames from dirty
    dirty ``text``.
    """
    filth_cls = TwitterFilth
    name = 'twitter'
    autoload = True

    # https://help.twitter.com/en/managing-your-account/twitter-username-rules#error
    # Twitter user names must be 15 or fewer characters and only contain a-zA-Z0-9_
    # Twitter and admin are not allowed in user names
    # (?<!\w) prevents it matching email addresses
    # The {2,15} quantifier means at least two characters must follow the '@'.
    regex = re.compile((
        r"(?<!\w)@((?!((admin)|(twitter)))[a-z0-9_]){2,15}\b"
    ), re.VERBOSE | re.IGNORECASE)
24 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/__init__.py:
--------------------------------------------------------------------------------
 1 | from .catalogue import remove_detector, register_detector, detector_catalogue
 2 | 
 3 | from .base import Detector, RegexDetector, RegionLocalisedRegexDetector
 4 | from .credential import CredentialDetector
 5 | from .credit_card import CreditCardDetector
 6 | from .date_of_birth import DateOfBirthDetector
 7 | from .drivers_licence import DriversLicenceDetector
 8 | from .email import EmailDetector
 9 | from .phone import PhoneDetector
10 | from .postalcode import PostalCodeDetector
11 | from .skype import SkypeDetector
12 | from .tagged import TaggedEvaluationFilthDetector
13 | from .text_blob import TextBlobNameDetector
14 | from .twitter import TwitterDetector
15 | from .url import UrlDetector
16 | from .user_supplied import UserSuppliedFilthDetector
17 | from .vehicle_licence_plate import VehicleLicencePlateDetector
18 | 
19 | from . import en_GB
20 | from . import en_US
21 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/en_US/social_security_number.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from scrubadub.detectors.base import RegionLocalisedRegexDetector
 5 | from scrubadub.filth import SocialSecurityNumberFilth
 6 | 
 7 | 
@register_detector
class SocialSecurityNumberDetector(RegionLocalisedRegexDetector):
    """Use regular expressions to detect a social security number (SSN) in
    dirty dirty ``text``.

    Only the ``ddd-dd-dddd`` shape is matched, with ``-``, ``.`` or a space as
    the separator; the pattern itself does not validate the number.
    """

    filth_cls = SocialSecurityNumberFilth
    name = 'social_security_number'
    autoload = True
    # Maps a region code to the compiled pattern used for that region.
    # re.VERBOSE ignores whitespace outside character classes, so the space
    # inside ``[\-. ]`` is still matched literally.
    region_regex = {
        'US': re.compile((
            r"[0-9][0-9][0-9]"  # first three digits
            r"[\-. ]"  # separator
            r"[0-9][0-9]"  # next two digits
            r"[\-. ]"  # separator
            r"[0-9][0-9][0-9][0-9]"  # last four digits
        ), re.VERBOSE),
    }
26 | 


--------------------------------------------------------------------------------
/scrubadub/filth/en_US/social_security_number.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | import stdnum.us.ssn
 3 | 
 4 | from scrubadub.filth.base import Filth
 5 | 
 6 | 
 7 | class SocialSecurityNumberFilth(Filth):
 8 |     type = 'social_security_number'
 9 | 
10 |     @staticmethod
11 |     def generate(faker: Faker) -> str:
12 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
13 | 
14 |         :param faker: The ``Faker`` class from the ``faker`` library
15 |         :type faker: Faker
16 |         :return: An example of this ``Filth``
17 |         :rtype: str
18 |         """
19 |         ssn = ''
20 |         if faker.locales == ['en_US']:
21 |             while not stdnum.us.ssn.is_valid(ssn):
22 |                 ssn = faker.ssn()
23 |         return faker.ssn()
24 | 
25 |     def is_valid(self) -> bool:
26 |         return stdnum.us.ssn.is_valid(''.join(char for char in self.text if char not in '. -'))
27 | 


--------------------------------------------------------------------------------
/tests/test_detector_user_supplied.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import scrubadub
 4 | 
class UserDefinedTestCase(unittest.TestCase):
    """Tests for ``UserSuppliedFilthDetector`` configured with literal matches."""

    def test_simple(self):
        """test a simple matching"""

        test_str = 'this is a test string'
        detector = scrubadub.detectors.UserSuppliedFilthDetector([
            {'match': 'test', 'filth_type': 'name'},
        ])

        matches = list(detector.iter_filth(test_str))
        # 'test' sits at characters 10-13 of test_str, so the span is [10, 14)
        self.assertEqual(matches[0].beg, 10)
        self.assertEqual(matches[0].end, 14)

    def test_bad_filth(self):
        """test that an unknown filth_type raises a KeyError"""

        test_str = 'this is a test string'
        detector = scrubadub.detectors.UserSuppliedFilthDetector([
            {'match': 'test', 'filth_type': 'invalid_filth'},
        ])

        # the error is expected while iterating, not when the detector is built
        with self.assertRaises(KeyError):
            list(detector.iter_filth(test_str))
29 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/en_GB/national_insurance_number.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from scrubadub.detectors.base import RegionLocalisedRegexDetector
 5 | from scrubadub.filth import NationalInsuranceNumberFilth
 6 | 
 7 | 
@register_detector
class NationalInsuranceNumberDetector(RegionLocalisedRegexDetector):
    """Use regular expressions to remove the GB National Insurance number (NINO),
    Simple pattern matching, no checksum solution.

    The ``GB`` pattern matches two prefix letters, six digits (each optionally
    surrounded by whitespace) and a final letter from A to D, ignoring case.
    """
    name = 'national_insurance_number'
    autoload = True
    filth_cls = NationalInsuranceNumberFilth
    # this regex is looking for NINO that does not begin with certain letters:
    # the negative lookaheads reject the prefixes BG, GB, NK, KN, TN, NT and ZZ
    region_regex = {
        'GB': re.compile(
            r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
            re.IGNORECASE | re.VERBOSE
        ),
    }
23 | 


--------------------------------------------------------------------------------
/scrubadub/filth/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import Filth, MergedFilth, RegexFilth
 2 | from .address import AddressFilth
 3 | from .credential import CredentialFilth
 4 | from .credit_card import CreditCardFilth
 5 | from .drivers_licence import DriversLicenceFilth
 6 | from .email import EmailFilth
 7 | from .tagged import TaggedEvaluationFilth
 8 | from .location import LocationFilth
 9 | from .name import NameFilth
10 | from .organization import OrganizationFilth
11 | from .phone import PhoneFilth
12 | from .postalcode import PostalCodeFilth
13 | from .skype import SkypeFilth
14 | from .twitter import TwitterFilth
15 | from .url import UrlFilth
16 | from .vehicle_licence_plate import VehicleLicencePlateFilth
17 | from .date_of_birth import DateOfBirthFilth
18 | from .en_GB.national_insurance_number import NationalInsuranceNumberFilth
19 | from .en_GB.tax_reference_number import TaxReferenceNumberFilth
20 | from .en_US.social_security_number import SocialSecurityNumberFilth
21 | 


--------------------------------------------------------------------------------
/scrubadub/filth/url.py:
--------------------------------------------------------------------------------
 1 | from faker import Faker
 2 | 
 3 | from .base import Filth
 4 | 
 5 | 
 6 | class UrlFilth(Filth):
 7 |     type = 'url'
 8 | 
 9 |     # This allows you to keep the domain
10 |     keep_domain = False
11 | 
12 |     # this can be used to customize the output, particularly when
13 |     # keep_domain=True
14 |     url_placeholder = type.upper()
15 | 
16 |     @property
17 |     def placeholder(self):
18 |         if self.keep_domain:
19 |             return self.match.group('domain') + self.url_placeholder
20 |         return self.url_placeholder
21 | 
22 |     @staticmethod
23 |     def generate(faker: Faker) -> str:
24 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
25 | 
26 |         :param faker: The ``Faker`` class from the ``faker`` library
27 |         :type faker: Faker
28 |         :return: An example of this ``Filth``
29 |         :rtype: str
30 |         """
31 |         return faker.url()
32 | 


--------------------------------------------------------------------------------
/tests/colors.py:
--------------------------------------------------------------------------------
 1 | """Inspiration from
 2 | https://github.com/fabric/fabric/blob/master/fabric/colors.py
 3 | """
 4 | import re
 5 | 
 6 | 
 7 | def _wrap_with(code, bold=False):
 8 |     def inner(text):
 9 |         c = code
10 |         if bold:
11 |             c = "1;%s" % c
12 |         return "\033[%sm%s\033[0m" % (c, text)
13 |     return inner
14 | 
15 | red = _wrap_with('31')
16 | green = _wrap_with('32')
17 | yellow = _wrap_with('33')
18 | blue = _wrap_with('34')
19 | magenta = _wrap_with('35')
20 | cyan = _wrap_with('36')
21 | white = _wrap_with('37')
22 | 
23 | bold_red = _wrap_with('31', True)
24 | bold_green = _wrap_with('32', True)
25 | bold_yellow = _wrap_with('33', True)
26 | bold_blue = _wrap_with('34', True)
27 | bold_magenta = _wrap_with('35', True)
28 | bold_cyan = _wrap_with('36', True)
29 | bold_white = _wrap_with('37', True)
30 | 
31 | 
32 | # regular expression to omit colorcodes
def colorless(text):
    """Remove color from the text

    Strips every ANSI colour escape sequence of the form ``ESC[<digits>m`` or
    ``ESC[1;<digits>m`` and returns the remaining text.
    """
    # A raw string keeps ``\[`` and ``\d`` as regex escapes; in a normal
    # string literal they are invalid Python escape sequences.
    return re.sub(r"\x1b\[(1;)?\d+m", '', text)


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/tests/benchmark_time.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import timeit
 5 | 
 6 | from scrubadub.comparison import make_fake_document
 7 | 
 8 | 
 9 | def main():
10 |     doc, _ = make_fake_document(paragraphs=20, seed=1234)
11 |     variables = {'doc': doc}
12 |     setup_cmd = 'import scrubadub; scrubber = scrubadub.Scrubber()'
13 |     cmd = 'scrubber.clean(doc)'
14 | 
15 |     print("Timing '{}':".format(cmd))
16 |     repeats = 50
17 |     timer = timeit.Timer(cmd, setup=setup_cmd, globals=variables)
18 |     try:
19 |         time = timer.timeit(number=repeats)
20 |     except Exception:
21 |         timer.print_exc()
22 |         sys.exit(1)
23 |     else:
24 |         print("{: >8.4f}s total runtime".format(time))
25 |         print("{: >8.4f}s per iteration".format(time/repeats))
26 | 
27 |     if time/repeats > 0.1:
28 |         print("Usual runtimes for the default set of detectors is 0.02s per iteration.")
29 |         sys.exit(1)
30 | 
31 |     sys.exit(0)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     main()
36 | 


--------------------------------------------------------------------------------
/bin/download_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # this script downloads some test datasets and puts it into a format that is
 4 | # convenient for testing the effectiveness of scrubadub
 5 | 
 6 | # all of the data is unpacked in data/testing
 7 | bin_dir=$(dirname $0)
 8 | project_root=${bin_dir}/..
 9 | raw_dir=${project_root}/data/raw
10 | mkdir -p ${raw_dir}
11 | 
12 | # enron
13 | echo 'downloading enron data...'
14 | curl https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz > ${project_root}/enron_mail_20150507.tgz
15 | echo 'extracting enron data...'
16 | mkdir -p ${raw_dir}/enron
17 | tar xzf ${project_root}/enron_mail_20150507.tgz -C ${raw_dir}/enron --strip-components=1
18 | rm ${project_root}/enron_mail_20150507.tgz
19 | 
20 | # sms
21 | echo 'downloading sms data...'
22 | curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip > ${project_root}/smsspamcollection.zip
23 | echo 'extracting sms data...'
24 | unzip ${project_root}/smsspamcollection.zip -d ${raw_dir}/sms
25 | rm ${project_root}/smsspamcollection.zip
26 | 


--------------------------------------------------------------------------------
/tests/test_unicode.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import unittest
 3 | 
 4 | import scrubadub
 5 | 
 6 | from base import BaseTestCase
 7 | 
# Skipped on Python 3 and later: these checks only apply to Python 2.
@unittest.skipIf(sys.version_info >= (3,0), "Test only needed in Python 2")
class UnicodeTestCase(unittest.TestCase, BaseTestCase):

    def test_empty(self):
        """Make sure this returns an empty string"""
        self.assertEqual(
            self.clean(u''),
            u'',
            'empty string is not preserved',
        )

    def test_not_unicode(self):
        """Make sure cleaning a byte string raises UnicodeRequired"""
        with self.assertRaises(scrubadub.exceptions.UnicodeRequired):
            self.clean('John is a byte string')

    def test_useful_error_message(self):
        """Make sure the UnicodeRequired error carries a helpful message"""
        try:
            self.clean('John is a byte string')
        except scrubadub.exceptions.UnicodeRequired as e:
            self.assertIn("scrubadub works best with unicode", str(e))
        else:
            self.fail('UnicodeRequired was not raised')
31 | 


--------------------------------------------------------------------------------
/tests/test_postprocessor.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import scrubadub
 4 | import scrubadub.post_processors
 5 | from scrubadub.filth import EmailFilth
 6 | 
 7 | 
class PostProcessorTestCase(unittest.TestCase):
    def test_post_processor_name(self):
        """make sure a post processor keeps the name it is given and still replaces filth"""
        filths = [
            EmailFilth(beg=0, end=5, text='e@e.c'),
            # EmailFilth(beg=5, end=10, text='e@e.c'),
        ]

        post_processor = scrubadub.post_processors.FilthReplacer(name='new_name')
        self.assertEqual(post_processor.name, 'new_name')
        new_filths = list(post_processor.process_filth(filths))
        self.assertEqual(len(new_filths), 1)
        self.assertEqual(new_filths[0].replacement_string, 'EMAIL')

    def test_post_processor_raise(self):
        """make sure the base PostProcessor raises NotImplementedError when processing filth"""
        with self.assertRaises(NotImplementedError):
            scrubadub.post_processors.PostProcessor().process_filth([])
26 | 


--------------------------------------------------------------------------------
/scrubadub/filth/credential.py:
--------------------------------------------------------------------------------
 1 | from .base import Filth
 2 | from .. import exceptions
 3 | 
 4 | 
 5 | class CredentialFilth(Filth):
 6 |     type = 'credential'
 7 | 
 8 |     # specify how the username/password are replaced
 9 |     username_placeholder = 'USERNAME'
10 |     password_placeholder = 'PASSWORD'
11 | 
12 |     @property
13 |     def placeholder(self):
14 |         ubeg, uend = self.match.span('username')
15 |         pbeg, pend = self.match.span('password')
16 |         return (
17 |             self.match.string[self.match.start():ubeg] +
18 |             self.prefix + self.username_placeholder + self.suffix +
19 |             self.match.string[uend:pbeg] +
20 |             self.prefix + self.password_placeholder + self.suffix
21 |         )
22 | 
23 |     # override the replace_with method for credentials because the
24 |     # prefix/suffix components are mixed into the placeholder
25 |     def replace_with(self, replace_with='placeholder', **kwargs):
26 |         if replace_with == 'placeholder':
27 |             return self.placeholder
28 |         else:
29 |             raise exceptions.InvalidReplaceWith(replace_with)
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | /data
60 | .mypy_cache
61 | .env
62 | .idea/
63 | ..bfg-report
64 | libpostal/
65 | tests/code_point_uk_post_codes.zip
66 | .ipynb_checkpoints/
67 | tests/output*
68 | 


--------------------------------------------------------------------------------
/design/basic_usage.py:
--------------------------------------------------------------------------------
 1 | """This is the basic usage of the scrubadub module. It exposes three different
 2 | methods for obfuscating personally identifiable information and uses high
 3 | recall methods for identifying filth. Precision can be improved by further
 4 | customization.
 5 | """
 6 | 
 7 | import scrubadub
 8 | 
# NOTE(review): ``text`` and ``lookup`` are not defined anywhere in this
# script; presumably they stand for the caller's input document and an
# identifier lookup, so this file reads as a design sketch rather than
# something runnable as-is.

# this should have very smart defaults, with high recall and relatively low
# precision. the placeholder method is default and uses {{}} notation to
# signify when text has been obfuscated
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")

# the surrogate replacement method makes it easy to replace phone numbers with
# fake phone numbers, for example. this makes it easy to read the content
clean_text = scrubadub.clean(text, replace_with="surrogate")

# the identifier replacement method replaces the personal information
# associated with each person in lookup with the same unique id to make it easy
# to detect the same person across document records.
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
23 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/credential.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from .base import RegexDetector
 4 | from ..filth import CredentialFilth
 5 | from scrubadub.detectors.catalogue import register_detector
 6 | 
 7 | 
@register_detector
class CredentialDetector(RegexDetector):
    """Remove username/password combinations from dirty dirty ``text``.
    """
    filth_cls = CredentialFilth
    name = 'credential'
    autoload = True

    # this regular expression searches for patterns like
    #     "username: root password: root"
    # that tend to occur very frequently in text. This does not currently catch
    # things like "username / password is root / root"
    # The named groups ``username`` and ``password`` capture the two values.
    regex = re.compile(r'''
        (username|login|u:)\s*:?\s*    # username might have : and whitespace
        (?P<username>[\w\-\.@+]*)      # capture the username for replacement
        \s+                            # some whitespace between
        (password|pw|p:)\s*:?\s*       # password might have : and whitespace
        (?P<password>.*)               # password can be anything until EOL
    ''', re.MULTILINE | re.VERBOSE | re.IGNORECASE)
27 | 


--------------------------------------------------------------------------------
/tests/test_postprocessor_prefix_postfix_replacer.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from scrubadub.post_processors.prefix_suffix import PrefixSuffixReplacer
 4 | from scrubadub.filth import EmailFilth
 5 | 
 6 | 
class PrefixSuffixReplacerTestCase(unittest.TestCase):
    def test_usage(self):
        """Check that each pass wraps the existing replacement string in the given prefix/suffix."""
        post_proc = PrefixSuffixReplacer()
        filths = [EmailFilth(0, 19, 'example@example.com')]
        # a new filth has no replacement string until a post processor sets one
        self.assertEqual(filths[0].replacement_string, None)

        # default prefix and suffix
        filths = post_proc.process_filth(filths)
        self.assertEqual(filths[0].replacement_string, '{{EMAIL}}')

        # a prefix of None adds nothing in front; only the suffix is appended
        post_proc = PrefixSuffixReplacer(prefix=None, suffix='>>')
        filths = post_proc.process_filth(filths)
        self.assertEqual(filths[0].replacement_string, '{{EMAIL}}>>')

        # a suffix of None adds nothing behind; only the prefix is prepended
        post_proc = PrefixSuffixReplacer(prefix='<<', suffix=None)
        filths = post_proc.process_filth(filths)
        self.assertEqual(filths[0].replacement_string, '<<{{EMAIL}}>>')

        # both given: the previous replacement string is wrapped on both sides
        post_proc = PrefixSuffixReplacer(prefix='||', suffix='||')
        filths = post_proc.process_filth(filths)
        self.assertEqual(filths[0].replacement_string, '||<<{{EMAIL}}>>||')


--------------------------------------------------------------------------------
/scrubadub/detectors/vehicle_licence_plate.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .base import RegionLocalisedRegexDetector
 5 | from ..filth.vehicle_licence_plate import VehicleLicencePlateFilth
 6 | 
 7 | 
@register_detector
class VehicleLicencePlateDetector(RegionLocalisedRegexDetector):
    """Detects standard british licence plates."""
    filth_cls = VehicleLicencePlateFilth
    name = 'vehicle_licence_plate'
    autoload = True

    # Vehicle Registration Plates from:
    # https://gist.github.com/harry-jones/755501192139820eeb65e030fe878f75
    # More cases are available in the above link, but they can cause the regex
    # to become quite greedy. For now keep it simple!

    # taken from the alphagov fork of scrubadub: https://github.com/alphagov/scrubadub

    # Only a ``GB`` pattern is defined. It has two alternatives, both bounded
    # by word boundaries and both allowing one optional whitespace character
    # before the final three letters.
    region_regex = {
        'GB': re.compile(
            # Current system followed by the old system
            r"""
                \b(
                    ([a-zA-Z]{2}[0-9]{2}(?:\s)?[a-zA-Z]{3})
                    |
                    ([a-zA-Z][0-9]{1,3}(?:\s)?[a-zA-Z]{3})
                )\b
            """,
            re.VERBOSE | re.IGNORECASE,
        ),
    }
35 | 


--------------------------------------------------------------------------------
/scrubadub/filth/phone.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import phonenumbers
 3 | 
 4 | from faker import Faker
 5 | from typing import List
 6 | 
 7 | from .base import Filth
 8 | from .. import utils
 9 | 
10 | 
class PhoneFilth(Filth):
    """Filth representing a phone number."""
    type = 'phone'

    @staticmethod
    def generate(faker: Faker) -> str:
        """Generates an example of this ``Filth`` type, usually using the faker python library.

        Candidates are drawn from ``faker`` until the ``phonenumbers`` package
        finds at least one number in the candidate for the faker's region.

        :param faker: The ``Faker`` class from the ``faker`` library
        :type faker: Faker
        :return: An example of this ``Filth``
        :rtype: str
        """
        _, region = utils.locale_split(faker._locales[0])
        while True:
            # Faker generates random numbers of the right format eg (###)###-####;
            # anything from an 'x' onwards is dropped from the candidate
            candidate = re.sub(r'x.*$', '', faker.phone_number())
            # phonenumbers checks that the candidate follows the rules around
            # area codes and is possibly valid
            found = list(phonenumbers.PhoneNumberMatcher(candidate, region))
            if len(found) >= 1:
                return candidate
33 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/remover.py:
--------------------------------------------------------------------------------
 1 | from typing import Sequence
 2 | 
 3 | from scrubadub.post_processors.catalogue import register_post_processor
 4 | from scrubadub.filth import Filth
 5 | from scrubadub.post_processors.base import PostProcessor
 6 | 
 7 | 
class FilthRemover(PostProcessor):
    """Removes all found filth from the original document.

    >>> import scrubadub
    >>> scrubber = scrubadub.Scrubber(post_processor_list=[
    ...     scrubadub.post_processors.FilthRemover(),
    ... ])
    >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
    'Contact me at  or '

    """
    name = 'filth_remover'  # type: str
    autoload = False
    index = 0

    def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]:
        """Processes the filth to remove the filth

        Each filth's ``replacement_string`` is set to the empty string; the
        items are modified in place and the same sequence is returned.

        :param filth_list: The filth to be removed
        :type filth_list: Sequence[Filth]
        :return: The processed filths
        :rtype: Sequence[Filth]
        """
        for filth_item in filth_list:
            filth_item.replacement_string = ''
        return filth_list
34 | 
35 | 
36 | register_post_processor(FilthRemover)
37 | 
38 | __all__ = ['FilthRemover']
39 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/url.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .base import RegexDetector
 5 | from ..filth import UrlFilth
 6 | 
 7 | 
@register_detector
class UrlDetector(RegexDetector):
    """Use regular expressions to remove URLs that begin with ``http://``,
    ``https://`` or ``www.`` from dirty dirty ``text``.

    With ``keep_domain=True``, this detector only obfuscates the path on a
    URL, not its domain. For example,
    ``http://twitter.com/someone/status/234978haoin`` becomes
    ``http://twitter.com/{{replacement}}``.
    """
    filth_cls = UrlFilth
    name = 'url'
    autoload = True

    # this regular expression conveniently captures the domain name and the
    # path separately (named groups ``domain`` and ``path``), which is useful
    # for keeping the domain name but sanitizing the path altogether
    regex = re.compile(r'''
        (?P<domain>
            (https?:\/\/(www\.)?|www\.)          # protocol http://, etc
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
            /?                                   # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
        )
    ''', re.VERBOSE)
34 | 


--------------------------------------------------------------------------------
/tests/test_detector_configuration.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import catalogue
 4 | import scrubadub
 5 | import scrubadub.detectors.catalogue
 6 | 
 7 | 
class DetectorConfigTestCase(unittest.TestCase):
    def test_register_detector(self):
        """Register a detector with autoload off and on, removing it after each registration."""
        class NewDetector(scrubadub.detectors.Detector):
            name = 'new_detector'

        # registering with autoload=False adds it to the catalogue and clears the flag
        scrubadub.detectors.catalogue.register_detector(NewDetector, autoload=False)
        self.assertTrue(NewDetector.name in scrubadub.detectors.catalogue.detector_catalogue)
        self.assertFalse(NewDetector.autoload)
        self.assertEqual(scrubadub.detectors.catalogue.detector_catalogue.get(NewDetector.name), NewDetector)

        # once removed, looking the name up raises a RegistryError
        scrubadub.detectors.catalogue.remove_detector(NewDetector)
        with self.assertRaises(catalogue.RegistryError):
            scrubadub.detectors.catalogue.detector_catalogue.get(NewDetector.name)

        # registering again with autoload=True sets the flag
        scrubadub.detectors.catalogue.register_detector(NewDetector, autoload=True)
        self.assertTrue(NewDetector.name in scrubadub.detectors.catalogue.detector_catalogue)
        self.assertTrue(NewDetector.autoload)
        self.assertEqual(scrubadub.detectors.catalogue.detector_catalogue.get(NewDetector.name), NewDetector)

        # clean up so the test detector does not stay in the catalogue
        scrubadub.detectors.catalogue.remove_detector(NewDetector)
28 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # tox (https://tox.readthedocs.io/) is a tool for running tests
 2 | # in multiple virtualenvs. This configuration file will run the
 3 | # test suite on all supported python versions. To use it, "pip install tox"
 4 | # and then run "tox" from this directory.
 5 | 
 6 | [tox]
 7 | envlist = py39, py38, py37, py36
 8 | 
 9 | [testenv]
10 | allowlist_externals = bash
11 | skip_install = False
12 | setenv =
13 |     PIP_INDEX_URL = {env:PIP_INDEX_URL}
14 |     LD_LIBRARY_PATH = {env:LD_LIBRARY_PATH}
15 |     LIBRARY_PATH = {env:LIBRARY_PATH}
16 |     C_INCLUDE_PATH = {env:C_INCLUDE_PATH}
17 |     CPP_INCLUDE_PATH = {env:CPP_INCLUDE_PATH}
18 | commands =
19 |     pip install --upgrade pip wheel setuptools
20 |     pip install -r requirements/python-dev
21 |     python3 -c "import nltk; nltk.download('punkt')"
22 |     bash -c "python3 -m spacy info | grep Pipelines | grep -qv en_core_web_trf && python -m spacy download en_core_web_trf || exit 0"
23 |     bash -c "python3 -m spacy info | grep Pipelines | grep -qv en_core_web_sm && python -m spacy download en_core_web_sm || exit 0"
24 |     bash -c "python3 -m spacy info | grep Pipelines | grep -qv fr_core_news_lg && python -m spacy download fr_core_news_lg || exit 0"
25 |     bash -c "python3 -m spacy info | grep Pipelines | grep -qv de_core_news_sm && python -m spacy download de_core_news_sm || exit 0"
26 |     python tests/run.py
27 | 


--------------------------------------------------------------------------------
/design/customize_filth_detection.py:
--------------------------------------------------------------------------------
 1 | """scrubadub has some very conservative defaults (high recall) for identifying
 2 | filth. One of the key ways in which scrubadub can be customized is in improving
 3 | the precision of filth detection.
 4 | 
 5 | For example, if a user knows that the word 'iPhone' is not a person's name, but
 6 | a product, then a user should be able to easily adapt how scrubadub identifies
 7 | names.
 8 | """
 9 | 
10 | import scrubadub
11 | 
12 | # fine-tune how scrubadub detects names and omit product names
13 | # https://github.com/deanmalmgren/scrubadub/issues/6
class MyNameDetector(scrubadub.detectors.TextBlobNameDetector):
    """Name detector that does not report the product name 'iPhone' as a name."""

    def iter_filth(self, text):
        """Yield the filth found by the parent detector, leaving out 'iPhone'.

        :param text: The dirty text to search for names
        :return: A generator of the filth found by the parent detector whose
            matched text is not ``'iPhone'``
        """
        for filth in super(MyNameDetector, self).iter_filth(text):
            # Compare the matched text: the detector yields Filth objects, so
            # comparing the object itself with a string never filters anything.
            if filth.text != "iPhone":
                yield filth
19 | 
20 | # instantiate a scrubber and change the name detector to use our custom class
21 | scrubber = scrubadub.Scrubber()
22 | scrubber.detectors['name'] = MyNameDetector()
23 | 
24 | # these methods on a Scrubber object should have identical
25 | # behavior to the scrubadub.clean convenience function
26 | clean_text = scrubber.clean(text)
27 | clean_text = scrubber.clean(text, replace_with="placeholder")
28 | clean_text = scrubber.clean(text, replace_with="surrogate")
29 | clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)
30 | 


--------------------------------------------------------------------------------
/docs/api_scrubadub.rst:
--------------------------------------------------------------------------------
 1 | .. _api_scrubadub:
 2 | 
 3 | scrubadub
 4 | =========
 5 | 
 6 | There are several convenience functions to make using scrubadub quick and simple.
 7 | These functions either remove the Filth from the text (such as ``scrubadub.clean``) or
 8 | return a list of Filth objects that were found (such as ``scrubadub.list_filth``).
 9 | These functions either work on a single document in a string (such as ``scrubadub.clean``) or
10 | work on a set of documents given in either a dictionary or list (such as ``scrubadub.clean_documents``).
11 | 
12 | scrubadub.clean
13 | ---------------
14 | 
15 | .. autofunction:: scrubadub.clean
16 | 
17 | scrubadub.clean_documents
18 | -------------------------
19 | 
20 | .. autofunction:: scrubadub.clean_documents
21 | 
22 | scrubadub.list_filth
23 | --------------------
24 | 
25 | .. autofunction:: scrubadub.list_filth
26 | 
27 | scrubadub.list_filth_documents
28 | ------------------------------
29 | 
30 | .. autofunction:: scrubadub.list_filth_documents
31 | 
32 | 
33 | scrubadub.Scrubber
34 | ------------------
35 | 
36 | All of the ``Detector``'s are managed by the ``Scrubber``. The main job of the
37 | ``Scrubber`` is to handle situations in which the same section of text contains
38 | different types of ``Filth``.
39 | 
40 | .. autoclass:: scrubadub.scrubbers.Scrubber
41 |     :members:
42 |     :undoc-members:
43 |     :show-inheritance:
44 | 
45 | 


--------------------------------------------------------------------------------
/tests/test_detector_emails.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
# NOTE(review): every test below is only a docstring with a BEFORE/AFTER pair
# plus a call to ``compare_before_after``. Presumably that helper (from
# ``BaseTestCase``, not shown here) reads the pair from the docstring, so the
# docstrings act as test data - confirm before rewording any of them.
class EmailTestCase(unittest.TestCase, BaseTestCase):

    def test_john_gmail(self):
        """
        BEFORE: My email is john@gmail.com
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()

    def test_John_gmail(self):
        """
        BEFORE: My email is John@gmail.com
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()

    def test_John1_example_com(self):
        """
        BEFORE: My email is John1@example.com
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()

    def test_adam_example_info(self):
        """
        BEFORE: My email is adam80@example.info
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()

    def test_uppercase(self):
        """
        BEFORE: My email is HELLO@EXAMPLE.COM
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()

    def test_fancy_john_gmail(self):
        """
        BEFORE: My email is john at gmail.com
        AFTER:  My email is {{EMAIL}}
        """
        self.compare_before_after()
49 | 


--------------------------------------------------------------------------------
/tests/test_detector_en_US_social_security_number.py:
--------------------------------------------------------------------------------
 1 | import faker
 2 | import unittest
 3 | from scrubadub.filth import SocialSecurityNumberFilth
 4 | 
 5 | from base import BaseTestCase
 6 | 
 7 | 
# US social security number cases. The BEFORE:/AFTER: pairs in the method
# docstrings are the fixtures consumed by compare_before_after() (helper from
# tests/base.py, not shown; presumably it reads the calling test's docstring).
class SSNTestCase(unittest.TestCase, BaseTestCase):

    # Hyphen-separated number.
    def test_example(self):
        """
        BEFORE: My social security number is 726-60-2033
        AFTER:  My social security number is {{SOCIAL_SECURITY_NUMBER}}
        """
        self.compare_before_after()

    # A second hyphen-separated number.
    def test_hyphens(self):
        """
        BEFORE: My social security number is 109-99-6000
        AFTER:  My social security number is {{SOCIAL_SECURITY_NUMBER}}
        """
        self.compare_before_after()

    # Dots as separators.
    def test_dots(self):
        """
        BEFORE: My social security number is 109.99.6000
        AFTER:  My social security number is {{SOCIAL_SECURITY_NUMBER}}
        """
        self.compare_before_after()

    # Spaces as separators.
    def test_spaces(self):
        """
        BEFORE: My social security number is 109 99 6000
        AFTER:  My social security number is {{SOCIAL_SECURITY_NUMBER}}
        """
        self.compare_before_after()

    # generate() must be reproducible once the faker is seeded; the expected
    # literal below is only valid for seed 4321 (and presumably for the faker
    # release the suite was written against).
    def test_generate(self):
        fake = faker.Faker('en_US')
        # The seed is applied on the Faker class after the instance exists.
        faker.Faker.seed(4321)

        self.assertEqual(
            '818-09-2900',
            SocialSecurityNumberFilth.generate(faker=fake),
        )
46 | 


--------------------------------------------------------------------------------
/docs/api_scrubadub_post.rst:
--------------------------------------------------------------------------------
 1 | .. _api_scrubadub_post:
 2 | 
 3 | scrubadub.post_processors
 4 | =========================
 5 | 
 6 | ``PostProcessor``\ s generally can be used to process the detected ``Filth``
 7 | objects and make changes to them.
 8 | 
 9 | These are a new addition to scrubadub and at the moment only simple ones
10 | exist that alter the replacement string.
11 | 
12 | .. autoclass:: scrubadub.post_processors.base.PostProcessor
13 |     :members:
14 |     :undoc-members:
15 |     :show-inheritance:
16 | 
17 | .. autoclass:: scrubadub.post_processors.filth_replacer.FilthReplacer
18 |     :members:
19 |     :undoc-members:
20 |     :show-inheritance:
21 | 
22 | .. autoclass:: scrubadub.post_processors.prefix_suffix.PrefixSuffixReplacer
23 |     :members:
24 |     :undoc-members:
25 |     :show-inheritance:
26 | 
27 | .. autoclass:: scrubadub.post_processors.remover.FilthRemover
28 |     :members:
29 |     :undoc-members:
30 |     :show-inheritance:
31 | 
32 | 
33 | Catalogue functions
34 | -------------------
35 | 
36 | .. _scrubadub.post_processors.register_post_processor:
37 | 
38 | scrubadub.post_processors.register_post_processor
39 | -------------------------------------------------
40 | 
41 | .. autofunction:: scrubadub.post_processors.register_post_processor
42 | 
43 | .. _scrubadub.post_processors.remove_post_processor:
44 | 
45 | scrubadub.post_processors.remove_post_processor
46 | -----------------------------------------------
47 | 
48 | .. autofunction:: scrubadub.post_processors.remove_post_processor
49 | 
50 | 


--------------------------------------------------------------------------------
/scrubadub/filth/date_of_birth.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import datetime
 3 | import dateparser
 4 | from faker import Faker
 5 | 
 6 | from .base import Filth
 7 | 
 8 | 
 9 | class DateOfBirthFilth(Filth):
10 |     type = 'date_of_birth'
11 |     min_age_years = 18
12 |     max_age_years = 100
13 | 
14 |     @staticmethod
15 |     def generate(faker: Faker) -> str:
16 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
17 | 
18 |         :param faker: The ``Faker`` class from the ``faker`` library
19 |         :type faker: Faker
20 |         :return: An example of this ``Filth``
21 |         :rtype: str
22 |         """
23 |         formats = [
24 |             '%c',  # Tue Aug 16 21:30:00 1988 (en_US); locale dependant
25 |             '%x',  # 08/16/1988 (en_US); locale dependant
26 |             '%a %d %b %Y',  # Sun 19 Jan 1999
27 |             '%A %d %B %Y',  # Sunday 19 January 1999
28 |             '%d-%m-%Y',  # 15-01-1999
29 |             '%A %dth, %B, %Y',  # Monday 08th, January, 1973
30 |         ]
31 |         return faker.date_of_birth().strftime(random.choice(formats))
32 | 
33 |     def is_valid(self) -> bool:
34 |         """Check to see if the found filth is valid."""
35 |         found_date = dateparser.parse(self.text)
36 |         if found_date is None:
37 |             return False
38 |         years_since_identified_date = datetime.date.today().year - found_date.year
39 |         return DateOfBirthFilth.min_age_years <= years_since_identified_date <= DateOfBirthFilth.max_age_years
40 | 


--------------------------------------------------------------------------------
/scrubadub/exceptions.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # this is the base exception that is thrown by scrubadub to make it
 4 | # easy to suppress all Scrubadub exceptions
 5 | class ScrubadubException(Exception):
 6 | 
 7 |     def __init__(self, *args, **kwargs):
 8 |         self.issues_url = 'http://github.com/LeapBeyond/scrubadub/issues'
 9 | 
10 |     def render(self, msg):
11 |         return msg % vars(self)
12 | 
13 | 
class UnicodeRequired(ScrubadubException):
    """Error telling the user that scrubadub requires unicode text.

    The message is deliberately friendly and points at further reading.
    """

    def __str__(self):
        # The text contains no ``%`` placeholders; it is still passed through
        # ``render`` like the messages of the sibling exceptions.
        message = (
            'scrubadub works best with unicode.\n'
            'Frustrated by unicode?\n'
            'Yeah, me too.\n'
            'But unicode sandwiches are awesome.\n'
            'http://bit.ly/unipain @nedbat\n'
        )
        return self.render(message)
27 | 
28 | 
# Marker exception with no behaviour of its own beyond ScrubadubException.
# NOTE(review): judging by the name it reports filth that was not expected;
# the raise sites are not in this file, so confirm there.
class UnexpectedFilth(ScrubadubException):
    pass
31 | 
32 | 
# Marker exception with no behaviour of its own beyond ScrubadubException.
# NOTE(review): judging by the name it reports a failure while merging Filth
# objects; the raise sites are not in this file, so confirm there.
class FilthMergeError(ScrubadubException):
    pass
35 | 
36 | 
class InvalidReplaceWith(ScrubadubException):
    """Error for a ``replace_with`` value that is not supported.

    :param replace_with: The rejected value; it is echoed in the message.
    """

    def __init__(self, replace_with):
        # The base initializer sets ``issues_url``, which the message needs.
        super().__init__()
        self.replace_with = replace_with

    def __str__(self):
        template = (
            'Invalid replace_with parameter %(replace_with)s. Can only use '
            '`placeholder` for the time being. If you have other ideas for '
            'replace_with functionality, please make a suggestion at '
            '%(issues_url)s'
        )
        return self.render(template)
50 | 


--------------------------------------------------------------------------------
/tests/test_detector_text_blob.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import scrubadub.detectors.catalogue
 4 | from base import BaseTestCase
 5 | 
 6 | import scrubadub
 7 | 
# Tests for the TextBlob-based name detector. The detector is registered with
# autoload=True before each test and removed again afterwards, so other test
# modules see the catalogue without it. The BEFORE:/AFTER: docstrings are the
# fixtures consumed by compare_before_after() (helper from tests/base.py, not
# shown; presumably it reads the calling test's docstring).
class TextBlobNameTestCase(unittest.TestCase, BaseTestCase):

    def setUp(self):
        # Imported here rather than at module level.
        # NOTE(review): presumably to keep the textblob dependency lazy; confirm.
        from scrubadub.detectors.text_blob import TextBlobNameDetector
        scrubadub.detectors.catalogue.register_detector(TextBlobNameDetector, autoload=True)

    # A capitalised first name at the start of the sentence is replaced.
    def test_john(self):
        """
        BEFORE: John is a cat
        AFTER:  {{NAME}} is a cat
        """
        self.compare_before_after()

    # Text without names must come back unchanged.
    def test_no_names(self):
        """
        BEFORE: Hello. Please testing.
        AFTER: Hello. Please testing.
        """
        self.compare_before_after()

    # Known limitation, kept as a skipped test to document it.
    @unittest.skip('lower names cause problems for textblob')
    def test_lower_names(self):
        """
        BEFORE: sarah is a friendly person
        AFTER: {{NAME}} is a friendly person
        """
        self.compare_before_after()

    # Replacing disallowed_nouns with a set is expected to make iter_filth
    # raise TypeError; list() forces the generator to actually run.
    def test_disallowed_nouns(self):
        import scrubadub.detectors.text_blob
        detector = scrubadub.detectors.text_blob.TextBlobNameDetector()
        detector.disallowed_nouns = set()
        with self.assertRaises(TypeError):
            list(detector.iter_filth('John is a cat'))

    def tearDown(self) -> None:
        # Undo the registration done in setUp.
        from scrubadub.detectors.text_blob import TextBlobNameDetector
        scrubadub.detectors.catalogue.remove_detector(TextBlobNameDetector)
46 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/credit_card.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from .base import RegexDetector
 4 | from ..filth import CreditCardFilth
 5 | from scrubadub.detectors.catalogue import register_detector
 6 | 
 7 | 
@register_detector
class CreditCardDetector(RegexDetector):
    """Remove credit-card numbers from dirty dirty ``text``.

    Supports Visa, MasterCard, American Express, Diners Club, Discover and JCB.

    Only unbroken digit runs are matched (no spaces or dashes), and the number
    must be directly preceded by a whitespace character, so a number at the
    very start of the text is not matched.
    """
    name = 'credit_card'
    filth_cls = CreditCardFilth
    # NOTE(review): presumably makes the catalogue load this detector by
    # default; confirm in scrubadub.detectors.catalogue.
    autoload = True

    # Regexes from:
    # http://www.regular-expressions.info/creditcard.html

    # Fake card numbers from:
    # https://www.paypalobjects.com/en_US/vhelp/paypalmanager_help/credit_card_numbers.htm

    # taken from the alphagov fork of scrubadub: https://github.com/alphagov/scrubadub

    # Looking at wikipedia, there are probably more numbers to detect:
    # https://en.wikipedia.org/wiki/Payment_card_number#Issuer_identification_number_.28IIN.29

    # TODO: regex doesn't match if the credit card number has spaces/dashes in

    # The pattern is assembled from adjacent raw strings; the trailing remarks
    # are Python comments, not part of the pattern. Because the pattern itself
    # contains no literal whitespace or '#', re.VERBOSE does not change what
    # is matched.
    regex = re.compile((
        r"(?<=\s)"                            # must follow a whitespace char
        r"(?:4[0-9]{12}(?:[0-9]{3})?"         # Visa
        r"|(?:5[1-5][0-9]{2}"                 # MasterCard
        r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}"
        r"|3[47][0-9]{13}"                    # American Express
        r"|3(?:0[0-5]|[68][0-9])[0-9]{11}"    # Diners Club
        r"|6(?:011|5[0-9]{2})[0-9]{12}"       # Discover
        r"|(?:2131|1800|35\d{3})\d{11})"      # JCB
    ), re.VERBOSE)
41 | 


--------------------------------------------------------------------------------
/tests/test_detector_en_GB_trn.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
# UK temporary (tax) reference number cases, all run with locale='en_GB'.
# The BEFORE:/AFTER: docstrings are the fixtures consumed by
# compare_before_after() (helper from tests/base.py, not shown; presumably it
# reads the calling test's docstring). They differ only in how the number is
# spaced.
class GBTrnTestCase(unittest.TestCase, BaseTestCase):

    # No spaces inside the number.
    def test_gbtrn_1(self):
        """
        BEFORE: My PAYE temp number is 99L99999, which is not permanent.
        AFTER:  My PAYE temp number is {{TAX_REFERENCE_NUMBER}}, which is not permanent.
        """
        self.compare_before_after(locale='en_GB')

    # Spaces around the letter.
    def test_gbtrn_2(self):
        """
        BEFORE: Enter a Temporary Reference Number that is 2 numbers, 1 letter, then 5 numbers, like 11 A 12345.
        AFTER:  Enter a Temporary Reference Number that is 2 numbers, 1 letter, then 5 numbers, like {{TAX_REFERENCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # Space only after the letter.
    def test_gbtrn_3(self):
        """
        BEFORE: It’s on your National Insurance card, benefit letter, payslip or P60. For example, 99L 99999.
        AFTER:  It’s on your National Insurance card, benefit letter, payslip or P60. For example, {{TAX_REFERENCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # Trailing digits split into two groups.
    def test_gbtrn_4(self):
        """
        BEFORE: Please verify the TRN 99 L 999 99.
        AFTER:  Please verify the TRN {{TAX_REFERENCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # Space only after the letter, short sentence.
    def test_gbtrn_5(self):
        """
        BEFORE: The number is 11A 12345.
        AFTER:  The number is {{TAX_REFERENCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')
42 | 


--------------------------------------------------------------------------------
/tests/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import sys
 5 | import subprocess
 6 | 
 7 | from wasabi import msg
 8 | 
# Shell commands that make up the full check suite. Each entry is executed
# through the shell from the repository root, in the order listed.
tests = [
    # Static type check and lint of the package.
    "mypy --config-file setup.cfg scrubadub/",
    "flake8  --config setup.cfg scrubadub/",
    # If py3.5 then examples with spacy don't work so disable doctests
    # (grep -v succeeds when the version string is NOT 3.5.x).
    'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub/ ./docs/ ; else nosetests ; fi',
    # Benchmarks are run as part of the suite.
    "python3 ./tests/benchmark_accuracy.py --fast",
    "python3 ./tests/benchmark_time.py",
    # Build the HTML docs, again skipped on Python 3.5.
    'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then cd docs && make html && cd - ; fi',
]
18 | 
19 | 
def run_test(command, directory):
    """Execute a command that runs a test and report whether it passed.

    :param command: Shell snippet to execute. It is run through the shell
        because the suite's commands use shell syntax (``if``, ``&&``, pipes).
    :param directory: Working directory in which to run the command.
    :return: The command's exit status; ``0`` means the test passed.
    """
    msg.text("RUNNING  " + command)
    # Let subprocess switch directory instead of splicing ``cd <dir> &&`` into
    # the command line: the spliced form breaks as soon as the path contains
    # spaces or shell metacharacters.
    completed = subprocess.run(command, shell=True, cwd=directory)
    if completed.returncode == 0:
        msg.good("TEST PASSED")
    else:
        msg.fail("TEST FAILED")
    msg.text('')
    return completed.returncode
34 | 
35 | 
# Repository root: the parent of the directory that contains this script.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Run every command, even after a failure, so the full report is printed.
# ``tests`` may be a single command string or any iterable of commands.
commands = [tests] if isinstance(tests, str) else list(tests)
exit_codes = [run_test(command, root_dir) for command in commands]

# Do not add the raw exit codes together: a negative code (child killed by a
# signal) can cancel a positive one, and a total that is a multiple of 256
# is reported to the parent process as exit status 0. Collapse to pass/fail.
returncode = 1 if any(code != 0 for code in exit_codes) else 0

if returncode == 0:
    msg.good("ALL TESTS PASSED")
else:
    msg.fail("SOME TESTS FAILED, SEE ABOVE")

sys.exit(returncode)
52 | 


--------------------------------------------------------------------------------
/tests/test_postprocessor_configuration.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import catalogue
 3 | import scrubadub
 4 | import scrubadub.post_processors.catalogue
 5 | 
 6 | 
# Round-trip test for the post-processor catalogue: register, inspect, remove,
# then register again with different settings.
class PostProcessorConfigTestCase(unittest.TestCase):
    def test_register_post_processor(self):
        # Throwaway post-processor that exists only for this test.
        class NewPostProcessor(scrubadub.post_processors.PostProcessor):
            name = 'new_post_processor'

        # The two positional values are checked below against the class's
        # ``autoload`` and ``index`` attributes.
        scrubadub.post_processors.catalogue.register_post_processor(NewPostProcessor, False, -1)

        self.assertTrue(NewPostProcessor.name in scrubadub.post_processors.catalogue.post_processor_catalogue)
        self.assertFalse(NewPostProcessor.autoload)
        self.assertEqual(-1, NewPostProcessor.index)
        self.assertEqual(scrubadub.post_processors.catalogue.post_processor_catalogue.get(NewPostProcessor.name), NewPostProcessor)

        # After removal a lookup by name must fail with the catalogue
        # library's RegistryError.
        scrubadub.post_processors.catalogue.remove_post_processor(NewPostProcessor)
        with self.assertRaises(catalogue.RegistryError):
            scrubadub.post_processors.catalogue.post_processor_catalogue.get(NewPostProcessor.name)

        # Registering again with other values must overwrite both attributes.
        scrubadub.post_processors.catalogue.register_post_processor(NewPostProcessor, True, 7927)
        self.assertTrue(NewPostProcessor.name in scrubadub.post_processors.catalogue.post_processor_catalogue)
        self.assertTrue(NewPostProcessor.autoload)
        self.assertEqual(7927, NewPostProcessor.index)
        self.assertEqual(scrubadub.post_processors.catalogue.post_processor_catalogue.get(NewPostProcessor.name), NewPostProcessor)

        # Leave the shared catalogue as it was found.
        scrubadub.post_processors.catalogue.remove_post_processor(NewPostProcessor)
30 | 


--------------------------------------------------------------------------------
/tests/test_detector_drivers_licence.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
# UK driving licence number cases, all run with locale='en_GB'. The
# BEFORE:/AFTER: docstrings are the fixtures consumed by
# compare_before_after() (helper from tests/base.py, not shown; presumably it
# reads the calling test's docstring). The same licence number is used
# throughout with different spacing.
class GBDriversTestCase(unittest.TestCase, BaseTestCase):

    # Space only before the final two digits.
    def test_gbdrivers_1(self):
        """
        BEFORE: The driving licence number of the claimant is MORGA753116SM91J 01, and a copy of the licence is attached.
        AFTER:  The driving licence number of the claimant is {{DRIVERS_LICENCE}}, and a copy of the licence is attached.
        """
        self.compare_before_after(locale='en_GB')

    # Spaces after the surname part and before the final two digits.
    def test_gbdrivers_2(self):
        """
        BEFORE: My DVLA NO is MORGA 753116SM91J 01 could you please check.
        AFTER:  My DVLA NO is {{DRIVERS_LICENCE}} could you please check.
        """
        self.compare_before_after(locale='en_GB')

    # No spaces at all.
    def test_gbdrivers_3(self):
        """
        BEFORE: My DVLA NO is MORGA753116SM91J01 could you please check.
        AFTER:  My DVLA NO is {{DRIVERS_LICENCE}} could you please check.
        """
        self.compare_before_after(locale='en_GB')

    # Split into five space-separated groups.
    def test_gbdrivers_4(self):
        """
        BEFORE: My DVLA NO is MORGA 753 116 SM91J 01 could you please check.
        AFTER:  My DVLA NO is {{DRIVERS_LICENCE}} could you please check.
        """
        self.compare_before_after(locale='en_GB')

    # Split into three space-separated groups.
    def test_gbdrivers_5(self):
        """
        BEFORE: My DVLA NO is MORGA 753116 SM91J01 could you please check.
        AFTER:  My DVLA NO is {{DRIVERS_LICENCE}} could you please check.
        """
        self.compare_before_after(locale='en_GB')
42 | 


--------------------------------------------------------------------------------
/tests/test_detector_twitter.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
# Twitter handle detection cases. The BEFORE:/AFTER: docstrings are the
# fixtures consumed by compare_before_after() (helper from tests/base.py, not
# shown; presumably it reads the calling test's docstring).
# NOTE(review): the class is called EmailTestCase although every case here
# targets Twitter handles; this looks like a copy-paste from the email tests.
# Consider renaming it to TwitterTestCase.
class EmailTestCase(unittest.TestCase, BaseTestCase):

    # An email address and a handle in the same sentence are tagged separately.
    def test_email_and_twitter(self):
        """
        BEFORE: My email is john@gmail.com and i tweet at @john_gmail
        AFTER:  My email is {{EMAIL}} and i tweet at {{TWITTER}}
        """
        self.compare_before_after()

    # Handle starting with a capital letter.
    def test_capitalise(self):
        """
        BEFORE: My tweeter is @John_gmail
        AFTER:  My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    # This handle is expected to be left untouched.
    # NOTE(review): presumably the detector deliberately skips handles that
    # look like service accounts; confirm in the Twitter detector.
    def test_twitter(self):
        """
        BEFORE: This is an invalid handle @TwitterInfo
        AFTER:  This is an invalid handle @TwitterInfo
        """
        self.compare_before_after()

    # Likewise expected to be left untouched.
    def test_admin(self):
        """
        BEFORE: This is an invalid handle @XYZAdminInfo
        AFTER:  This is an invalid handle @XYZAdminInfo
        """
        self.compare_before_after()

    # Upper-case handle containing digits.
    def test_uppercase(self):
        """
        BEFORE: My tweeter is @JOHN_JOHN123
        AFTER:  My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    # Handle with a leading underscore.
    def test_underscore(self):
        """
        BEFORE: My tweeter is @_JOHN_JOHN123
        AFTER:  My tweeter is {{TWITTER}}
        """
        self.compare_before_after()

    # Handle with leading and trailing underscores.
    def test_underscores(self):
        """
        BEFORE: My tweeter is @_JOHN_JOHN123_
        AFTER:  My tweeter is {{TWITTER}}
        """
        self.compare_before_after()
56 | 


--------------------------------------------------------------------------------
/docs/addresses.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Addresses
 3 | =========
 4 | 
 5 | Address detection is hard, despite the fact it may seem simple on the surface.
 6 | We use the `pyap <https://github.com/vladimarius/pyap>`_ package to detect addresses and `libpostal <https://github.com/openvenues/libpostal>`_ to verify them.
 7 | This is implemented in ``scrubadub_address.detectors.AddressDetector``, which is in a separate package and not enabled by default due to its dependencies on these two libraries.
 8 | We currently support British, American and Canadian addresses.
 9 | 
10 | Installation
11 | ------------
12 | 
13 | First libpostal needs to be installed.
14 | Full instructions can be found in the `libpostal documentation <https://github.com/openvenues/libpostal#installation-maclinux>`_, but a summary is given below for linux installation:
15 | 
16 | .. code-block:: console
17 | 
18 |     $ sudo apt-get install curl autoconf automake libtool pkg-config
19 |     $ git clone https://github.com/openvenues/libpostal
20 |     $ cd libpostal
21 |     $ ./bootstrap.sh
22 |     $ ./configure --prefix=/usr/local/
23 |     $ make -j4
24 |     $ sudo make install
25 | 
26 | Once you have installed libpostal, the remaining python dependencies can be installed:
27 | 
28 | .. code-block:: console
29 | 
30 |     $ pip install pypostal scrubadub_address
31 | 
32 | Usage
33 | -----
34 | 
35 | Once the dependencies are installed you can import the detector and add it to your ``Scrubber`` as shown below:
36 | 
37 | .. code-block:: pycon
38 | 
39 |     >>> import scrubadub, scrubadub_address
40 |     >>> scrubber = scrubadub.Scrubber()
41 |     >>> scrubber.add_detector(scrubadub_address.detectors.AddressDetector)
42 |     >>> scrubber.clean("I live at 6919 Bell Drives, East Jessicastad, MO 76908")
43 |     'I live at {{ADDRESS}}'
44 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. _quick_start:
 2 | 
 3 | .. include:: ../README.rst
 4 | 
 5 | Related work
 6 | ------------
 7 | 
 8 | ``scrubadub`` isn't the first package to attempt to remove personally
 9 | identifiable information from free text. There are a handful of other
10 | projects out there that have very similar aims and which provide some
11 | inspiration for how ``scrubadub`` should work.
12 | 
13 | -  `MITRE <http://mist-deid.sourceforge.net/>`__ gives the ability to
14 |    replace names with a placeholder like ``[NAME]`` or alternatively
15 |    replace names with fake names. Last release in 8/2014. Not on GitHub.
16 |    Unclear what language, although it looks like Python. It is clear that
17 |    the documentation sucks and is primarily intended for academic
18 |    audiences (docs are in papers).
19 | 
20 | -  `physionet has a few deidentification
21 |    packages <http://www.physionet.org/physiotools/software-index.shtml#deid>`__
22 |    that look pretty decent but are both written in perl and require
23 |    advance knowledge of what you are trying to replace. Intended for
24 |    HIPAA regulations. In particular,
25 |    `deid <http://www.physionet.org/physiotools/deid/>`__ has some good
26 |    lists of names that might be useful in spite of the fact it has 5k+
27 |    lines of gross perl.
28 | 
29 | 
30 | Contents
31 | --------
32 | 
33 | .. toctree::
34 |     :maxdepth: 2
35 |     :caption: Documentation
36 | 
37 |     Introduction <self>
38 |     usage
39 |     accuracy
40 |     names
41 |     addresses
42 |     creating_detectors
43 |     localization
44 |     contributing
45 |     changelog
46 | 
47 | .. toctree::
48 |     :maxdepth: 2
49 |     :name: api_toc
50 |     :caption: API Reference
51 | 
52 |     api_scrubadub
53 |     api_scrubadub_detectors
54 |     api_scrubadub_filth
55 |     api_scrubadub_post
56 |     api_scrubadub_comparison
57 | 
58 | 
59 | Indices and tables
60 | ------------------
61 | 
62 | * :ref:`genindex`
63 | * :ref:`modindex`
64 | * :ref:`search`
65 | 


--------------------------------------------------------------------------------
/design/customize_replacement_strings.py:
--------------------------------------------------------------------------------
 1 | """scrubadub uses {{}} notation by default to identify filth, but a user may
 2 | prefer to fine-tune how the filth is removed.
 3 | 
 4 | For example, if the input text is html, then a user may want the filth to be
 5 | included in a <span> tag that has a particular class on it to make it easy to
 6 | style these things.
 7 | 
 8 | Another example is a situation when a user wants to retain the domain name on a
 9 | URL but not the path.
10 | """
11 | 
12 | import scrubadub
13 | 
# NOTE: this file is a design sketch of the intended API. ``text`` and
# ``lookup`` are never defined here, so it is not runnable as written.

# fine tune the prefix and suffix for all scrubadub objects. because this is
# changing a class attribute on the base class, this should propagate to all
# filth
scrubadub.filth.Filth.prefix = '<span class="scrubadub filth">'
scrubadub.filth.Filth.suffix = '</span>'

# these methods should now all have that prefix and suffix
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")
clean_text = scrubadub.clean(text, replace_with="surrogate")
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)

# and so should these
scrubber = scrubadub.Scrubber()
clean_text = scrubber.clean(text)
clean_text = scrubber.clean(text, replace_with="placeholder")
clean_text = scrubber.clean(text, replace_with="surrogate")
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)


# reconfigure back to the old prefix and suffix combination and now keep the
# domain on UrlFilth
scrubadub.filth.Filth.prefix = '{{'
scrubadub.filth.Filth.suffix = '}}'
scrubadub.filth.UrlFilth.keep_domain = True

# these methods should now all use the restored '{{' / '}}' markers again and
# keep the domain name when replacing URLs
clean_text = scrubadub.clean(text)
clean_text = scrubadub.clean(text, replace_with="placeholder")
clean_text = scrubadub.clean(text, replace_with="surrogate")
clean_text = scrubadub.clean(text, replace_with="identifier", lookup=lookup)
45 | 


--------------------------------------------------------------------------------
/tests/test_detector_en_GB_nino.py:
--------------------------------------------------------------------------------
 1 | import faker
 2 | import unittest
 3 | from scrubadub.filth import NationalInsuranceNumberFilth
 4 | 
 5 | from base import BaseTestCase
 6 | 
 7 | 
# UK National Insurance number cases, all run with locale='en_GB'. The
# BEFORE:/AFTER: docstrings are the fixtures consumed by
# compare_before_after() (helper from tests/base.py, not shown; presumably it
# reads the calling test's docstring). The same number is used with different
# spacing.
class GBNinoTestCase(unittest.TestCase, BaseTestCase):

    # Digits in pairs, separated by spaces.
    def test_nino_1(self):
        """
        BEFORE: My NI number is AZ 12 34 56 A
        AFTER:  My NI number is {{NATIONAL_INSURANCE_NUMBER}}
        """
        self.compare_before_after(locale='en_GB')

    # No spaces, followed directly by a full stop.
    def test_nino_2(self):
        """
        BEFORE: Enter a National Insurance number that is 2 letters, 6 numbers, then A, B, C or D, like AZ123456A.
        AFTER:  Enter a National Insurance number that is 2 letters, 6 numbers, then A, B, C or D, like {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # Paired digits, followed directly by a full stop.
    def test_nino_3(self):
        """
        BEFORE: It’s on your National Insurance card, benefit letter, payslip or P60. For example, AZ 12 34 56 A.
        AFTER:  It’s on your National Insurance card, benefit letter, payslip or P60. For example, {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # All six digits in one group.
    def test_nino_4(self):
        """
        BEFORE: Please verify the NI AZ 123456 A.
        AFTER:  Please verify the NI {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # Digits in two groups of three.
    def test_nino_5(self):
        """
        BEFORE: The number is AZ 123 456 A.
        AFTER:  The number is {{NATIONAL_INSURANCE_NUMBER}}.
        """
        self.compare_before_after(locale='en_GB')

    # generate() is expected to return whatever the faker's ssn() returns; a
    # minimal stub replaces the real faker so the value is fixed.
    def test_generate(self):
        class Faker:
            def ssn(self):
                return 'ZZ061251T'

        self.assertEqual(
            'ZZ061251T',
            NationalInsuranceNumberFilth.generate(faker=Faker()),
        )
54 | 


--------------------------------------------------------------------------------
/tests/test_api_advanced.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
# Tests for the less common API features: removing a detector, changing the
# replacement markers, and identifier-style replacements. The BEFORE:/AFTER:
# docstrings are fixtures read through get_before_after() and
# compare_before_after() (helpers from tests/base.py, not shown).
class AdvancedTestCase(unittest.TestCase, BaseTestCase):

    # With the email detector removed the address must survive cleaning.
    def test_disable_email(self):
        """
        BEFORE: contact me at joe@example.com
        AFTER:  contact me at joe@example.com
        """
        before, after = self.get_before_after()
        import scrubadub
        scrubber = scrubadub.Scrubber()
        scrubber.remove_detector('email')
        self.check_equal(after, scrubber.clean(before))

    # Changing the class-level prefix/suffix changes how replaced filth is
    # wrapped in the output.
    def test_customize_filth_identification(self):
        """
        BEFORE: contact me at joe@example.com
        AFTER:  contact me at <b>EMAIL</b>
        """
        before, after = self.get_before_after()
        import scrubadub
        # Remember the current values: they are class attributes shared by
        # every other test in the process.
        prefix = scrubadub.filth.base.Filth.prefix
        suffix = scrubadub.filth.base.Filth.suffix
        scrubadub.filth.base.Filth.prefix = u'<b>'
        scrubadub.filth.base.Filth.suffix = u'</b>'
        try:
            scrubber = scrubadub.Scrubber()
            self.check_equal(after, scrubber.clean(before))
        finally:
            # Ensure that this is reset, no matter what happens above
            scrubadub.filth.base.Filth.prefix = prefix
            scrubadub.filth.base.Filth.suffix = suffix

    # replace_with='identifier' numbers the replacements across filth types.
    def test_identifier(self):
        """
        BEFORE: i'm on twitter (@john_smith) or can be reached at +1.800.346.1819
        AFTER:  i'm on twitter ({{TWITTER-0}}) or can be reached at {{PHONE-1}}
        """
        self.compare_before_after(replace_with='identifier')

    # The same handle seen twice must get the same identifier both times.
    def test_identifier_repeat(self):
        """
        BEFORE: i'm on twitter (@john_smith), but tweet @john instead, don't tweet me @john_smith.
        AFTER:  i'm on twitter ({{TWITTER-0}}), but tweet {{TWITTER-1}} instead, don't tweet me {{TWITTER-0}}.
        """
        self.compare_before_after(replace_with='identifier')
51 | 


--------------------------------------------------------------------------------
/tests/test_detector_phone_numbers.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
 6 | class PhoneNumberTestCase(unittest.TestCase, BaseTestCase):
 7 | 
 8 |     def create_docstring(self, phone_number):
 9 |         return """
10 |         BEFORE: My phone number is %s
11 |         AFTER:  My phone number is {{PHONE}}
12 |         """ % phone_number
13 | 
14 |     def check_phone_numbers(self, *phone_numbers):
15 |         for phone_number in phone_numbers:
16 |             self.compare_before_after(
17 |                 docstring=self.create_docstring(phone_number),
18 |             )
19 | 
20 |     def test_american_phone_number(self):
21 |         """test american-style phone numbers"""
22 |         self.check_phone_numbers(
23 |             '1-312-515-2239',
24 |             '+1-312-515-2239',
25 |             '1 (312) 515-2239',
26 |             '312-515-2239',
27 |             '(312) 515-2239',
28 |             '(312)515-2239',
29 |         )
30 | 
31 |     def test_extension_phone_numbers(self):
32 |         """test phone numbers with extensions"""
33 |         self.check_phone_numbers(
34 |             '312-515-2239 x12',
35 |             '312-515-2239 ext. 12',
36 |             '312-515-2239 ext.12',
37 |         )
38 | 
39 |     def test_international_phone_numbers(self):
40 |         """test international phone numbers"""
41 |         self.check_phone_numbers(
42 |             '+47 21 30 85 99',
43 |             '+45 69 19 88 56',
44 |             '+46 852 503 499',
45 |             '+31 619 837 236',
46 |             '+86 135 3727 4136',
47 |             '+61267881324',
48 |         )
49 | 
50 |     def test_multiple_phone_numbers(self):
51 |         # running this through scrubadub.clean replaces 'reached at
52 |         # 312.714.8142' with '{{EMAIL}}'. See issue
53 |         result = self.clean(
54 |             u'Call me on my cell 312.714.8142 or in my office 773.415.7432'
55 |         )
56 |         self.assertEqual(
57 |             result,
58 |             u'Call me on my cell {{PHONE}} or in my office {{PHONE}}',
59 |             'problem with multiple phone numbers: \n %s' % result,
60 |         )
61 | 


--------------------------------------------------------------------------------
/tests/test_detector_credentials.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from base import BaseTestCase
 4 | 
 5 | 
class CredentialsTestCase(unittest.TestCase, BaseTestCase):
    # Tests for the scrubbing of username/password pairs.
    #
    # Most methods consist of a docstring and a call to compare_before_after:
    # the docstring is test data (the helper in base.py extracts the input
    # text and the expected output from it), so it must not be reworded.
    # The ``\n`` sequences inside these (non-raw) docstrings are real newlines
    # in the text that gets cleaned.

    def test_root_root_combo(self):
        """
        BEFORE: username: root\npassword: root\n\n
        AFTER:  username: {{USERNAME}}\npassword: {{PASSWORD}}\n\n
        """
        self.compare_before_after()

    def test_whitespaceless(self):
        """
        BEFORE: username:root\npassword:crickets
        AFTER:  username:{{USERNAME}}\npassword:{{PASSWORD}}
        """
        self.compare_before_after()

    def test_colonless(self):
        """
        BEFORE: username root\npassword crickets
        AFTER:  username {{USERNAME}}\npassword {{PASSWORD}}
        """
        self.compare_before_after()

    def test_email_username(self):
        """sometimes there is no colon"""
        # Only checks that the sensitive values are gone; the exact
        # placeholders in the result are deliberately not asserted.
        result = self.clean(u'username: joe@example.com\npassword moi')
        self.assertNotIn("joe@example.com", result, 'email username remains "%s"' % result)
        self.assertNotIn("moi", result, 'password remains "%s"' % result)

    def test_alternate_keywords(self):
        """
        BEFORE: login snoop pw biggreenhat
        AFTER:  login {{USERNAME}} pw {{PASSWORD}}
        """
        self.compare_before_after()

    def test_singleletter_keywords(self):
        """
        BEFORE: u: snoop\np: biggreenhat
        AFTER:  u: {{USERNAME}}\np: {{PASSWORD}}
        """
        self.compare_before_after()

    def test_singleletter_keyword_exceptions(self):
        """Make sure that the single letter keywords do not make mistakes

        BEFORE: This is your problem
        AFTER:  This is your problem
        """
        self.compare_before_after()

    def test_camelcase_keywords(self):
        """
        BEFORE: UserName snoop PassWord biggreenhat
        AFTER:  UserName {{USERNAME}} PassWord {{PASSWORD}}
        """
        self.compare_before_after()
63 | 


--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | import scrubadub
 4 | 
 5 | 
# Make the name ``unicode`` available on every interpreter: where the builtin
# does not exist (NameError), alias it to ``str``.
try:
    unicode
except NameError:
    unicode = str  # Python 2 and 3 compatibility
10 | 
# this is a mixin class to make it easy to centralize a lot of the core
# functionality of the test suite
class BaseTestCase(object):
    """Mixin with helpers shared by the test cases.

    It relies on ``assertEqual`` being provided by the class it is mixed
    into (the test cases combine it with ``unittest.TestCase``).
    """

    def clean(self, text, locale='en_US', **kwargs):
        """Clean ``text`` through the top-level ``scrubadub.clean`` function.

        :param text: The dirty text to clean.
        :param locale: Locale forwarded to ``scrubadub.clean``.
        :param kwargs: Extra keyword arguments forwarded to ``scrubadub.clean``.
        :return: Whatever ``scrubadub.clean`` returns for the given text.
        """
        if 'replace_with' in kwargs:
            # Start from a fresh lookup object whenever a replacement strategy
            # is requested (presumably so that identifiers handed out in
            # earlier tests do not leak into this one).
            scrubadub.filth.base.Filth.lookup = scrubadub.utils.Lookup()
        return scrubadub.clean(text, locale=locale, **kwargs)

    def get_before_after(self, docstring=None):
        """Extract the input text and the expected output of a test.

        When ``docstring`` is not given, the call stack is walked and the
        docstring of the first calling method that carries both markers is
        used.

        :param docstring: Optional text holding the two marked sections.
        :return: Tuple ``(before, after)`` of stripped strings.
        :raises ValueError: If no method in the call stack has a docstring
            with both markers.
        """
        if docstring is None:
            for frame in inspect.stack():
                # frame[3] is the name of the function running in that frame.
                # Frames of plain functions, of the test runner, or of methods
                # without a docstring are skipped instead of raising.
                caller = getattr(self, frame[3], None)
                _docstring = getattr(caller, '__doc__', None)
                if not isinstance(_docstring, (str, unicode)):
                    continue
                if "BEFORE:" in _docstring and "AFTER:" in _docstring:
                    docstring = _docstring
                    break
            else:
                raise ValueError(
                    'Could not find a docstring containing both "BEFORE:" '
                    'and "AFTER:" in any of the calling methods.'
                )
        before, after = docstring.split("BEFORE:")[1].split("AFTER:")
        return unicode(before.strip()), unicode(after.strip())

    def check_equal(self, expected, actual):
        """Assert that ``actual`` equals ``expected`` with a readable message.

        :param expected: The value the test expects.
        :param actual: The value that was produced.
        """
        self.assertEqual(
            actual,
            expected,
            '\nEXPECTED:\n"%s"\n\nBUT GOT THIS:\n"%s"'%(expected, actual),
        )

    def compare_before_after(self, docstring=None, locale='en_US', **clean_kwargs):
        """Clean the marked input text and compare it with the expected text.

        :param docstring: Optional specification text; when omitted, the
            docstring of the calling test method is used.
        :param locale: Locale forwarded to :meth:`clean`.
        :param clean_kwargs: Extra keyword arguments forwarded to :meth:`clean`.
        """
        before, after = self.get_before_after(docstring=docstring)
        self.check_equal(after, self.clean(before, locale=locale, **clean_kwargs))
53 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/phone.py:
--------------------------------------------------------------------------------
 1 | import phonenumbers
 2 | 
 3 | from typing import Optional
 4 | 
 5 | from scrubadub.detectors.catalogue import register_detector
 6 | from .base import Detector
 7 | from ..filth import PhoneFilth
 8 | 
 9 | 
@register_detector
class PhoneDetector(Detector):
    """Remove phone numbers from dirty dirty ``text`` using
    `python-phonenumbers <https://github.com/daviddrysdale/python-phonenumbers>`_, a port of a
    Google project to correctly format phone numbers in text.

    Set the locale on the scrubber or detector to set the region used to search for valid phone numbers.
    If the locale is set to 'en_CA' Canadian numbers will be searched for, while setting the locale to 'en_GB'
    searches for British numbers.
    """
    filth_cls = PhoneFilth
    name = 'phone'
    autoload = True

    def iter_filth(self, text, document_name: Optional[str] = None):
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        # Build the filth through ``self.filth_cls`` rather than a hard-coded
        # class, so that a subclass overriding ``filth_cls`` takes effect.
        for match in phonenumbers.PhoneNumberMatcher(text, self.region):
            yield self.filth_cls(
                beg=match.start,
                end=match.end,
                text=match.raw_string,
                detector_name=self.name,
                document_name=document_name,
                locale=self.locale,
            )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        This detector reports every locale as supported.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        return True
56 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | from setuptools import setup, find_packages
 4 | 
# read in the description from README
# (the path is relative, so this expects to run from the directory that
# contains README.rst)
with open("README.rst") as stream:
    long_description = stream.read()

# base URL reused below for ``url`` and ``download_url``
github_url = 'https://github.com/LeapBeyond/scrubadub'
10 | 
11 | 
def read_packages_from_file(filename):
    """Yield the requirement entries listed in ``filename``.

    Everything from the first ``#`` on a line is treated as a comment.
    Blank lines and comment-only lines are skipped.

    :param filename: Path of the requirements file to read.
    :return: A generator of requirement strings without surrounding whitespace.
    """
    with open(filename, 'r') as stream:
        for line in stream:
            # Remove the comment first and strip afterwards; stripping first
            # would keep the whitespace between a requirement and a trailing
            # comment (e.g. ``'pkg  '`` for ``pkg  # note``).
            package = line.split('#', 1)[0].strip()
            if package:
                yield package
18 | 
19 | 
def get_package_list(location):
    """Return the requirements listed in ``requirements/<location>`` as a list."""
    requirements_file = os.path.join('requirements', location)
    packages = read_packages_from_file(requirements_file)
    return list(packages)
23 | 
24 | 
# get the version
# Every line of scrubadub/__init__.py that mentions "version" (case
# insensitive) is considered; the last whitespace-separated token of that
# line, with quotes removed, becomes the version. Later matching lines
# overwrite earlier ones, so the last such line in the file wins.
version = None
with open(os.path.join('scrubadub', '__init__.py')) as stream:
    for line in stream:
        if 'version' in line.lower():
            version = line.split()[-1].replace('"', '').replace("'", '')

setup(
    name='scrubadub',
    version=version,
    description=(
        "Clean personally identifiable information from dirty dirty text."
    ),
    long_description=long_description,
    url=github_url,
    download_url="%s/archives/master" % github_url,
    author='Dean Malmgren',
    author_email='dean.malmgren@datascopeanalytics.com',
    # NOTE(review): ``license`` says MIT while the classifier below says
    # Apache Software License -- confirm which one is correct.
    license='MIT',
    packages=find_packages(exclude=["tests", "tests.*"]),
    classifiers=[
        'Intended Audience :: Developers',
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: Apache Software License',
        'Natural Language :: English',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Topic :: Software Development :: Libraries',
        'Topic :: Scientific/Engineering :: Information Analysis',
        'Topic :: Text Processing',
        'Topic :: Utilities',
    ],
    # runtime dependencies are read from the file requirements/python
    install_requires=get_package_list('python'),
    include_package_data=True,
    # NOTE(review): the glob starts with ``scrubadub/`` although the key ''
    # applies it to every package -- confirm that the JSON files are actually
    # picked up when building a distribution.
    package_data={'': ['scrubadub/detectors/models/sklearn_address/*.json']},
    zip_safe=False,
)
66 | 


--------------------------------------------------------------------------------
/design/customize_via_training.py:
--------------------------------------------------------------------------------
 1 | """scrubadub currently removes personally identifiable information with some
 2 | regular expression and natural language processing techniques. These techniques
 3 | work very well in a wide range of circumstances, but they also tend to make
 4 | mistakes.
 5 | 
 6 | For example, the first sentence should obfuscate the name 'April' and
 7 | the second sentence should not obfuscate the month 'April'.
 8 | 
 9 | April is a good friend of mine. I hope to see her in April.
10 | 
11 | To make this possible, scrubadub needs to be able to incorporate some
12 | techniques for training a classifier to identify filth. The training interface
13 | is important and probably not something that is best done in a terminal, but it
14 | is important that the technical infrastructure is there for it to work.
15 | """
16 | 
17 | import scrubadub
18 | 
# NOTE: this file is a design sketch. ``training_documents``,
# ``person_identifies_filth`` and ``test_documents`` are not defined in this
# file, so the code below illustrates an intended interface rather than
# something that can be run as is.

# a TrainedScrubber can be taught what is dirty about a particular document.
scrubber = scrubadub.TrainedScrubber()
for document in training_documents:

    # TrainedScrubber.detect_filth just returns a list of the filth objects
    # that are returned by Scrubber.iter_filth. This is used to help make
    # classification easy for end users.
    filth_list = scrubber.detect_filth(document)

    # The filth_list is then refined by human input. It is very difficult to
    # imagine doing this in a terminal in an effective way (although `git add
    # -i` might be a decent example). I imagine that person_identifies_filth is
    # a web interface where users can easily brush text to improve recall and
    # adjust the preliminary filth_list to improve precision.
    filth_list = person_identifies_filth(document, filth_list)

    # The TrainedScrubber.train method should incorporate the filth_list into
    # its classifier and further return a cleaned document with the filth
    # removed in an appropriate way.
    cleaned_document = scrubber.train(document, filth_list)

# The TrainedScrubber.predict (or maybe just TrainedScrubber.clean?) method
# then uses the classifier to selectively clean filth based on the human
# input. This way, you might only have to train on ~1000 documents to do a
# good job of scrubbing the rest (imagine having to do this by hand for one
# million documents).
for document in test_documents:
    clean_document = scrubber.predict(document)
46 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
 1 | .. _contributing:
 2 | 
 3 | Contributing
 4 | ============
 5 | 
 6 | The overarching goal of this project is to remove personally identifiable
 7 | information from raw text as reliably as possible. In practice, this means that
 8 | this project, by default, will preferentially be overly conservative in removing
 9 | information that might be personally identifiable. As this project matures, I
10 | fully expect the project to become ever smarter about how it interprets and
11 | anonymizes raw text.
12 | 
Regardless of which personal information is identified, this project is committed
to being as agnostic as possible about the manner in which the text is
anonymized, so long as it is done with rigor and does not inadvertently lead to
`improper
anonymization <https://medium.com/@vijayp/of-taxis-and-rainbows-f6bc289679a1>`_.
Replacing with placeholders? Replacing with anonymous (but consistent) IDs?
Replacing with random metadata? Other ideas? All should be supported to make
this project as useful as possible to the people that need it.
20 | 
Another important aspect of this project is that we want to have extremely good
documentation and source code that is easy to read. If you notice a typo, an
error, a confusing statement, etc., please fix it!
24 | 
25 | 
26 | .. _contributing-quick-start:
27 | 
28 | Quick start
29 | -----------
30 | 
31 | 1. `Fork <https://github.com/LeapBeyond/scrubadub/fork>`_ and clone the
32 |    project:
33 | 
34 |    .. code-block:: bash
35 | 
36 |         git clone https://github.com/YOUR-USERNAME/scrubadub.git
37 | 
38 | 2. Create a python virtual environment and install the requirements
39 | 
40 |    .. code-block:: bash
41 | 
42 |        mkvirtualenv scrubadub
43 |        pip install -r requirements/python-dev
44 | 
45 | 3. Contribute! There are several `open issues
46 |    <https://github.com/LeapBeyond/scrubadub/issues>`_ that provide
47 |    good places to dig in. Check out the `contribution guidelines
48 |    <https://github.com/LeapBeyond/scrubadub/blob/master/CONTRIBUTING.md>`_
49 |    and send pull requests; your help is greatly appreciated!
50 | 
51 | 4. Run the test suite that is defined in ``.travis.yml`` to make sure
52 |    everything is working properly
53 | 
54 |    .. code-block:: bash
55 | 
56 |        ./tests/run.py
57 | 
58 |    Current build status: |Build Status|
59 | 
60 | .. |Build Status| image:: https://travis-ci.org/LeapBeyond/scrubadub.png
61 |    :target: https://travis-ci.org/LeapBeyond/scrubadub
62 | 


--------------------------------------------------------------------------------
/tests/test_detector_urls.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import scrubadub
 4 | 
 5 | from base import BaseTestCase
 6 | 
 7 | 
class UrlTestCase(unittest.TestCase, BaseTestCase):
    # Tests that URLs are replaced by a placeholder.
    #
    # NOTE(review): the method docstrings are presumably test data parsed by
    # BaseTestCase.compare_before_after (input text and expected output), so
    # they must not be reworded.

    def test_http(self):
        """
        BEFORE: http://bit.ly/aser is neat
        AFTER:  {{URL}} is neat
        """
        self.compare_before_after()

    def test_https(self):
        """
        BEFORE: https://bit.ly/aser is neat
        AFTER:  {{URL}} is neat
        """
        self.compare_before_after()

    def test_www(self):
        """
        BEFORE: www.bit.ly/aser is neat
        AFTER:  {{URL}} is neat
        """
        self.compare_before_after()


    def test_long_url(self):
        """
        BEFORE: https://this.is/a/long?url=very#url is good
        AFTER:  {{URL}} is good
        """
        self.compare_before_after()

    def test_two_urls(self):
        """
        BEFORE: http://bit.ly/number-one http://www.google.com/two
        AFTER:  {{URL}} {{URL}}
        """
        self.compare_before_after()
45 | 
46 | 
class UrlKeepDomainTestCase(unittest.TestCase, BaseTestCase):
    """Tests URL scrubbing when the domain of the URL is kept.

    The ``UrlFilth`` class attributes are overridden for each test and put
    back afterwards. The method docstrings are test data and must not be
    reworded.
    """

    # Class attributes of ``UrlFilth`` that are overridden during each test.
    _url_filth_overrides = {
        'keep_domain': True,
        'url_placeholder': 'path/to/something',
        'prefix': '',
        'suffix': '',
    }

    def setUp(self):
        url_filth = scrubadub.filth.UrlFilth
        # Remember only the attributes defined directly on UrlFilth; anything
        # inherited from a base class must become inherited again afterwards.
        own_attributes = vars(url_filth)
        self._saved_url_filth = {
            name: own_attributes[name]
            for name in self._url_filth_overrides
            if name in own_attributes
        }
        for name, value in self._url_filth_overrides.items():
            setattr(url_filth, name, value)
        super(UrlKeepDomainTestCase, self).setUp()

    def tearDown(self):
        url_filth = scrubadub.filth.UrlFilth
        for name in self._url_filth_overrides:
            if name in self._saved_url_filth:
                setattr(url_filth, name, self._saved_url_filth[name])
            elif name in vars(url_filth):
                # The attribute was inherited before the test: remove the
                # override instead of pinning a hard-coded value on UrlFilth,
                # which would stop it from following changes to the base class.
                delattr(url_filth, name)
        super(UrlKeepDomainTestCase, self).tearDown()

    def test_path_word_in_sentence(self):
        """
        BEFORE: Find jobs at http://facebook.com/jobs
        AFTER:  Find jobs at http://facebook.com/path/to/something
        """
        self.compare_before_after()

    def test_keep_domain(self):
        """
        BEFORE: http://public.com/this/is/very/private
        AFTER:  http://public.com/path/to/something
        """
        self.compare_before_after()

    def test_keep_domain_empty_path(self):
        """
        BEFORE: http://public.com/
        AFTER:  http://public.com/path/to/something
        """
        self.compare_before_after()
82 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/catalogue.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import catalogue
 3 | 
 4 | from typing import Type, Optional, Union, TYPE_CHECKING
 5 | 
 6 | if TYPE_CHECKING:
 7 |     from scrubadub.post_processors import PostProcessor
 8 | 
# Registry of the known post-processors, created with the ``catalogue``
# package under the namespace ('scrubadub', 'post_processors') and with
# entry points enabled.
post_processor_catalogue = catalogue.create('scrubadub', 'post_processors', entry_points=True)
10 | 
11 | 
def register_post_processor(post_processor: Type['PostProcessor'], autoload: Optional[bool] = None,
                            index: Optional[int] = None) -> None:
    """Register a PostProcessor for use with the ``Scrubber`` class.

    Call ``register_post_processor(NewPostProcessor)`` after the definition of a post-processor to add it to the
    post-processor catalogue under its ``name``, so that it can be used to process Filth.

    ``autoload`` and ``index`` are only written onto the class when they are given (i.e. not ``None``).

    :param post_processor: The ``PostProcessor`` to register with the scrubadub post-processor configuration.
    :type post_processor: PostProcessor class
    :param autoload: Whether to automatically load this ``PostProcessor`` on ``Scrubber`` initialisation.
    :type autoload: bool
    :param index: The location/index in which this ``PostProcessor`` should be added.
    :type index: int
    :raises ValueError: If ``post_processor`` is not a class.
    """
    if not inspect.isclass(post_processor):
        raise ValueError("post_processor should be a class, not an instance.")

    # Only overwrite the class-level settings that were explicitly supplied.
    for attribute, value in (('autoload', autoload), ('index', index)):
        if value is not None:
            setattr(post_processor, attribute, value)

    post_processor_catalogue.register(post_processor.name, func=post_processor)
38 | 
39 | 
def remove_post_processor(post_processor: Union[Type['PostProcessor'], str]) -> None:
    """Remove an already registered post-processor.

    Nothing happens if no post-processor is registered under the resulting name.

    :param post_processor: The ``PostProcessor`` class, or the name of the post-processor, to remove from the
        scrubadub post-processor configuration.
    :type post_processor: Union[Type['PostProcessor'], str]
    :raises ValueError: If ``post_processor`` is neither a string nor a class.
    """
    # Work out the registered name first, then remove it in a single place.
    if isinstance(post_processor, str):
        registered_name = post_processor
    elif inspect.isclass(post_processor):
        registered_name = post_processor.name
    else:
        raise ValueError("post-processor should be a class (not an instance) or a string.")

    if registered_name in post_processor_catalogue:
        catalogue._remove((*post_processor_catalogue.namespace, registered_name))
56 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/email.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from typing import Optional, Generator
 4 | 
 5 | from scrubadub.detectors.catalogue import register_detector
 6 | from .base import RegexDetector
 7 | from ..filth import EmailFilth, Filth
 8 | 
 9 | 
@register_detector
class EmailDetector(RegexDetector):
    """Detect email addresses in dirty dirty ``text`` with a regular
    expression. Addresses that are spelled out, like ``john at gmail.com``,
    are matched as well.
    """
    filth_cls = EmailFilth
    name = 'email'
    autoload = True

    # there may be better solutions than this out there and this certainly
    # doesn't do that great of a job with people that spell out the
    # hyphenation of their email address, but its a pretty solid start.
    #
    # adapted from https://gist.github.com/dideler/5219706
    regex = re.compile((
        r"\b[a-z0-9!#$%&'*+\/=?^_`{|}~-]"             # start with this character
        r"(?:"
        r"    [\.a-z0-9!#$%&'*+\/=?^_`{|}~-]{0,62}"   # valid next characters (max length 64 chars before @)
        r"    [a-z0-9!#$%&'*+\/=?^_`{|}~-]"           # end with this character
        r")?"
        r"(?:@|\sat\s)"                               # @ or the word 'at' instead
        r"[a-z0-9]"                                   # domain starts like this
        r"(?:"
        r"    (?=[a-z0-9-]*(\.|\sdot\s))"             # A lookahead to ensure there is a dot in the domain
        r"    (?:\.|\sdot\s|[a-z0-9-]){0,251}"        # might have a '.' or the word 'dot' instead
        r"    [a-z0-9]"                               # domain has max 253 chars, ends with one of these
        r")+\b"
    ), re.VERBOSE | re.IGNORECASE)

    # Cheap patterns used to decide whether the full regex needs to run at all
    at_matcher = re.compile(r"@|\sat\s", re.IGNORECASE)
    dot_matcher = re.compile(r"\.|\sdot\s", re.IGNORECASE)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        The full regular expression is only run when the text contains both
        an "at" and a "dot" (as a symbol or spelled out); otherwise nothing
        is yielded.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        if not self.at_matcher.search(text):
            return
        if not self.dot_matcher.search(text):
            return
        yield from super().iter_filth(text=text, document_name=document_name)
56 | 


--------------------------------------------------------------------------------
/tests/test_utils_canonical_string_set.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from scrubadub.utils import CanonicalStringSet
 4 | 
 5 | class CanonicalStringSetTestCase(unittest.TestCase):
 6 | 
 7 |     def test_init(self):
 8 |         """make sure that lower case casting works in __init__"""
 9 |         s = CanonicalStringSet(['TKTK', 'tKtK', 'Tktk'])
10 |         self.assertTrue('tktk' in s)
11 |         self.assertEqual(len(s), 1)
12 | 
13 |     def test_add(self):
14 |         """make sure that lower case casting works in add"""
15 |         s = CanonicalStringSet()
16 |         s.add('TKTK')
17 |         s.add('tKtK')
18 |         s.add('Tktk')
19 |         self.assertTrue('tktk' in s)
20 |         self.assertEqual(len(s), 1)
21 | 
22 |     def test_update(self):
23 |         """make sure lower case casting works in update"""
24 |         s = CanonicalStringSet()
25 |         s.update(['TKTK', 'tKtK', 'Tktk'])
26 |         self.assertTrue('tktk' in s)
27 |         self.assertEqual(len(s), 1)
28 | 
29 |     def test_update_again(self):
30 |         """make sure udpate works properly"""
31 |         s = CanonicalStringSet(['tktk'])
32 |         s.update(set(['KtKt']))
33 |         self.assertTrue('tktk' in s)
34 |         self.assertTrue('ktkt' in s)
35 |         self.assertIsInstance(s, CanonicalStringSet)
36 | 
37 |     def test_contains(self):
38 |         """make sure __contains__ casts things properly"""
39 |         s = CanonicalStringSet(['tktk'])
40 |         self.assertTrue('TKTK' in s)
41 |         self.assertTrue('Tktk' in s)
42 |         self.assertTrue('tKtK' in s)
43 | 
44 |     def test_pop(self):
45 |         """make sure pop deals with capitalized things properly"""
46 |         s = CanonicalStringSet(['TKTK'])
47 |         self.assertEqual(s.pop(), 'tktk')
48 | 
49 |     def test_remove(self):
50 |         """make sure remove works properly"""
51 |         s = CanonicalStringSet(['tktk'])
52 |         s.remove('TKTK')
53 |         self.assertFalse('tktk' in s)
54 | 
55 |     def test_discard(self):
56 |         """make sure discard works properly"""
57 |         s = CanonicalStringSet(['tktk'])
58 |         s.discard('TKTK')
59 |         s.discard('TkTk')
60 |         s.discard('Tktk')
61 |         self.assertFalse('tktk' in s)
62 | 
63 |     def test_non_string(self):
64 |         """ensure error is thrown when non string is added"""
65 |         s = CanonicalStringSet(['tktk'])
66 |         s.add('123')
67 |         with self.assertRaises(TypeError):
68 |             s.add(123)
69 |         with self.assertRaises(TypeError):
70 |             s.add(None)
71 | 
72 |     # TODO: add more tests for all of the other set operations to make sure
73 |     # people get what they expect
74 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/postalcode.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .base import RegionLocalisedRegexDetector
 5 | from ..filth.postalcode import PostalCodeFilth
 6 | 
 7 | 
 8 | @register_detector
 9 | class PostalCodeDetector(RegionLocalisedRegexDetector):
10 |     """Detects postal codes, currently only British post codes are supported."""
11 |     filth_cls = PostalCodeFilth
12 |     name = 'postalcode'
13 |     autoload = True
14 |     region_regex = {
15 |         # Informed by https://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom#Validation
16 |         # and validated against https://osdatahub.os.uk/downloads/open/CodePointOpen
17 |         'GB': re.compile(r"""
18 |             (
19 |                 # Girobank postcode
20 |                 (?:[gG][iI][rR] {0,}0[aA]{2})|
21 |                 (?:  # British Overseas Territories in usual format
22 |                     (?:
23 |                         [aA][sS][cC][nN]|
24 |                         [sS][tT][hH][lL]|
25 |                         [tT][dD][cC][uU]|
26 |                         [bB][bB][nN][dD]|
27 |                         [bB][iI][qQ][qQ]|
28 |                         [fF][iI][qQ][qQ]|
29 |                         [pP][cC][rR][nN]|
30 |                         [sS][iI][qQ][qQ]|
31 |                         [iT][kK][cC][aA]
32 |                     )
33 |                     \ {0,}1[zZ]{2}
34 |                 )|
35 |                 (?:  # British Overseas Territories in zip-code format
36 |                     (KY[0-9]|MSR|VG|AI)[ -]{0,}[0-9]{4}
37 |                 )|
38 |                 # (?:  # Bermuda including this causes too many false positives, so excluded for now
39 |                 #     [a-zA-Z]{2}\ {0,}[0-9]{2}
40 |                 # )|
41 |                 (?:  # British Forces Post Office
42 |                     [Bb][Ff][Pp][Oo]\ {0,}[0-9]{1,4}
43 |                 )|
44 |                 (?:  # Mainland British postcodes
45 |                     (?:
46 |                         (?:[Ww][Cc][0-9][abehmnprvwxyABEHMNPRVWXY])|
47 |                         (?:[Ee][Cc][1-4][abehmnprvwxyABEHMNPRVWXY])|
48 |                         (?:[Nn][Ww]1[Ww])|
49 |                         (?:[Ss][Ee]1[Pp])|
50 |                         (?:[Ss][Ww]1[abehmnprvwxyABEHMNPRVWXY])|
51 |                         (?:[EeNnWw]1[a-hjkpstuwA-HJKPSTUW])|
52 |                         (?:[BbEeGgLlMmNnSsWw][0-9][0-9]?)|
53 |                         (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yxA-HK-XY][0-9][0-9]?)
54 |                     )
55 |                     \ {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}
56 |                 )
57 |             )
58 |         """, re.VERBOSE),
59 |     }
60 | 


--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import scrubadub
 3 | 
 4 | 
 5 | class APITestCase(unittest.TestCase):
 6 | 
 7 |     def test_clean(self):
 8 |         """Test the top level clean api"""
 9 |         self.assertEqual(
10 |             "This is a test message for {{EMAIL}}",
11 |             scrubadub.clean("This is a test message for example@exampe.com"),
12 |         )
13 | 
14 |     def test_clean_documents(self):
15 |         """Test the top level clean_documents api"""
16 |         self.assertEqual(
17 |             {
18 |                 "first.txt": "This is a test message for {{EMAIL}}",
19 |                 "second.txt": "Hello {{TWITTER}} call me on {{PHONE}}.",
20 |             },
21 |             scrubadub.clean_documents(
22 |                 {
23 |                     "first.txt": "This is a test message for example@exampe.com",
24 |                     "second.txt": "Hello @Jane call me on +33 4 41 26 62 36.",
25 |                 },
26 |             ),
27 |         )
28 | 
29 |     def test_list_filth(self):
30 |         """Test the top level list_filth api"""
31 |         filths = scrubadub.list_filth("This is a test message for example@example.com")
32 |         self.assertEqual(
33 |             [scrubadub.filth.EmailFilth(text='example@example.com', detector_name='email', beg=27, end=46)],
34 |             filths,
35 |         )
36 | 
37 |     def test_list_filth_docuemnts(self):
38 |         """Test the top level list_filth_documents api"""
39 |         filths = scrubadub.list_filth_documents(
40 |             {
41 |                 "first.txt": "This is a test message for example@example.com",
42 |                 "second.txt": "Hello @Jane call me on +33 4 41 26 62 36.",
43 |             }
44 |         )
45 |         self.assertEqual(
46 |             scrubadub.Scrubber._sort_filths([
47 |                 scrubadub.filth.EmailFilth(
48 |                     text='example@example.com', document_name='first.txt', detector_name='email', beg=27, end=46
49 |                 ),
50 |                 scrubadub.filth.TwitterFilth(
51 |                     text='@Jane', document_name='second.txt', detector_name='twitter', beg=6, end=11
52 |                 ),
53 |                 scrubadub.filth.PhoneFilth(
54 |                     text='+33 4 41 26 62 36', document_name='second.txt', detector_name='phone', beg=23, end=40
55 |                 ),
56 |             ]),
57 |             scrubadub.Scrubber._sort_filths(filths),
58 |         )
59 | 
60 |     def test_quickstart(self):
61 |         """Test the example given in the quick start docs"""
62 |         text = "My cat can be contacted on example@example.com, or 1800 555-5555"
63 |         self.assertEqual(
64 |             'My cat can be contacted on {{EMAIL}}, or {{PHONE}}',
65 |             scrubadub.clean(text),
66 |         )
67 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/prefix_suffix.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Sequence
 2 | 
 3 | from scrubadub.filth import Filth
 4 | from scrubadub.post_processors.catalogue import register_post_processor
 5 | from scrubadub.post_processors.base import PostProcessor
 6 | 
 7 | 
 8 | class PrefixSuffixReplacer(PostProcessor):
 9 |     """Add a prefix and/or suffix to the Filth's replacement string.
10 | 
11 |     >>> import scrubadub
12 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
13 |     ...     scrubadub.post_processors.FilthReplacer(),
14 |     ... ])
15 |     >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
16 |     'Contact me at PHONE or EMAIL'
17 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
18 |     ...     scrubadub.post_processors.FilthReplacer(),
19 |     ...     scrubadub.post_processors.PrefixSuffixReplacer(prefix='{{', suffix='}}'),
20 |     ... ])
21 |     >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
22 |     'Contact me at {{PHONE}} or {{EMAIL}}'
23 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
24 |     ...     scrubadub.post_processors.FilthReplacer(),
25 |     ...     scrubadub.post_processors.PrefixSuffixReplacer(prefix='<b>', suffix='</b>'),
26 |     ... ])
27 |     >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
28 |     'Contact me at <b>PHONE</b> or <b>EMAIL</b>'
29 | 
30 |     """
31 |     name = 'prefix_suffix_replacer'  # type: str
32 |     autoload = False
33 |     index = 1
34 | 
35 |     def __init__(self, prefix: Optional[str] = '{{', suffix: Optional[str] = '}}', name: Optional[str] = None):
36 |         super(PrefixSuffixReplacer, self).__init__(name=name)
37 | 
38 |         self.prefix = prefix
39 |         self.suffix = suffix
40 | 
41 |     def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]:
42 |         """Processes the filth to add prefixes and suffixes to the replacement text
43 | 
44 |         :param filth_list: The filth to add prefixes and suffixes to
45 |         :type filth_list: Sequence[Filth]
46 |         :return: The processed filths
47 |         :rtype: Sequence[Filth]
48 |         """
49 |         for filth_item in filth_list:
50 |             if filth_item.replacement_string is None:
51 |                 filth_item.replacement_string = filth_item.type.upper()
52 | 
53 |             if self.prefix is not None and self.suffix is not None:
54 |                 filth_item.replacement_string = self.prefix + filth_item.replacement_string + self.suffix
55 |             elif self.prefix is not None:
56 |                 filth_item.replacement_string = self.prefix + filth_item.replacement_string
57 |             elif self.suffix is not None:
58 |                 filth_item.replacement_string = filth_item.replacement_string + self.suffix
59 | 
60 |         return filth_list
61 | 
62 | 
63 | register_post_processor(PrefixSuffixReplacer)
64 | 
65 | __all__ = ['PrefixSuffixReplacer']
66 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/catalogue.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import catalogue
 3 | 
 4 | from typing import Type, Optional, Union, TYPE_CHECKING
 5 | 
 6 | if TYPE_CHECKING:
 7 |     from scrubadub.detectors import Detector
 8 | 
 9 | detector_catalogue = catalogue.create('scrubadub', 'detectors', entry_points=True)
10 | 
11 | 
12 | def register_detector(detector: Type['Detector'], *, autoload: Optional[bool] = None) -> Type['Detector']:
13 |     """Register a detector for use with the ``Scrubber`` class.
14 | 
15 |     You can use ``register_detector(NewDetector, autoload=True)`` after your detector definition to automatically
16 |     register it with the ``Scrubber`` class so that it can be used to remove Filth.
17 | 
18 |     The argument ``autoload`` decides whether a new ``Scrubber()`` instance should load this ``detector`` by default.
19 | 
20 |     .. code:: pycon
21 | 
22 |         >>> import scrubadub
23 |         >>> class NewDetector(scrubadub.detectors.Detector):
24 |         ...     pass
25 |         >>> scrubadub.detectors.register_detector(NewDetector, autoload=False)
26 |         <class 'scrubadub.detectors.catalogue.NewDetector'>
27 | 
28 |     :param detector: The ``Detector`` to register with the scrubadub detector configuration.
29 |     :type detector: Detector class
30 |     :param autoload: Whether to automatically load this ``Detector`` on ``Scrubber`` initialisation.
31 |     :type autoload: Optional[bool]
32 |     """
33 |     if not inspect.isclass(detector):
34 |         raise ValueError("detector should be a class, not an instance.")
35 | 
36 |     if autoload is not None:
37 |         detector.autoload = autoload
38 | 
39 |     detector_catalogue.register(detector.name, func=detector)
40 | 
41 |     return detector
42 | 
43 | 
44 | def remove_detector(detector: Union[Type['Detector'], str]):
45 |     """Remove an already registered detector.
46 | 
47 |     .. code:: pycon
48 | 
49 |         >>> import scrubadub
50 |         >>> class NewDetector(scrubadub.detectors.Detector):
51 |         ...     pass
52 |         >>> scrubadub.detectors.catalogue.register_detector(NewDetector, autoload=False)
53 |         <class 'scrubadub.detectors.catalogue.NewDetector'>
54 |         >>> scrubadub.detectors.catalogue.remove_detector(NewDetector)
55 | 
56 |     :param detector: The ``Detector`` class, or the name of the detector, to remove from the detector catalogue.
57 |     :type detector: Union[Type['Detector'], str]
58 |     :raises ValueError: If ``detector`` is neither a class nor a string (for example, if it is
59 |         a ``Detector`` instance).
60 |     """
61 |     if isinstance(detector, str):
62 |         if detector in detector_catalogue:
63 |             catalogue._remove((*detector_catalogue.namespace, detector))
64 | 
65 |     elif inspect.isclass(detector):
66 |         if detector.name in detector_catalogue:
67 |             catalogue._remove((*detector_catalogue.namespace, detector.name))
68 | 
69 |     else:
70 |         raise ValueError("detector should be a class (not an instance) or a string.")
71 | 


--------------------------------------------------------------------------------
/tests/test_detector_skype.py:
--------------------------------------------------------------------------------
 1 | import faker
 2 | import unittest
 3 | 
 4 | import scrubadub.detectors.catalogue
 5 | from scrubadub.filth import SkypeFilth
 6 | 
 7 | from base import BaseTestCase
 8 | 
 9 | import scrubadub
10 | 
11 | class SkypeTestCase(unittest.TestCase, BaseTestCase):
12 | 
13 |     def setUp(self):
14 |         from scrubadub.detectors.skype import SkypeDetector
15 |         scrubadub.detectors.catalogue.register_detector(SkypeDetector, autoload=True)
16 | 
17 |     def test_inline_skype_name(self):
18 |         """
19 |         BEFORE: contact me on skype (dean.malmgren) to chat
20 |         AFTER:  contact me on skype ({{SKYPE}}) to chat
21 |         """
22 |         self.compare_before_after()
23 | 
24 |     def test_pre_inline_skype_name(self):
25 |         """
26 |         BEFORE: i'm dean.malmgren on skype
27 |         AFTER:  i'm {{SKYPE}} on skype
28 |         """
29 |         self.compare_before_after()
30 | 
31 |     def test_parenthetical_skype(self):
32 |         """
33 |         BEFORE: i'm on skype (dean.malmgren) or can be reached on my cell
34 |         AFTER:  i'm on skype ({{SKYPE}}) or can be reached on my cell
35 |         """
36 |         self.compare_before_after()
37 | 
38 |     def test_skype_signature(self):
39 |         """
40 |         BEFORE: skype: dean.malmgren\nnerd
41 |         AFTER:  skype: {{SKYPE}}\nnerd
42 |         """
43 |         self.compare_before_after()
44 | 
45 |     def test_skype_addition(self):
46 |         """
47 |         BEFORE: I have added you on Skype. My ID is dean.malmgren
48 |         AFTER:  I have added you on Skype. My ID is {{SKYPE}}
49 |         """
50 |         self.compare_before_after()
51 | 
52 |     def test_skype_usernames(self):
53 |         """test different skype username formats"""
54 |         usernames = (
55 |             "joecool",
56 |             "joe,cool",
57 |             "joe.cool",
58 |             "joe-cool",
59 |         )
60 |         docstring_template ="""
61 |         BEFORE: My Skype is %s
62 |         AFTER:  My Skype is {{SKYPE}}
63 |         """
64 |         for username in usernames:
65 |             self.compare_before_after(docstring_template % username)
66 | 
67 |     def test_all_caps_words_nearby(self):
68 |         """
69 |         BEFORE: SCREAM to get my attention on Skype (dean.malmgren)
70 |         AFTER:  SCREAM to get my attention on Skype ({{SKYPE}})
71 |         """
72 |         self.compare_before_after()
73 | 
74 |     def test_no_triggers(self):
75 |         """
76 |         BEFORE: SCREAM to get my attention because Im not on instant messengers
77 |         AFTER:  SCREAM to get my attention because Im not on instant messengers
78 |         """
79 |         self.compare_before_after()
80 | 
81 |     def test_generate(self):
82 |         class Faker:
83 |             def user_name(self):
84 |                 return 'brian12'
85 | 
86 |         self.assertEqual(
87 |             'brian12',
88 |             SkypeFilth.generate(faker=Faker()),
89 |         )
90 | 
91 |     def tearDown(self) -> None:
92 |         from scrubadub.detectors.skype import SkypeDetector
93 |         scrubadub.detectors.catalogue.register_detector(SkypeDetector, autoload=False)
94 | 


--------------------------------------------------------------------------------
/tests/test_detector_credit_card.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | from base import BaseTestCase
  4 | 
  5 | 
  6 | class CreditCardTestCase(unittest.TestCase, BaseTestCase):
  7 |     """
  8 |     Test cases for Credit Card number removal.
  9 |     All these will clash with PASSPORT filth.
 10 |     """
 11 | 
 12 |     def test_american_express(self):
 13 |         """
 14 |         BEFORE: My credit card is 378282246310005.
 15 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 16 |         """
 17 |         self.compare_before_after()
 18 | 
 19 |     def test_american_express2(self):
 20 |         """
 21 |         BEFORE: My credit card is 371449635398431.
 22 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 23 |         """
 24 |         self.compare_before_after()
 25 | 
 26 |     def test_american_corporate(self):
 27 |         """
 28 |         BEFORE: My credit card is 378734493671000.
 29 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 30 |         """
 31 |         self.compare_before_after()
 32 | 
 33 |     def test_diners_club(self):
 34 |         """
 35 |         BEFORE: My credit card is 30569309025904.
 36 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 37 |         """
 38 |         self.compare_before_after()
 39 | 
 40 |     def test_diners_club2(self):
 41 |         """
 42 |         BEFORE: My credit card is 38520000023237.
 43 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 44 |         """
 45 |         self.compare_before_after()
 46 | 
 47 |     def test_discover(self):
 48 |         """
 49 |         BEFORE: My credit card is 6011111111111117.
 50 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 51 |         """
 52 |         self.compare_before_after()
 53 | 
 54 |     def test_discover2(self):
 55 |         """
 56 |         BEFORE: My credit card is 6011000990139424.
 57 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 58 |         """
 59 |         self.compare_before_after()
 60 | 
 61 |     def test_jcb(self):
 62 |         """
 63 |         BEFORE: My credit card is 3530111333300000.
 64 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 65 |         """
 66 |         self.compare_before_after()
 67 | 
 68 |     def test_jcb2(self):
 69 |         """
 70 |         BEFORE: My credit card is 3566002020360505.
 71 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 72 |         """
 73 |         self.compare_before_after()
 74 | 
 75 |     def test_mastercard(self):
 76 |         """
 77 |         BEFORE: My credit card is 5555555555554444.
 78 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 79 |         """
 80 |         self.compare_before_after()
 81 | 
 82 |     def test_mastercard2(self):
 83 |         """
 84 |         BEFORE: My credit card is 5105105105105100.
 85 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 86 |         """
 87 |         self.compare_before_after()
 88 | 
 89 |     def test_visa(self):
 90 |         """
 91 |         BEFORE: My credit card is 4111111111111111.
 92 |         AFTER:  My credit card is {{CREDIT_CARD}}.
 93 |         """
 94 |         self.compare_before_after()
 95 | 
 96 |     def test_visa2(self):
 97 |         """
 98 |         BEFORE: My credit card is 4012888888881881.
 99 |         AFTER:  My credit card is {{CREDIT_CARD}}.
100 |         """
101 |         self.compare_before_after()
102 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/text_blob.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import textblob
 3 | 
 4 | from textblob.blob import BaseBlob
 5 | from textblob.en.taggers import PatternTagger
 6 | 
 7 | from typing import Optional, Generator
 8 | 
 9 | from scrubadub.detectors.catalogue import register_detector
10 | from .base import RegexDetector
11 | from ..filth import NameFilth, Filth
12 | from ..utils import CanonicalStringSet
13 | 
14 | # BaseBlob uses NLTKTagger as its pos_tagger, but that tagger gives wrong results, so use PatternTagger instead
15 | BaseBlob.pos_tagger = PatternTagger()
16 | 
17 | 
18 | @register_detector
19 | class TextBlobNameDetector(RegexDetector):
20 |     """Use part of speech tagging from textblob to clean proper nouns out of the dirty dirty
21 |     ``text``. Disallow particular nouns by adding them to the ``TextBlobNameDetector.disallowed_nouns`` set.
22 |     """
23 |     filth_cls = NameFilth
24 |     name = 'text_blob_name'
25 |     autoload = False
26 | 
27 |     disallowed_nouns = CanonicalStringSet(["skype"])
28 | 
29 |     def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
30 |         """Yields discovered filth in the provided ``text``.
31 | 
32 |         :param text: The dirty text to clean.
33 |         :type text: str
34 |         :param document_name: The name of the document to clean.
35 |         :type document_name: str, optional
36 |         :return: An iterator to the discovered :class:`Filth`
37 |         :rtype: Iterator[:class:`Filth`]
38 |         """
39 | 
40 |         if not isinstance(self.disallowed_nouns, CanonicalStringSet):
41 |             raise TypeError(
42 |                 'NameDetector.disallowed_nouns must be CanonicalStringSet'
43 |             )
44 | 
45 |         # find the set of proper nouns using textblob.
46 |         proper_nouns = set()
47 |         blob = textblob.TextBlob(text)
48 |         for word, part_of_speech in blob.tags:
49 |             is_proper_noun = part_of_speech in ("NNP", "NNPS")
50 |             if is_proper_noun and word.lower() not in self.disallowed_nouns:
51 |                 proper_nouns.add(word)
52 | 
53 |         # use a regex to replace the proper nouns by first escaping any
54 |         # lingering punctuation in the regex
55 |         # http://stackoverflow.com/a/4202559/564709
56 |         if proper_nouns:
57 |             re_list = []
58 |             for proper_noun in proper_nouns:
59 |                 re_list.append(r'\b' + re.escape(str(proper_noun)) + r'\b')
60 |             self.regex = re.compile('|'.join(re_list))
61 |             yield from super(TextBlobNameDetector, self).iter_filth(text, document_name=document_name)
62 |         return
63 | 
64 |     @classmethod
65 |     def supported_locale(cls, locale: str) -> bool:
66 |         """Returns true if this ``Detector`` supports the given locale.
67 | 
68 |         :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
69 |                        underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
70 |         :type locale: str
71 |         :return: ``True`` if the locale is supported, otherwise ``False``
72 |         :rtype: bool
73 |         """
74 |         language, region = cls.locale_split(locale)
75 | 
76 |         # fr and de are possible through plugins, but need to be implemented on this end
77 |         # https://github.com/sloria/textblob-fr and https://github.com/markuskiller/textblob-de
78 |         return language in ['en', ]
79 | 


--------------------------------------------------------------------------------
/tests/test_locale.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import scrubadub
 3 | import scrubadub.utils
 4 | 
 5 | 
 6 | class LocaleTestCase(unittest.TestCase):
 7 | 
 8 |     def test_top_level(self):
 9 |         """Test that locales work at the top level"""
10 |         self.assertEqual(
11 |             scrubadub.clean("Localisation is important for phone numbers '06 87 49 77 56'", locale='en_GB'),
12 |             "Localisation is important for phone numbers '06 87 49 77 56'",
13 |         )
14 |         self.assertEqual(
15 |             scrubadub.clean("Localisation is important for phone numbers '06 87 49 77 56'", locale='fr_FR'),
16 |             "Localisation is important for phone numbers '{{PHONE}}'",
17 |         )
18 |         self.assertEqual(
19 |             scrubadub.clean("Localisation is important for phone numbers '(0121) 496 0852'", locale='en_GB'),
20 |             "Localisation is important for phone numbers '{{PHONE}}'",
21 |         )
22 |         self.assertEqual(
23 |             scrubadub.clean("Localisation is important for phone numbers '(0121) 496 0852'", locale='fr_FR'),
24 |             "Localisation is important for phone numbers '(0121) 496 0852'",
25 |         )
26 | 
27 |     def test_bad_locale(self):
28 |         with self.assertRaises(ValueError):
29 |             scrubadub.clean("Localisation is important for phone numbers '(0121) 496 0852'", locale='non_existant')
30 | 
31 |     def test_locale_in_filth(self):
32 |         filths = scrubadub.list_filth("Localisation is important for phone numbers '(0121) 496 0852'", locale='en_GB')
33 |         self.assertEqual(len(filths), 1)
34 |         self.assertEqual(filths[0].locale, 'en_GB')
35 | 
36 |     def test_locale_split(self):
37 |         self.assertEqual(
38 |             scrubadub.utils.locale_split('en_US'),
39 |             ('en', 'US'),
40 |         )
41 |         self.assertEqual(
42 |             scrubadub.utils.locale_split('de_DE'),
43 |             ('de', 'DE'),
44 |         )
45 |         self.assertEqual(
46 |             scrubadub.utils.locale_split('en_GB'),
47 |             ('en', 'GB'),
48 |         )
49 |         self.assertEqual(
50 |             scrubadub.utils.locale_split('en'),
51 |             ('en', 'US'),
52 |         )
53 |         self.assertEqual(
54 |             scrubadub.utils.locale_split('en_GB.ISO8859-1'),
55 |             ('en', 'GB'),
56 |         )
57 |         self.assertEqual(
58 |             scrubadub.utils.locale_split('ru_RU.UTF-8'),
59 |             ('ru', 'RU'),
60 |         )
61 |         self.assertEqual(
62 |             scrubadub.utils.locale_split('tt_RU.UTF-8@iqtelif'),
63 |             ('tt', 'RU'),
64 |         )
65 |         with self.assertRaises(ValueError):
66 |             scrubadub.utils.locale_split('non_existant')
67 | 
68 |     def test_locale_transform(self):
69 |         with self.assertRaises(ValueError):
70 |             scrubadub.utils.locale_transform('not_exist'),
71 | 
72 |         self.assertEqual(
73 |             scrubadub.utils.locale_transform('en'),
74 |             'en_US.ISO8859-1',
75 |         )
76 |         self.assertEqual(
77 |             scrubadub.utils.locale_transform('fr'),
78 |             'fr_FR.ISO8859-1',
79 |         )
80 |         self.assertEqual(
81 |             scrubadub.utils.locale_transform('fr_CA'),
82 |             'fr_CA.ISO8859-1',
83 |         )
84 |         self.assertEqual(
85 |             scrubadub.utils.locale_transform('zh'),
86 |             'zh_CN.eucCN',
87 |         )


--------------------------------------------------------------------------------
/docs/names.rst:
--------------------------------------------------------------------------------
  1 | 
  2 | Name Detection
  3 | ==============
  4 | 
  5 | There are several detectors that can be used to detect names:
  6 | 
  7 | 1. `Stanford <https://nlp.stanford.edu/software/CRF-NER.html>`_ detector
  8 |     * Best accuracy, requires java to be installed
  9 | 2. `Spacy v3 <https://explosion.ai/blog/spacy-v3-nightly/>`_ detector
 10 |     * Almost as good as Stanford NER, but easier to install
 11 | 3. `TextBlob <https://textblob.readthedocs.io/en/dev/>`_ detector
 12 |     * Has a very high false positive rate, use with caution
 13 | 
 14 | All of these detectors are optional and so are not enabled by default.
 15 | To enable them you must install any dependencies, import them and finally add them to your ``Scrubber``.
 16 | In the following sections examples are given for this.
 17 | 
 18 | Stanford NER detector
 19 | ---------------------
 20 | 
 21 | To run the Stanford NER detector you will need both java and the nltk python package.
 22 | On debian linux, java can be installed with:
 23 | 
 24 | .. code-block:: console
 25 | 
 26 |     $ apt-get install openjdk-14-jre
 27 | 
 28 | And then the python dependencies can be installed with:
 29 | 
 30 | .. code-block:: console
 31 | 
 32 |     $ pip install scrubadub_stanford
 33 | 
 34 | Once this has been done, the ``StanfordEntityDetector`` can be used with the following:
 35 | 
 36 | .. code-block:: pycon
 37 | 
 38 |     >>> import scrubadub, scrubadub_stanford
 39 |     >>> scrubber = scrubadub.Scrubber()
 40 |     >>> scrubber.add_detector(scrubadub_stanford.detectors.StanfordEntityDetector)
 41 |     >>> scrubber.clean("My name is John")
 42 |     'My name is {{NAME}}'
 43 | 
 44 | Spacy
 45 | -----
 46 | 
 47 | This is the suggested name detector, since it's easy to install and works pretty well.
 48 | Spacy v3 requires python version >= 3.6 and < 3.9, as python 3.9 is not yet supported by spacy.
 49 | 
 50 | To install all dependencies for the Spacy detector you can do:
 51 | 
 52 | .. code-block:: console
 53 | 
 54 |     $ pip install scrubadub_spacy
 55 | 
 56 | Then to run it you can add it to your ``Scrubber``, like so:
 57 | 
 58 | .. code-block:: pycon
 59 | 
 60 |     >>> import scrubadub, scrubadub_spacy
 61 |     >>> scrubber = scrubadub.Scrubber()
 62 |     >>> scrubber.add_detector(scrubadub_spacy.detectors.SpacyEntityDetector)
 63 |     >>> scrubber.clean("My name is John")
 64 |     'My name is {{NAME}}'
 65 | 
 66 | It is also possible to enable other tags from the Spacy Entity tagger, such as Location and Organisation.
 67 | This can be done with the ``enable_*`` parameters in the initialiser:
 68 | 
 69 | .. code-block:: pycon
 70 | 
 71 |     >>> import scrubadub, scrubadub_stanford
 72 |     >>> scrubber = scrubadub.Scrubber()
 73 |     >>> scrubber.add_detector(scrubadub_stanford.detectors.StanfordEntityDetector(
 74 |     ...     enable_person=True, enable_organization=True, enable_location=True
 75 |     ... ))
 76 |     >>> scrubber.clean("My name is John and I work at the United Nations in Geneva")
 77 |     'My name is {{NAME}} and I work at the {{ORGANIZATION}} in {{LOCATION}}'
 78 | 
 79 | TextBlob
 80 | --------
 81 | 
 82 | It is suggested not to use this detector due to its high false positive rate, however it is useful in some situations.
 83 | Please test it on your data to ensure it works well.
 84 | This detector is already installed in the base scrubadub package and so you only need scrubadub installed to run it.
 85 | 
 86 | .. code-block:: console
 87 | 
 88 |     $ pip install scrubadub
 89 | 
 90 | Then to run it you can add it to your ``Scrubber``, like so:
 91 | 
 92 | .. code-block:: pycon
 93 | 
 94 |     >>> import scrubadub
 95 |     >>> scrubber = scrubadub.Scrubber()
 96 |     >>> scrubber.add_detector(scrubadub.detectors.TextBlobNameDetector)
 97 |     >>> scrubber.clean("My name is John")
 98 |     'My name is {{NAME}}'
 99 | 
100 | 


--------------------------------------------------------------------------------
/tests/test_detector_postal_codes.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import zipfile
 3 | import pathlib
 4 | import requests
 5 | import unittest
 6 | import warnings
 7 | 
 8 | import scrubadub
 9 | 
10 | class PostalCodesTestCase(unittest.TestCase):
11 | 
12 |     def test_bad_locale(self):
13 |         """test a non existant region"""
14 |         with self.assertRaises(ValueError):
15 |             scrubadub.detectors.PostalCodeDetector(locale='non_existant')
16 | 
17 |     def test_not_implemented_locale(self):
18 |         """test a non existant region"""
19 |         scrubber = scrubadub.Scrubber(locale='fr_FR')
20 |         with warnings.catch_warnings():
21 |             warnings.simplefilter("error")
22 |             with self.assertRaises(UserWarning):
23 |                 scrubber.add_detector(scrubadub.detectors.PostalCodeDetector)
24 | 
25 |     def test_gb(self):
26 |         """test a simple matching"""
27 | 
28 |         to_test = [
29 |             # positive assertions
30 |             ("BX1 1LT", True),
31 |             ("sw1A 0AA", True),
32 |             ("EC2V 7hh", True),
33 |             ("M25DB", True),
34 |             ("eh12ng", True),
35 |             ("BT1 5GS", True),
36 |             ("EC1A 1BB", True),
37 |             ("W1A 0AX", True),
38 |             ("M1 1AE", True),
39 |             ("B33 8TH", True),
40 |             ("CR2 6XH", True),
41 |             ("DN55 1PT", True),
42 |             ("CM2 0PP", True),
43 |             ("EC3M 5AD", True),
44 |             # negative assertions
45 |             ("1", False),
46 |             ("23", False),
47 |             ("456", False),
48 |             ("4567", False),
49 |             ("750621", False),
50 |             ("95130-642", False),
51 |             ("95130-64212", False),
52 |         ]
53 | 
54 |         test_str = 'this is a {} test string'
55 |         detector = scrubadub.detectors.PostalCodeDetector(locale='en_GB')
56 | 
57 |         for postal_code, result in to_test:
58 |             matches = list(detector.iter_filth(test_str.format(postal_code)))
59 |             if result:
60 |                 self.assertEquals(len(matches), 1)
61 |                 self.assertEquals(matches[0].text, postal_code)
62 |             else:
63 |                 self.assertEquals(matches, [])
64 | 
65 |     def test_extensive(self):
66 |         zip_location = pathlib.Path(__file__).parent / 'code_point_uk_post_codes.zip'
67 | 
68 |         # Download an extensive list of all postcodes
69 |         if not zip_location.exists():
70 |             url = 'https://api.os.uk/downloads/v1/products/CodePointOpen/downloads?area=GB&format=CSV&redirect'
71 |             r = requests.get(url, allow_redirects=True)
72 |             with open(zip_location.absolute(), 'wb') as f:
73 |                 f.write(r.content)
74 | 
75 |         detector = scrubadub.detectors.PostalCodeDetector(locale='en_GB')
76 | 
77 |         # Run the detector against this list to ensure we pickup all post codes
78 |         with zipfile.ZipFile(zip_location.absolute()) as zip:
79 |             data_file_names = [
80 |                 name for name in zip.namelist()
81 |                 if name.lower().endswith('.csv') and name.startswith('Data/CSV')
82 |             ]
83 |             for data_file_name in data_file_names:
84 |                 with zip.open(data_file_name) as data_file:
85 |                     df = pd.read_csv(data_file, header=None)
86 |                     post_codes = df.loc[:, 0].sample(frac=.1).values.tolist()
87 |                     for post_code in post_codes:
88 |                         filth_list = list(detector.iter_filth(post_code))
89 |                         error_message = "Unable to match postcode {} from {}".format(post_code, data_file_name)
90 |                         self.assertEquals(1, len(filth_list), error_message)
91 |                         self.assertEquals(post_code, filth_list[0].text)
92 | 


--------------------------------------------------------------------------------
/tests/test_detector.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import catalogue
 3 | import scrubadub.detectors.catalogue
 4 | 
 5 | from scrubadub.detectors.base import Detector, RegexDetector
 6 | from scrubadub.detectors.url import UrlDetector
 7 | from scrubadub.detectors.email import EmailDetector
 8 | from scrubadub.filth.base import Filth
 9 | from scrubadub.exceptions import UnexpectedFilth
10 | import scrubadub
11 | 
12 | 
class DetectorTestCase(unittest.TestCase):
    """Tests for the base ``Detector``/``RegexDetector`` classes and the detector catalogue."""

    def test_detector_names(self):
        """make sure detector names appear in Filth"""
        samples = [
            (UrlDetector, 'www.google.com'),
            (EmailDetector, 'example@example.com'),
        ]
        for detector_cls, sample_text in samples:
            named_detector = detector_cls(name='example_name')
            found = list(named_detector.iter_filth(sample_text))
            self.assertEqual(len(found), 1)
            self.assertEqual(found[0].detector_name, 'example_name')

    def test_name_from_filth_cls(self):
        """A detector without an explicit name takes it from its filth class type."""
        class OldFilth(Filth):
            type = 'old_filth'

        class OldDetector(Detector):
            filth_cls = OldFilth

        self.assertEqual(OldDetector().name, 'old_filth')
        self.assertEqual(Detector().name, 'detector')

    def test_abstract_detector_raises_error(self):
        """Test that the Detector abstract class raises an error when iter_filth is not implemented"""
        abstract_detector = Detector()
        self.assertRaises(
            NotImplementedError,
            lambda: abstract_detector.iter_filth_documents(['text'], ['text.txt']),
        )
        self.assertRaises(
            NotImplementedError,
            lambda: abstract_detector.iter_filth('text'),
        )

    def test_abstract_regex_filth_raises_error(self):
        """Test that the RegexDetector abstract class raises an error when the filth_cls is incorrectly set"""
        class BadRegexDetector(RegexDetector):
            filth_cls = str

        bad_detector = BadRegexDetector()
        # the result is consumed with list() inside the checked call
        self.assertRaises(TypeError, lambda: list(bad_detector.iter_filth('text')))

    def test_abstract_regex_raises_error(self):
        """Test that the RegexDetector abstract class raises an error when there is no regex set"""
        bare_detector = RegexDetector()
        self.assertRaises(ValueError, lambda: list(bare_detector.iter_filth('text')))

    def test_non_detector_registration(self):
        """Test to ensure an error is raised if you try to register something that's not a detector"""
        register = scrubadub.detectors.catalogue.register_detector

        # a detector *instance* (rather than a class) is rejected ...
        twitter_instance = scrubadub.detectors.TwitterDetector()
        with self.assertRaises(ValueError):
            register(twitter_instance, autoload=False)

        # ... and so is an arbitrary object
        with self.assertRaises(ValueError):
            register(123, autoload=False)

    def test_detector_registration(self):
        """Test to ensure adding a detector adds it to the configuration as expected"""
        registry = scrubadub.detectors.catalogue

        class Temp(scrubadub.detectors.base.Detector):
            name = "temp"

        # unknown before registration
        with self.assertRaises(catalogue.RegistryError):
            registry.detector_catalogue.get(Temp.name)

        registry.register_detector(Temp, autoload=False)
        self.assertEqual(Temp, registry.detector_catalogue.get(Temp.name))

        # and unknown again once removed
        registry.remove_detector(Temp)
        with self.assertRaises(catalogue.RegistryError):
            registry.detector_catalogue.get(Temp.name)
90 | 


--------------------------------------------------------------------------------
/scrubadub/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import locale as locale_module
  3 | 
  4 | from typing import Optional, Tuple, List
  5 | 
  6 | try:
  7 |     unicode  # type: ignore  # tell mypy to ignore the fact that this doesnt exist in python3
  8 | except NameError:
  9 |     basestring = str  # Compatibility for Python 2 and 3
 10 | 
 11 | 
 12 | class CanonicalStringSet(set):
 13 |     """Just like a set, except it makes sure that all elements are lower case
 14 |     strings.
 15 |     """
 16 | 
 17 |     def _cast_as_lower(self, x):
 18 |         if not isinstance(x, basestring):
 19 |             raise TypeError('CanonicalStringSet only works with strings')
 20 |         return x.lower()
 21 | 
 22 |     def __init__(self, *elements):
 23 |         super(CanonicalStringSet, self).__init__()
 24 |         if elements:
 25 |             self.update(*elements)
 26 | 
 27 |     def __contains__(self, element):
 28 |         return super(CanonicalStringSet, self).__contains__(
 29 |             self._cast_as_lower(element)
 30 |         )
 31 | 
 32 |     def add(self, element):
 33 |         return super(CanonicalStringSet, self).add(
 34 |             self._cast_as_lower(element)
 35 |         )
 36 | 
 37 |     def update(self, elements):
 38 |         for element in elements:
 39 |             self.add(element)
 40 | 
 41 |     def remove(self, element):
 42 |         return super(CanonicalStringSet, self).remove(
 43 |             self._cast_as_lower(element)
 44 |         )
 45 | 
 46 |     def discard(self, element):
 47 |         return super(CanonicalStringSet, self).discard(
 48 |             self._cast_as_lower(element)
 49 |         )
 50 | 
 51 | 
 52 | class Lookup(object):
 53 |     """The Lookup object is used to create an in-memory reference table to
 54 |     create unique identifiers for ``Filth`` that is encountered.
 55 |     """
 56 | 
 57 |     def __init__(self):
 58 |         self.table = {}
 59 | 
 60 |     def __getitem__(self, key):
 61 |         try:
 62 |             return self.table[key]
 63 |         except KeyError:
 64 |             self.table[key] = len(self.table)
 65 |             return self.table[key]
 66 | 
 67 | 
 68 | def locale_transform(locale: str) -> str:
 69 |     """Normalise the locale string, e.g. 'fr' -> 'fr_FR'.
 70 | 
 71 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
 72 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
 73 |     :type locale: str
 74 |     :return: The normalised locale string
 75 |     :rtype: str
 76 |     """
 77 |     normalised = locale_module.normalize(locale.lower())
 78 |     if normalised not in locale_module.locale_alias.values():
 79 |         raise ValueError("Unknown locale '{}', not in locale.locale_alias".format(locale))
 80 |     return normalised
 81 | 
 82 | 
 83 | def locale_split(locale: str) -> Tuple[Optional[str], Optional[str]]:
 84 |     """Split the locale string into the language and region.
 85 | 
 86 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
 87 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
 88 |     :type locale: str
 89 |     :return: The two-letter language code and the two-letter region code in a tuple.
 90 |     :rtype: tuple, (str, str)
 91 |     """
 92 |     locale = locale_transform(locale)
 93 | 
 94 |     regex = r'(?P<language>[0-9a-zA-Z]+)(_(?P<region>[0-9a-zA-Z]+))?' \
 95 |             r'(\.(?P<charset>[0-9a-zA-Z-]+)(@(?P<charset2>[0-9a-zA-Z]+))?)?'
 96 |     match = re.match(regex, locale)
 97 |     if match is None:
 98 |         raise ValueError('Locale does not match expected format.')
 99 | 
100 |     return match.group('language').lower(), match.group('region').upper()
101 | 
102 | 
class ToStringMixin(object):
    """Mixin that builds a ``<ClassName attr=value ...>`` description of an object."""

    def _to_string(self, attributes: List[str]) -> str:
        """Format the named attributes of this object as ``<ClassName name=repr ...>``.

        :param attributes: Names of the attributes to include, in output order.
        :type attributes: list of str
        :return: The class name followed by space separated ``name=repr(value)`` pairs; attributes that are
                 missing or set to ``None`` are left out.
        :rtype: str
        """
        item_attributes = []
        for item in attributes:
            value = getattr(self, item, None)
            if value is None:
                continue
            # repr() also works when the value is itself a class, where value.__repr__() would fail
            item_attributes.append("{}={}".format(item, repr(value)))
        return "<{} {}>".format(self.__class__.__name__, " ".join(item_attributes))
111 | 


--------------------------------------------------------------------------------
/.github/workflows/unittests.yml:
--------------------------------------------------------------------------------
  1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
  2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
  3 | 
  4 | name: Python package
  5 | 
  6 | on:
  7 |   push:
  8 |     branches: [ master ]
  9 |   pull_request:
 10 |     branches: [ master ]
 11 | 
 12 | jobs:
 13 |   build:
 14 | 
 15 |     runs-on: ubuntu-latest
 16 |     strategy:
 17 |       matrix:
 18 |         python-version: [3.9, 3.8, 3.7, 3.6]
 19 | 
 20 |     env:
 21 |       PREFIX: /home/runner/prefix
 22 |       LIBPOSTAL: /home/runner/libpostal
 23 |       LIBRARY_PATH: /home/runner/prefix/lib
 24 |       LD_LIBRARY_PATH: /home/runner/prefix/lib
 25 |       C_INCLUDE_PATH: /home/runner/prefix/include
 26 |       CPP_INCLUDE_PATH: /home/runner/prefix/include
 27 | 
 28 |     steps:
 29 |     - uses: actions/checkout@v2
 30 |     - name: Set up Python ${{ matrix.python-version }}
 31 |       uses: actions/setup-python@v2
 32 |       with:
 33 |         python-version: ${{ matrix.python-version }}
 34 | 
 35 |     - name: Install apt dependencies
 36 |       run: |
 37 |         sudo apt-get update && sudo apt-get install -y curl autoconf automake libtool pkg-config default-jre
 38 | 
 39 |     - name: Cache restore libpostal
 40 |       id: cache-libpostal
 41 |       uses: actions/cache@v2
 42 |       with:
 43 |         path: |
 44 |           ${{ env.PREFIX }}
 45 |           ${{ env.LIBPOSTAL }}
 46 |         key: v1-libpostal-${{ runner.os }}
 47 | 
 48 |     - name: Install libpostal
 49 |       if: steps.cache-libpostal.outputs.cache-hit != 'true'
 50 |       run: |
 51 |         if test ! -f ${{ env.PREFIX }}/lib/libpostal.so ; then mkdir -p ${{ env.PREFIX }} ${{ env.LIBPOSTAL }} &&
 52 |         git clone https://github.com/openvenues/libpostal ${{ env.LIBPOSTAL }} && cd ${{ env.LIBPOSTAL }} &&
 53 |         ./bootstrap.sh && ./configure --prefix=${{ env.PREFIX }} && sudo make -j4 && sudo make install && cd - ; fi
 54 | 
 55 |     - name: Cache restore pip
 56 |       id: cache-pip
 57 |       uses: actions/cache@v2
 58 |       with:
 59 |         path: ~/.cache/pip
 60 |         key: ${{ runner.os }}-pip-${{ hashFiles('requirements/python*') }}
 61 |         restore-keys: |
 62 |           ${{ runner.os }}-pip-
 63 | 
 64 |     - name: Install pip dependencies
 65 |       run: |
 66 |         export LIBRARY_PATH=${{ env.LIBRARY_PATH }}
 67 |         export LD_LIBRARY_PATH=${{ env.LD_LIBRARY_PATH }}
 68 |         export C_INCLUDE_PATH=${{ env.C_INCLUDE_PATH }}
 69 |         export CPP_INCLUDE_PATH=${{ env.CPP_INCLUDE_PATH }}
 70 |         python -m pip install --upgrade pip wheel setuptools
 71 |         pip install -r requirements/python-dev
 72 | 
 73 |     - name: Cache restore nltk data
 74 |       id: cache-models
 75 |       uses: actions/cache@v2
 76 |       with:
 77 |         path: ~/nltk_data
 78 |         key: v1-nltk-data
 79 | 
 80 |     - name: Download models and NLTK data
 81 |       run: |
 82 |         # Needed for stanford model
 83 |         python3 -c "import nltk; nltk.download('punkt')"
 84 |         # Needed for the TextBlob model
 85 |         python -m textblob.download_corpora
 86 |         # Optional spacy models: a missing spacy install or a failed download must not fail the job
 87 |         ( python3 -c 'import spacy' && python -m spacy download en_core_web_sm ) || bash -c 'exit 0'
 88 |         ( python3 -c 'import spacy' && python -m spacy download en_core_web_trf ) || bash -c 'exit 0'
 89 |         ( python3 -c 'import spacy' && python -m spacy download de_core_news_sm ) || bash -c 'exit 0'
 90 |         ( python3 -c 'import spacy' && python -m spacy download fr_core_news_lg ) || bash -c 'exit 0'
 91 | 
 92 |     - name: Install package
 93 |       run: |
 94 |         echo "Installing package"
 95 |         pip install -e .
 96 | 
 97 |     - name: Run tests
 98 |       run: |
 99 |         python3 tests/run.py
100 | 
101 |     - name: Coveralls
102 |       env:
103 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
104 |         COVERALLS_SERVICE_NAME: github-actions
105 |         COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
106 |       run: |
107 |         if python3 --version | grep -q "Python 3.9." ; then coveralls ; fi
108 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | .. NOTES FOR CREATING A RELEASE:
 3 | ..
 4 | ..   * bump the version number in scrubadub/__init__.py
 5 | ..   * update docs/changelog.rst
 6 | ..   * git push
 7 | ..   * create a release https://github.com/LeapBeyond/scrubadub/releases
 8 | ..      * This should trigger a github action to upload to pypi
 9 | ..      * ReadTheDocs.io should see any changes and also rebuild the docs
10 | 
11 | 
12 | *********
13 | scrubadub
14 | *********
15 | 
16 | Remove personally identifiable information from free text. Sometimes we have
17 | additional metadata about the people we wish to anonymize. Other times we don't.
18 | This package makes it easy to seamlessly scrub personal information from free
19 | text, without compromising the privacy of the people we are trying to protect.
20 | 
21 | ``scrubadub`` currently supports removing:
22 | 
23 | * Names
24 | * Email addresses
25 | * Addresses/Postal codes (US, GB, CA)
26 | * Credit card numbers
27 | * Dates of birth
28 | * URLs
29 | * Phone numbers
30 | * Username and password combinations
31 | * Skype/twitter usernames
32 | * Social security numbers (US and GB national insurance numbers)
33 | * Tax numbers (GB)
34 | * Driving licence numbers (GB)
35 | 
36 | .. image:: https://img.shields.io/github/workflow/status/LeapBeyond/scrubadub/Python%20package/master
37 |    :target: https://github.com/LeapBeyond/scrubadub/actions?query=workflow%3A%22Python+package%22+branch%3Amaster
38 |    :alt:  Build Status
39 | .. image:: https://img.shields.io/pypi/v/scrubadub.svg
40 |    :target: https://pypi.org/project/scrubadub/
41 |    :alt:  Version
42 | .. image:: https://img.shields.io/pypi/dm/scrubadub.svg
43 |    :target: https://pypi.org/project/scrubadub/
44 |    :alt:  Downloads
45 | .. image:: https://coveralls.io/repos/github/LeapBeyond/scrubadub/badge.svg?branch=master
46 |    :target: https://coveralls.io/r/LeapBeyond/scrubadub
47 |    :alt:  Test Coverage
48 | .. image:: https://readthedocs.org/projects/scrubadub/badge/?version=latest
49 |    :target: https://readthedocs.org/projects/scrubadub/?badge=latest
50 |    :alt:  Documentation Status
51 | 
52 | 
53 | Quick start
54 | -----------
55 | 
56 | Getting started with ``scrubadub`` is as easy as ``pip install scrubadub`` and
57 | incorporating it into your python scripts like this:
58 | 
59 | .. code:: pycon
60 | 
61 |     >>> import scrubadub
62 | 
63 |     # My cat may be more tech-savvy than most, but he doesn't want other people to know it.
64 |     >>> text = "My cat can be contacted on example@example.com, or 1800 555-5555"
65 | 
66 |     # Replaces the phone number and email address with anonymous IDs.
67 |     >>> scrubadub.clean(text)
68 |     'My cat can be contacted on {{EMAIL}}, or {{PHONE}}'
69 | 
70 | 
71 | There are many ways to tailor the behavior of ``scrubadub`` using
72 | `different Detectors and PostProcessors <https://scrubadub.readthedocs.io/en/stable/usage.html>`_.
73 | Scrubadub is highly configurable and supports localisation for different languages and regions.
74 | 
75 | Installation
76 | ------------
77 | 
78 | To install scrubadub using pip, simply type::
79 | 
80 |     pip install scrubadub
81 | 
82 | There are several other packages that can optionally be installed to enable extra detectors.
83 | These are `scrubadub_address <https://github.com/LeapBeyond/scrubadub_address>`_, `scrubadub_spacy <https://github.com/LeapBeyond/scrubadub_spacy>`_ and `scrubadub_stanford <https://github.com/LeapBeyond/scrubadub_stanford>`_; see the relevant documentation (`address detector documentation <https://scrubadub.readthedocs.io/en/latest/addresses.html>`_ and `name detector documentation <https://scrubadub.readthedocs.io/en/latest/names.html>`_) for more info on these as they require additional dependencies.
84 | This package requires at least python 3.6.
85 | For python 2.7 or 3.5 support use v1.2.2 which is the last version with support for these versions.
86 | 
87 | New maintainers
88 | ---------------
89 | 
90 | `LeapBeyond <http://leapbeyond.ai/>`_ are excited to be supporting scrubadub with ongoing maintenance and development.
91 | Thanks to all of the contributors who made this package a success, but especially `@deanmalmgren <https://github.com/deanmalmgren>`_, `IDEO <https://www.ideo.com/>`_ and `Datascope <https://datascopeanalytics.com/>`_.
92 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/user_supplied.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from scrubadub.detectors.catalogue import register_detector
 4 | from .. import filth as filth_module
 5 | from ..filth.base import Filth
 6 | from .tagged import TaggedEvaluationFilthDetector
 7 | 
 8 | 
 9 | @register_detector
10 | class UserSuppliedFilthDetector(TaggedEvaluationFilthDetector):
11 |     """Use this ``Detector`` to find some known filth in the text. An example might be if you have a list of employee
12 |     numbers that you wish to remove from a document, as shown below:
13 | 
14 |     >>> import scrubadub
15 |     >>> scrubber = scrubadub.Scrubber(detector_list=[
16 |     ...     scrubadub.detectors.UserSuppliedFilthDetector([
17 |     ...         {'match': 'Anika', 'filth_type': 'name'},
18 |     ...         {'match': 'Larry', 'filth_type': 'name'},
19 |     ...     ]),
20 |     ... ])
21 |     >>> scrubber.clean("Anika is my favourite employee.")
22 |     '{{NAME}} is my favourite employee.'
23 | 
24 |     This detector takes a list of dictonaires (reffered to as known filth items). These specify what to look for in
25 |     the text to label as tagged filth. The dictionary should contain the following keys:
26 | 
27 |         * ``match`` (`str`) - a string value that will be searched for in the text
28 |         * ``filth_type`` (`str`) - a string value that indicates the type of Filth, should be set to ``Filth.name``.
29 |           An example of these could be 'name' or 'phone' for name and phone filths respectively.
30 | 
31 |     The known filth item dictionary may also optionally contain:
32 | 
33 |         * ``match_end`` (`str`) - if specified will search for Filth starting with the value of match and ending with
34 |           the value of ``match_end``
35 |         * ``limit`` (`int`) - an integer describing the maximum number of characters between match and match_end,
36 |           defaults to 150
37 |         * ``ignore_case`` (`bool`) - Ignore case when searching for the tagged filth
38 |         * ``ignore_whitespace`` (`bool`) - Ignore whitespace when matching ("asd qwe" can also match "asd\\\\nqwe")
39 |         * ``ignore_partial_word_matches`` (`bool`) - Ignore matches that are only partial words (if you're looking
40 |           for "Eve", this flag ensure it wont match "Evening")
41 | 
42 |     Examples of this:
43 | 
44 |         * ``{'match': 'aaa', 'filth_type': 'name'}`` - will search for an exact match to aaa and return it as a
45 |           ``NameFilth``
46 |         * ``{'match': 'aaa', 'match_end': 'zzz', 'filth_type': 'name'}`` - will search for `aaa` followed by up to 150
47 |           characters followed by `zzz`, which would match both `aaabbbzzz` and `aaazzz`.
48 |         * ``{'match': '012345', 'filth_type': 'phone', 'ignore_partial_word_matches': True}`` - will search for an
49 |           exact match to 012345, ignoring any partial matches and return it as a ``PhoneFilth``
50 | 
51 |     This detector is not enabled by default (since you need to supply a list of known filths) and so you must always
52 |     add it to your scrubber with a ``scrubber.add_detector(detector)`` call or by adding it to the ``detector_list``
53 |     inialising a ``Scrubber``.
54 |     """
55 | 
56 |     name = 'user_supplied'
57 | 
58 |     def create_filth(
59 |             self, start_location: int, end_location: int, text: str, comparison_type: Optional[str],
60 |             detector_name: str, document_name: Optional[str], locale: str
61 |     ) -> Filth:
62 |         for item_name in dir(filth_module):
63 |             try:
64 |                 filth_cls = filth_module.__getattribute__(item_name)
65 |             except AttributeError:
66 |                 continue
67 | 
68 |             if not isinstance(filth_cls, type) or not issubclass(filth_cls, Filth):
69 |                 continue
70 | 
71 |             try:
72 |                 filth_type = filth_cls.type
73 |             except AttributeError:
74 |                 continue
75 | 
76 |             if filth_type != comparison_type:
77 |                 continue
78 | 
79 |             return filth_cls(
80 |                 start_location,
81 |                 end_location,
82 |                 text,
83 |                 detector_name=detector_name,
84 |                 document_name=document_name,
85 |                 locale=locale,
86 |             )
87 |         raise KeyError(f"Unable to find filth '{comparison_type}'")
88 | 


--------------------------------------------------------------------------------
/scrubadub/detectors/skype.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import nltk
  3 | import textblob
  4 | 
  5 | from textblob.blob import BaseBlob
  6 | from textblob.en.taggers import PatternTagger
  7 | 
  8 | from typing import Optional, Generator
  9 | 
 10 | from scrubadub.detectors.catalogue import register_detector
 11 | from .base import RegexDetector
 12 | from ..filth import SkypeFilth, Filth
 13 | 
 14 | # BaseBlob uses NLTKTagger as a pos_tagger, but it does not work correctly, so PatternTagger is used instead
 15 | BaseBlob.pos_tagger = PatternTagger()
 16 | 
 17 | 
 18 | @register_detector
 19 | class SkypeDetector(RegexDetector):
 20 |     """Skype usernames tend to be used inline in dirty dirty text quite
 21 |     often but also appear as ``skype: {{SKYPE}}`` quite a bit. This method
 22 |     looks at words within ``word_radius`` words of "skype" for things that
 23 |     appear to be misspelled or have punctuation in them as a means to
 24 |     identify skype usernames.
 25 | 
 26 |     Default ``word_radius`` is 10, corresponding with the rough scale of
 27 |     half of a sentence before or after the word "skype" is used. Increasing
 28 |     the ``word_radius`` will increase the false positive rate and
 29 |     decreasing the ``word_radius`` will increase the false negative rate.
 30 |     """
 31 |     filth_cls = SkypeFilth
 32 |     name = 'skype'
 33 |     autoload = False
 34 | 
 35 |     word_radius = 10
 36 | 
 37 |     # these two regular expressions are used to validate a skype usernames.
 38 |     # _TOKEN is the core regular expression that is used to chunk text into
 39 |     # tokens to make sure all valid skype usernames are considered the same
 40 |     # token. Importantly, the word "skype" must pass the _SKYPE regex.
 41 |     # SKYPE_TOKEN is used to tokenize text and SKYPE_USERNAME is the same thing
 42 |     # but with the 6-32 character limit imposed on the username. adapted from
 43 |     # http://bit.ly/1FQs1hD
 44 |     _SKYPE = r'[a-zA-Z][a-zA-Z0-9_\-\,\.]'
 45 |     SKYPE_TOKEN = _SKYPE + '+'
 46 |     SKYPE_USERNAME = re.compile(_SKYPE+'{5,31}')
 47 | 
 48 |     def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
 49 |         """Yields discovered filth in the provided ``text``.
 50 | 
 51 |         :param text: The dirty text to clean.
 52 |         :type text: str
 53 |         :param document_name: The name of the document to clean.
 54 |         :type document_name: str, optional
 55 |         :return: An iterator to the discovered :class:`Filth`
 56 |         :rtype: Iterator[:class:`Filth`]
 57 |         """
 58 | 
 59 |         # find 'skype' in the text using a customized tokenizer. this makes
 60 |         # sure that all valid skype usernames are kept as tokens and not split
 61 |         # into different words
 62 |         tokenizer = nltk.tokenize.regexp.RegexpTokenizer(
 63 |             self.SKYPE_TOKEN
 64 |         )
 65 |         blob = textblob.TextBlob(text, tokenizer=tokenizer)
 66 |         skype_indices, tokens = [], []
 67 |         for i, token in enumerate(blob.tokens):
 68 |             tokens.append(token)
 69 |             if 'skype' in token.lower():
 70 |                 skype_indices.append(i)
 71 | 
 72 |         # go through the words before and after skype words to identify
 73 |         # potential skype usernames.
 74 |         skype_usernames = []
 75 |         for i in skype_indices:
 76 |             jmin = max(i-self.word_radius, 0)
 77 |             jmax = min(i+self.word_radius+1, len(tokens))
 78 |             for j in list(range(jmin, i)) + list(range(i+1, jmax)):
 79 |                 token = tokens[j]
 80 |                 if self.SKYPE_USERNAME.match(token):
 81 | 
 82 |                     # this token is a valid skype username. Most skype
 83 |                     # usernames appear to be misspelled words. Word.spellcheck
 84 |                     # does not handle the situation of an all caps word very
 85 |                     # well, so we cast these to all lower case before checking
 86 |                     # whether the word is misspelled
 87 |                     if token.isupper():
 88 |                         token = token.lower()
 89 |                     word = textblob.Word(token)
 90 |                     suggestions = word.spellcheck()
 91 |                     corrected_word, score = suggestions[0]
 92 |                     if score < 0.5:
 93 |                         skype_usernames.append(token)
 94 | 
 95 |         # replace all skype usernames
 96 |         if skype_usernames:
 97 |             self.regex = re.compile('|'.join(skype_usernames))
 98 |             yield from super(SkypeDetector, self).iter_filth(text, document_name=document_name)
 99 | 
100 |         return
101 | 


--------------------------------------------------------------------------------
/tests/test_api_older.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import warnings
  3 | 
  4 | import scrubadub
  5 | import scrubadub.detectors.catalogue
  6 | import scrubadub.utils
  7 | 
  8 | class OldAPITestCase(unittest.TestCase):
  9 | 
 10 |     def setUp(self):
 11 |         from scrubadub.detectors.text_blob import TextBlobNameDetector
 12 |         scrubadub.detectors.catalogue.register_detector(TextBlobNameDetector, autoload=True)
 13 | 
 14 |     def test_scrubadub_clean(self):
 15 |         """test old scrubadub API"""
 16 |         text = u"John is a cat"
 17 |         self.assertEqual(
 18 |             scrubadub.clean(text),
 19 |             "{{NAME}} is a cat",
 20 |         )
 21 | 
 22 |         scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()
 23 |         with warnings.catch_warnings(record=True) as warning_context:
 24 |             warnings.simplefilter("always")
 25 |             try:
 26 |                 self.assertEqual(
 27 |                     scrubadub.clean(text, replace_with='identifier'),
 28 |                     "{{NAME-0}} is a cat",
 29 |                 )
 30 |             finally:
 31 |                 warnings.simplefilter("default")
 32 |             self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0)
 33 | 
 34 | 
 35 |         scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()
 36 |         with warnings.catch_warnings(record=True) as warning_context:
 37 |             warnings.simplefilter("always")
 38 |             try:
 39 |                 self.assertEqual(
 40 |                     scrubadub.clean("John spoke with Doug.", replace_with='identifier'),
 41 |                     "{{NAME-0}} spoke with {{NAME-1}}.",
 42 |                 )
 43 |             finally:
 44 |                 warnings.simplefilter("default")
 45 |             self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0)
 46 | 
 47 |         scrubadub.filth.Filth.lookup = scrubadub.utils.Lookup()
 48 | 
 49 |     def test_scrubber_clean(self):
 50 |         """test older scrubber API"""
 51 |         scrubber = scrubadub.Scrubber()
 52 |         scrubber.remove_detector('email')
 53 |         text = "contact Joe Duffy at joe@example.com"
 54 |         self.assertEqual(
 55 |             scrubadub.clean(text),
 56 |             "contact {{NAME}} {{NAME}} at {{EMAIL}}",
 57 |         )
 58 | 
 59 |     def test_filth_class(self):
 60 |         class MyFilth(scrubadub.filth.Filth):
 61 |             type = 'mine'
 62 | 
 63 |         class MyDetector(scrubadub.detectors.Detector):
 64 |             filth_cls = MyFilth
 65 | 
 66 |             def iter_filth(self, text, **kwargs):
 67 |                yield MyFilth(beg=0, end=8, text='My stuff', **kwargs)
 68 | 
 69 |         scrubber = scrubadub.Scrubber()
 70 |         # TODO: Add depreciation warning
 71 |         scrubber.add_detector(MyDetector)
 72 |         text = "My stuff can be found there."
 73 | 
 74 |         self.assertEqual(
 75 |             scrubber.clean(text),
 76 |             "{{MINE}} can be found there.",
 77 |         )
 78 | 
 79 |     def test_filth_markers(self):
 80 |         prefix = scrubadub.filth.base.Filth.prefix
 81 |         suffix = scrubadub.filth.base.Filth.suffix
 82 |         scrubadub.filth.base.Filth.prefix = '<b>'
 83 |         scrubadub.filth.base.Filth.suffix = '</b>'
 84 | 
 85 |         scrubber = scrubadub.Scrubber()
 86 | 
 87 |         with warnings.catch_warnings(record=True) as warning_context:
 88 |             warnings.simplefilter("always")
 89 |             try:
 90 |                 self.assertEqual(
 91 |                     scrubber.clean("contact Joe Duffy at joe@example.com"),
 92 |                     "contact <b>NAME</b> <b>NAME</b> at <b>EMAIL</b>",
 93 |                 )
 94 |             finally:
 95 |                 warnings.simplefilter("default")
 96 |                 # Ensure that this is reset, no matter what happens above
 97 |                 scrubadub.filth.base.Filth.prefix = prefix
 98 |                 scrubadub.filth.base.Filth.suffix = suffix
 99 |             self.assertTrue(sum(issubclass(w.category, DeprecationWarning) for w in warning_context) > 0)
100 | 
101 |     def test_regex_filth(self):
102 |         """Test for a DeprecationWarning when using RegexFilth."""
103 |         with warnings.catch_warnings(record=True) as warning_context:
104 |             warnings.simplefilter("always")
105 |             try:
106 |                 scrubadub.filth.RegexFilth(0, 2, 'ab')
107 |             finally:
108 |                 warnings.simplefilter("default")
109 |             self.assertEqual(sum(issubclass(w.category, DeprecationWarning) for w in warning_context), 1)
110 | 
111 |     def tearDown(self) -> None:
112 |         from scrubadub.detectors.text_blob import TextBlobNameDetector
113 |         scrubadub.detectors.catalogue.remove_detector(TextBlobNameDetector)
114 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | Any and all contributions are welcome and appreciated. To make it easy
  2 | to keep things organized, this project uses the
  3 | [general guidelines](https://help.github.com/articles/using-pull-requests)
  4 | for the fork-branch-pull request model for github. Briefly, this means:
  5 | 
  6 | 1. Make sure your fork's `master` branch is up to date:
  7 | 
  8 |         git remote add LeapBeyond https://github.com/LeapBeyond/scrubadub.git
  9 |         git checkout master
 10 |         git pull LeapBeyond master
 11 | 
 12 | 2. Start a feature branch with a descriptive name about what you're trying
 13 |    to accomplish:
 14 | 
 15 |         git checkout -b italian-name-fix
 16 | 
 17 | 3. Make commits to this feature branch (`italian-name-fix`, in this case)
 18 |    in a way that other people can understand, with good commit messages
 19 |    that explain the changes you've made:
 20 | 
 21 |         emacs scrubadub/__init__.py
 22 | 	    git add scrubadub/__init__.py
 23 | 	    git commit -m 'added italian name fix'
 24 | 
 25 | 4. If an issue already exists for the code you're contributing, use
 26 |    [issue2pr](http://issue2pr.herokuapp.com/) to attach your code to that issue:
 27 | 
 28 |         git push origin italian-name-fix
 29 | 		chrome http://issue2pr.herokuapp.com
 30 | 		# enter the issue URL, HEAD=yourusername:italian-name-fix, Base=master
 31 | 
 32 |    If the issue doesn't already exist, just send a pull request in the
 33 |    usual way:
 34 | 
 35 |         git push origin italian-name-fix
 36 | 		chrome http://github.com/LeapBeyond/scrubadub/compare
 37 | 
 38 | 
 39 | Style guidelines
 40 | ----------------
 41 | 
 42 | As a general rule of thumb, the goal of this package is to be as
 43 | readable as possible to make it easy for novices and experts alike to
 44 | contribute to the source code in meaningful ways. Pull requests that
 45 | favor cleverness or optimization over readability are less likely to be
 46 | incorporated.
 47 | 
 48 | To make this notion of "readability" more concrete, here are a few
 49 | stylistic guidelines that are inspired by other projects and we
 50 | generally recommend:
 51 | 
 52 | -  write functions and methods that can [fit on a screen or two of a
 53 |    standard
 54 |    terminal](https://www.kernel.org/doc/Documentation/CodingStyle)
 55 |    --- no more than approximately 40 lines.
 56 | 
 57 | -  unless it makes code less readable, adhere to
 58 |    [PEP 8](http://legacy.python.org/dev/peps/pep-0008/) style
 59 |    recommendations --- use an appropriate amount of whitespace. This
 60 |    is enforced in the test suite.
 61 | 
 62 | - [code comments should be about *what* and *why* is being done, not *how* it is
 63 |   being done](https://www.kernel.org/doc/Documentation/CodingStyle) ---
 64 |   that should be self-evident from the code itself.
 65 | 
 66 | 
 67 | Common contributions: Removing a new type of filth
 68 | --------------------------------------------------
 69 | 
 70 | This project has really taken off, much more so than I would have thought
 71 | (thanks everybody!). One very common contribution is adding a new type of filth
 72 | that should be removed by `scrubadub`. To make it as easy as possible to add
 73 | these types of contributions, I thought I'd jot down a few notes about how to
 74 | add a new type of filth, for example, addresses.
 75 | 
 76 | * Create an appropriately named python file in `scrubadub/filth/` and write a
 77 |   new `Filth` class that inherits from `scrubadub.filth.base.Filth`. In this
 78 |   case, perhaps you'd create an `AddressFilth` class in
 79 |   `scrubadub/filth/address.py`
 80 | 
 81 | * Add your new type of `Filth` to the `scrubadub.filth` namespace by importing
 82 |   it in `scrubadub/filth/__init__.py`
 83 | 
 84 | * Create an appropriately named python file in `scrubadub/detectors/` and write
 85 |   a new `Detector` class that inherits from
 86 |   `scrubadub.detectors.base.Detector`. In this case, perhaps you'd create an
 87 |   `AddressDetector` class in `scrubadub/detectors/address.py`.
 88 | 
 89 | * Add your new type of `Detector` to the `scrubadub.detectors` namespace by
 90 |   importing it in `scrubadub/detectors/__init__.py`.
 91 | 
 92 | * Register your new detector by adding it to the `types` dictionary in
 93 |   `scrubadub/detectors/__init__.py`
 94 | 
 95 | * Create a new python file to handle some tests for your particular type of
 96 |   filth. In this case, perhaps you would write your tests in
 97 |   `tests/test_addresses.py`
 98 | 
 99 | * Add documentation for the new type of filth in `docs/index.rst` and be sure
100 |   to give yourself a pat on the back in `docs/changelog.rst`
101 | 
102 | * Make sure all of the tests are passing by running `./tests/run.py` and fix
103 |   any lingering problems (usually PEP-8 nonsense).
104 | 


--------------------------------------------------------------------------------
/scrubadub/filth/address.py:
--------------------------------------------------------------------------------
  1 | import string
  2 | import random
  3 | 
  4 | from faker import Faker
  5 | 
  6 | from .base import Filth
  7 | 
  8 | 
  9 | class AddressFilth(Filth):
 10 |     type = 'address'
 11 | 
 12 |     @staticmethod
 13 |     def _randomise_seperators(address: str) -> str:
 14 |         target = random.choice(["comma", "newline", "mixed", "spaces", "no_change"])
 15 |         if target == "comma":
 16 |             return address.replace('\n', ', ')
 17 |         elif target == "newline":
 18 |             return address.replace(', ', '\n')
 19 |         elif target == "spaces":
 20 |             return address.replace(', ', ' ').replace('\n', ' ')
 21 |         elif target == "mixed":
 22 |             address = address.replace(', ', '{{SEP}}').replace('\n', '{{SEP}}')
 23 |             while '{{SEP}}' in address:
 24 |                 this_seporator = random.choice(["comma", "newline", "spaces"])
 25 |                 if this_seporator == "comma":
 26 |                     address = address.replace('{{SEP}}', ', ', 1)
 27 |                 elif this_seporator == "newline":
 28 |                     address = address.replace('{{SEP}}', '\n', 1)
 29 |                 elif this_seporator == "spaces":
 30 |                     address = address.replace('{{SEP}}', ' ', 1)
 31 |             return address
 32 |         return address
 33 | 
 34 |     @staticmethod
 35 |     def _randomise_street_number(address: str) -> str:
 36 |         target = random.choice(["remove", "add_letter", "no_change", "no_change", "no_change", "no_change"])
 37 |         if target == "remove":
 38 |             address_split = address.split('\n')
 39 |             first_line_split = address_split[0].split(' ')
 40 |             try:
 41 |                 int(first_line_split[0])
 42 |             except ValueError:
 43 |                 return address
 44 |             new_first_line = " ".join(first_line_split[1:])
 45 |             return "\n".join([new_first_line] + address_split[1:])
 46 |         elif target == "add_letter":
 47 |             address_split = address.split('\n')
 48 |             first_line_split = address_split[0].split(' ')
 49 |             try:
 50 |                 int(first_line_split[0])
 51 |             except ValueError:
 52 |                 return address
 53 |             new_number = first_line_split[0] + random.choice(string.ascii_letters)
 54 |             new_first_line = " ".join([new_number] + first_line_split[1:])
 55 |             return "\n".join([new_first_line] + address_split[1:])
 56 |         return address
 57 | 
 58 |     @staticmethod
 59 |     def _randomise_postcode(address: str) -> str:
 60 |         target = random.choice(["remove", "lower", "no_change", "no_change", "no_change"])
 61 |         if target == "remove":
 62 |             return "\n".join(address.split('\n')[:-1])
 63 |         elif target == "lower":
 64 |             address_split = address.split('\n')
 65 |             return "\n".join(address.split('\n')[:-1] + [address_split[-1].lower()])
 66 |         return address
 67 | 
 68 |     @staticmethod
 69 |     def _randomise_country(address: str) -> str:
 70 |         target = random.choice(["country", "upper_country", "no_change", "no_change", "no_change"])
 71 |         if "country" in target:
 72 |             country = random.choice(['United Kingdom', 'Britain', 'England', 'Scotland', 'Wales', 'Cymru', 'GB'])
 73 |             if "upper" in target:
 74 |                 country = country.upper()
 75 |             return address + "\n" + country
 76 |         return address
 77 | 
 78 |     @staticmethod
 79 |     def _randomise_building(address: str, faker: Faker) -> str:
 80 |         target = random.choice(["add_building", "no_change", "no_change", "no_change"])
 81 |         if target == "add_building":
 82 |             if bool(random.getrandbits(1)):
 83 |                 building = faker.last_name() + " " + random.choice(["Building", "House", "Block"])
 84 |             else:
 85 |                 building = random.choice(["Building", "House", "Block"]) + " " + faker.last_name()
 86 |             return building + "\n" + address
 87 |         return address
 88 | 
 89 |     @staticmethod
 90 |     def _randomise_case(address: str) -> str:
 91 |         target = random.random()
 92 |         if target >= 0.8:
 93 |             if target >= 0.9:
 94 |                 address = address.upper()
 95 |             else:
 96 |                 address = address.lower()
 97 |         return address
 98 | 
 99 |     @staticmethod
100 |     def generate(faker: Faker) -> str:
101 |         """Generates an example of this ``Filth`` type, usually using the faker python library.
102 | 
103 |         :param faker: The ``Faker`` class from the ``faker`` library
104 |         :type faker: Faker
105 |         :return: An example of this ``Filth``
106 |         :rtype: str
107 |         """
108 |         address = faker.address()
109 |         if faker.locales == ['en_GB']:
110 |             address = AddressFilth._randomise_street_number(address)
111 |             address = AddressFilth._randomise_building(address, faker)
112 |             address = AddressFilth._randomise_postcode(address)
113 |         if faker.locales == ['en_GB']:
114 |             address = AddressFilth._randomise_country(address)
115 |         address = AddressFilth._randomise_seperators(address)
116 |         address = AddressFilth._randomise_case(address)
117 | 
118 |         return address
119 | 


--------------------------------------------------------------------------------
/docs/localization.rst:
--------------------------------------------------------------------------------
  1 | .. _locales:
  2 | .. _localization:
  3 | 
  4 | Localization
  5 | ============
  6 | 
  7 | We have started to localise scrubadub so that it supports multiple languages and regions.
  8 | We are at the beginning of this journey, so stay tuned.
  9 | 
 10 | By setting a locale the ``Detector``\ s that need configuring based on your region or language will know what type of text to expect.
 11 | This means that a ``Detector`` that needs to know how  ``Filth`` (such as a phone number) is formatted in your
 12 | region will be able to look for ``Filth`` in that specific format.
 13 | Other detectors that use machine learning models to identify entities in the text will be able to use models
 14 | corresponding to the correct language or location.
 15 | 
 16 | To set your locale you can use the standard format ``xx_YY``, where ``xx`` is a
 17 | lower-case `language code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_
 18 | and ``YY`` is an upper-case `country code <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_.
 19 | Examples of this include ``en_CA`` (Canadian English), ``fr_CA`` (Canadian French) and ``de_AT`` (Austrian German).
 20 | These locales can be set by passing them directly to one of the functions in the ``scrubadub`` module or to a ``Scrubber`` instance:
 21 | 
 22 | .. code:: pycon
 23 | 
 24 |     >>> import scrubadub
 25 |     >>> scrubadub.clean('My US number is 731-938-1630', locale='en_US')
 26 |     'My US number is {{PHONE}}'
 27 |     >>> scrubadub.clean('My US number is 731-938-1630', locale='en_GB')
 28 |     'My US number is 731-938-1630'
 29 |     >>> scrubadub.clean('My GB number is 0121 496 0112', locale='en_GB')
 30 |     'My GB number is {{PHONE}}'
 31 |     >>> scrubadub.clean('My GB number is 0121 496 0112', locale='en_US')
 32 |     'My GB number is 0121 496 0112'
 33 |     >>> scrubber = scrubadub.Scrubber(locale='de_DE')
 34 |     >>> scrubber.clean('Meine Telefonnummer ist 05086 63680')
 35 |     'Meine Telefonnummer ist {{PHONE}}'
 36 | 
 37 | Below is a summary of the supported countries and regions of the various detectors in scrubadub.
 38 | 
 39 |  * `AddressDetector`: supports Canadian, American and British addresses
 40 |  * `PhoneDetector`: supports most regions via `libphonenumber <https://github.com/google/libphonenumber>`_
 41 |  * `PostalCodeDetector`: only supports British postcodes
 42 |  * `SpacyEntityDetector`: supports a wide range of languages; check the `spacy documentation <https://spacy.io/usage/models>`_ for the full list of supported languages.
 43 |  * `StanfordEntityDetector`: only supports English in scrubadub, but the models support more languages (es, fr, de, zh).
 44 | 
 45 | This is just the start of the localisation, so if you want to add more languages or features we're keen to hear from you!
 46 | Other detectors are location/language independent (eg email addresses or twitter usernames) or do not support localisation.
 47 | 
 48 | Creating a localized detector
 49 | -----------------------------
 50 | 
 51 | To create a detector that is localised, the process is identical to creating a normal detector
 52 | (as shown in :ref:`create-detector`), but with one addition: a ``supported_locale()`` function.
 53 | If this function is not defined it is assumed that this ``Detector`` does not need
 54 | localization.
 55 | An example of a ``Detector`` that does not need localization is the email detector,
 56 | as emails follow the same format no matter where you live and what language you speak.
 57 | On the other hand, the format of a phone number can vary significantly depending on the region.
 58 | 
 59 | Below is an example of a detector that detects employee names for a very small, but international company.
 60 | There is one German employee, `Walther`, and one US employee `Georgina`.
 61 | When the document is German we will remove `Walther` and when the document is American we will remove `Georgina`.
 62 | 
 63 | The ``supported_locale()`` function should return ``True`` if the passed locale is supported and ``False`` if it is not supported.
 64 | If ``supported_locale()`` returns ``False`` then the ``Scrubber`` will emit a warning and not add or run that ``Detector`` on the documents passed to it.
 65 | The ``Detector.locale_split(locale)`` function can be used to split the locale into the language and region.
 66 | 
 67 | Below is the full example:
 68 | 
 69 | .. code:: pycon
 70 | 
 71 |     >>> import scrubadub, re
 72 | 
 73 |     >>> class EmployeeNameFilth(scrubadub.filth.Filth):
 74 |     ...     type = 'employee_name'
 75 | 
 76 |     >>> class EmployeeDetector(scrubadub.detectors.Detector):
 77 |     ...     name = 'employee_detector'
 78 |     ...
 79 |     ...     def __init__(self, *args, **kwargs):
 80 |     ...         super(EmployeeDetector, self).__init__(*args, **kwargs)
 81 |     ...         self.employees = {'DE': ['Walther'], 'US': ['Georgina'] }
 82 |     ...         self.regex = re.compile('|'.join(self.employees[self.region]))
 83 |     ...
 84 |     ...     @classmethod
 85 |     ...     def supported_locale(cls, locale):
 86 |     ...         language, region = cls.locale_split(locale)
 87 |     ...         return region in ['DE', 'US']
 88 |     ...
 89 |     ...     def iter_filth(self, text, document_name=None):
 90 |     ...         for match in self.regex.finditer(text):
 91 |     ...             yield EmployeeNameFilth(match=match, detector_name=self.name, document_name=document_name, locale=self.locale)
 92 |     ...
 93 |     >>> us_scrubber = scrubadub.Scrubber(detector_list=[EmployeeDetector], locale='en_US')
 94 |     >>> us_scrubber.clean('Jane spoke with Georgina')
 95 |     'Jane spoke with {{EMPLOYEE_NAME}}'
 96 |     >>> de_scrubber = scrubadub.Scrubber(detector_list=[EmployeeDetector], locale='de_DE')
 97 |     >>> de_scrubber.clean('Jane spoke with Georgina')
 98 |     'Jane spoke with Georgina'
 99 |     >>> de_scrubber.clean('Luigi spoke with Walther')
100 |     'Luigi spoke with {{EMPLOYEE_NAME}}'
101 | 


--------------------------------------------------------------------------------
/scrubadub/__init__.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from typing import Union, List, Dict, Sequence, Optional
  3 | 
  4 | # convenient imports
  5 | from .scrubbers import Scrubber
  6 | from . import filth
  7 | from . import detectors
  8 | from . import post_processors
  9 | from .filth import Filth
 10 | 
# Package version string; ``VERSION`` is bound to the same value as ``__version__``.
__version__ = VERSION = "2.0.0"
# Names exported by ``from scrubadub import *``.
__all__ = [
    'Scrubber', 'filth', 'detectors', 'post_processors', 'clean', 'clean_documents', 'list_filth',
    'list_filth_documents',
]
 16 | 
 17 | 
 18 | def clean(text: str, locale: Optional[str] = None, **kwargs) -> str:
 19 |     """Seaches for ``Filth`` in `text` in a string and replaces it with placeholders.
 20 | 
 21 |     .. code:: pycon
 22 | 
 23 |         >>> import scrubadub
 24 |         >>> scrubadub.clean(u"contact me at joe@example.com")
 25 |         'contact me at {{EMAIL}}'
 26 | 
 27 |     :param text: The text containing possible PII that needs to be redacted
 28 |     :type text: `str`
 29 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
 30 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
 31 |     :type locale: str
 32 |     :return: Text with all :class:``Filth`` replaced.
 33 |     :rtype: `str`
 34 | 
 35 |     """
 36 |     scrubber = Scrubber(locale=locale)
 37 |     return scrubber.clean(text, **kwargs)
 38 | 
 39 | 
 40 | def clean_documents(documents: Union[Sequence[str], Dict[Optional[str], str]], locale: Optional[str] = None, **kwargs
 41 |                     ) -> Union[Sequence[str], Dict[Optional[str], str]]:
 42 |     """Seaches for ``Filth`` in `documents` and replaces it with placeholders.
 43 | 
 44 |     `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings
 45 |     (each a seperate document).
 46 |     This can be useful when processing many documents.
 47 | 
 48 |     .. code:: pycon
 49 | 
 50 |         >>> import scrubadub
 51 |         >>> scrubadub.clean_documents({'contact.txt': "contact me at joe@example.com",
 52 |         ...     'hello.txt': 'hello world!'})
 53 |         {'contact.txt': 'contact me at {{EMAIL}}', 'hello.txt': 'hello world!'}
 54 | 
 55 |         >>> scrubadub.clean_documents(["contact me at joe@example.com", 'hello world!'])
 56 |         ['contact me at {{EMAIL}}', 'hello world!']
 57 | 
 58 |     :param documents: Documents containing possible PII that needs to be redacted in the form of a list of documents
 59 |         or a dictonary with the key as the document name and the value as the document text
 60 |     :type documents: `list` of `str` objects, `dict` of `str` objects
 61 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
 62 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
 63 |     :type locale: str
 64 |     :return: Documents in the same format as input, but with `Filth` redacted
 65 |     :rtype: `list` of `str` objects, `dict` of `str` objects; same as input
 66 |     """
 67 |     scrubber = Scrubber(locale=locale)
 68 |     return scrubber.clean_documents(documents, **kwargs)
 69 | 
 70 | 
 71 | def list_filth(text: str, locale: Optional[str] = None, **kwargs) -> List[Filth]:
 72 |     """Return a list of ``Filth`` that was detected in the string `text`.
 73 | 
 74 |     .. code:: pycon
 75 | 
 76 |         >>> import scrubadub
 77 |         >>> scrubadub.list_filth(u"contact me at joe@example.com")
 78 |         [<EmailFilth text='joe@example.com' beg=14 end=29 detector_name='email' locale='en_US'>]
 79 | 
 80 |     :param text: The text containing possible PII that needs to be found
 81 |     :type text: `str`
 82 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
 83 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
 84 |     :type locale: str
 85 |     :return: A list of all the :class:``Filth`` objects that were found
 86 |     :rtype: `list` of :class:``Filth`` objects
 87 | 
 88 |     """
 89 |     scrubber = Scrubber(locale=locale)
 90 |     return list(scrubber.iter_filth(text, **kwargs))
 91 | 
 92 | 
 93 | def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], locale: Optional[str] = None,
 94 |                          **kwargs) -> List[Filth]:
 95 |     """Return a list of ``Filth`` that was detected in the string `text`.
 96 | 
 97 |     `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings
 98 |     (each a seperate document).
 99 |     This can be useful when processing many documents.
100 | 
101 |     .. code:: pycon
102 | 
103 |         >>> import scrubadub
104 |         >>> scrubadub.list_filth_documents(
105 |         ...     {'contact.txt': "contact me at joe@example.com", 'hello.txt': 'hello world!'}
106 |         ... )
107 |         [<EmailFilth text='joe@example.com' document_name='contact.txt' beg=14 end=29 detector_name='email' \
108 | locale='en_US'>]
109 | 
110 |         >>> scrubadub.list_filth_documents(["contact me at joe@example.com", 'hello world!'])
111 |         [<EmailFilth text='joe@example.com' document_name='0' beg=14 end=29 detector_name='email' locale='en_US'>]
112 | 
113 |     :param documents: Documents containing possible PII that needs to be found in the form of a list of documents
114 |         or a dictonary with the key as the document name and the value as the document text
115 |     :type documents: `list` of `str` objects, `dict` of `str` objects
116 |     :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
117 |                    underscore and the two letter upper-case country code, eg "en_GB" or "de_CH"
118 |     :type locale: str
119 |     :return: A list of all the :class:``Filth`` objects that were found
120 |     :rtype: `list` of :class:``Filth`` objects
121 | 
122 |     """
123 |     scrubber = Scrubber(locale=locale)
124 |     return list(scrubber.iter_filth_documents(documents, **kwargs))
125 | 


--------------------------------------------------------------------------------
/tests/test_postprocessor_filth_replacer.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | import scrubadub.filth
  4 | from scrubadub.post_processors.filth_replacer import FilthReplacer
  5 | from scrubadub.filth import Filth, MergedFilth, EmailFilth
  6 | 
  7 | 
  8 | class FilthTypeReplacerTestCase(unittest.TestCase):
  9 |     def test_label_maker(self):
 10 |         """Test making labels from filths"""
 11 |         class TestFilth(Filth):
 12 |             type = 'test_type'
 13 | 
 14 |         filth_replacer = FilthReplacer()
 15 |         self.assertEqual(
 16 |             filth_replacer.filth_label(TestFilth(0, 1, 'a')),
 17 |             'TEST_TYPE'
 18 |         )
 19 | 
 20 |         merged = MergedFilth(TestFilth(0, 2, 'ab'), EmailFilth(1, 2, 'b'))
 21 | 
 22 |         self.assertEqual(
 23 |             filth_replacer.filth_label(merged),
 24 |             'EMAIL+TEST_TYPE'
 25 |         )
 26 | 
 27 |         merged = MergedFilth(EmailFilth(0, 2, 'ab'), TestFilth(1, 2, 'b'))
 28 | 
 29 |         self.assertEqual(
 30 |             filth_replacer.filth_label(merged),
 31 |             'EMAIL+TEST_TYPE'
 32 |         )
 33 | 
 34 |         filth_replacer = FilthReplacer(separator='::')
 35 |         self.assertEqual(
 36 |             filth_replacer.filth_label(merged),
 37 |             'EMAIL::TEST_TYPE'
 38 |         )
 39 | 
 40 |         filth_replacer = FilthReplacer()
 41 |         TestFilth.type = "other_test_type"
 42 | 
 43 |         self.assertEqual(
 44 |             filth_replacer.filth_label(TestFilth(0, 1, 'a')),
 45 |             'OTHER_TEST_TYPE'
 46 |         )
 47 | 
 48 |         self.assertEqual(
 49 |             filth_replacer.filth_label(EmailFilth(0, 1, 'a')),
 50 |             'EMAIL'
 51 |         )
 52 | 
 53 |         filth_replacer = FilthReplacer(include_count=True)
 54 |         filth_replacer.reset_lookup()
 55 |         self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'EMAIL-0')
 56 |         self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'b')), 'EMAIL-1')
 57 |         self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'EMAIL-0')
 58 |         self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'c')), 'EMAIL-2')
 59 | 
 60 |         filth_replacer = FilthReplacer(uppercase=False)
 61 |         self.assertEqual(filth_replacer.filth_label(EmailFilth(0, 1, 'a')), 'email')
 62 | 
 63 |     def test_process_filths(self):
 64 |         """Test that the process_filths behaves as expected"""
 65 |         class TestFilth(Filth):
 66 |             type = 'test_type'
 67 | 
 68 |         filths = [
 69 |             MergedFilth(EmailFilth(0, 2, 'ab'), TestFilth(1, 2, 'b')),
 70 |             EmailFilth(5, 6, 'c')
 71 |         ]
 72 | 
 73 |         post_processor = FilthReplacer()
 74 |         filths = post_processor.process_filth(filths)
 75 | 
 76 |         self.assertEqual(filths[0].replacement_string, 'EMAIL+TEST_TYPE')
 77 |         self.assertEqual(filths[1].replacement_string, 'EMAIL')
 78 | 
 79 |     def test_hashing(self):
 80 |         post_proc = FilthReplacer()
 81 |         self.assertTrue(post_proc.hash_salt is not None)
 82 |         self.assertIsInstance(post_proc.hash_salt, bytes)
 83 |         self.assertGreater(len(post_proc.hash_salt), 0)
 84 | 
 85 |         filths = [EmailFilth(0, 19, 'example@example.com')]
 86 |         self.assertEqual(filths[0].replacement_string, None)
 87 | 
 88 |         post_proc = FilthReplacer(hash_salt='example', include_type=True, include_hash=True)
 89 |         filths = post_proc.process_filth(filths)
 90 |         self.assertEqual(filths[0].replacement_string, 'EMAIL-42FFCB267F8C5E6D')
 91 | 
 92 |         post_proc = FilthReplacer(hash_salt='example', include_type=True, include_count=True, include_hash=True)
 93 |         post_proc.reset_lookup()
 94 |         filths = post_proc.process_filth(filths)
 95 |         self.assertEqual(filths[0].replacement_string, 'EMAIL-0-42FFCB267F8C5E6D')
 96 | 
 97 |         post_proc = FilthReplacer(hash_salt='example', include_type=False, include_hash=True)
 98 |         filths = post_proc.process_filth(filths)
 99 |         self.assertEqual(filths[0].replacement_string, '42FFCB267F8C5E6D')
100 | 
101 |         post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, include_hash=True)
102 |         filths = post_proc.process_filth(filths)
103 |         self.assertEqual(filths[0].replacement_string, '87BB6F7ED5FE49C4')
104 | 
105 |         post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, hash_length=10, include_hash=True)
106 |         filths = post_proc.process_filth(filths)
107 |         self.assertEqual(filths[0].replacement_string, '87BB6F7ED5')
108 |         self.assertEqual(len(filths[0].replacement_string), 10)
109 | 
110 |         post_proc = FilthReplacer(hash_salt='another_salt', include_type=False, hash_length=50, include_hash=True)
111 |         filths = post_proc.process_filth(filths)
112 |         self.assertEqual(filths[0].replacement_string, '87BB6F7ED5FE49C4EA43D95A41F843D4FBB66D15C5AA41A7F7')
113 |         self.assertEqual(len(filths[0].replacement_string), 50)
114 | 
115 |     def test_bad_filth(self):
116 |         """Test making labels from a filth without a type"""
117 |         class TestFilth(Filth):
118 |             type = None
119 | 
120 |         filth_replacer = FilthReplacer()
121 |         self.assertEqual(
122 |             filth_replacer.filth_label(TestFilth(0, 1, 'a')),
123 |             ''
124 |         )
125 | 
126 |     def test_tagged_filth(self):
127 |         """Test making labels from a tagged filth"""
128 |         filth_replacer = FilthReplacer()
129 |         self.assertEqual(
130 |             filth_replacer.filth_label(scrubadub.filth.TaggedEvaluationFilth(0, 1, 'a', comparison_type='phone')),
131 |             'TAGGED_PHONE'
132 |         )
133 | 
134 |     def test_all_disabled(self):
135 |         """Test making labels when everything is disabled"""
136 |         filth_replacer = FilthReplacer(include_type=False, include_hash=False, include_count=False)
137 |         self.assertEqual(
138 |             filth_replacer.filth_label(scrubadub.filth.TaggedEvaluationFilth(0, 1, 'a', comparison_type='phone')),
139 |             'FILTH'
140 |         )
141 | 
142 |     def tearDown(self) -> None:
143 |         FilthReplacer.reset_lookup()


--------------------------------------------------------------------------------
/tests/test_filth.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import unittest
  3 | 
  4 | from scrubadub.filth import Filth, MergedFilth
  5 | from scrubadub.exceptions import InvalidReplaceWith, FilthMergeError
  6 | 
  7 | class FilthTestCase(unittest.TestCase):
  8 | 
  9 |     def test_disallowed_replace_with(self):
 10 |         """replace_with should fail gracefully"""
 11 |         filth = Filth(beg=0, end=3, text='asd')
 12 |         with self.assertRaises(InvalidReplaceWith):
 13 |             filth.replace_with('surrogate')
 14 |         with self.assertRaises(InvalidReplaceWith):
 15 |             filth.replace_with('something_invalid')
 16 | 
 17 |     def test_nonoverlapping_filth(self):
 18 |         """can't merge non-overlapping filth"""
 19 |         a_filth = Filth(beg=0, end=3, text="the")
 20 |         b_filth = Filth(beg=4, end=7, text="end")
 21 |         with self.assertRaises(FilthMergeError):
 22 |             a_filth.merge(b_filth)
 23 |         with self.assertRaises(FilthMergeError):
 24 |             b_filth.merge(a_filth)
 25 | 
 26 |     def test_text_merge(self):
 27 |         """make sure text length is correct"""
 28 |         class SomeFilth(Filth):
 29 |             type = 'something'
 30 | 
 31 |         text = "the end"
 32 |         a_filth = SomeFilth(beg=0, end=3, text=text[:3])
 33 |         b_filth = SomeFilth(beg=1, end=7, text=text[1:])
 34 | 
 35 |         c_filth = a_filth.merge(b_filth)
 36 |         self.assertEqual(c_filth.text, text)
 37 | 
 38 |         c_filth = b_filth.merge(a_filth)
 39 |         self.assertEqual(c_filth.text, text)
 40 | 
 41 |         d_filth = c_filth.merge(a_filth)
 42 |         self.assertEqual(d_filth.text, text)
 43 | 
 44 |         b_filth.end = 2
 45 |         with self.assertRaises(FilthMergeError):
 46 |             b_filth.merge(a_filth)
 47 | 
 48 |     def test_invalid_merge_documents(self):
 49 |         """Ensure Filth in two different documents cant be merged"""
 50 |         filth_a = Filth(0, 2, text='aa', document_name='one')
 51 |         filth_b = Filth(1, 2, text='a', document_name='two')
 52 | 
 53 |         with self.assertRaises(FilthMergeError):
 54 |             filth_a.merge(filth_b)
 55 | 
 56 |         with self.assertRaises(FilthMergeError):
 57 |             filth_b.merge(filth_a)
 58 | 
 59 |     def test_filth_string(self):
 60 |         """Test the Filth to string function"""
 61 | 
 62 |         filth = Filth(beg=0, end=5)
 63 |         self.assertEqual(str(filth), "<Filth text='' beg=0 end=5>")
 64 | 
 65 |         filth = Filth(beg=0, end=5)
 66 |         self.assertEqual(filth.__repr__(), "<Filth text='' beg=0 end=5>")
 67 | 
 68 |         filth = Filth(beg=0, end=5)
 69 |         self.assertEqual(filth._to_string(), "<Filth text='' beg=0 end=5>")
 70 | 
 71 |         filth = Filth(beg=0, end=5, text='hello')
 72 |         self.assertEqual(str(filth), "<Filth text='hello' beg=0 end=5>")
 73 | 
 74 |         filth = Filth(beg=0, end=5, text='hello', document_name='hello.txt')
 75 |         self.assertEqual(str(filth), "<Filth text='hello' document_name='hello.txt' beg=0 end=5>")
 76 | 
 77 |         filth = Filth(beg=0, end=5, text='hello', document_name='hello.txt')
 78 |         self.assertEqual(filth._to_string(attributes=['text']), "<Filth text='hello'>")
 79 |         self.assertEqual(filth._to_string(attributes=['beg', 'end', 'text']), "<Filth beg=0 end=5 text='hello'>")
 80 |         self.assertEqual(
 81 |             filth._to_string(attributes=['text', 'document_name']),
 82 |             "<Filth text='hello' document_name='hello.txt'>"
 83 |         )
 84 | 
 85 |     def test_merged_to_string(self):
 86 |         """Test the MergedFilth to string"""
 87 |         class TestFilth(Filth):
 88 |             type = 'test_filth'
 89 | 
 90 |         merged = MergedFilth(TestFilth(0, 2, 'ab'), Filth(1, 2, 'b'))
 91 |         self.assertEqual(merged.__repr__(), "<MergedFilth filths=[<TestFilth text='ab' beg=0 end=2>, <Filth text='b' beg=1 end=2>]>")
 92 | 
 93 |     def test_equality(self):
 94 |         """Test the filth equality function"""
 95 |         self.assertTrue(
 96 |             Filth(beg=0, end=5, text='hello') ==
 97 |             Filth(beg=0, end=5, text='hello')
 98 |         )
 99 |         self.assertTrue(
100 |             Filth(beg=0, end=5, text='hello') ==
101 |             Filth(beg=0, end=5, text='hello', match=re.match('123', '1234'))
102 |         )
103 | 
104 |         self.assertTrue(
105 |             Filth(beg=0, end=5, text='hello') !=
106 |             Filth(beg=1, end=5, text='hello')
107 |         )
108 |         self.assertTrue(
109 |             Filth(beg=0, end=5, text='hello') !=
110 |             Filth(beg=0, end=6, text='hello')
111 |         )
112 |         self.assertTrue(
113 |             Filth(beg=0, end=5, text='hello') !=
114 |             Filth(beg=0, end=5, text='hellou')
115 |         )
116 | 
117 |         self.assertTrue(
118 |             Filth(beg=0, end=5, text='hello', document_name='test') ==
119 |             Filth(beg=0, end=5, text='hello', document_name='test')
120 |         )
121 |         self.assertTrue(
122 |             Filth(beg=0, end=5, text='hello') !=
123 |             Filth(beg=0, end=5, text='hello', document_name='test')
124 |         )
125 |         self.assertTrue(
126 |             Filth(beg=0, end=5, text='hello', document_name='test') !=
127 |             Filth(beg=0, end=5, text='hello')
128 |         )
129 |         self.assertTrue(
130 |             Filth(beg=0, end=5, text='hello', document_name='test') !=
131 |             Filth(beg=0, end=5, text='hello', document_name='another_test')
132 |         )
133 | 
134 |         self.assertTrue(
135 |             Filth(beg=0, end=5, text='hello', detector_name='tester') ==
136 |             Filth(beg=0, end=5, text='hello', detector_name='tester')
137 |         )
138 |         self.assertTrue(
139 |             Filth(beg=0, end=5, text='hello', detector_name='tester') !=
140 |             Filth(beg=0, end=5, text='hello', detector_name='another_tester')
141 |         )
142 |         self.assertTrue(
143 |             Filth(beg=0, end=5, text='hello', detector_name='tester') !=
144 |             Filth(beg=0, end=5, text='hello')
145 |         )
146 |         self.assertTrue(
147 |             Filth(beg=0, end=5, text='hello') !=
148 |             Filth(beg=0, end=5, text='hello', detector_name='tester')
149 |         )
150 | 
151 |         self.assertTrue(
152 |             Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') ==
153 |             Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester')
154 |         )
155 |         self.assertTrue(
156 |             Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') !=
157 |             Filth(beg=0, end=5, text='hello', document_name='test', detector_name='another_tester')
158 |         )
159 |         self.assertTrue(
160 |             Filth(beg=0, end=5, text='hello', document_name='test', detector_name='tester') !=
161 |             Filth(beg=0, end=5, text='hello', document_name='another_test', detector_name='tester')
162 |         )


--------------------------------------------------------------------------------
/tests/test_filth_address.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import random
  3 | import unittest
  4 | 
  5 | from scrubadub.filth import AddressFilth
  6 | 
  7 | 
  8 | class AddressFilthTestCase(unittest.TestCase):
  9 | 
 10 |     def test_generate(self):
 11 |         class Faker:
 12 |             locales = ['en_GB']
 13 |             def address(self):
 14 |                 return '4 Paula views\nLake Howardburgh\nN7U 2FQ'
 15 |             def last_name(self):
 16 |                 return 'Smith'
 17 | 
 18 |         random.seed(1234)
 19 |         self.assertEqual(
 20 |             'Building Smith, 4 Paula views, Lake Howardburgh, N7U 2FQ, Cymru',
 21 |             AddressFilth.generate(faker=Faker()),
 22 |         )
 23 | 
 24 |     def test_seperators(self):
 25 |         addresses = [
 26 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views Lake Howardburgh N7U 2FQ'),
 27 |             ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch, Jordantown, W1F 3LB'),
 28 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys, East Patricktown, EN6 2SD'),
 29 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass, Nashbury, TA2W 9XP'),
 30 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'),
 31 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue, Elliottville, SY18 2YP'),
 32 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall, Junetown, IM20 2PG'),
 33 |         ]
 34 |         random.seed(1234)
 35 |         for input_value, output_value in addresses:
 36 |             self.assertEqual(
 37 |                 output_value,
 38 |                 AddressFilth._randomise_seperators(input_value),
 39 |             )
 40 | 
 41 |     def test_street_number(self):
 42 |         addresses = [
 43 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'),
 44 |             ('79 Miller branch\nJordantown\nW1F 3LB', 'Miller branch\nJordantown\nW1F 3LB'),
 45 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', 'Joseph keys\nEast Patricktown\nEN6 2SD'),
 46 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', 'Hall overpass\nNashbury\nTA2W 9XP'),
 47 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'),
 48 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', 'Roberts stravenue\nElliottville\nSY18 2YP'),
 49 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'),
 50 |         ]
 51 |         random.seed(1234)
 52 |         for input_value, output_value in addresses:
 53 |             self.assertEqual(
 54 |                 output_value,
 55 |                 AddressFilth._randomise_street_number(input_value),
 56 |             )
 57 | 
 58 |     def test_postcode(self):
 59 |         addresses = [
 60 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'),
 61 |             ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown'),
 62 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown'),
 63 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass\nNashbury'),
 64 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'),
 65 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville'),
 66 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown'),
 67 |         ]
 68 |         random.seed(1234)
 69 |         for input_value, output_value in addresses:
 70 |             self.assertEqual(
 71 |                 output_value,
 72 |                 AddressFilth._randomise_postcode(input_value),
 73 |             )
 74 | 
 75 |     def test_country(self):
 76 |         addresses = [
 77 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'),
 78 |             ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown\nW1F 3LB\nUnited Kingdom'),
 79 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown\nEN6 2SD\nGB'),
 80 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 Hall overpass\nNashbury\nTA2W 9XP'),
 81 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ\nCymru'),
 82 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville\nSY18 2YP\nUnited Kingdom'),
 83 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'),
 84 |         ]
 85 |         random.seed(1234)
 86 |         for input_value, output_value in addresses:
 87 |             self.assertEqual(
 88 |                 output_value,
 89 |                 AddressFilth._randomise_country(input_value),
 90 |             )
 91 | 
 92 |     def test_building(self):
 93 |         class Faker:
 94 |             def last_name(self):
 95 |                 return 'Smith'
 96 | 
 97 |         addresses = [
 98 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 Paula views\nLake Howardburgh\nN7U 2FQ'),
 99 |             ('79 Miller branch\nJordantown\nW1F 3LB', 'Building Smith\n79 Miller branch\nJordantown\nW1F 3LB'),
100 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', 'Smith Block\n78 Joseph keys\nEast Patricktown\nEN6 2SD'),
101 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', 'House Smith\n93 Hall overpass\nNashbury\nTA2W 9XP'),
102 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ'),
103 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', 'Building Smith\n8 Roberts stravenue\nElliottville\nSY18 2YP'),
104 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'),
105 |         ]
106 |         random.seed(1234)
107 |         for input_value, output_value in addresses:
108 |             self.assertEqual(
109 |                 output_value,
110 |                 AddressFilth._randomise_building(input_value, faker=Faker()),
111 |             )
112 | 
113 |     def test_case(self):
114 |         addresses = [
115 |             ('4 Paula views\nLake Howardburgh\nN7U 2FQ', '4 PAULA VIEWS\nLAKE HOWARDBURGH\nN7U 2FQ'),
116 |             ('79 Miller branch\nJordantown\nW1F 3LB', '79 Miller branch\nJordantown\nW1F 3LB'),
117 |             ('78 Joseph keys\nEast Patricktown\nEN6 2SD', '78 Joseph keys\nEast Patricktown\nEN6 2SD'),
118 |             ('93 Hall overpass\nNashbury\nTA2W 9XP', '93 HALL OVERPASS\nNASHBURY\nTA2W 9XP'),
119 |             ('Flat 98R\nNatasha fall\nLake Rosie\nB73 8PJ', 'FLAT 98R\nNATASHA FALL\nLAKE ROSIE\nB73 8PJ'),
120 |             ('8 Roberts stravenue\nElliottville\nSY18 2YP', '8 Roberts stravenue\nElliottville\nSY18 2YP'),
121 |             ('784 Knowles mall\nJunetown\nIM20 2PG', '784 Knowles mall\nJunetown\nIM20 2PG'),
122 |         ]
123 |         random.seed(1234)
124 |         for input_value, output_value in addresses:
125 |             self.assertEqual(
126 |                 output_value,
127 |                 AddressFilth._randomise_case(input_value),
128 |             )
129 | 


--------------------------------------------------------------------------------
/tests/test_detector_date_of_birth.py:
--------------------------------------------------------------------------------
  1 | import faker
  2 | import random
  3 | import unittest
  4 | import scrubadub
  5 | import scrubadub.detectors.catalogue
  6 | from scrubadub.filth import DateOfBirthFilth
  7 | 
  8 | import datetime
  9 | from base import BaseTestCase
 10 | 
 11 | 
class DoBTestCase(unittest.TestCase, BaseTestCase):
    # Tests for the date of birth detector and for DateOfBirthFilth.generate.
    #
    # NOTE(review): the test_DoB_* methods call compare_before_after(), which is
    # inherited from base.BaseTestCase and not visible here. It presumably reads
    # the BEFORE:/AFTER: lines from the calling test's docstring, so treat those
    # docstrings as test data and do not reword them — confirm against base.py.


    def setUp(self):
        # The detector is imported inside the method and registered with
        # autoload=True before every test.
        # NOTE(review): presumably it is not registered by default, hence the
        # explicit registration — confirm.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        scrubadub.detectors.catalogue.register_detector(DateOfBirthDetector, autoload=True)

    def tearDown(self) -> None:
        # Undo the registration made in setUp so that other test cases are unaffected.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        scrubadub.detectors.catalogue.remove_detector(DateOfBirthDetector)

    def test_DoB_1(self):
        """
        BEFORE: My date of birth is 17/06/1976.
        AFTER:  My date of birth is {{DATE_OF_BIRTH}}.
        """
        self.compare_before_after()

    def test_DoB_2(self):
        """
        BEFORE: I was born 15th June 1991
        AFTER:  I was born {{DATE_OF_BIRTH}}
        """
        self.compare_before_after()

    def test_DoB_3(self):
        """
        BEFORE: DOB: 02.12.1979
        AFTER:  DOB: 02.12.{{DATE_OF_BIRTH}}
        """
        # TODO: this is a known limitation of the dateparser search util,
        #  need to improve the search to include the full date
        self.compare_before_after()

    def test_DoB_4(self):
        """
        BEFORE: My name is Mike and I was born in a land far away on 22/11/1972
        AFTER:  My name is Mike and I was born in a land far away {{DATE_OF_BIRTH}}
        """
        # TODO: dateparser is a little greedy, consuming the "on " as well as the date
        self.compare_before_after()

    def test_DoB_5(self):
        """
        BEFORE: my name is Jane and I was born on 11/22/1972
        AFTER:  my name is Jane and I was born {{DATE_OF_BIRTH}}
        """
        # TODO: dateparser is a little greedy, consuming the "on " as well as the date
        self.compare_before_after()

    def test_DoB_6(self):
        """
        BEFORE: my date of birth is 22-nov-1972
        AFTER:  my date of birth is {{DATE_OF_BIRTH}}
        """
        self.compare_before_after()

    def test_DoB_7(self):
        """
        BEFORE: My dob is 22-11-1972
        AFTER:  My dob is {{DATE_OF_BIRTH}}
        """
        self.compare_before_after()

    def test_DoB_8(self):
        """
        BEFORE: The claimant's, d.o.b. is 4 June 1976
        AFTER:  The claimant's, d.o.b. is {{DATE_OF_BIRTH}}
        """
        self.compare_before_after()

    def test_DoB_9(self):
        """
        BEFORE: 1985-01-01 is my birthday.
        AFTER:  {{DATE_OF_BIRTH}} is my birthday.
        """
        self.compare_before_after()

    def test_generate(self):
        # Seed both faker and the stdlib RNG so the generated date is reproducible.
        fake = faker.Faker()
        faker.Faker.seed(4321)
        random.seed(4321)

        # I think this could fail just after midnight, because the generated date is relative to today's date and the
        # generated timedelta will unlikely be an integer number of days.
        # Will test and possibly remove/change this test further.
        # Two adjacent days are accepted below to tolerate that rounding.
        self.assertIn(
            DateOfBirthFilth.generate(faker=fake),
            [
                (datetime.date.today() - datetime.timedelta(days=29729)).strftime('%a %d %b %Y'),
                (datetime.date.today() - datetime.timedelta(days=29729 + 1)).strftime('%a %d %b %Y'),
            ]
        )

    def test_init(self):
        # A locale with an unknown language code ('zz') must be rejected with a ValueError.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        with self.assertRaises(ValueError):
            DateOfBirthDetector(locale='zz_GB')

    def test_custom_words(self):
        # A user supplied context word ('big day') must be enough to flag the date that follows it.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        detector = DateOfBirthDetector(context_words=['big day'])
        filths = list(detector.iter_filth('the big day is may 14th 1983\nsee you then'))

        self.assertEqual(1, len(filths))
        # The span must cover exactly 'may 14th 1983' within the input text.
        self.assertEqual(15, filths[0].beg)
        self.assertEqual(28, filths[0].end)
        self.assertEqual('may 14th 1983', filths[0].text)

    def test_young(self):
        # Dates in 2020 next to the word 'birthday' must not be reported.
        # NOTE(review): presumably these are too recent to be a plausible date of
        # birth; the detector's threshold is not visible here — confirm.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        detector = DateOfBirthDetector()
        filths = list(detector.iter_filth('my birthday is not may 14th 2020\nor may 15th 2020\nor +14-05-2020 23'))

        self.assertEqual(0, len(filths))

    def test_context(self):
        # The date sits on its own line, with context words one and two lines
        # before it (CONTEXTB1, CONTEXTB2) and one and two lines after it
        # (CONTEXTA1, CONTEXTA2). The assertions below treat context_before and
        # context_after as a number of lines to search on either side of the date.
        from scrubadub.detectors.date_of_birth import DateOfBirthDetector
        text = """
        CONTEXTB2
        CONTEXTB1
        10-Nov-2000
        CONTEXTA1
        CONTEXTA2
        """

        # One line before the date: found unless context_before is 0.
        detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=10, context_after=10)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=1, context_after=10)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTB1'], context_before=0, context_after=10)
        self.assertEqual(0, len(list(detector.iter_filth(text))))

        # Two lines before the date: needs context_before of at least 2.
        detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=10, context_after=0)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=2, context_after=0)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTB2'], context_before=1, context_after=0)
        self.assertEqual(0, len(list(detector.iter_filth(text))))

        # One line after the date: context_before does not help, only context_after does.
        detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=10, context_after=10)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=0, context_after=1)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTA1'], context_before=1, context_after=0)
        self.assertEqual(0, len(list(detector.iter_filth(text))))

        # Two lines after the date: needs context_after of at least 2.
        detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=0, context_after=10)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=10, context_after=2)
        self.assertEqual(1, len(list(detector.iter_filth(text))))
        detector = DateOfBirthDetector(context_words=['CONTEXTA2'], context_before=3, context_after=0)
        self.assertEqual(0, len(list(detector.iter_filth(text))))
165 | 


--------------------------------------------------------------------------------
/scrubadub/post_processors/filth_replacer.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import math
  4 | import hashlib
  5 | 
  6 | from typing import Sequence, Optional, Union, Dict
  7 | from collections import defaultdict
  8 | 
  9 | from scrubadub.filth import Filth, MergedFilth, TaggedEvaluationFilth
 10 | from scrubadub.post_processors.base import PostProcessor
 11 | from scrubadub.post_processors.catalogue import register_post_processor
 12 | from scrubadub import utils
 13 | 
 14 | 
 15 | class FilthReplacer(PostProcessor):
 16 |     """Creates tokens that are used to replace the Filth found in the text of a document.
 17 | 
 18 |     This can be configured to include the filth type (eg phone, name, email, ...), a unique number for each piece of
 19 |     Filth, and a hash of the Filth.
 20 | 
 21 |     >>> import scrubadub
 22 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
 23 |     ...     scrubadub.post_processors.FilthReplacer(),
 24 |     ... ])
 25 |     >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
 26 |     'Contact me at PHONE or EMAIL'
 27 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
 28 |     ...     scrubadub.post_processors.FilthReplacer(include_hash=True, hash_salt='example', hash_length=8),
 29 |     ... ])
 30 |     >>> scrubber.clean("Contact me at 522-368-8530 or hernandezjenna@example.com")
 31 |     'Contact me at PHONE-7358BF44 or EMAIL-AC0B8AC3'
 32 |     >>> scrubber = scrubadub.Scrubber(post_processor_list=[
 33 |     ...     scrubadub.post_processors.FilthReplacer(include_count=True),
 34 |     ... ])
 35 |     >>> scrubber.clean("Contact me at taylordaniel@example.com or hernandezjenna@example.com, "
 36 |     ...                "but taylordaniel@example.com is probably better.")
 37 |     'Contact me at EMAIL-0 or EMAIL-1, but EMAIL-0 is probably better.'
 38 |     """
 39 |     name = 'filth_replacer'  # type: str
 40 |     autoload = False
 41 |     index = 0
 42 | 
 43 |     # NOTE: this is not an efficient way to store this in memory. could
 44 |     # alternatively hash the type and text and do away with the overhead
 45 |     # bits of storing the tuple in the lookup
 46 |     typed_lookup = defaultdict(lambda: utils.Lookup(), {})  # type: Dict[str, utils.Lookup]
 47 | 
 48 |     def __init__(self, include_type: bool = True, include_count: bool = False, include_hash: bool = False,
 49 |                  uppercase: bool = True, separator: Optional[str] = None, hash_length: Optional[int] = None,
 50 |                  hash_salt: Optional[Union[str, bytes]] = None, **kwargs):
 51 |         """Initialise the FilthReplacer.
 52 | 
 53 |         :param include_type:
 54 |         :type include_type: bool, default True
 55 |         :param include_count:
 56 |         :type include_count: bool, default False
 57 |         :param include_hash:
 58 |         :type include_hash: bool, default False
 59 |         :param uppercase: Make the label uppercase
 60 |         :type uppercase: bool, default True
 61 |         :param separator: Used to separate labels if a merged filth is being replaced
 62 |         :type separator: Optional[str], default None
 63 |         :param hash_length: The length of the hexadecimal hash
 64 |         :type hash_length: Optional[int], default None
 65 |         :param hash_salt: The salt used in the hashing process
 66 |         :type hash_salt: Optional[Union[str, bytes]], default None
 67 |         """
 68 |         super(FilthReplacer, self).__init__(**kwargs)
 69 |         self.include_type = include_type
 70 |         self.include_count = include_count
 71 |         self.include_hash = include_hash
 72 |         self.uppercase = uppercase
 73 |         self.separator = separator or '+'
 74 |         self.hash_length = hash_length or 16
 75 | 
 76 |         if isinstance(hash_salt, str):
 77 |             self.hash_salt = hash_salt.encode('utf8')  # type: bytes
 78 |         else:
 79 |             self.hash_salt = os.urandom(128)
 80 | 
 81 |     @classmethod
 82 |     def reset_lookup(cls):
 83 |         """Reset the lookups that maintain a map of filth to a numeric ID."""
 84 |         cls.typed_lookup = defaultdict(lambda: utils.Lookup(), {})
 85 | 
 86 |     def filth_label(self, filth: Filth) -> str:
 87 |         """This function takes a filth and creates a label that can be used to replace the original text.
 88 | 
 89 |         :param filth: Limit the named entities to those in this list, defaults to ``{'PERSON', 'PER', 'ORG'}``
 90 |         :type filth: Filth
 91 |         :return: The replacement label that should be used for this `Filth`.
 92 |         :rtype: str
 93 | 
 94 |         """
 95 |         filths = [filth]
 96 |         if isinstance(filth, MergedFilth):
 97 |             filths = filth.filths
 98 | 
 99 |         replacements = set()
100 |         for f in filths:
101 |             replacement_pieces = []
102 | 
103 |             if self.include_type:
104 |                 filth_type = getattr(f, 'type', None)
105 |                 if filth_type is None:
106 |                     continue
107 |                 if filth_type == TaggedEvaluationFilth.type:
108 |                     filth_comparison_type = getattr(f, 'comparison_type', None)
109 |                     if filth_comparison_type is not None:
110 |                         filth_type += '_' + filth_comparison_type
111 |                 filth_type = filth_type.replace(' ', '_')
112 | 
113 |                 replacement_pieces.append(filth_type)
114 | 
115 |             if self.include_count:
116 |                 replacement_pieces.append(str(FilthReplacer.typed_lookup[filth_type][f.text.lower()]))
117 | 
118 |             if self.include_hash:
119 |                 replacement_pieces.append(FilthReplacer.get_hash(f.text.lower(), self.hash_salt, self.hash_length))
120 | 
121 |             if len(replacement_pieces) == 0:
122 |                 replacement_pieces = ['filth']
123 | 
124 |             replacements.add('-'.join(replacement_pieces))
125 | 
126 |         label = self.separator.join(sorted(replacements))
127 |         if self.uppercase:
128 |             label = label.upper()
129 |         return label
130 | 
131 |     @staticmethod
132 |     def get_hash(text: str, salt: bytes, length: int) -> str:
133 |         """Get a hash of some text, that has been salted and truncated.
134 | 
135 |         :param text: The text to be hashed
136 |         :type text: str
137 |         :param salt: The salt that should be used in this hashing
138 |         :type salt: bytes
139 |         :param length: The number of characters long that the hexadecimal hash should be
140 |         :type length: int
141 |         :return: The hash of the text
142 |         :rtype: str
143 |         """
144 |         return hashlib.pbkdf2_hmac(
145 |             hash_name='sha256',
146 |             password=text.encode('utf8'),
147 |             salt=salt,
148 |             iterations=100000,
149 |             dklen=math.ceil(length / 2),
150 |         ).hex()[:length]
151 | 
152 |     def process_filth(self, filth_list: Sequence[Filth]) -> Sequence[Filth]:
153 |         """Processes the filth to replace the original text
154 | 
155 |         :param filth_list: The text to be hashed
156 |         :type filth_list: Sequence[Filth]
157 |         :return: The processed filths
158 |         :rtype: Sequence[Filth]
159 |         """
160 |         for filth_item in filth_list:
161 |             filth_item.replacement_string = self.filth_label(filth=filth_item)
162 | 
163 |         return filth_list
164 | 
165 | 
# Registration happens as an import side effect of this module.
# NOTE(review): presumably this makes FilthReplacer discoverable through the
# post-processor catalogue — confirm in scrubadub.post_processors.catalogue.
register_post_processor(FilthReplacer)

# Only the post-processor class is part of this module's public API.
__all__ = ['FilthReplacer']
169 | 


--------------------------------------------------------------------------------