├── .github └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── README.md ├── bin ├── color_my_terminal.sh ├── run_tests.sh └── setup_venv_locally.sh ├── config.json ├── docker-compose.yml ├── no_pii_data.csv ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── src ├── __init__.py ├── acquire │ ├── __init__.py │ ├── csv_parser.py │ └── tests │ │ ├── __init__.py │ │ ├── data │ │ ├── comma_delimited_file.csv │ │ ├── empty.csv │ │ ├── missing_comma.csv │ │ └── pipe_delimited_file.csv │ │ └── test_csv_parser.py ├── analyze │ ├── __init__.py │ ├── detectors │ │ ├── __init__.py │ │ ├── base_detector.py │ │ ├── credit_card_detector.py │ │ ├── email_detector.py │ │ ├── national_id_detector.py │ │ ├── phone_number_detector.py │ │ ├── pii_detector.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_base_detector.py │ │ │ ├── test_credit_card_detector.py │ │ │ ├── test_email_detector.py │ │ │ ├── test_national_id_detector.py │ │ │ ├── test_phone_number_detector.py │ │ │ └── test_pii_detector.py │ └── utils │ │ ├── __init__.py │ │ ├── analyzer_result.py │ │ ├── regex.py │ │ └── tests │ │ ├── __init__.py │ │ ├── test_analyzer_result.py │ │ └── test_regex.py ├── anonymize │ ├── __init__.py │ ├── anonymizer_result.py │ ├── drop_anonymizer.py │ └── tests │ │ ├── __init__.py │ │ └── test_drop_anonymizer.py ├── constants.py ├── dpf_main.py ├── report │ ├── __init__.py │ ├── report_generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_report_generator.py ├── tests │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ └── test_config.json │ └── test_dpf_main.py └── write │ ├── __init__.py │ ├── csv_writer.py │ └── tests │ ├── __init__.py │ └── test_csv_writer.py ├── src_spark ├── __init__.py ├── acquire │ ├── __init__.py │ ├── csv_parser.py │ └── tests │ │ ├── __init__.py │ │ ├── data │ │ ├── comma_delimited_file.csv │ │ ├── empty.csv │ │ ├── missing_comma.csv │ │ └── pipe_delimited_file.csv │ │ └── test_csv_parser.py ├── analyze │ ├── __init__.py │ ├── 
detectors │ │ ├── __init__.py │ │ ├── base_detector.py │ │ ├── credit_card_detector.py │ │ ├── email_detector.py │ │ ├── national_id_detector.py │ │ ├── phone_number_detector.py │ │ ├── pii_detector.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_pii_detector.py │ └── utils │ │ ├── __init__.py │ │ ├── analyzer_result.py │ │ └── regex.py ├── constants.py ├── main.py ├── report │ ├── __init__.py │ ├── report_generator.py │ └── tests │ │ ├── __init__.py │ │ └── test_report_generator.py └── write │ ├── __init__.py │ ├── csv_writer.py │ └── tests │ ├── __init__.py │ └── test_csv_writer.py └── test_data.csv /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '30 17 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode 2 | /.idea 3 | __pycache__ 4 | *.pyc 5 | /.venv 6 | /venv 7 | /output 8 | /dist 9 | /build/lib 10 | .pytest_cache 11 | .coverage 12 | pyspark_output 13 | pyspark_config.json 14 | *.csv 15 | generate_fake_data.py 16 | scratchpad.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Protection Framework 2 | Data Protection Framework is a python library/command line application for identification, anonymization and de-anonymization of Personally Identifiable Information data. 3 | 4 | The framework aims to work on a two-fold principle for detecting PII: 5 | 1. Using RegularExpressions using a pattern 6 | 2. Using NLP for detecting NER (Named Entity Recognitions) 7 | 8 | ## Features and Current Status 9 | 10 | ### Completed 11 | * Following Global detectors have been completed: 12 | * [x] EMAIL_ADDRESS : An email address identifies the mailbox that emails are sent to or from. The maximum length of the domain name is 255 characters, and the maximum length of the local-part is 64 characters. 13 | * [x] CREDIT_CARD_NUMBER : A credit card number is 12 to 19 digits long. They are used for payment transactions globally. 14 | 15 | * Following detectors specific to Singapore have been completed: 16 | * [x] PHONE_NUMBER : A telephone number. 
17 | * [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card. 18 | 19 | * Following anonymizers have been added 20 | * [x] Redaction: Deletes all or part of a detected sensitive value. 21 | * [x] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified." 22 | 23 | ### TO-DO 24 | Following features are part of the backlog with more features coming soon 25 | * Detectors: 26 | * [ ] NAME 27 | * [ ] ADDRESS 28 | * Anonymizers: 29 | * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*). 30 | * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range, 31 | or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.") 32 | * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value. 33 | 34 | 35 | You can have a detailed at upcoming features and backlog in this [Github Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true) 36 | 37 | ## Development setup 38 | 39 | Clone the [repo](https://github.com/thoughtworks-datakind/anonymizer) and follow the below instructions:
40 | _Assuming that $pwd is where you cloned the repo_ 41 | 2. Setup venv : `./bin/setup_venv_locally.sh` 42 | 3. Activate venv : `source ./.venv/bin/activate` 43 | 4. Install dependencies : `pip install -r requirements-dev.txt` 44 | 45 | ### Config JSON 46 | An example for the config JSON is located at `/config.json` 47 | ``` 48 | { 49 | "acquire": { 50 | "file_path": , 51 | "delimiter": 52 | }, 53 | "analyze": { 54 | 55 | }, 56 | "report" : { 57 | "location" : , 58 | "level" : 59 | }, 60 | "anonymize": { 61 | "output_file_path" : 62 | } 63 | } 64 | ``` 65 | 66 | ### Running Tests 67 | Update this file first `/src/tests/config/test_config.json` \ 68 | You can run the tests by triggering shell script located at `/bin/run_tests.sh` 69 | 70 | ### Trying out on local 71 | 72 | ##### Anonymizing a delimited csv file 73 | 1. Set up a JSON config file similar to the one seen at the project root. 74 | In the 'acquire' section of the json, populate the input file path and the delimiter. 75 | In the 'report' section, provide the output path, where you want the PII detection report to be generated. 76 | A 'high' level report just calls out which columns have PII attributes. 77 | A 'medium' level report calls out the percentage of PII in each column and the associated PII (email, credit card, etc)type for the same. 78 | 2. Run the main class - `python src/dpf_main.py --config ` 79 | You should see the report being appended to the file named 'report_\.log' in the output path specified in the 80 | config file. 81 | 82 | ### Packaging 83 | Run `python setup.py bdist_wheel` and the `.whl` file will be created in the `dist` folder. 84 | 85 | ### Spark-submit 86 | To run spark-submit locally, you can run the following command 87 | `spark-submit --py-files dist/SomePackage-*.whl src_spark/main.py --config config.json` 88 | 89 | 90 | ### Licensing 91 | Distributed under the MIT license. See ``LICENSE`` for more information. 
92 | 93 | 94 | ### Contributing 95 | 96 | You want to help out? _Awesome_! 97 | 98 | -------------------------------------------------------------------------------- /bin/color_my_terminal.sh: -------------------------------------------------------------------------------- 1 | export "PS1=${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\] \$ " -------------------------------------------------------------------------------- /bin/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | project_path=$(dirname $0)/.. 4 | 5 | export PYTHONPATH=$project_path 6 | 7 | coverage run --source='./src' --omit='*/tests/*' -m unittest discover . 8 | coverage report -m -------------------------------------------------------------------------------- /bin/setup_venv_locally.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | project_path=$(dirname $0)/.. 4 | 5 | cd ${project_path} 6 | export PYTHONPATH=${project_path} 7 | 8 | echo "$header: Creating virtual environment." 
9 | python3 -m venv ${project_path}/.venv 10 | source ${project_path}/.venv/bin/activate 11 | 12 | curl https://bootstrap.pypa.io/get-pip.py | python 13 | pip install -r requirements-dev.txt -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "acquire": { 3 | "file_path": "./test_data.csv", 4 | "delimiter": "," 5 | }, 6 | "analyze": {}, 7 | "report": { 8 | "location": "./output", 9 | "level": "medium" 10 | }, 11 | "anonymize": { 12 | "output_file_path": "./output" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | spark-master: 5 | image: docker.io/bitnami/spark:3.1.2 6 | environment: 7 | - SPARK_MODE=master 8 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 9 | - SPARK_RPC_ENCRYPTION_ENABLED=no 10 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 11 | - SPARK_SSL_ENABLED=no 12 | ports: 13 | - '8080:8080' 14 | - '7077:7077' 15 | networks: 16 | - spark 17 | spark-worker-1: 18 | image: docker.io/bitnami/spark:3.1.2 19 | environment: 20 | - SPARK_MODE=worker 21 | - SPARK_MASTER_URL=spark://spark:7077 22 | - SPARK_WORKER_MEMORY=1G 23 | - SPARK_WORKER_CORES=1 24 | - SPARK_RPC_AUTHENTICATION_ENABLED=no 25 | - SPARK_RPC_ENCRYPTION_ENABLED=no 26 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no 27 | - SPARK_SSL_ENABLED=no 28 | networks: 29 | - spark 30 | depends_on: 31 | - spark-master 32 | 33 | networks: 34 | spark: 35 | driver: bridge -------------------------------------------------------------------------------- /no_pii_data.csv: -------------------------------------------------------------------------------- 1 | Address,Remarks 2 | 112 Bedok,Good 3 | 112 Bedok,Average -------------------------------------------------------------------------------- 
/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | coverage==5.5 4 | pytest==6.2.5 5 | freezegun==1.1.0 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.2 2 | attrs==21.2.0 3 | backcall==0.2.0 4 | coverage==5.5 5 | debugpy==1.4.3 6 | decorator==5.1.0 7 | entrypoints==0.3 8 | Faker==8.14.0 9 | freezegun==1.1.0 10 | iniconfig==1.1.1 11 | ipykernel==6.4.1 12 | ipython==7.27.0 13 | ipython-genutils==0.2.0 14 | jedi==0.18.0 15 | jupyter-client==7.0.3 16 | jupyter-core==4.8.1 17 | matplotlib-inline==0.1.3 18 | nest-asyncio==1.5.1 19 | numpy==1.21.2 20 | packaging==21.0 21 | pandas==1.3.3 22 | parso==0.8.2 23 | pexpect==4.8.0 24 | pickleshare==0.7.5 25 | pluggy==1.0.0 26 | prompt-toolkit==3.0.20 27 | ptyprocess==0.7.0 28 | py==1.10.0 29 | py4j==0.10.9 30 | Pygments==2.10.0 31 | pyparsing==2.4.7 32 | pyspark==3.1.2 33 | pytest==6.2.5 34 | python-dateutil==2.8.2 35 | pytz==2021.1 36 | pyzmq==22.3.0 37 | six==1.16.0 38 | text-unidecode==1.3 39 | toml==0.10.2 40 | tornado==6.1 41 | traitlets==5.1.0 42 | wcwidth==0.2.5 43 | wheel==0.37.0 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name = 'SomePackage', 5 | version = '0.1', 6 | packages = find_packages() 7 | ) -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/__init__.py -------------------------------------------------------------------------------- 
class CsvParser:
    """Reads a delimited text file into a pandas DataFrame.

    Expected ``config`` keys:
        file_path: path of the input file (required).
        delimiter: column separator; falls back to "," when missing or empty.
    """

    def __init__(self, config):
        """Validate ``config`` and capture the parse settings.

        Raises:
            ValueError: if 'file_path' is absent or empty.
        """
        self.__validate_config(config)
        # Use the FILE_PATH constant here as well (the validation below already
        # does) so the lookup key cannot drift from the validated one.
        self.input_path = config[FILE_PATH]
        # A missing OR empty delimiter means the default comma.
        self.delimiter = config.get("delimiter") or ","

    def __validate_config(self, config):
        # An absent or falsy file path makes parsing impossible; fail fast.
        if FILE_PATH not in config or not config[FILE_PATH]:
            raise ValueError("Config 'file_path' needs to be provided for parsing")

    def parse(self):
        """Parse the configured file.

        Returns:
            pandas.DataFrame: file contents; an empty DataFrame for an empty file.

        Raises:
            ValueError: if the parsed data contains any NULL (missing) values.
        """
        try:
            df = pd.read_csv(self.input_path, delimiter=self.delimiter)
        except pd.errors.EmptyDataError:
            # An entirely empty input file is not an error for callers.
            return pd.DataFrame({})

        if df.isnull().values.any():
            raise ValueError("Dataframe contains NULL values")

        return df
class TestCsvParser(TestCase):
    """Unit tests for CsvParser: delimiters, empty input and NULL handling."""

    def setUp(self):
        self.current_dir = os.path.dirname(os.path.realpath(__file__))

    def _fixture(self, file_name):
        # Absolute path of a test-data file under ./data.
        return "{}/data/{}".format(self.current_dir, file_name)

    def test_invalid_config_gets_caught_during_initialization(self):
        with self.assertRaises(ValueError) as ve:
            CsvParser(config={})
        self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")

    def test_if_valid_csv_file_provided_returns_pandas_df(self):
        parser = CsvParser(config={"file_path": self._fixture("comma_delimited_file.csv"),
                                   "delimiter": ""})
        expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
        self.assertEqual(parser.parse().to_dict(), expected.to_dict())

    def test_if_valid_csv_file_with_different_delimiter_provided_returns_pandas_df(self):
        parser = CsvParser(config={"file_path": self._fixture("pipe_delimited_file.csv"),
                                   "delimiter": "|"})
        expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
        self.assertEqual(parser.parse().to_dict(), expected.to_dict())

    def test_if_empty_csv_file_returns_empty_pandas_df(self):
        parser = CsvParser(config={"file_path": self._fixture("empty.csv")})
        self.assertEqual(parser.parse().to_dict(), pd.DataFrame({}).to_dict())

    def test_if_error_is_raised_if_df_has_null_values(self):
        with self.assertRaises(ValueError) as ve:
            CsvParser(config={"file_path": self._fixture("missing_comma.csv")}).parse()
        self.assertEqual(str(ve.exception), "Dataframe contains NULL values")
class CreditCardDetector(BaseDetector):
    """Detects credit card numbers (12-19 digits, optional '-'/' ' separators).

    Candidates are located with a regex and then confirmed with the Luhn
    checksum, so look-alike digit runs are rejected.
    """

    def __init__(self):
        self.name = "CREDIT_CARD"
        # BUG FIX: the previous pattern alternated the bare issuer prefixes
        # against the full expression ('4\d{3}|...|3\d{3}<rest>'), so only
        # numbers starting with '3' could ever match a complete card; a Visa
        # number matched just its first four digits (which then fail Luhn),
        # making execute() miss the card entirely.  The prefixes are now
        # grouped so every alternative is followed by the remaining digit
        # groups.  NOTE(review): this invalidates the unit test that pins the
        # old (buggy) pattern string.
        self.pattern = (
            r"(?:4\d{3}|5[0-5]\d{2}|6\d{3}|1\d{3}|3\d{3})"  # issuer prefix
            r"[- ]?\d{3,4}"
            r"[- ]?\d{3,4}"
            r"[- ]?\d{3,5}"
        )

    def get_name(self):
        """Return the detector's type label."""
        return self.name

    def get_pattern(self):
        """Return the regex used to find candidate card numbers."""
        return self.pattern

    def validate(self, text):
        """Return True when ``text`` passes the Luhn check (separators ignored)."""
        def digits_of(n):
            return [int(d) for d in str(n)]

        digits = digits_of(text.replace('-', '').replace(' ', ''))
        odd_digits = digits[-1::-2]
        even_digits = digits[-2::-2]
        checksum = sum(odd_digits)

        # Every second digit (from the right) is doubled and digit-summed.
        for d in even_digits:
            checksum += sum(digits_of(d * 2))

        return checksum % 10 == 0
class NationalIdDetector(BaseDetector):
    """Detects Singapore NRIC/FIN numbers and validates their check letter."""

    # Checksum weights applied to the seven digits between the prefix letter
    # and the check letter.
    __WEIGHTS = (2, 7, 6, 5, 4, 3, 2)

    def __init__(self):
        self.name = "NRIC"
        self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()

    def get_name(self):
        """Return the detector's type label."""
        return self.name

    def get_pattern(self):
        """Return the regex used to find candidate NRIC/FIN strings."""
        return self.pattern

    def __get_offset(self, text):
        # T- and G-series numbers (issued from 2000) add 4 to the checksum.
        return 4 if text in "TG" else 0

    def __is_NRIC(self, text, loc):
        if text[0] in "ST":
            return "JZIHGFEDCBA"[loc] == text[8]
        return False

    def __is_FIN(self, text, loc):
        if text[0] in "FG":
            return "XWUTRQPNMLK"[loc] == text[8]
        return False

    def validate(self, text):
        """Return True when the check letter matches the weighted checksum."""
        weight = self.__get_weight(text)
        first_character = text[0]
        offset = self.__get_offset(first_character)
        loc = (offset + weight) % 11
        return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)

    def __get_weight(self, text):
        # BUG FIX: the old loop multiplied the first digit by 2 AND then by
        # (8 - 0) = 8, weighting it 16 instead of 2, so any NRIC/FIN with a
        # non-zero first digit (e.g. the canonical valid S1234567D) was
        # rejected.  The standard weights are 2,7,6,5,4,3,2.
        digits = (int(digit) for digit in text[1:-1])
        return sum(digit * weight for digit, weight in zip(digits, self.__WEIGHTS))
#TODO : refactor this to use the annotations instead of the module path.
class PIIDetector:
    """Discovers every concrete detector under ``src.analyze.detectors`` and
    runs them all over text values to find and redact PII."""

    def __init__(self):
        self.detectors = self.__get_detector_instances()

    def __get_detector_modules(self):
        # Walk the detectors package, skipping its test modules.
        package = src.analyze.detectors
        return [name
                for _, name, _ in pkgutil.walk_packages(path=package.__path__,
                                                        prefix=package.__name__ + ".")
                if "tests" not in name]

    def __get_detector_instances(self):
        # Import each detector module and instantiate every BaseDetector
        # subclass found in it (the abstract base itself is excluded).
        instances = []
        for module_name in self.__get_detector_modules():
            module = importlib.import_module(module_name)
            for cls_name, cls in inspect.getmembers(module, inspect.isclass):
                if cls_name != "BaseDetector" and issubclass(cls, BaseDetector):
                    instances.append(cls())
        return instances

    #TODO : Should we make this static?
    def analyze_and_redact(self, text: str):
        """Run every detector over ``text`` and redact whatever they find."""
        findings = []
        for detector in self.detectors:
            findings.extend(detector.execute(text))
        redacted = DropAnonymizer.redact(text, findings)
        return AnonymizerResult(redacted, findings)

    def __contains_pii(self, results):
        # True when at least one cell produced analyzer results.
        return any(len(result.analyzer_results) > 0 for result in results)

    def analyze_data_frame(self, input_data_frame):
        """Apply detection to every cell of the frame.

        Returns a pair: (frame of analyzer results, frame of redacted text).
        """
        analyzed = input_data_frame.applymap(self.analyze_and_redact)
        return (analyzed.applymap(lambda cell: cell.analyzer_results),
                analyzed.applymap(lambda cell: cell.redacted_text))
class TestCreditCardDetector(TestCase):
    """Unit tests for CreditCardDetector.

    Covers the detector's advertised name and regex pattern, plus validate()
    across sample numbers for the major card networks; the invalid cases are
    sample numbers with an altered final digit.
    """

    def setUp(self):
        # A fresh detector per test; these tests treat it as stateless.
        self.credit_card_detector = CreditCardDetector()

    def test_default_property_values_are_correct(self):
        self.assertEqual("CREDIT_CARD", self.credit_card_detector.name)
        # NOTE(review): the pattern reads like a loose prefix/grouping filter;
        # the real acceptance decision appears to live in validate(), whose
        # implementation is not in view here -- confirm.
        self.assertEqual('4\\d{3}|5[0-5]\\d{2}|6\\d{3}|1\\d{3}|3\\d{3}[- ]?\\d{3,4}[- ]?\\d{3,4}[- ]?\\d{3,5}',
                         self.credit_card_detector.pattern)

    def test_valid_credit_cards(self):
        # Same card number in the three accepted notations: plain, dashed, spaced.
        self.assertTrue(self.credit_card_detector.validate("4012888888881881"))
        self.assertTrue(self.credit_card_detector.validate("4012-8888-8888-1881"))
        self.assertTrue(self.credit_card_detector.validate("4012 8888 8888 1881"))

    def test_valid_airplus_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('122000000000003'))

    def test_valid_amex_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('371449635398431'))

    def test_valid_cartebleue_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('5555555555554444'))

    def test_valid_dankort_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('5019717010103742'))

    def test_valid_diners_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('30569309025904'))

    def test_valid_discover_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('6011000400000000'))

    def test_valid_jcb_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('3528000700000000'))

    def test_valid_maestro_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('6759649826438453'))

    def test_valid_mastercard_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('5555555555554444'))

    def test_valid_visa_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('4111111111111111'))

    def test_valid_visa_debit_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('4111111111111111'))

    def test_valid_visa_electron_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('4917300800000000'))

    def test_valid_visa_purchasing_credit_card(self):
        self.assertTrue(self.credit_card_detector.validate('4484070000000000'))

    def test_invalid_credit_card(self):
        # Valid test number from test_valid_credit_cards with its last digit
        # changed (1881 -> 1882); must be rejected.
        self.assertFalse(self.credit_card_detector.validate('4012-8888-8888-1882'))

    def test_invalid_diners_card(self):
        self.assertFalse(self.credit_card_detector.validate('36168002586008'))
class TestEmailDetector(TestCase):
    """Unit tests for EmailDetector: detector name, regex pattern and matching."""

    def setUp(self):
        self.email_detector = EmailDetector()

    def test_get_name_returns_the_valid_detector_name(self):
        self.assertEqual(self.email_detector.get_name(), "EMAIL")

    def test_get_pattern_returns_compiled_regex(self):
        expected_pattern = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
        self.assertEqual(expected_pattern, self.email_detector.get_pattern())

    def test_valid_email_gets_detected_correctly(self):
        matches = self.email_detector.execute("abc@hotmail.com")
        self.assertEqual(len(matches), 1)

    def test_invalid_email_does_not_get_detected(self):
        # No local part before the '@' -> the pattern must not match.
        matches = self.email_detector.execute("@hotmail.com")
        self.assertEqual(len(matches), 0)
class TestPhoneNumberDetector(TestCase):
    """Unit tests for PhoneNumberDetector across the supported SG notations."""

    def setUp(self):
        self.phone_number_detector = PhoneNumberDetector()

    def test_default_property_values_are_correct(self):
        self.assertEqual("PHONE_NUMBER", self.phone_number_detector.name)
        self.assertEqual('(\\+65?\\s?[689]\\d{7})|'
                         '(\\+65?\\s?[689]\\d{3} \\d{4})|'
                         '([689]\\d{7})|'
                         '([689]\\d{3} \\d{4})|'
                         '([(]65[)]\\s?[689]\\d{7})|'
                         '([(]65[)]\\s?[689]\\d{3} \\d{4})',
                         self.phone_number_detector.pattern)

    def test_invalid_phone_number_does_not_get_detected(self):
        # An NRIC-shaped string must not register as a phone number.
        self.assertEqual(len(self.phone_number_detector.execute("S0000001I")), 0)

    def __assert_single_result(self, text_to_be_tested, start, end):
        """Expect exactly one PHONE_NUMBER finding spanning (start, end)."""
        detected = self.phone_number_detector.execute(text_to_be_tested)
        self.assertEqual(len(detected), 1)
        self.assertEqual(AnalyzerResult(text_to_be_tested, "PHONE_NUMBER", start, end), detected[0])

    def test_valid_phone_number_gets_detected_correctly(self):
        # (number, expected start, expected end) for every supported notation:
        # +65 prefix, bare 8-digit, and (65) prefix, each with and without the
        # mid-number space, over the 6/8/9 leading digits.
        samples = [
            ("+65 65781234", 0, 12), ("+65 85781234", 0, 12), ("+65 95781234", 0, 12),
            ("+65 6578 1234", 0, 13), ("+65 8578 1234", 0, 13), ("+65 9578 1234", 0, 13),
            ("65781234", 0, 8), ("85781234", 0, 8), ("95781234", 0, 8),
            ("6578 1234", 0, 9), ("8578 1234", 0, 9), ("9578 1234", 0, 9),
            ("(65) 65781234", 0, 13), ("(65) 85781234", 0, 13), ("(65) 95781234", 0, 13),
            ("(65) 6578 1234", 0, 14), ("(65) 8578 1234", 0, 14), ("(65) 9578 1234", 0, 14),
        ]
        for number, start, end in samples:
            self.__assert_single_result(number, start, end)
test_should_detect_and_redact_nric_in_text(self): 15 | actual = self.pii_detector.analyze_and_redact("First President of Singapore NRIC was S0000001I") 16 | expected = AnonymizerResult("First President of Singapore NRIC was ", [AnalyzerResult("S0000001I", "NRIC", 38, 47)]) 17 | self.assertEqual(actual, expected) 18 | 19 | def test_should_detect_and_redact_email_in_text(self): 20 | actual = self.pii_detector.analyze_and_redact("A typical email id would look something like test@sample.com") 21 | expected = AnonymizerResult("A typical email id would look something like ", 22 | [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]) 23 | self.assertEqual(actual, expected) 24 | 25 | def test_should_detect_and_redact_phone_in_text(self): 26 | actual = self.pii_detector.analyze_and_redact("Some examples of phone numbers are +65 62345678") 27 | expected = AnonymizerResult("Some examples of phone numbers are ", 28 | [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]) 29 | self.assertEqual(actual, expected) 30 | 31 | def test_should_detect_and_redact_all_pii_fields_in_text(self): 32 | actual = self.pii_detector.analyze_and_redact("""First President of Singapore NRIC was S0000001I. 33 | A typical email id would look something like test@sample.com""") 34 | expected_redacted_text = """First President of Singapore NRIC was . 35 | A typical email id would look something like """ 36 | 37 | expected = AnonymizerResult(expected_redacted_text, [AnalyzerResult("test@sample.com", "EMAIL", 135, 150), 38 | AnalyzerResult("S0000001I", "NRIC", 38, 47)]) 39 | self.assertEqual(actual, expected) 40 | 41 | def test_analyze_returns_returns_same_text_and_no_results_when_no_PII_fields(self): 42 | input_text = """First President of Singapore NRIC was ABC. 
    def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
        """analyze_data_frame's first return value holds, per cell, the list of findings."""
        test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I",
                                                    "A typical email id would look something like test@sample.com"],
                                        "phone number": ["Some examples of phone numbers are +65 62345678",
                                                         "Some examples of phone numbers are +65 62345678"]})

        # The second element (the redacted frame) is not asserted in this test.
        actual, _ = self.pii_detector.analyze_data_frame(test_data_frame)

        # Offsets are (start, end-exclusive) positions of each match in its cell.
        expected_data_frame = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)],
                                                        [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
                                            "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
                                                             [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})

        pd.testing.assert_frame_equal(expected_data_frame, actual)
actual_report) 79 | pd.testing.assert_frame_equal(expected_result, actual_result) 80 | 81 | def test_analyze_data_frame_runs_analyze_only_on_cells_with_a_PII_value(self): 82 | test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I", 83 | "A typical email id would look something like test@sample.com"], 84 | "remarks": ["No sensitive data", 85 | "No sensitive data"]}) 86 | 87 | actual_report, actual_result = self.pii_detector.analyze_data_frame(test_data_frame) 88 | 89 | expected_report = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], 90 | [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]], 91 | "remarks": [[],[]] 92 | }) 93 | 94 | expected_result = pd.DataFrame({"summary": ["First President of Singapore NRIC was ", 95 | "A typical email id would look something like "], 96 | "remarks": ["No sensitive data", 97 | "No sensitive data"]}) 98 | 99 | pd.testing.assert_frame_equal(expected_report, actual_report) 100 | pd.testing.assert_frame_equal(expected_result, actual_result) -------------------------------------------------------------------------------- /src/analyze/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/utils/__init__.py -------------------------------------------------------------------------------- /src/analyze/utils/analyzer_result.py: -------------------------------------------------------------------------------- 1 | class AnalyzerResult: 2 | 3 | def __init__(self, text, type, start, end): 4 | self.text = text 5 | self.type = type 6 | self.start = start 7 | self.end = end 8 | 9 | def __eq__(self, other): 10 | return type(self) == type(other) and self.text == other.text and self.type == other.type \ 11 | and self.start == other.start and self.end == other.end 12 | 13 | def __repr__(self): 14 | return self.__str__() 15 | 
class RegEx:
    """Tiny fluent builder that assembles a regular-expression string.

    Each chaining method appends one regex fragment to an internal buffer and
    returns self, so calls can be chained; build() returns the accumulated
    pattern string (it is not compiled here).
    """

    def __init__(self):
        self.regex_string = ""

    def __is_numeric(self, value):
        # NOTE: bool is a subclass of int, so True/False also pass this check;
        # kept as-is to preserve existing behavior.
        return isinstance(value, int)

    def __is_single_character_value(self, value):
        return len(str(value)) == 1

    def __validate_range(self, start, end):
        if start > end:
            raise ValueError("Range start should be less than end")

    def boundary(self):
        """Append a word boundary (\\b)."""
        self.regex_string += "\\b"
        return self

    def pipe(self):
        """Append an alternation bar (|)."""
        self.regex_string += "|"
        return self

    def range(self, from_char, to_char):
        """Append a character range [from_char-to_char].

        Raises:
            ValueError: if either bound is not a single character, or the
                bounds are out of order.
        """
        if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
            raise ValueError("Range boundaries should be single character")

        self.__validate_range(from_char, to_char)
        self.regex_string += "[{}-{}]".format(from_char, to_char)
        return self

    def one_of(self, character_set):
        """Append a character class [character_set].

        Raises:
            ValueError: if character_set is None or empty.
        """
        if character_set is None or character_set == "":
            raise ValueError("Character Set should not be empty")

        self.regex_string += "[" + character_set + "]"
        return self

    def any_digit(self):
        """Append \\d (any single digit)."""
        self.regex_string += "\\d"
        return self

    def num_occurrences(self, number):
        """Append an exact repetition count {number}.

        Raises:
            ValueError: if number is less than 1. The message was added for
                consistency with the other validators, which all raise with
                an explanatory message.
        """
        if number < 1:
            raise ValueError("Number of occurrences should be positive")

        self.regex_string += "{" + str(number) + "}"
        return self

    def one_or_more_occurrences(self):
        """Append + (one or more of the preceding fragment)."""
        self.regex_string += "+"
        return self

    def zero_or_more_occurrences(self):
        """Append * (zero or more of the preceding fragment)."""
        self.regex_string += "*"
        return self

    def zero_or_one_occurrences(self):
        """Append ? (zero or one of the preceding fragment)."""
        self.regex_string += "?"
        return self

    def range_occurrences(self, start, end):
        """Append a bounded repetition {start,end}.

        Raises:
            TypeError: if either bound is not an int.
            ValueError: if start is greater than end.
        """
        if not self.__is_numeric(start) or not self.__is_numeric(end):
            raise TypeError("Range should be integers")

        self.__validate_range(start, end)
        self.regex_string += "{" + str(start) + "," + str(end) + "}"
        return self

    def literal(self, literal):
        """Append *literal* verbatim; the caller is responsible for escaping."""
        self.regex_string += literal
        return self

    def build(self):
        """Return the accumulated pattern string."""
        return self.regex_string
10).__repr__(), expected) 22 | 23 | def test_str(self): 24 | expected = "Text sample_data at position (0,10) was identified as type" 25 | self.assertEqual(str(AnalyzerResult("sample_data", "type", 0, 10)), expected) 26 | 27 | def test_get_detector_fetches_detector_type_correctly(self): 28 | result = AnalyzerResult("text", "EMAIL", 0, 10) 29 | self.assertEqual(result.detector(), "EMAIL") -------------------------------------------------------------------------------- /src/analyze/utils/tests/test_regex.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from src.analyze.utils.regex import RegEx 4 | 5 | 6 | class TestRegEx(TestCase): 7 | 8 | # Testing one_of 9 | def test_when_one_of_param_is_empty_throws_error(self): 10 | self.assertRaises(ValueError, lambda: RegEx().one_of("").build()) 11 | 12 | def test_when_valid_input_is_passed_one_of_returns_correct_output(self): 13 | self.assertEqual("[AB]", RegEx().one_of("AB").build()) 14 | self.assertEqual("[357]", RegEx().one_of("357").build()) 15 | 16 | # Testing num_occurrences 17 | def test_when_non_positive_number_of_occurrences_throws_error(self): 18 | self.assertRaises(ValueError, lambda: RegEx().num_occurrences(-7).build()) 19 | self.assertRaises(ValueError, lambda: RegEx().num_occurrences(0).build()) 20 | 21 | def test_when_valid_input_is_passed_num_occurrences_returns_correct_output(self): 22 | self.assertEqual("{7}", RegEx().num_occurrences(7).build()) 23 | 24 | # Testing any_digit 25 | def test_when_any_digit_returns_correct_output(self): 26 | self.assertEqual("\\d", RegEx().any_digit().build()) 27 | 28 | def __assert_value_error_is_raised(self, fn, msg): 29 | with self.assertRaises(ValueError) as ve: 30 | fn() 31 | self.assertEqual(str(ve.exception), msg) 32 | 33 | def __assert_type_error_is_raised(self, fn, msg): 34 | with self.assertRaises(TypeError) as ve: 35 | fn() 36 | self.assertEqual(str(ve.exception), msg) 37 | 38 | # Testing 
range 39 | def test_when_range_is_incomplete(self): 40 | single_character = "Range boundaries should be single character" 41 | self.__assert_value_error_is_raised(lambda: RegEx().range("", "Z").build(), single_character) 42 | self.__assert_value_error_is_raised(lambda: RegEx().range("0", "").build(), single_character) 43 | self.__assert_value_error_is_raised(lambda: RegEx().range("01", "9").build(), single_character) 44 | self.__assert_value_error_is_raised(lambda: RegEx().range("A", "YZ").build(), single_character) 45 | 46 | def test_when_invalid_range_boundaries_are_provided(self): 47 | less_than_end = "Range start should be less than end" 48 | self.__assert_value_error_is_raised(lambda: RegEx().range("B", "A").build(), less_than_end) 49 | self.__assert_value_error_is_raised(lambda: RegEx().range("9", "0").build(), less_than_end) 50 | 51 | def test_when_valid_input_is_passed_range_returns_correct_output(self): 52 | self.assertEqual("[A-Z]", RegEx().range("A", "Z").build()) 53 | self.assertEqual("[0-9]", RegEx().range("0", "9").build()) 54 | 55 | # Testing range_occurrences 56 | def test_when_invalid_numeric_range_boundaries_are_provided(self): 57 | less_than_end = "Range start should be less than end" 58 | self.__assert_value_error_is_raised(lambda: RegEx().range_occurrences(9, 0).build(), less_than_end) 59 | 60 | def test_when_invalid_input_for_range_occurrences_throws_error(self): 61 | range_should_be_integers = "Range should be integers" 62 | self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences(1.2, 2).build(), range_should_be_integers) 63 | self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences("A", 9).build(), range_should_be_integers) 64 | 65 | def test_when_valid_input_is_passed_range_occurrences_returns_correct_output(self): 66 | self.assertEqual("{0,9}", RegEx().range_occurrences(0, 9).build()) 67 | 68 | # Testing one_or_more_occurrences 69 | def 
class AnonymizerResult:
    """Value object pairing redacted text with the detector findings behind it."""

    def __init__(self, redacted_text, analyzer_results):
        self.redacted_text = redacted_text
        self.analyzer_results = analyzer_results

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return (self.redacted_text == other.redacted_text
                and self.analyzer_results == other.analyzer_results)

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "PII information found: \n{}\nRedacted text: {}".format(
            self.analyzer_results, self.redacted_text)
containing ") 13 | 14 | def test_redact_for_multiple_analyzer_results(self): 15 | text = "text containing pii1 and pii2" 16 | analyzer_results = [AnalyzerResult("pii1", "PII_DETECTOR", 16, 19), 17 | AnalyzerResult("pii2", "PII_DETECTOR", 25, 28)] 18 | result = DropAnonymizer.redact(text, analyzer_results) 19 | self.assertEqual(result, "text containing and ") 20 | 21 | -------------------------------------------------------------------------------- /src/constants.py: -------------------------------------------------------------------------------- 1 | ACQUIRE="acquire" 2 | FILE_PATH="file_path" 3 | ANALYZE="analyze" 4 | REPORT="report" 5 | LOCATION="location" 6 | REPORT_LEVEL="level" 7 | OUTPUT_FILE_PATH="output_file_path" -------------------------------------------------------------------------------- /src/dpf_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath('.')) 4 | 5 | import argparse 6 | import json 7 | 8 | from src.report.report_generator import ReportGenerator 9 | from src.acquire.csv_parser import CsvParser 10 | from src.analyze.detectors.pii_detector import PIIDetector 11 | from src.constants import ACQUIRE, REPORT 12 | from src.write.csv_writer import CsvWriter 13 | 14 | 15 | class DPFMain(): 16 | 17 | def __init__(self, config_file_path): 18 | with open(config_file_path) as config_file: 19 | self.config = json.load(config_file) 20 | 21 | #TODO : validate the config for the stages right here 22 | def run(self): 23 | parsed_data_frame = CsvParser(config=self.config[ACQUIRE]).parse() 24 | pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(parsed_data_frame) 25 | if pii_analysis_report.empty: 26 | print("NO PII VALUES WERE FOUND!") 27 | else: 28 | ReportGenerator(config=self.config[REPORT])\ 29 | .generate(results_df=pii_analysis_report, 30 | ) 31 | CsvWriter(config=self.config).write_csv(df=redacted_data_frame) 32 | 33 | 34 | # 
def get_args(argv=None):
    """Parse command-line arguments for the tool.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case argparse falls back to sys.argv[1:] -- so the existing
            no-argument call sites keep working, while tests can inject
            arguments directly.

    Returns:
        argparse.Namespace with a `config_file` attribute.

    Raises:
        ValueError: when --config-file is not supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config-file', help='config file to run the tool')
    args = parser.parse_args(argv)
    if not args.config_file:
        raise ValueError("Config file path should be provided for the tool to run.")
    return args
file_handler.setFormatter(formatter) 35 | logging.getLogger().addHandler(file_handler) 36 | logging.getLogger().setLevel(logging.INFO) 37 | 38 | def __generate_high_level_report(self, results_df): 39 | report_df = pd.DataFrame({"Columns with PII values" : results_df.columns.values}) 40 | return report_df 41 | 42 | def __collate_all_detectors_per_cell(self, analyzer_result): 43 | return [result.detector() for result in analyzer_result[1]] 44 | 45 | def __calculate_percentage(self, item_count, total_count): 46 | return round((item_count/total_count) * 100.0, 2) 47 | 48 | def __calculate_detector_percentage(self, row_count, count_map): 49 | percentage_map = {} 50 | for key, value in count_map.items(): 51 | percentage_map[key] = "{}%".format(self.__calculate_percentage(value, row_count)) 52 | return percentage_map 53 | 54 | def __calculate_detector_count(self, column_series): 55 | detector_count_map = {} 56 | for analyzer_results in column_series.iteritems(): 57 | if not analyzer_results: 58 | continue 59 | detector_types = self.__collate_all_detectors_per_cell(analyzer_results) 60 | for detector_type in detector_types: 61 | if detector_type not in detector_count_map: 62 | detector_count_map[detector_type] = 0 63 | detector_count_map[detector_type] += 1 64 | return detector_count_map 65 | 66 | 67 | #TODO : filter out the NAs before passing through this 68 | def calculate_detector_stats_for_each_column(self, column_series): 69 | stats_map = {} 70 | count_map = self.__calculate_detector_count(column_series) 71 | percentage_map = self.__calculate_detector_percentage(len(column_series), count_map) 72 | for key, value in count_map.items(): 73 | stats_tuple = (value, percentage_map[key]) 74 | stats_map[key] = stats_tuple 75 | return stats_map 76 | 77 | def __generate_medium_level_report(self, results_df): 78 | report_df = pd.DataFrame({}) 79 | columns = list(results_df) 80 | column_reports = [] 81 | for column in columns: 82 | detector_stats_for_each_column = 
self.calculate_detector_stats_for_each_column(results_df[column]) 83 | column_report = pd.Series(detector_stats_for_each_column, name=column, index=detector_stats_for_each_column.keys()) 84 | if not column_report.empty: 85 | column_reports.append(column_report) 86 | if column_reports: 87 | report_df = pd.concat(column_reports, axis=1, keys=[series.name for series in column_reports], sort=True) 88 | return report_df.fillna(value=0) 89 | 90 | def generate_report_content(self, results_df): 91 | if self.report_level == ReportLevel.HIGH.value: 92 | return self.__generate_high_level_report(results_df) 93 | elif self.report_level == ReportLevel.MEDIUM.value: 94 | return self.__generate_medium_level_report(results_df) 95 | 96 | def __print(self, msg): 97 | print(msg) 98 | logging.info(msg) 99 | 100 | def __print_report(self, report): 101 | self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n") 102 | if report.empty: 103 | self.__print("NO PII VALUES WERE FOUND!") 104 | else: 105 | self.__print(report) 106 | self.__print("\n\n****************************DONE!**************************\n\n") 107 | 108 | def generate(self, results_df): 109 | final_report = self.generate_report_content(results_df) 110 | self.__print_report(final_report) 111 | return final_report 112 | 113 | -------------------------------------------------------------------------------- /src/report/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/report/tests/__init__.py -------------------------------------------------------------------------------- /src/report/tests/test_report_generator.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from unittest.mock import patch, MagicMock 3 | 4 | import os 5 | import pandas as pd 6 | from 
from freezegun import freeze_time

from src.report.report_generator import ReportGenerator
from src.analyze.utils.analyzer_result import AnalyzerResult


class TestReportGenerator(TestCase):
    """Unit tests for ReportGenerator report content and log-file setup."""

    @patch("src.report.report_generator.ReportGenerator.setup_logging_config")
    def setUp(self, mock_setup_logging_config):
        # Logging setup is patched so constructing the generators never
        # touches the filesystem.
        self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
        mock_setup_logging_config.assert_called_with()
        self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
        mock_setup_logging_config.assert_called_with()

    def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
        # Both columns contain at least one AnalyzerResult, so both appear.
        result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
                                          "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
        expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
        # NOTE(review): assertCountEqual over DataFrames compares column
        # labels, not cell values — confirm this strength is intended.
        self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(result_data_frame))

    def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
        result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
                                          "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
        expected_data_frame = pd.DataFrame({"summary" : pd.Series({"NRIC" : (1, "50%"), "EMAIL" : (1,"50%")}),
                                            "phone number" : pd.Series({"PHONE_NUMBER" : (2, "100%")})})
        self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.generate_report_content(result_data_frame))

    def test_calculate_detector_stats_returns_detector_counts_and_percentages(self):
        # 1 NRIC hit and 2 EMAIL hits over 3 rows -> 33.33% / 66.67%.
        result_column_values = pd.Series([[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]])
        actual_result = self.report_generator_medium_level.calculate_detector_stats_for_each_column(result_column_values)
        expected_result = {"NRIC" : (1, "33.33%"), "EMAIL" : (2, "66.67%")}
        self.assertCountEqual(expected_result, actual_result)

    @patch("logging.info")
    @patch("src.report.report_generator.ReportGenerator.generate_report_content")
    def test_generate_report_calls_content_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
        result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
                                          "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
        mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
        mock_logging.return_value = None
        expected_result = self.report_generator_high_level.generate(result_data_frame)
        self.assertCountEqual(expected_result, mock_generate_content.return_value)

    @freeze_time('2019-05-29 01:01:03')
    @patch("logging.FileHandler")
    @patch("logging.Logger.addHandler")
    @patch("genericpath.exists")
    def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
        # NOTE(review): this patches genericpath.exists (the module that
        # os.path.exists resolves to) while the sibling test patches
        # os.path.exists — both work; pick one for consistency.
        mock_file_exists.return_value = False
        mock_file_handler.return_value = MagicMock()
        self.report_generator_high_level.setup_logging_config()
        # mode "x" -> exclusive create when the file does not exist yet
        mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
        mock_add_handler.assert_called_with(mock_file_handler.return_value)

    @freeze_time('2019-05-29 01:01:03')
    @patch("logging.FileHandler")
    @patch("logging.Logger.addHandler")
    @patch("os.path.exists")
    def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
        mock_file_exists.return_value = True
        mock_file_handler.return_value = MagicMock()
        self.report_generator_high_level.setup_logging_config()
        # mode "a" -> append to the existing day's report file
        mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
        mock_add_handler.assert_called_with(mock_file_handler.return_value)
import json
import os
from unittest import TestCase
from unittest.mock import patch, MagicMock

import pandas as pd

from src.constants import ACQUIRE, REPORT
from src.dpf_main import DPFMain


class TestDPFMain(TestCase):
    """Pipeline-wiring tests: DPFMain.run must invoke acquire, analyze,
    report and write stages with the right pieces of the parsed config."""

    def setUp(self):
        test_config = "{}/{}".format(os.path.dirname(os.path.realpath(__file__)), "config/test_config.json")
        self.dpf_main = DPFMain(test_config)
        with open(test_config) as input_file:
            self.config_json = json.load(input_file)

    @patch('src.write.csv_writer.CsvWriter.write_csv')
    @patch('src.write.csv_writer.CsvWriter.__init__')
    @patch('src.report.report_generator.ReportGenerator.generate')
    @patch('src.report.report_generator.ReportGenerator.__init__')
    @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
    @patch('src.acquire.csv_parser.CsvParser.parse')
    @patch('src.acquire.csv_parser.CsvParser.__init__')
    def test_run_parses_the_config_file_and_invokes_respective_stages_correctly(self, mock_csv_parser_init,
                                                                                mock_csv_parser_parse,
                                                                                mock_pii_analyze_df,
                                                                                mock_report_generator_init,
                                                                                mock_generate_report,
                                                                                mock_csv_writer_init,
                                                                                mock_csv_writer_write_csv):
        """Non-empty analysis results: every stage runs."""
        mock_csv_parser_init.return_value = None
        mock_csv_parser_parse.return_value = MagicMock()
        # (report_df, anonymized_df) — report is non-empty so reporting runs
        mock_pii_analyze_df.return_value = (pd.DataFrame({"summary" : ["test result"]}), pd.DataFrame({}))
        mock_report_generator_init.return_value = None
        mock_generate_report.return_value = MagicMock()
        mock_csv_writer_init.return_value = None
        mock_csv_writer_write_csv.return_value = None
        self.dpf_main.run()
        mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
        mock_csv_parser_parse.assert_called_with()
        mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
        mock_report_generator_init.assert_called_with(config=self.config_json[REPORT])
        mock_generate_report.assert_called_with(results_df=mock_pii_analyze_df.return_value[0])
        mock_csv_writer_init.assert_called_with(config=self.config_json)
        mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])

    @patch('src.write.csv_writer.CsvWriter.write_csv')
    @patch('src.write.csv_writer.CsvWriter.__init__')
    @patch('src.report.report_generator.ReportGenerator.generate')
    @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
    @patch('src.acquire.csv_parser.CsvParser.parse')
    @patch('src.acquire.csv_parser.CsvParser.__init__')
    def test_run_short_circuits_generate_report_when_no_PII_values_detected(self, mock_csv_parser_init,
                                                                            mock_csv_parser_parse,
                                                                            mock_pii_analyze_df,
                                                                            mock_generate_report,
                                                                            mock_csv_writer_init,
                                                                            mock_csv_writer_write_csv):
        """Empty analysis results: reporting is skipped, writing still runs."""
        mock_csv_parser_init.return_value = None
        mock_csv_parser_parse.return_value = pd.DataFrame({})
        mock_pii_analyze_df.return_value = (pd.DataFrame({}), pd.DataFrame({}))
        # (removed a dead `mock_generate_report.return_value = MagicMock()`
        # that was immediately overwritten by None)
        mock_generate_report.return_value = None
        mock_csv_writer_init.return_value = None
        mock_csv_writer_write_csv.return_value = None
        self.dpf_main.run()
        mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
        mock_csv_parser_parse.assert_called_with()
        mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
        mock_generate_report.assert_not_called()
        mock_csv_writer_init.assert_called_with(config=self.config_json)
        mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])
class CsvWriter:
    """Writes the anonymized DataFrame as CSV into the configured output
    directory, deriving the file name from the input file."""

    def __init__(self, config):
        """Validate config and capture output directory and input file name.

        Raises ValueError when the anonymize/output_file_path entry is
        missing or empty.
        """
        self.__validate_config(config)
        self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
        self.input_file_name = config["acquire"][FILE_PATH]

    def __validate_config(self, config):
        # Only the anonymize section is validated here; a missing
        # acquire/file_path still surfaces as a KeyError in __init__
        # (see the TODO in the writer tests).
        if "anonymize" not in config or not config["anonymize"] or OUTPUT_FILE_PATH not in config["anonymize"] or not config["anonymize"][OUTPUT_FILE_PATH]:
            raise ValueError("Config 'output_file_path' needs to be provided for parsing")

    def get_output_file_path(self):
        """Return <output_path>/<input stem>_anonymized_.csv.

        Uses rsplit so only the final extension is stripped; the previous
        split('.')[0] truncated multi-dot names ("my.data.csv" -> "my").
        """
        file_name = self.input_file_name.split('/')[-1]
        file_name_no_extension = file_name.rsplit('.', 1)[0]
        result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
        return result

    def write_csv(self, df: DataFrame):
        """Write df (without the index) to the derived output path."""
        df.to_csv(self.get_output_file_path(), index=False)
        print("Anonymized csv has been successfully created!")
class TestCsvWriter(TestCase):
    """Unit tests for CsvWriter config validation and output-path logic."""

    #TODO: check acquire file path exists
    def test_invalid_config_gets_caught_during_initialization(self):
        context = {}
        with self.assertRaises(ValueError) as ve:
            CsvWriter(config=context)
        # Assert on the message *after* the with-block: inside it the line
        # would be skipped once the exception fires.
        self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing")

    def test_correct_output_path_is_generated(self):
        context = {
            "acquire": {
                "file_path": "/anonymizer/test_data.csv",
                "delimiter": ","
            },
            "anonymize": {
                "output_file_path" : "/anonymizer/output"
            }
        }
        input_file_name = "test_data"
        output_directory = "/anonymizer/output"
        expected = f"{output_directory}/{input_file_name}_anonymized_.csv"
        writer = CsvWriter(config=context)
        self.assertEqual(writer.get_output_file_path(), expected)
# NOTE(review): tail of CsvParser.__init__ plus the remaining methods; the
# class statement and the first lines of __init__ sit above this chunk.
# Delimiter defaults to "," when absent or empty.
self.delimiter = config["delimiter"] if "delimiter" in config and config["delimiter"] else ","
self.spark = spark

def __validate_config(self, config):
    # NOTE(review): validation uses the FILE_PATH constant while __init__
    # reads the literal "file_path" key — same value today, but unify.
    if FILE_PATH not in config or not config[FILE_PATH]:
        raise ValueError("Config 'file_path' needs to be provided for parsing")

def parse(self):
    """Read the configured CSV into a Spark DataFrame.

    Assumes a header row and lets Spark infer the schema (both options are
    passed as the strings "true", which pyspark accepts).
    """
    df = self.spark.read.load(
        self.input_path,
        format="csv",
        sep=self.delimiter,
        header="true",
        inferSchema="true")
    return df
import os
from unittest import TestCase
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from src_spark.acquire.csv_parser import CsvParser


class TestCsvParser(TestCase):
    """Tests for the Spark CSV parser against small fixture files."""

    def setUp(self) -> None:
        # getOrCreate reuses one local session across tests; there is no
        # tearDown/stop(), which is fine for a local test run.
        self.SPARK = SparkSession.builder \
            .master("local") \
            .appName("Test CSVParser") \
            .getOrCreate()
        self.current_dir = os.path.dirname(os.path.realpath(__file__))

    def test_invalid_config_gets_caught_during_initialization(self):
        context = {}
        with self.assertRaises(ValueError) as ve:
            CsvParser(self.SPARK, config=context)
        self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")

    def test_if_valid_csv_file_provided_returns_spark_df(self):
        file_path = "{}/data/comma_delimited_file.csv".format(self.current_dir)
        # Empty delimiter must fall back to the "," default.
        config = {"file_path" : file_path, "delimiter" : ""}

        expected = self.SPARK.createDataFrame(
            [("Lisa Beard", "557-39-2479")],
            ["name", "ssn"]
        )
        actual = CsvParser(spark=self.SPARK, config=config).parse()

        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())

    def test_if_valid_csv_file_with_different_delimiter_provided_returns_spark_df(self):
        file_path = "{}/data/pipe_delimited_file.csv".format(self.current_dir)
        config = {"file_path" : file_path, "delimiter" : "|"}

        expected = self.SPARK.createDataFrame(
            [("Lisa Beard", "557-39-2479")],
            ["name", "ssn"]
        )
        actual = CsvParser(spark=self.SPARK, config=config).parse()

        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())

    # NOTE(review): name says "pandas_df" but a Spark DataFrame is returned
    # and asserted — rename for clarity.
    def test_if_empty_csv_file_returns_empty_pandas_df(self):
        file_path = "{}/data/empty.csv".format(self.current_dir)
        config = {"file_path" : file_path}
        expected = self.SPARK.createDataFrame([], StructType([]))
        actual = CsvParser(spark=self.SPARK, config=config).parse()
        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())
class CreditCardDetector(BaseDetector):
    """Detects credit card numbers and confirms them with the Luhn checksum.

    The previous RegEx-builder pattern had an alternation-precedence bug:
    the separator/digit tail attached only to the last prefix alternative,
    so e.g. a Visa number matched just its first four digits and then failed
    Luhn validation.  The prefix alternatives are now grouped explicitly in
    a plain regex string (the builder exposes no grouping primitive).
    """

    def __init__(self):
        self.name = "CREDIT_CARD"
        # (?:prefix) then three further digit groups, each optionally
        # preceded by "-" or " " — same prefixes and group sizes as before.
        self.pattern = (r"(?:4\d{3}|5[0-5]\d{2}|6\d{3}|1\d{3}|3\d{3})"
                        r"[- ]?\d{3,4}"
                        r"[- ]?\d{3,4}"
                        r"[- ]?\d{3,5}")

    def get_name(self):
        return self.name

    def get_pattern(self):
        return self.pattern

    def validate(self, text):
        """Luhn checksum over the candidate; '-' and ' ' separators ignored."""
        def digits_of(n):
            return [int(d) for d in str(n)]

        digits = digits_of(text.replace('-', '').replace(' ', ''))
        odd_digits = digits[-1::-2]
        even_digits = digits[-2::-2]
        checksum = sum(odd_digits)

        for d in even_digits:
            checksum += sum(digits_of(d * 2))

        return checksum % 10 == 0
class NationalIdDetector(BaseDetector):
    """Detects Singapore NRIC/FIN identifiers: [STFG] + 7 digits + check letter."""

    def __init__(self):
        self.name = "NRIC"
        self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()

    def get_name(self):
        return self.name

    def get_pattern(self):
        return self.pattern

    def __get_offset(self, text):
        # T- and G-prefixed series add 4 to the checksum before mod 11.
        return 4 if text in "TG" else 0

    def __is_NRIC(self, text, loc):
        # S/T series use this check-letter table, indexed by the checksum.
        if text[0] in "ST":
            return "JZIHGFEDCBA"[loc] == text[8]
        return False

    def __is_FIN(self, text, loc):
        # F/G (foreigner) series use a different table.
        if text[0] in "FG":
            return "XWUTRQPNMLK"[loc] == text[8]
        return False

    def validate(self, text):
        """Check-digit validation per the published NRIC/FIN algorithm."""
        weight = self.__get_weight(text)
        first_character = text[0]
        offset = self.__get_offset(first_character)
        loc = (offset + weight) % 11
        return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)

    def __get_weight(self, text):
        # Documented digit weights are 2,7,6,5,4,3,2.  The old loop applied
        # `*= 2` AND `*= 8 - index` to the first digit (effective weight 16);
        # the bug was masked because common fixtures start with digit 0.
        weights = (2, 7, 6, 5, 4, 3, 2)
        return sum(int(digit) * w for digit, w in zip(text[1:8], weights))
import importlib
import pkgutil
import inspect
import sys
from pyspark.sql import DataFrame
from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType
from src_spark.analyze.detectors.base_detector import BaseDetector
import src_spark.analyze.detectors

class PIIDetector():
    """Runs every registered detector over a Spark DataFrame, producing an
    analysis report and a redacted copy of the input."""

    def __init__(self):
        # Detector instances are discovered reflectively at construction.
        self.detectors = self.__get_detector_instances()

    def __get_detector_modules(self):
        # All modules under src_spark.analyze.detectors except test packages.
        modules = [modname for importer, modname, ispkg in
                   pkgutil.walk_packages(path=src_spark.analyze.detectors.__path__,
                                         prefix=src_spark.analyze.detectors.__name__+".")
                   if "tests" not in modname]
        return modules

    def __get_detector_instances(self):
        # Import each module and instantiate every BaseDetector subclass
        # (the abstract base itself is skipped by name).
        modules = self.__get_detector_modules()
        detectors = []
        for module in modules:
            importlib.import_module(module)
            classes = inspect.getmembers(sys.modules[module], inspect.isclass)
            for class_name, class_type in classes:
                if class_name != "BaseDetector" and issubclass(class_type, BaseDetector):
                    detectors.append(class_type())
        return detectors

    def __detect_pii_row(self, row):
        # One list of AnalyzerResults per cell; assumes cells are strings
        # (detector.execute runs regexes over the raw text) — TODO confirm
        # non-string columns are excluded upstream.
        new_row = []
        for element in row:
            results = []
            for detector in self.detectors:
                results += detector.execute(element)
            new_row.append(results)

        return new_row

    def get_analyzer_results(self, input_data_frame: DataFrame):
        """Return a DataFrame with, per input cell, the list of matches as
        (end, start, text, type) structs."""
        columns = input_data_frame.columns

        # Fields are listed alphabetically — presumably to line up with how
        # the AnalyzerResult objects are serialized into Rows; confirm
        # before reordering.
        array_structtype = StructType([
            StructField("end", LongType(), False),
            StructField("start", LongType(), False),
            StructField("text", StringType(), False),
            StructField("type", StringType(), False)
        ])
        result_schema = []
        for column in columns:
            result_schema.append(StructField(column, ArrayType(array_structtype, True), nullable=False) )

        result = input_data_frame.rdd.map(lambda x: self.__detect_pii_row(x)).toDF(schema=StructType(result_schema))

        return result

    def _get_pii_list(self, row):
        # Flatten one report row into the list of matched PII strings.
        get_analyzer_results_text = lambda x: x.text

        new_row = []
        for cell in row:
            pii_sublist = list(map(get_analyzer_results_text,cell))
            new_row.extend(pii_sublist)
        return new_row

    def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
        """Return a copy of the input with every reported PII string removed.

        NOTE(review): the full PII list is collected to the driver and every
        occurrence of every string is removed from every cell — substring
        collisions can over-redact unrelated text.
        """
        pii_list = report.rdd.flatMap(lambda row: self._get_pii_list(row)).collect()
        column = input_data_frame.columns
        result = input_data_frame.rdd.map(lambda row: self.__replace_redacted_text(row, pii_list)).toDF(column)

        return result

    def __replace_redacted_text(self, row, pii_list):
        # Plain substring removal, cell by cell.
        new_row = []
        for cell in row:
            for word in pii_list:
                if word in cell:
                    cell = cell.replace(word, "")
            new_row.append(cell)
        return new_row

    def analyze_data_frame(self, input_data_frame: DataFrame):
        """Return (report, redacted) for the given input DataFrame."""
        report = self.get_analyzer_results(input_data_frame)
        redacted = self.get_redacted_text(input_data_frame, report)

        return report, redacted
from unittest import TestCase
from pyspark.sql import SparkSession
from src_spark.analyze.detectors.pii_detector import PIIDetector
# NOTE(review): AnalyzerResult is imported from src (the pandas variant)
# rather than src_spark — the classes are equivalent today; confirm.
from src.analyze.utils.analyzer_result import AnalyzerResult
from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType, Row


class TestPIIDetector(TestCase):
    """Tests for the Spark PIIDetector: analysis report and redaction."""

    def setUp(self) -> None:
        self.SPARK = SparkSession.builder \
            .master("local") \
            .appName("Test PIIDetector") \
            .getOrCreate()
        self.pii_detector = PIIDetector()

        # Mirrors the (end, start, text, type) struct schema the detector
        # builds for its report DataFrames.
        self.array_structtype = StructType([
            StructField("end", LongType(), False),
            StructField("start", LongType(), False),
            StructField("text", StringType(), False),
            StructField("type", StringType(), False)
        ])
        self.schema = StructType([
            StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
            StructField("phone number", ArrayType(self.array_structtype, True), nullable=False)
        ])

    def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
                ("A typical email id would look something like test@sample.com","Some examples of phone numbers are +65 62345678")
            ],
            ["summary", "phone number"]
        )

        actual = self.pii_detector.get_analyzer_results(test_data_frame)

        expected_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
                ([AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
            ],
            self.schema
        )

        self.assertEqual(actual.schema, expected_data_frame.schema)
        self.assertEqual(actual.collect(), expected_data_frame.collect())

    def test_analyze_data_frame_runs_analyze_against_cell_with_multiple_PII_values(self):
        # Cells containing several PII values must yield several results.
        test_data_frame = self.SPARK.createDataFrame(
            [
                ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
                ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
            ],
            ["summary", "phone number"]
        )

        actual = self.pii_detector.get_analyzer_results(test_data_frame)

        expected_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
                ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
            ],
            self.schema
        )

        self.assertEqual(actual.schema, expected_data_frame.schema)
        self.assertEqual(actual.collect(), expected_data_frame.collect())

    def test_analyze_data_frame_returns_empty_data_frame_when_there_are_no_PII_values(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ("No", "Personal"),
                ("Data","Inside")
            ],
            ["summary", "phone number"]
        )

        actual = self.pii_detector.get_analyzer_results(test_data_frame)

        expected_data_frame = self.SPARK.createDataFrame(
            [
                ([], []),
                ([], [])
            ],
            self.schema
        )

        self.assertEqual(actual.schema, expected_data_frame.schema)
        self.assertEqual(actual.collect(), expected_data_frame.collect())

    def test_get_pii_list_returns_list_of_pii_words_given_row_of_list_of_analyzer_results(self):
        test_row = Row(
            summary=[
                AnalyzerResult("S0000001I", "NRIC", 38, 47),
                AnalyzerResult("S0000002I", "NRIC", 38, 47)
            ],
            phone_number=[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
        actual = self.pii_detector._get_pii_list(test_row)
        expected = ["S0000001I","S0000002I","+65 62345678"]
        self.assertEqual(actual, expected)

    def test_get_pii_list_returns_empty_lists_no_analyzer_results(self):
        test_row = Row(summary=[],phone_number=[])
        actual = self.pii_detector._get_pii_list(test_row)
        expected = []
        self.assertEqual(actual, expected)

    def test_get_redacted_text_returns_redacted_data_frame(self):
        # Every PII string from the report must be removed from the input.
        test_report_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
                ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
            ],
            self.schema
        )

        test_input_data_frame = self.SPARK.createDataFrame(
            [
                ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
                ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
            ],
            ["summary", "phone number"]
        )

        actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)

        expected = self.SPARK.createDataFrame(
            [
                ("First President of Singapore NRIC was ", "Some examples of phone numbers are "),
                ("email and phone ","Phone one Phone two ")
            ],
            ["summary", "phone number"]
        )

        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())

    def test_get_redacted_text_returns_same_data_frame_if_analyzer_results_are_empty(self):
        test_report_data_frame = self.SPARK.createDataFrame(
            [
                ([], []),
                ([], [])
            ],
            self.schema
        )

        test_input_data_frame = self.SPARK.createDataFrame(
            [
                ("No", "Personal"),
                ("Data","Inside")
            ],
            ["summary", "phone number"]
        )

        actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)

        expected = self.SPARK.createDataFrame(
            [
                ("No", "Personal"),
                ("Data","Inside")
            ],
            ["summary", "phone number"]
        )

        self.assertEqual(actual.schema, expected.schema)
        self.assertEqual(actual.collect(), expected.collect())
class AnalyzerResult:
    """Value object describing a single PII match found in a piece of text."""

    def __init__(self, text, type, start, end):
        self.text = text    # the matched substring
        self.type = type    # detector name, e.g. "EMAIL", "NRIC", "PHONE_NUMBER"
        self.start = start  # start offset of the match within the source text
        self.end = end      # end offset of the match within the source text

    def __eq__(self, other):
        return type(self) == type(other) and self.text == other.text and self.type == other.type \
            and self.start == other.start and self.end == other.end

    # Defining __eq__ sets __hash__ to None in Python 3, which made results
    # unusable in sets / as dict keys; restore a hash consistent with __eq__.
    def __hash__(self):
        return hash((self.text, self.type, self.start, self.end))

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return "Text {} at position ({},{}) was identified as {}".format(self.text, self.start, self.end, self.type)

    def detector(self):
        """Return the name of the detector that produced this result."""
        return self.type


class RegEx:
    """Tiny fluent builder for regular-expression strings.

    Each method appends a fragment to the internal pattern and returns self,
    so calls can be chained; build() returns the accumulated pattern string.
    """

    def __init__(self):
        self.regex_string = ""

    def __is_numeric(self, value):
        # bool is a subclass of int; reject it so e.g. range_occurrences(True, 3)
        # fails loudly instead of silently producing a bogus quantifier.
        return isinstance(value, int) and not isinstance(value, bool)

    def __is_single_character_value(self, value):
        return len(str(value)) == 1

    def __validate_range(self, start, end):
        if start > end:
            raise ValueError("Range start should be less than end")

    def boundary(self):
        """Append a word boundary (\\b)."""
        self.regex_string += "\\b"
        return self

    def pipe(self):
        """Append an alternation operator (|)."""
        self.regex_string += "|"
        return self

    def range(self, from_char, to_char):
        """Append a character range like [a-z]; bounds must be single characters."""
        if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
            raise ValueError("Range boundaries should be single character")

        self.__validate_range(from_char, to_char)
        self.regex_string += "[{}-{}]".format(from_char, to_char)
        return self

    def one_of(self, character_set):
        """Append a character class built from the given (non-empty) characters."""
        if character_set is None or character_set == "":
            raise ValueError("Character Set should not be empty")

        self.regex_string += "[" + character_set + "]"
        return self

    def any_digit(self):
        """Append a digit class (\\d)."""
        self.regex_string += "\\d"
        return self

    def num_occurrences(self, number):
        """Append an exact-count quantifier {number}; number must be >= 1."""
        if number < 1:
            # Previously raised a bare, message-less ValueError, unlike every
            # other validator in this class.
            raise ValueError("Number of occurrences should be at least 1")

        self.regex_string += "{" + str(number) + "}"
        return self

    def one_or_more_occurrences(self):
        """Append a + quantifier."""
        self.regex_string += "+"
        return self

    def zero_or_more_occurrences(self):
        """Append a * quantifier."""
        self.regex_string += "*"
        return self

    def zero_or_one_occurrences(self):
        """Append a ? quantifier."""
        self.regex_string += "?"
        return self

    def range_occurrences(self, start, end):
        """Append a bounded quantifier {start,end}; both bounds must be ints."""
        if not self.__is_numeric(start) or not self.__is_numeric(end):
            raise TypeError("Range should be integers")

        self.__validate_range(start, end)
        self.regex_string += "{" + str(start) + "," + str(end) + "}"
        return self

    def literal(self, literal):
        """Append the given text verbatim (caller is responsible for escaping)."""
        self.regex_string += literal
        return self

    def build(self):
        """Return the accumulated regular-expression string."""
        return self.regex_string


# ---- pipeline stage / config keys shared across the src_spark package ----
ACQUIRE = "acquire"
FILE_PATH = "file_path"
ANALYZE = "analyze"
REPORT = "report"
LOCATION = "location"
REPORT_LEVEL = "level"
OUTPUT_FILE_PATH = "output_file_path"
class Main():
    """Entry point: wires together CSV parsing, PII detection, reporting and writing."""

    def __init__(self, config_file_path):
        # Load the JSON configuration that drives every stage of the pipeline.
        with open(config_file_path) as config_file:
            self.config = json.load(config_file)

    #TODO : validate the config for the stages right here
    def run(self):
        """Execute acquire -> analyze -> report/write stages."""
        spark_session = SparkSession.builder \
            .master("local") \
            .appName("PIIDetector") \
            .getOrCreate()
        input_frame = CsvParser(spark_session, config=self.config[ACQUIRE]).parse()
        pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(input_frame)

        report_generator = ReportGenerator(config=self.config[REPORT])
        if report_generator.is_empty_report_dataframe(pii_analysis_report):
            print("NO PII VALUES WERE FOUND!")
        else:
            # NOTE(review): the redacted CSV is only written when PII was found --
            # confirm skipping the write on clean input is intended.
            report_generator.generate(results_df=pii_analysis_report)
            CsvWriter(spark_session, config=self.config).write_csv(df=redacted_data_frame)


def get_args():
    """Parse command-line arguments; --config-file is mandatory."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--config-file', help='config file to run the tool')
    parsed = arg_parser.parse_args()
    if not parsed.config_file:
        raise ValueError("Config file path should be provided for the tool to run.")
    return parsed


if __name__ == "__main__":
    Main(get_args().config_file).run()
from datetime import datetime
from enum import Enum
import os
import logging

import pandas as pd
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import Row
from src_spark.constants import LOCATION, REPORT_LEVEL


class ReportLevel(Enum):
    """Supported report verbosity levels (read from the config "level" key)."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

class ReportGenerator():
    """Builds, prints and logs a PII analysis report from a results DataFrame.

    Each cell of the results DataFrame is a (possibly empty) list of analyzer
    results whose row/column layout mirrors the scanned input data.
    """

    def __init__(self, config):
        # config is the "report" section of the tool's JSON config.
        self.report_file_location = config[LOCATION]
        self.report_level = config[REPORT_LEVEL]
        self.setup_logging_config()
        # Cache for is_empty_report_dataframe(); None means "not computed yet".
        self.dataframe_is_empty = None

    def setup_logging_config(self):
        """Attach a file handler appending to (or creating) today's report log."""
        date = datetime.today().strftime("%Y%m%d")
        file_name = "{}/report_{}.log".format(self.report_file_location, date)
        if os.path.exists(file_name):
            mode = "a"  # today's log already exists: append
        else:
            if not os.path.exists(self.report_file_location):
                os.makedirs(self.report_file_location)
            mode = "x"  # exclusive create: fails if the file appears meanwhile
        file_handler = logging.FileHandler(filename=file_name, mode=mode)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logging.getLogger().addHandler(file_handler)
        logging.getLogger().setLevel(logging.INFO)

    def __generate_high_level_report(self, results_df: DataFrame):
        """High-level report: just the names of the columns that were scanned."""
        columns = results_df.columns
        report_df = pd.DataFrame({"Columns with PII values" : columns})
        return report_df

    def __calculate_percentage(self, item_count, total_count):
        return round((item_count/total_count) * 100.0, 2)

    def _get_detector_results(self, row: Row, columns: list):
        """Map one results row to [((column, detector), 1), ...] count pairs.

        Cells with no findings are counted under the sentinel detector
        "no_pii" so that per-column totals stay consistent.
        """
        new_row = []
        for index, cell in enumerate(row):
            current_col = columns[index]
            if cell != []:
                for analyzer_result in cell:
                    detector = analyzer_result["type"]
                    new_row.append(((current_col, detector), 1))
            else:
                new_row.append(((current_col, "no_pii"), 1))
        return new_row

    def __get_list_of_detectors(self, detector_results):
        """Distinct detector names seen in the results (excluding "no_pii")."""
        report_detectors = []
        for key, _ in detector_results:
            detector = key[1]
            if detector not in report_detectors and detector != "no_pii":
                report_detectors.append(detector)
        return report_detectors

    def spark_generate_medium_level_report(self, results_df: DataFrame) -> pd.DataFrame:
        """Medium-level report: per column, per detector (count, "pct%") cells."""
        columns = results_df.columns
        detector_results = results_df.rdd.flatMap(lambda row: self._get_detector_results(row, columns)).reduceByKey(lambda acc, next: acc + next).collect()
        report_detectors = self.__get_list_of_detectors(detector_results)
        num_rows = results_df.count()
        pd_columns = []
        for column in columns:
            detection_stats = self.__get_detection_stats(column, report_detectors, detector_results, num_rows)
            pd_columns.append(pd.Series(data=detection_stats, index=report_detectors, name=column))
        report_df = pd.concat(pd_columns, axis=1).fillna(0)
        return report_df

    # note: column is a single column name (str), not a list -- annotation fixed
    def __get_detection_stats(self, column: str, report_detectors: list, detector_results: list, num_rows: int) -> dict:
        """Build {detector: (count, "pct%")} for a single column."""
        detection_stats = {}
        default_value = ()
        for detector in report_detectors:
            column_detector_count = next(filter(lambda result: result[0] == (column, detector), detector_results), default_value)
            if len(column_detector_count) > 0:
                count = column_detector_count[1]
                percentage_value = self.__calculate_percentage(item_count=count, total_count=num_rows)
                detection_stats[detector] = (count, f"{percentage_value}%")
        return detection_stats

    def generate_report_content(self, results_df: DataFrame) -> pd.DataFrame:
        """Dispatch on the configured level.

        MEDIUM, LOW and any unrecognised level currently all produce the
        medium-level report (the original redundant elif fell through to the
        same call).
        """
        if self.report_level == ReportLevel.HIGH.value:
            return self.__generate_high_level_report(results_df)
        return self.spark_generate_medium_level_report(results_df)

    def __print(self, msg):
        # Mirror every console message into the report log file.
        formatted_msg = f"\n{msg}"
        print(formatted_msg)
        logging.info(formatted_msg)

    def __print_report(self, report):
        self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n")
        if report.empty:
            self.__print("NO PII VALUES WERE FOUND!")
        else:
            self.__print(report)
        self.__print("\n\n****************************DONE!**************************\n\n")

    def generate(self, results_df: DataFrame):
        """Generate, print and log the report; return the report DataFrame.

        NOTE(review): when the frame is empty this prints a notice but still
        builds the (empty) report, mirroring the original behaviour -- the
        caller (main) is expected to skip generate() for empty results.
        """
        if self.is_empty_report_dataframe(results_df):
            print("NO PII VALUES WERE FOUND!")

        final_report = self.generate_report_content(results_df)
        self.__print_report(final_report)
        return final_report

    def is_empty_report_dataframe(self, results_df: DataFrame) -> bool:
        """True when every cell of results_df is an empty list (no PII found).

        The answer is computed once per instance and cached.
        """
        if self.dataframe_is_empty is None:  # was "== None": use identity check for None
            self.dataframe_is_empty = results_df.rdd.flatMap(lambda row: self._row_is_empty_list(row)).reduce(lambda acc, item: acc and item)
        return self.dataframe_is_empty

    def _row_is_empty_list(self, row: Row) -> map:
        # Lazily yields True for each empty cell in the row ("True if x else
        # False" anti-idiom removed; the comparison already yields a bool).
        return map(lambda cell: cell == [], row)
from unittest import TestCase
from unittest.mock import patch, MagicMock

import pandas as pd
from pandas._testing import assert_frame_equal
from freezegun import freeze_time
from pyspark.sql.session import SparkSession
from pyspark.sql.types import Row, StructField, StructType, ArrayType, StringType, LongType
from src_spark.report.report_generator import ReportGenerator
from src_spark.analyze.utils.analyzer_result import AnalyzerResult


class TestReportGenerator(TestCase):
    """Unit tests for the spark-variant ReportGenerator."""

    @patch("src_spark.report.report_generator.ReportGenerator.setup_logging_config")
    def setUp(self, mock_setup_logging_config):
        self.SPARK = SparkSession.builder \
            .master("local") \
            .appName("Test PIIDetector") \
            .getOrCreate()

        # Schema matching what PIIDetector produces: each cell is a list of
        # analyzer-result structs.
        self.array_structtype = StructType([
            StructField("end", LongType(), False),
            StructField("start", LongType(), False),
            StructField("text", StringType(), False),
            StructField("type", StringType(), False)
        ])
        self.schema = StructType([
            StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
            StructField("phone number", ArrayType(self.array_structtype, True), nullable=False)
        ])
        # Logging setup is mocked out so tests do not touch the filesystem here.
        self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
        mock_setup_logging_config.assert_called_with()
        self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
        mock_setup_logging_config.assert_called_with()

    def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
                ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
            ],
            self.schema
        )
        expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
        self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(test_data_frame))

    def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
                ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
            ],
            self.schema
        )

        expected_data_frame = pd.DataFrame({
            "summary": [(1, "50.0%"), 0, (1, "50.0%")],
            "phone number": [0, (1, "50.0%"), (1, "50.0%")]
        }, index=["NRIC", "EMAIL", "PHONE_NUMBER"])

        self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame))

    def test_that_medium_level_reporting_returns_correct_data_frame(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
                ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
            ],
            self.schema
        )

        expected_data_frame = pd.DataFrame({
            "summary": [(1, "50.0%"), 0, (1, "50.0%")],
            "phone number": [0, (1, "50.0%"), (1, "50.0%")]
        }, index=["NRIC", "EMAIL", "PHONE_NUMBER"])

        actual = self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame)
        assert_frame_equal(actual, expected_data_frame)

    @patch("logging.info")
    # FIX: was patching "src.report.report_generator..." (the non-spark package),
    # which left the real generate_report_content in place -- the test only
    # passed incidentally. Patch the module this file actually imports from.
    @patch("src_spark.report.report_generator.ReportGenerator.generate_report_content")
    def test_generate_report_calls_content_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
                ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
            ],
            self.schema
        )
        mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
        mock_logging.return_value = None
        expected_result = self.report_generator_high_level.generate(test_data_frame)
        self.assertCountEqual(expected_result, mock_generate_content.return_value)

    @freeze_time('2019-05-29 01:01:03')
    @patch("os.makedirs")
    @patch("logging.FileHandler")
    @patch("logging.Logger.addHandler")
    # FIX: was patching "genericpath.exists", which does not affect the
    # os.path.exists binding the production code calls; os.makedirs is now
    # mocked too so the test no longer creates a real "abc" directory.
    @patch("os.path.exists")
    def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler, mock_makedirs):
        mock_file_exists.return_value = False
        mock_file_handler.return_value = MagicMock()
        self.report_generator_high_level.setup_logging_config()
        mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
        mock_add_handler.assert_called_with(mock_file_handler.return_value)

    @freeze_time('2019-05-29 01:01:03')
    @patch("logging.FileHandler")
    @patch("logging.Logger.addHandler")
    @patch("os.path.exists")
    def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
        mock_file_exists.return_value = True
        mock_file_handler.return_value = MagicMock()
        self.report_generator_high_level.setup_logging_config()
        mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
        mock_add_handler.assert_called_with(mock_file_handler.return_value)

    def test_that_when_report_dataframe_contains_only_empty_lists_it_is_considered_empty(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([], []),
                ([], [])
            ],
            self.schema
        )

        actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
        expected = True

        self.assertEqual(actual, expected)

    def test_that_when_report_dataframe_contains_some_text_it_is_not_considered_empty(self):
        test_data_frame = self.SPARK.createDataFrame(
            [
                ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], []),
                ([], [])
            ],
            self.schema
        )

        actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
        expected = False

        self.assertEqual(actual, expected)

    def test_that_get_detector_results_returns_list_of_detector_results(self):
        columns = ["summary", "phone_number"]
        test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[Row(end=60, start=45, text='test@sample.com', type='EMAIL')])
        actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
        expected = [(('summary', 'NRIC'), 1), (('phone_number', 'EMAIL'), 1)]
        self.assertEqual(actual, expected)

    def test_that_get_detector_results_returns_list_of_detector_results_if_column_is_empty(self):
        columns = ["summary", "phone_number"]
        test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[])
        actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
        # empty cells are counted under the "no_pii" sentinel detector
        expected = [(('summary', 'NRIC'), 1), (('phone_number', 'no_pii'), 1)]
        self.assertEqual(actual, expected)
from pyspark.sql import SparkSession, DataFrame
# FIX: import from src_spark (this package), not src -- every sibling module in
# src_spark (main.py, report_generator.py) imports src_spark.constants, and the
# spark variant must not depend on the plain src package being on the path.
from src_spark.constants import OUTPUT_FILE_PATH, FILE_PATH


class CsvWriter():
    """Writes the redacted DataFrame out as CSV under the configured output path."""

    def __init__(self, spark: SparkSession, config):
        """config must provide acquire.file_path and anonymize.output_file_path."""
        self.__validate_config(config)
        self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
        self.input_file_name = config["acquire"][FILE_PATH]
        self.spark = spark

    def __validate_config(self, config):
        # Fail fast when the anonymize section or its output path is missing/empty.
        if "anonymize" not in config or not config["anonymize"] or OUTPUT_FILE_PATH not in config["anonymize"] or not config["anonymize"][OUTPUT_FILE_PATH]:
            raise ValueError("Config 'output_file_path' needs to be provided for parsing")

    def get_output_file_path(self):
        """Derive "<output_path>/<input stem>_anonymized_.csv" from the input file name."""
        file_name = self.input_file_name.split('/')[-1]
        file_name_no_extension = file_name.split('.')[0]
        result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
        return result

    def write_csv(self, df: DataFrame):
        """Write df as CSV (Spark writes a directory of part files at this path)."""
        df.write.csv(self.get_output_file_path())
test_invalid_config_gets_caught_during_initialization(self): 15 | context = {} 16 | with self.assertRaises(ValueError) as ve: 17 | CsvWriter(self.SPARK, config=context) 18 | self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing") 19 | 20 | def test_correct_output_path_is_generated(self): 21 | context = { 22 | "acquire": { 23 | "file_path": "/anonymizer/test_data.csv", 24 | "delimiter": "," 25 | }, 26 | "anonymize": { 27 | "output_file_path" : "/anonymizer/output" 28 | } 29 | } 30 | input_file_name = "test_data" 31 | output_directory = "/anonymizer/output" 32 | expected = f"{output_directory}/{input_file_name}_anonymized_.csv" 33 | writer = CsvWriter(spark=self.SPARK, config=context) 34 | self.assertEqual(writer.get_output_file_path(), expected) 35 | 36 | -------------------------------------------------------------------------------- /test_data.csv: -------------------------------------------------------------------------------- 1 | National ID,Phone Number,Address,Remarks 2 | S0000001I,+65 91264944,112 Bedok,A typical email id would look something like test@sample.com 3 | S00000dfs,+65 91264944,112 Bedok,A typical email id would look something like ANC --------------------------------------------------------------------------------