├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── .gitignore
├── README.md
├── bin
│   ├── color_my_terminal.sh
│   ├── run_tests.sh
│   └── setup_venv_locally.sh
├── config.json
├── docker-compose.yml
├── no_pii_data.csv
├── requirements-dev.txt
├── requirements.txt
├── setup.py
├── src
│   ├── __init__.py
│   ├── acquire
│   │   ├── __init__.py
│   │   ├── csv_parser.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── data
│   │       │   ├── comma_delimited_file.csv
│   │       │   ├── empty.csv
│   │       │   ├── missing_comma.csv
│   │       │   └── pipe_delimited_file.csv
│   │       └── test_csv_parser.py
│   ├── analyze
│   │   ├── __init__.py
│   │   ├── detectors
│   │   │   ├── __init__.py
│   │   │   ├── base_detector.py
│   │   │   ├── credit_card_detector.py
│   │   │   ├── email_detector.py
│   │   │   ├── national_id_detector.py
│   │   │   ├── phone_number_detector.py
│   │   │   ├── pii_detector.py
│   │   │   └── tests
│   │   │       ├── __init__.py
│   │   │       ├── test_base_detector.py
│   │   │       ├── test_credit_card_detector.py
│   │   │       ├── test_email_detector.py
│   │   │       ├── test_national_id_detector.py
│   │   │       ├── test_phone_number_detector.py
│   │   │       └── test_pii_detector.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── analyzer_result.py
│   │       ├── regex.py
│   │       └── tests
│   │           ├── __init__.py
│   │           ├── test_analyzer_result.py
│   │           └── test_regex.py
│   ├── anonymize
│   │   ├── __init__.py
│   │   ├── anonymizer_result.py
│   │   ├── drop_anonymizer.py
│   │   └── tests
│   │       ├── __init__.py
│   │       └── test_drop_anonymizer.py
│   ├── constants.py
│   ├── dpf_main.py
│   ├── report
│   │   ├── __init__.py
│   │   ├── report_generator.py
│   │   └── tests
│   │       ├── __init__.py
│   │       └── test_report_generator.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   └── test_config.json
│   │   └── test_dpf_main.py
│   └── write
│       ├── __init__.py
│       ├── csv_writer.py
│       └── tests
│           ├── __init__.py
│           └── test_csv_writer.py
├── src_spark
│   ├── __init__.py
│   ├── acquire
│   │   ├── __init__.py
│   │   ├── csv_parser.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── data
│   │       │   ├── comma_delimited_file.csv
│   │       │   ├── empty.csv
│   │       │   ├── missing_comma.csv
│   │       │   └── pipe_delimited_file.csv
│   │       └── test_csv_parser.py
│   ├── analyze
│   │   ├── __init__.py
│   │   ├── detectors
│   │   │   ├── __init__.py
│   │   │   ├── base_detector.py
│   │   │   ├── credit_card_detector.py
│   │   │   ├── email_detector.py
│   │   │   ├── national_id_detector.py
│   │   │   ├── phone_number_detector.py
│   │   │   ├── pii_detector.py
│   │   │   └── tests
│   │   │       ├── __init__.py
│   │   │       └── test_pii_detector.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── analyzer_result.py
│   │       └── regex.py
│   ├── constants.py
│   ├── main.py
│   ├── report
│   │   ├── __init__.py
│   │   ├── report_generator.py
│   │   └── tests
│   │       ├── __init__.py
│   │       └── test_report_generator.py
│   └── write
│       ├── __init__.py
│       ├── csv_writer.py
│       └── tests
│           ├── __init__.py
│           └── test_csv_writer.py
└── test_data.csv
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master ]
20 | schedule:
21 | - cron: '30 17 * * 5'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v2
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v1
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v1
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v1
71 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.vscode
2 | /.idea
3 | __pycache__
4 | *.pyc
5 | /.venv
6 | /venv
7 | /output
8 | /dist
9 | /build/lib
10 | .pytest_cache
11 | .coverage
12 | pyspark_output
13 | pyspark_config.json
14 | *.csv
15 | generate_fake_data.py
16 | scratchpad.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Protection Framework
2 | The Data Protection Framework is a Python library and command-line application for the identification, anonymization and de-anonymization of Personally Identifiable Information (PII).
3 | 
4 | The framework works on a two-fold principle for detecting PII:
5 | 1. Regular expressions that match known patterns
6 | 2. NLP-based Named Entity Recognition (NER)
7 |
8 | ## Features and Current Status
9 |
10 | ### Completed
11 | * The following global detectors have been completed:
12 |   * [x] EMAIL_ADDRESS : An email address identifies the mailbox that emails are sent to or from. The maximum length of the domain name is 255 characters, and the maximum length of the local-part is 64 characters.
13 |   * [x] CREDIT_CARD_NUMBER : A credit card number is 12 to 19 digits long and is used for payment transactions globally.
14 | 
15 | * The following detectors specific to Singapore have been completed:
16 |   * [x] PHONE_NUMBER : A telephone number.
17 |   * [x] FIN/NRIC : A unique set of nine alphanumeric characters on the Singapore National Registration Identity Card.
18 | 
19 | * The following anonymizers have been added:
20 |   * [x] Redaction : Deletes all or part of a detected sensitive value.
21 |   * [x] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
22 |
23 | ### TO-DO
24 | The following features are part of the backlog, with more coming soon:
25 | * Detectors:
26 | * [ ] NAME
27 | * [ ] ADDRESS
28 | * Anonymizers:
29 | * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*).
30 | * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range,
31 | or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.")
32 | * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value.
33 |
34 |
35 | You can have a detailed look at upcoming features and the backlog in this [Github Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true).
36 |
37 | ## Development setup
38 |
39 | Clone the [repo](https://github.com/thoughtworks-datakind/anonymizer) and follow the instructions below
40 | (assuming `$PWD` is where you cloned the repo):
41 | 1. Set up the venv : `./bin/setup_venv_locally.sh`
42 | 2. Activate the venv : `source ./.venv/bin/activate`
43 | 3. Install dependencies : `pip install -r requirements-dev.txt`
44 |
45 | ### Config JSON
46 | An example for the config JSON is located at `/config.json`
47 | ```
48 | {
49 |   "acquire": {
50 |     "file_path": "<path to the input file>",
51 |     "delimiter": "<delimiter, e.g. \",\">"
52 |   },
53 |   "analyze": {
54 | 
55 |   },
56 |   "report": {
57 |     "location": "<directory for the PII detection report>",
58 |     "level": "<'high' or 'medium'>"
59 |   },
60 |   "anonymize": {
61 |     "output_file_path": "<directory for the anonymized output>"
62 |   }
63 | }
64 | ```
65 |
66 | ### Running Tests
67 | First, update `/src/tests/config/test_config.json`. \
68 | Then run the tests by triggering the shell script located at `/bin/run_tests.sh`.
69 |
70 | ### Trying out on local
71 |
72 | #### Anonymizing a delimited CSV file
73 | 1. Set up a JSON config file similar to the one at the project root.
74 |    In the 'acquire' section of the JSON, populate the input file path and the delimiter.
75 |    In the 'report' section, provide the output path where you want the PII detection report to be generated.
76 |    A 'high' level report just calls out which columns have PII attributes.
77 |    A 'medium' level report calls out the percentage of PII in each column and the associated PII type (email, credit card, etc.) for the same.
78 | 2. Run the main class: `python src/dpf_main.py --config-file <path to the config file>`
79 |    You should see the report appended to a file named 'report_\.log' in the output path specified in the
80 |    config file.
81 |
82 | ### Packaging
83 | Run `python setup.py bdist_wheel` and the `.whl` file will be created in the `dist` folder.
84 |
85 | ### Spark-submit
86 | To run spark-submit locally, run the following command:
87 | `spark-submit --py-files dist/SomePackage-*.whl src_spark/main.py --config config.json`
88 |
89 |
90 | ### Licensing
91 | Distributed under the MIT license. See ``LICENSE`` for more information.
92 |
93 |
94 | ### Contributing
95 |
96 | You want to help out? _Awesome_!
97 |
98 |
--------------------------------------------------------------------------------
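For a quick feel of the pipeline described in the README, here is a minimal usage sketch of the library API (assuming the repo root is on `PYTHONPATH`; the class, method and expected values mirror `src/analyze/detectors/pii_detector.py` and its tests):

```python
# Minimal sketch: detect and redact PII in a single string.
# Assumes the repository root is on PYTHONPATH.
from src.analyze.detectors.pii_detector import PIIDetector

result = PIIDetector().analyze_and_redact(
    "A typical email id would look something like test@sample.com")
print(result.redacted_text)
# A typical email id would look something like
print(result.analyzer_results)
# [Text test@sample.com at position (45,60) was identified as EMAIL]
```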
/bin/color_my_terminal.sh:
--------------------------------------------------------------------------------
1 | export PS1="${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\] \$ "
--------------------------------------------------------------------------------
/bin/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | project_path="$(dirname "$0")/.."
4 | 
5 | export PYTHONPATH="$project_path"
6 |
7 | coverage run --source='./src' --omit='*/tests/*' -m unittest discover .
8 | coverage report -m
--------------------------------------------------------------------------------
/bin/setup_venv_locally.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | project_path="$(cd "$(dirname "$0")/.." && pwd)"
4 | 
5 | cd "${project_path}"
6 | export PYTHONPATH="${project_path}"
7 | 
8 | echo "Creating virtual environment."
9 | python3 -m venv .venv
10 | source .venv/bin/activate
11 | 
12 | curl https://bootstrap.pypa.io/get-pip.py | python
13 | pip install -r requirements-dev.txt
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "acquire": {
3 | "file_path": "./test_data.csv",
4 | "delimiter": ","
5 | },
6 | "analyze": {},
7 | "report": {
8 | "location": "./output",
9 | "level": "medium"
10 | },
11 | "anonymize": {
12 | "output_file_path": "./output"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | spark-master:
5 | image: docker.io/bitnami/spark:3.1.2
6 | environment:
7 | - SPARK_MODE=master
8 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
9 | - SPARK_RPC_ENCRYPTION_ENABLED=no
10 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
11 | - SPARK_SSL_ENABLED=no
12 | ports:
13 | - '8080:8080'
14 | - '7077:7077'
15 | networks:
16 | - spark
17 | spark-worker-1:
18 | image: docker.io/bitnami/spark:3.1.2
19 | environment:
20 | - SPARK_MODE=worker
21 |       - SPARK_MASTER_URL=spark://spark-master:7077
22 | - SPARK_WORKER_MEMORY=1G
23 | - SPARK_WORKER_CORES=1
24 | - SPARK_RPC_AUTHENTICATION_ENABLED=no
25 | - SPARK_RPC_ENCRYPTION_ENABLED=no
26 | - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
27 | - SPARK_SSL_ENABLED=no
28 | networks:
29 | - spark
30 | depends_on:
31 | - spark-master
32 |
33 | networks:
34 | spark:
35 | driver: bridge
--------------------------------------------------------------------------------
/no_pii_data.csv:
--------------------------------------------------------------------------------
1 | Address,Remarks
2 | 112 Bedok,Good
3 | 112 Bedok,Average
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 |
3 | coverage==5.5
4 | pytest==6.2.5
5 | freezegun==1.1.0
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.2
2 | attrs==21.2.0
3 | backcall==0.2.0
4 | coverage==5.5
5 | debugpy==1.4.3
6 | decorator==5.1.0
7 | entrypoints==0.3
8 | Faker==8.14.0
9 | freezegun==1.1.0
10 | iniconfig==1.1.1
11 | ipykernel==6.4.1
12 | ipython==7.27.0
13 | ipython-genutils==0.2.0
14 | jedi==0.18.0
15 | jupyter-client==7.0.3
16 | jupyter-core==4.8.1
17 | matplotlib-inline==0.1.3
18 | nest-asyncio==1.5.1
19 | numpy==1.21.2
20 | packaging==21.0
21 | pandas==1.3.3
22 | parso==0.8.2
23 | pexpect==4.8.0
24 | pickleshare==0.7.5
25 | pluggy==1.0.0
26 | prompt-toolkit==3.0.20
27 | ptyprocess==0.7.0
28 | py==1.10.0
29 | py4j==0.10.9
30 | Pygments==2.10.0
31 | pyparsing==2.4.7
32 | pyspark==3.1.2
33 | pytest==6.2.5
34 | python-dateutil==2.8.2
35 | pytz==2021.1
36 | pyzmq==22.3.0
37 | six==1.16.0
38 | text-unidecode==1.3
39 | toml==0.10.2
40 | tornado==6.1
41 | traitlets==5.1.0
42 | wcwidth==0.2.5
43 | wheel==0.37.0
44 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 |     name='SomePackage',
5 |     version='0.1',
6 |     packages=find_packages()
7 | )
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/__init__.py
--------------------------------------------------------------------------------
/src/acquire/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/__init__.py
--------------------------------------------------------------------------------
/src/acquire/csv_parser.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from src.constants import FILE_PATH
4 |
5 |
6 | class CsvParser:
7 |
8 | def __init__(self, config):
9 | self.__validate_config(config)
10 |         self.input_path = config[FILE_PATH]
11 |         self.delimiter = config.get("delimiter") or ","
12 |
13 | def __validate_config(self, config):
14 | if FILE_PATH not in config or not config[FILE_PATH]:
15 | raise ValueError("Config 'file_path' needs to be provided for parsing")
16 |
17 | def parse(self):
18 | try:
19 | df = pd.read_csv(self.input_path, delimiter=self.delimiter)
20 | except pd.errors.EmptyDataError:
21 | return pd.DataFrame({})
22 |
23 | if df.isnull().values.any():
24 | raise ValueError("Dataframe contains NULL values")
25 |
26 | return df
27 |
--------------------------------------------------------------------------------
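A short usage sketch for `CsvParser` (the input path is hypothetical; the config keys match the tests below):

```python
# Usage sketch for CsvParser; "./input.csv" is a hypothetical path.
from src.acquire.csv_parser import CsvParser

parser = CsvParser(config={"file_path": "./input.csv", "delimiter": "|"})
df = parser.parse()  # returns a pandas DataFrame; raises ValueError on NULLs
```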
/src/acquire/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/tests/__init__.py
--------------------------------------------------------------------------------
/src/acquire/tests/data/comma_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name,ssn
2 | Lisa Beard,557-39-2479
--------------------------------------------------------------------------------
/src/acquire/tests/data/empty.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/tests/data/empty.csv
--------------------------------------------------------------------------------
/src/acquire/tests/data/missing_comma.csv:
--------------------------------------------------------------------------------
1 | name,ssn,age
2 | Lisa Beard,557-39-2479,33
3 | John Sohn,33
--------------------------------------------------------------------------------
/src/acquire/tests/data/pipe_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name|ssn
2 | Lisa Beard|557-39-2479
--------------------------------------------------------------------------------
/src/acquire/tests/test_csv_parser.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 | import pandas as pd
4 | from src.acquire.csv_parser import CsvParser
5 |
6 |
7 | class TestCsvParser(TestCase):
8 |
9 | def setUp(self):
10 | self.current_dir = os.path.dirname(os.path.realpath(__file__))
11 |
12 | def test_invalid_config_gets_caught_during_initialization(self):
13 | context = {}
14 | with self.assertRaises(ValueError) as ve:
15 | CsvParser(config=context)
16 | self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")
17 |
18 | def test_if_valid_csv_file_provided_returns_pandas_df(self):
19 | file_path = "{}/data/comma_delimited_file.csv".format(self.current_dir)
20 | config = {"file_path" : file_path, "delimiter" : ""}
21 | test_csv_parser_valid_file_path = CsvParser(config=config)
22 | expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
23 | actual = test_csv_parser_valid_file_path.parse()
24 | self.assertEqual(actual.to_dict(), expected.to_dict())
25 |
26 | def test_if_valid_csv_file_with_different_delimiter_provided_returns_pandas_df(self):
27 | file_path = "{}/data/pipe_delimited_file.csv".format(self.current_dir)
28 | config = {"file_path" : file_path, "delimiter" : "|"}
29 | test_csv_parser_valid_file_path = CsvParser(config=config)
30 | expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
31 | actual = test_csv_parser_valid_file_path.parse()
32 | self.assertEqual(actual.to_dict(), expected.to_dict())
33 |
34 | def test_if_empty_csv_file_returns_empty_pandas_df(self):
35 | file_path = "{}/data/empty.csv".format(self.current_dir)
36 | config = {"file_path" : file_path}
37 | test_csv_parser_valid_file_path = CsvParser(config=config)
38 | expected = pd.DataFrame({})
39 | actual = test_csv_parser_valid_file_path.parse()
40 | self.assertEqual(actual.to_dict(), expected.to_dict())
41 |
42 | def test_if_error_is_raised_if_df_has_null_values(self):
43 | file_path = "{}/data/missing_comma.csv".format(self.current_dir)
44 | config = {"file_path" : file_path}
45 | with self.assertRaises(ValueError) as ve:
46 | CsvParser(config=config).parse()
47 | self.assertEqual(str(ve.exception), "Dataframe contains NULL values")
48 |
--------------------------------------------------------------------------------
/src/analyze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/__init__.py
--------------------------------------------------------------------------------
/src/analyze/detectors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/detectors/__init__.py
--------------------------------------------------------------------------------
/src/analyze/detectors/base_detector.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 |
4 | from src.analyze.utils.analyzer_result import AnalyzerResult
5 |
6 |
7 | class BaseDetector(ABC):
8 |
9 | def __init__(self):
10 | self.name = None
11 | self.pattern = None
12 |
13 | @abstractmethod
14 | def get_pattern(self):
15 | pass
16 |
17 | @abstractmethod
18 | def get_name(self):
19 | pass
20 |
21 |     def validate(self, text):
22 |         return True  # subclasses may override with stricter validation, e.g. a checksum
23 |
24 | def execute(self, text):
25 | results = []
26 | matches = re.finditer(self.get_pattern(), text)
27 | for match in matches:
28 |             matched_string = match.group()
29 | if self.validate(matched_string):
30 | results.append(AnalyzerResult(matched_string, self.get_name(), match.start(), match.end()))
31 | return results
32 |
--------------------------------------------------------------------------------
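`BaseDetector` is a small template-method class: subclasses supply a regex pattern and a name, and may override `validate` to filter matches (e.g. with a checksum). A hedged sketch of a custom detector; `ZipCodeDetector` is illustrative and not part of the repo:

```python
# Illustrative custom detector built on BaseDetector; not part of the repo.
from src.analyze.detectors.base_detector import BaseDetector
from src.analyze.utils.regex import RegEx


class ZipCodeDetector(BaseDetector):

    def get_pattern(self):
        return RegEx().any_digit().num_occurrences(6).build()  # "\d{6}"

    def get_name(self):
        return "ZIP_CODE"


print(ZipCodeDetector().execute("Mail it to 408600"))
# [Text 408600 at position (11,17) was identified as ZIP_CODE]
```

Dropping such a module into `src/analyze/detectors/` would let `PIIDetector` pick it up automatically, since it discovers `BaseDetector` subclasses via `pkgutil.walk_packages` (see `pii_detector.py`).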
/src/analyze/detectors/credit_card_detector.py:
--------------------------------------------------------------------------------
1 | from src.analyze.detectors.base_detector import BaseDetector
2 | from src.analyze.utils.regex import RegEx
3 |
4 |
5 | class CreditCardDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "CREDIT_CARD"
9 | self.pattern = RegEx().literal("4").any_digit().num_occurrences(3).pipe() \
10 | .literal("5").range(0, 5).any_digit().num_occurrences(2).pipe() \
11 | .literal("6").any_digit().num_occurrences(3).pipe() \
12 | .literal("1").any_digit().num_occurrences(3).pipe() \
13 | .literal("3").any_digit().num_occurrences(3) \
14 | .one_of("- ").zero_or_one_occurrences() \
15 | .any_digit().range_occurrences(3, 4) \
16 | .one_of("- ").zero_or_one_occurrences() \
17 | .any_digit().range_occurrences(3, 4) \
18 | .one_of("- ").zero_or_one_occurrences() \
19 | .any_digit().range_occurrences(3, 5).build()
20 |
21 | def get_name(self):
22 | return self.name
23 |
24 | def get_pattern(self):
25 | return self.pattern
26 |
27 | def validate(self, text):
28 | def digits_of(n):
29 | return [int(d) for d in str(n)]
30 |
31 | digits = digits_of(text.replace('-', '').replace(' ', ''))
32 | odd_digits = digits[-1::-2]
33 | even_digits = digits[-2::-2]
34 | checksum = sum(odd_digits)
35 |
36 | for d in even_digits:
37 | checksum += sum(digits_of(d * 2))
38 |
39 | return checksum % 10 == 0
40 |
--------------------------------------------------------------------------------
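`validate` above is the Luhn checksum. A standalone sketch of the same computation (`luhn_ok` is a hypothetical helper, not part of the repo; the card numbers come from the tests below):

```python
# Standalone sketch of the Luhn check implemented in validate() above.
def luhn_ok(number: str) -> bool:
    digits = [int(d) for d in number if d.isdigit()]
    checksum = sum(digits[-1::-2])          # digits at odd positions from the right
    for d in digits[-2::-2]:                # digits at even positions from the right
        checksum += sum(divmod(d * 2, 10))  # digit sum of the doubled value
    return checksum % 10 == 0

assert luhn_ok("4012 8888 8888 1881")       # valid test card number
assert not luhn_ok("4012-8888-8888-1882")   # fails the checksum
```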
/src/analyze/detectors/email_detector.py:
--------------------------------------------------------------------------------
1 | from src.analyze.detectors.base_detector import BaseDetector
2 | from src.analyze.utils.regex import RegEx
3 |
4 |
5 | class EmailDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "EMAIL"
9 | self.pattern = RegEx().one_of("a-zA-Z0-9_.+-").one_or_more_occurrences().literal("@").one_of("a-zA-Z0-9-")\
10 | .one_or_more_occurrences().literal("\\.").one_of("a-zA-Z0-9-.").one_or_more_occurrences().build()
11 |
12 | def get_name(self):
13 | return self.name
14 |
15 | def get_pattern(self):
16 | return self.pattern
17 |
--------------------------------------------------------------------------------
/src/analyze/detectors/national_id_detector.py:
--------------------------------------------------------------------------------
1 | from src.analyze.detectors.base_detector import BaseDetector
2 | from src.analyze.utils.regex import RegEx
3 |
4 |
5 | class NationalIdDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "NRIC"
9 | self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
10 |
11 | def get_name(self):
12 | return self.name
13 |
14 | def get_pattern(self):
15 | return self.pattern
16 |
17 | def __get_offset(self, text):
18 | return 4 if text in "TG" else 0
19 |
20 | def __is_NRIC(self, text, loc):
21 | if text[0] in "ST":
22 | return "JZIHGFEDCBA"[loc] == text[8]
23 | return False
24 |
25 | def __is_FIN(self, text, loc):
26 | if text[0] in "FG":
27 | return "XWUTRQPNMLK"[loc] == text[8]
28 | return False
29 |
30 | def validate(self, text):
31 | weight = self.__get_weight(text)
32 | first_character = text[0]
33 | offset = self.__get_offset(first_character)
34 | loc = (offset + weight) % 11
35 | return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)
36 |
37 |     def __get_weight(self, text):
38 |         numbers = [int(digit) for digit in text[1:-1]]
39 |         weights = [2, 7, 6, 5, 4, 3, 2]  # checksum weights for the seven digits
40 |         return sum(number * weight for number, weight in zip(numbers, weights))
41 | 
--------------------------------------------------------------------------------
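The NRIC/FIN check digit works as follows: multiply the seven digits by the weights [2, 7, 6, 5, 4, 3, 2], add an offset of 4 for the newer T/G prefixes, take the sum modulo 11, and look the result up in a prefix-specific letter table. A sketch with values from the detector's tests (`nric_check_letter` is a hypothetical helper):

```python
# Sketch of the check-digit computation behind validate() above;
# nric_check_letter is a hypothetical helper, not part of the repo.
def nric_check_letter(nric: str) -> str:
    weights = [2, 7, 6, 5, 4, 3, 2]
    total = sum(int(d) * w for d, w in zip(nric[1:8], weights))
    offset = 4 if nric[0] in "TG" else 0
    table = "JZIHGFEDCBA" if nric[0] in "ST" else "XWUTRQPNMLK"
    return table[(offset + total) % 11]

assert nric_check_letter("S0000001I") == "I"  # valid old NRIC from the tests
assert nric_check_letter("G0000001P") == "P"  # valid new FIN from the tests
```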
/src/analyze/detectors/phone_number_detector.py:
--------------------------------------------------------------------------------
1 | from src.analyze.detectors.base_detector import BaseDetector
2 | from src.analyze.utils.regex import RegEx
3 |
4 |
5 | class PhoneNumberDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "PHONE_NUMBER"
9 | regex_pipe = RegEx().pipe().build()
10 |
11 | regex_with_country_code_and_no_space = '(\\+65?\\s?[689]\\d{7})'
12 | regex_with_country_code_and_single_space = '(\\+65?\\s?[689]\\d{3} \\d{4})'
13 | regex_no_country_code_and_no_space = '([689]\\d{7})'
14 | regex_no_country_code_and_single_space = '([689]\\d{3} \\d{4})'
15 | regex_with_country_code_in_brackets_and_no_space = '([(]65[)]\\s?[689]\\d{7})'
16 | regex_with_country_code_in_brackets_and_single_space = '([(]65[)]\\s?[689]\\d{3} \\d{4})'
17 |
18 | self.pattern = regex_with_country_code_and_no_space + regex_pipe + \
19 | regex_with_country_code_and_single_space + regex_pipe + \
20 | regex_no_country_code_and_no_space + regex_pipe + \
21 | regex_no_country_code_and_single_space + regex_pipe + \
22 | regex_with_country_code_in_brackets_and_no_space + regex_pipe + \
23 | regex_with_country_code_in_brackets_and_single_space
24 |
25 | def get_name(self):
26 | return self.name
27 |
28 | def get_pattern(self):
29 | return self.pattern
30 |
--------------------------------------------------------------------------------
/src/analyze/detectors/pii_detector.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import pkgutil
3 | import inspect
4 | import sys
5 |
6 | import pandas as pd
7 |
8 | import src.analyze.detectors
9 | from src.analyze.detectors.base_detector import BaseDetector
10 | from src.anonymize.drop_anonymizer import DropAnonymizer
11 | from src.anonymize.anonymizer_result import AnonymizerResult
12 |
13 |
14 | #TODO : refactor this to use the annotations instead of the module path.
15 | class PIIDetector:
16 |
17 | def __init__(self):
18 | self.detectors = self.__get_detector_instances()
19 |
20 | def __get_detector_modules(self):
21 | modules = [modname for importer, modname, ispkg in
22 | pkgutil.walk_packages(path=src.analyze.detectors.__path__,
23 | prefix=src.analyze.detectors.__name__+".")
24 | if "tests" not in modname]
25 | return modules
26 |
27 | def __get_detector_instances(self):
28 | modules = self.__get_detector_modules()
29 | detectors = []
30 | for module in modules:
31 | importlib.import_module(module)
32 | classes = inspect.getmembers(sys.modules[module], inspect.isclass)
33 | for class_name, class_type in classes:
34 | if class_name != "BaseDetector" and issubclass(class_type, BaseDetector):
35 | detectors.append(class_type())
36 | return detectors
37 |
38 | #TODO : Should we make this static?
39 | def analyze_and_redact(self, text: str):
40 | analyzer_results = []
41 | for detector in self.detectors:
42 | analyzer_results = analyzer_results + detector.execute(text)
43 | redacted_text = DropAnonymizer.redact(text, analyzer_results)
44 | return AnonymizerResult(redacted_text, analyzer_results)
45 |
46 | def __contains_pii(self, results):
47 | for result in results:
48 | if len(result.analyzer_results) > 0:
49 | return True
50 | return False
51 |
52 | def analyze_data_frame(self, input_data_frame):
53 | result_df = input_data_frame.applymap(self.analyze_and_redact)
54 | return result_df.applymap(lambda x: x.analyzer_results), result_df.applymap(lambda x: x.redacted_text)
55 |
--------------------------------------------------------------------------------
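A sketch of running the detector across a whole frame with `analyze_data_frame`, which returns a report frame (lists of `AnalyzerResult`s per cell) and a redacted frame (mirroring the tests below):

```python
# Usage sketch for PIIDetector.analyze_data_frame, mirroring the tests below.
import pandas as pd

from src.analyze.detectors.pii_detector import PIIDetector

frame = pd.DataFrame({"summary": ["NRIC was S0000001I", "No sensitive data"]})
report, redacted = PIIDetector().analyze_data_frame(frame)
# report.iloc[0, 0]   -> [Text S0000001I at position (9,18) was identified as NRIC]
# redacted.iloc[0, 0] -> "NRIC was "
```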
/src/analyze/detectors/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/detectors/tests/__init__.py
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_base_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 |
4 | from src.analyze.detectors.base_detector import BaseDetector
5 | from src.analyze.utils.analyzer_result import AnalyzerResult
6 | from src.analyze.utils.regex import RegEx
7 |
8 |
9 | class TestBaseDetector(TestCase):
10 |
11 | def setUp(self):
12 |
13 | class TestClass(BaseDetector):
14 | def get_pattern(self):
15 | return RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
16 |
17 | def get_name(self):
18 | return "NRIC"
19 |
20 | self.test_class = TestClass()
21 |
22 | def test_execute_calls_match_and_validate(self):
23 | results = self.test_class.execute("First President of Singapore NRIC was S0000001I")
24 | self.assertEqual(len(results), 1)
25 | self.assertEqual(AnalyzerResult("S0000001I", "NRIC", 38, 47), results[0])
26 |
27 | def test_execute_returns_all_matches_when_more_than_one(self):
28 | results = self.test_class.execute("First President of Singapore NRIC was S0000001I and the second president's was T0000001R")
29 | self.assertEqual(len(results), 2)
30 | self.assertCountEqual([AnalyzerResult("S0000001I", "NRIC", 38, 47),AnalyzerResult("T0000001R", "NRIC", 79, 88)], results)
31 |
32 | def test_execute_returns_empty_list_when_no_matches(self):
33 | results = self.test_class.execute("First President of Singapore NRIC was ABC and the second president's was DEF")
34 | self.assertEqual(len(results), 0)
35 |
36 | def test_get_name_and_get_patterns_are_abstract(self):
37 | with self.assertRaises(TypeError) as te:
38 | BaseDetector()
39 | self.assertEqual(str(te.exception), "Can't instantiate abstract class BaseDetector with abstract methods get_name, get_pattern")
40 |
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_credit_card_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from src.analyze.detectors.credit_card_detector import CreditCardDetector
4 |
5 |
6 | class TestCreditCardDetector(TestCase):
7 |
8 | def setUp(self):
9 | self.credit_card_detector = CreditCardDetector()
10 |
11 | def test_default_property_values_are_correct(self):
12 | self.assertEqual("CREDIT_CARD", self.credit_card_detector.name)
13 | self.assertEqual('4\\d{3}|5[0-5]\\d{2}|6\\d{3}|1\\d{3}|3\\d{3}[- ]?\\d{3,4}[- ]?\\d{3,4}[- ]?\\d{3,5}',
14 | self.credit_card_detector.pattern)
15 |
16 | def test_valid_credit_cards(self):
17 | self.assertTrue(self.credit_card_detector.validate("4012888888881881"))
18 | self.assertTrue(self.credit_card_detector.validate("4012-8888-8888-1881"))
19 | self.assertTrue(self.credit_card_detector.validate("4012 8888 8888 1881"))
20 |
21 | def test_valid_airplus_credit_card(self):
22 | self.assertTrue(self.credit_card_detector.validate('122000000000003'))
23 |
24 | def test_valid_amex_credit_card(self):
25 | self.assertTrue(self.credit_card_detector.validate('371449635398431'))
26 |
27 | def test_valid_cartebleue_credit_card(self):
28 | self.assertTrue(self.credit_card_detector.validate('5555555555554444'))
29 |
30 | def test_valid_dankort_credit_card(self):
31 | self.assertTrue(self.credit_card_detector.validate('5019717010103742'))
32 |
33 | def test_valid_diners_credit_card(self):
34 | self.assertTrue(self.credit_card_detector.validate('30569309025904'))
35 |
36 | def test_valid_discover_credit_card(self):
37 | self.assertTrue(self.credit_card_detector.validate('6011000400000000'))
38 |
39 | def test_valid_jcb_credit_card(self):
40 | self.assertTrue(self.credit_card_detector.validate('3528000700000000'))
41 |
42 | def test_valid_maestro_credit_card(self):
43 | self.assertTrue(self.credit_card_detector.validate('6759649826438453'))
44 |
45 | def test_valid_mastercard_credit_card(self):
46 | self.assertTrue(self.credit_card_detector.validate('5555555555554444'))
47 |
48 | def test_valid_visa_credit_card(self):
49 | self.assertTrue(self.credit_card_detector.validate('4111111111111111'))
50 |
51 | def test_valid_visa_debit_credit_card(self):
52 | self.assertTrue(self.credit_card_detector.validate('4111111111111111'))
53 |
54 | def test_valid_visa_electron_credit_card(self):
55 | self.assertTrue(self.credit_card_detector.validate('4917300800000000'))
56 |
57 | def test_valid_visa_purchasing_credit_card(self):
58 | self.assertTrue(self.credit_card_detector.validate('4484070000000000'))
59 |
60 | def test_invalid_credit_card(self):
61 | self.assertFalse(self.credit_card_detector.validate('4012-8888-8888-1882'))
62 |
63 | def test_invalid_diners_card(self):
64 | self.assertFalse(self.credit_card_detector.validate('36168002586008'))
65 |
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_email_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from unittest.mock import patch
3 |
4 | from src.analyze.detectors.email_detector import EmailDetector
5 |
6 |
7 | class TestEmailDetector(TestCase):
8 |
9 | def setUp(self):
10 | self.email_detector = EmailDetector()
11 |
12 | def test_get_name_returns_the_valid_detector_name(self):
13 | self.assertEqual(self.email_detector.get_name(), "EMAIL")
14 |
15 | def test_get_pattern_returns_compiled_regex(self):
16 | actual_value = self.email_detector.get_pattern()
17 | return_value = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
18 | self.assertEqual(return_value, actual_value)
19 |
20 | def test_valid_email_gets_detected_correctly(self):
21 | self.assertEqual(len(self.email_detector.execute("abc@hotmail.com")), 1)
22 |
23 | def test_invalid_email_does_not_get_detected(self):
24 | self.assertEqual(len(self.email_detector.execute("@hotmail.com")), 0)
25 |
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_national_id_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from src.analyze.detectors.national_id_detector import NationalIdDetector
4 |
5 |
6 | class TestNationalIdDetector(TestCase):
7 |
8 | def setUp(self):
9 | self.national_id_detector = NationalIdDetector()
10 |
11 | def test_default_property_values_are_correct(self):
12 | self.assertEqual("NRIC", self.national_id_detector.name)
13 | self.assertEqual("[STFG]\\d{7}[A-Z]", self.national_id_detector.pattern)
14 |
15 | def test_execute_return_true_when_valid_old_NRIC(self):
16 | self.assertTrue(self.national_id_detector.validate("S0000001I"))
17 |
18 | def test_execute_return_true_when_valid_old_FIN(self):
19 | self.assertTrue(self.national_id_detector.validate("F0000001U"))
20 |
21 | def test_execute_return_true_when_valid_new_NRIC(self):
22 | self.assertTrue(self.national_id_detector.validate("T0000001E"))
23 |
24 | def test_execute_return_true_when_valid_new_FIN(self):
25 | self.assertTrue(self.national_id_detector.validate("G0000001P"))
26 |
27 | def test_execute_return_false_when_invalid_old_NRIC(self):
28 | self.assertFalse(self.national_id_detector.validate("S0000001K"))
29 |
30 | def test_execute_return_false_when_invalid_new_NRIC(self):
31 | self.assertFalse(self.national_id_detector.validate("F0000001V"))
32 |
33 | def test_execute_return_false_when_invalid_old_FIN(self):
34 | self.assertFalse(self.national_id_detector.validate("T0000001F"))
35 |
36 | def test_execute_return_false_when_invalid_new_FIN(self):
37 | self.assertFalse(self.national_id_detector.validate("G0000001Q"))
38 |
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_phone_number_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from src.analyze.detectors.phone_number_detector import PhoneNumberDetector
4 | from src.analyze.utils.analyzer_result import AnalyzerResult
5 |
6 |
7 | class TestPhoneNumberDetector(TestCase):
8 |
9 | def setUp(self):
10 | self.phone_number_detector = PhoneNumberDetector()
11 |
12 | def test_default_property_values_are_correct(self):
13 | self.assertEqual("PHONE_NUMBER", self.phone_number_detector.name)
14 | self.assertEqual('(\\+65?\\s?[689]\\d{7})|'
15 | '(\\+65?\\s?[689]\\d{3} \\d{4})|'
16 | '([689]\\d{7})|'
17 | '([689]\\d{3} \\d{4})|'
18 | '([(]65[)]\\s?[689]\\d{7})|'
19 | '([(]65[)]\\s?[689]\\d{3} \\d{4})',
20 | self.phone_number_detector.pattern)
21 |
22 | def test_invalid_phone_number_does_not_get_detected(self):
23 | self.assertEqual(len(self.phone_number_detector.execute("S0000001I")), 0)
24 |
25 | def __assert_single_result(self, text_to_be_tested, start, end):
26 | actual = self.phone_number_detector.execute(text_to_be_tested)
27 | expected = AnalyzerResult(text_to_be_tested, "PHONE_NUMBER", start, end)
28 | self.assertEqual(len(actual), 1)
29 | self.assertEqual(expected, actual[0])
30 |
31 | def test_valid_phone_number_gets_detected_correctly(self):
32 | self.__assert_single_result("+65 65781234", 0, 12)
33 | self.__assert_single_result("+65 85781234", 0, 12)
34 | self.__assert_single_result("+65 95781234", 0, 12)
35 |
36 | self.__assert_single_result("+65 6578 1234", 0, 13)
37 | self.__assert_single_result("+65 8578 1234", 0, 13)
38 | self.__assert_single_result("+65 9578 1234", 0, 13)
39 |
40 | self.__assert_single_result("65781234", 0, 8)
41 | self.__assert_single_result("85781234", 0, 8)
42 | self.__assert_single_result("95781234", 0, 8)
43 |
44 | self.__assert_single_result("6578 1234", 0, 9)
45 | self.__assert_single_result("8578 1234", 0, 9)
46 | self.__assert_single_result("9578 1234", 0, 9)
47 |
48 | self.__assert_single_result("(65) 65781234", 0, 13)
49 | self.__assert_single_result("(65) 85781234", 0, 13)
50 | self.__assert_single_result("(65) 95781234", 0, 13)
51 |
52 | self.__assert_single_result("(65) 6578 1234", 0, 14)
53 | self.__assert_single_result("(65) 8578 1234", 0, 14)
54 | self.__assert_single_result("(65) 9578 1234", 0, 14)
55 |
--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_pii_detector.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from unittest import TestCase
3 |
4 | from src.analyze.detectors.pii_detector import PIIDetector
5 | from src.analyze.utils.analyzer_result import AnalyzerResult
6 | from src.anonymize.anonymizer_result import AnonymizerResult
7 |
8 |
9 | class TestPIIDetector(TestCase):
10 |
11 | def setUp(self):
12 | self.pii_detector = PIIDetector()
13 |
14 | def test_should_detect_and_redact_nric_in_text(self):
15 | actual = self.pii_detector.analyze_and_redact("First President of Singapore NRIC was S0000001I")
16 | expected = AnonymizerResult("First President of Singapore NRIC was ", [AnalyzerResult("S0000001I", "NRIC", 38, 47)])
17 | self.assertEqual(actual, expected)
18 |
19 | def test_should_detect_and_redact_email_in_text(self):
20 | actual = self.pii_detector.analyze_and_redact("A typical email id would look something like test@sample.com")
21 | expected = AnonymizerResult("A typical email id would look something like ",
22 | [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)])
23 | self.assertEqual(actual, expected)
24 |
25 | def test_should_detect_and_redact_phone_in_text(self):
26 | actual = self.pii_detector.analyze_and_redact("Some examples of phone numbers are +65 62345678")
27 | expected = AnonymizerResult("Some examples of phone numbers are ",
28 | [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
29 | self.assertEqual(actual, expected)
30 |
31 | def test_should_detect_and_redact_all_pii_fields_in_text(self):
32 | actual = self.pii_detector.analyze_and_redact("""First President of Singapore NRIC was S0000001I.
33 | A typical email id would look something like test@sample.com""")
34 | expected_redacted_text = """First President of Singapore NRIC was .
35 | A typical email id would look something like """
36 |
37 | expected = AnonymizerResult(expected_redacted_text, [AnalyzerResult("test@sample.com", "EMAIL", 135, 150),
38 | AnalyzerResult("S0000001I", "NRIC", 38, 47)])
39 | self.assertEqual(actual, expected)
40 |
41 |     def test_analyze_returns_same_text_and_no_results_when_no_PII_fields(self):
42 | input_text = """First President of Singapore NRIC was ABC.
43 | A typical email id would look something like test"""
44 | actual = self.pii_detector.analyze_and_redact(input_text)
45 | expected = AnonymizerResult(input_text, [])
46 | self.assertEqual(actual, expected)
47 |
48 | def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
49 | test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I",
50 | "A typical email id would look something like test@sample.com"],
51 | "phone number": ["Some examples of phone numbers are +65 62345678",
52 | "Some examples of phone numbers are +65 62345678"]})
53 |
54 | actual, _ = self.pii_detector.analyze_data_frame(test_data_frame)
55 |
56 | expected_data_frame = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)],
57 | [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
58 | "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
59 | [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
60 |
61 | pd.testing.assert_frame_equal(expected_data_frame, actual)
62 |
63 | def test_analyze_data_frame_runs_analyze_against_each_cell_when_there_are_no_PII_values_returns_empty_data_frame(
64 | self):
65 | test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was abcde",
66 | "A typical email id would look something like test@t"],
67 | "phone number": ["Some examples of phone numbers are +34342",
68 | "Some examples of phone numbers are +8909"]})
69 | expected_report = pd.DataFrame({"summary": [[],[]],
70 | "phone number": [[],[]]
71 | })
72 | expected_result = pd.DataFrame({"summary": ["First President of Singapore NRIC was abcde",
73 | "A typical email id would look something like test@t"],
74 | "phone number": ["Some examples of phone numbers are +34342",
75 | "Some examples of phone numbers are +8909"]})
76 | actual_report, actual_result = self.pii_detector.analyze_data_frame(test_data_frame)
77 |
78 | pd.testing.assert_frame_equal(expected_report, actual_report)
79 | pd.testing.assert_frame_equal(expected_result, actual_result)
80 |
81 | def test_analyze_data_frame_runs_analyze_only_on_cells_with_a_PII_value(self):
82 | test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I",
83 | "A typical email id would look something like test@sample.com"],
84 | "remarks": ["No sensitive data",
85 | "No sensitive data"]})
86 |
87 | actual_report, actual_result = self.pii_detector.analyze_data_frame(test_data_frame)
88 |
89 | expected_report = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)],
90 | [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
91 | "remarks": [[],[]]
92 | })
93 |
94 | expected_result = pd.DataFrame({"summary": ["First President of Singapore NRIC was ",
95 | "A typical email id would look something like "],
96 | "remarks": ["No sensitive data",
97 | "No sensitive data"]})
98 |
99 | pd.testing.assert_frame_equal(expected_report, actual_report)
100 | pd.testing.assert_frame_equal(expected_result, actual_result)
--------------------------------------------------------------------------------
/src/analyze/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/utils/__init__.py
--------------------------------------------------------------------------------
/src/analyze/utils/analyzer_result.py:
--------------------------------------------------------------------------------
1 | class AnalyzerResult:
2 |
3 | def __init__(self, text, type, start, end):
4 | self.text = text
5 | self.type = type
6 | self.start = start
7 | self.end = end
8 |
9 | def __eq__(self, other):
10 | return type(self) == type(other) and self.text == other.text and self.type == other.type \
11 | and self.start == other.start and self.end == other.end
12 |
13 | def __repr__(self):
14 | return self.__str__()
15 |
16 | def __str__(self):
17 | return "Text {} at position ({},{}) was identified as {}".format(self.text, self.start, self.end, self.type)
18 |
19 | def detector(self):
20 | return self.type
21 |
--------------------------------------------------------------------------------
/src/analyze/utils/regex.py:
--------------------------------------------------------------------------------
1 | class RegEx:
2 |
3 | def __init__(self):
4 | self.regex_string = ""
5 |
6 | def __is_numeric(self, value):
7 | return isinstance(value, int)
8 |
9 | def __is_single_character_value(self, value):
10 | return len(str(value)) == 1
11 |
12 | def __validate_range(self, start, end):
13 | if start > end:
14 | raise ValueError("Range start should be less than end")
15 |
16 | def boundary(self):
17 | self.regex_string += "\\b"
18 | return self
19 |
20 | def pipe(self):
21 | self.regex_string += "|"
22 | return self
23 |
24 | def range(self, from_char, to_char):
25 | if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
26 | raise ValueError("Range boundaries should be single character")
27 |
28 | self.__validate_range(from_char, to_char)
29 | self.regex_string += "[{}-{}]".format(from_char, to_char)
30 | return self
31 |
32 | def one_of(self, character_set):
33 | if character_set is None or character_set == "":
34 | raise ValueError("Character Set should not be empty")
35 |
36 | self.regex_string += "[" + character_set + "]"
37 | return self
38 |
39 | def any_digit(self):
40 | self.regex_string += "\\d"
41 | return self
42 |
43 | def num_occurrences(self, number):
44 | if number < 1:
45 |             raise ValueError("Number of occurrences should be at least 1")
46 |
47 | self.regex_string += "{" + str(number) + "}"
48 | return self
49 |
50 | def one_or_more_occurrences(self):
51 | self.regex_string += "+"
52 | return self
53 |
54 | def zero_or_more_occurrences(self):
55 | self.regex_string += "*"
56 | return self
57 |
58 | def zero_or_one_occurrences(self):
59 | self.regex_string += "?"
60 | return self
61 |
62 | def range_occurrences(self, start, end):
63 | if not self.__is_numeric(start) or not self.__is_numeric(end):
64 | raise TypeError("Range should be integers")
65 |
66 | self.__validate_range(start, end)
67 | self.regex_string += "{" + str(start) + "," + str(end) + "}"
68 | return self
69 |
70 | def literal(self, literal):
71 | self.regex_string += literal
72 | return self
73 |
74 | def build(self):
75 | return self.regex_string
76 |
--------------------------------------------------------------------------------
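A sketch of composing a pattern with the builder, using the same chain `NationalIdDetector` uses:

```python
# Building the NRIC pattern with the RegEx builder, as the detectors do.
from src.analyze.utils.regex import RegEx

pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
print(pattern)  # [STFG]\d{7}[A-Z]
```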
/src/analyze/utils/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/utils/tests/__init__.py
--------------------------------------------------------------------------------
/src/analyze/utils/tests/test_analyzer_result.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from src.analyze.utils.analyzer_result import AnalyzerResult
4 |
5 |
6 | class TestAnalyzerResult(TestCase):
7 |
8 | def test_equality(self):
9 | expected = AnalyzerResult("text", "type", 0, 10)
10 | actual = AnalyzerResult("text", "type", 0, 10)
11 | self.assertEqual(expected, actual)
12 |
13 | def test_inequality(self):
14 | self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("different_text", "type", 0, 10))
15 | self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "different_type", 0, 10))
16 | self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "type", 1, 10))
17 | self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "type", 0, 11))
18 |
19 | def test_repr(self):
20 | expected = "Text sample_data at position (0,10) was identified as type"
21 | self.assertEqual(AnalyzerResult("sample_data", "type", 0, 10).__repr__(), expected)
22 |
23 | def test_str(self):
24 | expected = "Text sample_data at position (0,10) was identified as type"
25 | self.assertEqual(str(AnalyzerResult("sample_data", "type", 0, 10)), expected)
26 |
27 | def test_get_detector_fetches_detector_type_correctly(self):
28 | result = AnalyzerResult("text", "EMAIL", 0, 10)
29 | self.assertEqual(result.detector(), "EMAIL")
--------------------------------------------------------------------------------
/src/analyze/utils/tests/test_regex.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from src.analyze.utils.regex import RegEx
4 |
5 |
6 | class TestRegEx(TestCase):
7 |
8 | # Testing one_of
9 | def test_when_one_of_param_is_empty_throws_error(self):
10 | self.assertRaises(ValueError, lambda: RegEx().one_of("").build())
11 |
12 | def test_when_valid_input_is_passed_one_of_returns_correct_output(self):
13 | self.assertEqual("[AB]", RegEx().one_of("AB").build())
14 | self.assertEqual("[357]", RegEx().one_of("357").build())
15 |
16 | # Testing num_occurrences
17 | def test_when_non_positive_number_of_occurrences_throws_error(self):
18 | self.assertRaises(ValueError, lambda: RegEx().num_occurrences(-7).build())
19 | self.assertRaises(ValueError, lambda: RegEx().num_occurrences(0).build())
20 |
21 | def test_when_valid_input_is_passed_num_occurrences_returns_correct_output(self):
22 | self.assertEqual("{7}", RegEx().num_occurrences(7).build())
23 |
24 | # Testing any_digit
25 | def test_when_any_digit_returns_correct_output(self):
26 | self.assertEqual("\\d", RegEx().any_digit().build())
27 |
28 | def __assert_value_error_is_raised(self, fn, msg):
29 | with self.assertRaises(ValueError) as ve:
30 | fn()
31 | self.assertEqual(str(ve.exception), msg)
32 |
33 | def __assert_type_error_is_raised(self, fn, msg):
34 | with self.assertRaises(TypeError) as ve:
35 | fn()
36 | self.assertEqual(str(ve.exception), msg)
37 |
38 | # Testing range
39 | def test_when_range_is_incomplete(self):
40 | single_character = "Range boundaries should be single character"
41 | self.__assert_value_error_is_raised(lambda: RegEx().range("", "Z").build(), single_character)
42 | self.__assert_value_error_is_raised(lambda: RegEx().range("0", "").build(), single_character)
43 | self.__assert_value_error_is_raised(lambda: RegEx().range("01", "9").build(), single_character)
44 | self.__assert_value_error_is_raised(lambda: RegEx().range("A", "YZ").build(), single_character)
45 |
46 | def test_when_invalid_range_boundaries_are_provided(self):
47 | less_than_end = "Range start should be less than end"
48 | self.__assert_value_error_is_raised(lambda: RegEx().range("B", "A").build(), less_than_end)
49 | self.__assert_value_error_is_raised(lambda: RegEx().range("9", "0").build(), less_than_end)
50 |
51 | def test_when_valid_input_is_passed_range_returns_correct_output(self):
52 | self.assertEqual("[A-Z]", RegEx().range("A", "Z").build())
53 | self.assertEqual("[0-9]", RegEx().range("0", "9").build())
54 |
55 | # Testing range_occurrences
56 | def test_when_invalid_numeric_range_boundaries_are_provided(self):
57 | less_than_end = "Range start should be less than end"
58 | self.__assert_value_error_is_raised(lambda: RegEx().range_occurrences(9, 0).build(), less_than_end)
59 |
60 | def test_when_invalid_input_for_range_occurrences_throws_error(self):
61 | range_should_be_integers = "Range should be integers"
62 | self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences(1.2, 2).build(), range_should_be_integers)
63 | self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences("A", 9).build(), range_should_be_integers)
64 |
65 | def test_when_valid_input_is_passed_range_occurrences_returns_correct_output(self):
66 | self.assertEqual("{0,9}", RegEx().range_occurrences(0, 9).build())
67 |
68 | # Testing one_or_more_occurrences
69 | def test_when_valid_input_is_passed_one_or_more_occurrences_returns_correct_output(self):
70 | self.assertEqual("+", RegEx().one_or_more_occurrences().build())
71 |
72 | # Testing zero_or_more_occurrences
73 | def test_when_valid_input_is_passed_zero_or_more_occurrences_returns_correct_output(self):
74 | self.assertEqual("*", RegEx().zero_or_more_occurrences().build())
75 |
76 | # Testing zero_or_one_occurrences
77 | def test_when_valid_input_is_passed_zero_or_one_occurrences_returns_correct_output(self):
78 | self.assertEqual("?", RegEx().zero_or_one_occurrences().build())
79 |
80 | # Testing literal
81 | def test_when_valid_input_is_passed_literal_returns_correct_output(self):
82 | self.assertEqual("@", RegEx().literal("@").build())
83 |
84 | # Testing boundary
85 | def test_boundary(self):
86 | self.assertEqual("\\b", RegEx().boundary().build())
87 |
88 | # Testing complex inputs
89 | def test_builds_correct_pattern_for_NRIC(self):
90 | self.assertEqual("[AIR]\\d{7}[A-Z]",
91 | RegEx()
92 | .one_of("AIR")
93 | .any_digit()
94 | .num_occurrences(7)
95 | .range("A", "Z")
96 | .build())
97 |
98 | self.assertEqual("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+",
99 | RegEx()
100 | .one_of("a-zA-Z0-9_.+-")
101 | .one_or_more_occurrences()
102 | .literal("@")
103 | .one_of("a-zA-Z0-9-")
104 | .one_or_more_occurrences()
105 | .literal("\\.")
106 | .one_of("a-zA-Z0-9-.")
107 | .one_or_more_occurrences()
108 | .build())
109 |
--------------------------------------------------------------------------------
/src/anonymize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/anonymize/__init__.py
--------------------------------------------------------------------------------
/src/anonymize/anonymizer_result.py:
--------------------------------------------------------------------------------
1 | class AnonymizerResult:
2 |
3 | def __init__(self, redacted_text, analyzer_results):
4 | self.redacted_text = redacted_text
5 | self.analyzer_results = analyzer_results
6 |
7 | def __eq__(self, other):
8 | return type(self) == type(other) and self.redacted_text == other.redacted_text and self.analyzer_results == other.analyzer_results
9 |
10 | def __repr__(self):
11 | return self.__str__()
12 |
13 | def __str__(self):
14 | return "PII information found: \n{}\nRedacted text: {}".format(self.analyzer_results, self.redacted_text)
15 |
--------------------------------------------------------------------------------
/src/anonymize/drop_anonymizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | from src.analyze.utils.analyzer_result import AnalyzerResult
4 | 
5 | 
6 | class DropAnonymizer:
7 | 
8 |     @staticmethod
9 |     def redact(text: str, analyzer_results: List[AnalyzerResult]) -> str:
10 |         for result in analyzer_results:
11 |             text = text.replace(result.text, "")
12 |         return text
--------------------------------------------------------------------------------
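A minimal usage sketch for DropAnonymizer (hypothetical values; assumes the repository root is on PYTHONPATH):

    from src.analyze.utils.analyzer_result import AnalyzerResult
    from src.anonymize.drop_anonymizer import DropAnonymizer

    # Each AnalyzerResult carries the matched text, detector type, start and end.
    results = [AnalyzerResult("test@sample.com", "EMAIL", 6, 21)]
    print(DropAnonymizer.redact("email test@sample.com inside", results))
    # -> "email  inside"

Note that redaction is plain substring replacement, so every occurrence of a detected value is dropped, not just the span reported by the analyzer.

--------------------------------------------------------------------------------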
/src/anonymize/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/anonymize/tests/__init__.py
--------------------------------------------------------------------------------
/src/anonymize/tests/test_drop_anonymizer.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from src.anonymize.drop_anonymizer import DropAnonymizer
3 | from src.analyze.utils.analyzer_result import AnalyzerResult
4 |
5 |
6 | class TestDropAnonymizer(TestCase):
7 |
8 | def test_redact_for_single_analyzer_result(self):
9 | text = "text containing pii"
10 | analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
11 | result = DropAnonymizer.redact(text, analyzer_results)
12 | self.assertEqual(result, "text containing ")
13 |
14 | def test_redact_for_multiple_analyzer_results(self):
15 | text = "text containing pii1 and pii2"
16 | analyzer_results = [AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
17 | AnalyzerResult("pii2", "PII_DETECTOR", 25, 28)]
18 | result = DropAnonymizer.redact(text, analyzer_results)
19 | self.assertEqual(result, "text containing and ")
20 |
21 |
--------------------------------------------------------------------------------
/src/constants.py:
--------------------------------------------------------------------------------
1 | ACQUIRE="acquire"
2 | FILE_PATH="file_path"
3 | ANALYZE="analyze"
4 | REPORT="report"
5 | LOCATION="location"
6 | REPORT_LEVEL="level"
7 | OUTPUT_FILE_PATH="output_file_path"
--------------------------------------------------------------------------------
/src/dpf_main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.abspath('.'))
4 |
5 | import argparse
6 | import json
7 |
8 | from src.report.report_generator import ReportGenerator
9 | from src.acquire.csv_parser import CsvParser
10 | from src.analyze.detectors.pii_detector import PIIDetector
11 | from src.constants import ACQUIRE, REPORT
12 | from src.write.csv_writer import CsvWriter
13 |
14 |
15 | class DPFMain():
16 |
17 | def __init__(self, config_file_path):
18 | with open(config_file_path) as config_file:
19 | self.config = json.load(config_file)
20 |
21 | #TODO : validate the config for the stages right here
22 | def run(self):
23 | parsed_data_frame = CsvParser(config=self.config[ACQUIRE]).parse()
24 | pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(parsed_data_frame)
25 | if pii_analysis_report.empty:
26 | print("NO PII VALUES WERE FOUND!")
27 | else:
28 |             ReportGenerator(config=self.config[REPORT])\
29 |                 .generate(results_df=pii_analysis_report)
30 |
31 | CsvWriter(config=self.config).write_csv(df=redacted_data_frame)
32 |
33 |
34 | # The output directory is read from 'output_file_path' in the 'anonymize' section of the config JSON.
35 |
36 | def get_args():
37 | parser = argparse.ArgumentParser()
38 | parser.add_argument('--config-file', help='config file to run the tool')
39 | args = parser.parse_args()
40 | if not args.config_file:
41 | raise ValueError("Config file path should be provided for the tool to run.")
42 | return args
43 |
44 | if __name__ == "__main__":
45 | args = get_args()
46 | DPFMain(args.config_file).run()
--------------------------------------------------------------------------------
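The pandas pipeline is driven by a JSON config with "acquire", "report" and "anonymize" sections, the shape exercised by src/tests/config/test_config.json (paths below are illustrative):

    {
      "acquire": {"file_path": "data/input.csv", "delimiter": ","},
      "analyze": {},
      "report": {"location": "report", "level": "high"},
      "anonymize": {"output_file_path": "output"}
    }

    # run the tool from the repository root
    python src/dpf_main.py --config-file config.json

--------------------------------------------------------------------------------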
/src/report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/report/__init__.py
--------------------------------------------------------------------------------
/src/report/report_generator.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from enum import Enum
3 |
4 | import os
5 | import pandas as pd
6 | import logging
7 | from src.constants import LOCATION, REPORT_LEVEL
8 |
9 |
10 | class ReportLevel(Enum):
11 |
12 | HIGH = "high"
13 | MEDIUM = "medium"
14 | LOW = "low"
15 |
16 | class ReportGenerator():
17 |
18 | def __init__(self, config):
19 | self.report_file_location = config[LOCATION]
20 | self.report_level = config[REPORT_LEVEL]
21 | self.setup_logging_config()
22 |
23 | def setup_logging_config(self):
24 | date = datetime.today().strftime("%Y%m%d")
25 | file_name = "{}/report_{}.log".format(self.report_file_location, date)
26 | if os.path.exists(file_name):
27 | mode = "a"
28 | else:
29 | if not os.path.exists(self.report_file_location):
30 | os.makedirs(self.report_file_location)
31 | mode = "x"
32 | file_handler = logging.FileHandler(filename=file_name, mode=mode)
33 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
34 | file_handler.setFormatter(formatter)
35 | logging.getLogger().addHandler(file_handler)
36 | logging.getLogger().setLevel(logging.INFO)
37 |
38 | def __generate_high_level_report(self, results_df):
39 | report_df = pd.DataFrame({"Columns with PII values" : results_df.columns.values})
40 | return report_df
41 |
42 |     def __collate_all_detectors_per_cell(self, analyzer_results):
43 |         return [result.detector() for result in analyzer_results]
44 |
45 | def __calculate_percentage(self, item_count, total_count):
46 | return round((item_count/total_count) * 100.0, 2)
47 |
48 | def __calculate_detector_percentage(self, row_count, count_map):
49 | percentage_map = {}
50 | for key, value in count_map.items():
51 | percentage_map[key] = "{}%".format(self.__calculate_percentage(value, row_count))
52 | return percentage_map
53 |
54 |     def __calculate_detector_count(self, column_series):
55 |         detector_count_map = {}
56 |         for _, analyzer_results in column_series.items():
57 |             if not analyzer_results:
58 |                 continue
59 |             detector_types = self.__collate_all_detectors_per_cell(analyzer_results)
60 |             for detector_type in detector_types:
61 |                 if detector_type not in detector_count_map:
62 |                     detector_count_map[detector_type] = 0
63 |                 detector_count_map[detector_type] += 1
64 |         return detector_count_map
65 |
66 |
67 | #TODO : filter out the NAs before passing through this
68 | def calculate_detector_stats_for_each_column(self, column_series):
69 | stats_map = {}
70 | count_map = self.__calculate_detector_count(column_series)
71 | percentage_map = self.__calculate_detector_percentage(len(column_series), count_map)
72 | for key, value in count_map.items():
73 | stats_tuple = (value, percentage_map[key])
74 | stats_map[key] = stats_tuple
75 | return stats_map
76 |
77 | def __generate_medium_level_report(self, results_df):
78 | report_df = pd.DataFrame({})
79 | columns = list(results_df)
80 | column_reports = []
81 | for column in columns:
82 | detector_stats_for_each_column = self.calculate_detector_stats_for_each_column(results_df[column])
83 | column_report = pd.Series(detector_stats_for_each_column, name=column, index=detector_stats_for_each_column.keys())
84 | if not column_report.empty:
85 | column_reports.append(column_report)
86 | if column_reports:
87 | report_df = pd.concat(column_reports, axis=1, keys=[series.name for series in column_reports], sort=True)
88 | return report_df.fillna(value=0)
89 |
90 | def generate_report_content(self, results_df):
91 | if self.report_level == ReportLevel.HIGH.value:
92 | return self.__generate_high_level_report(results_df)
93 | elif self.report_level == ReportLevel.MEDIUM.value:
94 | return self.__generate_medium_level_report(results_df)
95 |
96 | def __print(self, msg):
97 | print(msg)
98 | logging.info(msg)
99 |
100 | def __print_report(self, report):
101 | self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n")
102 | if report.empty:
103 | self.__print("NO PII VALUES WERE FOUND!")
104 | else:
105 | self.__print(report)
106 | self.__print("\n\n****************************DONE!**************************\n\n")
107 |
108 | def generate(self, results_df):
109 | final_report = self.generate_report_content(results_df)
110 | self.__print_report(final_report)
111 | return final_report
112 |
113 |
--------------------------------------------------------------------------------
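A small sketch of the medium-level stats helper (hypothetical values; constructing ReportGenerator creates the report directory and log file as a side effect):

    import pandas as pd
    from src.analyze.utils.analyzer_result import AnalyzerResult
    from src.report.report_generator import ReportGenerator

    generator = ReportGenerator(config={"location": "report", "level": "medium"})
    # One cell with an EMAIL hit, one cell with no hits.
    column = pd.Series([[AnalyzerResult("test@sample.com", "EMAIL", 0, 15)], []])
    print(generator.calculate_detector_stats_for_each_column(column))
    # -> {'EMAIL': (1, '50.0%')}

--------------------------------------------------------------------------------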
/src/report/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/report/tests/__init__.py
--------------------------------------------------------------------------------
/src/report/tests/test_report_generator.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from unittest.mock import patch, MagicMock
3 |
4 | import os
5 | import pandas as pd
6 | from freezegun import freeze_time
7 |
8 | from src.report.report_generator import ReportGenerator
9 | from src.analyze.utils.analyzer_result import AnalyzerResult
10 |
11 |
12 |
13 |
14 | class TestReportGenerator(TestCase):
15 |
16 | @patch("src.report.report_generator.ReportGenerator.setup_logging_config")
17 | def setUp(self, mock_setup_logging_config):
18 | self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
19 | mock_setup_logging_config.assert_called_with()
20 | self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
21 | mock_setup_logging_config.assert_called_with()
22 |
23 | def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
24 | result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
25 | "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
26 | expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
27 | self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(result_data_frame))
28 |
29 | def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
30 | result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
31 | "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
32 | expected_data_frame = pd.DataFrame({"summary" : pd.Series({"NRIC" : (1, "50%"), "EMAIL" : (1,"50%")}),
33 | "phone number" : pd.Series({"PHONE_NUMBER" : (2, "100%")})})
34 | self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.generate_report_content(result_data_frame))
35 |
36 | def test_calculate_detector_stats_returns_detector_counts_and_percentages(self):
37 | result_column_values = pd.Series([[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]])
38 | actual_result = self.report_generator_medium_level.calculate_detector_stats_for_each_column(result_column_values)
39 | expected_result = {"NRIC" : (1, "33.33%"), "EMAIL" : (2, "66.67%")}
40 | self.assertCountEqual(expected_result, actual_result)
41 |
42 | @patch("logging.info")
43 | @patch("src.report.report_generator.ReportGenerator.generate_report_content")
44 | def test_generate_report_calls_content_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
45 | result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
46 | "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
47 | mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
48 | mock_logging.return_value = None
49 | expected_result = self.report_generator_high_level.generate(result_data_frame)
50 | self.assertCountEqual(expected_result, mock_generate_content.return_value)
51 |
52 |
53 | @freeze_time('2019-05-29 01:01:03')
54 | @patch("logging.FileHandler")
55 | @patch("logging.Logger.addHandler")
56 | @patch("genericpath.exists")
57 | def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
58 | mock_file_exists.return_value = False
59 | mock_file_handler.return_value = MagicMock()
60 | self.report_generator_high_level.setup_logging_config()
61 | mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
62 | mock_add_handler.assert_called_with(mock_file_handler.return_value)
63 |
64 |
65 | @freeze_time('2019-05-29 01:01:03')
66 | @patch("logging.FileHandler")
67 | @patch("logging.Logger.addHandler")
68 | @patch("os.path.exists")
69 | def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
70 | mock_file_exists.return_value = True
71 | mock_file_handler.return_value = MagicMock()
72 | self.report_generator_high_level.setup_logging_config()
73 | mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
74 | mock_add_handler.assert_called_with(mock_file_handler.return_value)
75 |
--------------------------------------------------------------------------------
/src/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/tests/__init__.py
--------------------------------------------------------------------------------
/src/tests/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/tests/config/__init__.py
--------------------------------------------------------------------------------
/src/tests/config/test_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "acquire": {
3 | "file_path": "/Users/wisuchoi/Documents/anonymizer/src/acquire/tests/data/comma_delimited_file.csv",
4 | "delimiter": ","
5 | },
6 | "analyze": {
7 | },
8 | "report" : {
9 | "location" : "/Users/wisuchoi/Documents/anonymizer/report",
10 | "level" : "high"
11 | },
12 | "anonymize": {
13 | "output_file_path" : "/Users/wisuchoi/Documents/anonymizer/output"
14 | }
15 | }
--------------------------------------------------------------------------------
/src/tests/test_dpf_main.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from unittest import TestCase
4 | from unittest.mock import patch, MagicMock
5 |
6 | import pandas as pd
7 |
8 | from src.constants import ACQUIRE, REPORT
9 | from src.dpf_main import DPFMain
10 |
11 |
12 | class TestDPFMain(TestCase):
13 |
14 | def setUp(self):
15 | test_config = "{}/{}".format(os.path.dirname(os.path.realpath(__file__)),"config/test_config.json")
16 | self.dpf_main = DPFMain(test_config)
17 | with open(test_config) as input_file:
18 | self.config_json = json.load(input_file)
19 |
20 | @patch('src.write.csv_writer.CsvWriter.write_csv')
21 | @patch('src.write.csv_writer.CsvWriter.__init__')
22 | @patch('src.report.report_generator.ReportGenerator.generate')
23 | @patch('src.report.report_generator.ReportGenerator.__init__')
24 | @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
25 | @patch('src.acquire.csv_parser.CsvParser.parse')
26 | @patch('src.acquire.csv_parser.CsvParser.__init__')
27 | def test_run_parses_the_config_file_and_invokes_respective_stages_correctly(self, mock_csv_parser_init,
28 | mock_csv_parser_parse,
29 | mock_pii_analyze_df,
30 | mock_report_generator_init,
31 | mock_generate_report,
32 | mock_csv_writer_init,
33 | mock_csv_writer_write_csv):
34 | mock_csv_parser_init.return_value = None
35 | mock_csv_parser_parse.return_value = MagicMock()
36 | mock_pii_analyze_df.return_value = (pd.DataFrame({"summary" : ["test result"]}), pd.DataFrame({}))
37 | mock_report_generator_init.return_value = None
38 | mock_generate_report.return_value = MagicMock()
39 | mock_csv_writer_init.return_value = None
40 | mock_csv_writer_write_csv.return_value = None
41 | self.dpf_main.run()
42 | mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
43 | mock_csv_parser_parse.assert_called_with()
44 | mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
45 | mock_report_generator_init.assert_called_with(config=self.config_json[REPORT])
46 | mock_generate_report.assert_called_with(results_df=mock_pii_analyze_df.return_value[0])
47 | mock_csv_writer_init.assert_called_with(config=self.config_json)
48 | mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])
49 |
50 |
51 | @patch('src.write.csv_writer.CsvWriter.write_csv')
52 | @patch('src.write.csv_writer.CsvWriter.__init__')
53 | @patch('src.report.report_generator.ReportGenerator.generate')
54 | @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
55 | @patch('src.acquire.csv_parser.CsvParser.parse')
56 | @patch('src.acquire.csv_parser.CsvParser.__init__')
57 | def test_run_short_circuits_generate_report_when_no_PII_values_detected(self, mock_csv_parser_init,
58 | mock_csv_parser_parse,
59 | mock_pii_analyze_df,
60 | mock_generate_report,
61 | mock_csv_writer_init,
62 | mock_csv_writer_write_csv):
63 | mock_csv_parser_init.return_value = None
64 | mock_csv_parser_parse.return_value = pd.DataFrame({})
65 | mock_pii_analyze_df.return_value = (pd.DataFrame({}), pd.DataFrame({}))
66 |         mock_generate_report.return_value = None
67 |
68 | mock_csv_writer_init.return_value = None
69 | mock_csv_writer_write_csv.return_value = None
70 | self.dpf_main.run()
71 | mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
72 | mock_csv_parser_parse.assert_called_with()
73 | mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
74 | mock_generate_report.assert_not_called()
75 | mock_csv_writer_init.assert_called_with(config=self.config_json)
76 | mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/src/write/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/write/__init__.py
--------------------------------------------------------------------------------
/src/write/csv_writer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas import DataFrame
3 |
4 | from src.constants import OUTPUT_FILE_PATH, FILE_PATH
5 |
6 |
7 | class CsvWriter:
8 |
9 | def __init__(self, config):
10 | self.__validate_config(config)
11 | self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
12 | self.input_file_name = config["acquire"][FILE_PATH]
13 |
14 | def __validate_config(self, config):
15 | if "anonymize" not in config or not config["anonymize"] or OUTPUT_FILE_PATH not in config["anonymize"] or not config["anonymize"][OUTPUT_FILE_PATH]:
16 | raise ValueError("Config 'output_file_path' needs to be provided for parsing")
17 |
18 | def get_output_file_path(self):
19 | file_name = self.input_file_name.split('/')[-1]
20 | file_name_no_extension = file_name.split('.')[0]
21 | result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
22 | return result
23 |
24 | def write_csv(self, df: DataFrame):
25 | df.to_csv(self.get_output_file_path(), index=False)
26 | print("Anonymized csv has been successfully created!")
--------------------------------------------------------------------------------
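A sketch of the output-path derivation (illustrative paths): the writer keeps the input file's base name and appends the "_anonymized_" suffix.

    from src.write.csv_writer import CsvWriter

    config = {
        "acquire": {"file_path": "/data/input/customers.csv"},
        "anonymize": {"output_file_path": "/data/output"},
    }
    print(CsvWriter(config=config).get_output_file_path())
    # -> /data/output/customers_anonymized_.csv

--------------------------------------------------------------------------------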
/src/write/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/write/tests/__init__.py
--------------------------------------------------------------------------------
/src/write/tests/test_csv_writer.py:
--------------------------------------------------------------------------------
1 | from src.write.csv_writer import CsvWriter
2 | from unittest import TestCase
3 | import os
4 | import pandas as pd
5 |
6 |
7 | class TestCsvWriter(TestCase):
8 |
9 | #TODO: check acquire file path exists
10 | def test_invalid_config_gets_caught_during_initialization(self):
11 | context = {}
12 | with self.assertRaises(ValueError) as ve:
13 | CsvWriter(config=context)
14 | self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing")
15 |
16 |
17 | def test_correct_output_path_is_generated(self):
18 | context = {
19 | "acquire": {
20 | "file_path": "/anonymizer/test_data.csv",
21 | "delimiter": ","
22 | },
23 | "anonymize": {
24 | "output_file_path" : "/anonymizer/output"
25 | }
26 | }
27 | input_file_name = "test_data"
28 | output_directory = "/anonymizer/output"
29 | expected = f"{output_directory}/{input_file_name}_anonymized_.csv"
30 | writer = CsvWriter(config=context)
31 | self.assertEqual(writer.get_output_file_path(), expected)
--------------------------------------------------------------------------------
/src_spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/__init__.py
--------------------------------------------------------------------------------
/src_spark/acquire/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/__init__.py
--------------------------------------------------------------------------------
/src_spark/acquire/csv_parser.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from src_spark.constants import FILE_PATH
3 |
4 |
5 | class CsvParser:
6 |
7 |     def __init__(self, spark: SparkSession, config):
8 |         self.__validate_config(config)
9 |         self.input_path = config[FILE_PATH]
10 |         self.delimiter = config.get("delimiter") or ","
11 |         self.spark = spark
12 |
13 |     def __validate_config(self, config):
14 |         if FILE_PATH not in config or not config[FILE_PATH]:
15 |             raise ValueError("Config 'file_path' needs to be provided for parsing")
16 |
17 |     def parse(self):
18 |         df = self.spark.read.load(
19 |             self.input_path,
20 |             format="csv",
21 |             sep=self.delimiter,
22 |             header="true",
23 |             inferSchema="true")
24 |         return df
--------------------------------------------------------------------------------
/src_spark/acquire/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/tests/__init__.py
--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/comma_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name,ssn
2 | Lisa Beard,557-39-2479
--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/empty.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/tests/data/empty.csv
--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/missing_comma.csv:
--------------------------------------------------------------------------------
1 | name,ssn,age
2 | Lisa Beard,557-39-2479,33
3 | John Sohn,33
--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/pipe_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name|ssn
2 | Lisa Beard|557-39-2479
--------------------------------------------------------------------------------
/src_spark/acquire/tests/test_csv_parser.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql.types import StructType
5 | from src_spark.acquire.csv_parser import CsvParser
6 |
7 |
8 | class TestCsvParser(TestCase):
9 |
10 | def setUp(self) -> None:
11 | self.SPARK = SparkSession.builder \
12 | .master("local") \
13 | .appName("Test CSVParser") \
14 | .getOrCreate()
15 | self.current_dir = os.path.dirname(os.path.realpath(__file__))
16 |
17 | def test_invalid_config_gets_caught_during_initialization(self):
18 | context = {}
19 | with self.assertRaises(ValueError) as ve:
20 | CsvParser(self.SPARK, config=context)
21 | self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")
22 |
23 | def test_if_valid_csv_file_provided_returns_spark_df(self):
24 | file_path = "{}/data/comma_delimited_file.csv".format(self.current_dir)
25 | config = {"file_path" : file_path, "delimiter" : ""}
26 |
27 | expected = self.SPARK.createDataFrame(
28 | [("Lisa Beard", "557-39-2479")],
29 | ["name", "ssn"]
30 | )
31 | actual = CsvParser(spark=self.SPARK, config=config).parse()
32 |
33 | self.assertEqual(actual.schema, expected.schema)
34 | self.assertEqual(actual.collect(), expected.collect())
35 |
36 | def test_if_valid_csv_file_with_different_delimiter_provided_returns_spark_df(self):
37 | file_path = "{}/data/pipe_delimited_file.csv".format(self.current_dir)
38 | config = {"file_path" : file_path, "delimiter" : "|"}
39 |
40 | expected = self.SPARK.createDataFrame(
41 | [("Lisa Beard", "557-39-2479")],
42 | ["name", "ssn"]
43 | )
44 | actual = CsvParser(spark=self.SPARK, config=config).parse()
45 |
46 | self.assertEqual(actual.schema, expected.schema)
47 | self.assertEqual(actual.collect(), expected.collect())
48 |
49 |     def test_if_empty_csv_file_returns_empty_spark_df(self):
50 | file_path = "{}/data/empty.csv".format(self.current_dir)
51 | config = {"file_path" : file_path}
52 | expected = self.SPARK.createDataFrame([], StructType([]))
53 | actual = CsvParser(spark=self.SPARK, config=config).parse()
54 | self.assertEqual(actual.schema, expected.schema)
55 | self.assertEqual(actual.collect(), expected.collect())
56 |
57 |
--------------------------------------------------------------------------------
/src_spark/analyze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/__init__.py
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/detectors/__init__.py
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/base_detector.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 |
4 | from src_spark.analyze.utils.analyzer_result import AnalyzerResult
5 |
6 |
7 |
8 | class BaseDetector(ABC):
9 |
10 | def __init__(self):
11 | self.name = None
12 | self.pattern = None
13 |
14 | @abstractmethod
15 | def get_pattern(self):
16 | pass
17 |
18 | @abstractmethod
19 | def get_name(self):
20 | pass
21 |
22 | def validate(self, text):
23 | return True
24 |
25 | def execute(self, text):
26 | results = []
27 | matches = re.finditer(self.get_pattern(), text)
28 | for match in matches:
29 | matched_string = match.string[match.start(): match.end()]
30 | if self.validate(matched_string):
31 | results.append(AnalyzerResult(matched_string, self.get_name(), match.start(), match.end()))
32 | return results
33 |
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/credit_card_detector.py:
--------------------------------------------------------------------------------
1 | from src_spark.analyze.detectors.base_detector import BaseDetector
2 | from src_spark.analyze.utils.regex import RegEx
3 |
4 |
5 | class CreditCardDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "CREDIT_CARD"
9 | self.pattern = RegEx().literal("4").any_digit().num_occurrences(3).pipe() \
10 | .literal("5").range(0, 5).any_digit().num_occurrences(2).pipe() \
11 | .literal("6").any_digit().num_occurrences(3).pipe() \
12 | .literal("1").any_digit().num_occurrences(3).pipe() \
13 | .literal("3").any_digit().num_occurrences(3) \
14 | .one_of("- ").zero_or_one_occurrences() \
15 | .any_digit().range_occurrences(3, 4) \
16 | .one_of("- ").zero_or_one_occurrences() \
17 | .any_digit().range_occurrences(3, 4) \
18 | .one_of("- ").zero_or_one_occurrences() \
19 | .any_digit().range_occurrences(3, 5).build()
20 |
21 | def get_name(self):
22 | return self.name
23 |
24 | def get_pattern(self):
25 | return self.pattern
26 |
27 | def validate(self, text):
28 | def digits_of(n):
29 | return [int(d) for d in str(n)]
30 |
31 | digits = digits_of(text.replace('-', '').replace(' ', ''))
32 | odd_digits = digits[-1::-2]
33 | even_digits = digits[-2::-2]
34 | checksum = sum(odd_digits)
35 |
36 | for d in even_digits:
37 | checksum += sum(digits_of(d * 2))
38 |
39 | return checksum % 10 == 0
40 |
--------------------------------------------------------------------------------
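The Luhn checksum in validate() can be exercised directly, independent of the regex (illustrative card numbers):

    from src_spark.analyze.detectors.credit_card_detector import CreditCardDetector

    detector = CreditCardDetector()
    print(detector.validate("4111 1111 1111 1111"))  # True: checksum 30 is divisible by 10
    print(detector.validate("4111 1111 1111 1112"))  # False: checksum 31 is not

--------------------------------------------------------------------------------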
/src_spark/analyze/detectors/email_detector.py:
--------------------------------------------------------------------------------
1 | from src_spark.analyze.detectors.base_detector import BaseDetector
2 | from src_spark.analyze.utils.regex import RegEx
3 |
4 |
5 | class EmailDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "EMAIL"
9 | self.pattern = RegEx().one_of("a-zA-Z0-9_.+-").one_or_more_occurrences().literal("@").one_of("a-zA-Z0-9-")\
10 | .one_or_more_occurrences().literal("\\.").one_of("a-zA-Z0-9-.").one_or_more_occurrences().build()
11 |
12 | def get_name(self):
13 | return self.name
14 |
15 | def get_pattern(self):
16 | return self.pattern
17 |
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/national_id_detector.py:
--------------------------------------------------------------------------------
1 | from src_spark.analyze.detectors.base_detector import BaseDetector
2 | from src_spark.analyze.utils.regex import RegEx
3 |
4 |
5 | class NationalIdDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "NRIC"
9 | self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
10 |
11 | def get_name(self):
12 | return self.name
13 |
14 | def get_pattern(self):
15 | return self.pattern
16 |
17 | def __get_offset(self, text):
18 | return 4 if text in "TG" else 0
19 |
20 | def __is_NRIC(self, text, loc):
21 | if text[0] in "ST":
22 | return "JZIHGFEDCBA"[loc] == text[8]
23 | return False
24 |
25 | def __is_FIN(self, text, loc):
26 | if text[0] in "FG":
27 | return "XWUTRQPNMLK"[loc] == text[8]
28 | return False
29 |
30 | def validate(self, text):
31 | weight = self.__get_weight(text)
32 | first_character = text[0]
33 | offset = self.__get_offset(first_character)
34 | loc = (offset + weight) % 11
35 | return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)
36 |
37 |     def __get_weight(self, text):
38 |         # Standard NRIC/FIN checksum weights are 2, 7, 6, 5, 4, 3, 2.
39 |         numbers = [int(digit) for digit in text[1:-1]]
40 |         for index, _ in enumerate(numbers):
41 |             if index == 0:
42 |                 numbers[index] *= 2
43 |             else:
44 |                 numbers[index] *= 8 - index
45 |         return sum(numbers)
46 |
--------------------------------------------------------------------------------
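A worked example of the check-digit validation, assuming the standard Singapore NRIC algorithm: for "S1234567D" the weighted sum is 1*2 + 2*7 + 3*6 + 4*5 + 5*4 + 6*3 + 7*2 = 106, the "S" prefix adds offset 0, 106 % 11 = 7, and "JZIHGFEDCBA"[7] == "D".

    from src_spark.analyze.detectors.national_id_detector import NationalIdDetector

    detector = NationalIdDetector()
    print(detector.validate("S1234567D"))  # True: check letter matches
    print(detector.validate("S1234567A"))  # False: check letter does not match

--------------------------------------------------------------------------------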
/src_spark/analyze/detectors/phone_number_detector.py:
--------------------------------------------------------------------------------
1 | from src_spark.analyze.detectors.base_detector import BaseDetector
2 | from src_spark.analyze.utils.regex import RegEx
3 |
4 |
5 | class PhoneNumberDetector(BaseDetector):
6 |
7 | def __init__(self):
8 | self.name = "PHONE_NUMBER"
9 | regex_pipe = RegEx().pipe().build()
10 |
11 | regex_with_country_code_and_no_space = '(\\+65?\\s?[689]\\d{7})'
12 | regex_with_country_code_and_single_space = '(\\+65?\\s?[689]\\d{3} \\d{4})'
13 | regex_no_country_code_and_no_space = '([689]\\d{7})'
14 | regex_no_country_code_and_single_space = '([689]\\d{3} \\d{4})'
15 | regex_with_country_code_in_brackets_and_no_space = '([(]65[)]\\s?[689]\\d{7})'
16 | regex_with_country_code_in_brackets_and_single_space = '([(]65[)]\\s?[689]\\d{3} \\d{4})'
17 |
18 | self.pattern = regex_with_country_code_and_no_space + regex_pipe + \
19 | regex_with_country_code_and_single_space + regex_pipe + \
20 | regex_no_country_code_and_no_space + regex_pipe + \
21 | regex_no_country_code_and_single_space + regex_pipe + \
22 | regex_with_country_code_in_brackets_and_no_space + regex_pipe + \
23 | regex_with_country_code_in_brackets_and_single_space
24 |
25 | def get_name(self):
26 | return self.name
27 |
28 | def get_pattern(self):
29 | return self.pattern
30 |
--------------------------------------------------------------------------------
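A quick sketch of the detector in action via BaseDetector.execute (illustrative numbers; positions are half-open match offsets):

    from src_spark.analyze.detectors.phone_number_detector import PhoneNumberDetector

    detector = PhoneNumberDetector()
    for result in detector.execute("call +65 62345678 or 91234567"):
        print(result)
    # -> Text +65 62345678 at position (5,17) was identified as PHONE_NUMBER
    # -> Text 91234567 at position (21,29) was identified as PHONE_NUMBER

--------------------------------------------------------------------------------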
/src_spark/analyze/detectors/pii_detector.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import pkgutil
3 | import inspect
4 | import sys
5 | from pyspark.sql import DataFrame
6 | from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType
7 | from src_spark.analyze.detectors.base_detector import BaseDetector
8 | import src_spark.analyze.detectors
9 |
10 | class PIIDetector():
11 |
12 | def __init__(self):
13 | self.detectors = self.__get_detector_instances()
14 |
15 | def __get_detector_modules(self):
16 | modules = [modname for importer, modname, ispkg in
17 | pkgutil.walk_packages(path=src_spark.analyze.detectors.__path__,
18 | prefix=src_spark.analyze.detectors.__name__+".")
19 | if "tests" not in modname]
20 | return modules
21 |
22 | def __get_detector_instances(self):
23 | modules = self.__get_detector_modules()
24 | detectors = []
25 | for module in modules:
26 | importlib.import_module(module)
27 | classes = inspect.getmembers(sys.modules[module], inspect.isclass)
28 | for class_name, class_type in classes:
29 | if class_name != "BaseDetector" and issubclass(class_type, BaseDetector):
30 | detectors.append(class_type())
31 | return detectors
32 |
33 | def __detect_pii_row(self, row):
34 | new_row = []
35 | for element in row:
36 | results = []
37 | for detector in self.detectors:
38 | results += detector.execute(element)
39 | new_row.append(results)
40 |
41 | return new_row
42 |
43 | def get_analyzer_results(self, input_data_frame: DataFrame):
44 | columns = input_data_frame.columns
45 |
46 | array_structtype = StructType([
47 | StructField("end", LongType(), False),
48 | StructField("start", LongType(), False),
49 | StructField("text", StringType(), False),
50 | StructField("type", StringType(), False)
51 | ])
52 | result_schema = []
53 | for column in columns:
54 |             result_schema.append(StructField(column, ArrayType(array_structtype, True), nullable=False))
55 |
56 | result = input_data_frame.rdd.map(lambda x: self.__detect_pii_row(x)).toDF(schema=StructType(result_schema))
57 |
58 | return result
59 |
60 | def _get_pii_list(self, row):
61 | get_analyzer_results_text = lambda x: x.text
62 |
63 | new_row = []
64 | for cell in row:
65 | pii_sublist = list(map(get_analyzer_results_text,cell))
66 | new_row.extend(pii_sublist)
67 | return new_row
68 |
69 | def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
70 | pii_list = report.rdd.flatMap(lambda row: self._get_pii_list(row)).collect()
71 | column = input_data_frame.columns
72 | result = input_data_frame.rdd.map(lambda row: self.__replace_redacted_text(row, pii_list)).toDF(column)
73 |
74 | return result
75 |
76 | def __replace_redacted_text(self, row, pii_list):
77 | new_row = []
78 | for cell in row:
79 | for word in pii_list:
80 | if word in cell:
81 | cell = cell.replace(word, "")
82 | new_row.append(cell)
83 | return new_row
84 |
85 | def analyze_data_frame(self, input_data_frame: DataFrame):
86 | report = self.get_analyzer_results(input_data_frame)
87 | redacted = self.get_redacted_text(input_data_frame, report)
88 |
89 | return report, redacted
90 |
91 |
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/detectors/tests/__init__.py
--------------------------------------------------------------------------------
/src_spark/analyze/detectors/tests/test_pii_detector.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from pyspark.sql import SparkSession
3 | from src_spark.analyze.detectors.pii_detector import PIIDetector
4 | from src_spark.analyze.utils.analyzer_result import AnalyzerResult
5 | from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType, Row
6 |
7 |
8 | class TestPIIDetector(TestCase):
9 |
10 | def setUp(self) -> None:
11 | self.SPARK = SparkSession.builder \
12 | .master("local") \
13 | .appName("Test PIIDetector") \
14 | .getOrCreate()
15 | self.pii_detector = PIIDetector()
16 |
17 | self.array_structtype = StructType([
18 | StructField("end", LongType(), False),
19 | StructField("start", LongType(), False),
20 | StructField("text", StringType(), False),
21 | StructField("type", StringType(), False)
22 | ])
23 | self.schema = StructType([
24 | StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
25 | StructField("phone number", ArrayType(self.array_structtype, True), nullable=False)
26 | ])
27 |
28 | def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
29 | test_data_frame = self.SPARK.createDataFrame(
30 | [
31 | ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
32 | ("A typical email id would look something like test@sample.com","Some examples of phone numbers are +65 62345678")
33 | ],
34 | ["summary", "phone number"]
35 | )
36 |
37 | actual = self.pii_detector.get_analyzer_results(test_data_frame)
38 |
39 | expected_data_frame = self.SPARK.createDataFrame(
40 | [
41 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
42 | ([AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
43 | ],
44 | self.schema
45 | )
46 |
47 | self.assertEqual(actual.schema, expected_data_frame.schema)
48 | self.assertEqual(actual.collect(), expected_data_frame.collect())
49 |
50 | def test_analyze_data_frame_runs_analyze_against_cell_with_multiple_PII_values(self):
51 | test_data_frame = self.SPARK.createDataFrame(
52 | [
53 | ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
54 | ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
55 | ],
56 | ["summary", "phone number"]
57 | )
58 |
59 | actual = self.pii_detector.get_analyzer_results(test_data_frame)
60 |
61 | expected_data_frame = self.SPARK.createDataFrame(
62 | [
63 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
64 | ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
65 | ],
66 | self.schema
67 | )
68 |
69 | self.assertEqual(actual.schema, expected_data_frame.schema)
70 | self.assertEqual(actual.collect(), expected_data_frame.collect())
71 |
72 | def test_analyze_data_frame_returns_empty_data_frame_when_there_are_no_PII_values(self):
73 | test_data_frame = self.SPARK.createDataFrame(
74 | [
75 | ("No", "Personal"),
76 | ("Data","Inside")
77 | ],
78 | ["summary", "phone number"]
79 | )
80 |
81 | actual = self.pii_detector.get_analyzer_results(test_data_frame)
82 |
83 | expected_data_frame = self.SPARK.createDataFrame(
84 | [
85 | ([], []),
86 | ([], [])
87 | ],
88 | self.schema
89 | )
90 |
91 | self.assertEqual(actual.schema, expected_data_frame.schema)
92 | self.assertEqual(actual.collect(), expected_data_frame.collect())
93 |
94 | def test_get_pii_list_returns_list_of_pii_words_given_row_of_list_of_analyzer_results(self):
95 | test_row = Row(
96 | summary=[
97 | AnalyzerResult("S0000001I", "NRIC", 38, 47),
98 | AnalyzerResult("S0000002I", "NRIC", 38, 47)
99 | ],
100 | phone_number=[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
101 | actual = self.pii_detector._get_pii_list(test_row)
102 | expected = ["S0000001I","S0000002I","+65 62345678"]
103 | self.assertEqual(actual, expected)
104 |
105 |     def test_get_pii_list_returns_empty_list_when_there_are_no_analyzer_results(self):
106 | test_row = Row(summary=[],phone_number=[])
107 | actual = self.pii_detector._get_pii_list(test_row)
108 | expected = []
109 | self.assertEqual(actual, expected)
110 |
111 | def test_get_redacted_text_returns_redacted_data_frame(self):
112 | test_report_data_frame = self.SPARK.createDataFrame(
113 | [
114 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
115 | ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
116 | ],
117 | self.schema
118 | )
119 |
120 | test_input_data_frame = self.SPARK.createDataFrame(
121 | [
122 | ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
123 | ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
124 | ],
125 | ["summary", "phone number"]
126 | )
127 |
128 | actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)
129 |
130 | expected = self.SPARK.createDataFrame(
131 | [
132 | ("First President of Singapore NRIC was ", "Some examples of phone numbers are "),
133 | ("email and phone ","Phone one Phone two ")
134 | ],
135 | ["summary", "phone number"]
136 | )
137 |
138 | self.assertEqual(actual.schema, expected.schema)
139 | self.assertEqual(actual.collect(), expected.collect())
140 |
141 | def test_get_redacted_text_returns_same_data_frame_if_analyzer_results_are_empty(self):
142 | test_report_data_frame = self.SPARK.createDataFrame(
143 | [
144 | ([], []),
145 | ([], [])
146 | ],
147 | self.schema
148 | )
149 |
150 | test_input_data_frame = self.SPARK.createDataFrame(
151 | [
152 | ("No", "Personal"),
153 | ("Data","Inside")
154 | ],
155 | ["summary", "phone number"]
156 | )
157 |
158 | actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)
159 |
160 | expected = self.SPARK.createDataFrame(
161 | [
162 | ("No", "Personal"),
163 | ("Data","Inside")
164 | ],
165 | ["summary", "phone number"]
166 | )
167 |
168 | self.assertEqual(actual.schema, expected.schema)
169 | self.assertEqual(actual.collect(), expected.collect())
170 |
171 |
--------------------------------------------------------------------------------
/src_spark/analyze/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/utils/__init__.py
--------------------------------------------------------------------------------
/src_spark/analyze/utils/analyzer_result.py:
--------------------------------------------------------------------------------
1 | class AnalyzerResult:
2 |
3 | def __init__(self, text, type, start, end):
4 | self.text = text
5 | self.type = type
6 | self.start = start
7 | self.end = end
8 |
9 | def __eq__(self, other):
10 | return type(self) == type(other) and self.text == other.text and self.type == other.type \
11 | and self.start == other.start and self.end == other.end
12 |
13 | def __repr__(self):
14 | return self.__str__()
15 |
16 | def __str__(self):
17 | return "Text {} at position ({},{}) was identified as {}".format(self.text, self.start, self.end, self.type)
18 |
19 | def detector(self):
20 | return self.type
21 |
--------------------------------------------------------------------------------
/src_spark/analyze/utils/regex.py:
--------------------------------------------------------------------------------
1 | class RegEx:
2 |
3 | def __init__(self):
4 | self.regex_string = ""
5 |
6 | def __is_numeric(self, value):
7 | return isinstance(value, int)
8 |
9 | def __is_single_character_value(self, value):
10 | return len(str(value)) == 1
11 |
12 | def __validate_range(self, start, end):
13 | if start > end:
14 | raise ValueError("Range start should be less than end")
15 |
16 | def boundary(self):
17 | self.regex_string += "\\b"
18 | return self
19 |
20 | def pipe(self):
21 | self.regex_string += "|"
22 | return self
23 |
24 | def range(self, from_char, to_char):
25 | if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
26 | raise ValueError("Range boundaries should be single character")
27 |
28 | self.__validate_range(from_char, to_char)
29 | self.regex_string += "[{}-{}]".format(from_char, to_char)
30 | return self
31 |
32 | def one_of(self, character_set):
33 | if character_set is None or character_set == "":
34 | raise ValueError("Character Set should not be empty")
35 |
36 | self.regex_string += "[" + character_set + "]"
37 | return self
38 |
39 | def any_digit(self):
40 | self.regex_string += "\\d"
41 | return self
42 |
43 |     def num_occurrences(self, number):
44 |         if number < 1:
45 |             raise ValueError("Number of occurrences should be at least 1")
46 |
47 | self.regex_string += "{" + str(number) + "}"
48 | return self
49 |
50 | def one_or_more_occurrences(self):
51 | self.regex_string += "+"
52 | return self
53 |
54 | def zero_or_more_occurrences(self):
55 | self.regex_string += "*"
56 | return self
57 |
58 | def zero_or_one_occurrences(self):
59 | self.regex_string += "?"
60 | return self
61 |
62 | def range_occurrences(self, start, end):
63 | if not self.__is_numeric(start) or not self.__is_numeric(end):
64 | raise TypeError("Range should be integers")
65 |
66 | self.__validate_range(start, end)
67 | self.regex_string += "{" + str(start) + "," + str(end) + "}"
68 | return self
69 |
70 | def literal(self, literal):
71 | self.regex_string += literal
72 | return self
73 |
74 | def build(self):
75 | return self.regex_string
76 |
--------------------------------------------------------------------------------
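A minimal sketch of the fluent builder: each call appends a fragment and build() returns the accumulated pattern string.

    from src_spark.analyze.utils.regex import RegEx

    pattern = RegEx().boundary().any_digit().range_occurrences(2, 4).boundary().build()
    print(pattern)  # -> \b\d{2,4}\b

--------------------------------------------------------------------------------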
/src_spark/constants.py:
--------------------------------------------------------------------------------
1 | ACQUIRE="acquire"
2 | FILE_PATH="file_path"
3 | ANALYZE="analyze"
4 | REPORT="report"
5 | LOCATION="location"
6 | REPORT_LEVEL="level"
7 | OUTPUT_FILE_PATH="output_file_path"
--------------------------------------------------------------------------------
/src_spark/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.append(os.path.abspath('.'))
4 |
5 | import argparse
6 | import json
7 |
8 | from pyspark.sql import SparkSession
9 | from src_spark.report.report_generator import ReportGenerator
10 | from src_spark.acquire.csv_parser import CsvParser
11 | from src_spark.analyze.detectors.pii_detector import PIIDetector
12 | from src_spark.constants import ACQUIRE, REPORT
13 | from src_spark.write.csv_writer import CsvWriter
14 |
15 |
16 | class Main():
17 |
18 | def __init__(self, config_file_path):
19 | with open(config_file_path) as config_file:
20 | self.config = json.load(config_file)
21 |
22 | #TODO : validate the config for the stages right here
23 | def run(self):
24 | spark = SparkSession.builder \
25 | .master("local") \
26 | .appName("PIIDetector") \
27 | .getOrCreate()
28 | parsed_data_frame = CsvParser(spark, config=self.config[ACQUIRE]).parse()
29 | pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(parsed_data_frame)
30 |
31 | report_generator = ReportGenerator(config=self.config[REPORT])
32 | if report_generator.is_empty_report_dataframe(pii_analysis_report):
33 | print("NO PII VALUES WERE FOUND!")
34 | else:
35 | report_generator.generate(results_df=pii_analysis_report)
36 | CsvWriter(spark, config=self.config).write_csv(df=redacted_data_frame)
37 |
38 | def get_args():
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument('--config-file', help='config file to run the tool')
41 | args = parser.parse_args()
42 | if not args.config_file:
43 | raise ValueError("Config file path should be provided for the tool to run.")
44 | return args
45 |
46 | if __name__ == "__main__":
47 | args = get_args()
48 | Main(args.config_file).run()
--------------------------------------------------------------------------------
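The Spark entry point takes the same config shape as the pandas pipeline and builds a local SparkSession itself, so it can be launched directly from the repository root (requires pyspark installed):

    python src_spark/main.py --config-file config.json

--------------------------------------------------------------------------------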
/src_spark/report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/report/__init__.py
--------------------------------------------------------------------------------
/src_spark/report/report_generator.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from enum import Enum
3 | import os
4 | import logging
5 |
6 | import pandas as pd
7 | from pyspark.sql.dataframe import DataFrame
8 | from pyspark.sql.types import Row
9 | from src_spark.constants import LOCATION, REPORT_LEVEL
10 |
11 |
12 | class ReportLevel(Enum):
13 |
14 | HIGH = "high"
15 | MEDIUM = "medium"
16 | LOW = "low"
17 |
18 | class ReportGenerator():
19 |
20 | def __init__(self, config):
21 | self.report_file_location = config[LOCATION]
22 | self.report_level = config[REPORT_LEVEL]
23 | self.setup_logging_config()
24 | self.dataframe_is_empty = None
25 |
26 | def setup_logging_config(self):
27 | date = datetime.today().strftime("%Y%m%d")
28 | file_name = "{}/report_{}.log".format(self.report_file_location, date)
29 | if os.path.exists(file_name):
30 | mode = "a"
31 | else:
32 | if not os.path.exists(self.report_file_location):
33 | os.makedirs(self.report_file_location)
34 | mode = "x"
35 | file_handler = logging.FileHandler(filename=file_name, mode=mode)
36 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
37 | file_handler.setFormatter(formatter)
38 | logging.getLogger().addHandler(file_handler)
39 | logging.getLogger().setLevel(logging.INFO)
40 |
41 | def __generate_high_level_report(self, results_df: DataFrame):
42 | columns = results_df.columns
43 | report_df = pd.DataFrame({"Columns with PII values" : columns})
44 | return report_df
45 |
46 | def __calculate_percentage(self, item_count, total_count):
47 | return round((item_count/total_count) * 100.0, 2)
48 |
49 | def _get_detector_results(self, row:Row, columns:list):
50 | new_row = []
51 | for index, cell in enumerate(row):
52 | current_col = columns[index]
53 | if cell != []:
54 | for analyzer_result in cell:
55 | detector = analyzer_result["type"]
56 | new_row.append(((current_col, detector), 1))
57 | else:
58 | new_row.append(((current_col, "no_pii"), 1))
59 | return new_row
60 |
61 | def __get_list_of_detectors(self, detector_results):
62 | report_detectors = []
63 | for key, _ in detector_results:
64 | detector = key[1]
65 | if detector not in report_detectors and detector != "no_pii":
66 | report_detectors.append(detector)
67 | return report_detectors
68 |
69 | def spark_generate_medium_level_report(self, results_df: DataFrame) -> pd.DataFrame:
70 | columns = results_df.columns
71 | detector_results = results_df.rdd.flatMap(lambda row: self._get_detector_results(row, columns)).reduceByKey(lambda acc, next: acc + next).collect()
72 | report_detectors = self.__get_list_of_detectors(detector_results)
73 | num_rows = results_df.count()
74 | pd_columns = []
75 | for column in columns:
76 | detection_stats = self.__get_detection_stats(column, report_detectors, detector_results, num_rows)
77 | pd_columns.append(pd.Series(data=detection_stats, index=report_detectors, name=column))
78 | report_df = pd.concat(pd_columns,axis=1).fillna(0)
79 | return report_df
80 |
81 | def __get_detection_stats(self, column: list, report_detectors: list, detector_results: list, num_rows: int) -> dict:
82 | detection_stats = {}
83 | default_value = ()
84 | for detector in report_detectors:
85 | column_detector_count = next(filter(lambda result: result[0] == (column, detector), detector_results), default_value)
86 | if len(column_detector_count) > 0:
87 | count = column_detector_count[1]
88 | percentage_value = self.__calculate_percentage(item_count=count, total_count=num_rows)
89 | detection_stats[detector] = (count, f"{percentage_value}%")
90 | return detection_stats
91 |
92 |
93 | def generate_report_content(self, results_df: DataFrame) -> pd.DataFrame:
94 | if self.report_level == ReportLevel.HIGH.value:
95 | return self.__generate_high_level_report(results_df)
96 |         # Any other level (including "low") currently falls back to
97 |         # the medium-level report.
98 |         return self.spark_generate_medium_level_report(results_df)
99 |
100 | def __print(self, msg):
101 | formatted_msg = f"\n{msg}"
102 | print(formatted_msg)
103 | logging.info(formatted_msg)
104 |
105 | def __print_report(self, report):
106 | self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n")
107 | if report.empty:
108 | self.__print("NO PII VALUES WERE FOUND!")
109 | else:
110 | self.__print(report)
111 | self.__print("\n\n****************************DONE!**************************\n\n")
112 |
113 | def generate(self, results_df: DataFrame):
114 | if self.is_empty_report_dataframe(results_df):
115 | print("NO PII VALUES WERE FOUND!")
116 |
117 | final_report = self.generate_report_content(results_df)
118 | self.__print_report(final_report)
119 | return final_report
120 |
121 |     def is_empty_report_dataframe(self, results_df: DataFrame) -> bool:
122 |         if self.dataframe_is_empty is None:
123 |             self.dataframe_is_empty = results_df.rdd.flatMap(lambda row: self._row_is_empty_list(row)).reduce(lambda acc, item: acc and item)
124 |         return self.dataframe_is_empty
125 |
126 |     def _row_is_empty_list(self, row: Row) -> map:
127 |         return map(lambda cell: cell == [], row)
128 |
129 |
130 |
--------------------------------------------------------------------------------
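A sketch of the per-row counting step (hypothetical cell values; a plain tuple stands in for a Spark Row, since _get_detector_results only iterates the cells, and constructing the generator sets up file logging as a side effect):

    from src_spark.report.report_generator import ReportGenerator

    generator = ReportGenerator(config={"location": "report", "level": "medium"})
    row = ([{"type": "NRIC", "text": "S0000001I", "start": 38, "end": 47}], [])
    print(generator._get_detector_results(row, ["summary", "phone number"]))
    # -> [(('summary', 'NRIC'), 1), (('phone number', 'no_pii'), 1)]

These ((column, detector), 1) pairs are then summed with reduceByKey to produce the per-column detector counts behind the medium-level report.

--------------------------------------------------------------------------------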
/src_spark/report/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/report/tests/__init__.py
--------------------------------------------------------------------------------
/src_spark/report/tests/test_report_generator.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from unittest.mock import patch, MagicMock
3 |
4 | import pandas as pd
5 | from pandas._testing import assert_frame_equal
6 | from freezegun import freeze_time
7 | from pyspark.sql.session import SparkSession
8 | from pyspark.sql.types import Row, StructField, StructType, ArrayType, StringType, LongType
9 | from src_spark.report.report_generator import ReportGenerator
10 | from src_spark.analyze.utils.analyzer_result import AnalyzerResult
11 |
12 |
13 |
14 | class TestReportGenerator(TestCase):
15 |
16 | @patch("src_spark.report.report_generator.ReportGenerator.setup_logging_config")
17 | def setUp(self, mock_setup_logging_config):
18 | self.SPARK = SparkSession.builder \
19 | .master("local") \
20 | .appName("Test PIIDetector") \
21 | .getOrCreate()
22 |
23 |
24 | self.array_structtype = StructType([
25 | StructField("end", LongType(), False),
26 | StructField("start", LongType(), False),
27 | StructField("text", StringType(), False),
28 | StructField("type", StringType(), False)
29 | ])
30 | self.schema = StructType([
31 | StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
32 | StructField("phone number", ArrayType(self.array_structtype, True), nullable=False)
33 | ])
34 | self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
35 | mock_setup_logging_config.assert_called_with()
36 | self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
37 | mock_setup_logging_config.assert_called_with()
38 |
39 | def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
40 | test_data_frame = self.SPARK.createDataFrame(
41 | [
42 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
43 | ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
44 | ],
45 | self.schema
46 | )
47 | expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
48 | self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(test_data_frame))
49 |
50 |
51 |
52 | def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
53 | test_data_frame = self.SPARK.createDataFrame(
54 | [
55 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
56 | ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
57 | ],
58 | self.schema
59 | )
60 |
61 | expected_data_frame = pd.DataFrame({
62 | "summary": [(1, "50.0%"), 0, (1, "50.0%")],
63 | "phone number": [0, (1, "50.0%"), (1, "50.0%")]
64 | },index=["NRIC","EMAIL","PHONE_NUMBER"])
65 |
66 | self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame))
67 |
68 | def test_that_medium_level_reporting_returns_correct_data_frame(self):
69 | test_data_frame = self.SPARK.createDataFrame(
70 | [
71 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
72 | ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
73 | ],
74 | self.schema
75 | )
76 |
77 | expected_data_frame = pd.DataFrame({
78 | "summary": [(1, "50.0%"), 0, (1, "50.0%")],
79 | "phone number": [0, (1, "50.0%"), (1, "50.0%")]
80 | },index=["NRIC","EMAIL","PHONE_NUMBER"])
81 |
82 | actual = self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame)
83 | assert_frame_equal(actual, expected_data_frame)
84 |
85 | @patch("logging.info")
86 |     @patch("src_spark.report.report_generator.ReportGenerator.generate_report_content")
87 |     def test_generate_report_calls_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
88 | test_data_frame = self.SPARK.createDataFrame(
89 | [
90 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
91 | ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
92 | ],
93 | self.schema
94 | )
95 | mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
96 | mock_logging.return_value = None
97 |         actual_result = self.report_generator_high_level.generate(test_data_frame)
98 |         self.assertCountEqual(actual_result, mock_generate_content.return_value)
99 |
100 |
101 | @freeze_time('2019-05-29 01:01:03')
102 | @patch("logging.FileHandler")
103 | @patch("logging.Logger.addHandler")
104 |     @patch("os.path.exists")
105 | def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
106 | mock_file_exists.return_value = False
107 | mock_file_handler.return_value = MagicMock()
108 | self.report_generator_high_level.setup_logging_config()
109 | mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
110 | mock_add_handler.assert_called_with(mock_file_handler.return_value)
111 |
112 |
113 | @freeze_time('2019-05-29 01:01:03')
114 | @patch("logging.FileHandler")
115 | @patch("logging.Logger.addHandler")
116 | @patch("os.path.exists")
117 | def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
118 | mock_file_exists.return_value = True
119 | mock_file_handler.return_value = MagicMock()
120 | self.report_generator_high_level.setup_logging_config()
121 | mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
122 | mock_add_handler.assert_called_with(mock_file_handler.return_value)
123 |
124 | def test_that_when_report_dataframe_contains_only_empty_lists_it_is_considered_empty(self):
125 | test_data_frame = self.SPARK.createDataFrame(
126 | [
127 | ([], []),
128 | ([], [])
129 | ],
130 | self.schema
131 | )
132 |
133 | actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
134 | expected = True
135 |
136 | self.assertEqual(actual, expected)
137 |
138 | def test_that_when_report_dataframe_contains_some_text_it_is_not_considered_empty(self):
139 | test_data_frame = self.SPARK.createDataFrame(
140 | [
141 | ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], []),
142 | ([], [])
143 | ],
144 | self.schema
145 | )
146 |
147 | actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
148 | expected = False
149 |
150 | self.assertEqual(actual, expected)
151 |
152 | def test_that_get_detector_results_returns_list_of_detector_results(self):
153 | columns = ["summary", "phone_number"]
154 | test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[Row(end=60, start=45, text='test@sample.com', type='EMAIL')])
155 | actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
156 | expected = [(('summary', 'NRIC'), 1), (('phone_number', 'EMAIL'), 1)]
157 | self.assertEqual(actual, expected)
158 |
159 | def test_that_get_detector_results_returns_list_of_detector_results_if_column_is_empty(self):
160 | columns = ["summary", "phone_number"]
161 | test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[])
162 | actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
163 | expected = [(('summary', 'NRIC'), 1), (('phone_number', 'no_pii'), 1)]
164 | self.assertEqual(actual, expected)
165 |
166 |
--------------------------------------------------------------------------------
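The emptiness checks exercised in the last two tests reduce to "every cell in every row is an empty list". A minimal pure-Python sketch of the same flatMap-and-reduce, without Spark:

    from functools import reduce

    rows = [([], []), ([], [])]  # each tuple is one row of per-column detection lists

    # flatMap: one boolean per cell; reduce: AND them all together
    cell_flags = [cell == [] for row in rows for cell in row]
    print(reduce(lambda acc, item: acc and item, cell_flags))  # True

    rows = [([("S0000001I", "NRIC")], []), ([], [])]  # any hit flips the result
    cell_flags = [cell == [] for row in rows for cell in row]
    print(reduce(lambda acc, item: acc and item, cell_flags))  # False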
/src_spark/write/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/write/__init__.py
--------------------------------------------------------------------------------
/src_spark/write/csv_writer.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession, DataFrame
2 | from src_spark.constants import OUTPUT_FILE_PATH, FILE_PATH
3 |
4 |
5 | class CsvWriter():
6 |
7 | def __init__(self, spark: SparkSession, config):
8 | self.__validate_config(config)
9 | self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
10 | self.input_file_name = config["acquire"][FILE_PATH]
11 | self.spark = spark
12 |
13 | def __validate_config(self, config):
14 |         if not config.get("anonymize") or not config["anonymize"].get(OUTPUT_FILE_PATH):
15 | raise ValueError("Config 'output_file_path' needs to be provided for parsing")
16 |
17 | def get_output_file_path(self):
18 | file_name = self.input_file_name.split('/')[-1]
19 | file_name_no_extension = file_name.split('.')[0]
20 | result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
21 | return result
22 |
23 | def write_csv(self, df: DataFrame):
24 | df.write.csv(self.get_output_file_path())
--------------------------------------------------------------------------------
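A minimal usage sketch for CsvWriter, assuming a local Spark session and that the OUTPUT_FILE_PATH/FILE_PATH constants resolve to the literal keys shown in the test below:

    from pyspark.sql import SparkSession
    from src_spark.write.csv_writer import CsvWriter

    spark = SparkSession.builder.master("local").appName("CsvWriterExample").getOrCreate()
    config = {
        "acquire": {"file_path": "/anonymizer/test_data.csv", "delimiter": ","},
        "anonymize": {"output_file_path": "/anonymizer/output"},
    }
    writer = CsvWriter(spark=spark, config=config)
    print(writer.get_output_file_path())  # /anonymizer/output/test_data_anonymized_.csv
    # writer.write_csv(df)  # would write an anonymized DataFrame to that path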
/src_spark/write/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/write/tests/__init__.py
--------------------------------------------------------------------------------
/src_spark/write/tests/test_csv_writer.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from pyspark.sql import SparkSession
3 | from src_spark.write.csv_writer import CsvWriter
4 |
5 |
6 | class TestCsvWriter(TestCase):
7 |
8 | def setUp(self) -> None:
9 | self.SPARK = SparkSession.builder \
10 | .master("local") \
11 | .appName("Test CsvWriter") \
12 | .getOrCreate()
13 |
14 | def test_invalid_config_gets_caught_during_initialization(self):
15 | context = {}
16 | with self.assertRaises(ValueError) as ve:
17 | CsvWriter(self.SPARK, config=context)
18 | self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing")
19 |
20 | def test_correct_output_path_is_generated(self):
21 | context = {
22 | "acquire": {
23 | "file_path": "/anonymizer/test_data.csv",
24 | "delimiter": ","
25 | },
26 | "anonymize": {
27 | "output_file_path" : "/anonymizer/output"
28 | }
29 | }
30 | input_file_name = "test_data"
31 | output_directory = "/anonymizer/output"
32 | expected = f"{output_directory}/{input_file_name}_anonymized_.csv"
33 | writer = CsvWriter(spark=self.SPARK, config=context)
34 | self.assertEqual(writer.get_output_file_path(), expected)
35 |
36 |
--------------------------------------------------------------------------------
/test_data.csv:
--------------------------------------------------------------------------------
1 | National ID,Phone Number,Address,Remarks
2 | S0000001I,+65 91264944,112 Bedok,A typical email id would look something like test@sample.com
3 | S00000dfs,+65 91264944,112 Bedok,A typical email id would look something like ANC
--------------------------------------------------------------------------------
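test_data.csv exercises the national ID, phone number, and email detectors end to end. A minimal sketch of loading it the way a Spark pipeline would, assuming a header row and comma delimiter:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local").appName("LoadTestData").getOrCreate()
    df = spark.read.csv("test_data.csv", header=True, sep=",")
    df.show(truncate=False)  # columns: National ID, Phone Number, Address, Remarks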