├── .github
    └── workflows
    │   └── codeql-analysis.yml
├── .gitignore
├── README.md
├── bin
    ├── color_my_terminal.sh
    ├── run_tests.sh
    └── setup_venv_locally.sh
├── config.json
├── docker-compose.yml
├── no_pii_data.csv
├── requirements-dev.txt
├── requirements.txt
├── setup.py
├── src
    ├── __init__.py
    ├── acquire
    │   ├── __init__.py
    │   ├── csv_parser.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── data
    │   │       ├── comma_delimited_file.csv
    │   │       ├── empty.csv
    │   │       ├── missing_comma.csv
    │   │       └── pipe_delimited_file.csv
    │   │   └── test_csv_parser.py
    ├── analyze
    │   ├── __init__.py
    │   ├── detectors
    │   │   ├── __init__.py
    │   │   ├── base_detector.py
    │   │   ├── credit_card_detector.py
    │   │   ├── email_detector.py
    │   │   ├── national_id_detector.py
    │   │   ├── phone_number_detector.py
    │   │   ├── pii_detector.py
    │   │   └── tests
    │   │   │   ├── __init__.py
    │   │   │   ├── test_base_detector.py
    │   │   │   ├── test_credit_card_detector.py
    │   │   │   ├── test_email_detector.py
    │   │   │   ├── test_national_id_detector.py
    │   │   │   ├── test_phone_number_detector.py
    │   │   │   └── test_pii_detector.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── analyzer_result.py
    │   │   ├── regex.py
    │   │   └── tests
    │   │       ├── __init__.py
    │   │       ├── test_analyzer_result.py
    │   │       └── test_regex.py
    ├── anonymize
    │   ├── __init__.py
    │   ├── anonymizer_result.py
    │   ├── drop_anonymizer.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   └── test_drop_anonymizer.py
    ├── constants.py
    ├── dpf_main.py
    ├── report
    │   ├── __init__.py
    │   ├── report_generator.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   └── test_report_generator.py
    ├── tests
    │   ├── __init__.py
    │   ├── config
    │   │   ├── __init__.py
    │   │   └── test_config.json
    │   └── test_dpf_main.py
    └── write
    │   ├── __init__.py
    │   ├── csv_writer.py
    │   └── tests
    │       ├── __init__.py
    │       └── test_csv_writer.py
├── src_spark
    ├── __init__.py
    ├── acquire
    │   ├── __init__.py
    │   ├── csv_parser.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── data
    │   │       ├── comma_delimited_file.csv
    │   │       ├── empty.csv
    │   │       ├── missing_comma.csv
    │   │       └── pipe_delimited_file.csv
    │   │   └── test_csv_parser.py
    ├── analyze
    │   ├── __init__.py
    │   ├── detectors
    │   │   ├── __init__.py
    │   │   ├── base_detector.py
    │   │   ├── credit_card_detector.py
    │   │   ├── email_detector.py
    │   │   ├── national_id_detector.py
    │   │   ├── phone_number_detector.py
    │   │   ├── pii_detector.py
    │   │   └── tests
    │   │   │   ├── __init__.py
    │   │   │   └── test_pii_detector.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── analyzer_result.py
    │   │   └── regex.py
    ├── constants.py
    ├── main.py
    ├── report
    │   ├── __init__.py
    │   ├── report_generator.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   └── test_report_generator.py
    └── write
    │   ├── __init__.py
    │   ├── csv_writer.py
    │   └── tests
    │       ├── __init__.py
    │       └── test_csv_writer.py
└── test_data.csv


/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '30 17 * * 5'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 | 
39 |     steps:
40 |     - name: Checkout repository
41 |       uses: actions/checkout@v2
42 | 
43 |     # Initializes the CodeQL tools for scanning.
44 |     - name: Initialize CodeQL
45 |       uses: github/codeql-action/init@v1
46 |       with:
47 |         languages: ${{ matrix.language }}
48 |         # If you wish to specify custom queries, you can do so here or in a config file.
49 |         # By default, queries listed here will override any specified in a config file.
50 |         # Prefix the list here with "+" to use these queries and those in the config file.
51 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 | 
53 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
54 |     # If this step fails, then you should remove it and run the build manually (see below)
55 |     - name: Autobuild
56 |       uses: github/codeql-action/autobuild@v1
57 | 
58 |     # ℹ️ Command-line programs to run using the OS shell.
59 |     # 📚 https://git.io/JvXDl
60 | 
61 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 |     #    and modify them (or add more) to build your code if your project
63 |     #    uses a compiled language
64 | 
65 |     #- run: |
66 |     #   make bootstrap
67 |     #   make release
68 | 
69 |     - name: Perform CodeQL Analysis
70 |       uses: github/codeql-action/analyze@v1
71 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.vscode
 2 | /.idea
 3 | __pycache__
 4 | *.pyc
 5 | /.venv
 6 | /venv
 7 | /output
 8 | /dist
 9 | /build/lib
10 | .pytest_cache
11 | .coverage
12 | pyspark_output
13 | pyspark_config.json
14 | *.csv
15 | generate_fake_data.py
16 | scratchpad.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Data Protection Framework
 2 | Data Protection Framework is a python library/command line application for identification, anonymization and de-anonymization of Personally Identifiable Information data.
 3 | 
 4 | The framework aims to work on a two-fold principle for detecting PII:
 5 | 1. Using RegularExpressions using a pattern
 6 | 2. Using NLP for detecting NER (Named Entity Recognitions)
 7 |  
 8 | ## Features and Current Status
 9 | 
10 | ### Completed
11 |  * Following Global detectors have been completed:
12 |    * [x] EMAIL_ADDRESS :  An email address identifies the mailbox that emails are sent to or from. The maximum length of the domain name is 255 characters, and the maximum length of the local-part is 64 characters.
13 |    * [x] CREDIT_CARD_NUMBER : A credit card number is 12 to 19 digits long. They are used for payment transactions globally.
14 |  
15 |  * Following detectors specific to Singapore have been completed:
16 |    * [x] PHONE_NUMBER : A telephone number.
17 |    * [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card.
18 |  
19 |  * Following anonymizers have been added
20 |     * [x] Redaction: Deletes all or part of a detected sensitive value.
21 |     * [x] Encryption :  Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
22 | 
23 | ### TO-DO
24 | Following features  are part of the backlog with more features coming soon
25 |  * Detectors:
26 |     * [ ] NAME
27 |     * [ ] ADDRESS
28 |  * Anonymizers:
29 |     * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*).
30 |     * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range, 
31 |     or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.")
32 |     * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value.
33 |     
34 |  
35 | You can have a detailed at upcoming features and backlog in this [Github Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true)
36 | 
37 | ## Development setup
38 | 
39 | Clone the [repo](https://github.com/thoughtworks-datakind/anonymizer) and follow the below instructions:  <br/>
40 | _Assuming that $pwd is where you cloned the repo_ 
41 | 2. Setup venv : `./bin/setup_venv_locally.sh`
42 | 3. Activate venv : `source ./.venv/bin/activate`
43 | 4. Install dependencies : `pip install -r requirements-dev.txt`
44 | 
45 | ### Config JSON
46 | An example for the config JSON is located at `<PROJECT_ROOT>/config.json`
47 | ```
48 | {
49 |   "acquire": {
50 |     "file_path": <FILE PATH TO YOUR INPUT CSV>,
51 |     "delimiter": <YOUR CSV DELIMITER>
52 |   },
53 |   "analyze": {
54 | 
55 |   },
56 |   "report" : {
57 |     "location" : <PATH TO YOUR REPORT OUTPUT FOLDER>,
58 |     "level" : <LOG LEVEL>
59 |   },
60 |   "anonymize": {
61 |     "output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
62 |   }
63 | }
64 | ```
65 | 
66 | ### Running Tests
67 | Update this file first `<PROJECT_ROOT>/src/tests/config/test_config.json` \
68 | You can run the tests by triggering shell script located at `<PROJECT_ROOT>/bin/run_tests.sh`
69 | 
70 | ### Trying out on local
71 | 
72 | ##### Anonymizing a delimited csv file
73 | 1. Set up a JSON config file similar to the one seen at the project root. 
74 | In the 'acquire' section of the json, populate the input file path and the delimiter.
75 | In the 'report' section, provide the output path, where you want the PII detection report to be generated.
76 | A 'high' level report just calls out which columns have PII attributes.
77 | A 'medium' level report calls out the percentage of PII in each column and the associated PII (email, credit card, etc)type for the same.
78 | 2. Run the main class - `python src/dpf_main.py --config <absolute path of the config file>`
79 | You should see the report being appended to the file named 'report_\<date\>.log' in the output path specified in the 
80 | config file.
81 | 
82 | ### Packaging
83 | Run `python setup.py bdist_wheel` and the `.whl` file will be created in the `dist` folder.
84 | 
85 | ### Spark-submit
86 | To run spark-submit locally, you can run the following command
87 | `spark-submit --py-files dist/SomePackage-*.whl src_spark/main.py --config config.json`
88 | 
89 | 
90 | ### Licensing
91 | Distributed under the MIT license. See ``LICENSE`` for more information.
92 | 
93 | 
94 | ### Contributing
95 | 
96 | You want to help out? _Awesome_! 
97 | 
98 | 


--------------------------------------------------------------------------------
/bin/color_my_terminal.sh:
--------------------------------------------------------------------------------
1 | export "PS1=${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\] \$ "


--------------------------------------------------------------------------------
/bin/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | project_path=$(dirname $0)/..
4 | 
5 | export PYTHONPATH=$project_path
6 | 
7 | coverage run --source='./src' --omit='*/tests/*' -m unittest discover .
8 | coverage report -m


--------------------------------------------------------------------------------
/bin/setup_venv_locally.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | project_path=$(dirname $0)/..
 4 | 
 5 | cd ${project_path}
 6 | export PYTHONPATH=${project_path}
 7 | 
 8 | echo "$header: Creating virtual environment."
 9 | python3 -m venv ${project_path}/.venv
10 | source ${project_path}/.venv/bin/activate
11 | 
12 | curl https://bootstrap.pypa.io/get-pip.py | python
13 | pip install -r requirements-dev.txt


--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "acquire": {
 3 |     "file_path": "./test_data.csv",
 4 |     "delimiter": ","
 5 |   },
 6 |   "analyze": {},
 7 |   "report": {
 8 |     "location": "./output",
 9 |     "level": "medium"
10 |   },
11 |   "anonymize": {
12 |     "output_file_path": "./output"
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '2'
 2 | 
 3 | services:
 4 |   spark-master:
 5 |     image: docker.io/bitnami/spark:3.1.2
 6 |     environment:
 7 |       - SPARK_MODE=master
 8 |       - SPARK_RPC_AUTHENTICATION_ENABLED=no
 9 |       - SPARK_RPC_ENCRYPTION_ENABLED=no
10 |       - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
11 |       - SPARK_SSL_ENABLED=no
12 |     ports:
13 |       - '8080:8080'
14 |       - '7077:7077'
15 |     networks:
16 |       - spark
17 |   spark-worker-1:
18 |     image: docker.io/bitnami/spark:3.1.2
19 |     environment:
20 |       - SPARK_MODE=worker
21 |       - SPARK_MASTER_URL=spark://spark:7077
22 |       - SPARK_WORKER_MEMORY=1G
23 |       - SPARK_WORKER_CORES=1
24 |       - SPARK_RPC_AUTHENTICATION_ENABLED=no
25 |       - SPARK_RPC_ENCRYPTION_ENABLED=no
26 |       - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
27 |       - SPARK_SSL_ENABLED=no
28 |     networks:
29 |       - spark
30 |     depends_on:
31 |       - spark-master
32 | 
33 | networks:
34 |   spark:
35 |     driver: bridge


--------------------------------------------------------------------------------
/no_pii_data.csv:
--------------------------------------------------------------------------------
1 | Address,Remarks
2 | 112 Bedok,Good
3 | 112 Bedok,Average


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | 
3 | coverage==5.5
4 | pytest==6.2.5
5 | freezegun==1.1.0
6 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | appnope==0.1.2
 2 | attrs==21.2.0
 3 | backcall==0.2.0
 4 | coverage==5.5
 5 | debugpy==1.4.3
 6 | decorator==5.1.0
 7 | entrypoints==0.3
 8 | Faker==8.14.0
 9 | freezegun==1.1.0
10 | iniconfig==1.1.1
11 | ipykernel==6.4.1
12 | ipython==7.27.0
13 | ipython-genutils==0.2.0
14 | jedi==0.18.0
15 | jupyter-client==7.0.3
16 | jupyter-core==4.8.1
17 | matplotlib-inline==0.1.3
18 | nest-asyncio==1.5.1
19 | numpy==1.21.2
20 | packaging==21.0
21 | pandas==1.3.3
22 | parso==0.8.2
23 | pexpect==4.8.0
24 | pickleshare==0.7.5
25 | pluggy==1.0.0
26 | prompt-toolkit==3.0.20
27 | ptyprocess==0.7.0
28 | py==1.10.0
29 | py4j==0.10.9
30 | Pygments==2.10.0
31 | pyparsing==2.4.7
32 | pyspark==3.1.2
33 | pytest==6.2.5
34 | python-dateutil==2.8.2
35 | pytz==2021.1
36 | pyzmq==22.3.0
37 | six==1.16.0
38 | text-unidecode==1.3
39 | toml==0.10.2
40 | tornado==6.1
41 | traitlets==5.1.0
42 | wcwidth==0.2.5
43 | wheel==0.37.0
44 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | setup(
4 |     name = 'SomePackage', 
5 |     version = '0.1', 
6 |     packages = find_packages()
7 | )


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/__init__.py


--------------------------------------------------------------------------------
/src/acquire/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/__init__.py


--------------------------------------------------------------------------------
/src/acquire/csv_parser.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from src.constants import FILE_PATH
 4 | 
 5 | 
 6 | class CsvParser:
 7 | 
 8 |     def __init__(self, config):
 9 |         self.__validate_config(config)
10 |         self.input_path = config["file_path"]
11 |         self.delimiter = config["delimiter"] if "delimiter" in config and config["delimiter"] else ","
12 | 
13 |     def __validate_config(self, config):
14 |         if FILE_PATH not in config or not config[FILE_PATH]:
15 |             raise ValueError("Config 'file_path' needs to be provided for parsing")
16 | 
17 |     def parse(self):
18 |         try:
19 |             df = pd.read_csv(self.input_path, delimiter=self.delimiter)
20 |         except pd.errors.EmptyDataError:
21 |             return pd.DataFrame({})
22 |         
23 |         if df.isnull().values.any():
24 |             raise ValueError("Dataframe contains NULL values")
25 | 
26 |         return df
27 | 


--------------------------------------------------------------------------------
/src/acquire/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/tests/__init__.py


--------------------------------------------------------------------------------
/src/acquire/tests/data/comma_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name,ssn
2 | Lisa Beard,557-39-2479


--------------------------------------------------------------------------------
/src/acquire/tests/data/empty.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/acquire/tests/data/empty.csv


--------------------------------------------------------------------------------
/src/acquire/tests/data/missing_comma.csv:
--------------------------------------------------------------------------------
1 | name,ssn,age
2 | Lisa Beard,557-39-2479,33
3 | John Sohn,33


--------------------------------------------------------------------------------
/src/acquire/tests/data/pipe_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name|ssn
2 | Lisa Beard|557-39-2479


--------------------------------------------------------------------------------
/src/acquire/tests/test_csv_parser.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import os
 3 | import pandas as pd
 4 | from src.acquire.csv_parser import CsvParser
 5 | 
 6 | 
 7 | class TestCsvParser(TestCase):
 8 | 
 9 |     def setUp(self):
10 |         self.current_dir = os.path.dirname(os.path.realpath(__file__))
11 | 
12 |     def test_invalid_config_gets_caught_during_initialization(self):
13 |         context = {}
14 |         with self.assertRaises(ValueError) as ve:
15 |             CsvParser(config=context)
16 |         self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")
17 | 
18 |     def test_if_valid_csv_file_provided_returns_pandas_df(self):
19 |         file_path = "{}/data/comma_delimited_file.csv".format(self.current_dir)
20 |         config = {"file_path" : file_path, "delimiter" : ""}
21 |         test_csv_parser_valid_file_path = CsvParser(config=config)
22 |         expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
23 |         actual = test_csv_parser_valid_file_path.parse()
24 |         self.assertEqual(actual.to_dict(), expected.to_dict())
25 | 
26 |     def test_if_valid_csv_file_with_different_delimiter_provided_returns_pandas_df(self):
27 |         file_path = "{}/data/pipe_delimited_file.csv".format(self.current_dir)
28 |         config = {"file_path" : file_path, "delimiter" : "|"}
29 |         test_csv_parser_valid_file_path = CsvParser(config=config)
30 |         expected = pd.DataFrame({"name": ["Lisa Beard"], "ssn": ["557-39-2479"]})
31 |         actual = test_csv_parser_valid_file_path.parse()
32 |         self.assertEqual(actual.to_dict(), expected.to_dict())
33 | 
34 |     def test_if_empty_csv_file_returns_empty_pandas_df(self):
35 |         file_path = "{}/data/empty.csv".format(self.current_dir)
36 |         config = {"file_path" : file_path}
37 |         test_csv_parser_valid_file_path = CsvParser(config=config)
38 |         expected = pd.DataFrame({})
39 |         actual = test_csv_parser_valid_file_path.parse()
40 |         self.assertEqual(actual.to_dict(), expected.to_dict())
41 | 
42 |     def test_if_error_is_raised_if_df_has_null_values(self):
43 |         file_path = "{}/data/missing_comma.csv".format(self.current_dir)
44 |         config = {"file_path" : file_path}
45 |         with self.assertRaises(ValueError) as ve:
46 |             CsvParser(config=config).parse()
47 |         self.assertEqual(str(ve.exception), "Dataframe contains NULL values")
48 | 


--------------------------------------------------------------------------------
/src/analyze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/__init__.py


--------------------------------------------------------------------------------
/src/analyze/detectors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/detectors/__init__.py


--------------------------------------------------------------------------------
/src/analyze/detectors/base_detector.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | from src.analyze.utils.analyzer_result import AnalyzerResult
 5 | 
 6 | 
 7 | class BaseDetector(ABC):
 8 | 
 9 |     def __init__(self):
10 |         self.name = None
11 |         self.pattern = None
12 | 
13 |     @abstractmethod
14 |     def get_pattern(self):
15 |         pass
16 | 
17 |     @abstractmethod
18 |     def get_name(self):
19 |         pass
20 | 
21 |     def validate(self, text):
22 |         return True
23 | 
24 |     def execute(self, text):
25 |         results = []
26 |         matches = re.finditer(self.get_pattern(), text)
27 |         for match in matches:
28 |             matched_string = match.string[match.start(): match.end()]
29 |             if self.validate(matched_string):
30 |                 results.append(AnalyzerResult(matched_string, self.get_name(), match.start(), match.end()))
31 |         return results
32 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/credit_card_detector.py:
--------------------------------------------------------------------------------
 1 | from src.analyze.detectors.base_detector import BaseDetector
 2 | from src.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class CreditCardDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "CREDIT_CARD"
 9 |         self.pattern = RegEx().literal("4").any_digit().num_occurrences(3).pipe() \
10 |             .literal("5").range(0, 5).any_digit().num_occurrences(2).pipe() \
11 |             .literal("6").any_digit().num_occurrences(3).pipe() \
12 |             .literal("1").any_digit().num_occurrences(3).pipe() \
13 |             .literal("3").any_digit().num_occurrences(3) \
14 |             .one_of("- ").zero_or_one_occurrences() \
15 |             .any_digit().range_occurrences(3, 4) \
16 |             .one_of("- ").zero_or_one_occurrences() \
17 |             .any_digit().range_occurrences(3, 4) \
18 |             .one_of("- ").zero_or_one_occurrences() \
19 |             .any_digit().range_occurrences(3, 5).build()
20 | 
21 |     def get_name(self):
22 |         return self.name
23 | 
24 |     def get_pattern(self):
25 |         return self.pattern
26 | 
27 |     def validate(self, text):
28 |         def digits_of(n):
29 |             return [int(d) for d in str(n)]
30 | 
31 |         digits = digits_of(text.replace('-', '').replace(' ', ''))
32 |         odd_digits = digits[-1::-2]
33 |         even_digits = digits[-2::-2]
34 |         checksum = sum(odd_digits)
35 | 
36 |         for d in even_digits:
37 |             checksum += sum(digits_of(d * 2))
38 | 
39 |         return checksum % 10 == 0
40 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/email_detector.py:
--------------------------------------------------------------------------------
 1 | from src.analyze.detectors.base_detector import BaseDetector
 2 | from src.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class EmailDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "EMAIL"
 9 |         self.pattern = RegEx().one_of("a-zA-Z0-9_.+-").one_or_more_occurrences().literal("@").one_of("a-zA-Z0-9-")\
10 |             .one_or_more_occurrences().literal("\\.").one_of("a-zA-Z0-9-.").one_or_more_occurrences().build()
11 | 
12 |     def get_name(self):
13 |         return self.name
14 | 
15 |     def get_pattern(self):
16 |         return self.pattern
17 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/national_id_detector.py:
--------------------------------------------------------------------------------
 1 | from src.analyze.detectors.base_detector import BaseDetector
 2 | from src.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class NationalIdDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "NRIC"
 9 |         self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
10 | 
11 |     def get_name(self):
12 |         return self.name
13 | 
14 |     def get_pattern(self):
15 |         return self.pattern
16 | 
17 |     def __get_offset(self, text):
18 |         return 4 if text in "TG" else 0
19 | 
20 |     def __is_NRIC(self, text, loc):
21 |         if text[0] in "ST":
22 |             return "JZIHGFEDCBA"[loc] == text[8]
23 |         return False
24 | 
25 |     def __is_FIN(self, text, loc):
26 |         if text[0] in "FG":
27 |             return "XWUTRQPNMLK"[loc] == text[8]
28 |         return False
29 | 
30 |     def validate(self, text):
31 |         weight = self.__get_weight(text)
32 |         first_character = text[0]
33 |         offset = self.__get_offset(first_character)
34 |         loc = (offset + weight) % 11
35 |         return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)
36 | 
37 |     def __get_weight(self, text):
38 |         numbers = [int(digit) for digit in list(text[1:-1])]
39 |         for index, i in enumerate(numbers):
40 |             if index == 0:
41 |                 numbers[index] *= 2
42 |             numbers[index] *= 8 - index
43 |         return sum(numbers)
44 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/phone_number_detector.py:
--------------------------------------------------------------------------------
 1 | from src.analyze.detectors.base_detector import BaseDetector
 2 | from src.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class PhoneNumberDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "PHONE_NUMBER"
 9 |         regex_pipe = RegEx().pipe().build()
10 | 
11 |         regex_with_country_code_and_no_space = '(\\+65?\\s?[689]\\d{7})'
12 |         regex_with_country_code_and_single_space = '(\\+65?\\s?[689]\\d{3} \\d{4})'
13 |         regex_no_country_code_and_no_space = '([689]\\d{7})'
14 |         regex_no_country_code_and_single_space = '([689]\\d{3} \\d{4})'
15 |         regex_with_country_code_in_brackets_and_no_space = '([(]65[)]\\s?[689]\\d{7})'
16 |         regex_with_country_code_in_brackets_and_single_space = '([(]65[)]\\s?[689]\\d{3} \\d{4})'
17 | 
18 |         self.pattern = regex_with_country_code_and_no_space + regex_pipe + \
19 |             regex_with_country_code_and_single_space + regex_pipe + \
20 |             regex_no_country_code_and_no_space + regex_pipe + \
21 |             regex_no_country_code_and_single_space + regex_pipe + \
22 |             regex_with_country_code_in_brackets_and_no_space + regex_pipe + \
23 |             regex_with_country_code_in_brackets_and_single_space
24 | 
25 |     def get_name(self):
26 |         return self.name
27 | 
28 |     def get_pattern(self):
29 |         return self.pattern
30 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/pii_detector.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import pkgutil
 3 | import inspect
 4 | import sys
 5 | 
 6 | import pandas as pd
 7 | 
 8 | import src.analyze.detectors
 9 | from src.analyze.detectors.base_detector import BaseDetector
10 | from src.anonymize.drop_anonymizer import DropAnonymizer
11 | from src.anonymize.anonymizer_result import AnonymizerResult
12 | 
13 | 
14 | #TODO : refactor this to use the annotations instead of the module path.
15 | class PIIDetector:
16 | 
17 |     def __init__(self):
18 |         self.detectors = self.__get_detector_instances()
19 | 
20 |     def __get_detector_modules(self):
21 |         modules = [modname for importer, modname, ispkg in
22 |                         pkgutil.walk_packages(path=src.analyze.detectors.__path__,
23 |                                               prefix=src.analyze.detectors.__name__+".")
24 |                    if "tests" not in modname]
25 |         return modules
26 | 
27 |     def __get_detector_instances(self):
28 |         modules = self.__get_detector_modules()
29 |         detectors = []
30 |         for module in modules:
31 |             importlib.import_module(module)
32 |             classes = inspect.getmembers(sys.modules[module], inspect.isclass)
33 |             for class_name, class_type in classes:
34 |                 if class_name != "BaseDetector" and issubclass(class_type, BaseDetector):
35 |                     detectors.append(class_type())
36 |         return detectors
37 | 
38 |     #TODO : Should we make this static?
39 |     def analyze_and_redact(self, text: str):
40 |         analyzer_results = []
41 |         for detector in self.detectors:
42 |             analyzer_results = analyzer_results + detector.execute(text)
43 |         redacted_text = DropAnonymizer.redact(text, analyzer_results)
44 |         return AnonymizerResult(redacted_text, analyzer_results)
45 | 
46 |     def __contains_pii(self, results):
47 |         for result in results:
48 |             if len(result.analyzer_results) > 0:
49 |                 return True
50 |         return False
51 | 
52 |     def analyze_data_frame(self, input_data_frame):
53 |         result_df = input_data_frame.applymap(self.analyze_and_redact)
54 |         return result_df.applymap(lambda x: x.analyzer_results), result_df.applymap(lambda x: x.redacted_text)
55 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/detectors/tests/__init__.py


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_base_detector.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | 
 4 | from src.analyze.detectors.base_detector import BaseDetector
 5 | from src.analyze.utils.analyzer_result import AnalyzerResult
 6 | from src.analyze.utils.regex import RegEx
 7 | 
 8 | 
 9 | class TestBaseDetector(TestCase):
10 | 
11 |     def setUp(self):
12 | 
13 |         class TestClass(BaseDetector):
14 |             def get_pattern(self):
15 |                 return RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
16 | 
17 |             def get_name(self):
18 |                 return "NRIC"
19 | 
20 |         self.test_class = TestClass()
21 | 
22 |     def test_execute_calls_match_and_validate(self):
23 |         results = self.test_class.execute("First President of Singapore NRIC was S0000001I")
24 |         self.assertEqual(len(results), 1)
25 |         self.assertEqual(AnalyzerResult("S0000001I", "NRIC", 38, 47), results[0])
26 | 
27 |     def test_execute_returns_all_matches_when_more_than_one(self):
28 |         results = self.test_class.execute("First President of Singapore NRIC was S0000001I and the second president's was T0000001R")
29 |         self.assertEqual(len(results), 2)
30 |         self.assertCountEqual([AnalyzerResult("S0000001I", "NRIC", 38, 47),AnalyzerResult("T0000001R", "NRIC", 79, 88)], results)
31 | 
32 |     def test_execute_returns_empty_list_when_no_matches(self):
33 |         results = self.test_class.execute("First President of Singapore NRIC was ABC and the second president's was DEF")
34 |         self.assertEqual(len(results), 0)
35 | 
36 |     def test_get_name_and_get_patterns_are_abstract(self):
37 |         with self.assertRaises(TypeError) as te:
38 |             BaseDetector()
39 |         self.assertEqual(str(te.exception), "Can't instantiate abstract class BaseDetector with abstract methods get_name, get_pattern")
40 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_credit_card_detector.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from src.analyze.detectors.credit_card_detector import CreditCardDetector
 4 | 
 5 | 
 6 | class TestCreditCardDetector(TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.credit_card_detector = CreditCardDetector()
10 | 
11 |     def test_default_property_values_are_correct(self):
12 |         self.assertEqual("CREDIT_CARD", self.credit_card_detector.name)
13 |         self.assertEqual('4\\d{3}|5[0-5]\\d{2}|6\\d{3}|1\\d{3}|3\\d{3}[- ]?\\d{3,4}[- ]?\\d{3,4}[- ]?\\d{3,5}',
14 |                          self.credit_card_detector.pattern)
15 | 
16 |     def test_valid_credit_cards(self):
17 |         self.assertTrue(self.credit_card_detector.validate("4012888888881881"))
18 |         self.assertTrue(self.credit_card_detector.validate("4012-8888-8888-1881"))
19 |         self.assertTrue(self.credit_card_detector.validate("4012 8888 8888 1881"))
20 | 
21 |     def test_valid_airplus_credit_card(self):
22 |         self.assertTrue(self.credit_card_detector.validate('122000000000003'))
23 | 
24 |     def test_valid_amex_credit_card(self):
25 |         self.assertTrue(self.credit_card_detector.validate('371449635398431'))
26 | 
27 |     def test_valid_cartebleue_credit_card(self):
28 |         self.assertTrue(self.credit_card_detector.validate('5555555555554444'))
29 | 
30 |     def test_valid_dankort_credit_card(self):
31 |         self.assertTrue(self.credit_card_detector.validate('5019717010103742'))
32 | 
33 |     def test_valid_diners_credit_card(self):
34 |         self.assertTrue(self.credit_card_detector.validate('30569309025904'))
35 | 
36 |     def test_valid_discover_credit_card(self):
37 |         self.assertTrue(self.credit_card_detector.validate('6011000400000000'))
38 | 
39 |     def test_valid_jcb_credit_card(self):
40 |         self.assertTrue(self.credit_card_detector.validate('3528000700000000'))
41 | 
42 |     def test_valid_maestro_credit_card(self):
43 |         self.assertTrue(self.credit_card_detector.validate('6759649826438453'))
44 | 
45 |     def test_valid_mastercard_credit_card(self):
46 |         self.assertTrue(self.credit_card_detector.validate('5555555555554444'))
47 | 
48 |     def test_valid_visa_credit_card(self):
49 |         self.assertTrue(self.credit_card_detector.validate('4111111111111111'))
50 | 
51 |     def test_valid_visa_debit_credit_card(self):
52 |         self.assertTrue(self.credit_card_detector.validate('4111111111111111'))
53 | 
54 |     def test_valid_visa_electron_credit_card(self):
55 |         self.assertTrue(self.credit_card_detector.validate('4917300800000000'))
56 | 
57 |     def test_valid_visa_purchasing_credit_card(self):
58 |         self.assertTrue(self.credit_card_detector.validate('4484070000000000'))
59 | 
60 |     def test_invalid_credit_card(self):
61 |         self.assertFalse(self.credit_card_detector.validate('4012-8888-8888-1882'))
62 | 
63 |     def test_invalid_diners_card(self):
64 |         self.assertFalse(self.credit_card_detector.validate('36168002586008'))
65 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_email_detector.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from unittest.mock import patch
 3 | 
 4 | from src.analyze.detectors.email_detector import EmailDetector
 5 | 
 6 | 
 7 | class TestEmailDetector(TestCase):
 8 | 
 9 |     def setUp(self):
10 |         self.email_detector = EmailDetector()
11 | 
12 |     def test_get_name_returns_the_valid_detector_name(self):
13 |         self.assertEqual(self.email_detector.get_name(), "EMAIL")
14 | 
15 |     def test_get_pattern_returns_compiled_regex(self):
16 |         actual_value = self.email_detector.get_pattern()
17 |         return_value = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
18 |         self.assertEqual(return_value, actual_value)
19 | 
20 |     def test_valid_email_gets_detected_correctly(self):
21 |         self.assertEqual(len(self.email_detector.execute("abc@hotmail.com")), 1)
22 | 
23 |     def test_invalid_email_does_not_get_detected(self):
24 |         self.assertEqual(len(self.email_detector.execute("@hotmail.com")), 0)
25 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_national_id_detector.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from src.analyze.detectors.national_id_detector import NationalIdDetector
 4 | 
 5 | 
 6 | class TestNationalIdDetector(TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.national_id_detector = NationalIdDetector()
10 | 
11 |     def test_default_property_values_are_correct(self):
12 |         self.assertEqual("NRIC", self.national_id_detector.name)
13 |         self.assertEqual("[STFG]\\d{7}[A-Z]", self.national_id_detector.pattern)
14 | 
15 |     def test_execute_return_true_when_valid_old_NRIC(self):
16 |         self.assertTrue(self.national_id_detector.validate("S0000001I"))
17 | 
18 |     def test_execute_return_true_when_valid_old_FIN(self):
19 |         self.assertTrue(self.national_id_detector.validate("F0000001U"))
20 | 
21 |     def test_execute_return_true_when_valid_new_NRIC(self):
22 |         self.assertTrue(self.national_id_detector.validate("T0000001E"))
23 | 
24 |     def test_execute_return_true_when_valid_new_FIN(self):
25 |         self.assertTrue(self.national_id_detector.validate("G0000001P"))
26 | 
27 |     def test_execute_return_false_when_invalid_old_NRIC(self):
28 |         self.assertFalse(self.national_id_detector.validate("S0000001K"))
29 | 
30 |     def test_execute_return_false_when_invalid_new_NRIC(self):
31 |         self.assertFalse(self.national_id_detector.validate("F0000001V"))
32 | 
33 |     def test_execute_return_false_when_invalid_old_FIN(self):
34 |         self.assertFalse(self.national_id_detector.validate("T0000001F"))
35 | 
36 |     def test_execute_return_false_when_invalid_new_FIN(self):
37 |         self.assertFalse(self.national_id_detector.validate("G0000001Q"))
38 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_phone_number_detector.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from src.analyze.detectors.phone_number_detector import PhoneNumberDetector
 4 | from src.analyze.utils.analyzer_result import AnalyzerResult
 5 | 
 6 | 
 7 | class TestPhoneNumberDetector(TestCase):
 8 | 
 9 |     def setUp(self):
10 |         self.phone_number_detector = PhoneNumberDetector()
11 | 
12 |     def test_default_property_values_are_correct(self):
13 |         self.assertEqual("PHONE_NUMBER", self.phone_number_detector.name)
14 |         self.assertEqual('(\\+65?\\s?[689]\\d{7})|'
15 |                          '(\\+65?\\s?[689]\\d{3} \\d{4})|'
16 |                          '([689]\\d{7})|'
17 |                          '([689]\\d{3} \\d{4})|'
18 |                          '([(]65[)]\\s?[689]\\d{7})|'
19 |                          '([(]65[)]\\s?[689]\\d{3} \\d{4})',
20 |                          self.phone_number_detector.pattern)
21 | 
22 |     def test_invalid_phone_number_does_not_get_detected(self):
23 |         self.assertEqual(len(self.phone_number_detector.execute("S0000001I")), 0)
24 | 
25 |     def __assert_single_result(self, text_to_be_tested, start, end):
26 |         actual = self.phone_number_detector.execute(text_to_be_tested)
27 |         expected = AnalyzerResult(text_to_be_tested, "PHONE_NUMBER", start, end)
28 |         self.assertEqual(len(actual), 1)
29 |         self.assertEqual(expected, actual[0])
30 | 
31 |     def test_valid_phone_number_gets_detected_correctly(self):
32 |         self.__assert_single_result("+65 65781234", 0, 12)
33 |         self.__assert_single_result("+65 85781234", 0, 12)
34 |         self.__assert_single_result("+65 95781234", 0, 12)
35 | 
36 |         self.__assert_single_result("+65 6578 1234", 0, 13)
37 |         self.__assert_single_result("+65 8578 1234", 0, 13)
38 |         self.__assert_single_result("+65 9578 1234", 0, 13)
39 | 
40 |         self.__assert_single_result("65781234", 0, 8)
41 |         self.__assert_single_result("85781234", 0, 8)
42 |         self.__assert_single_result("95781234", 0, 8)
43 | 
44 |         self.__assert_single_result("6578 1234", 0, 9)
45 |         self.__assert_single_result("8578 1234", 0, 9)
46 |         self.__assert_single_result("9578 1234", 0, 9)
47 | 
48 |         self.__assert_single_result("(65) 65781234", 0, 13)
49 |         self.__assert_single_result("(65) 85781234", 0, 13)
50 |         self.__assert_single_result("(65) 95781234", 0, 13)
51 | 
52 |         self.__assert_single_result("(65) 6578 1234", 0, 14)
53 |         self.__assert_single_result("(65) 8578 1234", 0, 14)
54 |         self.__assert_single_result("(65) 9578 1234", 0, 14)
55 | 


--------------------------------------------------------------------------------
/src/analyze/detectors/tests/test_pii_detector.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | from unittest import TestCase
  3 | 
  4 | from src.analyze.detectors.pii_detector import PIIDetector
  5 | from src.analyze.utils.analyzer_result import AnalyzerResult
  6 | from src.anonymize.anonymizer_result import AnonymizerResult
  7 | 
  8 | 
  9 | class TestPIIDetector(TestCase):
 10 | 
 11 |     def setUp(self):
 12 |         self.pii_detector = PIIDetector()
 13 | 
 14 |     def test_should_detect_and_redact_nric_in_text(self):
 15 |         actual = self.pii_detector.analyze_and_redact("First President of Singapore NRIC was S0000001I")
 16 |         expected = AnonymizerResult("First President of Singapore NRIC was ", [AnalyzerResult("S0000001I", "NRIC", 38, 47)])
 17 |         self.assertEqual(actual, expected)
 18 | 
 19 |     def test_should_detect_and_redact_email_in_text(self):
 20 |         actual = self.pii_detector.analyze_and_redact("A typical email id would look something like test@sample.com")
 21 |         expected = AnonymizerResult("A typical email id would look something like ",
 22 |                                     [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)])
 23 |         self.assertEqual(actual, expected)
 24 | 
 25 |     def test_should_detect_and_redact_phone_in_text(self):
 26 |         actual = self.pii_detector.analyze_and_redact("Some examples of phone numbers are +65 62345678")
 27 |         expected = AnonymizerResult("Some examples of phone numbers are ",
 28 |                                     [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 29 |         self.assertEqual(actual, expected)
 30 | 
 31 |     def test_should_detect_and_redact_all_pii_fields_in_text(self):
 32 |         actual = self.pii_detector.analyze_and_redact("""First President of Singapore NRIC was S0000001I.
 33 |                                          A typical email id would look something like test@sample.com""")
 34 |         expected_redacted_text = """First President of Singapore NRIC was .
 35 |                                          A typical email id would look something like """
 36 | 
 37 |         expected = AnonymizerResult(expected_redacted_text, [AnalyzerResult("test@sample.com", "EMAIL", 135, 150),
 38 |                                                              AnalyzerResult("S0000001I", "NRIC", 38, 47)])
 39 |         self.assertEqual(actual, expected)
 40 | 
 41 |     def test_analyze_returns_returns_same_text_and_no_results_when_no_PII_fields(self):
 42 |         input_text = """First President of Singapore NRIC was ABC.
 43 |                                          A typical email id would look something like test"""
 44 |         actual = self.pii_detector.analyze_and_redact(input_text)
 45 |         expected = AnonymizerResult(input_text, [])
 46 |         self.assertEqual(actual, expected)
 47 | 
 48 |     def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
 49 |         test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I",
 50 |                                                     "A typical email id would look something like test@sample.com"],
 51 |                                         "phone number": ["Some examples of phone numbers are +65 62345678",
 52 |                                                          "Some examples of phone numbers are +65 62345678"]})
 53 | 
 54 |         actual, _ = self.pii_detector.analyze_data_frame(test_data_frame)
 55 | 
 56 |         expected_data_frame = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)],
 57 |                                                         [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
 58 |                                             "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)],
 59 |                                                              [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
 60 | 
 61 |         pd.testing.assert_frame_equal(expected_data_frame, actual)
 62 | 
 63 |     def test_analyze_data_frame_runs_analyze_against_each_cell_when_there_are_no_PII_values_returns_empty_data_frame(
 64 |             self):
 65 |         test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was abcde",
 66 |                                                     "A typical email id would look something like test@t"],
 67 |                                         "phone number": ["Some examples of phone numbers are +34342",
 68 |                                                          "Some examples of phone numbers are +8909"]})
 69 |         expected_report = pd.DataFrame({"summary": [[],[]],
 70 |                                         "phone number": [[],[]]
 71 |                                         })
 72 |         expected_result = pd.DataFrame({"summary": ["First President of Singapore NRIC was abcde",
 73 |                                                     "A typical email id would look something like test@t"],
 74 |                                         "phone number": ["Some examples of phone numbers are +34342",
 75 |                                                          "Some examples of phone numbers are +8909"]})
 76 |         actual_report, actual_result = self.pii_detector.analyze_data_frame(test_data_frame)
 77 |         
 78 |         pd.testing.assert_frame_equal(expected_report, actual_report)
 79 |         pd.testing.assert_frame_equal(expected_result, actual_result)
 80 | 
 81 |     def test_analyze_data_frame_runs_analyze_only_on_cells_with_a_PII_value(self):
 82 |         test_data_frame = pd.DataFrame({"summary": ["First President of Singapore NRIC was S0000001I",
 83 |                                                     "A typical email id would look something like test@sample.com"],
 84 |                                         "remarks": ["No sensitive data",
 85 |                                                          "No sensitive data"]})
 86 | 
 87 |         actual_report, actual_result = self.pii_detector.analyze_data_frame(test_data_frame)
 88 | 
 89 |         expected_report = pd.DataFrame({"summary": [[AnalyzerResult("S0000001I", "NRIC", 38, 47)],
 90 |                                                         [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
 91 |                                         "remarks": [[],[]]
 92 |                                         })
 93 |                                             
 94 |         expected_result = pd.DataFrame({"summary": ["First President of Singapore NRIC was ",
 95 |                                                     "A typical email id would look something like "],
 96 |                                         "remarks": ["No sensitive data",
 97 |                                                          "No sensitive data"]})
 98 | 
 99 |         pd.testing.assert_frame_equal(expected_report, actual_report)
100 |         pd.testing.assert_frame_equal(expected_result, actual_result)


--------------------------------------------------------------------------------
/src/analyze/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/utils/__init__.py


--------------------------------------------------------------------------------
/src/analyze/utils/analyzer_result.py:
--------------------------------------------------------------------------------
 1 | class AnalyzerResult:
 2 | 
 3 |     def __init__(self, text, type, start, end):
 4 |         self.text = text
 5 |         self.type = type
 6 |         self.start = start
 7 |         self.end = end
 8 | 
 9 |     def __eq__(self, other):
10 |         return type(self) == type(other) and self.text == other.text and self.type == other.type \
11 |                and self.start == other.start and self.end == other.end
12 | 
13 |     def __repr__(self):
14 |         return self.__str__()
15 | 
16 |     def __str__(self):
17 |         return "Text {} at position ({},{}) was identified as {}".format(self.text, self.start, self.end, self.type)
18 | 
19 |     def detector(self):
20 |         return self.type
21 | 


--------------------------------------------------------------------------------
/src/analyze/utils/regex.py:
--------------------------------------------------------------------------------
 1 | class RegEx:
 2 | 
 3 |     def __init__(self):
 4 |         self.regex_string = ""
 5 | 
 6 |     def __is_numeric(self, value):
 7 |         return isinstance(value, int)
 8 | 
 9 |     def __is_single_character_value(self, value):
10 |         return len(str(value)) == 1
11 | 
12 |     def __validate_range(self, start, end):
13 |         if start > end:
14 |             raise ValueError("Range start should be less than end")
15 | 
16 |     def boundary(self):
17 |         self.regex_string += "\\b"
18 |         return self
19 | 
20 |     def pipe(self):
21 |         self.regex_string += "|"
22 |         return self
23 | 
24 |     def range(self, from_char, to_char):
25 |         if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
26 |             raise ValueError("Range boundaries should be single character")
27 | 
28 |         self.__validate_range(from_char, to_char)
29 |         self.regex_string += "[{}-{}]".format(from_char, to_char)
30 |         return self
31 | 
32 |     def one_of(self, character_set):
33 |         if character_set is None or character_set == "":
34 |             raise ValueError("Character Set should not be empty")
35 | 
36 |         self.regex_string += "[" + character_set + "]"
37 |         return self
38 | 
39 |     def any_digit(self):
40 |         self.regex_string += "\\d"
41 |         return self
42 | 
43 |     def num_occurrences(self, number):
44 |         if number < 1:
45 |             raise ValueError
46 | 
47 |         self.regex_string += "{" + str(number) + "}"
48 |         return self
49 | 
50 |     def one_or_more_occurrences(self):
51 |         self.regex_string += "+"
52 |         return self
53 | 
54 |     def zero_or_more_occurrences(self):
55 |         self.regex_string += "*"
56 |         return self
57 | 
58 |     def zero_or_one_occurrences(self):
59 |         self.regex_string += "?"
60 |         return self
61 | 
62 |     def range_occurrences(self, start, end):
63 |         if not self.__is_numeric(start) or not self.__is_numeric(end):
64 |             raise TypeError("Range should be integers")
65 | 
66 |         self.__validate_range(start, end)
67 |         self.regex_string += "{" + str(start) + "," + str(end) + "}"
68 |         return self
69 | 
70 |     def literal(self, literal):
71 |         self.regex_string += literal
72 |         return self
73 | 
74 |     def build(self):
75 |         return self.regex_string
76 | 


--------------------------------------------------------------------------------
/src/analyze/utils/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/analyze/utils/tests/__init__.py


--------------------------------------------------------------------------------
/src/analyze/utils/tests/test_analyzer_result.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from src.analyze.utils.analyzer_result import AnalyzerResult
 4 | 
 5 | 
 6 | class TestAnalyzerResult(TestCase):
 7 | 
 8 |     def test_equality(self):
 9 |         expected = AnalyzerResult("text", "type", 0, 10)
10 |         actual = AnalyzerResult("text", "type", 0, 10)
11 |         self.assertEqual(expected, actual)
12 | 
13 |     def test_inequality(self):
14 |         self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("different_text", "type", 0, 10))
15 |         self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "different_type", 0, 10))
16 |         self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "type", 1, 10))
17 |         self.assertNotEqual(AnalyzerResult("text", "type", 0, 10), AnalyzerResult("text", "type", 0, 11))
18 | 
19 |     def test_repr(self):
20 |         expected = "Text sample_data at position (0,10) was identified as type"
21 |         self.assertEqual(AnalyzerResult("sample_data", "type", 0, 10).__repr__(), expected)
22 | 
23 |     def test_str(self):
24 |         expected = "Text sample_data at position (0,10) was identified as type"
25 |         self.assertEqual(str(AnalyzerResult("sample_data", "type", 0, 10)), expected)
26 | 
27 |     def test_get_detector_fetches_detector_type_correctly(self):
28 |         result = AnalyzerResult("text", "EMAIL", 0, 10)
29 |         self.assertEqual(result.detector(), "EMAIL")


--------------------------------------------------------------------------------
/src/analyze/utils/tests/test_regex.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | 
  3 | from src.analyze.utils.regex import RegEx
  4 | 
  5 | 
  6 | class TestRegEx(TestCase):
  7 | 
  8 |     # Testing one_of
  9 |     def test_when_one_of_param_is_empty_throws_error(self):
 10 |         self.assertRaises(ValueError, lambda: RegEx().one_of("").build())
 11 | 
 12 |     def test_when_valid_input_is_passed_one_of_returns_correct_output(self):
 13 |         self.assertEqual("[AB]", RegEx().one_of("AB").build())
 14 |         self.assertEqual("[357]", RegEx().one_of("357").build())
 15 | 
 16 |     # Testing num_occurrences
 17 |     def test_when_non_positive_number_of_occurrences_throws_error(self):
 18 |         self.assertRaises(ValueError, lambda: RegEx().num_occurrences(-7).build())
 19 |         self.assertRaises(ValueError, lambda: RegEx().num_occurrences(0).build())
 20 | 
 21 |     def test_when_valid_input_is_passed_num_occurrences_returns_correct_output(self):
 22 |         self.assertEqual("{7}", RegEx().num_occurrences(7).build())
 23 | 
 24 |     # Testing any_digit
 25 |     def test_when_any_digit_returns_correct_output(self):
 26 |         self.assertEqual("\\d", RegEx().any_digit().build())
 27 | 
 28 |     def __assert_value_error_is_raised(self, fn, msg):
 29 |         with self.assertRaises(ValueError) as ve:
 30 |             fn()
 31 |         self.assertEqual(str(ve.exception), msg)
 32 | 
 33 |     def __assert_type_error_is_raised(self, fn, msg):
 34 |         with self.assertRaises(TypeError) as ve:
 35 |             fn()
 36 |         self.assertEqual(str(ve.exception), msg)
 37 | 
 38 |     # Testing range
 39 |     def test_when_range_is_incomplete(self):
 40 |         single_character = "Range boundaries should be single character"
 41 |         self.__assert_value_error_is_raised(lambda: RegEx().range("", "Z").build(), single_character)
 42 |         self.__assert_value_error_is_raised(lambda: RegEx().range("0", "").build(), single_character)
 43 |         self.__assert_value_error_is_raised(lambda: RegEx().range("01", "9").build(), single_character)
 44 |         self.__assert_value_error_is_raised(lambda: RegEx().range("A", "YZ").build(), single_character)
 45 | 
 46 |     def test_when_invalid_range_boundaries_are_provided(self):
 47 |         less_than_end = "Range start should be less than end"
 48 |         self.__assert_value_error_is_raised(lambda: RegEx().range("B", "A").build(), less_than_end)
 49 |         self.__assert_value_error_is_raised(lambda: RegEx().range("9", "0").build(), less_than_end)
 50 | 
 51 |     def test_when_valid_input_is_passed_range_returns_correct_output(self):
 52 |         self.assertEqual("[A-Z]", RegEx().range("A", "Z").build())
 53 |         self.assertEqual("[0-9]", RegEx().range("0", "9").build())
 54 | 
 55 |     # Testing range_occurrences
 56 |     def test_when_invalid_numeric_range_boundaries_are_provided(self):
 57 |         less_than_end = "Range start should be less than end"
 58 |         self.__assert_value_error_is_raised(lambda: RegEx().range_occurrences(9, 0).build(), less_than_end)
 59 | 
 60 |     def test_when_invalid_input_for_range_occurrences_throws_error(self):
 61 |         range_should_be_integers = "Range should be integers"
 62 |         self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences(1.2, 2).build(), range_should_be_integers)
 63 |         self.__assert_type_error_is_raised(lambda: RegEx().range_occurrences("A", 9).build(), range_should_be_integers)
 64 | 
 65 |     def test_when_valid_input_is_passed_range_occurrences_returns_correct_output(self):
 66 |         self.assertEqual("{0,9}", RegEx().range_occurrences(0, 9).build())
 67 | 
 68 |     # Testing one_or_more_occurrences
 69 |     def test_when_valid_input_is_passed_one_or_more_occurrences_returns_correct_output(self):
 70 |         self.assertEqual("+", RegEx().one_or_more_occurrences().build())
 71 | 
 72 |     # Testing zero_or_more_occurrences
 73 |     def test_when_valid_input_is_passed_zero_or_more_occurrences_returns_correct_output(self):
 74 |         self.assertEqual("*", RegEx().zero_or_more_occurrences().build())
 75 | 
 76 |     # Testing zero_or_one_occurrences
 77 |     def test_when_valid_input_is_passed_zero_or_one_occurrences_returns_correct_output(self):
 78 |         self.assertEqual("?", RegEx().zero_or_one_occurrences().build())
 79 | 
 80 |     # Testing literal
 81 |     def test_when_valid_input_is_passed_literal_returns_correct_output(self):
 82 |         self.assertEqual("@", RegEx().literal("@").build())
 83 | 
 84 |     # Testing boundary
 85 |     def test_boundary(self):
 86 |         self.assertEqual("\\b", RegEx().boundary().build())
 87 | 
 88 |     # Testing complex inputs
 89 |     def test_builds_correct_pattern_for_NRIC(self):
 90 |         self.assertEqual("[AIR]\\d{7}[A-Z]",
 91 |                          RegEx()
 92 |                          .one_of("AIR")
 93 |                          .any_digit()
 94 |                          .num_occurrences(7)
 95 |                          .range("A", "Z")
 96 |                          .build())
 97 | 
 98 |         self.assertEqual("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+",
 99 |                          RegEx()
100 |                          .one_of("a-zA-Z0-9_.+-")
101 |                          .one_or_more_occurrences()
102 |                          .literal("@")
103 |                          .one_of("a-zA-Z0-9-")
104 |                          .one_or_more_occurrences()
105 |                          .literal("\\.")
106 |                          .one_of("a-zA-Z0-9-.")
107 |                          .one_or_more_occurrences()
108 |                          .build())
109 | 


--------------------------------------------------------------------------------
/src/anonymize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/anonymize/__init__.py


--------------------------------------------------------------------------------
/src/anonymize/anonymizer_result.py:
--------------------------------------------------------------------------------
 1 | class AnonymizerResult:
 2 | 
 3 |     def __init__(self, redacted_text, analyzer_results):
 4 |         self.redacted_text = redacted_text
 5 |         self.analyzer_results = analyzer_results
 6 | 
 7 |     def __eq__(self, other):
 8 |         return type(self) == type(other) and self.redacted_text == other.redacted_text and self.analyzer_results == other.analyzer_results
 9 | 
10 |     def __repr__(self):
11 |         return self.__str__()
12 | 
13 |     def __str__(self):
14 |         return "PII information found: \n{}\nRedacted text: {}".format(self.analyzer_results, self.redacted_text)
15 | 


--------------------------------------------------------------------------------
/src/anonymize/drop_anonymizer.py:
--------------------------------------------------------------------------------
 1 | from src.analyze.utils.analyzer_result import AnalyzerResult
 2 | 
 3 | 
 4 | class DropAnonymizer:
 5 | 
 6 |     @staticmethod
 7 |     def redact(text: str, analyzer_results: [AnalyzerResult]):
 8 |         for result in analyzer_results:
 9 |             text = text.replace(result.text, "")
10 |         return text
11 | 


--------------------------------------------------------------------------------
/src/anonymize/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/anonymize/tests/__init__.py


--------------------------------------------------------------------------------
/src/anonymize/tests/test_drop_anonymizer.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from src.anonymize.drop_anonymizer import DropAnonymizer
 3 | from src.analyze.utils.analyzer_result import AnalyzerResult
 4 | 
 5 | 
 6 | class TestDropAnonymizer(TestCase):
 7 | 
 8 |     def test_redact_for_single_analyzer_result(self):
 9 |         text = "text containing pii"
10 |         analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
11 |         result = DropAnonymizer.redact(text, analyzer_results)
12 |         self.assertEqual(result, "text containing ")
13 | 
14 |     def test_redact_for_multiple_analyzer_results(self):
15 |         text = "text containing pii1 and pii2"
16 |         analyzer_results = [AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
17 |                             AnalyzerResult("pii2", "PII_DETECTOR", 25, 28)]
18 |         result = DropAnonymizer.redact(text, analyzer_results)
19 |         self.assertEqual(result, "text containing  and ")
20 | 
21 | 


--------------------------------------------------------------------------------
/src/constants.py:
--------------------------------------------------------------------------------
1 | ACQUIRE="acquire"
2 | FILE_PATH="file_path"
3 | ANALYZE="analyze"
4 | REPORT="report"
5 | LOCATION="location"
6 | REPORT_LEVEL="level"
7 | OUTPUT_FILE_PATH="output_file_path"


--------------------------------------------------------------------------------
/src/dpf_main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | sys.path.append(os.path.abspath('.'))
 4 | 
 5 | import argparse
 6 | import json
 7 | 
 8 | from src.report.report_generator import ReportGenerator
 9 | from src.acquire.csv_parser import CsvParser
10 | from src.analyze.detectors.pii_detector import PIIDetector
11 | from src.constants import ACQUIRE, REPORT
12 | from src.write.csv_writer import CsvWriter
13 | 
14 | 
15 | class DPFMain():
16 | 
17 |     def __init__(self, config_file_path):
18 |         with open(config_file_path) as config_file:
19 |             self.config = json.load(config_file)
20 | 
21 |     #TODO : validate the config for the stages right here
22 |     def run(self):
23 |         parsed_data_frame = CsvParser(config=self.config[ACQUIRE]).parse()
24 |         pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(parsed_data_frame)
25 |         if pii_analysis_report.empty:
26 |             print("NO PII VALUES WERE FOUND!")
27 |         else:
28 |             ReportGenerator(config=self.config[REPORT])\
29 |                             .generate(results_df=pii_analysis_report,
30 |                                        )
31 |         CsvWriter(config=self.config).write_csv(df=redacted_data_frame)
32 | 
33 |         
34 | # output_directory needs to be obtained from the config json file as a parameter in the 'anonymize' section.
35 | 
36 | def get_args():
37 |     parser = argparse.ArgumentParser()
38 |     parser.add_argument('--config-file', help='config file to run the tool')
39 |     args = parser.parse_args()
40 |     if not args.config_file:
41 |         raise ValueError("Config file path should be provided for the tool to run.")
42 |     return args
43 | 
44 | if __name__ == "__main__":
45 |     args = get_args()
46 |     DPFMain(args.config_file).run()


--------------------------------------------------------------------------------
/src/report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/report/__init__.py


--------------------------------------------------------------------------------
/src/report/report_generator.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | from enum import Enum
  3 | 
  4 | import os
  5 | import pandas as pd
  6 | import logging
  7 | from src.constants import LOCATION, REPORT_LEVEL
  8 | 
  9 | 
 10 | class ReportLevel(Enum):
 11 | 
 12 |     HIGH = "high"
 13 |     MEDIUM = "medium"
 14 |     LOW = "low"
 15 | 
 16 | class ReportGenerator():
 17 | 
 18 |     def __init__(self, config):
 19 |         self.report_file_location = config[LOCATION]
 20 |         self.report_level = config[REPORT_LEVEL]
 21 |         self.setup_logging_config()
 22 | 
 23 |     def setup_logging_config(self):
 24 |         date = datetime.today().strftime("%Y%m%d")
 25 |         file_name = "{}/report_{}.log".format(self.report_file_location, date)
 26 |         if os.path.exists(file_name):
 27 |             mode = "a"
 28 |         else:
 29 |             if not os.path.exists(self.report_file_location):
 30 |                 os.makedirs(self.report_file_location)
 31 |             mode = "x"
 32 |         file_handler = logging.FileHandler(filename=file_name, mode=mode)
 33 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 34 |         file_handler.setFormatter(formatter)
 35 |         logging.getLogger().addHandler(file_handler)
 36 |         logging.getLogger().setLevel(logging.INFO)
 37 | 
 38 |     def __generate_high_level_report(self, results_df):
 39 |         report_df = pd.DataFrame({"Columns with PII values" : results_df.columns.values})
 40 |         return report_df
 41 | 
 42 |     def __collate_all_detectors_per_cell(self, analyzer_result):
 43 |         return [result.detector() for result in analyzer_result[1]]
 44 | 
 45 |     def __calculate_percentage(self, item_count, total_count):
 46 |         return round((item_count/total_count) * 100.0, 2)
 47 | 
 48 |     def __calculate_detector_percentage(self, row_count, count_map):
 49 |         percentage_map = {}
 50 |         for key, value in count_map.items():
 51 |             percentage_map[key] = "{}%".format(self.__calculate_percentage(value, row_count))
 52 |         return percentage_map
 53 | 
 54 |     def __calculate_detector_count(self, column_series):
 55 |         detector_count_map = {}
 56 |         for analyzer_results in column_series.iteritems():
 57 |             if not analyzer_results:
 58 |                 continue
 59 |             detector_types = self.__collate_all_detectors_per_cell(analyzer_results)
 60 |             for detector_type in detector_types:
 61 |                 if detector_type not in detector_count_map:
 62 |                     detector_count_map[detector_type] = 0
 63 |                 detector_count_map[detector_type] += 1
 64 |         return detector_count_map
 65 | 
 66 | 
 67 |     #TODO : filter out the NAs before passing through this
 68 |     def calculate_detector_stats_for_each_column(self, column_series):
 69 |         stats_map = {}
 70 |         count_map = self.__calculate_detector_count(column_series)
 71 |         percentage_map = self.__calculate_detector_percentage(len(column_series), count_map)
 72 |         for key, value in count_map.items():
 73 |             stats_tuple = (value, percentage_map[key])
 74 |             stats_map[key] = stats_tuple
 75 |         return stats_map
 76 | 
 77 |     def __generate_medium_level_report(self, results_df):
 78 |         report_df = pd.DataFrame({})
 79 |         columns = list(results_df)
 80 |         column_reports = []
 81 |         for column in columns:
 82 |             detector_stats_for_each_column = self.calculate_detector_stats_for_each_column(results_df[column])
 83 |             column_report = pd.Series(detector_stats_for_each_column, name=column, index=detector_stats_for_each_column.keys())
 84 |             if not column_report.empty:
 85 |                 column_reports.append(column_report)
 86 |         if column_reports:
 87 |             report_df = pd.concat(column_reports, axis=1, keys=[series.name for series in column_reports], sort=True)
 88 |         return report_df.fillna(value=0)
 89 | 
 90 |     def generate_report_content(self, results_df):
 91 |         if self.report_level == ReportLevel.HIGH.value:
 92 |             return self.__generate_high_level_report(results_df)
 93 |         elif self.report_level == ReportLevel.MEDIUM.value:
 94 |             return self.__generate_medium_level_report(results_df)
 95 | 
 96 |     def __print(self, msg):
 97 |         print(msg)
 98 |         logging.info(msg)
 99 | 
100 |     def __print_report(self, report):
101 |         self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n")
102 |         if report.empty:
103 |             self.__print("NO PII VALUES WERE FOUND!")
104 |         else:
105 |             self.__print(report)
106 |         self.__print("\n\n****************************DONE!**************************\n\n")
107 | 
108 |     def generate(self, results_df):
109 |         final_report = self.generate_report_content(results_df)
110 |         self.__print_report(final_report)
111 |         return final_report
112 | 
113 | 


--------------------------------------------------------------------------------
/src/report/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/report/tests/__init__.py


--------------------------------------------------------------------------------
/src/report/tests/test_report_generator.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from unittest.mock import patch, MagicMock
 3 | 
 4 | import os
 5 | import pandas as pd
 6 | from freezegun import freeze_time
 7 | 
 8 | from src.report.report_generator import ReportGenerator
 9 | from src.analyze.utils.analyzer_result import AnalyzerResult
10 | 
11 | 
12 | 
13 | 
14 | class TestReportGenerator(TestCase):
15 | 
16 |     @patch("src.report.report_generator.ReportGenerator.setup_logging_config")
17 |     def setUp(self, mock_setup_logging_config):
18 |         self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
19 |         mock_setup_logging_config.assert_called_with()
20 |         self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
21 |         mock_setup_logging_config.assert_called_with()
22 | 
23 |     def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
24 |         result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
25 |                                          "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
26 |         expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
27 |         self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(result_data_frame))
28 | 
29 |     def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
30 |         result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
31 |                                           "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
32 |         expected_data_frame = pd.DataFrame({"summary" : pd.Series({"NRIC" : (1, "50%"), "EMAIL" : (1,"50%")}),
33 |                                            "phone number" : pd.Series({"PHONE_NUMBER" : (2, "100%")})})
34 |         self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.generate_report_content(result_data_frame))
35 | 
36 |     def test_calculate_detector_stats_returns_detector_counts_and_percentages(self):
37 |         result_column_values = pd.Series([[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]])
38 |         actual_result = self.report_generator_medium_level.calculate_detector_stats_for_each_column(result_column_values)
39 |         expected_result = {"NRIC" : (1, "33.33%"), "EMAIL" : (2, "66.67%")}
40 |         self.assertCountEqual(expected_result, actual_result)
41 | 
42 |     @patch("logging.info")
43 |     @patch("src.report.report_generator.ReportGenerator.generate_report_content")
44 |     def test_generate_report_calls_content_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
45 |         result_data_frame = pd.DataFrame({"summary" : [[AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]],
46 |                                          "phone number": [[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]]})
47 |         mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
48 |         mock_logging.return_value = None
49 |         expected_result = self.report_generator_high_level.generate(result_data_frame)
50 |         self.assertCountEqual(expected_result, mock_generate_content.return_value)
51 | 
52 | 
53 |     @freeze_time('2019-05-29 01:01:03')
54 |     @patch("logging.FileHandler")
55 |     @patch("logging.Logger.addHandler")
56 |     @patch("genericpath.exists")
57 |     def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
58 |         mock_file_exists.return_value = False
59 |         mock_file_handler.return_value = MagicMock()
60 |         self.report_generator_high_level.setup_logging_config()
61 |         mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
62 |         mock_add_handler.assert_called_with(mock_file_handler.return_value)
63 | 
64 | 
65 |     @freeze_time('2019-05-29 01:01:03')
66 |     @patch("logging.FileHandler")
67 |     @patch("logging.Logger.addHandler")
68 |     @patch("os.path.exists")
69 |     def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
70 |         mock_file_exists.return_value = True
71 |         mock_file_handler.return_value = MagicMock()
72 |         self.report_generator_high_level.setup_logging_config()
73 |         mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
74 |         mock_add_handler.assert_called_with(mock_file_handler.return_value)
75 | 


--------------------------------------------------------------------------------
/src/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/tests/__init__.py


--------------------------------------------------------------------------------
/src/tests/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/tests/config/__init__.py


--------------------------------------------------------------------------------
/src/tests/config/test_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "acquire": {
 3 |     "file_path": "/Users/wisuchoi/Documents/anonymizer/src/acquire/tests/data/comma_delimited_file.csv",
 4 |     "delimiter": ","
 5 |   },
 6 |   "analyze": {
 7 |   },
 8 |   "report" : {
 9 |     "location" : "/Users/wisuchoi/Documents/anonymizer/report",
10 |     "level" : "high"
11 |   },
12 |   "anonymize": {
13 |     "output_file_path" : "/Users/wisuchoi/Documents/anonymizer/output"
14 |   }
15 | }


--------------------------------------------------------------------------------
/src/tests/test_dpf_main.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from unittest import TestCase
 4 | from unittest.mock import patch, MagicMock
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from src.constants import ACQUIRE, REPORT
 9 | from src.dpf_main import DPFMain
10 | 
11 | 
12 | class TestDPFMain(TestCase):
13 | 
14 |     def setUp(self):
15 |         test_config = "{}/{}".format(os.path.dirname(os.path.realpath(__file__)),"config/test_config.json")
16 |         self.dpf_main = DPFMain(test_config)
17 |         with open(test_config) as input_file:
18 |             self.config_json = json.load(input_file)
19 | 
20 |     @patch('src.write.csv_writer.CsvWriter.write_csv')
21 |     @patch('src.write.csv_writer.CsvWriter.__init__')
22 |     @patch('src.report.report_generator.ReportGenerator.generate')
23 |     @patch('src.report.report_generator.ReportGenerator.__init__')
24 |     @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
25 |     @patch('src.acquire.csv_parser.CsvParser.parse')
26 |     @patch('src.acquire.csv_parser.CsvParser.__init__')
27 |     def test_run_parses_the_config_file_and_invokes_respective_stages_correctly(self, mock_csv_parser_init,
28 |                                                                                 mock_csv_parser_parse,
29 |                                                                                 mock_pii_analyze_df,
30 |                                                                                 mock_report_generator_init,
31 |                                                                                 mock_generate_report,
32 |                                                                                 mock_csv_writer_init,
33 |                                                                                 mock_csv_writer_write_csv):
34 |         mock_csv_parser_init.return_value = None
35 |         mock_csv_parser_parse.return_value = MagicMock()
36 |         mock_pii_analyze_df.return_value = (pd.DataFrame({"summary" : ["test result"]}), pd.DataFrame({}))
37 |         mock_report_generator_init.return_value = None
38 |         mock_generate_report.return_value = MagicMock()
39 |         mock_csv_writer_init.return_value = None
40 |         mock_csv_writer_write_csv.return_value = None
41 |         self.dpf_main.run()
42 |         mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
43 |         mock_csv_parser_parse.assert_called_with()
44 |         mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
45 |         mock_report_generator_init.assert_called_with(config=self.config_json[REPORT])
46 |         mock_generate_report.assert_called_with(results_df=mock_pii_analyze_df.return_value[0])
47 |         mock_csv_writer_init.assert_called_with(config=self.config_json)
48 |         mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])
49 | 
50 | 
51 |     @patch('src.write.csv_writer.CsvWriter.write_csv')
52 |     @patch('src.write.csv_writer.CsvWriter.__init__')
53 |     @patch('src.report.report_generator.ReportGenerator.generate')
54 |     @patch('src.analyze.detectors.pii_detector.PIIDetector.analyze_data_frame')
55 |     @patch('src.acquire.csv_parser.CsvParser.parse')
56 |     @patch('src.acquire.csv_parser.CsvParser.__init__')
57 |     def test_run_short_circuits_generate_report_when_no_PII_values_detected(self, mock_csv_parser_init,
58 |                                                                                 mock_csv_parser_parse,
59 |                                                                                 mock_pii_analyze_df,
60 |                                                                                 mock_generate_report,
61 |                                                                                 mock_csv_writer_init,
62 |                                                                                 mock_csv_writer_write_csv):
63 |         mock_csv_parser_init.return_value = None
64 |         mock_csv_parser_parse.return_value = pd.DataFrame({})
65 |         mock_pii_analyze_df.return_value = (pd.DataFrame({}), pd.DataFrame({}))
66 |         mock_generate_report.return_value = MagicMock()
67 |         mock_generate_report.return_value = None
68 |         mock_csv_writer_init.return_value = None
69 |         mock_csv_writer_write_csv.return_value = None
70 |         self.dpf_main.run()
71 |         mock_csv_parser_init.assert_called_with(config=self.config_json[ACQUIRE])
72 |         mock_csv_parser_parse.assert_called_with()
73 |         mock_pii_analyze_df.assert_called_with(mock_csv_parser_parse.return_value)
74 |         mock_generate_report.assert_not_called()
75 |         mock_csv_writer_init.assert_called_with(config=self.config_json)
76 |         mock_csv_writer_write_csv.assert_called_with(df=mock_pii_analyze_df.return_value[1])
77 | 
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/src/write/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/write/__init__.py


--------------------------------------------------------------------------------
/src/write/csv_writer.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from pandas import DataFrame
 3 | 
 4 | from src.constants import OUTPUT_FILE_PATH, FILE_PATH
 5 | 
 6 | 
 7 | class CsvWriter:
 8 | 
 9 |     def __init__(self, config):
10 |         self.__validate_config(config)
11 |         self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
12 |         self.input_file_name = config["acquire"][FILE_PATH]
13 | 
14 |     def __validate_config(self, config):
15 |         if "anonymize" not in config or not config["anonymize"] or OUTPUT_FILE_PATH not in config["anonymize"] or not config["anonymize"][OUTPUT_FILE_PATH]:
16 |             raise ValueError("Config 'output_file_path' needs to be provided for parsing")
17 | 
18 |     def get_output_file_path(self):
19 |         file_name = self.input_file_name.split('/')[-1]
20 |         file_name_no_extension = file_name.split('.')[0]
21 |         result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
22 |         return result
23 | 
24 |     def write_csv(self, df: DataFrame):
25 |         df.to_csv(self.get_output_file_path(), index=False)
26 |         print("Anonymized csv has been successfully created!")


--------------------------------------------------------------------------------
/src/write/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src/write/tests/__init__.py


--------------------------------------------------------------------------------
/src/write/tests/test_csv_writer.py:
--------------------------------------------------------------------------------
 1 | from src.write.csv_writer import CsvWriter
 2 | from unittest import TestCase
 3 | import os
 4 | import pandas as pd
 5 | 
 6 | 
 7 | class TestCsvWriter(TestCase):
 8 | 
 9 |     #TODO: check acquire file path exists
10 |     def test_invalid_config_gets_caught_during_initialization(self):
11 |         context = {}
12 |         with self.assertRaises(ValueError) as ve:
13 |             CsvWriter(config=context)
14 |         self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing")
15 | 
16 | 
17 |     def test_correct_output_path_is_generated(self):
18 |         context = {
19 |             "acquire": {
20 |                 "file_path": "/anonymizer/test_data.csv",
21 |                 "delimiter": ","
22 |             },
23 |             "anonymize": {
24 |                 "output_file_path" : "/anonymizer/output"
25 |             }
26 |         }
27 |         input_file_name = "test_data"
28 |         output_directory = "/anonymizer/output"
29 |         expected = f"{output_directory}/{input_file_name}_anonymized_.csv"
30 |         writer = CsvWriter(config=context)
31 |         self.assertEqual(writer.get_output_file_path(), expected)


--------------------------------------------------------------------------------
/src_spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/__init__.py


--------------------------------------------------------------------------------
/src_spark/acquire/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/__init__.py


--------------------------------------------------------------------------------
/src_spark/acquire/csv_parser.py:
--------------------------------------------------------------------------------
 1 | from pyspark.sql import SparkSession
 2 | from src_spark.constants import FILE_PATH
 3 | import pyspark.sql.functions as f
 4 | 
 5 | class CsvParser:
 6 | 
 7 |     def __init__(self, spark: SparkSession, config):
 8 |         self.__validate_config(config)
 9 |         self.input_path = config["file_path"]
10 |         self.delimiter = config["delimiter"] if "delimiter" in config and config["delimiter"] else ","
11 |         self.spark = spark
12 | 
13 |     def __validate_config(self, config):
14 |         if FILE_PATH not in config or not config[FILE_PATH]:
15 |             raise ValueError("Config 'file_path' needs to be provided for parsing")
16 | 
17 |     def parse(self):
18 |         df = self.spark.read.load(
19 |                             self.input_path,
20 |                             format="csv",
21 |                             sep=self.delimiter,
22 |                             header="true",
23 |                             inferSchema="true")
24 |         
25 |         
26 | 
27 |         return df


--------------------------------------------------------------------------------
/src_spark/acquire/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/tests/__init__.py


--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/comma_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name,ssn
2 | Lisa Beard,557-39-2479


--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/empty.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/acquire/tests/data/empty.csv


--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/missing_comma.csv:
--------------------------------------------------------------------------------
1 | name,ssn,age
2 | Lisa Beard,557-39-2479,33
3 | John Sohn,33


--------------------------------------------------------------------------------
/src_spark/acquire/tests/data/pipe_delimited_file.csv:
--------------------------------------------------------------------------------
1 | name|ssn
2 | Lisa Beard|557-39-2479


--------------------------------------------------------------------------------
/src_spark/acquire/tests/test_csv_parser.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from unittest import TestCase
 3 | from pyspark.sql import SparkSession
 4 | from pyspark.sql.types import StructType
 5 | from src_spark.acquire.csv_parser import CsvParser
 6 | 
 7 | 
 8 | class TestCsvParser(TestCase):
 9 | 
10 |     def setUp(self) -> None:
11 |         self.SPARK = SparkSession.builder \
12 |                                 .master("local") \
13 |                                 .appName("Test CSVParser") \
14 |                                 .getOrCreate()
15 |         self.current_dir = os.path.dirname(os.path.realpath(__file__))
16 |     
17 |     def test_invalid_config_gets_caught_during_initialization(self):
18 |         context = {}
19 |         with self.assertRaises(ValueError) as ve:
20 |             CsvParser(self.SPARK, config=context)
21 |         self.assertEqual(str(ve.exception), "Config 'file_path' needs to be provided for parsing")
22 | 
23 |     def test_if_valid_csv_file_provided_returns_spark_df(self):
24 |         file_path = "{}/data/comma_delimited_file.csv".format(self.current_dir)
25 |         config = {"file_path" : file_path, "delimiter" : ""}
26 |         
27 |         expected = self.SPARK.createDataFrame(
28 |             [("Lisa Beard", "557-39-2479")],
29 |             ["name", "ssn"]
30 |         )
31 |         actual = CsvParser(spark=self.SPARK, config=config).parse()
32 |         
33 |         self.assertEqual(actual.schema, expected.schema)
34 |         self.assertEqual(actual.collect(), expected.collect())
35 | 
36 |     def test_if_valid_csv_file_with_different_delimiter_provided_returns_spark_df(self):
37 |         file_path = "{}/data/pipe_delimited_file.csv".format(self.current_dir)
38 |         config = {"file_path" : file_path, "delimiter" : "|"}
39 |         
40 |         expected = self.SPARK.createDataFrame(
41 |             [("Lisa Beard", "557-39-2479")],
42 |             ["name", "ssn"]
43 |         )
44 |         actual = CsvParser(spark=self.SPARK, config=config).parse()
45 |         
46 |         self.assertEqual(actual.schema, expected.schema)
47 |         self.assertEqual(actual.collect(), expected.collect())
48 | 
49 |     def test_if_empty_csv_file_returns_empty_pandas_df(self):
50 |         file_path = "{}/data/empty.csv".format(self.current_dir)
51 |         config = {"file_path" : file_path}
52 |         expected = self.SPARK.createDataFrame([], StructType([]))
53 |         actual = CsvParser(spark=self.SPARK, config=config).parse()
54 |         self.assertEqual(actual.schema, expected.schema)
55 |         self.assertEqual(actual.collect(), expected.collect())
56 |         
57 |     


--------------------------------------------------------------------------------
/src_spark/analyze/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/__init__.py


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/detectors/__init__.py


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/base_detector.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | from src_spark.analyze.utils.analyzer_result import AnalyzerResult
 5 | 
 6 | 
 7 | 
 8 | class BaseDetector(ABC):
 9 | 
10 |     def __init__(self):
11 |         self.name = None
12 |         self.pattern = None
13 | 
14 |     @abstractmethod
15 |     def get_pattern(self):
16 |         pass
17 | 
18 |     @abstractmethod
19 |     def get_name(self):
20 |         pass
21 | 
22 |     def validate(self, text):
23 |         return True
24 | 
25 |     def execute(self, text):
26 |         results = []
27 |         matches = re.finditer(self.get_pattern(), text)
28 |         for match in matches:
29 |             matched_string = match.string[match.start(): match.end()]
30 |             if self.validate(matched_string):
31 |                 results.append(AnalyzerResult(matched_string, self.get_name(), match.start(), match.end()))
32 |         return results
33 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/credit_card_detector.py:
--------------------------------------------------------------------------------
 1 | from src_spark.analyze.detectors.base_detector import BaseDetector
 2 | from src_spark.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class CreditCardDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "CREDIT_CARD"
 9 |         self.pattern = RegEx().literal("4").any_digit().num_occurrences(3).pipe() \
10 |             .literal("5").range(0, 5).any_digit().num_occurrences(2).pipe() \
11 |             .literal("6").any_digit().num_occurrences(3).pipe() \
12 |             .literal("1").any_digit().num_occurrences(3).pipe() \
13 |             .literal("3").any_digit().num_occurrences(3) \
14 |             .one_of("- ").zero_or_one_occurrences() \
15 |             .any_digit().range_occurrences(3, 4) \
16 |             .one_of("- ").zero_or_one_occurrences() \
17 |             .any_digit().range_occurrences(3, 4) \
18 |             .one_of("- ").zero_or_one_occurrences() \
19 |             .any_digit().range_occurrences(3, 5).build()
20 | 
21 |     def get_name(self):
22 |         return self.name
23 | 
24 |     def get_pattern(self):
25 |         return self.pattern
26 | 
27 |     def validate(self, text):
28 |         def digits_of(n):
29 |             return [int(d) for d in str(n)]
30 | 
31 |         digits = digits_of(text.replace('-', '').replace(' ', ''))
32 |         odd_digits = digits[-1::-2]
33 |         even_digits = digits[-2::-2]
34 |         checksum = sum(odd_digits)
35 | 
36 |         for d in even_digits:
37 |             checksum += sum(digits_of(d * 2))
38 | 
39 |         return checksum % 10 == 0
40 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/email_detector.py:
--------------------------------------------------------------------------------
 1 | from src_spark.analyze.detectors.base_detector import BaseDetector
 2 | from src_spark.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class EmailDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "EMAIL"
 9 |         self.pattern = RegEx().one_of("a-zA-Z0-9_.+-").one_or_more_occurrences().literal("@").one_of("a-zA-Z0-9-")\
10 |             .one_or_more_occurrences().literal("\\.").one_of("a-zA-Z0-9-.").one_or_more_occurrences().build()
11 | 
12 |     def get_name(self):
13 |         return self.name
14 | 
15 |     def get_pattern(self):
16 |         return self.pattern
17 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/national_id_detector.py:
--------------------------------------------------------------------------------
 1 | from src_spark.analyze.detectors.base_detector import BaseDetector
 2 | from src_spark.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class NationalIdDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "NRIC"
 9 |         self.pattern = RegEx().one_of("STFG").any_digit().num_occurrences(7).range("A", "Z").build()
10 | 
11 |     def get_name(self):
12 |         return self.name
13 | 
14 |     def get_pattern(self):
15 |         return self.pattern
16 | 
17 |     def __get_offset(self, text):
18 |         return 4 if text in "TG" else 0
19 | 
20 |     def __is_NRIC(self, text, loc):
21 |         if text[0] in "ST":
22 |             return "JZIHGFEDCBA"[loc] == text[8]
23 |         return False
24 | 
25 |     def __is_FIN(self, text, loc):
26 |         if text[0] in "FG":
27 |             return "XWUTRQPNMLK"[loc] == text[8]
28 |         return False
29 | 
30 |     def validate(self, text):
31 |         weight = self.__get_weight(text)
32 |         first_character = text[0]
33 |         offset = self.__get_offset(first_character)
34 |         loc = (offset + weight) % 11
35 |         return self.__is_NRIC(text, loc) or self.__is_FIN(text, loc)
36 | 
37 |     def __get_weight(self, text):
38 |         numbers = [int(digit) for digit in list(text[1:-1])]
39 |         for index, i in enumerate(numbers):
40 |             if index == 0:
41 |                 numbers[index] *= 2
42 |             numbers[index] *= 8 - index
43 |         return sum(numbers)
44 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/phone_number_detector.py:
--------------------------------------------------------------------------------
 1 | from src_spark.analyze.detectors.base_detector import BaseDetector
 2 | from src_spark.analyze.utils.regex import RegEx
 3 | 
 4 | 
 5 | class PhoneNumberDetector(BaseDetector):
 6 | 
 7 |     def __init__(self):
 8 |         self.name = "PHONE_NUMBER"
 9 |         regex_pipe = RegEx().pipe().build()
10 | 
11 |         regex_with_country_code_and_no_space = '(\\+65?\\s?[689]\\d{7})'
12 |         regex_with_country_code_and_single_space = '(\\+65?\\s?[689]\\d{3} \\d{4})'
13 |         regex_no_country_code_and_no_space = '([689]\\d{7})'
14 |         regex_no_country_code_and_single_space = '([689]\\d{3} \\d{4})'
15 |         regex_with_country_code_in_brackets_and_no_space = '([(]65[)]\\s?[689]\\d{7})'
16 |         regex_with_country_code_in_brackets_and_single_space = '([(]65[)]\\s?[689]\\d{3} \\d{4})'
17 | 
18 |         self.pattern = regex_with_country_code_and_no_space + regex_pipe + \
19 |             regex_with_country_code_and_single_space + regex_pipe + \
20 |             regex_no_country_code_and_no_space + regex_pipe + \
21 |             regex_no_country_code_and_single_space + regex_pipe + \
22 |             regex_with_country_code_in_brackets_and_no_space + regex_pipe + \
23 |             regex_with_country_code_in_brackets_and_single_space
24 | 
25 |     def get_name(self):
26 |         return self.name
27 | 
28 |     def get_pattern(self):
29 |         return self.pattern
30 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/pii_detector.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import pkgutil
 3 | import inspect
 4 | import sys
 5 | from pyspark.sql import DataFrame
 6 | from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType
 7 | from src_spark.analyze.detectors.base_detector import BaseDetector
 8 | import src_spark.analyze.detectors
 9 | 
10 | class PIIDetector():
11 | 
12 |     def __init__(self):
13 |         self.detectors = self.__get_detector_instances()
14 | 
15 |     def __get_detector_modules(self):
16 |         modules = [modname for importer, modname, ispkg in
17 |                         pkgutil.walk_packages(path=src_spark.analyze.detectors.__path__,
18 |                                               prefix=src_spark.analyze.detectors.__name__+".")
19 |                    if "tests" not in modname]
20 |         return modules
21 | 
22 |     def __get_detector_instances(self):
23 |         modules = self.__get_detector_modules()
24 |         detectors = []
25 |         for module in modules:
26 |             importlib.import_module(module)
27 |             classes = inspect.getmembers(sys.modules[module], inspect.isclass)
28 |             for class_name, class_type in classes:
29 |                 if class_name != "BaseDetector" and issubclass(class_type, BaseDetector):
30 |                     detectors.append(class_type())
31 |         return detectors
32 |     
33 |     def __detect_pii_row(self, row):
34 |         new_row = []
35 |         for element in row:
36 |             results = []
37 |             for detector in self.detectors:
38 |                 results += detector.execute(element)
39 |             new_row.append(results) 
40 | 
41 |         return new_row
42 |     
43 |     def get_analyzer_results(self, input_data_frame: DataFrame):
44 |         columns = input_data_frame.columns
45 | 
46 |         array_structtype = StructType([
47 |             StructField("end", LongType(), False),
48 |             StructField("start", LongType(), False),
49 |             StructField("text", StringType(), False),
50 |             StructField("type", StringType(), False)
51 |         ])
52 |         result_schema = []
53 |         for column in columns:
54 |             result_schema.append(StructField(column, ArrayType(array_structtype, True), nullable=False) )
55 | 
56 |         result = input_data_frame.rdd.map(lambda x: self.__detect_pii_row(x)).toDF(schema=StructType(result_schema))
57 |         
58 |         return result
59 | 
60 |     def _get_pii_list(self, row):
61 |         get_analyzer_results_text = lambda x: x.text
62 | 
63 |         new_row = []
64 |         for cell in row:
65 |             pii_sublist = list(map(get_analyzer_results_text,cell))
66 |             new_row.extend(pii_sublist)
67 |         return new_row
68 |     
69 |     def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
70 |         pii_list = report.rdd.flatMap(lambda row: self._get_pii_list(row)).collect()
71 |         column = input_data_frame.columns
72 |         result = input_data_frame.rdd.map(lambda row: self.__replace_redacted_text(row, pii_list)).toDF(column)
73 | 
74 |         return result
75 | 
76 |     def __replace_redacted_text(self, row, pii_list):
77 |         new_row = []
78 |         for cell in row:
79 |             for word in pii_list:
80 |                 if word in cell:
81 |                     cell = cell.replace(word, "")
82 |             new_row.append(cell)
83 |         return new_row
84 | 
85 |     def analyze_data_frame(self, input_data_frame: DataFrame):
86 |         report = self.get_analyzer_results(input_data_frame)
87 |         redacted = self.get_redacted_text(input_data_frame, report)
88 | 
89 |         return report, redacted
90 | 
91 | 


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/detectors/tests/__init__.py


--------------------------------------------------------------------------------
/src_spark/analyze/detectors/tests/test_pii_detector.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | from pyspark.sql import SparkSession
  3 | from src_spark.analyze.detectors.pii_detector import PIIDetector
  4 | from src.analyze.utils.analyzer_result import AnalyzerResult
  5 | from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType, Row
  6 | 
  7 | 
  8 | class TestPIIDetector(TestCase):
  9 |     
 10 |     def setUp(self) -> None:
 11 |         self.SPARK = SparkSession.builder \
 12 |                                 .master("local") \
 13 |                                 .appName("Test PIIDetector") \
 14 |                                 .getOrCreate()
 15 |         self.pii_detector = PIIDetector()
 16 | 
 17 |         self.array_structtype = StructType([
 18 |             StructField("end", LongType(), False),
 19 |             StructField("start", LongType(), False),
 20 |             StructField("text", StringType(), False),
 21 |             StructField("type", StringType(), False)
 22 |         ])
 23 |         self.schema = StructType([
 24 |             StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
 25 |             StructField("phone number", ArrayType(self.array_structtype, True), nullable=False) 
 26 |         ])
 27 | 
 28 |     def test_analyze_data_frame_runs_analyze_against_each_cell_with_a_PII_value(self):
 29 |         test_data_frame = self.SPARK.createDataFrame(
 30 |             [
 31 |                 ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
 32 |                 ("A typical email id would look something like test@sample.com","Some examples of phone numbers are +65 62345678")
 33 |             ],
 34 |             ["summary", "phone number"]
 35 |         )
 36 | 
 37 |         actual = self.pii_detector.get_analyzer_results(test_data_frame)
 38 | 
 39 |         expected_data_frame = self.SPARK.createDataFrame(
 40 |             [
 41 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
 42 |                 ([AnalyzerResult("test@sample.com", "EMAIL", 45, 60)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 43 |             ],
 44 |             self.schema
 45 |         )
 46 | 
 47 |         self.assertEqual(actual.schema, expected_data_frame.schema)
 48 |         self.assertEqual(actual.collect(), expected_data_frame.collect())
 49 | 
 50 |     def test_analyze_data_frame_runs_analyze_against_cell_with_multiple_PII_values(self):
 51 |         test_data_frame = self.SPARK.createDataFrame(
 52 |             [
 53 |                 ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
 54 |                 ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
 55 |             ],
 56 |             ["summary", "phone number"]
 57 |         )
 58 | 
 59 |         actual = self.pii_detector.get_analyzer_results(test_data_frame)
 60 | 
 61 |         expected_data_frame = self.SPARK.createDataFrame(
 62 |             [
 63 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
 64 |                 ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
 65 |             ],
 66 |             self.schema
 67 |         )
 68 | 
 69 |         self.assertEqual(actual.schema, expected_data_frame.schema)
 70 |         self.assertEqual(actual.collect(), expected_data_frame.collect())
 71 | 
 72 |     def test_analyze_data_frame_returns_empty_data_frame_when_there_are_no_PII_values(self):
 73 |         test_data_frame = self.SPARK.createDataFrame(
 74 |             [
 75 |                 ("No", "Personal"),
 76 |                 ("Data","Inside")
 77 |             ],
 78 |             ["summary", "phone number"]
 79 |         )
 80 | 
 81 |         actual = self.pii_detector.get_analyzer_results(test_data_frame)
 82 | 
 83 |         expected_data_frame = self.SPARK.createDataFrame(
 84 |             [
 85 |                 ([], []),
 86 |                 ([], [])
 87 |             ],
 88 |             self.schema
 89 |         )
 90 | 
 91 |         self.assertEqual(actual.schema, expected_data_frame.schema)
 92 |         self.assertEqual(actual.collect(), expected_data_frame.collect())
 93 | 
 94 |     def test_get_pii_list_returns_list_of_pii_words_given_row_of_list_of_analyzer_results(self):
 95 |         test_row = Row(
 96 |             summary=[
 97 |                 AnalyzerResult("S0000001I", "NRIC", 38, 47),
 98 |                 AnalyzerResult("S0000002I", "NRIC", 38, 47)
 99 |                 ],
100 |             phone_number=[AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
101 |         actual = self.pii_detector._get_pii_list(test_row)
102 |         expected = ["S0000001I","S0000002I","+65 62345678"]
103 |         self.assertEqual(actual, expected)
104 | 
105 |     def test_get_pii_list_returns_empty_lists_no_analyzer_results(self):
106 |         test_row = Row(summary=[],phone_number=[])
107 |         actual = self.pii_detector._get_pii_list(test_row)
108 |         expected = []
109 |         self.assertEqual(actual, expected)
110 | 
111 |     def test_get_redacted_text_returns_redacted_data_frame(self):
112 |         test_report_data_frame = self.SPARK.createDataFrame(
113 |             [
114 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)]),
115 |                 ([AnalyzerResult("test@sample.com", "EMAIL", 6, 21), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 32, 44)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 10, 22), AnalyzerResult("+65 62345678", "PHONE_NUMBER", 33, 45)])
116 |             ],
117 |             self.schema
118 |         )
119 | 
120 |         test_input_data_frame = self.SPARK.createDataFrame(
121 |             [
122 |                 ("First President of Singapore NRIC was S0000001I", "Some examples of phone numbers are +65 62345678"),
123 |                 ("email test@sample.com and phone +65 62345678","Phone one +65 62345678 Phone two +65 62345678")
124 |             ],
125 |             ["summary", "phone number"]
126 |         )
127 | 
128 |         actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)
129 |         
130 |         expected = self.SPARK.createDataFrame(
131 |             [
132 |                 ("First President of Singapore NRIC was ", "Some examples of phone numbers are "),
133 |                 ("email  and phone ","Phone one  Phone two ")
134 |             ],
135 |             ["summary", "phone number"]
136 |         )
137 | 
138 |         self.assertEqual(actual.schema, expected.schema)
139 |         self.assertEqual(actual.collect(), expected.collect())
140 | 
141 |     def test_get_redacted_text_returns_same_data_frame_if_analyzer_results_are_empty(self):
142 |         test_report_data_frame = self.SPARK.createDataFrame(
143 |             [
144 |                 ([], []),
145 |                 ([], [])
146 |             ],
147 |             self.schema
148 |         )
149 | 
150 |         test_input_data_frame = self.SPARK.createDataFrame(
151 |             [
152 |                 ("No", "Personal"),
153 |                 ("Data","Inside")
154 |             ],
155 |             ["summary", "phone number"]
156 |         )
157 | 
158 |         actual = self.pii_detector.get_redacted_text(test_input_data_frame, test_report_data_frame)
159 |         
160 |         expected = self.SPARK.createDataFrame(
161 |             [
162 |                 ("No", "Personal"),
163 |                 ("Data","Inside")
164 |             ],
165 |             ["summary", "phone number"]
166 |         )
167 | 
168 |         self.assertEqual(actual.schema, expected.schema)
169 |         self.assertEqual(actual.collect(), expected.collect())
170 | 
171 | 


--------------------------------------------------------------------------------
/src_spark/analyze/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/analyze/utils/__init__.py


--------------------------------------------------------------------------------
/src_spark/analyze/utils/analyzer_result.py:
--------------------------------------------------------------------------------
 1 | class AnalyzerResult:
 2 | 
 3 |     def __init__(self, text, type, start, end):
 4 |         self.text = text
 5 |         self.type = type
 6 |         self.start = start
 7 |         self.end = end
 8 | 
 9 |     def __eq__(self, other):
10 |         return type(self) == type(other) and self.text == other.text and self.type == other.type \
11 |                and self.start == other.start and self.end == other.end
12 | 
13 |     def __repr__(self):
14 |         return self.__str__()
15 | 
16 |     def __str__(self):
17 |         return "Text {} at position ({},{}) was identified as {}".format(self.text, self.start, self.end, self.type)
18 | 
19 |     def detector(self):
20 |         return self.type
21 | 


--------------------------------------------------------------------------------
/src_spark/analyze/utils/regex.py:
--------------------------------------------------------------------------------
 1 | class RegEx:
 2 | 
 3 |     def __init__(self):
 4 |         self.regex_string = ""
 5 | 
 6 |     def __is_numeric(self, value):
 7 |         return isinstance(value, int)
 8 | 
 9 |     def __is_single_character_value(self, value):
10 |         return len(str(value)) == 1
11 | 
12 |     def __validate_range(self, start, end):
13 |         if start > end:
14 |             raise ValueError("Range start should be less than end")
15 | 
16 |     def boundary(self):
17 |         self.regex_string += "\\b"
18 |         return self
19 | 
20 |     def pipe(self):
21 |         self.regex_string += "|"
22 |         return self
23 | 
24 |     def range(self, from_char, to_char):
25 |         if not self.__is_single_character_value(from_char) or not self.__is_single_character_value(to_char):
26 |             raise ValueError("Range boundaries should be single character")
27 | 
28 |         self.__validate_range(from_char, to_char)
29 |         self.regex_string += "[{}-{}]".format(from_char, to_char)
30 |         return self
31 | 
32 |     def one_of(self, character_set):
33 |         if character_set is None or character_set == "":
34 |             raise ValueError("Character Set should not be empty")
35 | 
36 |         self.regex_string += "[" + character_set + "]"
37 |         return self
38 | 
39 |     def any_digit(self):
40 |         self.regex_string += "\\d"
41 |         return self
42 | 
43 |     def num_occurrences(self, number):
44 |         if number < 1:
45 |             raise ValueError
46 | 
47 |         self.regex_string += "{" + str(number) + "}"
48 |         return self
49 | 
50 |     def one_or_more_occurrences(self):
51 |         self.regex_string += "+"
52 |         return self
53 | 
54 |     def zero_or_more_occurrences(self):
55 |         self.regex_string += "*"
56 |         return self
57 | 
58 |     def zero_or_one_occurrences(self):
59 |         self.regex_string += "?"
60 |         return self
61 | 
62 |     def range_occurrences(self, start, end):
63 |         if not self.__is_numeric(start) or not self.__is_numeric(end):
64 |             raise TypeError("Range should be integers")
65 | 
66 |         self.__validate_range(start, end)
67 |         self.regex_string += "{" + str(start) + "," + str(end) + "}"
68 |         return self
69 | 
70 |     def literal(self, literal):
71 |         self.regex_string += literal
72 |         return self
73 | 
74 |     def build(self):
75 |         return self.regex_string
76 | 


--------------------------------------------------------------------------------
/src_spark/constants.py:
--------------------------------------------------------------------------------
1 | ACQUIRE="acquire"
2 | FILE_PATH="file_path"
3 | ANALYZE="analyze"
4 | REPORT="report"
5 | LOCATION="location"
6 | REPORT_LEVEL="level"
7 | OUTPUT_FILE_PATH="output_file_path"


--------------------------------------------------------------------------------
/src_spark/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | sys.path.append(os.path.abspath('.'))
 4 | 
 5 | import argparse
 6 | import json
 7 | 
 8 | from pyspark.sql import SparkSession
 9 | from src_spark.report.report_generator import ReportGenerator
10 | from src_spark.acquire.csv_parser import CsvParser
11 | from src_spark.analyze.detectors.pii_detector import PIIDetector
12 | from src_spark.constants import ACQUIRE, REPORT
13 | from src_spark.write.csv_writer import CsvWriter
14 | 
15 | 
16 | class Main():
17 | 
18 |     def __init__(self, config_file_path):
19 |         with open(config_file_path) as config_file:
20 |             self.config = json.load(config_file)
21 | 
22 |     #TODO : validate the config for the stages right here
23 |     def run(self):
24 |         spark = SparkSession.builder \
25 |                                 .master("local") \
26 |                                 .appName("PIIDetector") \
27 |                                 .getOrCreate()
28 |         parsed_data_frame = CsvParser(spark, config=self.config[ACQUIRE]).parse()
29 |         pii_analysis_report, redacted_data_frame = PIIDetector().analyze_data_frame(parsed_data_frame)
30 |         
31 |         report_generator = ReportGenerator(config=self.config[REPORT])
32 |         if report_generator.is_empty_report_dataframe(pii_analysis_report):
33 |             print("NO PII VALUES WERE FOUND!")
34 |         else:
35 |             report_generator.generate(results_df=pii_analysis_report)
36 |         CsvWriter(spark, config=self.config).write_csv(df=redacted_data_frame)
37 | 
38 | def get_args():
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--config-file', help='config file to run the tool')
41 |     args = parser.parse_args()
42 |     if not args.config_file:
43 |         raise ValueError("Config file path should be provided for the tool to run.")
44 |     return args
45 | 
46 | if __name__ == "__main__":
47 |     args = get_args()
48 |     Main(args.config_file).run()


--------------------------------------------------------------------------------
/src_spark/report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/report/__init__.py


--------------------------------------------------------------------------------
/src_spark/report/report_generator.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | from enum import Enum
  3 | import os
  4 | import logging
  5 | 
  6 | import pandas as pd
  7 | from pyspark.sql.dataframe import DataFrame
  8 | from pyspark.sql.types import Row
  9 | from src_spark.constants import LOCATION, REPORT_LEVEL
 10 | 
 11 | 
 12 | class ReportLevel(Enum):
 13 | 
 14 |     HIGH = "high"
 15 |     MEDIUM = "medium"
 16 |     LOW = "low"
 17 | 
 18 | class ReportGenerator():
 19 | 
 20 |     def __init__(self, config):
 21 |         self.report_file_location = config[LOCATION]
 22 |         self.report_level = config[REPORT_LEVEL]
 23 |         self.setup_logging_config()
 24 |         self.dataframe_is_empty = None
 25 | 
 26 |     def setup_logging_config(self):
 27 |         date = datetime.today().strftime("%Y%m%d")
 28 |         file_name = "{}/report_{}.log".format(self.report_file_location, date)
 29 |         if os.path.exists(file_name):
 30 |             mode = "a"
 31 |         else:
 32 |             if not os.path.exists(self.report_file_location):
 33 |                 os.makedirs(self.report_file_location)
 34 |             mode = "x"
 35 |         file_handler = logging.FileHandler(filename=file_name, mode=mode)
 36 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 37 |         file_handler.setFormatter(formatter)
 38 |         logging.getLogger().addHandler(file_handler)
 39 |         logging.getLogger().setLevel(logging.INFO)
 40 | 
 41 |     def __generate_high_level_report(self, results_df: DataFrame):
 42 |         columns = results_df.columns
 43 |         report_df = pd.DataFrame({"Columns with PII values" : columns})
 44 |         return report_df
 45 | 
 46 |     def __calculate_percentage(self, item_count, total_count):
 47 |         return round((item_count/total_count) * 100.0, 2)
 48 |     
 49 |     def _get_detector_results(self, row:Row, columns:list):
 50 |         new_row = []
 51 |         for index, cell in enumerate(row):
 52 |             current_col = columns[index]
 53 |             if cell != []:
 54 |                 for analyzer_result in cell:
 55 |                     detector = analyzer_result["type"]
 56 |                     new_row.append(((current_col, detector), 1))
 57 |             else:
 58 |                 new_row.append(((current_col, "no_pii"), 1))
 59 |         return new_row
 60 |         
 61 |     def __get_list_of_detectors(self, detector_results):
 62 |         report_detectors = []
 63 |         for key, _ in detector_results:
 64 |             detector = key[1]
 65 |             if detector not in report_detectors and detector != "no_pii":
 66 |                 report_detectors.append(detector)
 67 |         return report_detectors
 68 | 
 69 |     def spark_generate_medium_level_report(self, results_df: DataFrame) -> pd.DataFrame:
 70 |         columns = results_df.columns
 71 |         detector_results = results_df.rdd.flatMap(lambda row: self._get_detector_results(row, columns)).reduceByKey(lambda acc, next: acc + next).collect()
 72 |         report_detectors = self.__get_list_of_detectors(detector_results)
 73 |         num_rows = results_df.count()
 74 |         pd_columns = []
 75 |         for column in columns:
 76 |             detection_stats = self.__get_detection_stats(column, report_detectors, detector_results, num_rows)
 77 |             pd_columns.append(pd.Series(data=detection_stats, index=report_detectors, name=column))
 78 |         report_df = pd.concat(pd_columns,axis=1).fillna(0)
 79 |         return report_df
 80 | 
 81 |     def __get_detection_stats(self, column: list, report_detectors: list, detector_results: list, num_rows: int) -> dict:
 82 |         detection_stats = {}
 83 |         default_value = ()
 84 |         for detector in report_detectors:
 85 |             column_detector_count = next(filter(lambda result: result[0] == (column, detector), detector_results), default_value)
 86 |             if len(column_detector_count) > 0:
 87 |                 count = column_detector_count[1]
 88 |                 percentage_value = self.__calculate_percentage(item_count=count, total_count=num_rows)
 89 |                 detection_stats[detector] = (count, f"{percentage_value}%")
 90 |         return detection_stats
 91 |         
 92 | 
 93 |     def generate_report_content(self, results_df: DataFrame) -> pd.DataFrame:
 94 |         if self.report_level == ReportLevel.HIGH.value:
 95 |             return self.__generate_high_level_report(results_df)
 96 |         elif self.report_level == ReportLevel.MEDIUM.value:
 97 |             return self.spark_generate_medium_level_report(results_df)
 98 |         return self.spark_generate_medium_level_report(results_df)
 99 | 
100 |     def __print(self, msg):
101 |         formatted_msg = f"\n{msg}"
102 |         print(formatted_msg)
103 |         logging.info(formatted_msg)
104 | 
105 |     def __print_report(self, report):
106 |         self.__print("\n\n****************************PII ANALYSIS REPORT**************************\n\n")
107 |         if report.empty:
108 |             self.__print("NO PII VALUES WERE FOUND!")
109 |         else:
110 |             self.__print(report)
111 |         self.__print("\n\n****************************DONE!**************************\n\n")
112 | 
113 |     def generate(self, results_df: DataFrame):
114 |         if self.is_empty_report_dataframe(results_df):
115 |             print("NO PII VALUES WERE FOUND!")
116 | 
117 |         final_report = self.generate_report_content(results_df)
118 |         self.__print_report(final_report)
119 |         return final_report
120 | 
121 |     def is_empty_report_dataframe(self, results_df: DataFrame) -> bool:
122 |         if self.dataframe_is_empty == None:
123 |             self.dataframe_is_empty = results_df.rdd.flatMap(lambda row: self._row_is_empty_list(row)).reduce(lambda acc, item: acc and item)
124 |         return self.dataframe_is_empty
125 | 
126 |     def _row_is_empty_list(self, row: Row) -> map:
127 |         return map(lambda cell: True if cell == [] else False , row)
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/src_spark/report/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/report/tests/__init__.py


--------------------------------------------------------------------------------
/src_spark/report/tests/test_report_generator.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | from unittest.mock import patch, MagicMock
  3 | 
  4 | import pandas as pd
  5 | from pandas._testing import assert_frame_equal
  6 | from freezegun import freeze_time
  7 | from pyspark.sql.session import SparkSession
  8 | from pyspark.sql.types import Row, StructField, StructType, ArrayType, StringType, LongType
  9 | from src_spark.report.report_generator import ReportGenerator
 10 | from src_spark.analyze.utils.analyzer_result import AnalyzerResult
 11 | 
 12 | 
 13 | 
 14 | class TestReportGenerator(TestCase):
 15 | 
 16 |     @patch("src_spark.report.report_generator.ReportGenerator.setup_logging_config")
 17 |     def setUp(self, mock_setup_logging_config):
 18 |         self.SPARK = SparkSession.builder \
 19 |                                 .master("local") \
 20 |                                 .appName("Test PIIDetector") \
 21 |                                 .getOrCreate()
 22 | 
 23 | 
 24 |         self.array_structtype = StructType([
 25 |             StructField("end", LongType(), False),
 26 |             StructField("start", LongType(), False),
 27 |             StructField("text", StringType(), False),
 28 |             StructField("type", StringType(), False)
 29 |         ])
 30 |         self.schema = StructType([
 31 |             StructField("summary", ArrayType(self.array_structtype, True), nullable=False),
 32 |             StructField("phone number", ArrayType(self.array_structtype, True), nullable=False) 
 33 |         ])
 34 |         self.report_generator_high_level = ReportGenerator(config={"location" : "abc", "level" : "high"})
 35 |         mock_setup_logging_config.assert_called_with()
 36 |         self.report_generator_medium_level = ReportGenerator(config={"location" : "abc", "level" : "medium"})
 37 |         mock_setup_logging_config.assert_called_with()
 38 | 
 39 |     def test_high_level_reporting_returns_columns_with_PII_values_when_given_a_results_data_frame(self):
 40 |         test_data_frame = self.SPARK.createDataFrame(
 41 |             [
 42 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
 43 |                 ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 44 |             ],
 45 |             self.schema
 46 |         )
 47 |         expected_data_frame = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
 48 |         self.assertCountEqual(expected_data_frame, self.report_generator_high_level.generate_report_content(test_data_frame))
 49 | 
 50 | 
 51 | 
 52 |     def test_medium_level_reporting_returns_data_frame_with_detectors_and_column_details(self):
 53 |         test_data_frame = self.SPARK.createDataFrame(
 54 |             [
 55 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
 56 |                 ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 57 |             ],
 58 |             self.schema
 59 |         )
 60 | 
 61 |         expected_data_frame = pd.DataFrame({
 62 |             "summary": [(1, "50.0%"), 0, (1, "50.0%")],
 63 |             "phone number": [0, (1, "50.0%"), (1, "50.0%")]
 64 |         },index=["NRIC","EMAIL","PHONE_NUMBER"])
 65 | 
 66 |         self.assertCountEqual(list(expected_data_frame), self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame))
 67 | 
 68 |     def test_that_medium_level_reporting_returns_correct_data_frame(self):
 69 |         test_data_frame = self.SPARK.createDataFrame(
 70 |             [
 71 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
 72 |                 ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 73 |             ],
 74 |             self.schema
 75 |         )
 76 | 
 77 |         expected_data_frame = pd.DataFrame({
 78 |             "summary": [(1, "50.0%"), 0, (1, "50.0%")],
 79 |             "phone number": [0, (1, "50.0%"), (1, "50.0%")]
 80 |         },index=["NRIC","EMAIL","PHONE_NUMBER"])
 81 | 
 82 |         actual = self.report_generator_medium_level.spark_generate_medium_level_report(test_data_frame)
 83 |         assert_frame_equal(actual, expected_data_frame)
 84 | 
 85 |     @patch("logging.info")
 86 |     @patch("src.report.report_generator.ReportGenerator.generate_report_content")
 87 |     def test_generate_report_calls_content_generate_report_content_and_logs_it(self, mock_generate_content, mock_logging):
 88 |         test_data_frame = self.SPARK.createDataFrame(
 89 |             [
 90 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], [AnalyzerResult("test@sample.com", "EMAIL", 45, 60)]),
 91 |                 ([AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)], [AnalyzerResult("+65 62345678", "PHONE_NUMBER", 35, 47)])
 92 |             ],
 93 |             self.schema
 94 |         )
 95 |         mock_generate_content.return_value = pd.DataFrame({"Columns with PII values" : ["summary", "phone number"]})
 96 |         mock_logging.return_value = None
 97 |         expected_result = self.report_generator_high_level.generate(test_data_frame)
 98 |         self.assertCountEqual(expected_result, mock_generate_content.return_value)
 99 | 
100 | 
101 |     @freeze_time('2019-05-29 01:01:03')
102 |     @patch("logging.FileHandler")
103 |     @patch("logging.Logger.addHandler")
104 |     @patch("genericpath.exists")
105 |     def test_creation_of_the_report_file_if_not_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
106 |         mock_file_exists.return_value = False
107 |         mock_file_handler.return_value = MagicMock()
108 |         self.report_generator_high_level.setup_logging_config()
109 |         mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="x")
110 |         mock_add_handler.assert_called_with(mock_file_handler.return_value)
111 | 
112 | 
113 |     @freeze_time('2019-05-29 01:01:03')
114 |     @patch("logging.FileHandler")
115 |     @patch("logging.Logger.addHandler")
116 |     @patch("os.path.exists")
117 |     def test_appending_to_report_file_if_already_present(self, mock_file_exists, mock_add_handler, mock_file_handler):
118 |         mock_file_exists.return_value = True
119 |         mock_file_handler.return_value = MagicMock()
120 |         self.report_generator_high_level.setup_logging_config()
121 |         mock_file_handler.assert_called_with(filename="abc/report_20190529.log", mode="a")
122 |         mock_add_handler.assert_called_with(mock_file_handler.return_value)
123 |     
124 |     def test_that_when_report_dataframe_contains_only_empty_lists_it_is_considered_empty(self):
125 |         test_data_frame = self.SPARK.createDataFrame(
126 |             [
127 |                 ([], []),
128 |                 ([], [])
129 |             ],
130 |             self.schema
131 |         )
132 | 
133 |         actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
134 |         expected = True
135 | 
136 |         self.assertEqual(actual, expected)
137 | 
138 |     def test_that_when_report_dataframe_contains_some_text_it_is_not_considered_empty(self):
139 |         test_data_frame = self.SPARK.createDataFrame(
140 |             [
141 |                 ([AnalyzerResult("S0000001I", "NRIC", 38, 47)], []),
142 |                 ([], [])
143 |             ],
144 |             self.schema
145 |         )
146 | 
147 |         actual = self.report_generator_medium_level.is_empty_report_dataframe(test_data_frame)
148 |         expected = False
149 | 
150 |         self.assertEqual(actual, expected)
151 | 
152 |     def test_that_get_detector_results_returns_list_of_detector_results(self):
153 |         columns = ["summary", "phone_number"]
154 |         test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[Row(end=60, start=45, text='test@sample.com', type='EMAIL')])
155 |         actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
156 |         expected = [(('summary', 'NRIC'), 1), (('phone_number', 'EMAIL'), 1)]
157 |         self.assertEqual(actual, expected)
158 | 
159 |     def test_that_get_detector_results_returns_list_of_detector_results_if_column_is_empty(self):
160 |         columns = ["summary", "phone_number"]
161 |         test_row = Row(summary=[Row(end=47, start=38, text='S0000001I', type='NRIC')], phone_number=[])
162 |         actual = self.report_generator_medium_level._get_detector_results(test_row, columns)
163 |         expected = [(('summary', 'NRIC'), 1), (('phone_number', 'no_pii'), 1)]
164 |         self.assertEqual(actual, expected)
165 |         
166 | 


--------------------------------------------------------------------------------
/src_spark/write/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/write/__init__.py


--------------------------------------------------------------------------------
/src_spark/write/csv_writer.py:
--------------------------------------------------------------------------------
 1 | from pyspark.sql import SparkSession, DataFrame
 2 | from src.constants import OUTPUT_FILE_PATH, FILE_PATH
 3 | 
 4 | 
 5 | class CsvWriter():
 6 | 
 7 |     def __init__(self, spark: SparkSession, config):
 8 |         self.__validate_config(config)
 9 |         self.output_path = config["anonymize"][OUTPUT_FILE_PATH]
10 |         self.input_file_name = config["acquire"][FILE_PATH]
11 |         self.spark = spark
12 | 
13 |     def __validate_config(self, config):
14 |         if "anonymize" not in config or not config["anonymize"] or OUTPUT_FILE_PATH not in config["anonymize"] or not config["anonymize"][OUTPUT_FILE_PATH]:
15 |             raise ValueError("Config 'output_file_path' needs to be provided for parsing")
16 |     
17 |     def get_output_file_path(self):
18 |         file_name = self.input_file_name.split('/')[-1]
19 |         file_name_no_extension = file_name.split('.')[0]
20 |         result = f"{self.output_path}/{file_name_no_extension}_anonymized_.csv"
21 |         return result
22 | 
23 |     def write_csv(self, df: DataFrame):
24 |         df.write.csv(self.get_output_file_path())


--------------------------------------------------------------------------------
/src_spark/write/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thoughtworks-datakind/anonymizer/875696d3c71661db5ba2c58579ae19ef542f2d77/src_spark/write/tests/__init__.py


--------------------------------------------------------------------------------
/src_spark/write/tests/test_csv_writer.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | from pyspark.sql import SparkSession
 3 | from src_spark.write.csv_writer import CsvWriter
 4 | 
 5 | 
 6 | class TestCsvWriter(TestCase):
 7 |     
 8 |     def setUp(self) -> None:
 9 |         self.SPARK = SparkSession.builder \
10 |                                 .master("local") \
11 |                                 .appName("Test CsvWriter") \
12 |                                 .getOrCreate()
13 | 
14 |     def test_invalid_config_gets_caught_during_initialization(self):
15 |         context = {}
16 |         with self.assertRaises(ValueError) as ve:
17 |             CsvWriter(self.SPARK, config=context)
18 |         self.assertEqual(str(ve.exception), "Config 'output_file_path' needs to be provided for parsing")
19 | 
20 |     def test_correct_output_path_is_generated(self):
21 |         context = {
22 |                 "acquire": {
23 |                     "file_path": "/anonymizer/test_data.csv",
24 |                     "delimiter": ","
25 |                 },
26 |                 "anonymize": {
27 |                     "output_file_path" : "/anonymizer/output"
28 |                 }
29 |             }
30 |         input_file_name = "test_data"
31 |         output_directory = "/anonymizer/output"
32 |         expected = f"{output_directory}/{input_file_name}_anonymized_.csv"
33 |         writer = CsvWriter(spark=self.SPARK, config=context)
34 |         self.assertEqual(writer.get_output_file_path(), expected)
35 | 
36 | 


--------------------------------------------------------------------------------
/test_data.csv:
--------------------------------------------------------------------------------
1 | National ID,Phone Number,Address,Remarks
2 | S0000001I,+65 91264944,112 Bedok,A typical email id would look something like test@sample.com
3 | S00000dfs,+65 91264944,112 Bedok,A typical email id would look something like ANC


--------------------------------------------------------------------------------