├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── examples.yml │ ├── main.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── cape_dataframes ├── __init__.py ├── audit │ ├── __init__.py │ └── audit.py ├── coordinator │ ├── __init__.py │ ├── auth │ │ ├── __init__.py │ │ ├── api_token.py │ │ └── api_token_test.py │ ├── client.py │ └── client_test.py ├── pandas │ ├── __init__.py │ ├── dtypes.py │ ├── registry.py │ ├── registry_test.py │ ├── transformations │ │ ├── __init__.py │ │ ├── base.py │ │ ├── column_redact.py │ │ ├── column_redact_test.py │ │ ├── perturbation.py │ │ ├── perturbation_test.py │ │ ├── rounding.py │ │ ├── rounding_test.py │ │ ├── row_redact.py │ │ ├── row_redact_test.py │ │ ├── test_utils.py │ │ ├── tokenizer.py │ │ └── tokenizer_test.py │ └── transformer.py ├── policy │ ├── __init__.py │ ├── data.py │ ├── data_test.py │ ├── exceptions.py │ ├── policy.py │ ├── policy_test.py │ └── policy_test_fixtures.py ├── spark │ ├── __init__.py │ ├── dtypes.py │ ├── registry.py │ ├── registry_test.py │ ├── transformations │ │ ├── __init__.py │ │ ├── base.py │ │ ├── perturbation.py │ │ ├── perturbation_test.py │ │ ├── redaction.py │ │ ├── redaction_test.py │ │ ├── rounding.py │ │ ├── rounding_test.py │ │ ├── tokenizer.py │ │ └── tokenizer_test.py │ ├── transformer.py │ └── utils.py └── utils │ ├── __init__.py │ ├── base64.py │ ├── base64_test.py │ ├── typecheck.py │ └── typecheck_test.py ├── codecov.yml ├── docs ├── README.md ├── policies.md ├── quickstart.md ├── redactions.md ├── transformations.md └── tutorials │ └── reversible-tokenization.md ├── examples ├── notebooks │ ├── Cape Policy for Pandas - IoT Example.ipynb │ ├── Cape Policy for Spark - IoT Example.ipynb │ ├── Cape Python with Pandas - IoT Exploratory Data Analysis.ipynb │ ├── Cape Python with PySpark - Taxi Dataset.ipynb │ └── README.md ├── policy │ ├── iot_example_policy.yaml │ ├── mask_personal_information.yaml │ ├── nyc_taxi_dataset_policy.yaml │ ├── perturb_value_field.yaml │ └── spark_round.yaml ├── simple_transformation.py ├── spark_example.py └── tutorials │ ├── credit │ ├── README.md │ ├── apply_policy_spark.py │ ├── data │ │ └── credit_with_pii.csv │ ├── mask_credit_data_in_pandas.ipynb │ └── policy │ │ └── credit_policy.yaml │ ├── quickstart │ ├── README.md │ ├── apply_policy_pandas.py │ ├── apply_policy_spark.py │ ├── dataset.py │ ├── experiment_pandas.py │ ├── experiment_spark.py │ └── mask_personal_information.yaml │ └── reversible_tokenizer │ ├── README.md │ ├── reversible_tokenizer_pandas.ipynb │ └── reversible_tokenizer_pandas.py ├── requirements ├── base.in ├── base.txt ├── dev.in ├── dev.txt ├── spark.in └── spark.txt ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit= 3 | cape_dataframes/spark/examples/* 4 | *test.py 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior, including any code you can share. 
15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Desktop (please complete the following information):** 23 | - OS: [e.g. macOS, Linux] 24 | - OS Version [e.g. 22] 25 | - Python Version 26 | - Installed pip packages 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/examples.yml: -------------------------------------------------------------------------------- 1 | name: Main 2 | on: [push] 3 | jobs: 4 | build: 5 | name: Examples 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.8, 3.9, "3.10"] 10 | steps: 11 | - name: Setup python ${{ matrix.python-version }} Environment 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: ${{ matrix.python-version }} 15 | - name: Check out repository 16 | uses: actions/checkout@v2 17 | - name: Run Make Examples 18 | run: | 19 | make bootstrap 20 | make examples -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Main 2 | on: [push] 3 | jobs: 4 | build: 5 | name: Test 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.8, 3.9, "3.10"] 10 | steps: 11 | - name: Setup python ${{ matrix.python-version }} Environment 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: ${{ matrix.python-version }} 15 | - name: Check out repository 16 | uses: actions/checkout@v2 17 | - name: Cache pip 18 | uses: actions/cache@v2 19 | with: 20 | # This path is specific to Ubuntu 21 | path: ~/.cache/pip 22 | # Look to see if there is a cache hit for the corresponding requirements file 23 | key: ${{ runner.os }}-${{ matrix.python-version }}-pip-${{ hashFiles('requirements/base.txt', 'requirements/spark.txt') }} 24 | restore-keys: | 25 | ${{ runner.os }}-${{ matrix.python-version }}-pip- 26 | ${{ runner.os }}- 27 | - name: Install All Dependencies 28 | run: | 29 | make pydep 30 | - name: Run CI 31 | run: | 32 | make ci 33 | - if: matrix.python-version == 3.9 34 | name: Docker build 35 | run: | 36 | make docker 37 | - if: matrix.python-version == 3.9 38 | name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v1.0.7 40 | with: 41 | token: ${{ secrets.CODECOV_TOKEN }} 42 | file: coverage.xml 43 | flags: unittests 44 | 
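The Main workflow above boils down to two Makefile targets, so a failing check can usually be reproduced locally before pushing. A minimal sketch of the equivalent local run, assuming a supported Python version (3.8, 3.9, or 3.10) and GNU Make are installed:

```sh
# Same as the workflow's "Install All Dependencies" step:
# installs the base, spark, and dev requirements.
make pydep

# Same as the "Run CI" step: runs lint, the test suite, and the coverage check.
make ci
```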
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | python: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | docker: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v2 31 | - uses: docker/build-push-action@v1 32 | name: Build and Push Docker 33 | with: 34 | username: ${{ secrets.DOCKER_USERNAME }} 35 | password: ${{ secrets.DOCKER_PASSWORD }} 36 | repository: capeprivacy/cape-python 37 | tag_with_ref: true 38 | - name: Docker Hub Description 39 | uses: peter-evans/dockerhub-description@v2 40 | env: 41 | DOCKERHUB_USERNAME: ${{ secrets.DOCKER_USERNAME }} 42 | DOCKERHUB_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 43 | DOCKERHUB_REPOSITORY: capeprivacy/cape-python 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | spark/data/*.csv 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | .idea/ 134 | .vscode/ 135 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/CHANGELOG.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guide 2 | 3 | Contributions are more than welcome and we're always looking for use cases and feature ideas! 4 | 5 | This document helps you get started on: 6 | 7 | - [Submitting a pull request](#submitting-a-pull-request) 8 | - [Writing documentation](#writing-documentation) 9 | - [Useful tricks](#useful-tricks) 10 | - [Reporting a bug](#reporting-a-bug) 11 | - [Asking for help](#asking-for-help) 12 | 13 | 14 | ## Submitting a pull request 15 | 16 | To contribute, [fork](https://help.github.com/articles/fork-a-repo/) Cape Python, commit your changes, and [open a pull request](https://help.github.com/articles/using-pull-requests/). 17 | 18 | While you may be asked to make changes to your submission during the review process, we will work with you on this and suggest changes. Consider giving us [push rights to your branch](https://help.github.com/articles/allowing-changes-to-a-pull-request-branch-created-from-a-fork/) so we can potentially also help via commits. 19 | 20 | ### Commit history and merging 21 | 22 | For the sake of transparency our key rule is to keep a logical and intelligible commit history, meaning anyone stepping through the commits on either the `master` branch or as part of a review should be able to easily follow the changes made and their potential implications. 23 | 24 | To this end we ask all contributors to sanitize pull requests before submitting them. All pull requests will either be [squashed or rebased](https://help.github.com/en/articles/about-pull-request-merges). 25 | 26 | Some guidelines: 27 | 28 | - Even simple code changes such as moving code around can obscure semantic changes, and in those case there should be two commits: for example, one that only moves code (with a note of this in the commit description) and one that performs the semantic change. 29 | 30 | - Progressions that have no logical justification for being split into several commits should be squeezed. 31 | 32 | - Code does not have to compile or pass all tests at each commit, but leave a remark and a plan in the commit description so reviewers are aware and can plan accordingly. 33 | 34 | See below for some [useful tricks](#git-and-github) for working with Git and GitHub. 35 | 36 | ### Before submitting for review 37 | 38 | Make sure to give some context and overview in the body of your pull request to make it easier for reviewers to understand your changes. 
Ideally explain why your particular changes were made the way they are. 39 | 40 | Importantly, use [keywords](https://help.github.com/en/articles/closing-issues-using-keywords) such as `Closes #` to indicate any issues or other pull requests related to your work. 41 | 42 | Furthermore: 43 | 44 | - Run tests (`make test`) and linting (`make lint`) before submitting as our [CI](#continuous-integration) will block pull requests failing either check 45 | - Test your change thoroughly with unit tests where appropriate 46 | - Update any affected docstrings in the code base 47 | - Add a line in [CHANGELOG.md](CHANGELOG.md) for any major change 48 | 49 | ## Continuous integration 50 | 51 | All pull requests are run against our [continuous integration suite](https://github.com/capeprivacy/cape-python/actions). The entire suite must pass before a pull request is accepted. 52 | 53 | ## Writing documentation 54 | 55 | Ensure you add docstrings where necessary. We use [Google's style](https://github.com/google/styleguide/blob/gh-pages/pyguide.md). 56 | 57 | The documentation site is managed in the [documentation repository](https://github.com/capeprivacy/documentation). 58 | 59 | ## Useful tricks 60 | 61 | ### git and GitHub 62 | 63 | - [GitHub Desktop](https://desktop.github.com/) provides a useful interface for inspecting and committing code changes 64 | - `git add -p` 65 | - lets you leave out some changes in a file (GitHub Desktop can be used for this as well) 66 | - `git commit --amend` 67 | - allows you to add to the previous commit instead of creating a new one 68 | - `git rebase -i <commit>` 69 | - allows you to [squeeze and reorder commits](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History) 70 | - use `HEAD~5` to consider 5 most recent commits 71 | - use `<commit>~1` to start from commit identified by `<commit>` 72 | - `git rebase master` 73 | - [pull in latest updates](https://git-scm.com/book/en/v2/Git-Branching-Rebasing) on `master` 74 | - `git fetch --no-tags <remote repo> <remote branch>:<local branch>` 75 | - pulls down a remote branch from e.g. a fork and makes it available to check out as a local branch 76 | - `<remote repo>` is e.g. `git@github.com:<user>/cape-dataframes.git` 77 | - `git push <remote repo> <local branch>:<remote branch>` 78 | - pushes the local branch to a remote branch on e.g. a fork 79 | - `<remote repo>` is e.g. `git@github.com:<user>/cape-dataframes.git` 80 | - `git tag -d <tag name> && git push origin :refs/tags/<tag name>` 81 | - can be used to delete a tag remotely 82 | 83 | ## Reporting a bug 84 | 85 | Please file [bug reports](https://github.com/capeprivacy/cape-python/issues/new?template=bug_report.md) as GitHub issues. 86 | 87 | ### Security disclosures 88 | 89 | If you encounter a security issue then please responsibly disclose it by reaching out to us at [privacy@capeprivacy.com](mailto:privacy@capeprivacy.com). We will work with you to mitigate the issue and responsibly disclose it to anyone using the project in a timely manner. 90 | 91 | ## Asking for help 92 | 93 | If you have any questions you are more than welcome to reach out through GitHub issues or [our Slack channel](https://join.slack.com/t/capecommunity/shared_invite/zt-f8jeskkm-r9_FD0o4LkuQqhJSa~~IQA). -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | RUN apt-get update && apt-get install -y build-essential 4 | COPY Makefile setup.py README.md ./ 5 | RUN mkdir requirements 6 | COPY requirements/base.txt requirements/spark.txt ./requirements/ 7 | 8 | RUN make bootstrap 9 | 10 | COPY . .
11 | 12 | RUN pip install . 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Global Variables used across many different rule types 2 | 3 | # Definition of the default rule 4 | all: test 5 | .PHONY: all 6 | 7 | # ############################################### 8 | # Bootstrapping 9 | # 10 | # Rules for bootstrapping the Makefile such as checking for docker, python versions, etc. 11 | # ############################################### 12 | DOCKER_REQUIRED_VERSION=18. 13 | SHELL := /bin/bash 14 | 15 | CURRENT_DIR=$(shell pwd) 16 | PIP_PATH=$(shell which pip) 17 | DOCKER_PATH=$(shell which docker) 18 | 19 | # Default platform 20 | # PYPI doesn't allow linux build tags to be pushed and doesn't support 21 | # specific operating systems such a ubuntu. It only allows build tags for linux 22 | # to be pushed as manylinux. 23 | DEFAULT_PLATFORM=manylinux1_x86_64 24 | 25 | dockercheck: 26 | ifeq (,$(DOCKER_PATH)) 27 | ifeq (,$(findstring $(DOCKER_REQUIRED_VERSION),$(shell docker version))) 28 | ifeq (,$(BYPASS_DOCKER_CHECK)) 29 | $(error "Docker version $(DOCKER_REQUIRED_VERSION) is required.") 30 | endif 31 | endif 32 | endif 33 | 34 | pydep-upgrade: 35 | pip install -U pip-tools 36 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/base.txt requirements/base.in --resolver=backtracking 37 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/spark.txt requirements/spark.in --resolver=backtracking 38 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/dev.txt requirements/dev.in --resolver=backtracking 39 | pip install -r requirements/base.txt -r requirements/spark.txt -r requirements/dev.txt 40 | 41 | 42 | pydep: 43 | pip install -r requirements/base.txt -r requirements/spark.txt -r requirements/dev.txt 44 | 45 | bootstrap: 46 | pip install -U pip setuptools 47 | pip install -r requirements/base.txt -r requirements/spark.txt 48 | pip install -e . 
49 | 50 | # ############################################### 51 | # Testing and Linting 52 | # 53 | # Rules for running our tests and for running various different linters 54 | # ############################################### 55 | test: 56 | pytest 57 | 58 | CI_FILES=cape_dataframes/pandas cape_dataframes/spark cape_dataframes/policy cape_dataframes/coordinator 59 | 60 | lint: 61 | flake8 . 62 | 63 | ci: lint test coverage 64 | 65 | fmt: 66 | isort --atomic . 67 | black . 68 | 69 | coverage: 70 | pytest --cov-report=xml --cov=cape_dataframes ${CI_FILES} 71 | coverage report --fail-under=90 72 | 73 | examples: 74 | shopt -s nullglob; \ 75 | for dir in examples examples/tutorials; do \ 76 | pushd $$dir; \ 77 | for i in *.py; do \ 78 | line=$$(head -n 1 $$i); \ 79 | if [[ $$line == "# SKIP_CI" ]]; then \ 80 | continue; \ 81 | fi; \ 82 | echo "Running $$i"; \ 83 | python $$i || exit 1; \ 84 | done; \ 85 | popd; \ 86 | done; 87 | 88 | docker: 89 | docker build -t capeprivacy/cape-dataframes . 90 | 91 | .PHONY: lint fmt test coverage examples 92 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Cape Python 2 | 3 | All contributions by Cape Privacy: 4 | Copyright (c) 2020, Cape, Inc. 5 | All rights reserved. 6 | 7 | All other contributions: 8 | Copyright (c) 2020, the respective contributors. 9 | All rights reserved. 10 | 11 | This project includes software developed by Cape Privacy, Inc (https://capeprivacy.com/). 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes 2 | 3 | [![](https://github.com/capeprivacy/cape-dataframes/workflows/Main/badge.svg)](https://github.com/capeprivacy/cape-dataframes/actions/workflows/main.yml) 4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | [![codecov](https://codecov.io/gh/capeprivacy/cape-python/branch/master/graph/badge.svg?token=L9A8HFAJK5)](https://codecov.io/gh/capeprivacy/cape-python) 6 | [![PyPI version](https://badge.fury.io/py/cape-privacy.svg)](https://badge.fury.io/py/cape-privacy) 7 | [![Cape Community Discord](https://img.shields.io/discord/1027271440061435975)](https://discord.gg/nQW7YxUYjh) 8 | 9 | A Python library supporting data transformations and collaborative privacy policies, for data science projects in Pandas and Apache Spark 10 | 11 | See below for instructions on how to get started or visit the [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 12 | 13 | ## Getting started 14 | 15 | ### Prerequisites 16 | 17 | * Python 3.6 or above, and pip 18 | * Pandas 1.0+ 19 | * PySpark 3.0+ (if using Spark) 20 | * [Make](https://www.gnu.org/software/make/) (if installing from source) 21 | 22 | ### Install with pip 23 | 24 | Cape Dataframes is available through PyPi. 25 | 26 | ```sh 27 | pip install cape-dataframes 28 | ``` 29 | 30 | Support for Apache Spark is optional. If you plan on using the library together with Apache Spark, we suggest the following instead: 31 | 32 | ```sh 33 | pip install cape-dataframes[spark] 34 | ``` 35 | 36 | We recommend running it in a virtual environment, such as [venv](https://docs.python.org/3/library/venv.html). 37 | 38 | ### Install from source 39 | 40 | It is possible to install the library from source. 
This installs all dependencies, including Apache Spark: 41 | 42 | ```sh 43 | git clone https://github.com/capeprivacy/cape-dataframes.git 44 | cd cape-dataframes 45 | make bootstrap 46 | ``` 47 | ### Usage example 48 | 49 | *This example is an abridged version of the tutorial found [here](https://github.com/capeprivacy/cape-dataframes/tree/master/examples/tutorials)* 50 | 51 | 52 | ```python 53 | df = pd.DataFrame({ 54 | "name": ["alice", "bob"], 55 | "age": [34, 55], 56 | "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)], 57 | }) 58 | 59 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 60 | perturb_numeric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 61 | 62 | df["name"] = tokenize(df["name"]) 63 | df["age"] = perturb_numeric(df["age"]) 64 | 65 | print(df.head()) 66 | # >> 67 | # name age birthdate 68 | # 0 f42c2f1964 34 1985-02-23 69 | # 1 2e586494b2 63 1963-05-10 70 | ``` 71 | 72 | These steps can be saved in policy files so you can share them and collaborate with your team: 73 | 74 | ```yaml 75 | # my-policy.yaml 76 | label: my-policy 77 | version: 1 78 | rules: 79 | - match: 80 | name: age 81 | actions: 82 | - transform: 83 | type: numeric-perturbation 84 | dtype: Integer 85 | min: -10 86 | max: 10 87 | seed: 4984 88 | - match: 89 | name: name 90 | actions: 91 | - transform: 92 | type: tokenizer 93 | max_token_len: 10 94 | key: my secret 95 | ``` 96 | 97 | You can then load this policy and apply it to your data frame: 98 | 99 | ```python 100 | # df can be a Pandas or Spark data frame 101 | policy = cape.parse_policy("my-policy.yaml") 102 | df = cape.apply_policy(policy, df) 103 | 104 | print(df.head()) 105 | # >> 106 | # name age birthdate 107 | # 0 f42c2f1964 34 1985-02-23 108 | # 1 2e586494b2 63 1963-05-10 109 | ``` 110 | 111 | You can see more [examples and usage](https://github.com/capeprivacy/cape-dataframes/tree/master/examples/) or read our [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 112 | 113 | ## About Cape Privacy and Cape Dataframes 114 | 115 | [Cape Privacy](https://capeprivacy.com) empowers developers to easily encrypt data and process it confidentially. No cryptography or key management required. Learn more at [capeprivacy.com](https://capeprivacy.com). 116 | 117 | Cape Dataframes brings Cape's policy language to Pandas and Apache Spark. The supported techniques include tokenization with linkability as well as perturbation and rounding. You can experiment with these techniques programmatically, in Python or in human-readable policy files. 118 | 119 | ### Project status and roadmap 120 | 121 | Cape Python 0.1.1 was released 24th June 2020. It is actively maintained and developed, alongside other elements of the Cape ecosystem. 122 | 123 | **Upcoming features:** 124 | 125 | * Reversible tokenization: allow reversing of tokenization to reveal the raw value. 126 | * Expand pipeline integrations: add Apache Beam, Apache Flink, Apache Arrow Flight or Dask integration as another pipeline we can support, either as part of Cape Dataframes or in its own separate project. 127 | 128 | ## Help and resources 129 | 130 | If you need help using Cape Dataframes, you can: 131 | 132 | * View the [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 133 | * Submit an issue.
134 | * Talk to us on the [Cape Community Discord](https://discord.gg/nQW7YxUYjh) [![Cape Community Discord](https://img.shields.io/discord/1027271440061435975)](https://discord.gg/nQW7YxUYjh) 135 | 136 | Please file [feature requests](https://github.com/capeprivacy/cape-dataframes/issues/new?template=feature_request.md) and 137 | [bug reports](https://github.com/capeprivacy/cape-dataframes/issues/new?template=bug_report.md) as GitHub issues. 138 | 139 | ### Contributing 140 | 141 | View our [contributing](CONTRIBUTING.md) guide for more information. 142 | 143 | ### Code of conduct 144 | 145 | Our [code of conduct](https://capeprivacy.com/conduct/) is included on the Cape Privacy website. All community members are expected to follow it. Please refer to that page for information on how to report problems. 146 | 147 | ## License 148 | 149 | Licensed under Apache License, Version 2.0 (see [LICENSE](https://github.com/capeprivacy/cape-python/blob/master/LICENSE) or http://www.apache.org/licenses/LICENSE-2.0). Copyright as specified in [NOTICE](https://github.com/capeprivacy/cape-python/blob/master/NOTICE). 150 | -------------------------------------------------------------------------------- /cape_dataframes/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes import pandas 2 | from cape_dataframes import spark 3 | from cape_dataframes.coordinator import Client 4 | from cape_dataframes.policy.policy import apply_policy 5 | from cape_dataframes.policy.policy import parse_policy 6 | 7 | __all__ = ["apply_policy", "pandas", "parse_policy", "spark", "Client"] 8 | -------------------------------------------------------------------------------- /cape_dataframes/audit/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.audit.audit import APPLY_POLICY_EVENT 2 | from cape_dataframes.audit.audit import AuditLogger 3 | 4 | __all__ = ["AuditLogger", "APPLY_POLICY_EVENT"] 5 | -------------------------------------------------------------------------------- /cape_dataframes/audit/audit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | APPLY_POLICY_EVENT = "apply-policy" 4 | 5 | 6 | class AuditLogger: 7 | def audit_log(self, event_name, target_id, target_type, target_label): 8 | logging.info( 9 | f"{event_name}: ID: {target_id} Type: {target_type} Label: {target_label}" 10 | ) 11 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.coordinator.client import Client 2 | 3 | __all__ = ["Client"] 4 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/cape_dataframes/coordinator/auth/__init__.py -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/api_token.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils import base64 2 | 3 | SECRET_BYTES = 16 4 | VERSION = b"\x01" 5 | 6 | 7 | class APIToken: 8 | """Represents an API token used to authenticate with the coordinator. 
9 | 10 | The format is: <token_id>,<base64 string> 11 | 12 | The first byte of the decoded Base64 string is the version and the rest 13 | is the secret. 14 | 15 | Attributes: 16 | token_id: The ID of the token. 17 | version: The version of the token format. 18 | secret: The password used to authenticate. 19 | raw: The raw token string. 20 | """ 21 | 22 | token_id: str 23 | version: bytes 24 | secret: bytes 25 | raw: str 26 | 27 | def __init__(self, token: str): 28 | self.raw = token 29 | splits = token.split(",") 30 | self.token_id = splits[0] 31 | 32 | token_bytes = bytes(base64.from_string(splits[1])) 33 | self.version = token_bytes[0] 34 | self.secret = token_bytes[1:] 35 | 36 | 37 | def create_api_token(token_id: str, secret: bytes) -> APIToken: 38 | """Creates an APIToken. Mostly used for testing. 39 | 40 | Args: 41 | token_id: The token id to use. 42 | secret: The password to use. 43 | 44 | Returns: 45 | The constructed APIToken. 46 | """ 47 | token_bytes = bytes(VERSION) + bytes(secret, "utf-8") 48 | b64 = base64.Base64(token_bytes) 49 | 50 | token = f"{token_id},{b64}" 51 | 52 | return APIToken(token) 53 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/api_token_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.coordinator.auth.api_token import create_api_token 2 | 3 | 4 | def test_api_token(): 5 | token_id = "imatokenid" 6 | secret = "aaaabbbbccccdddd" 7 | token = create_api_token(token_id, secret) 8 | 9 | assert token.token_id == token_id 10 | assert token.secret == bytes(secret, "utf-8") 11 | assert token.version == 1 12 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/client.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any 3 | from typing import Dict 4 | 5 | import requests 6 | import rfc3339 7 | 8 | from cape_dataframes.coordinator.auth.api_token import APIToken 9 | from cape_dataframes.policy import parse_policy 10 | from cape_dataframes.policy.data import Policy 11 | from cape_dataframes.utils import base64 12 | 13 | 14 | class GraphQLError: 15 | """Represents a GraphQL error that can be returned by a coordinator. 16 | 17 | Attributes: 18 | message: The error message. 19 | extensions: Any extra information returned by coordinator. 20 | """ 21 | 22 | message: str 23 | extensions: Dict[str, Any] 24 | 25 | def __init__(self, error): 26 | self.message = error["message"] 27 | 28 | if "extensions" in error: 29 | self.extensions = error["extensions"] 30 | 31 | 32 | class GraphQLException(Exception): 33 | """Exception wrapping a list of GraphQL errors. 34 | 35 | Attributes: 36 | errors: List of GraphQL errors. 37 | """ 38 | 39 | def __init__(self, errors): 40 | self.errors = [GraphQLError(error) for error in errors] 41 | 42 | 43 | class CapeError: 44 | """Represents a Cape error coming from the coordinator. 45 | 46 | Attributes: 47 | messages: A list of error messages 48 | cause: The cause of the error 49 | """ 50 | 51 | def __init__(self, error): 52 | self.messages = error["messages"] 53 | self.cause = error["cause"] 54 | 55 | 56 | class CapeException(Exception): 57 | """Exception wrapping a CapeError. 58 | Attributes: 59 | error: the CapeError 60 | """ 61 | 62 | def __init__(self, error): 63 | self.error = error 64 | 65 | 66 | class Client: 67 | """Coordinator client for making GraphQL requests.
68 | 69 | Implements a simple GraphQL protocol to communicate with a 70 | coordinator. 71 | 72 | Attributes: 73 | host: The address of the coordinator. 74 | token: The token used to authenticate with a coordinator. 75 | """ 76 | 77 | def __init__(self, host: str): 78 | self.host = f"{host}" 79 | self.token: str = "" 80 | 81 | self.s = requests.Session() 82 | 83 | def graphql_request(self, query: str, variables: Dict[str, str]): 84 | """Makes a GraphQL request to a coordinator. 85 | 86 | Adds an authorization header if it exists. 87 | 88 | Arguments: 89 | query: The GraphQL query to be passed to a coordinator. 90 | variables: The variables to be passed to a coordinator. 91 | 92 | Returns: 93 | The coordinator's GraphQL data response. 94 | 95 | Raises: 96 | GraphQLException: If a GraphQL error occurs. 97 | """ 98 | 99 | r = self.s.post( 100 | f"{self.host}/v1/query", 101 | json={"query": query, "variables": variables}, 102 | ) 103 | 104 | # attempt to get json so we can get the errors 105 | # if an error has occurred, if json doesn't exist 106 | # just raise the error 107 | try: 108 | j = r.json() 109 | except ValueError: 110 | r.raise_for_status() 111 | 112 | if "errors" in j: 113 | raise GraphQLException(j["errors"]) 114 | 115 | return j["data"] 116 | 117 | def login(self, token: str): 118 | """Logs in with the given token string""" 119 | 120 | self.api_token = APIToken(token) 121 | 122 | r = self.s.post( 123 | f"{self.host}/v1/login", 124 | json={ 125 | "token_id": self.api_token.token_id, 126 | "secret": str(base64.Base64(self.api_token.secret)), 127 | }, 128 | ) 129 | 130 | # attempt to get json so we can get the errors 131 | # if an error has occurred, if json doesn't exist 132 | # just raise the error 133 | try: 134 | j = r.json() 135 | except ValueError: 136 | r.raise_for_status() 137 | 138 | if "cause" in j: 139 | raise CapeException(j) 140 | 141 | self.token = base64.from_string(j["token"]) 142 | 143 | self.user = self.me() 144 | 145 | return self.token 146 | 147 | def me(self) -> Dict[str, Any]: 148 | """Returns the authenticated identity (id, name, and email).""" 149 | 150 | query = """ 151 | query Me() { 152 | me { 153 | id 154 | name 155 | email 156 | } 157 | } 158 | """ 159 | 160 | res = self.graphql_request(query, None) 161 | 162 | return res["me"] 163 | 164 | def get_policy(self, label: str) -> Policy: 165 | """Returns the current policy for a given project label.""" 166 | 167 | query = """ 168 | query CurrentSpec($label: ModelLabel!) { 169 | project(label: $label) { 170 | current_spec { 171 | id 172 | rules 173 | transformations 174 | } 175 | } 176 | } 177 | """ 178 | 179 | variables = { 180 | "label": label, 181 | } 182 | 183 | res = self.graphql_request(query, variables) 184 | 185 | spec = res["project"]["current_spec"] 186 | spec["label"] = label 187 | 188 | return parse_policy(spec, logger=self) 189 | 190 | def audit_log(self, event_name, target_id, target_type, target_label): 191 | """Records an audit log event with the coordinator.""" 192 | 193 | query = """ 194 | mutation AddAuditLog($audit: AuditEventInput!)
{ 195 | addAuditLog(audit: $audit) { 196 | event_name 197 | } 198 | } 199 | """ 200 | 201 | variables = { 202 | "audit": { 203 | "event_name": event_name, 204 | "user_id": self.user["id"], 205 | "user_name": self.user["name"], 206 | "user_email": self.user["email"], 207 | "time": rfc3339.rfc3339(datetime.now()), 208 | "target_id": target_id, 209 | "target_type": target_type, 210 | "target_label": target_label, 211 | }, 212 | } 213 | 214 | self.graphql_request(query, variables) 215 | 216 | def __repr__(self): 217 | return f"This client is connected to {self.host}" 218 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/client_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import responses 5 | 6 | from cape_dataframes.audit import APPLY_POLICY_EVENT 7 | from cape_dataframes.coordinator.auth.api_token import create_api_token 8 | from cape_dataframes.coordinator.client import Client 9 | from cape_dataframes.coordinator.client import GraphQLException 10 | from cape_dataframes.policy import parse_policy 11 | 12 | host = "http://localhost:8080" 13 | 14 | 15 | @responses.activate 16 | def test_graphql_error(): 17 | responses.add( 18 | responses.POST, 19 | f"{host}/v1/query", 20 | json={ 21 | "errors": [ 22 | { 23 | "message": "Access denied", 24 | "extensions": { 25 | "cause": { 26 | "name": "authorization_failure", 27 | "category": "unauthorized", 28 | } 29 | }, 30 | } 31 | ] 32 | }, 33 | ) 34 | 35 | c = Client(host) 36 | 37 | with pytest.raises(GraphQLException) as excinfo: 38 | c.me() 39 | 40 | g_err = excinfo.value.errors[0] 41 | assert g_err.message == "Access denied" 42 | assert g_err.extensions == { 43 | "cause": {"name": "authorization_failure", "category": "unauthorized"} 44 | } 45 | 46 | 47 | @responses.activate 48 | def test_login(): 49 | exp_token = "ABCDEFE" 50 | token_id = "specialid" 51 | secret = "secret" 52 | 53 | token = create_api_token(token_id, secret) 54 | 55 | def cb(request): 56 | resp_body = {"token": exp_token} 57 | 58 | return 200, {}, json.dumps(resp_body) 59 | 60 | responses.add_callback(responses.POST, f"{host}/v1/login", cb) 61 | 62 | my_id = "thisisanid" 63 | responses.add( 64 | responses.POST, 65 | f"{host}/v1/query", 66 | json={"data": {"me": {"id": my_id}}}, 67 | ) 68 | 69 | c = Client(host) 70 | 71 | c.login(token.raw) 72 | 73 | assert str(c.token) == exp_token 74 | 75 | 76 | @responses.activate 77 | def test_me(): 78 | my_id = "thisisanid" 79 | responses.add( 80 | responses.POST, 81 | f"{host}/v1/query", 82 | json={"data": {"me": {"id": my_id}}}, 83 | ) 84 | 85 | c = Client(host) 86 | 87 | user = c.me() 88 | 89 | assert my_id == user["id"] 90 | 91 | 92 | @responses.activate 93 | def test_get_policy(): 94 | rules = [ 95 | {"match": {"name": "column"}, "actions": [{"transform": {"name": "plusOne"}}]} 96 | ] 97 | 98 | responses.add( 99 | responses.POST, 100 | f"{host}/v1/query", 101 | json={"data": {"project": {"current_spec": {"rules": rules}}}}, 102 | ) 103 | 104 | c = Client(host) 105 | 106 | policy = c.get_policy("random-project") 107 | 108 | expected = {"label": "random-project", "rules": rules} 109 | 110 | expected = parse_policy(expected) 111 | 112 | assert policy.label == expected.label 113 | assert ( 114 | policy.rules[0].actions[0].transform.field 115 | == expected.rules[0].actions[0].transform.field 116 | ) 117 | 118 | 119 | @responses.activate 120 | def test_audit_log(): 121 | exp_token = "ABCDEFE" 122 | 
token_id = "specialid" 123 | secret = "secret" 124 | 125 | token = create_api_token(token_id, secret) 126 | 127 | def cb(request): 128 | resp_body = {"token": exp_token} 129 | 130 | return 200, {}, json.dumps(resp_body) 131 | 132 | responses.add_callback(responses.POST, f"{host}/v1/login", cb) 133 | 134 | my_id = "thisisanid" 135 | responses.add( 136 | responses.POST, 137 | f"{host}/v1/query", 138 | json={"data": {"me": {"id": my_id, "name": "hey", "email": "yo@yo.com"}}}, 139 | ) 140 | 141 | responses.add( 142 | responses.POST, 143 | f"{host}/v1/query", 144 | json={"data": {"addAuditLog": {"event_name": APPLY_POLICY_EVENT}}}, 145 | ) 146 | 147 | c = Client(host) 148 | 149 | c.login(token.raw) 150 | 151 | c.audit_log(APPLY_POLICY_EVENT, "idididid", "policy", "project-label") 152 | 153 | 154 | def test_client_repr(): 155 | c = Client(host) 156 | 157 | assert c.__repr__() == f"This client is connected to {host}" 158 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas import dtypes 2 | from cape_dataframes.pandas import registry 3 | from cape_dataframes.pandas import transformations 4 | from cape_dataframes.pandas.transformer import transformer 5 | 6 | __all__ = ["dtypes", "transformations", "transformer", "registry"] 7 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/dtypes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | String = pd.api.types.pandas_dtype(str) 5 | Date = pd.api.types.pandas_dtype("datetime64") 6 | # numeric types 7 | Float = pd.api.types.pandas_dtype(np.float32) 8 | Double = pd.api.types.pandas_dtype(np.float64) 9 | Byte = pd.api.types.pandas_dtype(np.byte) 10 | Short = pd.api.types.pandas_dtype(np.short) 11 | Integer = pd.api.types.pandas_dtype(np.int32) 12 | Long = pd.api.types.pandas_dtype(np.int64) 13 | # groupings 14 | Floats = (Float, Double) 15 | Integers = (Byte, Short, Integer, Long) 16 | Numerics = Floats + Integers 17 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/registry.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | 4 | from cape_dataframes.pandas.transformations import ColumnRedact 5 | from cape_dataframes.pandas.transformations import DatePerturbation 6 | from cape_dataframes.pandas.transformations import DateTruncation 7 | from cape_dataframes.pandas.transformations import NumericPerturbation 8 | from cape_dataframes.pandas.transformations import NumericRounding 9 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 10 | from cape_dataframes.pandas.transformations import RowRedact 11 | from cape_dataframes.pandas.transformations import Tokenizer 12 | from cape_dataframes.pandas.transformations import TokenReverser 13 | 14 | TransformationCtor = Callable 15 | 16 | _registry: Dict[str, TransformationCtor] = {} 17 | 18 | 19 | def get(transformation: str) -> TransformationCtor: 20 | """Returns the constructor for the given key. 21 | 22 | Arguments: 23 | transformation: The key of transformation to retrieve. 
24 | """ 25 | return _registry.get(transformation, None) 26 | 27 | 28 | def register(label: str, ctor: TransformationCtor): 29 | """Registers a new transformation constructor under the label provided. 30 | 31 | Arguments: 32 | label: The label that will be used as the key in the registry 33 | ctor: The transformation constructor 34 | """ 35 | _registry[label] = ctor 36 | 37 | 38 | register(DatePerturbation.identifier, DatePerturbation) 39 | register(NumericPerturbation.identifier, NumericPerturbation) 40 | register(NumericRounding.identifier, NumericRounding) 41 | register(Tokenizer.identifier, Tokenizer) 42 | register(DateTruncation.identifier, DateTruncation) 43 | register(ColumnRedact.identifier, ColumnRedact) 44 | register(RowRedact.identifier, RowRedact) 45 | register(TokenReverser.identifier, TokenReverser) 46 | register(ReversibleTokenizer.identifier, ReversibleTokenizer) 47 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/registry_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas import registry 2 | from cape_dataframes.pandas.transformations import test_utils 3 | 4 | 5 | def test_get(): 6 | registry.register("plusN", test_utils.PlusN) 7 | ctor = registry.get("plusN") 8 | args = {"n": 1} 9 | ctor(**args) 10 | 11 | 12 | def test_get_missing(): 13 | ctor = registry.get("plusWhat?") 14 | assert ctor is None 15 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas.transformations.column_redact import ColumnRedact 2 | from cape_dataframes.pandas.transformations.perturbation import DatePerturbation 3 | from cape_dataframes.pandas.transformations.perturbation import NumericPerturbation 4 | from cape_dataframes.pandas.transformations.rounding import DateTruncation 5 | from cape_dataframes.pandas.transformations.rounding import NumericRounding 6 | from cape_dataframes.pandas.transformations.row_redact import RowRedact 7 | from cape_dataframes.pandas.transformations.tokenizer import ReversibleTokenizer 8 | from cape_dataframes.pandas.transformations.tokenizer import Tokenizer 9 | from cape_dataframes.pandas.transformations.tokenizer import TokenReverser 10 | 11 | __all__ = [ 12 | "DateTruncation", 13 | "DatePerturbation", 14 | "NumericPerturbation", 15 | "NumericRounding", 16 | "ReversibleTokenizer", 17 | "Tokenizer", 18 | "TokenReverser", 19 | "ColumnRedact", 20 | "RowRedact", 21 | ] 22 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class AbstractTransformation(metaclass=abc.ABCMeta): 5 | @property 6 | @abc.abstractmethod 7 | def dtype(self): 8 | pass 9 | 10 | @abc.abstractmethod 11 | def __call__(self, x): 12 | pass 13 | 14 | 15 | class Transformation(AbstractTransformation): 16 | def __init__(self, dtype): 17 | self._dtype = dtype 18 | 19 | @property 20 | def dtype(self): 21 | return self._dtype 22 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/column_redact.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | 5 | 6 
| class ColumnRedact: 7 | """Redacts columns. 8 | 9 | Attributes: 10 | columns: The columns to redact. 11 | """ 12 | 13 | identifier = "column-redact" 14 | type_signature = "df->df" 15 | 16 | def __init__(self, columns: List[str]) -> None: 17 | self.columns = columns 18 | 19 | def __call__(self, df: pd.DataFrame) -> pd.DataFrame: 20 | return df.drop(columns=self.columns) 21 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/column_redact_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.pandas.transformations import ColumnRedact 6 | 7 | 8 | def test_column_redact(): 9 | redact = ColumnRedact(["b", "c"]) 10 | 11 | df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"]) 12 | 13 | expected = pd.DataFrame(np.ones((5,)), columns=["a"]) 14 | 15 | result = redact(df) 16 | 17 | pdt.assert_frame_equal(result, expected) 18 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/perturbation.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Optional 3 | from typing import Tuple 4 | from typing import Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from cape_dataframes.pandas import dtypes 10 | from cape_dataframes.pandas.transformations import base 11 | from cape_dataframes.utils import typecheck 12 | 13 | _FREQUENCY_TO_DELTA_FN = { 14 | "YEAR": lambda noise: pd.to_timedelta(noise * 365, unit="days"), 15 | "MONTH": lambda noise: pd.to_timedelta(noise * 30, unit="days"), 16 | "DAY": lambda noise: pd.to_timedelta(noise, unit="days"), 17 | "HOUR": lambda noise: pd.to_timedelta(noise, unit="hours"), 18 | "minutes": lambda noise: pd.to_timedelta(noise, unit="minutes"), 19 | "seconds": lambda noise: pd.to_timedelta(noise, unit="seconds"), 20 | } 21 | IntTuple = Union[int, Tuple[int, ...]] 22 | StrTuple = Union[str, Tuple[str, ...]] 23 | 24 | 25 | class NumericPerturbation(base.Transformation): 26 | """Add uniform random noise to a numeric Pandas series 27 | 28 | Mask a numeric Pandas series by adding uniform random 29 | noise to each value. The amount of noise is drawn from 30 | the interval [min, max). 
31 | 32 | Example: 33 | ``` 34 | s = pd.Series([0, 1, 2, 3, 4]) 35 | perturb = NumericPerturbation(dtype=Integer, min=-10, max=10, seed=123) 36 | perturb(s) # pd.Series([3, -7, -3, -3]) 37 | ``` 38 | 39 | Attributes: 40 | dtype (dtypes.Numerics): Pandas Series type 41 | min (int, float): the values generated will be greater than or equal to min 42 | max (int, float): the values generated will be less than max 43 | seed (int), optional: a seed to initialize the random generator 44 | """ 45 | 46 | identifier = "numeric-perturbation" 47 | type_signature = "col->col" 48 | 49 | def __init__( 50 | self, 51 | dtype: dtypes.Numerics, 52 | min: Union[int, float], 53 | max: Union[int, float], 54 | seed: Optional[int] = None, 55 | ): 56 | assert dtype in dtypes.Numerics 57 | typecheck.check_arg(min, (int, float)) 58 | typecheck.check_arg(max, (int, float)) 59 | typecheck.check_arg(seed, (int, type(None))) 60 | super().__init__(dtype) 61 | self._min = min 62 | self._max = max 63 | self._rng = np.random.default_rng(seed=seed) 64 | 65 | def __call__(self, x: pd.Series) -> pd.Series: 66 | noise = pd.Series(self._rng.uniform(self._min, self._max, size=x.shape)) 67 | if not isinstance(noise.dtype.type, self.dtype.type): 68 | noise = noise.astype(self.dtype) 69 | return x + noise 70 | 71 | 72 | class DatePerturbation(base.Transformation): 73 | """Add uniform random noise to a Pandas series of timestamps 74 | 75 | Mask a Pandas series by adding uniform random noise to the 76 | specified frequencies of timestamps. The amount of noise for 77 | each frequency is drawn from the interval [min_freq, max_freq). 78 | 79 | Example: 80 | ``` 81 | s = pd.Series([datetime.date(year=2020, month=2, day=15)]) 82 | perturb = DatePerturbation(frequency="MONTH", min=-10, max=10, seed=1234) 83 | perturb(s) # pd.Series([datetime.date(year=2020, month=11, day=11)]) 84 | ``` 85 | 86 | Attributes: 87 | frequency (str, str list): one or more frequencies to perturb 88 | min (int, int list): the frequency value will be greater or equal to min 89 | max (int, int list): the frequency value will be less than max 90 | seed (int), optional: a seed to initialize the random generator 91 | """ 92 | 93 | identifier = "date-perturbation" 94 | type_signature = "col->col" 95 | 96 | def __init__( 97 | self, 98 | frequency: StrTuple, 99 | min: IntTuple, 100 | max: IntTuple, 101 | seed: Optional[int] = None, 102 | ): 103 | super().__init__(dtypes.Date) 104 | self._frequency = _check_freq_arg(frequency) 105 | self._min = _check_minmax_arg(min) 106 | self._max = _check_minmax_arg(max) 107 | self._rng = np.random.default_rng(seed) 108 | 109 | def __call__(self, x: pd.Series): 110 | is_date_no_time = False 111 | 112 | # Use equality instead of isinstance because of inheritance 113 | if type(x.iloc[0]) == datetime.date: 114 | x = pd.to_datetime(x) 115 | is_date_no_time = True 116 | 117 | for f, mn, mx in zip(self._frequency, self._min, self._max): 118 | noise = self._rng.integers(mn, mx, size=x.shape) 119 | delta_fn = _FREQUENCY_TO_DELTA_FN.get(f, None) 120 | if delta_fn is None: 121 | raise ValueError( 122 | "Frequency {} must be one of {}.".format( 123 | f, list(_FREQUENCY_TO_DELTA_FN.keys()) 124 | ) 125 | ) 126 | x += delta_fn(noise) 127 | 128 | if is_date_no_time: 129 | return pd.Series(x).dt.date 130 | else: 131 | return x 132 | 133 | 134 | def _check_minmax_arg(arg): 135 | """Checks that arg is an integer or a flat collection of integers.""" 136 | if not isinstance(arg, (tuple, list)): 137 | if not isinstance(arg, int): 138 | raise ValueError
139 | return [arg] 140 | else: 141 | for a in arg: 142 | if not isinstance(a, int): 143 | raise ValueError 144 | return arg 145 | 146 | 147 | def _check_freq_arg(arg): 148 | """Checks that arg in one of the frequency options.""" 149 | if not isinstance(arg, (tuple, list)): 150 | if not isinstance(arg, str): 151 | raise ValueError 152 | return [arg] 153 | else: 154 | for a in arg: 155 | if not isinstance(a, str): 156 | raise ValueError 157 | return arg 158 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/perturbation_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.testing as pdt 6 | 7 | from cape_dataframes.pandas import dtypes 8 | from cape_dataframes.pandas.transformations import DatePerturbation 9 | from cape_dataframes.pandas.transformations import NumericPerturbation 10 | 11 | 12 | def test_perturbation_float(): 13 | transform = NumericPerturbation(dtype=dtypes.Float, min=-10, max=10, seed=1234) 14 | 15 | df = pd.DataFrame({"amount": range(5)}) 16 | expected = pd.DataFrame( 17 | {"amount": [9.53399, -1.39608, 10.46492, -1.76615, 0.38194]} 18 | ) 19 | 20 | df["amount"] = transform(df.amount) 21 | 22 | pdt.assert_frame_equal(df, expected) 23 | 24 | 25 | def test_perturbation_int(): 26 | transform = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10, seed=12345) 27 | 28 | df = pd.DataFrame({"amount": range(5)}) 29 | expected = pd.DataFrame({"amount": [-5, -2, 7, 6, 2]}) 30 | 31 | df["amount"] = transform(df.amount) 32 | 33 | pdt.assert_frame_equal(df, expected) 34 | 35 | 36 | def test_perturbation_datetime(): 37 | transform = DatePerturbation(frequency="DAY", min=-10, max=10, seed=1234) 38 | 39 | df = pd.DataFrame({"date": [np.datetime64("2018-10-15")]}) 40 | expected = pd.DataFrame({"date": [np.datetime64("2018-10-24")]}) 41 | 42 | df["date"] = transform(df.date) 43 | 44 | pdt.assert_frame_equal(df, expected) 45 | 46 | 47 | def test_perturbation_date(): 48 | transform = DatePerturbation(frequency="DAY", min=-10, max=10, seed=1234) 49 | 50 | df = pd.DataFrame({"date": [datetime.date(year=2018, month=10, day=15)]}) 51 | expected = pd.DataFrame({"date": [datetime.date(year=2018, month=10, day=24)]}) 52 | 53 | df["date"] = transform(df.date) 54 | 55 | pdt.assert_frame_equal(df, expected) 56 | 57 | 58 | def test_perturbation_dat_mutliple_freq(): 59 | transform = DatePerturbation( 60 | frequency=("DAY", "YEAR"), min=(-10, -5), max=(10, 5), seed=1234 61 | ) 62 | 63 | df = pd.DataFrame({"date": [np.datetime64("2018-10-15")]}) 64 | expected = pd.DataFrame({"date": [np.datetime64("2022-10-23")]}) 65 | 66 | df["date"] = transform(df.date) 67 | 68 | pdt.assert_frame_equal(df, expected) 69 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/rounding.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from cape_dataframes.pandas import dtypes 6 | from cape_dataframes.pandas.transformations import base 7 | from cape_dataframes.utils import typecheck 8 | 9 | 10 | class NumericRounding(base.Transformation): 11 | """Reduce the precision of a numeric Pandas Series 12 | 13 | Round each value in the Pandas Series to the given number 14 | of digits. 
15 | 16 | Example: 17 | ``` 18 | s = pd.Series([1.384]) 19 | round = NumericRounding(precision=1) 20 | round(s) # pd.Series([1.4]) 21 | ``` 22 | 23 | Attributes: 24 | dtypes (dtypes.Numerics): Pandas Series type. 25 | precision (int): set the number of digits. 26 | """ 27 | 28 | identifier = "numeric-rounding" 29 | type_signature = "col->col" 30 | 31 | def __init__(self, dtype: dtypes.Numerics, precision: int): 32 | if dtype not in dtypes.Numerics: 33 | raise ValueError("NumericRounding requires a Numeric dtype.") 34 | typecheck.check_arg(precision, int) 35 | super().__init__(dtype) 36 | self._precision = precision 37 | 38 | def __call__(self, x: pd.Series) -> pd.Series: 39 | """Round each value in the Pandas Series 40 | 41 | Args: 42 | x (A Pandas Series): need to be a list of numeric values. 43 | 44 | Return: 45 | A Pandas Series with each value rounded 46 | """ 47 | return self.round_numeric(x) 48 | 49 | def round_numeric(self, x: pd.Series): 50 | rounded = x.round(self._precision) 51 | if isinstance(rounded.dtype.type, self.dtype.type): 52 | return rounded 53 | else: 54 | return rounded.astype(self.dtype) 55 | 56 | 57 | class DateTruncation(base.Transformation): 58 | """Reduce the precision of a date Pandas Series 59 | Truncate each date in a Pandas Series to the unit (year 60 | or month) specified by frequency. 61 | Example: 62 | ``` 63 | s = pd.Series([pd.Timestamp("2018-10-15")]) 64 | trunc = DateTruncation(frequency="year") 65 | trunc(s) # pd.Serie([pd.Timestamp("2018-01-01")]) 66 | ``` 67 | Attributes: 68 | frequency (string): expect to be 'year' or 'month' 69 | """ 70 | 71 | identifier = "date-truncation" 72 | type_signature = "col->col" 73 | 74 | def __init__(self, frequency: str): 75 | typecheck.check_arg(frequency, str) 76 | super().__init__(dtypes.Date) 77 | self._frequency = frequency.lower() 78 | _check_freq_arg(self._frequency) 79 | 80 | def __call__(self, x: pd.Series) -> pd.Series: 81 | return self._trunc_date(x) 82 | 83 | def _trunc_date(self, x: pd.Series) -> pd.Series: 84 | if self._frequency == "year": 85 | truncated = x.values.astype(" None: 15 | self.condition = condition 16 | 17 | def __call__(self, df: pd.DataFrame) -> pd.DataFrame: 18 | """Redacts rows using Dataframe.query. 19 | 20 | DataFrame.query returns all the fields that it matches so 21 | we negate it here to get the opposite. 22 | """ 23 | 24 | condition = f"~({self.condition})" 25 | return df.query(condition) 26 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/row_redact_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.pandas.transformations import RowRedact 6 | 7 | 8 | def test_row_redact(): 9 | redact = RowRedact("a > 5") 10 | 11 | df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"]) 12 | 13 | df["a"].iloc[0] = 6 14 | df["a"].iloc[3] = 6 15 | 16 | expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"], index=[1, 2, 4]) 17 | 18 | result = redact(df) 19 | 20 | pdt.assert_frame_equal(result, expected) 21 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/test_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class PlusN: 5 | """A sample transform that adds n to a specific field. 
6 | 7 | Attributes: 8 | field: The field that this transform will be applied to. 9 | n: The value to add to the field. 10 | """ 11 | 12 | identifier = "plusN" 13 | type_signature = "col->col" 14 | 15 | def __init__(self, n: int = 1) -> None: 16 | self.n = n 17 | 18 | def __call__(self, column: pd.Series) -> pd.Series: 19 | return column + self.n 20 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/tokenizer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import secrets 3 | 4 | import pandas as pd 5 | from Crypto.Cipher import AES 6 | 7 | from cape_dataframes.pandas import dtypes 8 | from cape_dataframes.pandas.transformations import base 9 | from cape_dataframes.utils import typecheck 10 | 11 | 12 | class Tokenizer(base.Transformation): 13 | """Tokenizer: map a string to a token to obfuscate it. 14 | 15 | When applying the Tokenizer to a Pandas Series of type string, 16 | each value gets mapped to a token (hexadecimal string). 17 | If a value is repeated several times across the series, it always 18 | gets mapped to the same token in order to maintain the count. 19 | A value can be mapped to different tokens by setting the key to a 20 | different value. 21 | 22 | Example: 23 | ``` 24 | s = pd.Series(['A']) 25 | tokenize = Tokenizer(max_token_len=5, key='secret') 26 | tokenize(s) # pd.Series(['40a1e']) 27 | ``` 28 | 29 | Attributes: 30 | max_token_len (int, optional): controls the token length (default 31 | length is 64) 32 | key: expect a string or byte string. If not specified, key will 33 | be set to a random byte string. 34 | """ 35 | 36 | identifier = "tokenizer" 37 | type_signature = "col->col" 38 | 39 | def __init__(self, max_token_len: int = None, key: str = None): 40 | typecheck.check_arg(max_token_len, (int, type(None))) 41 | typecheck.check_arg(key, (str, bytes, type(None))) 42 | super().__init__(dtype=dtypes.String) 43 | self._max_token_len = max_token_len 44 | if isinstance(key, str): 45 | key = key.encode() 46 | self._key = key or secrets.token_bytes(8) 47 | 48 | def __call__(self, series: pd.Series) -> pd.Series: 49 | """Map a Pandas Series to tokens. 50 | 51 | Args: 52 | series (A Pandas Series): need to be a list of strings. 53 | 54 | Return: 55 | A Pandas Series with a list of tokens represented as hexadecimal 56 | strings. 57 | """ 58 | 59 | return series.apply(lambda x: self.to_token(x)) 60 | 61 | def to_token(self, x): 62 | token = hashlib.sha256(x.encode() + self.key).hexdigest() 63 | if self._max_token_len is not None: 64 | return token[: self._max_token_len] 65 | else: 66 | return token 67 | 68 | @property 69 | def key(self): 70 | return self._key 71 | 72 | 73 | class ReversibleTokenizer(base.Transformation): 74 | """ReversibleTokenizer: map a string to a token to obfuscate it. 75 | 76 | When applying the ReversibleTokenizer to a Pandas Series of type string, 77 | each value gets mapped to a token (hexadecimal string). 78 | If a value is repeated several times across the series, it always 79 | gets mapped to the same token in order to maintain the count. 80 | A value can be mapped to different tokens by setting the key to a 81 | different value. 82 | 83 | This tokenizer allows tokens to be reversed to their original data 84 | when the secret key is known.
85 | 86 | Example: 87 | ``` 88 | s = pd.Series(['A']) 89 | tokenize = ReversibleTokenizer(key='secret') 90 | tokenize(s) # pd.Series(['40a1e']) 91 | ``` 92 | 93 | Attributes: 94 | key: expect a string or byte string of length exactly 32 bytes. 95 | encoding: string identifying the Python encoding used for inputs. 96 | """ 97 | 98 | identifier = "reversible-tokenizer" 99 | type_signature = "col->col" 100 | 101 | def __init__(self, key, encoding="utf-8"): 102 | typecheck.check_arg(key, (str, bytes)) 103 | typecheck.check_arg(encoding, str) 104 | super().__init__(dtype=dtypes.String) 105 | if isinstance(key, str): 106 | key = key.encode() 107 | if len(key) != 32: 108 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 109 | self.key = key 110 | self.encoding = encoding 111 | 112 | def __call__(self, series: pd.Series) -> pd.Series: 113 | """Map a Pandas Series to tokens. 114 | 115 | Args: 116 | series (A Pandas Series): need to be a list of strings. 117 | 118 | Return: 119 | A Pandas Series with a list of tokens represented as hexadecimal 120 | strings. 121 | """ 122 | 123 | return series.apply(self._to_token) 124 | 125 | def _to_token(self, x: str): 126 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 127 | ciphertext, tag = cipher.encrypt_and_digest(x.encode(encoding=self.encoding)) 128 | assert len(tag) == 16, len(tag) 129 | token = tag.hex() + ciphertext.hex() 130 | return token 131 | 132 | 133 | class TokenReverser(base.Transformation): 134 | """TokenReverser: recover string from token. 135 | 136 | When applying the TokenReverser to a Pandas Series of tokens, 137 | each token is mapped back to the string that was originally used 138 | by ReversibleTokenizer to construct the token. The same key must 139 | be used. 140 | 141 | Example: 142 | ``` 143 | s = pd.Series(['40a1e']) 144 | reverser = TokenReverser(key='secret') 145 | reverser(s) # pd.Series(['A']) 146 | ``` 147 | 148 | Attributes: 149 | key: expect a string or byte string of length exactly 32 bytes. 150 | encoding: string identifying the Python encoding used for outputs. 151 | """ 152 | 153 | identifier = "token-reverser" 154 | type_signature = "col->col" 155 | 156 | def __init__(self, key, encoding="utf-8"): 157 | typecheck.check_arg(key, (str, bytes)) 158 | typecheck.check_arg(encoding, str) 159 | super().__init__(dtype=dtypes.String) 160 | if isinstance(key, str): 161 | key = key.encode() 162 | if len(key) != 32: 163 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 164 | self.key = key 165 | self.encoding = encoding 166 | 167 | def __call__(self, series: pd.Series) -> pd.Series: 168 | """Reverse a Pandas Series of tokens. 169 | 170 | Args: 171 | series (A Pandas Series): need to be a list of strings. 172 | 173 | Return: 174 | A Pandas Series with a list of recovered strings. 
175 | """ 176 | 177 | return series.apply(self._from_token) 178 | 179 | def _from_token(self, token: str): 180 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 181 | token_bytes = bytearray.fromhex(token) 182 | tag, ciphertext = token_bytes[:16], token_bytes[16:] 183 | x = cipher.decrypt_and_verify(ciphertext, tag) 184 | return x.decode(encoding=self.encoding) 185 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/tokenizer_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pandas.testing as pdt 3 | import pytest 4 | 5 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 6 | from cape_dataframes.pandas.transformations import Tokenizer 7 | from cape_dataframes.pandas.transformations import TokenReverser 8 | 9 | 10 | def test_tokenizer(): 11 | transform = Tokenizer(key="secret_key") 12 | 13 | df = pd.DataFrame({"name": ["Alice", "Bob"]}) 14 | expected = pd.DataFrame( 15 | { 16 | "name": [ 17 | "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24", 18 | "dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b", 19 | ] 20 | } 21 | ) 22 | 23 | df["name"] = transform(df["name"]) 24 | 25 | pdt.assert_frame_equal(df, expected) 26 | 27 | 28 | def test_tokenizer_with_max_size(): 29 | transform = Tokenizer(max_token_len=10, key="secret_key") 30 | 31 | df = pd.DataFrame({"name": ["Alice", "Bob"]}) 32 | expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]}) 33 | 34 | df["name"] = transform(df["name"]) 35 | 36 | pdt.assert_frame_equal(df, expected) 37 | 38 | 39 | def test_reversible_tokenizer(): 40 | key = b"5" * 32 41 | plaintext = pd.DataFrame({"name": ["Alice", "Bob"]}) 42 | 43 | tokenizer = ReversibleTokenizer(key=key) 44 | tokenized_expected = pd.DataFrame( 45 | { 46 | "name": [ 47 | "c8c7e80144304276183e5bcd589db782bc5ff95309", 48 | "e0f40aea0d5c21b35967c4231b98b5b3e5338e", 49 | ] 50 | } 51 | ) 52 | tokenized = pd.DataFrame() 53 | tokenized["name"] = tokenizer(plaintext["name"]) 54 | pdt.assert_frame_equal(tokenized, tokenized_expected) 55 | 56 | reverser = TokenReverser(key=key) 57 | recovered = pd.DataFrame() 58 | recovered["name"] = reverser(tokenized["name"]) 59 | pdt.assert_frame_equal(recovered, plaintext) 60 | 61 | 62 | def test_reversible_tokenizer_string_key(): 63 | _ = ReversibleTokenizer(key="5" * 32) 64 | 65 | 66 | def test_reversible_tokenizer_insufficient_key(): 67 | with pytest.raises(ValueError): 68 | _ = ReversibleTokenizer(key=b"5" * 10) 69 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from cape_dataframes.pandas.transformations import base as tfm 4 | 5 | 6 | def transformer(transformation: tfm.Transformation, df: pd.DataFrame, field_name: str): 7 | df[field_name] = transformation(df[field_name]) 8 | return df 9 | -------------------------------------------------------------------------------- /cape_dataframes/policy/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.policy.data import Policy 2 | from cape_dataframes.policy.exceptions import NamedTransformNotFound 3 | from cape_dataframes.policy.exceptions import TransformNotFound 4 | from cape_dataframes.policy.policy import parse_policy 5 | from cape_dataframes.policy.policy 
import reverse 6 | 7 | __all__ = [ 8 | "parse_policy", 9 | "Policy", 10 | "NamedTransformNotFound", 11 | "TransformNotFound", 12 | "reverse", 13 | ] 14 | -------------------------------------------------------------------------------- /cape_dataframes/policy/data.py: -------------------------------------------------------------------------------- 1 | """Contains the policy classes that are initialized from a yaml policy file. 2 | 3 | There are five main classes with Policy being the top level class. A Policy contains 4 | Rules and NamedTransforms. Rules contain Actions, and each Action wraps the 5 | Transform that will be applied. 6 | 7 | Typical usage example: 8 | 9 | yaml_str = "...." 10 | d = yaml.load(yaml_str, Loader=yaml.FullLoader) 11 | 12 | # **d unpacks the dictionary produced by yaml and 13 | # passes them in as keyword arguments. 14 | policy = Policy(**d) 15 | """ 16 | 17 | from typing import List 18 | 19 | import yaml 20 | 21 | from cape_dataframes.audit import AuditLogger 22 | from cape_dataframes.utils import base64 23 | 24 | 25 | class Transform: 26 | """An actual transform that will be applied. 27 | 28 | Either name or type must be passed in here. The process to apply this 29 | transform will look at both and apply the relevant one. 30 | 31 | Attributes: 32 | field: The field this transform will be applied to. 33 | name: The name of the named transform, referenced from 34 | the top level policy object. 35 | type: The builtin transform that will be initialized. 36 | kwargs: The rest of the arguments that will be passed to the transformation. 37 | """ 38 | 39 | def __init__(self, field, name=None, type=None, **kwargs): 40 | if field == "": 41 | raise ValueError("Field must be specified for transformation") 42 | 43 | if name is None and type is None: 44 | raise ValueError( 45 | "Either named or function must be specified" 46 | + f" for transformation on field {field}" 47 | ) 48 | 49 | if name is not None and type is not None: 50 | raise ValueError( 51 | "Both named and function cannot be " 52 | + f"set for transformation on field {field}" 53 | ) 54 | 55 | self.field = field 56 | self.name = name 57 | self.type = type 58 | self.args = kwargs 59 | 60 | 61 | class Action: 62 | def __init__(self, field, transform=None): 63 | self.transform = Transform(field, **transform) 64 | 65 | 66 | class Rule: 67 | """A rule contains actionable information of a policy. 68 | 69 | Attributes: 70 | match: The match used to select a field to be transformed. 71 | actions: The actions to take on a matched field. 72 | """ 73 | 74 | def __init__(self, match, actions=[]): 75 | self.actions = [] 76 | for action in actions: 77 | if type(action) is dict: 78 | self.actions.append(Action(match["name"], **action)) 79 | # special case for dropping a column (i.e. column redaction) 80 | elif type(action) is str and action == "drop": 81 | self.actions.append( 82 | Action( 83 | match["name"], 84 | {"type": "column-redact", "columns": [match["name"]]}, 85 | ) 86 | ) 87 | 88 | self.transformations = [action.transform for action in self.actions] 89 | 90 | 91 | class NamedTransform: 92 | """A named transformation that captures the args. 93 | 94 | Attributes: 95 | name: The name of the named transformation. 96 | type: The builtin type (i.e. transform) that the named transform initializes to. 97 | kwargs: The args that are captured by the named transform.
98 | """ 99 | 100 | def __init__(self, name, type, **kwargs): 101 | if name == "": 102 | raise ValueError("Name must be specified for named transformation") 103 | 104 | if type == "": 105 | raise ValueError(f"Type must be specified for named transformation {name}") 106 | 107 | if len(kwargs) == 0: 108 | raise ValueError( 109 | f"Args must be specified for named transformation {self.name}" 110 | ) 111 | 112 | self.name = name 113 | self.type = type 114 | self.args = kwargs 115 | 116 | for key, arg in self.args.items(): 117 | # if an arg is a secret 118 | if isinstance(arg, dict) and "type" in arg and arg["type"] == "secret": 119 | if "value" not in arg: 120 | raise ValueError( 121 | "Secret named transformation arg" 122 | + f"{arg['name']} must contain a value" 123 | ) 124 | 125 | # then set the arg value to the inner value 126 | self.args[key] = bytes(base64.from_string(arg["value"])) 127 | 128 | 129 | class Policy: 130 | """Top level policy object. 131 | 132 | The top level policy object holds the all of the relevant information 133 | for applying policy to data. 134 | 135 | Attributes: 136 | label: The label of the policy. 137 | version: The version of the policy. 138 | rules: List of rules that will be applied to a data frame. 139 | transformations: The named transformations for this policy. 140 | """ 141 | 142 | def __init__( 143 | self, 144 | logger: AuditLogger = AuditLogger(), 145 | id: str = "", 146 | label: str = "", 147 | version: int = 1, 148 | rules: List[Rule] = [], 149 | transformations: List[NamedTransform] = [], 150 | ): 151 | self.id = id 152 | self.logger = logger 153 | self.label = label 154 | self.version = version 155 | 156 | self._raw_transforms = transformations 157 | self.transformations = [ 158 | NamedTransform(**transform) for transform in transformations 159 | ] 160 | 161 | if len(rules) == 0: 162 | raise ValueError( 163 | f"At least one rule must be specified for policy specification {label}" 164 | ) 165 | 166 | self._raw_rules = rules 167 | self.rules = [Rule(**rule) for rule in rules] 168 | 169 | def __repr__(self): 170 | d = { 171 | "label": self.label, 172 | "version": self.version, 173 | "transformations": self._raw_transforms, 174 | "rules": self._raw_rules, 175 | } 176 | 177 | return "Policy:\n\n" + yaml.dump(d, sort_keys=False) 178 | -------------------------------------------------------------------------------- /cape_dataframes/policy/data_test.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from cape_dataframes.policy.data import Policy 4 | from cape_dataframes.policy.policy_test_fixtures import named_with_secret_y 5 | from cape_dataframes.utils import base64 6 | 7 | y = """label: test_policy 8 | version: 1 9 | transformations: 10 | - name: plusOne 11 | type: plusN 12 | n: 1 13 | rules: 14 | - match: 15 | name: test 16 | actions: 17 | - transform: 18 | name: plusOne 19 | - transform: 20 | type: plusN 21 | n: 1 22 | - match: 23 | name: test2 24 | """ 25 | 26 | 27 | def test_policy_class(): 28 | d = yaml.load(y, Loader=yaml.FullLoader) 29 | 30 | p = Policy(**d) 31 | 32 | assert p.label == "test_policy" 33 | assert len(p.transformations) == 1 34 | 35 | named = p.transformations[0] 36 | assert named.name == "plusOne" 37 | assert named.type == "plusN" 38 | assert len(named.args) == 1 39 | 40 | assert named.args["n"] == 1 41 | 42 | rule = p.rules[0] 43 | assert len(p.rules) == 2 44 | assert len(rule.actions) == 2 45 | 46 | assert len(rule.transformations) == 2 47 | 48 | namedTransform = 
rule.transformations[0] 49 | builtinTransform = rule.transformations[1] 50 | 51 | assert namedTransform.field == "test" 52 | assert namedTransform.name == "plusOne" 53 | 54 | assert builtinTransform.field == "test" 55 | assert builtinTransform.type == "plusN" 56 | assert builtinTransform.args["n"] == 1 57 | 58 | 59 | def test_policy_with_secret(): 60 | d = yaml.load(named_with_secret_y, Loader=yaml.FullLoader) 61 | 62 | p = Policy(**d) 63 | 64 | assert p.transformations[1].args["key"] == bytes(base64.from_string("BASE")) 65 | 66 | 67 | def test_policy_repr(): 68 | d = yaml.load(y, Loader=yaml.FullLoader) 69 | 70 | p = Policy(**d) 71 | 72 | assert p.__repr__() == "Policy:\n\n" + y 73 | -------------------------------------------------------------------------------- /cape_dataframes/policy/exceptions.py: -------------------------------------------------------------------------------- 1 | class DependencyError(Exception): 2 | pass 3 | 4 | 5 | class NamedTransformNotFound(Exception): 6 | pass 7 | 8 | 9 | class TransformNotFound(Exception): 10 | pass 11 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy.py: -------------------------------------------------------------------------------- 1 | """Utils for parsing policy and applying them. 2 | 3 | The module reads in policy as yaml and then through apply_policy 4 | applies them to dataframes. 5 | 6 | Example policy yaml: 7 | 8 | label: test_policy 9 | version: 1 10 | rules: 11 | - match: 12 | name: value 13 | actions: 14 | # Tells the policy runner to apply the transformation 15 | # plusN with the specified arguments. 16 | - transform: 17 | type: plusN 18 | n: 1 19 | # Tells the policy runner to apply another plusN 20 | # transformation. 21 | - transform: 22 | type: plusN 23 | n: 2 24 | 25 | Applying policy: 26 | 27 | policy = parse_policy("policy.yaml") 28 | df = pd.DataFrame(np.ones(5,), columns=["value"]) 29 | df = apply_policy(policy, df) 30 | """ 31 | 32 | import copy 33 | import logging 34 | import types 35 | from typing import Any 36 | from typing import Callable 37 | from typing import Dict 38 | from typing import Union 39 | 40 | import pandas as pd 41 | import pyspark 42 | import requests 43 | import validators 44 | import yaml 45 | 46 | from cape_dataframes import pandas as pandas_lib 47 | from cape_dataframes import spark as spark_lib 48 | from cape_dataframes.audit import APPLY_POLICY_EVENT 49 | from cape_dataframes.audit import AuditLogger 50 | from cape_dataframes.pandas import transformations 51 | from cape_dataframes.policy import data 52 | from cape_dataframes.policy import exceptions 53 | 54 | 55 | def apply_policy(policy: data.Policy, df, inplace=False): 56 | """Applies a Policy to some DataFrame. 57 | 58 | This function is responsible for inferring the type of the DataFrame, preparing the 59 | relevant Spark or Pandas Transformations, and applying them to produce a transformed 60 | DataFrame that conforms to the Policy. 61 | 62 | Args: 63 | policy: The `Policy` object that the transformed DataFrame will conform to, e.g. 64 | as returned by `cape_dataframes.parse_policy`. 65 | df: The DataFrame object to transform according to `policies`. Must be of type 66 | pandas.DataFrame or pyspark.sql.DataFrame. 67 | inplace: Whether to mutate the `df` or produce a new one. This argument is only 68 | relevant for Pandas DataFrames, as Spark DataFrames do not support mutation. 
69 | 70 | Raises: 71 | ValueError: If df is a Spark DataFrame and inplace=True, or if df is something 72 | other than a Pandas or Spark DataFrame. 73 | DependencyError: If Spark is not configured correctly in the Python environment. 74 | TransformNotFound, NamedTransformNotFound: If the Policy contains a reference to 75 | a Transformation or NamedTransformation that is unrecognized in the 76 | Transformation registry. 77 | """ 78 | if isinstance(df, pd.DataFrame): 79 | registry = pandas_lib.registry 80 | transformer = pandas_lib.transformer 81 | dtypes = pandas_lib.dtypes 82 | if not inplace: 83 | result_df = df.copy() 84 | else: 85 | result_df = df 86 | elif not spark_lib.is_available(): 87 | raise exceptions.DependencyError 88 | elif isinstance(df, spark_lib.DataFrame): 89 | if inplace: 90 | raise ValueError( 91 | "Spark does not support DataFrame mutation, so inplace=True is invalid." 92 | ) 93 | registry = spark_lib.registry 94 | transformer = spark_lib.transformer 95 | dtypes = spark_lib.dtypes 96 | result_df = df 97 | else: 98 | raise ValueError(f"Expected df to be a DataFrame, found {type(df)}.") 99 | for rule in policy.rules: 100 | result_df = _do_transformations( 101 | policy, rule, result_df, registry, transformer, dtypes 102 | ) 103 | 104 | policy.logger.audit_log(APPLY_POLICY_EVENT, policy.id, "policy", policy.label) 105 | 106 | return result_df 107 | 108 | 109 | def parse_policy( 110 | p: Union[str, Dict[Any, Any]], logger: AuditLogger = AuditLogger() 111 | ) -> data.Policy: 112 | """Parses a policy YAML file. 113 | 114 | The passed in string can either be a path to a local file, 115 | a URL pointing to a file or a dictionary representing the policy. 116 | If it is a URL then requests attempts to download it. 117 | 118 | Args: 119 | p: a path string, a URL string or a dictionary representing the 120 | policy. 121 | 122 | Returns: 123 | The Policy object initialized by the YAML. 124 | """ 125 | if type(p) == str: 126 | if validators.url(p): 127 | yaml_data = requests.get(p).text 128 | else: 129 | with open(p) as f: 130 | yaml_data = f.read() 131 | 132 | policy = yaml.load(yaml_data, Loader=yaml.FullLoader) 133 | else: 134 | policy = p 135 | 136 | return data.Policy(logger=logger, **policy) 137 | 138 | 139 | def _maybe_replace_dtype_arg(args, dtypes): 140 | if "dtype" in args: 141 | args["dtype"] = getattr(dtypes, args["dtype"]) 142 | return args 143 | 144 | 145 | def _get_transformation( 146 | policy: data.Policy, 147 | transform: data.Transform, 148 | registry: types.ModuleType, 149 | dtypes, 150 | ): 151 | """Looks up the correct transform class. 152 | 153 | If the transform is anonymous (i.e. unnamed) then it looks it up from the 154 | transform registry. If it is a named transform it used load_named_transform 155 | to find it. 156 | 157 | Args: 158 | policy: The top level policy. 159 | transform: The specific transform to be applied. 160 | registry: The module representing the transformation registry; differs for 161 | Spark/Pandas. 162 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 163 | 164 | Returns: 165 | The initialize transform object. 166 | 167 | Raises: 168 | TransformNotFound: The builtin transform cannot be found. 169 | NamedTransformNotFound: The named transform cannot be found on the 170 | top level policy object. 171 | ValueError: If neither a function or named transform exists on the transform 172 | arg. 
173 | """ 174 | if transform.type is not None: 175 | tfm_ctor = registry.get(transform.type) 176 | if tfm_ctor is None: 177 | raise exceptions.TransformNotFound( 178 | f"Could not find builtin transform {transform.type}" 179 | ) 180 | tfm_args = _maybe_replace_dtype_arg(transform.args, dtypes) 181 | initTransform = tfm_ctor(**tfm_args) 182 | elif transform.name is not None: 183 | initTransform = _load_named_transform(policy, transform.name, registry, dtypes) 184 | else: 185 | raise ValueError( 186 | f"Expected type or name for transform with field {transform.field}" 187 | ) 188 | return initTransform 189 | 190 | 191 | def _do_transformations( 192 | policy: data.Policy, 193 | rule: data.Rule, 194 | df, 195 | registry: types.ModuleType, 196 | transformer: Callable, 197 | dtypes, 198 | ): 199 | """Applies a specific rule's transformations to a dataframe. 200 | 201 | For each transform, lookup the required transform class and then apply it 202 | to the correct column in that dataframe. 203 | 204 | Args: 205 | policy: The top level policy. 206 | rule: The specific rule to apply. 207 | df: A Pandas or Spark dataframe. 208 | registry: The module representing the transformation registry; differs for 209 | Spark/Pandas. 210 | transformer: A function mapping (Transformation, DataFrame, str) to a DataFrame 211 | that mutates a DataFrame by applying the Transformation to one of its 212 | columns. 213 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 214 | 215 | Returns: 216 | The transformed dataframe. 217 | """ 218 | 219 | for transform in rule.transformations: 220 | do_transform = _get_transformation(policy, transform, registry, dtypes) 221 | try: 222 | if do_transform.type_signature == "df->df": 223 | df = do_transform(df) 224 | else: 225 | df = transformer(do_transform, df, transform.field) 226 | except (KeyError, pyspark.sql.utils.AnalysisException): 227 | logging.warning( 228 | f"Unable to transform column {transform.field} in policy {policy.label}" 229 | ) 230 | 231 | return df 232 | 233 | 234 | def _load_named_transform( 235 | policy: data.Policy, 236 | transformLabel: str, 237 | registry: types.ModuleType, 238 | dtypes, 239 | ): 240 | """Attempts to load a named transform from the top level policy. 241 | 242 | Looks at the top level policy object for the named transform given as transformLabel 243 | and initializes it from the args pulled from the policy object. 244 | 245 | Args: 246 | policy: Top level policy object. 247 | transformLabel: The name of the named transform. 248 | registry: The module representing the transformation registry; differs for 249 | Spark/Pandas. 250 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 251 | 252 | Returns: 253 | The initialized transform object. 254 | 255 | Raises: 256 | NamedTransformNotFound: The named transform cannot be 257 | found in the top level policy object. 258 | DependencyError: If return_spark is True but PySpark is missing from the current 259 | environment. 
260 | """ 261 | found = False 262 | 263 | named_transforms = policy.transformations 264 | for transform in named_transforms: 265 | if transformLabel == transform.name: 266 | tfm_ctor = registry.get(transform.type) 267 | if tfm_ctor is None: 268 | raise exceptions.NamedTransformNotFound( 269 | f"Could not find transform of type {transform.type} in registry" 270 | ) 271 | tfm_args = _maybe_replace_dtype_arg(transform.args, dtypes) 272 | initTransform = tfm_ctor(**tfm_args) 273 | found = True 274 | break 275 | 276 | if not found: 277 | raise exceptions.NamedTransformNotFound( 278 | f"Could not find transform {transformLabel} in transformations block" 279 | ) 280 | 281 | return initTransform 282 | 283 | 284 | def reverse(policy: data.Policy) -> data.Policy: 285 | """Turns reversible tokenizations into token reversers 286 | 287 | If any named transformations contain a reversible tokenization transformation 288 | this helper function turns them into token reverser transformations. 289 | 290 | Args: 291 | policy: Top level policy object. 292 | 293 | Returns: 294 | The modified policy. 295 | """ 296 | new_policy = copy.deepcopy(policy) 297 | 298 | for named in new_policy.transformations: 299 | if named.type == transformations.ReversibleTokenizer.identifier: 300 | named.type = transformations.TokenReverser.identifier 301 | 302 | return new_policy 303 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.testing as pdt 6 | import pytest 7 | import requests 8 | import yaml 9 | 10 | from cape_dataframes import pandas as pandas_lib 11 | from cape_dataframes import spark as spark_lib 12 | from cape_dataframes.pandas.transformations import test_utils 13 | from cape_dataframes.policy import data 14 | from cape_dataframes.policy import exceptions 15 | from cape_dataframes.policy import policy as policy_lib 16 | from cape_dataframes.policy import policy_test_fixtures as fixtures 17 | 18 | 19 | def test_parse_policy(tmp_path): 20 | d = tmp_path / "policy" 21 | 22 | d.mkdir() 23 | 24 | p = d / "policy.yaml" 25 | p.write_text(fixtures.y) 26 | 27 | policy = policy_lib.parse_policy(str(p.absolute())) 28 | 29 | assert policy.label == "test_policy" 30 | 31 | 32 | def test_parse_policy_dict(): 33 | p = yaml.load(fixtures.y, Loader=yaml.FullLoader) 34 | 35 | policy = policy_lib.parse_policy(p) 36 | 37 | assert policy.label == "test_policy" 38 | 39 | 40 | def test_named_transform_not_found(): 41 | pandas_lib.registry.register("plusN", test_utils.PlusN) 42 | d = yaml.load( 43 | fixtures.named_not_found_y("plusOne", "plusOneThousand", "plusN"), 44 | Loader=yaml.FullLoader, 45 | ) 46 | 47 | df = pd.DataFrame( 48 | np.ones( 49 | 5, 50 | ), 51 | columns=["test"], 52 | ) 53 | 54 | p = data.Policy(**d) 55 | tfm = p.rules[0].transformations[0] 56 | 57 | with pytest.raises(exceptions.NamedTransformNotFound) as e: 58 | policy_lib._get_transformation(p, tfm, df, pandas_lib.dtypes) 59 | 60 | assert str(e.value) == ( 61 | "Could not find transform plusOneThousand in transformations block" 62 | ) 63 | 64 | 65 | def test_named_transform_type_not_found(): 66 | d = yaml.load( 67 | fixtures.named_not_found_y("plusOne", "plusOne", "plusM"), 68 | Loader=yaml.FullLoader, 69 | ) 70 | p = data.Policy(**d) 71 | tfm = p.rules[0].transformations[0] 72 | 73 | with 
pytest.raises(exceptions.NamedTransformNotFound) as e: 74 | policy_lib._get_transformation(p, tfm, pandas_lib.registry, pandas_lib.dtypes) 75 | assert str(e.value) == "Could not find transform of type plusM in registry" 76 | 77 | 78 | def test_parse_policy_url(httpserver): 79 | httpserver.expect_request("/policy").respond_with_data(fixtures.y) 80 | url = httpserver.url_for("/policy") 81 | policy = policy_lib.parse_policy(url) 82 | assert policy.label == "test_policy" 83 | 84 | 85 | def test_parse_policy_invalid_url(): 86 | with pytest.raises(requests.exceptions.ConnectionError): 87 | policy_lib.parse_policy("https://notapolicy.here.com/policy") 88 | 89 | 90 | def test_parse_policy_invalid_file(): 91 | with pytest.raises(FileNotFoundError): 92 | policy_lib.parse_policy("iamnotarealthingonthisfilesystem") 93 | 94 | 95 | def test_apply_policy_pandas(): 96 | pandas_lib.registry.register("plusN", test_utils.PlusN) 97 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 98 | 99 | df = pd.DataFrame( 100 | np.ones( 101 | 5, 102 | ), 103 | columns=["test"], 104 | ) 105 | 106 | expected_df = df + 3 107 | 108 | p = data.Policy(**d) 109 | 110 | new_df = policy_lib.apply_policy(p, df) 111 | 112 | pdt.assert_frame_equal(new_df, expected_df) 113 | 114 | 115 | def test_missing_column(): 116 | pandas_lib.registry.register("plusN", test_utils.PlusN) 117 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 118 | 119 | df = pd.DataFrame( 120 | np.ones( 121 | 5, 122 | ), 123 | columns=["boat"], 124 | ) 125 | 126 | expected_df = df 127 | 128 | p = data.Policy(**d) 129 | 130 | new_df = policy_lib.apply_policy(p, df) 131 | 132 | pdt.assert_frame_equal(new_df, expected_df) 133 | 134 | 135 | def test_apply_complex_policies_pandas(): 136 | d = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader) 137 | 138 | df = pd.DataFrame( 139 | { 140 | "name": ["bob", "alice"], 141 | "val-int": [30, 50], 142 | "val-float": [32.43424, 56.64543], 143 | "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")], 144 | } 145 | ) 146 | expected_df = pd.DataFrame( 147 | { 148 | "name": [ 149 | "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149", 150 | "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb", 151 | ], 152 | "val-int": [23, 58], 153 | "val-float": [32.4, 56.6], 154 | "date": [pd.Timestamp("2018-01-01"), pd.Timestamp("2016-01-01")], 155 | } 156 | ) 157 | 158 | p = data.Policy(**d) 159 | 160 | new_df = policy_lib.apply_policy(p, df) 161 | 162 | pdt.assert_frame_equal(new_df, expected_df) 163 | 164 | 165 | def test_named_transformation_pandas(): 166 | pandas_lib.registry.register("plusN", test_utils.PlusN) 167 | d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader) 168 | 169 | df = pd.DataFrame( 170 | np.ones( 171 | 5, 172 | ), 173 | columns=["test"], 174 | ) 175 | 176 | expected_df = df + 3 177 | 178 | p = data.Policy(**d) 179 | 180 | new_df = policy_lib.apply_policy(p, df) 181 | 182 | pdt.assert_frame_equal(new_df, expected_df) 183 | 184 | 185 | def test_column_redact_pandas(): 186 | pandas_lib.registry.register("plusN", test_utils.PlusN) 187 | d = yaml.load(fixtures.redact_y, Loader=yaml.FullLoader) 188 | 189 | df = pd.DataFrame(np.ones((5, 2)), columns=["test", "apple"]) 190 | 191 | p = data.Policy(**d) 192 | 193 | new_df = policy_lib.apply_policy(p, df) 194 | 195 | expected_df = pd.DataFrame( 196 | np.ones( 197 | 5, 198 | ), 199 | columns=["test"], 200 | ) 201 | 202 | expected_df = expected_df + 3 203 | 204 | pdt.assert_frame_equal(new_df, expected_df) 205 | 206 | 207 | def 
test_apply_policy_spark(): 208 | sess = spark_lib.utils.make_session("test.policy.applyPolicies") 209 | pd_df = pd.DataFrame( 210 | np.ones( 211 | 5, 212 | ), 213 | columns=["test"], 214 | ) 215 | expected_df = pd_df + 3 216 | df = sess.createDataFrame(pd_df) 217 | 218 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 219 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 220 | p = data.Policy(**d) 221 | new_df = policy_lib.apply_policy(p, df).toPandas() 222 | 223 | pdt.assert_frame_equal(new_df, expected_df) 224 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 225 | 226 | 227 | def test_apply_complex_policies_spark(): 228 | sess = spark_lib.utils.make_session("test.policy.applyComplexPolicies") 229 | pd_df = pd.DataFrame( 230 | { 231 | "name": ["bob", "alice"], 232 | "val-int": [30, 50], 233 | "val-float": [32.43424, 56.64543], 234 | "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")], 235 | } 236 | ) 237 | expected_df = pd.DataFrame( 238 | { 239 | "name": [ 240 | "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149", 241 | "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb", 242 | ], 243 | "val-int": [25, 56], 244 | "val-float": [32.4, 56.6], 245 | # TODO: when these are pd.Timestamp, Spark's date_trunc is causing 246 | # dtype erasure. We should figure out why that's happening 247 | "date": [datetime.date(2018, 1, 1), datetime.date(2016, 1, 1)], 248 | } 249 | ) 250 | df = sess.createDataFrame(pd_df) 251 | 252 | d = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader) 253 | p = data.Policy(**d) 254 | new_df = policy_lib.apply_policy(p, df).toPandas() 255 | pdt.assert_frame_equal(new_df, expected_df, check_dtype=True) 256 | 257 | 258 | def test_named_transformation_spark(): 259 | sess = spark_lib.utils.make_session("test.policy.namedTransformations") 260 | pd_df = pd.DataFrame( 261 | np.ones( 262 | 5, 263 | ), 264 | columns=["test"], 265 | ) 266 | expected_df = pd_df + 3 267 | df = sess.createDataFrame(pd_df) 268 | 269 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 270 | d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader) 271 | p = data.Policy(**d) 272 | new_df = policy_lib.apply_policy(p, df).toPandas() 273 | 274 | pdt.assert_frame_equal(new_df, expected_df) 275 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 276 | 277 | 278 | def test_column_redaction_spark(): 279 | sess = spark_lib.utils.make_session("test.policy.redaction") 280 | pd_df = pd.DataFrame(np.ones((5, 2)), columns=["test", "apple"]) 281 | expected_df = pd.DataFrame( 282 | np.ones( 283 | 5, 284 | ), 285 | columns=["test"], 286 | ) 287 | expected_df = expected_df + 3 288 | df = sess.createDataFrame(pd_df) 289 | 290 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 291 | d = yaml.load(fixtures.redact_y, Loader=yaml.FullLoader) 292 | p = data.Policy(**d) 293 | new_df = policy_lib.apply_policy(p, df).toPandas() 294 | 295 | pdt.assert_frame_equal(new_df, expected_df) 296 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 297 | 298 | 299 | def test_secret_in_named_transform(): 300 | d = yaml.load(fixtures.secret_yaml, Loader=yaml.FullLoader) 301 | 302 | df = pd.DataFrame({"name": ["bob", "alice"]}) 303 | 304 | p = data.Policy(**d) 305 | 306 | new_df = policy_lib.apply_policy(p, df) 307 | 308 | pdt.assert_frame_equal(new_df, df) 309 | 310 | 311 | def test_reverse_helper(): 312 | p = yaml.load(fixtures.reversible_yaml, Loader=yaml.FullLoader) 313 | 314 | policy 
= policy_lib.parse_policy(p) 315 | 316 | df = pd.DataFrame({"name": ["bob", "alice"]}) 317 | 318 | new_df = policy_lib.apply_policy(policy, df) 319 | 320 | new_policy = policy_lib.reverse(policy) 321 | 322 | another_df = policy_lib.apply_policy(new_policy, new_df) 323 | 324 | for transform in new_policy.transformations: 325 | assert transform.type == pandas_lib.transformations.TokenReverser.identifier 326 | 327 | pdt.assert_frame_equal(df, another_df) 328 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy_test_fixtures.py: -------------------------------------------------------------------------------- 1 | y = """ 2 | label: test_policy 3 | version: 1 4 | rules: 5 | - match: 6 | name: test 7 | actions: 8 | - transform: 9 | type: plusN 10 | n: 1 11 | - transform: 12 | type: plusN 13 | n: 2 14 | """ 15 | 16 | named_y = """ 17 | version: 1 18 | label: test_policy 19 | transformations: 20 | - name: plusOne 21 | type: plusN 22 | n: 1 23 | - name: plusTwo 24 | type: plusN 25 | n: 2 26 | rules: 27 | - match: 28 | name: test 29 | actions: 30 | - transform: 31 | name: plusOne 32 | - transform: 33 | name: plusTwo 34 | """ 35 | 36 | named_with_secret_y = """ 37 | version: 1 38 | label: test_policy 39 | transformations: 40 | - name: plusOne 41 | type: plusN 42 | n: 1 43 | - name: tokenWithSecret 44 | type: tokenizer 45 | key: 46 | type: secret 47 | name: my-key 48 | value: BASE 49 | rules: 50 | - match: 51 | name: test 52 | actions: 53 | - transform: 54 | name: plusOne 55 | - transform: 56 | name: plusTwo 57 | """ 58 | 59 | 60 | def named_not_found_y(saved_tfm, ref_tfm, tfm_type): 61 | return """ 62 | label: test_policy 63 | version: 1 64 | transformations: 65 | - name: {saved} 66 | type: {type} 67 | n: 1 68 | rules: 69 | - match: 70 | name: test 71 | actions: 72 | - transform: 73 | name: {ref} 74 | """.format( 75 | saved=saved_tfm, type=tfm_type, ref=ref_tfm 76 | ) 77 | 78 | 79 | complex_y = """ 80 | label: test_policy 81 | version: 1 82 | rules: 83 | - match: 84 | name: val-int 85 | actions: 86 | - transform: 87 | type: numeric-perturbation 88 | dtype: Integer 89 | min: -10 90 | max: 10 91 | seed: 4984 92 | - match: 93 | name: val-float 94 | actions: 95 | - transform: 96 | type: numeric-rounding 97 | dtype: Double 98 | precision: 1 99 | - match: 100 | name: name 101 | actions: 102 | - transform: 103 | type: tokenizer 104 | key: secret_key 105 | - match: 106 | name: date 107 | actions: 108 | - transform: 109 | type: date-truncation 110 | frequency: year 111 | """ 112 | 113 | 114 | redact_y = """ 115 | label: test_policy 116 | version: 1 117 | rules: 118 | - match: 119 | name: apple 120 | actions: 121 | - drop 122 | - match: 123 | name: test 124 | actions: 125 | - transform: 126 | type: plusN 127 | n: 1 128 | - transform: 129 | type: plusN 130 | n: 2 131 | """ 132 | 133 | secret_yaml = """ 134 | label: masking_policy 135 | version: 1 136 | transformations: 137 | - name: reversible 138 | type: reversible-tokenizer 139 | key: 140 | type: secret 141 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 142 | - name: reverse 143 | type: token-reverser 144 | key: 145 | type: secret 146 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 147 | rules: 148 | - match: 149 | name: name 150 | actions: 151 | - transform: 152 | name: reversible 153 | - match: 154 | name: name 155 | actions: 156 | - transform: 157 | name: reverse 158 | """ 159 | 160 | reversible_yaml = """ 161 | label: masking_policy 162 | version: 1 163 | transformations: 164 | - name: 
reversible 165 | type: reversible-tokenizer 166 | key: 167 | type: secret 168 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 169 | rules: 170 | - match: 171 | name: name 172 | actions: 173 | - transform: 174 | name: reversible 175 | """ 176 | -------------------------------------------------------------------------------- /cape_dataframes/spark/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | if importlib.util.find_spec("pyspark") is None: 4 | 5 | def is_available(): 6 | return False 7 | 8 | __all__ = ["is_available"] 9 | 10 | else: 11 | from pyspark.sql import DataFrame 12 | 13 | from cape_dataframes.spark import dtypes 14 | from cape_dataframes.spark import registry 15 | from cape_dataframes.spark import transformations 16 | from cape_dataframes.spark.transformer import transformer 17 | from cape_dataframes.spark.utils import configure_session 18 | from cape_dataframes.spark.utils import make_session 19 | 20 | def is_available(): 21 | return True 22 | 23 | __all__ = [ 24 | "configure_session", 25 | "DataFrame", 26 | "dtypes", 27 | "is_available", 28 | "make_session", 29 | "transformations", 30 | "transformer", 31 | "registry", 32 | ] 33 | 34 | del importlib 35 | -------------------------------------------------------------------------------- /cape_dataframes/spark/dtypes.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import types 2 | 3 | # base type 4 | DType = types.DataType 5 | # individual types 6 | String = types.StringType() 7 | Date = types.DateType() 8 | Datetime = types.TimestampType() 9 | # numeric types 10 | Float = types.FloatType() 11 | Double = types.DoubleType() 12 | Byte = types.ByteType() 13 | Short = types.ShortType() 14 | Integer = types.IntegerType() 15 | Long = types.LongType() 16 | # groups 17 | Floats = (Float, Double) 18 | Integers = (Byte, Short, Integer, Long) 19 | Numerics = Floats + Integers 20 | -------------------------------------------------------------------------------- /cape_dataframes/spark/registry.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | 4 | from cape_dataframes.spark.transformations import perturbation 5 | from cape_dataframes.spark.transformations import redaction 6 | from cape_dataframes.spark.transformations import rounding 7 | from cape_dataframes.spark.transformations import tokenizer 8 | 9 | TransformationCtor = Callable 10 | 11 | _registry: Dict[str, TransformationCtor] = {} 12 | 13 | 14 | def get(transformation: str) -> TransformationCtor: 15 | """Returns the constructor for the given key. 16 | 17 | Arguments: 18 | transformation: The key of transformation to retrieve. 19 | """ 20 | return _registry.get(transformation, None) 21 | 22 | 23 | def register(label: str, ctor: TransformationCtor): 24 | """Registers a new transformation constructor under the label provided. 
25 | 26 | Arguments: 27 | label: The label that will be used as the key in the registry 28 | ctor: The transformation constructor 29 | """ 30 | _registry[label] = ctor 31 | 32 | 33 | register(perturbation.DatePerturbation.identifier, perturbation.DatePerturbation) 34 | register(perturbation.NumericPerturbation.identifier, perturbation.NumericPerturbation) 35 | register(rounding.NumericRounding.identifier, rounding.NumericRounding) 36 | register(tokenizer.Tokenizer.identifier, tokenizer.Tokenizer) 37 | register(rounding.DateTruncation.identifier, rounding.DateTruncation) 38 | register(redaction.ColumnRedact.identifier, redaction.ColumnRedact) 39 | register(redaction.RowRedact.identifier, redaction.RowRedact) 40 | register(tokenizer.ReversibleTokenizer.identifier, tokenizer.ReversibleTokenizer) 41 | register(tokenizer.TokenReverser.identifier, tokenizer.TokenReverser) 42 | -------------------------------------------------------------------------------- /cape_dataframes/spark/registry_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.spark import registry 2 | from cape_dataframes.spark.transformations import base 3 | 4 | 5 | class MockTransformation(base.Transformation): 6 | identifier = "mock" 7 | 8 | def __init__(self, fake_arg): 9 | super().__init__(None) 10 | 11 | def __call__(self, x): 12 | pass 13 | 14 | 15 | def test_get(): 16 | registry.register(MockTransformation.identifier, MockTransformation) 17 | tfm_cls = registry.get("mock") 18 | args = {"fake_arg": 1} 19 | tfm_cls(**args) 20 | registry._registry.pop("mock") 21 | 22 | 23 | def test_get_missing(): 24 | tfm_cls = registry.get("plusWhat?") 25 | assert tfm_cls is None 26 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.spark.transformations.perturbation import DatePerturbation 2 | from cape_dataframes.spark.transformations.perturbation import NumericPerturbation 3 | from cape_dataframes.spark.transformations.redaction import ColumnRedact 4 | from cape_dataframes.spark.transformations.redaction import RowRedact 5 | from cape_dataframes.spark.transformations.rounding import DateTruncation 6 | from cape_dataframes.spark.transformations.rounding import NumericRounding 7 | from cape_dataframes.spark.transformations.tokenizer import Tokenizer 8 | 9 | __all__ = [ 10 | "DatePerturbation", 11 | "NumericPerturbation", 12 | "DateTruncation", 13 | "NumericRounding", 14 | "Tokenizer", 15 | "ColumnRedact", 16 | "RowRedact", 17 | ] 18 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from cape_dataframes.spark import dtypes 4 | 5 | 6 | class AbstractTransformation(metaclass=abc.ABCMeta): 7 | @property 8 | @abc.abstractmethod 9 | def dtype(self): 10 | pass 11 | 12 | @abc.abstractmethod 13 | def __call__(self, x): 14 | pass 15 | 16 | 17 | class Transformation(AbstractTransformation): 18 | def __init__(self, dtype: dtypes.DType): 19 | self._dtype = dtype 20 | 21 | @property 22 | def dtype(self): 23 | return self._dtype 24 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/perturbation.py: 
-------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from typing import Tuple 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pyspark import sql 8 | from pyspark.sql import functions 9 | 10 | from cape_dataframes.spark import dtypes 11 | from cape_dataframes.spark.transformations import base 12 | from cape_dataframes.utils import typecheck 13 | 14 | _FREQUENCY_TO_DELTA_FN = { 15 | "YEAR": lambda noise: pd.to_timedelta(noise * 365, unit="days"), 16 | "MONTH": lambda noise: pd.to_timedelta(noise * 30, unit="days"), 17 | "DAY": lambda noise: pd.to_timedelta(noise, unit="days"), 18 | "HOUR": lambda noise: pd.to_timedelta(noise, unit="hours"), 19 | "minutes": lambda noise: pd.to_timedelta(noise, unit="minutes"), 20 | "seconds": lambda noise: pd.to_timedelta(noise, unit="seconds"), 21 | } 22 | IntTuple = Union[int, Tuple[int, ...]] 23 | StrTuple = Union[str, Tuple[str, ...]] 24 | 25 | 26 | class NumericPerturbation(base.Transformation): 27 | """Add uniform random noise to a numeric series 28 | 29 | Mask a numeric series by adding uniform random noise to each value. 30 | The amount of noise is drawn from the interval [min, max). 31 | 32 | Attributes: 33 | dtype (dtypes.Numerics): series type 34 | min (int, float): the values generated will be greater or equal to min 35 | max (int, float): the values generated will be less than max 36 | seed (int), optional: a seed to initialize the random generator 37 | """ 38 | 39 | identifier = "numeric-perturbation" 40 | type_signature = "col->col" 41 | 42 | def __init__( 43 | self, 44 | dtype: dtypes.DType, 45 | min: (int, float), 46 | max: (int, float), 47 | seed: Optional[int] = None, 48 | ): 49 | assert dtype in dtypes.Numerics 50 | typecheck.check_arg(min, (int, float)) 51 | typecheck.check_arg(max, (int, float)) 52 | typecheck.check_arg(seed, (int, type(None))) 53 | super().__init__(dtype) 54 | self._min = min 55 | self._max = max 56 | self._seed = seed 57 | 58 | def __call__(self, x: sql.Column): 59 | uniform_noise = functions.rand(seed=self._seed) 60 | if self._seed is not None: 61 | self._seed += 1 62 | affine_noise = self._min + uniform_noise * (self._max - self._min) 63 | if self._dtype is not dtypes.Double: 64 | affine_noise = affine_noise.astype(self._dtype) 65 | return x + affine_noise 66 | 67 | 68 | class DatePerturbation(base.Transformation): 69 | """Add uniform random noise to a Pandas series of timestamps 70 | 71 | Mask a series by adding uniform random noise to the specified 72 | frequencies of timestamps. The amount of noise for each frequency 73 | is drawn from the internal [min_freq, max_freq). 74 | 75 | Note that seeds are currently not supported. 
76 | 77 | Attributes: 78 | frequency (str, str list): one or more frequencies to perturbate 79 | min (int, int list): the frequency value will be greater or equal to min 80 | max (int, int list): the frequency value will be less than max 81 | """ 82 | 83 | identifier = "date-perturbation" 84 | type_signature = "col->col" 85 | 86 | def __init__( 87 | self, 88 | frequency: StrTuple, 89 | min: IntTuple, 90 | max: IntTuple, 91 | ): 92 | super().__init__(dtypes.Date) 93 | self._frequency = _check_freq_arg(frequency) 94 | self._min = _check_minmax_arg(min) 95 | self._max = _check_minmax_arg(max) 96 | self._perturb_date = None 97 | 98 | def __call__(self, x: sql.Column): 99 | if self._perturb_date is None: 100 | self._perturb_date = self._make_perturb_udf() 101 | return self._perturb_date(x) 102 | 103 | def _make_perturb_udf(self): 104 | @functions.pandas_udf(dtypes.Date) 105 | def perturb_date(x: pd.Series) -> pd.Series: 106 | rng = np.random.default_rng() 107 | for f, mn, mx in zip(self._frequency, self._min, self._max): 108 | # TODO can we switch to a lower dtype than np.int64? 109 | noise = rng.integers(mn, mx, size=x.shape) 110 | delta_fn = _FREQUENCY_TO_DELTA_FN.get(f, None) 111 | if delta_fn is None: 112 | raise ValueError( 113 | "Frequency {} must be one of {}.".format( 114 | f, list(_FREQUENCY_TO_DELTA_FN.keys()) 115 | ) 116 | ) 117 | x += delta_fn(noise) 118 | return x 119 | 120 | return perturb_date 121 | 122 | 123 | def _check_minmax_arg(arg): 124 | """Checks that arg is an integer or a flat collection of integers.""" 125 | if not isinstance(arg, (tuple, list)): 126 | if not isinstance(arg, int): 127 | raise ValueError 128 | return [arg] 129 | else: 130 | for a in arg: 131 | if not isinstance(a, int): 132 | raise ValueError 133 | return arg 134 | 135 | 136 | def _check_freq_arg(arg): 137 | """Checks that arg is string or a flat collection of strings.""" 138 | if not isinstance(arg, (tuple, list)): 139 | if not isinstance(arg, str): 140 | raise ValueError 141 | return [arg] 142 | else: 143 | for a in arg: 144 | if not isinstance(a, str): 145 | raise ValueError 146 | return arg 147 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/perturbation_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pyspark.sql import functions 4 | 5 | from cape_dataframes.spark import dtypes 6 | from cape_dataframes.spark import utils 7 | from cape_dataframes.spark.transformations import perturbation as ptb 8 | 9 | 10 | def _make_and_apply_numeric_ptb(sess, df, dtype, min, max): 11 | df = sess.createDataFrame(df, schema=["data"]) 12 | perturb = ptb.NumericPerturbation(dtype, min=min, max=max) 13 | result_df = df.select(perturb(functions.col("data"))) 14 | return result_df.toPandas() 15 | 16 | 17 | def _make_and_apply_date_ptb(sess, df, frequency, min, max): 18 | df = sess.createDataFrame(df, schema=["data"]) 19 | perturb = ptb.DatePerturbation(frequency, min, max) 20 | result_df = df.select(perturb(functions.col("data"))) 21 | return result_df.withColumnRenamed("perturb_date(data)", "data").toPandas() 22 | 23 | 24 | def test_float_ptb_bounds(): 25 | sess = utils.make_session("test.perturbation.float.bounds") 26 | data = np.arange(6, dtype=np.float32).reshape((6, 1)) 27 | test_df = pd.DataFrame(data, columns=["data"]) 28 | lower, upper = -2, 2 29 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Float, lower, upper) 30 | result = 
result_df.values 31 | assert result.dtype == data.dtype 32 | noise = result - data 33 | lower_check = noise >= lower 34 | upper_check = noise <= upper 35 | assert lower_check.all() 36 | assert upper_check.all() 37 | 38 | 39 | def test_double_ptb_bounds(): 40 | sess = utils.make_session("test.perturbation.double.bounds") 41 | data = np.arange(6, dtype=np.float64).reshape((6, 1)) 42 | test_df = pd.DataFrame(data, columns=["data"]) 43 | lower, upper = -2, 2 44 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Double, lower, upper) 45 | result = result_df.values 46 | assert result.dtype == data.dtype 47 | noise = result - data 48 | lower_check = noise >= lower 49 | upper_check = noise <= upper 50 | assert lower_check.all() 51 | assert upper_check.all() 52 | 53 | 54 | def test_int_ptb_bounds(): 55 | sess = utils.make_session("test.perturbation.integer.bounds") 56 | data = np.arange(10, dtype=np.int32).reshape((10, 1)) 57 | test_df = pd.DataFrame(data, columns=["data"]) 58 | lower, upper = -3, 3 59 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Integer, lower, upper) 60 | result = result_df.values 61 | assert result.dtype == data.dtype 62 | noise = result - data 63 | lower_check = noise >= lower 64 | upper_check = noise <= upper 65 | assert lower_check.all() 66 | assert upper_check.all() 67 | 68 | 69 | def test_byte_ptb_bounds(): 70 | sess = utils.make_session("test.perturbation.byte.bounds") 71 | data = np.arange(10, dtype=np.int8).reshape((10, 1)) 72 | test_df = pd.DataFrame(data, columns=["data"]) 73 | lower, upper = -3, 3 74 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Byte, lower, upper) 75 | result = result_df.values 76 | assert result.dtype == data.dtype 77 | noise = result - data 78 | lower_check = noise >= lower 79 | upper_check = noise <= upper 80 | assert lower_check.all() 81 | assert upper_check.all() 82 | 83 | 84 | def test_short_ptb_bounds(): 85 | sess = utils.make_session("test.perturbation.short.bounds") 86 | data = np.arange(10, dtype=np.int16).reshape((10, 1)) 87 | test_df = pd.DataFrame(data, columns=["data"]) 88 | lower, upper = -3, 3 89 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Short, lower, upper) 90 | result = result_df.values 91 | assert result.dtype == data.dtype 92 | noise = result - data 93 | lower_check = noise >= lower 94 | upper_check = noise <= upper 95 | assert lower_check.all() 96 | assert upper_check.all() 97 | 98 | 99 | def test_integer_ptb_bounds(): 100 | sess = utils.make_session("test.perturbation.integer.bounds") 101 | data = np.arange(10, dtype=np.int32).reshape((10, 1)) 102 | test_df = pd.DataFrame(data, columns=["data"]) 103 | lower, upper = -3, 3 104 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Integer, lower, upper) 105 | result = result_df.values 106 | assert result.dtype == data.dtype 107 | noise = result - data 108 | lower_check = noise >= lower 109 | upper_check = noise <= upper 110 | assert lower_check.all() 111 | assert upper_check.all() 112 | 113 | 114 | def test_long_ptb_bounds(): 115 | sess = utils.make_session("test.perturbation.long.bounds") 116 | data = np.arange(10, dtype=np.int64).reshape((10, 1)) 117 | test_df = pd.DataFrame(data, columns=["data"]) 118 | lower, upper = -3, 3 119 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Long, lower, upper) 120 | result = result_df.values 121 | assert result.dtype == data.dtype 122 | noise = result - data 123 | lower_check = noise >= lower 124 | upper_check = noise <= upper 125 | assert 
lower_check.all() 126 | assert upper_check.all() 127 | 128 | 129 | def test_date_perturbation_singlefreq_bounds(): 130 | sess = utils.make_session("test.perturbation.date.bounds.singleFrequency") 131 | data = pd.to_datetime(["1997-03-15", "2020-06-24"]) 132 | test_df = pd.DataFrame(data, columns=["data"]) 133 | frequencies = ["YEAR", "MONTH", "DAY"] 134 | num_days = [365, 30, 1] 135 | lower, upper = -2, 2 136 | for freq, days in zip(frequencies, num_days): 137 | result_df = _make_and_apply_date_ptb(sess, test_df, freq, lower, upper) 138 | result_df = result_df.apply(pd.to_datetime) 139 | noise_df = result_df - test_df 140 | lower_check = noise_df >= pd.to_timedelta(lower * days, unit="days") 141 | upper_check = noise_df <= pd.to_timedelta(upper * days, unit="days") 142 | assert lower_check.values.all() 143 | assert upper_check.values.all() 144 | 145 | 146 | def test_date_perturbation_multifreq_bounds(): 147 | sess = utils.make_session("test.perturbation.date.bounds.singleFrequency") 148 | data = pd.to_datetime(["1997-03-15", "2020-06-24"]) 149 | test_df = pd.DataFrame(data, columns=["data"]) 150 | frequency = ("MONTH", "DAY") 151 | lower, upper = (-1, -30), (1, 30) 152 | result_df = _make_and_apply_date_ptb(sess, test_df, frequency, lower, upper) 153 | result_df = result_df.apply(pd.to_datetime) 154 | noise_df = result_df - test_df 155 | lower_check = noise_df >= pd.to_timedelta(-60, unit="days") 156 | upper_check = noise_df <= pd.to_timedelta(60, unit="days") 157 | assert lower_check.values.all() 158 | assert upper_check.values.all() 159 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/redaction.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pyspark import sql 4 | 5 | 6 | class ColumnRedact: 7 | """Redacts columns from a Spark dataframe. 8 | 9 | Attributes: 10 | columns: Which columns are redacted. 11 | """ 12 | 13 | identifier = "column-redact" 14 | type_signature = "df->df" 15 | 16 | def __init__(self, columns: List[str]): 17 | self.columns = columns 18 | 19 | def __call__(self, df: sql.DataFrame) -> sql.DataFrame: 20 | return df.drop(*self.columns) 21 | 22 | 23 | class RowRedact: 24 | """Redacts rows satisfying some condition from a Spark DataFrame. 25 | 26 | Attributes: 27 | condition: When this condition evaluates to True for a row, that row 28 | will be dropped. 
29 | """ 30 | 31 | identifier = "row-redact" 32 | type_signature = "df->df" 33 | 34 | def __init__(self, condition: str): 35 | self.condition = condition 36 | 37 | def __call__(self, df: sql.DataFrame) -> sql.DataFrame: 38 | cond = f"NOT {self.condition}" 39 | return df.filter(cond) 40 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/redaction_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.spark import utils 6 | from cape_dataframes.spark.transformations import redaction as rdc 7 | 8 | 9 | def test_column_redact(): 10 | sess = utils.make_session("test.redaction.column") 11 | df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"]) 12 | expected = pd.DataFrame(np.ones((5,)), columns=["a"]) 13 | test_df = sess.createDataFrame(df, schema=["a", "b", "c"]) 14 | redact = rdc.ColumnRedact(["b", "c"]) 15 | result = redact(test_df).toPandas() 16 | pdt.assert_frame_equal(result, expected) 17 | 18 | 19 | def test_row_redact(): 20 | sess = utils.make_session("test.redaction.row") 21 | df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"]) 22 | df["a"].iloc[0] = 6 23 | df["a"].iloc[3] = 6 24 | expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"]) 25 | test_df = sess.createDataFrame(df, schema=["a", "b"]) 26 | redact = rdc.RowRedact("a > 5") 27 | result = redact(test_df).toPandas() 28 | pdt.assert_frame_equal(result, expected) 29 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/rounding.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | from cape_dataframes.spark import dtypes 5 | from cape_dataframes.spark.transformations import base 6 | from cape_dataframes.utils import typecheck 7 | 8 | 9 | class NumericRounding(base.Transformation): 10 | """Reduce the precision of a numeric series 11 | 12 | Round each value in the series to the given number 13 | of digits. 14 | 15 | Attributes: 16 | dtypes (dtypes.Numerics): series type. 17 | precision (int): set the number of digits. 18 | """ 19 | 20 | identifier = "numeric-rounding" 21 | type_signature = "col->col" 22 | 23 | def __init__(self, dtype: dtypes.DType, precision: int): 24 | if dtype not in dtypes.Numerics: 25 | raise ValueError("NumericRounding requires a Numeric dtype.") 26 | typecheck.check_arg(precision, int) 27 | super().__init__(dtype) 28 | self._precision = precision 29 | 30 | def __call__(self, x: sql.Column): 31 | return functions.round(x, scale=self._precision) 32 | 33 | 34 | class DateTruncation(base.Transformation): 35 | """Reduce the precision of a date series 36 | 37 | Truncate each date in a series to the unit (year or month) 38 | specified by frequency. 
39 | 40 | Attributes: 41 | frequency (string): expect to be 'year' or 'month' 42 | """ 43 | 44 | identifier = "date-truncation" 45 | type_signature = "col->col" 46 | 47 | def __init__(self, frequency: str): 48 | typecheck.check_arg(frequency, str) 49 | super().__init__(dtypes.Date) 50 | self._frequency = frequency.lower() 51 | 52 | def __call__(self, x: sql.Column): 53 | truncated = functions.date_trunc(self._frequency, x) 54 | return truncated.astype(self.dtype) 55 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/rounding_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pyspark.sql import functions 6 | 7 | from cape_dataframes.spark import dtypes 8 | from cape_dataframes.spark import utils 9 | from cape_dataframes.spark.transformations import rounding as rnd 10 | 11 | 12 | # Utils 13 | def _make_and_apply_rounder(sess, df, dtype, precision): 14 | df = sess.createDataFrame(df, schema=["data"]) 15 | rounder = rnd.NumericRounding(dtype, precision) 16 | result_df = df.select(rounder(functions.col("data"))) 17 | return result_df.toPandas() 18 | 19 | 20 | def _make_float_data(dtype, precision=0, scale=0.1): 21 | data = np.arange(6, dtype=dtype).reshape((6, 1)) 22 | delta = data * scale 23 | expected = np.around(data + delta, decimals=precision) 24 | test_df = pd.DataFrame(data + delta, columns=["data"]) 25 | return test_df, expected 26 | 27 | 28 | def _make_integer_data(dtype, precision): 29 | data = np.array([123, 1234, 12345, 123456], dtype=dtype).reshape((4, 1)) 30 | expected = np.around(data, precision) 31 | test_df = pd.DataFrame(data, columns=["data"]) 32 | return test_df, expected 33 | 34 | 35 | def _make_date_data(sess): 36 | df = sess.createDataFrame([("1997-02-28",)], ["data"]) 37 | expected = np.array(datetime.date(1997, 2, 1)) 38 | return df, expected 39 | 40 | 41 | def _make_datetime_data(sess): 42 | df = sess.createDataFrame([("1997-02-28 05:02:11",)], ["data"]) 43 | expected = np.array(datetime.datetime(1997, 2, 1, 0, 0, 0)) 44 | return df, expected 45 | 46 | 47 | # Tests 48 | def test_rounding_float(): 49 | precision = 0 50 | sess = utils.make_session("test.rounding.float") 51 | test_df, expected = _make_float_data(np.float32, precision) 52 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Float, precision) 53 | result = result_df.values 54 | assert result.dtype == expected.dtype 55 | np.testing.assert_almost_equal(result, expected) 56 | 57 | 58 | def test_rounding_double(): 59 | precision = 0 60 | sess = utils.make_session("test.rounding.double") 61 | test_df, expected = _make_float_data(np.float64, precision) 62 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Double, precision) 63 | result = result_df.values 64 | assert result.dtype == expected.dtype 65 | np.testing.assert_almost_equal(result, expected) 66 | 67 | 68 | def test_rounding_integer(): 69 | precision = -2 70 | sess = utils.make_session("test.rounding.integer") 71 | test_df, expected = _make_integer_data(np.int32, precision) 72 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Integer, precision) 73 | result = result_df.values 74 | assert result.dtype == expected.dtype 75 | np.testing.assert_almost_equal(result, expected) 76 | 77 | 78 | def test_rounding_long(): 79 | precision = -2 80 | sess = utils.make_session("test.rounding.integer") 81 | test_df, expected = _make_integer_data(np.int64, 
precision) 82 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Long, precision) 83 | result = result_df.values 84 | assert result.dtype == expected.dtype 85 | np.testing.assert_almost_equal(result, expected) 86 | 87 | 88 | def test_truncate_date(): 89 | sess = utils.make_session("test.truncation.date") 90 | test_df, expected = _make_date_data(sess) 91 | truncate = rnd.DateTruncation("month") 92 | result_df = test_df.select(truncate(test_df.data)).toPandas() 93 | result = result_df.values 94 | assert result.dtype == expected.dtype 95 | np.testing.assert_equal(result, expected) 96 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/tokenizer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import secrets 3 | 4 | import pandas as pd 5 | from Crypto.Cipher import AES 6 | from pyspark.sql import functions 7 | 8 | from cape_dataframes.spark import dtypes 9 | from cape_dataframes.spark.transformations import base 10 | from cape_dataframes.utils import typecheck 11 | 12 | 13 | class Tokenizer(base.Transformation): 14 | """Tokenizer: map a string to a token to obfuscate it. 15 | 16 | When applying the tokenizer to a Spark series of type string, 17 | each value gets mapped to a token (hexadecimal string). 18 | If a value is repeated several times across the series, it always 19 | get mapped to the same token in order to maintain the count. 20 | A value can be mapped to different tokens by setting the key to a 21 | different value. 22 | 23 | Attributes: 24 | max_token_len (int or bytes): control the token length (default 25 | length is 64) 26 | key: expect a string or byte string. if not specified, key will 27 | be set to a random byte string. 28 | """ 29 | 30 | identifier = "tokenizer" 31 | type_signature = "col->col" 32 | 33 | def __init__(self, max_token_len=None, key=None): 34 | typecheck.check_arg(max_token_len, (int, type(None))) 35 | typecheck.check_arg(key, (str, bytes, type(None))) 36 | super().__init__(dtypes.String) 37 | self._max_token_len = max_token_len 38 | if isinstance(key, str): 39 | key = key.encode() 40 | self._key = key or secrets.token_bytes(8) 41 | self._tokenize = None 42 | 43 | def __call__(self, x): 44 | if self._tokenize is None: 45 | self._tokenize = self._make_tokenize_udf() 46 | return self._tokenize(x) 47 | 48 | def _make_tokenize_udf(self): 49 | @functions.pandas_udf(dtypes.String) 50 | def to_token(x: pd.Series) -> pd.Series: 51 | return x.map(self._to_token) 52 | 53 | return to_token 54 | 55 | def _to_token(self, x: str): 56 | token = hashlib.sha256(x.encode() + self.key).hexdigest() 57 | if self._max_token_len is None: 58 | return token 59 | return token[: self._max_token_len] 60 | 61 | @property 62 | def key(self): 63 | return self._key 64 | 65 | 66 | class ReversibleTokenizer(base.Transformation): 67 | """ReversibleTokenizer: map a string to a token to obfuscate it. 68 | 69 | When applying the Tokenizer to a Spark series of type string, 70 | each value gets mapped to a token (hexadecimal string). 71 | If a value is repeated several times across the series, it always 72 | get mapped to the same token in order to maintain the count. 73 | A value can be mapped to different tokens by setting the key to a 74 | different value. 75 | 76 | This tokenizer allows tokens to be reversed to their original data 77 | when the secret key is known. 78 | 79 | Attributes: 80 | key: expect a string or byte string of length exactly 32 bytes. 
81 | encoding: string identifying the Python encoding used for inputs. 82 | """ 83 | 84 | identifier = "reversible-tokenizer" 85 | type_signature = "col->col" 86 | 87 | def __init__(self, key, encoding="utf-8"): 88 | typecheck.check_arg(key, (str, bytes)) 89 | typecheck.check_arg(encoding, str) 90 | super().__init__(dtype=dtypes.String) 91 | if isinstance(key, str): 92 | key = key.encode() 93 | if len(key) != 32: 94 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 95 | self.key = key 96 | self.encoding = encoding 97 | 98 | def __call__(self, series): 99 | @functions.pandas_udf(dtypes.String) 100 | def to_token(series: pd.Series) -> pd.Series: 101 | return series.map(self._to_token) 102 | 103 | return to_token(series) 104 | 105 | def _to_token(self, x: str): 106 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 107 | ciphertext, tag = cipher.encrypt_and_digest(x.encode(encoding=self.encoding)) 108 | assert len(tag) == 16, len(tag) 109 | token = tag.hex() + ciphertext.hex() 110 | return token 111 | 112 | 113 | class TokenReverser(base.Transformation): 114 | """TokenReverser: recover string from token. 115 | 116 | When applying the TokenReverser to a Spark series of tokens, 117 | each token is mapped back to the string that was originally used 118 | by ReversibleTokenizer to construct the token. The same key must 119 | be used. 120 | 121 | Attributes: 122 | key: expect a string or byte string of length exactly 32 bytes. 123 | encoding: string identifying the Python encoding used for outputs. 124 | """ 125 | 126 | identifier = "token-reverser" 127 | type_signature = "col->col" 128 | 129 | def __init__(self, key, encoding="utf-8"): 130 | typecheck.check_arg(key, (str, bytes)) 131 | typecheck.check_arg(encoding, str) 132 | super().__init__(dtype=dtypes.String) 133 | if isinstance(key, str): 134 | key = key.encode() 135 | if len(key) != 32: 136 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 137 | self.key = key 138 | self.encoding = encoding 139 | 140 | def __call__(self, series) -> pd.Series: 141 | @functions.pandas_udf(dtypes.String) 142 | def from_token(series: pd.Series) -> pd.Series: 143 | return series.map(self._from_token) 144 | 145 | return from_token(series) 146 | 147 | def _from_token(self, token: str): 148 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 149 | token_bytes = bytearray.fromhex(token) 150 | tag, ciphertext = token_bytes[:16], token_bytes[16:] 151 | x = cipher.decrypt_and_verify(ciphertext, tag) 152 | return x.decode(encoding=self.encoding) 153 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/tokenizer_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pandas.testing as pdt 3 | import pytest 4 | from pyspark.sql import functions 5 | 6 | from cape_dataframes.spark import utils 7 | from cape_dataframes.spark.transformations import tokenizer as tkn 8 | 9 | 10 | def _apply_tokenizer(sess, df, tokenizer, col_to_rename): 11 | df = sess.createDataFrame(df, schema=["name"]) 12 | result_df = df.select(tokenizer(functions.col("name"))) 13 | return result_df.withColumnRenamed(col_to_rename, "name").toPandas() 14 | 15 | 16 | def test_tokenizer_simple(): 17 | sess = utils.make_session("test.tokenizer.simple") 18 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 19 | expected = pd.DataFrame( 20 | { 21 | "name": [ 22 | "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24", 23 | 
"dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b", 24 | ] 25 | } 26 | ) 27 | key = "secret_key" 28 | df = _apply_tokenizer( 29 | sess, 30 | test_df, 31 | tkn.Tokenizer(max_token_len=None, key=key), 32 | col_to_rename="to_token(name)", 33 | ) 34 | pdt.assert_frame_equal(df, expected) 35 | 36 | 37 | def test_tokenizer_is_linkable(): 38 | sess = utils.make_session("test.tokenizer.isLinkable") 39 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 40 | key1 = "secret_key" 41 | key2 = "secret_key" 42 | df1 = _apply_tokenizer( 43 | sess, 44 | test_df, 45 | tkn.Tokenizer(max_token_len=None, key=key1), 46 | col_to_rename="to_token(name)", 47 | ) 48 | df2 = _apply_tokenizer( 49 | sess, 50 | test_df, 51 | tkn.Tokenizer(max_token_len=None, key=key2), 52 | col_to_rename="to_token(name)", 53 | ) 54 | pdt.assert_frame_equal(df1, df2) 55 | 56 | 57 | def test_tokenizer_is_not_linkable(): 58 | sess = utils.make_session("test.tokenizer.isNotLinkable") 59 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 60 | key1 = "secret_key" 61 | key2 = "not_your_secret_key" 62 | df1 = _apply_tokenizer( 63 | sess, 64 | test_df, 65 | tkn.Tokenizer(max_token_len=None, key=key1), 66 | col_to_rename="to_token(name)", 67 | ) 68 | df2 = _apply_tokenizer( 69 | sess, 70 | test_df, 71 | tkn.Tokenizer(max_token_len=None, key=key2), 72 | col_to_rename="to_token(name)", 73 | ) 74 | try: 75 | pdt.assert_frame_equal(df1, df2) 76 | raise NotImplemented # noqa: F901 77 | except AssertionError: 78 | pass 79 | except NotImplemented: 80 | raise AssertionError 81 | 82 | 83 | def test_tokenizer_with_max_token_len(): 84 | sess = utils.make_session("test.tokenizer.maxTokenLen") 85 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 86 | expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]}) 87 | max_token_len = 10 88 | key = "secret_key" 89 | df = _apply_tokenizer( 90 | sess, 91 | test_df, 92 | tkn.Tokenizer(max_token_len=max_token_len, key=key), 93 | col_to_rename="to_token(name)", 94 | ) 95 | pdt.assert_frame_equal(df, expected) 96 | 97 | 98 | def test_tokenizer_no_key(): 99 | sess = utils.make_session("test.tokenizer.maxTokenLen") 100 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 101 | _apply_tokenizer( 102 | sess, 103 | test_df, 104 | tkn.Tokenizer(max_token_len=None, key=None), 105 | col_to_rename="to_token(name)", 106 | ) 107 | 108 | 109 | def test_reversible_tokenizer(): 110 | sess = utils.make_session("test.tokenizer.reversibleTokenizer") 111 | key = b"5" * 32 112 | plaintext = pd.DataFrame({"name": ["Alice", "Bob"]}) 113 | 114 | tokenized = _apply_tokenizer( 115 | sess, 116 | plaintext, 117 | tkn.ReversibleTokenizer(key=key), 118 | col_to_rename="to_token(name)", 119 | ) 120 | tokenized_expected = pd.DataFrame( 121 | { 122 | "name": [ 123 | "c8c7e80144304276183e5bcd589db782bc5ff95309", 124 | "e0f40aea0d5c21b35967c4231b98b5b3e5338e", 125 | ] 126 | } 127 | ) 128 | pdt.assert_frame_equal(tokenized, tokenized_expected) 129 | 130 | recovered = _apply_tokenizer( 131 | sess, 132 | tokenized, 133 | tkn.TokenReverser(key=key), 134 | col_to_rename="from_token(name)", 135 | ) 136 | pdt.assert_frame_equal(recovered, plaintext) 137 | 138 | 139 | def test_reversible_tokenizer_string_key(): 140 | _ = tkn.ReversibleTokenizer(key="5" * 32) 141 | 142 | 143 | def test_reversible_tokenizer_insufficient_key(): 144 | with pytest.raises(ValueError): 145 | _ = tkn.ReversibleTokenizer(key=b"5" * 10) 146 | -------------------------------------------------------------------------------- 
/cape_dataframes/spark/transformer.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | from cape_dataframes.spark.transformations import base as tfm 5 | 6 | 7 | def transformer(transformation: tfm.Transformation, df: sql.DataFrame, field_name: str): 8 | field_column = functions.col(field_name) 9 | return df.withColumn(field_name, transformation(field_column)) 10 | -------------------------------------------------------------------------------- /cape_dataframes/spark/utils.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from packaging import version 3 | from pyspark import sql 4 | 5 | _3_0_0_VERSION = version.Version("3.0.0") 6 | _spark_version = version.parse(pyspark.__version__) 7 | 8 | 9 | def configure_session(sess: sql.SparkSession, arrow=True): 10 | if arrow: 11 | if _spark_version >= _3_0_0_VERSION: 12 | sess.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 13 | else: 14 | sess.conf.set("spark.sql.execution.arrow.enabled", "true") 15 | return sess 16 | 17 | 18 | def make_session(name: str, arrow: bool = True): 19 | sess_builder = sql.SparkSession.builder 20 | sess_builder = sess_builder.appName(name) 21 | sess = sess_builder.getOrCreate() 22 | sess = configure_session(sess, arrow=arrow) 23 | return sess 24 | -------------------------------------------------------------------------------- /cape_dataframes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/cape_dataframes/utils/__init__.py -------------------------------------------------------------------------------- /cape_dataframes/utils/base64.py: -------------------------------------------------------------------------------- 1 | from base64 import urlsafe_b64decode 2 | from base64 import urlsafe_b64encode 3 | from typing import Union 4 | 5 | 6 | # This implements a similar wrapped as cape has in golang. 7 | # It stores the bytes and converts it to encoded string as needed. 8 | # The python base64 package appends padding when encoding but 9 | # in cape this causes errors to occur so must strip that off 10 | # before sending. 11 | class Base64: 12 | def __init__(self, value: Union[str, bytes]): 13 | self.bytes = value 14 | if type(value) == str: 15 | self.bytes = bytes(value, "utf-8") 16 | 17 | def __bytes__(self) -> bytes: 18 | return self.bytes 19 | 20 | # returns the base64 encoded value as a string 21 | def __str__(self) -> str: 22 | b = urlsafe_b64encode(self.bytes) 23 | b = b.strip(b"==") 24 | 25 | return str(b, "utf-8") 26 | 27 | 28 | # Returns a Base64 object from the base64 encoded string. 29 | # Adds padding when decoding so that it doesn't error. 
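# Example (illustrative, consistent with base64_test.py):
#   str(Base64("heythere")) == "aGV5dGhlcmU"   # padding stripped on encode
#   str(from_string("ABCD")) == "ABCD"          # padding re-added on decode, so round-trips cleanly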
30 | def from_string(s: str) -> Base64: 31 | b = urlsafe_b64decode(bytes(s, "utf-8") + b"==") 32 | return Base64(b) 33 | -------------------------------------------------------------------------------- /cape_dataframes/utils/base64_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils.base64 import Base64 2 | from cape_dataframes.utils.base64 import from_string 3 | 4 | 5 | def test_base64(): 6 | b64 = Base64("heythere") 7 | assert "aGV5dGhlcmU" == str(b64) 8 | 9 | 10 | def test_from_string(): 11 | s = "ABCD" 12 | b64 = from_string(s) 13 | 14 | assert s == str(b64) 15 | -------------------------------------------------------------------------------- /cape_dataframes/utils/typecheck.py: -------------------------------------------------------------------------------- 1 | def check_arg(arg, types): 2 | if not isinstance(arg, types): 3 | if not isinstance(types, (tuple, list)): 4 | types = (types,) 5 | raise ValueError("Expected one of {}, got {}.".format(types, type(arg))) 6 | -------------------------------------------------------------------------------- /cape_dataframes/utils/typecheck_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils import typecheck 2 | 3 | 4 | def _make_args_and_types(): 5 | string = "hi" 6 | integer = 4 7 | flt = 2.0 8 | lst = [string, integer, flt] 9 | tpl = (string, integer, float) 10 | args_list = (string, integer, flt, lst, tpl, None) 11 | types_list = (str, int, float, list, tuple, type(None)) 12 | return args_list, types_list 13 | 14 | 15 | def test_typecheck_args(): 16 | args_list, types_list = _make_args_and_types() 17 | # check passing 18 | for a, t in zip(args_list, types_list): 19 | typecheck.check_arg(a, t) 20 | # check failure 21 | for a, t in zip(args_list, types_list[::-1]): 22 | try: 23 | typecheck.check_arg(a, t) 24 | raise AssertionError 25 | except ValueError: 26 | pass 27 | 28 | 29 | def test_typecheck_more_types_passes(): 30 | args_list, types_list = _make_args_and_types() 31 | args = args_list[:3] 32 | types = types_list[:3] 33 | for arg in args: 34 | typecheck.check_arg(arg, types) 35 | 36 | 37 | def test_typecheck_more_types_fails(): 38 | args_list, types_list = _make_args_and_types() 39 | arg = args_list[-1] 40 | types = types_list[:3] 41 | try: 42 | typecheck.check_arg(arg, types) 43 | raise AssertionError 44 | except ValueError: 45 | pass 46 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 90% 6 | threshold: 1% 7 | base: auto 8 | patch: 9 | default: 10 | target: 93% 11 | threshold: 5% 12 | base: auto -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes overview 2 | 3 | Cape Dataframes allows you to write data privacy policies and data transformations to integrate with [Pandas](https://pandas.pydata.org/) and [Spark](https://spark.apache.org/). 4 | 5 | You can view the source code in the [Cape Dataframes GitHub Repository](https://github.com/capeprivacy/cape-dataframes). 6 | 7 | ## Use cases 8 | 9 | Review the [transformations](./transformations) and decide which are a good fit for your data science needs. 
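For example, a policy aimed at exploratory analysis could pair tokenization on an identifier column with rounding on a numeric column (a sketch; the column names and key below are placeholders):

```yaml
label: eda_policy
version: 1
rules:
  - match:
      name: user_id        # illustrative column name
    actions:
      - transform:
          type: tokenizer
          max_token_len: 10
          key: "change me"
  - match:
      name: weight         # illustrative column name
    actions:
      - transform:
          type: numeric-rounding
          dtype: Double
          precision: 1
```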
10 | 11 | The 0.1.0 release includes five transformations that provide some common privacy protections. 12 | 13 | | Use case | Text data | Numeric data | Inconsistent data 14 | | ------------- | ------------- | --------------- | ----------- 15 | | EDA | Tokenization | Rounding or pertubation | Tokenization 16 | | Analytics | Tokenization | Rounding or pertubation | - 17 | | ML development | - | Rounding or pertubation | Tokenization 18 | | ML training/serving | No transformation | No transformation | No transformation 19 | 20 | Cape Dataframes will support more use cases through additional transformations in future releases. 21 | -------------------------------------------------------------------------------- /docs/policies.md: -------------------------------------------------------------------------------- 1 | # Policies 2 | 3 | The data policy defines the data you want to change, and the [transformations](./transformations) or [redactions](./redactions) you want to apply. 4 | 5 | Cape Dataframes requires data policies in YAML format. This example describes all the available YAML objects: 6 | 7 | ``` yaml 8 | # Required. The policy name. 9 | label: test_policy 10 | # Required. The Cape Dataframes specification version. Must be 1. 11 | version: 1 12 | # Configure your named transformations. 13 | # Named transformations allow you to reuse a transformation 14 | # with a set value throughout your policy. 15 | transformations: 16 | # This named transformation uses the built-in tokenizer transformation 17 | - name: my_tokenizer 18 | type: tokenizer 19 | max_token_len: 10 20 | key: "my secret" 21 | rules: 22 | # Required. The column name. 23 | - match: 24 | name: fruit 25 | actions: 26 | # This example shows a named transformation. 27 | # It tells the policy runner to apply the my_tokenizer transformation 28 | # to all fields in the "fruit" column. 29 | - transform: 30 | name: my_tokenizer 31 | - match: 32 | name: weight 33 | actions: 34 | - transform: 35 | # This example shows an unnamed transformation. 36 | # It tells the policy runner to: 37 | # (1) Apply the transformation numeric-rounding 38 | # (2) Round to one decimal place 39 | type: numeric-rounding 40 | dtype: Double 41 | precision: 1 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes API 2 | 3 | This guide provides an example of using Cape Dataframes with either Pandas or Spark. 4 | 5 | ## Prerequisites 6 | 7 | * Python 3.6 or above. 8 | * Cape Dataframes recommends using a virtual environment such as [venv](https://docs.python.org/3/library/venv.html). 9 | 10 | 11 | ## Installation 12 | 13 | You can install Cape Dataframes with pip: 14 | 15 | ```shell 16 | pip install cape-privacy 17 | ``` 18 | 19 | ## Quickstart 20 | 21 | ### Write the policy 22 | 23 | The data policy file defines the target data and permissions. It is written in YAML. Cape Dataframes reads the `.yaml` policy file and applies the policies based on your [policy application script](#write-the-policy-application-script). 24 | 25 | Create a `test-policy.yaml` file in your project, with the following content: 26 | 27 | ```yaml 28 | label: test-policy 29 | version: 1 30 | rules: 31 | # Set the column name 32 | - match: 33 | name: weight 34 | actions: 35 | - transform: 36 | # This example shows an unnamed transformation. 
37 | # It tells the policy runner to: 38 | # (1) Apply the transformation numeric-rounding 39 | # (2) Round to one decimal place 40 | type: numeric-rounding 41 | dtype: Double 42 | precision: 1 43 | ``` 44 | 45 | 46 | ### Write the policy application script 47 | 48 | To apply the policy `.yaml` to your data, you must run a script that defines which policy you apply to which data target. 49 | 50 | Create a `test-transformation.py` file in your project, with the following content: 51 | 52 | 53 | === "Pandas" 54 | ```python 55 | import cape_dataframes as cape_df 56 | import pandas as pd 57 | 58 | # Create a simple Pandas DataFrame 59 | df = pd.DataFrame([114.432, 134.622, 142.984], columns=["weight"]) 60 | # Load the privacy policy 61 | policy = cape_df.parse_policy("test-policy.yaml") 62 | # Apply the policy to the DataFrame 63 | df = cape_df.apply_policy(policy, df, inplace=False) 64 | # Output the altered data 65 | print(df.head()) 66 | ``` 67 | 68 | === "Spark" 69 | ```python 70 | import cape_dataframes as cape_df 71 | from pyspark import sql 72 | 73 | sess_builder = sql.SparkSession.builder 74 | sess_builder = sess_builder.appName('cape.examples.rounding') 75 | sess_builder = sess_builder.config('spark.sql.execution.arrow.enabled', 'true') 76 | sess = sess_builder.getOrCreate() 77 | 78 | # Create a simple Spark DataFrame 79 | df = sess.createDataFrame([114.432, 134.622, 142.984], "double").toDF("weight") 80 | # Load the privacy policy 81 | policy = cape_df.parse_policy("test-policy.yaml") 82 | # Apply the policy to the DataFrame 83 | df = cape_df.apply_policy(policy, df, inplace=False) 84 | # Output the altered data 85 | print(df.show()) 86 | ``` 87 | 88 | 89 | ### Run your transformations 90 | 91 | The quickstart example creates a dataset programatically, so you can run the policy application script and view the output: 92 | 93 | ```shell 94 | python test-transformation.py 95 | ``` 96 | 97 | 98 | ### Usage Best Practices 99 | 100 | * Ensure that you have your data collected and joined before applying transformations, especially in the case of multiple sensitive columns. 101 | * Some transformations require sensitive data to be contained in the policy files. For this reason, keep your policy files stored securely. In a future release, we will support pulling transformation keys from key storage software, such as Hashicorp Vault. 102 | * Consider using transformations as the final step in your pre-processing before creating a "clean sink" or "safe dataset". This means that you can begin your work on that clean dataset. 103 | * Experiment with the transformations directly on your data to learn how they impact your data utility. Figure out the right utility vs. privacy tradeoff for the task at hand, and amend your policy accordingly. 104 | -------------------------------------------------------------------------------- /docs/redactions.md: -------------------------------------------------------------------------------- 1 | # Redactions 2 | 3 | Redactions involve dropping the matched data. Unlike [transformations](./transformations), which modify but preserve data, redactions will change the shape of your dataframes. 4 | 5 | Cape Dataframes has one built-in redaction function. This document describes what it does, and provides an example of how to use it in your policy. 6 | 7 | !!! warning 8 | Redactions change the shape of your data. 9 | 10 | ## Column redaction 11 | 12 | The `column-redact` redaction deletes matching columns. 
13 | 14 | ```yaml 15 | - transform: 16 | type: "column-redact" 17 | # Replace with the column name you want to redact. 18 | columns: [""] 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /docs/transformations.md: -------------------------------------------------------------------------------- 1 | # Transformations 2 | 3 | Transformations are functions that alter your data, ensuring it is free of sensitive information. 4 | 5 | Cape Dataframes has five built-in transformation functions. This document describes what they do, and provides an example of how to use each transformation in your policy. 6 | 7 | ## Date perturbation 8 | 9 | The `date-perturbation` transformation adds random noise to dates. The amount of noise depends on the `min` and `max` values that you set in the policy. 10 | 11 | ``` yaml 12 | - transform: 13 | type: date-pertubation 14 | frequency: 15 | min: 16 | max: 17 | # Optional. The base number to initialize the random number generator. 18 | # Pandas only (Spark does not currently support seeding) 19 | seed: 20 | ``` 21 | 22 | 23 | ## Date truncation 24 | 25 | The `date-truncation` transformation shortens dates to a unit (year or month). Set the unit in `frequency`. 26 | 27 | ``` yaml 28 | - transform: 29 | type: date-truncation 30 | frequency: 31 | ``` 32 | 33 | ## Numeric pertubation 34 | 35 | The `numeric-pertubation` transformation adds random noise to numeric data sets. The amount of noise depends on the `min` and `max` values that you set in the policy. 36 | 37 | ``` yaml 38 | - transform: 39 | type: numeric-pertubation 40 | dtype: 41 | min: 42 | max: 43 | # Optional. The base number to initialize the random number generator. 44 | seed: 45 | ``` 46 | 47 | ## Numeric rounding 48 | 49 | The `numeric-rounding` transformation rounds numeric values to a given number of decimal places. Use `precision` to set the number of decimal places. 50 | 51 | ``` yaml 52 | - transform: 53 | type: numeric-rounding 54 | dtype: 55 | precision: 56 | ``` 57 | 58 | ## Tokenizer 59 | 60 | The `tokenizer` transformation maps a string to a token to obfuscate it. 61 | 62 | !!! warning 63 | Linkable tokenization for sensitive data is vulnerable to privacy attacks. Cape Privacy does not recommend sharing tokenized data with preserved linkability with untrusted or outside parties. Cape Python does not support anonymized transformations. 64 | 65 | ``` yaml 66 | - transform: 67 | type: tokenizer 68 | # Default is 64 69 | max_token_len: 70 | # If unspecified, Cape Dataframes uses a random byte string 71 | key: 72 | ``` 73 | 74 | ## ReversibleTokenizer 75 | 76 | The `ReversibleTokenizer` transformation maps a sting to a token to obfuscate it. However, when using the `ReversibleTokenizer`, the tokens can be reverted back to their plaintext form by using the `TokenReverser`. 77 | 78 | ```yaml 79 | - transform: 80 | type: reversible-tokenizer 81 | # If unspecified, Cape Dataframes uses a random byte string 82 | key: 83 | ``` 84 | 85 | ## TokenReverser 86 | 87 | The `TokenReverser` is designed to be used with the `ReversibleTokenizer`. The `TokenReverser` reverts tokens produced by the `ReversibleTokenizer` back to their plaintext form. 
88 | 89 | ```yaml 90 | - transform: 91 | type: token-reverser 92 | # If unspecified, Cape Dataframes uses a random byte string 93 | key: 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/tutorials/reversible-tokenization.md: -------------------------------------------------------------------------------- 1 | # Reversible Tokenizer 2 | 3 | Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data within a pandas dataframe. 4 | 5 | The `ReversibleTokenizer` will tokenize the input data so it can be used in a privacy preserving manner. 6 | 7 | The `ReversibleTokenizer` can be used in conjunction with the `TokenReverser` to recover the original data. 8 | 9 | ## Tokenizing Data 10 | 11 | The `ReversibleTokenizer` and `TokenReverser` classes can be found in the `pandas.transformations` package. 12 | 13 | ```python 14 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 15 | from cape_dataframes.pandas.transformations import TokenReverser 16 | ``` 17 | 18 | In this example, we will simply hide the names within our dataset. 19 | 20 | ```python 21 | import pandas as pd 22 | plaintext_data = pd.DataFrame({'name': ["Alice", "Bob", "Carol"], "# friends": [100, 200, 300]}) 23 | ``` 24 | 25 | You instantiate a `ReversibleTokenizer` by passing it a key. For the `TokenReverser` to be able to reverse the tokens produced by the `ReversibleTokenizer`, you must use the same key. 26 | 27 | ```python 28 | key=b"5" * 32 29 | tokenizer = ReversibleTokenizer(key=key) 30 | ``` 31 | 32 | ```python 33 | tokenized = pd.DataFrame(plaintext_data) 34 | tokenized["name"] = tokenizer(plaintext_data["name"]) 35 | ``` 36 | 37 | ## Recovering Tokens 38 | 39 | If we ever need to reveal the tokenized data, we can use the `TokenReverser` class. 40 | 41 | ```python 42 | reverser = TokenReverser(key=key) 43 | recovered = pd.DataFrame(tokenized) 44 | recovered["name"] = reverser(tokenized["name"]) 45 | ``` 46 | 47 | You can see full code for this example on [Github](https://github.com/capeprivacy/cape-dataframes/blob/master/examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.ipynb) 48 | -------------------------------------------------------------------------------- /examples/notebooks/Cape Policy for Spark - IoT Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exploring Cape Python Policy with Pandas and Cape Core\n", 8 | "\n", 9 | "This Jupyter Notebook is accompanied by our [Medium Post on Getting Started with Cape Core](https://medium.com/dropoutlabs/cape-core-privacy-and-data-science-working-together-d25a55526506). To follow along, you will need to [download the example dataset](https://capeprivacy.com/example-dataset/) and put it in a relative folder called `data` (or update the file path below). You will also need to [download the policy file](https://github.com/capeprivacy/cape-python/blob/master/examples/policy/iot_example_policy.yaml) and put it in a relative folder called `policy` or ensure you have Cape Python installed locally and change the path to use the copy in the `examples` folder.\n", 10 | "\n", 11 | "You will also need a local (or deployed version) of [Cape Core](https://github.com/capeprivacy/cape) running and have generated an API token to follow along." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "\n", 23 | "
\n", 24 | "

SparkSession - hive

\n", 25 | " \n", 26 | "
\n", 27 | "

SparkContext

\n", 28 | "\n", 29 | "

Spark UI

\n", 30 | "\n", 31 | "
\n", 32 | "
Version
\n", 33 | "
v3.0.0
\n", 34 | "
Master
\n", 35 | "
local[*]
\n", 36 | "
AppName
\n", 37 | "
PySparkShell
\n", 38 | "
\n", 39 | "
\n", 40 | " \n", 41 | "
\n", 42 | " " 43 | ], 44 | "text/plain": [ 45 | "" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "spark" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import cape_dataframes as cape_df" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df = spark.read.csv('../data/iot_example.csv', header=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 85 | "| timestamp| username|temperature|heartrate| build|latest| note|\n", 86 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 87 | "|2017-01-01T12:18:39| moonjuan| 26| 76|22989085-e6fe-eae...| 1| n/a|\n", 88 | "|2017-01-01T12:22:52| ylee| 29| 73|ff29e7ab-934f-f7b...| 0| test|\n", 89 | "|2017-01-01T12:32:20| alicecampos| 29| 76|547ed6d5-0e12-4c2...| 0| test|\n", 90 | "|2017-01-01T12:36:40| stevenmiller| 26| 64|e12b053c-d772-c94...| 0|update|\n", 91 | "|2017-01-01T12:40:26|robinsongabriel| 17| 80|f0bfb52c-b805-cd1...| 1| n/a|\n", 92 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 93 | "\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "df.sample(0.1).limit(5).show()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Privacy Concerns\n", 106 | "\n", 107 | "In this dataset which has mock data from wearable devices, we are concerned about the privacy of the individuals. It is a timeseries-based analysis, so we'd like to ensure we retain the ability to see the data of an individual change over time, but we want to provide some basic privacy protections for our exploratory data analysis and later model development activities.\n", 108 | "\n", 109 | "The following policy file provides these protections:\n", 110 | "\n", 111 | "- [Tokenization](https://docs.capeprivacy.com/libraries/cape-python/transformations/#tokenizer) of the username column with a maximum token length of 10 and a key defined in the file.\n", 112 | "- [Date Truncation](https://docs.capeprivacy.com/libraries/cape-python/transformations/#date-truncation) for the timestamp column - removing the minutes and seconds of the data but keeping the year, month, date and hour.\n", 113 | "- [Redaction](https://docs.capeprivacy.com/libraries/cape-python/redactions) of the build column, which reveals information about the device it was built on. In Cape, redaction involves dropping of the matching data so this will change the shape of your dataframes." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "label: iot_dataset_policy\r\n", 126 | "version: 1\r\n", 127 | "rules:\r\n", 128 | " - match:\r\n", 129 | " name: username\r\n", 130 | " actions:\r\n", 131 | " - transform:\r\n", 132 | " type: \"tokenizer\"\r\n", 133 | " max_token_len: 10\r\n", 134 | " key: \"Please change this :)\"\r\n", 135 | " - match:\r\n", 136 | " name: timestamp\r\n", 137 | " actions:\r\n", 138 | " - transform:\r\n", 139 | " type: \"date-truncation\"\r\n", 140 | " frequency: \"hour\"\r\n", 141 | " - match:\r\n", 142 | " name: build\r\n", 143 | " actions:\r\n", 144 | " - transform:\r\n", 145 | " type: \"column-redact\"\r\n", 146 | " columns: [\"build\"] \r\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "!cat ../policy/iot_example_policy.yaml" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### With Cape Core\n", 159 | "\n", 160 | "If you are using Cape Core and have a project setup and registered with the above policy as well as an API token, you can use the following code to download the policy from the Cape Coordinator." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "c = cape_df.Client(\"http://localhost:8080\")\n", 170 | "c.login(\"INSERT YOUR CAPE TOKEN HERE\")\n", 171 | "policy = c.get_policy(\"first-project\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Apply the parsed policy\n", 179 | "\n", 180 | "To apply the parsed policy, call the `apply_policy` function to your dataframe and sample the results." 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stderr", 190 | "output_type": "stream", 191 | "text": [ 192 | "/usr/local/spark/python/pyspark/sql/pandas/functions.py:386: UserWarning: In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.\n", 193 | " \"in the future releases. 
See SPARK-28264 for more details.\", UserWarning)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "caped_df = cape_df.apply_policy(policy, df)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+----------+----------+-----------+---------+------+--------+\n", 211 | "| timestamp| username|temperature|heartrate|latest| note|\n", 212 | "+----------+----------+-----------+---------+------+--------+\n", 213 | "|2017-01-01|1763f4313b| 22| 83| 1| update|\n", 214 | "|2017-01-01|d0c44f5675| 12| 77| 0| wake|\n", 215 | "|2017-01-01|0a89db1e39| 12| 78| 1|interval|\n", 216 | "|2017-01-01|26594010f3| 29| 76| 0| test|\n", 217 | "|2017-01-01|37db75f0f1| 12| 71| 0| sleep|\n", 218 | "+----------+----------+-----------+---------+------+--------+\n", 219 | "\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "caped_df.sample(0.1).limit(5).show()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Send it to Sink\n", 232 | "\n", 233 | "Now it's time to send along our caped DataFrame to our clean sink or utilize it in a Spark task (for example, for analytics, EDA or machine learning). \n", 234 | "\n", 235 | "Note: You'll need to edit the database details below (or specify where you'd like the dataframe to be written." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "caped_df.write \\\n", 245 | " .format(\"jdbc\") \\\n", 246 | " .option(\"url\", \"jdbc:postgresql:dbserver\") \\\n", 247 | " .option(\"dbtable\", \"schema.tablename\") \\\n", 248 | " .option(\"user\", \"username\") \\\n", 249 | " .option(\"password\", \"password\") \\\n", 250 | " .save()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "cape-df", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | "codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.8.16" 278 | }, 279 | "vscode": { 280 | "interpreter": { 281 | "hash": "2c0eb9acd3ce9f628738cc91d7613a5d048e1a93f709104c9a35d77254cfaaac" 282 | } 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Jupyter Notebooks: Working Examples for Cape Python 2 | 3 | `cape-privacy` gives you the ability to apply several masking techniques (transformations) such as tokenization, perturbation, rounding, etc., in order to obfuscate personal information contained in your dataset. You can find out more by visiting [our documentation](https://docs.capeprivacy.com/libraries/cape-python/). 4 | 5 | ## Notebook Overview 6 | 7 | There are several posts related to these Jupyter notebooks to assist you in evaluating the code and privacy considerations. The related policy files can be found in the `policy` folder in this `examples` folder. 
You will find links to the datasets in the individual notebooks. Note that some datasets are reused, so you might need to download only once. 8 | 9 | ### Cape Core: Introduction to Collaborative Privacy and Security Policy 10 | 11 | This [overview of Cape Core software](https://medium.com/dropoutlabs/cape-core-privacy-and-data-science-working-together-d25a55526506) walks you through the use of Cape Core alongside Cape Privacy. It is a great way to get started with managing your transformations in a responsible way. Cape Core allows you to store policies centrally and coordinate them with your data science and machine learning peers. There are two notebooks related to this blog post: 12 | 13 | - [Cape Policy for Pandas - IoT Example](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Policy%20for%20Pandas%20-%20IoT%20Example.ipynb) 14 | - [Cape Policy for Spark - IoT Example](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Policy%20for%20Spark%20-%20IoT%20Example.ipynb) 15 | 16 | ### Coiled Science Thursdays: Data Privacy in Distributed Data Science 17 | 18 | The Cape team was invited to join in on [Coiled](https://coiled.io/)'s Science Thursdays to show how [Cape Python can help add privacy to distributed data science](https://coiled.io/blog/data-privacy-distributed-compute.html). This [live webinar (recording available)](https://www.youtube.com/watch?v=cIvv8EGMDY0&feature=youtu.be) walks you through the use of Cape Privacy in Spark and Pandas. There are two notebooks related to [this recording and 19 | writeup](https://coiled.io/blog/data-privacy-distributed-compute.html): 20 | 21 | - [Exploring Cape Python in an EDA Setting](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Python%20with%20Pandas%20-%20IoT%20Exploratory%20Data%20Analysis.ipynb) 22 | - [Implementing Policy in Apache Spark - Taxi Dataset](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Python%20with%20PySpark%20-%20Taxi%20Dataset.ipynb) 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples/policy/iot_example_policy.yaml: -------------------------------------------------------------------------------- 1 | label: iot_examplew_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: username 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "Please change this :)" 11 | - match: 12 | name: timestamp 13 | actions: 14 | - transform: 15 | type: "date-truncation" 16 | frequency: "hour" 17 | - match: 18 | name: build 19 | actions: 20 | - transform: 21 | type: "column-redact" 22 | columns: ["build"] 23 | -------------------------------------------------------------------------------- /examples/policy/mask_personal_information.yaml: -------------------------------------------------------------------------------- 1 | label: masking_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "my secret" 11 | - match: 12 | name: age 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: -10 18 | max: 10 19 | - match: 20 | name: salary 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: -3 26 | - match: 27 | name: birthdate 28 | actions: 29 | - transform: 30 | type: "date-perturbation" 31 | frequency: ["YEAR", "MONTH", "DAY"] 32 | min: [-10, -5, -5] 33 | max: 
[10, 5, 5] 34 | - match: 35 | name: ssn 36 | actions: 37 | - transform: 38 | type: "column-redact" 39 | columns: ["ssn"] 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/policy/nyc_taxi_dataset_policy.yaml: -------------------------------------------------------------------------------- 1 | label: taxi_dataset_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: VendorID 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "Please change this :)" 11 | - match: 12 | name: passenger_count 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: 0 18 | max: 2 19 | - match: 20 | name: pickup_longitude 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: 4 26 | - match: 27 | name: pickup_latitude 28 | actions: 29 | - transform: 30 | type: "numeric-rounding" 31 | dtype: Double 32 | precision: 4 33 | - match: 34 | name: dropoff_longitude 35 | actions: 36 | - transform: 37 | type: "numeric-rounding" 38 | dtype: Double 39 | precision: 4 40 | - match: 41 | name: dropoff_latitude 42 | actions: 43 | - transform: 44 | type: "numeric-rounding" 45 | dtype: Double 46 | precision: 4 47 | - match: 48 | name: PULocationID 49 | actions: 50 | - transform: 51 | type: "column-redact" 52 | columns: ["PULocationID", "DOLocationID"] 53 | -------------------------------------------------------------------------------- /examples/policy/perturb_value_field.yaml: -------------------------------------------------------------------------------- 1 | label: perturb-ones-field 2 | version: 1 3 | rules: 4 | - match: 5 | name: ones 6 | actions: 7 | - transform: 8 | type: numeric-perturbation 9 | dtype: Integer 10 | min: -10 11 | max: 10 12 | seed: 4984 13 | -------------------------------------------------------------------------------- /examples/policy/spark_round.yaml: -------------------------------------------------------------------------------- 1 | label: spark-round-float 2 | version: 1 3 | transformations: 4 | - name: roundFloat 5 | type: numeric-rounding 6 | dtype: Float 7 | precision: 0 8 | rules: 9 | - match: 10 | name: ones 11 | actions: 12 | - transform: 13 | name: roundFloat 14 | -------------------------------------------------------------------------------- /examples/simple_transformation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import cape_dataframes as cape 5 | 6 | policy = cape.parse_policy("policy/perturb_value_field.yaml") 7 | 8 | df = pd.DataFrame( 9 | np.ones( 10 | 5, 11 | ), 12 | columns=["ones"], 13 | ) 14 | df = cape.apply_policy(policy, df) 15 | print(df.head()) 16 | -------------------------------------------------------------------------------- /examples/spark_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pyspark import sql 4 | 5 | import cape_dataframes as cape 6 | 7 | sess_builder = sql.SparkSession.builder 8 | sess_builder = sess_builder.appName("cape.examples.rounding") 9 | sess = sess_builder.getOrCreate() 10 | sess = cape.spark.configure_session(sess) 11 | 12 | pdf = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"]) 13 | df = sess.createDataFrame(pdf) 14 | df.show() 15 | 16 | policy = cape.parse_policy("policy/spark_round.yaml") 17 | result = cape.apply_policy(policy, df) 18 | result.show() 19 | 
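A note on the two Spark examples above: `spark_round.yaml` declares a named, reusable transformation (`roundFloat`) under `transformations:` and then references it by `name` from a rule, and `spark_example.py` applies that policy with `cape.apply_policy`. The sketch below is illustrative only (it is not a file in this repository, and the app name and toy DataFrame are made up to match the example above); it shows the equivalent call made directly through the `cape_dataframes.spark.transformations` API, with the same numeric-rounding parameters (dtype `Float`, precision `0`).

```python
# Illustrative sketch: the direct-API equivalent of the named "roundFloat"
# transformation declared in spark_round.yaml, applied without a policy file.
import numpy as np
import pandas as pd
from pyspark import sql
from pyspark.sql import functions

import cape_dataframes as cape
from cape_dataframes.spark import dtypes
from cape_dataframes.spark.transformations import NumericRounding

sess = sql.SparkSession.builder.appName("cape.examples.rounding.inline").getOrCreate()
sess = cape.spark.configure_session(sess)

# Same toy data as spark_example.py: five float32 values of 1.2 in a column named "ones".
pdf = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"])
df = sess.createDataFrame(pdf)

# Mirrors the policy entry: type numeric-rounding, dtype Float, precision 0.
round_float = NumericRounding(dtype=dtypes.Float, precision=0)
df.select(round_float(functions.col("ones")).alias("ones")).show()
```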
-------------------------------------------------------------------------------- /examples/tutorials/credit/README.md: -------------------------------------------------------------------------------- 1 | # Credit Risk Tutorial 2 | 3 | This tutorial was created for the blog post [Cape Python: Apply Privacy-Enhancing Techniques to Protect Sensitive Data in Pandas and Spark](https://medium.com/dropoutlabs/cape-python-apply-privacy-enhancing-techniques-to-protect-sensitive-data-in-pandas-and-spark-e0bf8c0d55db). 4 | 5 | As an example, we experiment with the public [German credit dataset](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)). We just added some fake PII information (such as name, address, etc.) and quasi-identifiers (city, salary, etc.) to make it more similar to a real dataset for which we would use the masking techniques. 6 | 7 | ## Prototype your privacy-preserving pipeline in Pandas 8 | 9 | The notebook `mask_credit_data_in_pandas.ipynb` shows you how you can prototype the masking techniques in Pandas, and then define a data privacy policy. 10 | 11 | ## Make your Spark pipeline privacy-preserving 12 | Once you have defined the data privacy policy, you can apply it to a Spark DataFrame. As an example, you can run the following script: 13 | ``` 14 | # submit the script to a Spark cluster 15 | spark-submit apply_policy_spark.py 16 | ``` 17 | -------------------------------------------------------------------------------- /examples/tutorials/credit/apply_policy_spark.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | import cape_dataframes as cape 5 | 6 | # Set up your SparkSession as usual, but configure it for use with Cape. 7 | # We do this because some transformations expect Arrow to be enabled.
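# Under the hood, configure_session is expected to switch the Arrow-related
# settings on for you; conceptually that is similar to calling something like
# `.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")` on the session
# created below, although the exact options applied are an implementation detail
# of cape.spark.configure_session rather than anything documented here.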
8 | sess = sql.SparkSession.builder.appName( 9 | "cape.tutorial.maskPersonalInformation" 10 | ).getOrCreate() 11 | sess = cape.spark.configure_session(sess) 12 | 13 | # Load a Spark DataFrame 14 | df = sess.read.load( 15 | "data/credit_with_pii.csv", format="csv", sep=",", inferSchema="true", header="true" 16 | ) 17 | df = df.withColumn( 18 | "Application_date", 19 | functions.to_date(functions.col("Application_date"), "yyyy-MM-dd"), 20 | ) 21 | print("Original Dataset:") 22 | print(df.show()) 23 | # Load the privacy policy and apply it to the DataFrame 24 | policy = cape.parse_policy("policy/credit_policy.yaml") 25 | df = cape.apply_policy(policy, df) 26 | 27 | print("Masked Dataset:") 28 | print(df.show()) 29 | -------------------------------------------------------------------------------- /examples/tutorials/credit/policy/credit_policy.yaml: -------------------------------------------------------------------------------- 1 | label: credit_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: Name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | - match: 10 | name: Sex 11 | actions: 12 | - transform: 13 | type: "tokenizer" 14 | max_token_len: 10 15 | - match: 16 | name: Age 17 | actions: 18 | - transform: 19 | type: "numeric-perturbation" 20 | dtype: Integer 21 | min: -5 22 | max: 5 23 | - match: 24 | name: Salary 25 | actions: 26 | - transform: 27 | type: "numeric-rounding" 28 | dtype: Double 29 | precision: -3 30 | - match: 31 | name: Application_date 32 | actions: 33 | - transform: 34 | type: "date-perturbation" 35 | frequency: DAY 36 | min: -3 37 | max: 3 38 | - match: 39 | name: City 40 | actions: 41 | - transform: 42 | type: "column-redact" 43 | columns: ["City"] 44 | - match: 45 | name: Street_address 46 | actions: 47 | - transform: 48 | type: "column-redact" 49 | columns: ["Street_address"] 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: Mask Your Data in Pandas and Spark 2 | 3 | `cape-privacy` gives you the ability to apply several masking techniques (transformations) such as tokenization, perturbation, rounding, etc., in order to obfuscate personal information contained in your dataset. 4 | 5 | In this short tutorial, we will show you how you can prototype a masking policy on a Pandas DataFrame to then apply it on a Spark DataFrame. 6 | 7 | ## Experiment with masking techniques without a policy file 8 | 9 | In order to get familiar with the different masking techniques and identify which one would fit your needs, you can apply these transformations directly on a Pandas DataFrame through the `cape_dataframes.pandas.transformations` API without having to write the policy in a yaml file. 10 | 11 | For this example, we will use a mock dataset with the following PII fields: name, age, birthdate, salary and SSN. In order to obfuscate these different fields we will apply the following transformations: 12 | 13 | - `name`: map each name to a unique token (`Tokenizer`). It will give the ability to obfuscate the name while maintaining user count in your dataset. 14 | - `age`: add uniform random noise within the interval of `[-10, 10]` (`NumericPerturbation`). 15 | - `birthdate`: add uniform random noise to year, month and day (`DatePerturbation`). 16 | - `salary`: round each value to nearest 1,000 (`NumericRounding`). 17 | - `SSN`: redact the field from the dataset (`ColumnRedact`). 
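Before turning to the ready-made scripts, here is a minimal sketch of what those five transformations look like when applied directly through the `cape_dataframes.pandas.transformations` API. The tiny DataFrame below is made up for illustration; `experiment_pandas.py`, referenced next, does essentially the same thing using the shared `dataset.py` loader.

```python
# A minimal, illustrative sketch of the transformations listed above
# (mirrors experiment_pandas.py; the toy DataFrame is made up for this example).
import pandas as pd

from cape_dataframes.pandas import dtypes
from cape_dataframes.pandas.transformations import ColumnRedact
from cape_dataframes.pandas.transformations import DatePerturbation
from cape_dataframes.pandas.transformations import NumericPerturbation
from cape_dataframes.pandas.transformations import NumericRounding
from cape_dataframes.pandas.transformations import Tokenizer

df = pd.DataFrame(
    {
        "name": ["alice", "bob"],
        "age": [34, 55],
        "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)],
        "salary": [59234.32, 49324.53],
        "ssn": ["343554334", "656564664"],
    }
)

df["name"] = Tokenizer(max_token_len=10, key=b"my secret")(df["name"])  # unique token per name
df["age"] = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10)(df["age"])  # +/- 10 noise
df["birthdate"] = DatePerturbation(
    frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5)
)(df["birthdate"])  # noisy year/month/day
df["salary"] = NumericRounding(dtype=dtypes.Float, precision=-3)(df["salary"])  # nearest 1,000
df = ColumnRedact(columns="ssn")(df)  # drop the ssn column entirely
print(df.head())
```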
18 | 19 | You can experiment with these transformations on a Pandas DataFrame by running the following script: 20 | 21 | ``` 22 | python experiment_pandas.py 23 | ``` 24 | 25 | You can also experiment with these transformations on a Spark DataFrame with the `cape_dataframes.spark.transformations` API. 26 | 27 | ```sh 28 | python experiment_spark.py 29 | 30 | # submit the script to a Spark cluster 31 | spark-submit experiment_spark.py 32 | ``` 33 | 34 | As you will notice, the `transformations` API is standardized, so you can easily transfer the transformations applied in `Pandas` to `Spark`. 35 | 36 | ## Write your policy 37 | 38 | Once you've identified the masking techniques you'd like to apply, you can define your policy in a `yaml` file. Below, you'll find a sample of the policy corresponding to the transformations applied in `experiment_pandas.py`. You can find the full policy in `mask_personal_information.yaml`. You can select the field with `match`, then define the transformation you'd like to apply under `transform` with the appropriate arguments. The argument names in the policy file match the arguments of the `transformations` API. 39 | 40 | ```yaml 41 | label: masking_policy 42 | version: 1 43 | rules: 44 | - match: 45 | name: name 46 | actions: 47 | - transform: 48 | type: "tokenizer" 49 | max_token_len: 10 50 | key: "my secret" 51 | - match: 52 | name: age 53 | actions: 54 | - transform: 55 | type: "numeric-perturbation" 56 | dtype: Integer 57 | min: -10 58 | max: 10 59 | ``` 60 | 61 | ## Apply the policy to your Spark DataFrame 62 | 63 | You are now ready to apply the policy to your Spark DataFrame. You just need to add two functions to your PySpark job: 64 | - `cape_dataframes.parse_policy`: load and parse the policy defined in the `yaml` file. 65 | - `cape_dataframes.apply_policy`: apply the policy to a DataFrame. 66 | 67 | To mask your data in a Spark job: 68 | 69 | ```sh 70 | spark-submit apply_policy_spark.py 71 | ``` 72 | 73 | The same process works for Pandas too, in case you'd rather test or deploy with a quick prototype. 74 | 75 | ```sh 76 | python apply_policy_pandas.py 77 | ``` 78 | 79 | In your terminal, you should see the data masked! 80 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/apply_policy_pandas.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | 3 | import cape_dataframes as cape 4 | 5 | # Load the Pandas DataFrame 6 | df = load_dataset() 7 | print("Original Dataset:") 8 | print(df.head()) 9 | # Load the privacy policy and apply it to the DataFrame 10 | policy = cape.parse_policy("mask_personal_information.yaml") 11 | df = cape.apply_policy(policy, df) 12 | 13 | print("Masked Dataset:") 14 | print(df.head()) 15 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/apply_policy_spark.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | from pyspark import sql 3 | 4 | import cape_dataframes as cape 5 | 6 | # Set up your SparkSession as usual, but configure it for use with Cape. 7 | # We do this because some transformations expect Arrow to be enabled.
8 | sess = sql.SparkSession.builder.appName( 9 | "cape.tutorial.maskPersonalInformation" 10 | ).getOrCreate() 11 | sess = cape.spark.configure_session(sess) 12 | 13 | # Load a Spark DataFrame 14 | df = load_dataset(sess) 15 | print("Original Dataset:") 16 | print(df.show()) 17 | # Load the privacy policy and apply it to the DataFrame 18 | policy = cape.parse_policy("mask_personal_information.yaml") 19 | df = cape.apply_policy(policy, df) 20 | 21 | print("Masked Dataset:") 22 | print(df.show()) 23 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/dataset.py: -------------------------------------------------------------------------------- 1 | # SKIP_CI 2 | 3 | import pandas as pd 4 | 5 | 6 | def load_dataset(sess=None): 7 | dataset = pd.DataFrame( 8 | { 9 | "name": ["alice", "bob"], 10 | "age": [34, 55], 11 | "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)], 12 | "salary": [59234.32, 49324.53], 13 | "ssn": ["343554334", "656564664"], 14 | } 15 | ) 16 | if sess is not None: 17 | return sess.createDataFrame(dataset) 18 | else: 19 | return dataset 20 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/experiment_pandas.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | 3 | from cape_dataframes.pandas import dtypes 4 | from cape_dataframes.pandas.transformations import ColumnRedact 5 | from cape_dataframes.pandas.transformations import DatePerturbation 6 | from cape_dataframes.pandas.transformations import NumericPerturbation 7 | from cape_dataframes.pandas.transformations import NumericRounding 8 | from cape_dataframes.pandas.transformations import Tokenizer 9 | 10 | # Load Pandas DataFrame 11 | df = load_dataset() 12 | print("Original Dataset:") 13 | print(df.head()) 14 | 15 | # Define the transformations 16 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 17 | perturb_numric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 18 | perturb_date = DatePerturbation( 19 | frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5) 20 | ) 21 | round_numeric = NumericRounding(dtype=dtypes.Float, precision=-3) 22 | redact_column = ColumnRedact(columns="ssn") 23 | 24 | # Apply the transformations 25 | df["name"] = tokenize(df["name"]) 26 | df["age"] = perturb_numric(df["age"]) 27 | df["salary"] = round_numeric(df["salary"]) 28 | df["birthdate"] = perturb_date(df["birthdate"]) 29 | df = redact_column(df) 30 | 31 | print("Masked Dataset:") 32 | print(df.head()) 33 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/experiment_spark.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | from pyspark import sql 3 | from pyspark.sql import functions 4 | 5 | import cape_dataframes as cape 6 | from cape_dataframes.spark import dtypes 7 | from cape_dataframes.spark.transformations import ColumnRedact 8 | from cape_dataframes.spark.transformations import DatePerturbation 9 | from cape_dataframes.spark.transformations import NumericPerturbation 10 | from cape_dataframes.spark.transformations import NumericRounding 11 | from cape_dataframes.spark.transformations import Tokenizer 12 | 13 | # Set up your SparkSession as usual, but configure it for use with Cape. 14 | # We do this because some transformations expect Arrow to be enabled. 
15 | sess = sql.SparkSession.builder.appName( 16 | "cape.tutorial.maskPersonalInformation" 17 | ).getOrCreate() 18 | sess = cape.spark.configure_session(sess) 19 | 20 | # Load Spark DataFrame 21 | df = load_dataset(sess) 22 | print("Original Dataset:") 23 | df.show() 24 | 25 | # Define the transformations 26 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 27 | perturb_numric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 28 | perturb_date = DatePerturbation( 29 | frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5) 30 | ) 31 | round_numeric = NumericRounding(dtype=dtypes.Float, precision=-3) 32 | redact_column = ColumnRedact(columns="ssn") 33 | 34 | # Apply the transformation 35 | df = redact_column(df) 36 | df = df.select( 37 | tokenize(functions.col("name")).alias("name"), 38 | perturb_numric(functions.col("age")).alias("age"), 39 | round_numeric(functions.col("salary")).alias("salary"), 40 | perturb_date(functions.col("birthdate")).alias("birthdate"), 41 | ) 42 | 43 | print("Masked Dataset:") 44 | print(df.show()) 45 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/mask_personal_information.yaml: -------------------------------------------------------------------------------- 1 | label: masking_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "my secret" 11 | - match: 12 | name: age 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: -10 18 | max: 10 19 | - match: 20 | name: salary 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: -3 26 | - match: 27 | name: birthdate 28 | actions: 29 | - transform: 30 | type: "date-perturbation" 31 | frequency: ["YEAR", "MONTH", "DAY"] 32 | min: [-10, -5, -5] 33 | max: [10, 5, 5] 34 | - match: 35 | name: ssn 36 | actions: 37 | - transform: 38 | type: "column-redact" 39 | columns: ["ssn"] 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/README.md: -------------------------------------------------------------------------------- 1 | ## Reversible Tokenizer 2 | 3 | This directory includes an example showcasing how you can use the `ReversibleTokenizer` 4 | to tokenize data in a dataframe, as well as usage of a `TokenReverser` to recover 5 | the original data. -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reversible Tokenizer\n", 8 | "\n", 9 | "Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data within a pandas dataframe.\n", 10 | "\n", 11 | "The `ReversibleTokenizer` will tokenize the input data so it can be used in a privacy preserving manner.\n", 12 | "\n", 13 | "The `ReversibleTokenizer` can be used in conjunction with the `TokenReverser` to recover the original data." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Tokenizing Data\n", 21 | "\n", 22 | "The `ReversibleTokenizer` and `TokenReverser` classes can be found in the `pandas.transformations` package." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 63, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from cape_dataframes.pandas.transformations import ReversibleTokenizer\n", 32 | "from cape_dataframes.pandas.transformations import TokenReverser" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "The `ReversibleTokenizer` and `TokenReverser` classes both take a `key` as input.\n", 40 | "\n", 41 | "For the `TokenReverser` to be able to reverse the tokens produced by the `ReversibleTokenizer`, you must\n", 42 | "use the same key." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 64, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "b'55555555555555555555555555555555'" 54 | ] 55 | }, 56 | "execution_count": 64, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "key=b\"5\" * 32\n", 63 | "key" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In this example, we will simply hide the names within our dataset." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 71, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
name# friends
0Alice100
1Bob200
2Carol300
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " name # friends\n", 125 | "0 Alice 100\n", 126 | "1 Bob 200\n", 127 | "2 Carol 300" 128 | ] 129 | }, 130 | "execution_count": 71, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "import pandas as pd\n", 137 | "plaintext_data = pd.DataFrame({'name': [\"Alice\", \"Bob\", \"Carol\"], \"# friends\": [100, 200, 300]})\n", 138 | "plaintext_data" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "You instantiate a `ReversibleTokenizer` by passing it your key" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 72, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "" 157 | ] 158 | }, 159 | "execution_count": 72, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "tokenizer = ReversibleTokenizer(key=key)\n", 166 | "tokenizer" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Next, we can pass our dataframe to the `tokenizer`" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 73, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
\n", 185 | "\n", 198 | "\n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
name# friends
0c8c7e80144304276183e5bcd589db782bc5ff95309100
1e0f40aea0d5c21b35967c4231b98b5b3e5338e200
27bfcdf25f73a1fe7a7fcb0970976f3393ed5df5ceb300
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " name # friends\n", 228 | "0 c8c7e80144304276183e5bcd589db782bc5ff95309 100\n", 229 | "1 e0f40aea0d5c21b35967c4231b98b5b3e5338e 200\n", 230 | "2 7bfcdf25f73a1fe7a7fcb0970976f3393ed5df5ceb 300" 231 | ] 232 | }, 233 | "execution_count": 73, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "tokenized = pd.DataFrame(plaintext_data)\n", 240 | "tokenized[\"name\"] = tokenizer(plaintext_data[\"name\"])\n", 241 | "tokenized" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Recovering Tokens\n", 249 | "\n", 250 | "If we ever need to reveal the tokenized data, we can use the `TokenReverser` class." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 74, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
\n", 262 | "\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | "
name# friends
0Alice100
1Bob200
2Carol300
\n", 301 | "
" 302 | ], 303 | "text/plain": [ 304 | " name # friends\n", 305 | "0 Alice 100\n", 306 | "1 Bob 200\n", 307 | "2 Carol 300" 308 | ] 309 | }, 310 | "execution_count": 74, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "reverser = TokenReverser(key=key)\n", 317 | "recovered = pd.DataFrame(tokenized)\n", 318 | "recovered[\"name\"] = reverser(tokenized[\"name\"])\n", 319 | "recovered" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "cape-df", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.8.16" 340 | }, 341 | "vscode": { 342 | "interpreter": { 343 | "hash": "2c0eb9acd3ce9f628738cc91d7613a5d048e1a93f709104c9a35d77254cfaaac" 344 | } 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 4 349 | } 350 | -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.py: -------------------------------------------------------------------------------- 1 | # This is a pure python version of the notebook from this directory 2 | 3 | # Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data 4 | # within a pandas dataframe. The `ReversibleTokenizer` will tokenize the input data so 5 | # it can be used in a privacy-preserving manner. The `ReversibleTokenizer` can be used 6 | # in conjunction with the `TokenReverser` to recover the original data. 7 | 8 | import pandas as pd 9 | 10 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 11 | from cape_dataframes.pandas.transformations import TokenReverser 12 | 13 | # The `ReversibleTokenizer` and `TokenReverser` classes both take a `key` as input. 14 | # For the `TokenReverser` to be able to reverse the tokens produced by the 15 | # `ReversibleTokenizer`, you must use the same key. 16 | 17 | key = b"5" * 32 18 | 19 | # In this example, we will simply hide the names within our dataset. 
20 | plaintext_data = pd.DataFrame( 21 | {"name": ["Alice", "Bob", "Carol"], "# friends": [100, 200, 300]} 22 | ) 23 | print("plaintext data") 24 | print(plaintext_data) 25 | print("\n") 26 | 27 | # Tokenization logic 28 | tokenizer = ReversibleTokenizer(key=key) 29 | tokenized = pd.DataFrame(plaintext_data) 30 | tokenized["name"] = tokenizer(plaintext_data["name"]) 31 | 32 | print("tokenized data") 33 | print(tokenized) 34 | print("\n") 35 | 36 | # Reverse the tokenization 37 | reverser = TokenReverser(key=key) 38 | recovered = pd.DataFrame(tokenized) 39 | recovered["name"] = reverser(tokenized["name"]) 40 | 41 | print("reversed tokens") 42 | print(recovered) 43 | print("\n") 44 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | pandas>=1.0 2 | numpy>=1.22 3 | pycryptodome 4 | pyyaml>=5.4 5 | requests 6 | rfc3339 7 | validators 8 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | certifi==2022.12.7 8 | # via requests 9 | charset-normalizer==3.0.1 10 | # via requests 11 | decorator==5.1.1 12 | # via validators 13 | idna==3.4 14 | # via requests 15 | numpy==1.24.2 16 | # via 17 | # -r requirements/base.in 18 | # pandas 19 | pandas==1.3.5 20 | # via -r requirements/base.in 21 | pycryptodome==3.17 22 | # via -r requirements/base.in 23 | python-dateutil==2.8.2 24 | # via pandas 25 | pytz==2022.7.1 26 | # via pandas 27 | pyyaml==6.0 28 | # via -r requirements/base.in 29 | requests==2.28.2 30 | # via -r requirements/base.in 31 | rfc3339==6.2 32 | # via -r requirements/base.in 33 | six==1.16.0 34 | # via python-dateutil 35 | urllib3==1.26.14 36 | # via requests 37 | validators==0.20.0 38 | # via -r requirements/base.in 39 | -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | -c base.txt 2 | black 3 | coverage 4 | flake8 5 | flake8-black 6 | isort 7 | pytest 8 | pytest-cov 9 | pytest-httpserver 10 | responses 11 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | attrs==22.2.0 8 | # via pytest 9 | black==23.1.0 10 | # via 11 | # -r requirements/dev.in 12 | # flake8-black 13 | certifi==2022.12.7 14 | # via 15 | # -c requirements/base.txt 16 | # requests 17 | charset-normalizer==3.0.1 18 | # via 19 | # -c requirements/base.txt 20 | # requests 21 | click==8.1.3 22 | # via black 23 | coverage[toml]==7.1.0 24 | # via 25 | # -r requirements/dev.in 26 | # pytest-cov 27 | exceptiongroup==1.1.0 28 | # via pytest 29 | flake8==5.0.4 30 | # via 31 | # -r requirements/dev.in 32 | # flake8-black 33 | flake8-black==0.3.6 34 | # via -r requirements/dev.in 35 | idna==3.4 36 | # via 37 | # -c requirements/base.txt 38 | # requests 39 | iniconfig==2.0.0 40 | # via pytest 41 | isort==5.11.5 42 | # via -r requirements/dev.in 43 | markupsafe==2.1.2 44 | # via werkzeug 45 | mccabe==0.7.0 46 | # via
flake8 47 | mypy-extensions==1.0.0 48 | # via black 49 | packaging==23.0 50 | # via 51 | # black 52 | # pytest 53 | pathspec==0.11.0 54 | # via black 55 | platformdirs==3.0.0 56 | # via black 57 | pluggy==1.0.0 58 | # via pytest 59 | pycodestyle==2.9.1 60 | # via flake8 61 | pyflakes==2.5.0 62 | # via flake8 63 | pytest==7.2.1 64 | # via 65 | # -r requirements/dev.in 66 | # pytest-cov 67 | pytest-cov==4.0.0 68 | # via -r requirements/dev.in 69 | pytest-httpserver==1.0.6 70 | # via -r requirements/dev.in 71 | requests==2.28.2 72 | # via 73 | # -c requirements/base.txt 74 | # responses 75 | responses==0.22.0 76 | # via -r requirements/dev.in 77 | toml==0.10.2 78 | # via responses 79 | tomli==2.0.1 80 | # via 81 | # black 82 | # coverage 83 | # flake8-black 84 | # pytest 85 | types-toml==0.10.8.4 86 | # via responses 87 | typing-extensions==4.5.0 88 | # via black 89 | urllib3==1.26.14 90 | # via 91 | # -c requirements/base.txt 92 | # requests 93 | # responses 94 | werkzeug==2.2.3 95 | # via pytest-httpserver 96 | -------------------------------------------------------------------------------- /requirements/spark.in: -------------------------------------------------------------------------------- 1 | -c base.txt 2 | packaging 3 | pyarrow 4 | pyspark[sql]>=3.2.2 5 | -------------------------------------------------------------------------------- /requirements/spark.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | numpy==1.24.2 8 | # via 9 | # -c requirements/base.txt 10 | # pandas 11 | # pyarrow 12 | packaging==23.0 13 | # via -r requirements/spark.in 14 | pandas==1.3.5 15 | # via 16 | # -c requirements/base.txt 17 | # pyspark 18 | py4j==0.10.9.5 19 | # via pyspark 20 | pyarrow==11.0.0 21 | # via 22 | # -r requirements/spark.in 23 | # pyspark 24 | pyspark[sql]==3.3.1 25 | # via -r requirements/spark.in 26 | python-dateutil==2.8.2 27 | # via 28 | # -c requirements/base.txt 29 | # pandas 30 | pytz==2022.7.1 31 | # via 32 | # -c requirements/base.txt 33 | # pandas 34 | six==1.16.0 35 | # via 36 | # -c requirements/base.txt 37 | # python-dateutil 38 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:isort] 2 | line_length=88 3 | force_single_line=True 4 | 5 | [flake8] 6 | max-line-length=88 7 | extend-ignore= 8 | E203 9 | D10,D20,D40 10 | exclude=cape/connector/proto 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Installing with setuptools.""" 2 | import setuptools 3 | 4 | with open("README.md", "r", encoding="utf8") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="cape-dataframes", 9 | version="0.3.1", 10 | packages=setuptools.find_packages(), 11 | python_requires=">=3.6", 12 | license="Apache License 2.0", 13 | url="https://github.com/capeprivacy/cape-dataframes", 14 | description="Cape manages secure access to all of your data.", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | author="Cape Privacy", 18 | author_email="contact@capeprivacy.com", 19 | install_requires=[ 20 | "pandas", 21 | "pycryptodome", 22 | "pyyaml", 23 | "requests", 24 | "rfc3339", 25 | ], 26 | 
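    # Spark support is published as an optional extra; given the package name above,
    # it can be installed with, e.g., `pip install cape-dataframes[spark]`.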
extras_require={ 27 | "spark": ["pyspark >=2.4", "pyarrow >=0.15.1"], 28 | }, 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: Apache Software License", 32 | "Development Status :: 3 - Alpha", 33 | "Operating System :: OS Independent", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 35 | "Topic :: Security :: Cryptography", 36 | ], 37 | ) 38 | --------------------------------------------------------------------------------