├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── examples.yml │ ├── main.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── cape_dataframes ├── __init__.py ├── audit │ ├── __init__.py │ └── audit.py ├── coordinator │ ├── __init__.py │ ├── auth │ │ ├── __init__.py │ │ ├── api_token.py │ │ └── api_token_test.py │ ├── client.py │ └── client_test.py ├── pandas │ ├── __init__.py │ ├── dtypes.py │ ├── registry.py │ ├── registry_test.py │ ├── transformations │ │ ├── __init__.py │ │ ├── base.py │ │ ├── column_redact.py │ │ ├── column_redact_test.py │ │ ├── perturbation.py │ │ ├── perturbation_test.py │ │ ├── rounding.py │ │ ├── rounding_test.py │ │ ├── row_redact.py │ │ ├── row_redact_test.py │ │ ├── test_utils.py │ │ ├── tokenizer.py │ │ └── tokenizer_test.py │ └── transformer.py ├── policy │ ├── __init__.py │ ├── data.py │ ├── data_test.py │ ├── exceptions.py │ ├── policy.py │ ├── policy_test.py │ └── policy_test_fixtures.py ├── spark │ ├── __init__.py │ ├── dtypes.py │ ├── registry.py │ ├── registry_test.py │ ├── transformations │ │ ├── __init__.py │ │ ├── base.py │ │ ├── perturbation.py │ │ ├── perturbation_test.py │ │ ├── redaction.py │ │ ├── redaction_test.py │ │ ├── rounding.py │ │ ├── rounding_test.py │ │ ├── tokenizer.py │ │ └── tokenizer_test.py │ ├── transformer.py │ └── utils.py └── utils │ ├── __init__.py │ ├── base64.py │ ├── base64_test.py │ ├── typecheck.py │ └── typecheck_test.py ├── codecov.yml ├── docs ├── README.md ├── policies.md ├── quickstart.md ├── redactions.md ├── transformations.md └── tutorials │ └── reversible-tokenization.md ├── examples ├── notebooks │ ├── Cape Policy for Pandas - IoT Example.ipynb │ ├── Cape Policy for Spark - IoT Example.ipynb │ ├── Cape Python with Pandas - IoT Exploratory Data Analysis.ipynb │ ├── Cape Python with PySpark - Taxi Dataset.ipynb │ └── README.md ├── policy │ ├── iot_example_policy.yaml │ ├── mask_personal_information.yaml │ ├── nyc_taxi_dataset_policy.yaml │ ├── perturb_value_field.yaml │ └── spark_round.yaml ├── simple_transformation.py ├── spark_example.py └── tutorials │ ├── credit │ ├── README.md │ ├── apply_policy_spark.py │ ├── data │ │ └── credit_with_pii.csv │ ├── mask_credit_data_in_pandas.ipynb │ └── policy │ │ └── credit_policy.yaml │ ├── quickstart │ ├── README.md │ ├── apply_policy_pandas.py │ ├── apply_policy_spark.py │ ├── dataset.py │ ├── experiment_pandas.py │ ├── experiment_spark.py │ └── mask_personal_information.yaml │ └── reversible_tokenizer │ ├── README.md │ ├── reversible_tokenizer_pandas.ipynb │ └── reversible_tokenizer_pandas.py ├── requirements ├── base.in ├── base.txt ├── dev.in ├── dev.txt ├── spark.in └── spark.txt ├── setup.cfg └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit= 3 | cape_dataframes/spark/examples/* 4 | *test.py 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior, including any code you can share. 
15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Desktop (please complete the following information):** 23 | - OS: [e.g. macOS, Linux] 24 | - OS Version [e.g. 22] 25 | - Python Version 26 | - Installed pip packages 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/examples.yml: -------------------------------------------------------------------------------- 1 | name: Main 2 | on: [push] 3 | jobs: 4 | build: 5 | name: Examples 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.8, 3.9, "3.10"] 10 | steps: 11 | - name: Setup python ${{ matrix.python-version }} Environment 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: ${{ matrix.python-version }} 15 | - name: Check out repository 16 | uses: actions/checkout@v2 17 | - name: Run Make Examples 18 | run: | 19 | make bootstrap 20 | make examples -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Main 2 | on: [push] 3 | jobs: 4 | build: 5 | name: Test 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.8, 3.9, "3.10"] 10 | steps: 11 | - name: Setup python ${{ matrix.python-version }} Environment 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: ${{ matrix.python-version }} 15 | - name: Check out repository 16 | uses: actions/checkout@v2 17 | - name: Cache pip 18 | uses: actions/cache@v2 19 | with: 20 | # This path is specific to Ubuntu 21 | path: ~/.cache/pip 22 | # Look to see if there is a cache hit for the corresponding requirements file 23 | key: ${{ runner.os }}-${{ matrix.python-version }}-pip-${{ hashFiles('requirements/base.txt', 'requirements/spark.txt') }} 24 | restore-keys: | 25 | ${{ runner.os }}-${{ matrix.python-version }}-pip- 26 | ${{ runner.os }}- 27 | - name: Install All Dependencies 28 | run: | 29 | make pydep 30 | - name: Run CI 31 | run: | 32 | make ci 33 | - if: matrix.python-version == 3.9 34 | name: Docker build 35 | run: | 36 | make docker 37 | - if: matrix.python-version == 3.9 38 | name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v1.0.7 40 | with: 41 | token: ${{ secrets.CODECOV_TOKEN }} 42 | file: coverage.xml 43 | flags: unittests 44 | 
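The Main workflow above boils down to two Makefile targets, so a failing check can usually be reproduced locally before pushing. A minimal sketch of the equivalent local run, assuming a supported Python version (3.8, 3.9, or 3.10) and GNU Make are installed:

```sh
# Same as the workflow's "Install All Dependencies" step:
# installs the base, spark, and dev requirements.
make pydep

# Same as the "Run CI" step: runs lint, the test suite, and the coverage check.
make ci
```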
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | python: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | docker: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v2 31 | - uses: docker/build-push-action@v1 32 | name: Build and Push Docker 33 | with: 34 | username: ${{ secrets.DOCKER_USERNAME }} 35 | password: ${{ secrets.DOCKER_PASSWORD }} 36 | repository: capeprivacy/cape-python 37 | tag_with_ref: true 38 | - name: Docker Hub Description 39 | uses: peter-evans/dockerhub-description@v2 40 | env: 41 | DOCKERHUB_USERNAME: ${{ secrets.DOCKER_USERNAME }} 42 | DOCKERHUB_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 43 | DOCKERHUB_REPOSITORY: capeprivacy/cape-python 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | spark/data/*.csv 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | .idea/ 134 | .vscode/ 135 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/CHANGELOG.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guide 2 | 3 | Contributions are more than welcome and we're always looking for use cases and feature ideas! 4 | 5 | This document helps you get started on: 6 | 7 | - [Submitting a pull request](#submitting-a-pull-request) 8 | - [Writing documentation](#writing-documentation) 9 | - [Useful tricks](#useful-tricks) 10 | - [Reporting a bug](#reporting-a-bug) 11 | - [Asking for help](#asking-for-help) 12 | 13 | 14 | ## Submitting a pull request 15 | 16 | To contribute, [fork](https://help.github.com/articles/fork-a-repo/) Cape Python, commit your changes, and [open a pull request](https://help.github.com/articles/using-pull-requests/). 17 | 18 | While you may be asked to make changes to your submission during the review process, we will work with you on this and suggest changes. Consider giving us [push rights to your branch](https://help.github.com/articles/allowing-changes-to-a-pull-request-branch-created-from-a-fork/) so we can potentially also help via commits. 19 | 20 | ### Commit history and merging 21 | 22 | For the sake of transparency our key rule is to keep a logical and intelligible commit history, meaning anyone stepping through the commits on either the `master` branch or as part of a review should be able to easily follow the changes made and their potential implications. 23 | 24 | To this end we ask all contributors to sanitize pull requests before submitting them. All pull requests will either be [squashed or rebased](https://help.github.com/en/articles/about-pull-request-merges). 25 | 26 | Some guidelines: 27 | 28 | - Even simple code changes such as moving code around can obscure semantic changes, and in those case there should be two commits: for example, one that only moves code (with a note of this in the commit description) and one that performs the semantic change. 29 | 30 | - Progressions that have no logical justification for being split into several commits should be squeezed. 31 | 32 | - Code does not have to compile or pass all tests at each commit, but leave a remark and a plan in the commit description so reviewers are aware and can plan accordingly. 33 | 34 | See below for some [useful tricks](#git-and-github) for working with Git and GitHub. 35 | 36 | ### Before submitting for review 37 | 38 | Make sure to give some context and overview in the body of your pull request to make it easier for reviewers to understand your changes. 
Ideally explain why your particular changes were made the way they are. 39 | 40 | Importantly, use [keywords](https://help.github.com/en/articles/closing-issues-using-keywords) such as `Closes #` to indicate any issues or other pull requests related to your work. 41 | 42 | Furthermore: 43 | 44 | - Run tests (`make test`) and linting (`make lint`) before submitting as our [CI](#continuous-integration) will block pull requests failing either check 45 | - Test your change thoroughly with unit tests where appropriate 46 | - Update any affected docstrings in the code base 47 | - Add a line in [CHANGELOG.md](CHANGELOG.md) for any major change 48 | 49 | ## Continuous integration 50 | 51 | All pull requests are run against our [continuous integration suite](https://github.com/capeprivacy/cape-python/actions). The entire suite must pass before a pull request is accepted. 52 | 53 | ## Writing documentation 54 | 55 | Ensure you add docstrings where necessary. We use [Google's style](https://github.com/google/styleguide/blob/gh-pages/pyguide.md). 56 | 57 | The documentation site is managed in the [documentation repository](https://github.com/capeprivacy/documentation). 58 | 59 | ## Useful tricks 60 | 61 | ### git and GitHub 62 | 63 | - [GitHub Desktop](https://desktop.github.com/) provides a useful interface for inspecting and committing code changes 64 | - `git add -p` 65 | - lets you leave out some changes in a file (GitHub Desktop can be used for this as well) 66 | - `git commit --amend` 67 | - allows you to add to the previous commit instead of creating a new one 68 | - `git rebase -i <commit>` 69 | - allows you to [squeeze and reorder commits](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History) 70 | - use `HEAD~5` to consider 5 most recent commits 71 | - use `<commit>~1` to start from commit identified by `<commit>` 72 | - `git rebase master` 73 | - [pull in latest updates](https://git-scm.com/book/en/v2/Git-Branching-Rebasing) on `master` 74 | - `git fetch --no-tags <remote repo> <remote branch>:<local branch>` 75 | - pulls down a remote branch from e.g. a fork and makes it available to check out as a local branch 76 | - `<remote repo>` is e.g. `git@github.com:<user>/cape-dataframes.git` 77 | - `git push <remote repo> <local branch>:<remote branch>` 78 | - pushes the local branch to a remote branch on e.g. a fork 79 | - `<remote repo>` is e.g. `git@github.com:<user>/cape-dataframes.git` 80 | - `git tag -d <tag name> && git push origin :refs/tags/<tag name>` 81 | - can be used to delete a tag remotely 82 | 83 | ## Reporting a bug 84 | 85 | Please file [bug reports](https://github.com/capeprivacy/cape-python/issues/new?template=bug_report.md) as GitHub issues. 86 | 87 | ### Security disclosures 88 | 89 | If you encounter a security issue then please responsibly disclose it by reaching out to us at [privacy@capeprivacy.com](mailto:privacy@capeprivacy.com). We will work with you to mitigate the issue and responsibly disclose it to anyone using the project in a timely manner. 90 | 91 | ## Asking for help 92 | 93 | If you have any questions you are more than welcome to reach out through GitHub issues or [our Slack channel](https://join.slack.com/t/capecommunity/shared_invite/zt-f8jeskkm-r9_FD0o4LkuQqhJSa~~IQA). -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-buster 2 | 3 | RUN apt-get update && apt-get install -y build-essential 4 | COPY Makefile setup.py README.md ./ 5 | RUN mkdir requirements 6 | COPY requirements/base.txt requirements/spark.txt ./requirements/ 7 | 8 | RUN make bootstrap 9 | 10 | COPY . .
11 | 12 | RUN pip install . 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Global Variables used across many different rule types 2 | 3 | # Definition of the default rule 4 | all: test 5 | .PHONY: all 6 | 7 | # ############################################### 8 | # Bootstrapping 9 | # 10 | # Rules for bootstrapping the Makefile such as checking for docker, python versions, etc. 11 | # ############################################### 12 | DOCKER_REQUIRED_VERSION=18. 13 | SHELL := /bin/bash 14 | 15 | CURRENT_DIR=$(shell pwd) 16 | PIP_PATH=$(shell which pip) 17 | DOCKER_PATH=$(shell which docker) 18 | 19 | # Default platform 20 | # PYPI doesn't allow linux build tags to be pushed and doesn't support 21 | # specific operating systems such a ubuntu. It only allows build tags for linux 22 | # to be pushed as manylinux. 23 | DEFAULT_PLATFORM=manylinux1_x86_64 24 | 25 | dockercheck: 26 | ifeq (,$(DOCKER_PATH)) 27 | ifeq (,$(findstring $(DOCKER_REQUIRED_VERSION),$(shell docker version))) 28 | ifeq (,$(BYPASS_DOCKER_CHECK)) 29 | $(error "Docker version $(DOCKER_REQUIRED_VERSION) is required.") 30 | endif 31 | endif 32 | endif 33 | 34 | pydep-upgrade: 35 | pip install -U pip-tools 36 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/base.txt requirements/base.in --resolver=backtracking 37 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/spark.txt requirements/spark.in --resolver=backtracking 38 | CUSTOM_COMPILE_COMMAND="make pydep-upgrade" pip-compile --output-file requirements/dev.txt requirements/dev.in --resolver=backtracking 39 | pip install -r requirements/base.txt -r requirements/spark.txt -r requirements/dev.txt 40 | 41 | 42 | pydep: 43 | pip install -r requirements/base.txt -r requirements/spark.txt -r requirements/dev.txt 44 | 45 | bootstrap: 46 | pip install -U pip setuptools 47 | pip install -r requirements/base.txt -r requirements/spark.txt 48 | pip install -e . 
49 | 50 | # ############################################### 51 | # Testing and Linting 52 | # 53 | # Rules for running our tests and for running various different linters 54 | # ############################################### 55 | test: 56 | pytest 57 | 58 | CI_FILES=cape_dataframes/pandas cape_dataframes/spark cape_dataframes/policy cape_dataframes/coordinator 59 | 60 | lint: 61 | flake8 . 62 | 63 | ci: lint test coverage 64 | 65 | fmt: 66 | isort --atomic . 67 | black . 68 | 69 | coverage: 70 | pytest --cov-report=xml --cov=cape_dataframes ${CI_FILES} 71 | coverage report --fail-under=90 72 | 73 | examples: 74 | shopt -s nullglob; \ 75 | for dir in examples examples/tutorials; do \ 76 | pushd $$dir; \ 77 | for i in *.py; do \ 78 | line=$$(head -n 1 $$i); \ 79 | if [[ $$line == "# SKIP_CI" ]]; then \ 80 | continue; \ 81 | fi; \ 82 | echo "Running $$i"; \ 83 | python $$i || exit 1; \ 84 | done; \ 85 | popd; \ 86 | done; 87 | 88 | docker: 89 | docker build -t capeprivacy/cape-dataframes . 90 | 91 | .PHONY: lint fmt test coverage examples 92 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Cape Python 2 | 3 | All contributions by Cape Privacy: 4 | Copyright (c) 2020, Cape, Inc. 5 | All rights reserved. 6 | 7 | All other contributions: 8 | Copyright (c) 2020, the respective contributors. 9 | All rights reserved. 10 | 11 | This project includes software developed by Cape Privacy, Inc (https://capeprivacy.com/). 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes 2 | 3 | [![](https://github.com/capeprivacy/cape-dataframes/workflows/Main/badge.svg)](https://github.com/capeprivacy/cape-dataframes/actions/workflows/main.yml) 4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | [![codecov](https://codecov.io/gh/capeprivacy/cape-python/branch/master/graph/badge.svg?token=L9A8HFAJK5)](https://codecov.io/gh/capeprivacy/cape-python) 6 | [![PyPI version](https://badge.fury.io/py/cape-privacy.svg)](https://badge.fury.io/py/cape-privacy) 7 | [![Cape Community Discord](https://img.shields.io/discord/1027271440061435975)](https://discord.gg/nQW7YxUYjh) 8 | 9 | A Python library supporting data transformations and collaborative privacy policies, for data science projects in Pandas and Apache Spark 10 | 11 | See below for instructions on how to get started or visit the [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 12 | 13 | ## Getting started 14 | 15 | ### Prerequisites 16 | 17 | * Python 3.6 or above, and pip 18 | * Pandas 1.0+ 19 | * PySpark 3.0+ (if using Spark) 20 | * [Make](https://www.gnu.org/software/make/) (if installing from source) 21 | 22 | ### Install with pip 23 | 24 | Cape Dataframes is available through PyPi. 25 | 26 | ```sh 27 | pip install cape-dataframes 28 | ``` 29 | 30 | Support for Apache Spark is optional. If you plan on using the library together with Apache Spark, we suggest the following instead: 31 | 32 | ```sh 33 | pip install cape-dataframes[spark] 34 | ``` 35 | 36 | We recommend running it in a virtual environment, such as [venv](https://docs.python.org/3/library/venv.html). 37 | 38 | ### Install from source 39 | 40 | It is possible to install the library from source. 
This installs all dependencies, including Apache Spark: 41 | 42 | ```sh 43 | git clone https://github.com/capeprivacy/cape-dataframes.git 44 | cd cape-dataframes 45 | make bootstrap 46 | ``` 47 | ### Usage example 48 | 49 | *This example is an abridged version of the tutorial found [here](https://github.com/capeprivacy/cape-dataframes/tree/master/examples/tutorials)* 50 | 51 | 52 | ```python 53 | df = pd.DataFrame({ 54 | "name": ["alice", "bob"], 55 | "age": [34, 55], 56 | "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)], 57 | }) 58 | 59 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 60 | perturb_numeric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 61 | 62 | df["name"] = tokenize(df["name"]) 63 | df["age"] = perturb_numeric(df["age"]) 64 | 65 | print(df.head()) 66 | # >> 67 | # name age birthdate 68 | # 0 f42c2f1964 34 1985-02-23 69 | # 1 2e586494b2 63 1963-05-10 70 | ``` 71 | 72 | These steps can be saved in policy files so you can share them and collaborate with your team: 73 | 74 | ```yaml 75 | # my-policy.yaml 76 | label: my-policy 77 | version: 1 78 | rules: 79 | - match: 80 | name: age 81 | actions: 82 | - transform: 83 | type: numeric-perturbation 84 | dtype: Integer 85 | min: -10 86 | max: 10 87 | seed: 4984 88 | - match: 89 | name: name 90 | actions: 91 | - transform: 92 | type: tokenizer 93 | max_token_len: 10 94 | key: my secret 95 | ``` 96 | 97 | You can then load this policy and apply it to your data frame: 98 | 99 | ```python 100 | # df can be a Pandas or Spark data frame 101 | policy = cape.parse_policy("my-policy.yaml") 102 | df = cape.apply_policy(policy, df) 103 | 104 | print(df.head()) 105 | # >> 106 | # name age birthdate 107 | # 0 f42c2f1964 34 1985-02-23 108 | # 1 2e586494b2 63 1963-05-10 109 | ``` 110 | 111 | You can see more [examples and usage](https://github.com/capeprivacy/cape-dataframes/tree/master/examples/) or read our [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 112 | 113 | ## About Cape Privacy and Cape Dataframes 114 | 115 | [Cape Privacy](https://capeprivacy.com) empowers developers to easily encrypt data and process it confidentially. No cryptography or key management required. Learn more at [capeprivacy.com](https://capeprivacy.com). 116 | 117 | Cape Dataframes brings Cape's policy language to Pandas and Apache Spark. The supported techniques include tokenization with linkability as well as perturbation and rounding. You can experiment with these techniques programmatically, in Python or in human-readable policy files. 118 | 119 | ### Project status and roadmap 120 | 121 | Cape Python 0.1.1 was released 24th June 2020. It is actively maintained and developed, alongside other elements of the Cape ecosystem. 122 | 123 | **Upcoming features:** 124 | 125 | * Reversible tokenization: allow reversing of tokenization to reveal the raw value. 126 | * Expand pipeline integrations: add Apache Beam, Apache Flink, Apache Arrow Flight or Dask integration as another pipeline we can support, either as part of Cape Dataframes or in its own separate project. 127 | 128 | ## Help and resources 129 | 130 | If you need help using Cape Dataframes, you can: 131 | 132 | * View the [documentation](https://github.com/capeprivacy/cape-dataframes/tree/master/docs/). 133 | * Submit an issue.
134 | * Talk to us on the [Cape Community Discord](https://discord.gg/nQW7YxUYjh) [![Cape Community Discord](https://img.shields.io/discord/1027271440061435975)](https://discord.gg/nQW7YxUYjh) 135 | 136 | Please file [feature requests](https://github.com/capeprivacy/cape-dataframes/issues/new?template=feature_request.md) and 137 | [bug reports](https://github.com/capeprivacy/cape-dataframes/issues/new?template=bug_report.md) as GitHub issues. 138 | 139 | ### Contributing 140 | 141 | View our [contributing](CONTRIBUTING.md) guide for more information. 142 | 143 | ### Code of conduct 144 | 145 | Our [code of conduct](https://capeprivacy.com/conduct/) is included on the Cape Privacy website. All community members are expected to follow it. Please refer to that page for information on how to report problems. 146 | 147 | ## License 148 | 149 | Licensed under Apache License, Version 2.0 (see [LICENSE](https://github.com/capeprivacy/cape-python/blob/master/LICENSE) or http://www.apache.org/licenses/LICENSE-2.0). Copyright as specified in [NOTICE](https://github.com/capeprivacy/cape-python/blob/master/NOTICE). 150 | -------------------------------------------------------------------------------- /cape_dataframes/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes import pandas 2 | from cape_dataframes import spark 3 | from cape_dataframes.coordinator import Client 4 | from cape_dataframes.policy.policy import apply_policy 5 | from cape_dataframes.policy.policy import parse_policy 6 | 7 | __all__ = ["apply_policy", "pandas", "parse_policy", "spark", "Client"] 8 | -------------------------------------------------------------------------------- /cape_dataframes/audit/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.audit.audit import APPLY_POLICY_EVENT 2 | from cape_dataframes.audit.audit import AuditLogger 3 | 4 | __all__ = ["AuditLogger", "APPLY_POLICY_EVENT"] 5 | -------------------------------------------------------------------------------- /cape_dataframes/audit/audit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | APPLY_POLICY_EVENT = "apply-policy" 4 | 5 | 6 | class AuditLogger: 7 | def audit_log(self, event_name, target_id, target_type, target_label): 8 | logging.info( 9 | f"{event_name}: ID: {target_id} Type: {target_type} Label: {target_label}" 10 | ) 11 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.coordinator.client import Client 2 | 3 | __all__ = ["Client"] 4 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/cape_dataframes/coordinator/auth/__init__.py -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/api_token.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils import base64 2 | 3 | SECRET_BYTES = 16 4 | VERSION = b"\x01" 5 | 6 | 7 | class APIToken: 8 | """Represents an API token used to authenticate with the coordinator. 
9 | 10 | The format is: <token_id>,<base64 string> 11 | 12 | The first byte of the decoded Base64 string is the version and the rest 13 | is the secret. 14 | 15 | Attributes: 16 | token_id: The ID of the token. 17 | version: The version of the token format. 18 | secret: The password used to authenticate. 19 | raw: The raw token string. 20 | """ 21 | 22 | token_id: str 23 | version: bytes 24 | secret: bytes 25 | raw: str 26 | 27 | def __init__(self, token: str): 28 | self.raw = token 29 | splits = token.split(",") 30 | self.token_id = splits[0] 31 | 32 | token_bytes = bytes(base64.from_string(splits[1])) 33 | self.version = token_bytes[0] 34 | self.secret = token_bytes[1:] 35 | 36 | 37 | def create_api_token(token_id: str, secret: bytes) -> APIToken: 38 | """Creates an APIToken. Mostly used for testing. 39 | 40 | Args: 41 | token_id: The token id to use. 42 | secret: The password to use. 43 | 44 | Returns: 45 | The constructed APIToken. 46 | """ 47 | token_bytes = bytes(VERSION) + bytes(secret, "utf-8") 48 | b64 = base64.Base64(token_bytes) 49 | 50 | token = f"{token_id},{b64}" 51 | 52 | return APIToken(token) 53 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/auth/api_token_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.coordinator.auth.api_token import create_api_token 2 | 3 | 4 | def test_api_token(): 5 | token_id = "imatokenid" 6 | secret = "aaaabbbbccccdddd" 7 | token = create_api_token(token_id, secret) 8 | 9 | assert token.token_id == token_id 10 | assert token.secret == bytes(secret, "utf-8") 11 | assert token.version == 1 12 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/client.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any 3 | from typing import Dict 4 | 5 | import requests 6 | import rfc3339 7 | 8 | from cape_dataframes.coordinator.auth.api_token import APIToken 9 | from cape_dataframes.policy import parse_policy 10 | from cape_dataframes.policy.data import Policy 11 | from cape_dataframes.utils import base64 12 | 13 | 14 | class GraphQLError: 15 | """Represents a GraphQL error that can be returned by a coordinator. 16 | 17 | Attributes: 18 | message: The error message. 19 | extensions: Any extra information returned by coordinator. 20 | """ 21 | 22 | message: str 23 | extensions: Dict[str, Any] 24 | 25 | def __init__(self, error): 26 | self.message = error["message"] 27 | 28 | if "extensions" in error: 29 | self.extensions = error["extensions"] 30 | 31 | 32 | class GraphQLException(Exception): 33 | """Exception wrapping a list of GraphQL errors. 34 | 35 | Attributes: 36 | errors: List of GraphQL errors. 37 | """ 38 | 39 | def __init__(self, errors): 40 | self.errors = [GraphQLError(error) for error in errors] 41 | 42 | 43 | class CapeError: 44 | """Represents a Cape error coming from the coordinator. 45 | 46 | Attributes: 47 | messages: A list of error messages 48 | cause: The cause of the error 49 | """ 50 | 51 | def __init__(self, error): 52 | self.messages = error["messages"] 53 | self.cause = error["cause"] 54 | 55 | 56 | class CapeException(Exception): 57 | """Exception wrapping a CapeError. 58 | Attributes: 59 | error: the CapeError 60 | """ 61 | 62 | def __init__(self, error): 63 | self.error = error 64 | 65 | 66 | class Client: 67 | """Coordinator client for making GraphQL requests.
68 | 69 | Implements a simple GraphQL protocol to communicate with a 70 | coordinator. 71 | 72 | Attributes: 73 | host: The address of the coordinator. 74 | token: The token used to authenticate with a coordinator. 75 | """ 76 | 77 | def __init__(self, host: str): 78 | self.host = f"{host}" 79 | self.token: str = "" 80 | 81 | self.s = requests.Session() 82 | 83 | def graphql_request(self, query: str, variables: Dict[str, str]): 84 | """Makes a GraphQL request to a coordinator. 85 | 86 | Adds an authorization header if it exists. 87 | 88 | Arguments: 89 | query: The GraphQL query to be passed to a coordinator. 90 | variables: The variables to be passed to a coordinator. 91 | 92 | Returns: 93 | The coordinator's GraphQL data response. 94 | 95 | Raises: 96 | GraphQLException: If a GraphQL error occurs. 97 | """ 98 | 99 | r = self.s.post( 100 | f"{self.host}/v1/query", 101 | json={"query": query, "variables": variables}, 102 | ) 103 | 104 | # attempt to get json so we can get the errors 105 | # if an error has occurred, if json doesn't exist 106 | # just raise the error 107 | try: 108 | j = r.json() 109 | except ValueError: 110 | r.raise_for_status() 111 | 112 | if "errors" in j: 113 | raise GraphQLException(j["errors"]) 114 | 115 | return j["data"] 116 | 117 | def login(self, token: str): 118 | """Logs in with the given token string""" 119 | 120 | self.api_token = APIToken(token) 121 | 122 | r = self.s.post( 123 | f"{self.host}/v1/login", 124 | json={ 125 | "token_id": self.api_token.token_id, 126 | "secret": str(base64.Base64(self.api_token.secret)), 127 | }, 128 | ) 129 | 130 | # attempt to get json so we can get the errors 131 | # if an error has occurred, if json doesn't exist 132 | # just raise the error 133 | try: 134 | j = r.json() 135 | except ValueError: 136 | r.raise_for_status() 137 | 138 | if "cause" in j: 139 | raise CapeException(j) 140 | 141 | self.token = base64.from_string(j["token"]) 142 | 143 | self.user = self.me() 144 | 145 | return self.token 146 | 147 | def me(self) -> Dict[str, Any]: 148 | """Returns the authenticated identity (id, name, and email).""" 149 | 150 | query = """ 151 | query Me() { 152 | me { 153 | id 154 | name 155 | email 156 | } 157 | } 158 | """ 159 | 160 | res = self.graphql_request(query, None) 161 | 162 | return res["me"] 163 | 164 | def get_policy(self, label: str) -> Policy: 165 | """Returns the current policy for a given project label.""" 166 | 167 | query = """ 168 | query CurrentSpec($label: ModelLabel!) { 169 | project(label: $label) { 170 | current_spec { 171 | id 172 | rules 173 | transformations 174 | } 175 | } 176 | } 177 | """ 178 | 179 | variables = { 180 | "label": label, 181 | } 182 | 183 | res = self.graphql_request(query, variables) 184 | 185 | spec = res["project"]["current_spec"] 186 | spec["label"] = label 187 | 188 | return parse_policy(spec, logger=self) 189 | 190 | def audit_log(self, event_name, target_id, target_type, target_label): 191 | """Records an audit log event with the coordinator.""" 192 | 193 | query = """ 194 | mutation AddAuditLog($audit: AuditEventInput!)
{ 195 | addAuditLog(audit: $audit) { 196 | event_name 197 | } 198 | } 199 | """ 200 | 201 | variables = { 202 | "audit": { 203 | "event_name": event_name, 204 | "user_id": self.user["id"], 205 | "user_name": self.user["name"], 206 | "user_email": self.user["email"], 207 | "time": rfc3339.rfc3339(datetime.now()), 208 | "target_id": target_id, 209 | "target_type": target_type, 210 | "target_label": target_label, 211 | }, 212 | } 213 | 214 | self.graphql_request(query, variables) 215 | 216 | def __repr__(self): 217 | return f"This client is connected to {self.host}" 218 | -------------------------------------------------------------------------------- /cape_dataframes/coordinator/client_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import responses 5 | 6 | from cape_dataframes.audit import APPLY_POLICY_EVENT 7 | from cape_dataframes.coordinator.auth.api_token import create_api_token 8 | from cape_dataframes.coordinator.client import Client 9 | from cape_dataframes.coordinator.client import GraphQLException 10 | from cape_dataframes.policy import parse_policy 11 | 12 | host = "http://localhost:8080" 13 | 14 | 15 | @responses.activate 16 | def test_graphql_error(): 17 | responses.add( 18 | responses.POST, 19 | f"{host}/v1/query", 20 | json={ 21 | "errors": [ 22 | { 23 | "message": "Access denied", 24 | "extensions": { 25 | "cause": { 26 | "name": "authorization_failure", 27 | "category": "unauthorized", 28 | } 29 | }, 30 | } 31 | ] 32 | }, 33 | ) 34 | 35 | c = Client(host) 36 | 37 | with pytest.raises(GraphQLException) as excinfo: 38 | c.me() 39 | 40 | g_err = excinfo.value.errors[0] 41 | assert g_err.message == "Access denied" 42 | assert g_err.extensions == { 43 | "cause": {"name": "authorization_failure", "category": "unauthorized"} 44 | } 45 | 46 | 47 | @responses.activate 48 | def test_login(): 49 | exp_token = "ABCDEFE" 50 | token_id = "specialid" 51 | secret = "secret" 52 | 53 | token = create_api_token(token_id, secret) 54 | 55 | def cb(request): 56 | resp_body = {"token": exp_token} 57 | 58 | return 200, {}, json.dumps(resp_body) 59 | 60 | responses.add_callback(responses.POST, f"{host}/v1/login", cb) 61 | 62 | my_id = "thisisanid" 63 | responses.add( 64 | responses.POST, 65 | f"{host}/v1/query", 66 | json={"data": {"me": {"id": my_id}}}, 67 | ) 68 | 69 | c = Client(host) 70 | 71 | c.login(token.raw) 72 | 73 | assert str(c.token) == exp_token 74 | 75 | 76 | @responses.activate 77 | def test_me(): 78 | my_id = "thisisanid" 79 | responses.add( 80 | responses.POST, 81 | f"{host}/v1/query", 82 | json={"data": {"me": {"id": my_id}}}, 83 | ) 84 | 85 | c = Client(host) 86 | 87 | user = c.me() 88 | 89 | assert my_id == user["id"] 90 | 91 | 92 | @responses.activate 93 | def test_get_policy(): 94 | rules = [ 95 | {"match": {"name": "column"}, "actions": [{"transform": {"name": "plusOne"}}]} 96 | ] 97 | 98 | responses.add( 99 | responses.POST, 100 | f"{host}/v1/query", 101 | json={"data": {"project": {"current_spec": {"rules": rules}}}}, 102 | ) 103 | 104 | c = Client(host) 105 | 106 | policy = c.get_policy("random-project") 107 | 108 | expected = {"label": "random-project", "rules": rules} 109 | 110 | expected = parse_policy(expected) 111 | 112 | assert policy.label == expected.label 113 | assert ( 114 | policy.rules[0].actions[0].transform.field 115 | == expected.rules[0].actions[0].transform.field 116 | ) 117 | 118 | 119 | @responses.activate 120 | def test_audit_log(): 121 | exp_token = "ABCDEFE" 122 | 
token_id = "specialid" 123 | secret = "secret" 124 | 125 | token = create_api_token(token_id, secret) 126 | 127 | def cb(request): 128 | resp_body = {"token": exp_token} 129 | 130 | return 200, {}, json.dumps(resp_body) 131 | 132 | responses.add_callback(responses.POST, f"{host}/v1/login", cb) 133 | 134 | my_id = "thisisanid" 135 | responses.add( 136 | responses.POST, 137 | f"{host}/v1/query", 138 | json={"data": {"me": {"id": my_id, "name": "hey", "email": "yo@yo.com"}}}, 139 | ) 140 | 141 | responses.add( 142 | responses.POST, 143 | f"{host}/v1/query", 144 | json={"data": {"addAuditLog": {"event_name": APPLY_POLICY_EVENT}}}, 145 | ) 146 | 147 | c = Client(host) 148 | 149 | c.login(token.raw) 150 | 151 | c.audit_log(APPLY_POLICY_EVENT, "idididid", "policy", "project-label") 152 | 153 | 154 | def test_client_repr(): 155 | c = Client(host) 156 | 157 | assert c.__repr__() == f"This client is connected to {host}" 158 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas import dtypes 2 | from cape_dataframes.pandas import registry 3 | from cape_dataframes.pandas import transformations 4 | from cape_dataframes.pandas.transformer import transformer 5 | 6 | __all__ = ["dtypes", "transformations", "transformer", "registry"] 7 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/dtypes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | String = pd.api.types.pandas_dtype(str) 5 | Date = pd.api.types.pandas_dtype("datetime64") 6 | # numeric types 7 | Float = pd.api.types.pandas_dtype(np.float32) 8 | Double = pd.api.types.pandas_dtype(np.float64) 9 | Byte = pd.api.types.pandas_dtype(np.byte) 10 | Short = pd.api.types.pandas_dtype(np.short) 11 | Integer = pd.api.types.pandas_dtype(np.int32) 12 | Long = pd.api.types.pandas_dtype(np.int64) 13 | # groupings 14 | Floats = (Float, Double) 15 | Integers = (Byte, Short, Integer, Long) 16 | Numerics = Floats + Integers 17 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/registry.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | 4 | from cape_dataframes.pandas.transformations import ColumnRedact 5 | from cape_dataframes.pandas.transformations import DatePerturbation 6 | from cape_dataframes.pandas.transformations import DateTruncation 7 | from cape_dataframes.pandas.transformations import NumericPerturbation 8 | from cape_dataframes.pandas.transformations import NumericRounding 9 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 10 | from cape_dataframes.pandas.transformations import RowRedact 11 | from cape_dataframes.pandas.transformations import Tokenizer 12 | from cape_dataframes.pandas.transformations import TokenReverser 13 | 14 | TransformationCtor = Callable 15 | 16 | _registry: Dict[str, TransformationCtor] = {} 17 | 18 | 19 | def get(transformation: str) -> TransformationCtor: 20 | """Returns the constructor for the given key. 21 | 22 | Arguments: 23 | transformation: The key of transformation to retrieve. 
24 | """ 25 | return _registry.get(transformation, None) 26 | 27 | 28 | def register(label: str, ctor: TransformationCtor): 29 | """Registers a new transformation constructor under the label provided. 30 | 31 | Arguments: 32 | label: The label that will be used as the key in the registry 33 | ctor: The transformation constructor 34 | """ 35 | _registry[label] = ctor 36 | 37 | 38 | register(DatePerturbation.identifier, DatePerturbation) 39 | register(NumericPerturbation.identifier, NumericPerturbation) 40 | register(NumericRounding.identifier, NumericRounding) 41 | register(Tokenizer.identifier, Tokenizer) 42 | register(DateTruncation.identifier, DateTruncation) 43 | register(ColumnRedact.identifier, ColumnRedact) 44 | register(RowRedact.identifier, RowRedact) 45 | register(TokenReverser.identifier, TokenReverser) 46 | register(ReversibleTokenizer.identifier, ReversibleTokenizer) 47 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/registry_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas import registry 2 | from cape_dataframes.pandas.transformations import test_utils 3 | 4 | 5 | def test_get(): 6 | registry.register("plusN", test_utils.PlusN) 7 | ctor = registry.get("plusN") 8 | args = {"n": 1} 9 | ctor(**args) 10 | 11 | 12 | def test_get_missing(): 13 | ctor = registry.get("plusWhat?") 14 | assert ctor is None 15 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.pandas.transformations.column_redact import ColumnRedact 2 | from cape_dataframes.pandas.transformations.perturbation import DatePerturbation 3 | from cape_dataframes.pandas.transformations.perturbation import NumericPerturbation 4 | from cape_dataframes.pandas.transformations.rounding import DateTruncation 5 | from cape_dataframes.pandas.transformations.rounding import NumericRounding 6 | from cape_dataframes.pandas.transformations.row_redact import RowRedact 7 | from cape_dataframes.pandas.transformations.tokenizer import ReversibleTokenizer 8 | from cape_dataframes.pandas.transformations.tokenizer import Tokenizer 9 | from cape_dataframes.pandas.transformations.tokenizer import TokenReverser 10 | 11 | __all__ = [ 12 | "DateTruncation", 13 | "DatePerturbation", 14 | "NumericPerturbation", 15 | "NumericRounding", 16 | "ReversibleTokenizer", 17 | "Tokenizer", 18 | "TokenReverser", 19 | "ColumnRedact", 20 | "RowRedact", 21 | ] 22 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class AbstractTransformation(metaclass=abc.ABCMeta): 5 | @property 6 | @abc.abstractmethod 7 | def dtype(self): 8 | pass 9 | 10 | @abc.abstractmethod 11 | def __call__(self, x): 12 | pass 13 | 14 | 15 | class Transformation(AbstractTransformation): 16 | def __init__(self, dtype): 17 | self._dtype = dtype 18 | 19 | @property 20 | def dtype(self): 21 | return self._dtype 22 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/column_redact.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | 5 | 6 
| class ColumnRedact: 7 | """Redacts columns. 8 | 9 | Attributes: 10 | columns: The columns to redact. 11 | """ 12 | 13 | identifier = "column-redact" 14 | type_signature = "df->df" 15 | 16 | def __init__(self, columns: List[str]) -> None: 17 | self.columns = columns 18 | 19 | def __call__(self, df: pd.DataFrame) -> pd.DataFrame: 20 | return df.drop(columns=self.columns) 21 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/column_redact_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.pandas.transformations import ColumnRedact 6 | 7 | 8 | def test_column_redact(): 9 | redact = ColumnRedact(["b", "c"]) 10 | 11 | df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"]) 12 | 13 | expected = pd.DataFrame(np.ones((5,)), columns=["a"]) 14 | 15 | result = redact(df) 16 | 17 | pdt.assert_frame_equal(result, expected) 18 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/perturbation.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Optional 3 | from typing import Tuple 4 | from typing import Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from cape_dataframes.pandas import dtypes 10 | from cape_dataframes.pandas.transformations import base 11 | from cape_dataframes.utils import typecheck 12 | 13 | _FREQUENCY_TO_DELTA_FN = { 14 | "YEAR": lambda noise: pd.to_timedelta(noise * 365, unit="days"), 15 | "MONTH": lambda noise: pd.to_timedelta(noise * 30, unit="days"), 16 | "DAY": lambda noise: pd.to_timedelta(noise, unit="days"), 17 | "HOUR": lambda noise: pd.to_timedelta(noise, unit="hours"), 18 | "minutes": lambda noise: pd.to_timedelta(noise, unit="minutes"), 19 | "seconds": lambda noise: pd.to_timedelta(noise, unit="seconds"), 20 | } 21 | IntTuple = Union[int, Tuple[int, ...]] 22 | StrTuple = Union[str, Tuple[str, ...]] 23 | 24 | 25 | class NumericPerturbation(base.Transformation): 26 | """Add uniform random noise to a numeric Pandas series 27 | 28 | Mask a numeric Pandas series by adding uniform random 29 | noise to each value. The amount of noise is drawn from 30 | the interval [min, max). 
31 | 32 | Example: 33 | ``` 34 | s = pd.Series([0, 1, 2, 3, 4]) 35 | perturb = NumericPerturbation(dtype=Integer, min=-10, max=10, seed=123) 36 | perturb(s) # pd.Series([3, -7, -3, -3]) 37 | ``` 38 | 39 | Attributes: 40 | dtype (dtypes.Numerics): Pandas Series type 41 | min (int, float): the values generated will be greater than or equal to min 42 | max (int, float): the values generated will be less than max 43 | seed (int), optional: a seed to initialize the random generator 44 | """ 45 | 46 | identifier = "numeric-perturbation" 47 | type_signature = "col->col" 48 | 49 | def __init__( 50 | self, 51 | dtype: dtypes.Numerics, 52 | min: Union[int, float], 53 | max: Union[int, float], 54 | seed: Optional[int] = None, 55 | ): 56 | assert dtype in dtypes.Numerics 57 | typecheck.check_arg(min, (int, float)) 58 | typecheck.check_arg(max, (int, float)) 59 | typecheck.check_arg(seed, (int, type(None))) 60 | super().__init__(dtype) 61 | self._min = min 62 | self._max = max 63 | self._rng = np.random.default_rng(seed=seed) 64 | 65 | def __call__(self, x: pd.Series) -> pd.Series: 66 | noise = pd.Series(self._rng.uniform(self._min, self._max, size=x.shape)) 67 | if not isinstance(noise.dtype.type, self.dtype.type): 68 | noise = noise.astype(self.dtype) 69 | return x + noise 70 | 71 | 72 | class DatePerturbation(base.Transformation): 73 | """Add uniform random noise to a Pandas series of timestamps 74 | 75 | Mask a Pandas series by adding uniform random noise to the 76 | specified frequencies of timestamps. The amount of noise for 77 | each frequency is drawn from the interval [min_freq, max_freq). 78 | 79 | Example: 80 | ``` 81 | s = pd.Series([datetime.date(year=2020, month=2, day=15)]) 82 | perturb = DatePerturbation(frequency="MONTH", min=-10, max=10, seed=1234) 83 | perturb(s) # pd.Series([datetime.date(year=2020, month=11, day=11)]) 84 | ``` 85 | 86 | Attributes: 87 | frequency (str, str list): one or more frequencies to perturb 88 | min (int, int list): the frequency value will be greater or equal to min 89 | max (int, int list): the frequency value will be less than max 90 | seed (int), optional: a seed to initialize the random generator 91 | """ 92 | 93 | identifier = "date-perturbation" 94 | type_signature = "col->col" 95 | 96 | def __init__( 97 | self, 98 | frequency: StrTuple, 99 | min: IntTuple, 100 | max: IntTuple, 101 | seed: Optional[int] = None, 102 | ): 103 | super().__init__(dtypes.Date) 104 | self._frequency = _check_freq_arg(frequency) 105 | self._min = _check_minmax_arg(min) 106 | self._max = _check_minmax_arg(max) 107 | self._rng = np.random.default_rng(seed) 108 | 109 | def __call__(self, x: pd.Series): 110 | is_date_no_time = False 111 | 112 | # Use equality instead of isinstance because of inheritance 113 | if type(x.iloc[0]) == datetime.date: 114 | x = pd.to_datetime(x) 115 | is_date_no_time = True 116 | 117 | for f, mn, mx in zip(self._frequency, self._min, self._max): 118 | noise = self._rng.integers(mn, mx, size=x.shape) 119 | delta_fn = _FREQUENCY_TO_DELTA_FN.get(f, None) 120 | if delta_fn is None: 121 | raise ValueError( 122 | "Frequency {} must be one of {}.".format( 123 | f, list(_FREQUENCY_TO_DELTA_FN.keys()) 124 | ) 125 | ) 126 | x += delta_fn(noise) 127 | 128 | if is_date_no_time: 129 | return pd.Series(x).dt.date 130 | else: 131 | return x 132 | 133 | 134 | def _check_minmax_arg(arg): 135 | """Checks that arg is an integer or a flat collection of integers.""" 136 | if not isinstance(arg, (tuple, list)): 137 | if not isinstance(arg, int): 138 | raise ValueError
139 | return [arg] 140 | else: 141 | for a in arg: 142 | if not isinstance(a, int): 143 | raise ValueError 144 | return arg 145 | 146 | 147 | def _check_freq_arg(arg): 148 | """Checks that arg in one of the frequency options.""" 149 | if not isinstance(arg, (tuple, list)): 150 | if not isinstance(arg, str): 151 | raise ValueError 152 | return [arg] 153 | else: 154 | for a in arg: 155 | if not isinstance(a, str): 156 | raise ValueError 157 | return arg 158 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/perturbation_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.testing as pdt 6 | 7 | from cape_dataframes.pandas import dtypes 8 | from cape_dataframes.pandas.transformations import DatePerturbation 9 | from cape_dataframes.pandas.transformations import NumericPerturbation 10 | 11 | 12 | def test_perturbation_float(): 13 | transform = NumericPerturbation(dtype=dtypes.Float, min=-10, max=10, seed=1234) 14 | 15 | df = pd.DataFrame({"amount": range(5)}) 16 | expected = pd.DataFrame( 17 | {"amount": [9.53399, -1.39608, 10.46492, -1.76615, 0.38194]} 18 | ) 19 | 20 | df["amount"] = transform(df.amount) 21 | 22 | pdt.assert_frame_equal(df, expected) 23 | 24 | 25 | def test_perturbation_int(): 26 | transform = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10, seed=12345) 27 | 28 | df = pd.DataFrame({"amount": range(5)}) 29 | expected = pd.DataFrame({"amount": [-5, -2, 7, 6, 2]}) 30 | 31 | df["amount"] = transform(df.amount) 32 | 33 | pdt.assert_frame_equal(df, expected) 34 | 35 | 36 | def test_perturbation_datetime(): 37 | transform = DatePerturbation(frequency="DAY", min=-10, max=10, seed=1234) 38 | 39 | df = pd.DataFrame({"date": [np.datetime64("2018-10-15")]}) 40 | expected = pd.DataFrame({"date": [np.datetime64("2018-10-24")]}) 41 | 42 | df["date"] = transform(df.date) 43 | 44 | pdt.assert_frame_equal(df, expected) 45 | 46 | 47 | def test_perturbation_date(): 48 | transform = DatePerturbation(frequency="DAY", min=-10, max=10, seed=1234) 49 | 50 | df = pd.DataFrame({"date": [datetime.date(year=2018, month=10, day=15)]}) 51 | expected = pd.DataFrame({"date": [datetime.date(year=2018, month=10, day=24)]}) 52 | 53 | df["date"] = transform(df.date) 54 | 55 | pdt.assert_frame_equal(df, expected) 56 | 57 | 58 | def test_perturbation_dat_mutliple_freq(): 59 | transform = DatePerturbation( 60 | frequency=("DAY", "YEAR"), min=(-10, -5), max=(10, 5), seed=1234 61 | ) 62 | 63 | df = pd.DataFrame({"date": [np.datetime64("2018-10-15")]}) 64 | expected = pd.DataFrame({"date": [np.datetime64("2022-10-23")]}) 65 | 66 | df["date"] = transform(df.date) 67 | 68 | pdt.assert_frame_equal(df, expected) 69 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/rounding.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pandas as pd 4 | 5 | from cape_dataframes.pandas import dtypes 6 | from cape_dataframes.pandas.transformations import base 7 | from cape_dataframes.utils import typecheck 8 | 9 | 10 | class NumericRounding(base.Transformation): 11 | """Reduce the precision of a numeric Pandas Series 12 | 13 | Round each value in the Pandas Series to the given number 14 | of digits. 
15 | 16 | Example: 17 | ``` 18 | s = pd.Series([1.384]) 19 | round = NumericRounding(precision=1) 20 | round(s) # pd.Series([1.4]) 21 | ``` 22 | 23 | Attributes: 24 | dtypes (dtypes.Numerics): Pandas Series type. 25 | precision (int): set the number of digits. 26 | """ 27 | 28 | identifier = "numeric-rounding" 29 | type_signature = "col->col" 30 | 31 | def __init__(self, dtype: dtypes.Numerics, precision: int): 32 | if dtype not in dtypes.Numerics: 33 | raise ValueError("NumericRounding requires a Numeric dtype.") 34 | typecheck.check_arg(precision, int) 35 | super().__init__(dtype) 36 | self._precision = precision 37 | 38 | def __call__(self, x: pd.Series) -> pd.Series: 39 | """Round each value in the Pandas Series 40 | 41 | Args: 42 | x (A Pandas Series): need to be a list of numeric values. 43 | 44 | Return: 45 | A Pandas Series with each value rounded 46 | """ 47 | return self.round_numeric(x) 48 | 49 | def round_numeric(self, x: pd.Series): 50 | rounded = x.round(self._precision) 51 | if isinstance(rounded.dtype.type, self.dtype.type): 52 | return rounded 53 | else: 54 | return rounded.astype(self.dtype) 55 | 56 | 57 | class DateTruncation(base.Transformation): 58 | """Reduce the precision of a date Pandas Series 59 | Truncate each date in a Pandas Series to the unit (year 60 | or month) specified by frequency. 61 | Example: 62 | ``` 63 | s = pd.Series([pd.Timestamp("2018-10-15")]) 64 | trunc = DateTruncation(frequency="year") 65 | trunc(s) # pd.Serie([pd.Timestamp("2018-01-01")]) 66 | ``` 67 | Attributes: 68 | frequency (string): expect to be 'year' or 'month' 69 | """ 70 | 71 | identifier = "date-truncation" 72 | type_signature = "col->col" 73 | 74 | def __init__(self, frequency: str): 75 | typecheck.check_arg(frequency, str) 76 | super().__init__(dtypes.Date) 77 | self._frequency = frequency.lower() 78 | _check_freq_arg(self._frequency) 79 | 80 | def __call__(self, x: pd.Series) -> pd.Series: 81 | return self._trunc_date(x) 82 | 83 | def _trunc_date(self, x: pd.Series) -> pd.Series: 84 | if self._frequency == "year": 85 | truncated = x.values.astype(" None: 15 | self.condition = condition 16 | 17 | def __call__(self, df: pd.DataFrame) -> pd.DataFrame: 18 | """Redacts rows using Dataframe.query. 19 | 20 | DataFrame.query returns all the fields that it matches so 21 | we negate it here to get the opposite. 22 | """ 23 | 24 | condition = f"~({self.condition})" 25 | return df.query(condition) 26 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/row_redact_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.pandas.transformations import RowRedact 6 | 7 | 8 | def test_row_redact(): 9 | redact = RowRedact("a > 5") 10 | 11 | df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"]) 12 | 13 | df["a"].iloc[0] = 6 14 | df["a"].iloc[3] = 6 15 | 16 | expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"], index=[1, 2, 4]) 17 | 18 | result = redact(df) 19 | 20 | pdt.assert_frame_equal(result, expected) 21 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/test_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class PlusN: 5 | """A sample transform that adds n to a specific field. 
6 | 7 | Attributes: 8 | field: The field that this transform will be applied to. 9 | n: The value to add to the field. 10 | """ 11 | 12 | identifier = "plusN" 13 | type_signature = "col->col" 14 | 15 | def __init__(self, n: int = 1) -> None: 16 | self.n = n 17 | 18 | def __call__(self, column: pd.Series) -> pd.Series: 19 | return column + self.n 20 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/tokenizer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import secrets 3 | 4 | import pandas as pd 5 | from Crypto.Cipher import AES 6 | 7 | from cape_dataframes.pandas import dtypes 8 | from cape_dataframes.pandas.transformations import base 9 | from cape_dataframes.utils import typecheck 10 | 11 | 12 | class Tokenizer(base.Transformation): 13 | """Tokenizer: map a string to a token to obfuscate it. 14 | 15 | When applying the Tokenizer to a Pandas Series of type string, 16 | each value gets mapped to a token (hexadecimal string). 17 | If a value is repeated several times across the series, it always 18 | gets mapped to the same token in order to maintain the count. 19 | A value can be mapped to different tokens by setting the key to a 20 | different value. 21 | 22 | Example: 23 | ``` 24 | s = pd.Series(['A']) 25 | tokenize = Tokenizer(max_token_len=5, key='secret') 26 | tokenize(s) # pd.Series(['40a1e']) 27 | ``` 28 | 29 | Attributes: 30 | max_token_len (int, optional): controls the token length (default 31 | length is 64) 32 | key: expect a string or byte string. If not specified, key will 33 | be set to a random byte string. 34 | """ 35 | 36 | identifier = "tokenizer" 37 | type_signature = "col->col" 38 | 39 | def __init__(self, max_token_len: int = None, key: str = None): 40 | typecheck.check_arg(max_token_len, (int, type(None))) 41 | typecheck.check_arg(key, (str, bytes, type(None))) 42 | super().__init__(dtype=dtypes.String) 43 | self._max_token_len = max_token_len 44 | if isinstance(key, str): 45 | key = key.encode() 46 | self._key = key or secrets.token_bytes(8) 47 | 48 | def __call__(self, series: pd.Series) -> pd.Series: 49 | """Map a Pandas Series to tokens. 50 | 51 | Args: 52 | series (A Pandas Series): need to be a list of strings. 53 | 54 | Return: 55 | A Pandas Series with a list of tokens represented as hexadecimal 56 | strings. 57 | """ 58 | 59 | return series.apply(lambda x: self.to_token(x)) 60 | 61 | def to_token(self, x): 62 | token = hashlib.sha256(x.encode() + self.key).hexdigest() 63 | if self._max_token_len is not None: 64 | return token[: self._max_token_len] 65 | else: 66 | return token 67 | 68 | @property 69 | def key(self): 70 | return self._key 71 | 72 | 73 | class ReversibleTokenizer(base.Transformation): 74 | """ReversibleTokenizer: map a string to a token to obfuscate it. 75 | 76 | When applying the ReversibleTokenizer to a Pandas Series of type string, 77 | each value gets mapped to a token (hexadecimal string). 78 | If a value is repeated several times across the series, it always 79 | gets mapped to the same token in order to maintain the count. 80 | A value can be mapped to different tokens by setting the key to a 81 | different value. 82 | 83 | This tokenizer allows tokens to be reversed to their original data 84 | when the secret key is known.
85 | 86 | Example: 87 | ``` 88 | s = pd.Series(['A']) 89 | tokenize = ReversibleTokenizer(key='secret') 90 | tokenize(s) # pd.Series(['40a1e']) 91 | ``` 92 | 93 | Attributes: 94 | key: expect a string or byte string of length exactly 32 bytes. 95 | encoding: string identifying the Python encoding used for inputs. 96 | """ 97 | 98 | identifier = "reversible-tokenizer" 99 | type_signature = "col->col" 100 | 101 | def __init__(self, key, encoding="utf-8"): 102 | typecheck.check_arg(key, (str, bytes)) 103 | typecheck.check_arg(encoding, str) 104 | super().__init__(dtype=dtypes.String) 105 | if isinstance(key, str): 106 | key = key.encode() 107 | if len(key) != 32: 108 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 109 | self.key = key 110 | self.encoding = encoding 111 | 112 | def __call__(self, series: pd.Series) -> pd.Series: 113 | """Map a Pandas Series to tokens. 114 | 115 | Args: 116 | series (A Pandas Series): need to be a list of strings. 117 | 118 | Return: 119 | A Pandas Series with a list of tokens represented as hexadecimal 120 | strings. 121 | """ 122 | 123 | return series.apply(self._to_token) 124 | 125 | def _to_token(self, x: str): 126 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 127 | ciphertext, tag = cipher.encrypt_and_digest(x.encode(encoding=self.encoding)) 128 | assert len(tag) == 16, len(tag) 129 | token = tag.hex() + ciphertext.hex() 130 | return token 131 | 132 | 133 | class TokenReverser(base.Transformation): 134 | """TokenReverser: recover string from token. 135 | 136 | When applying the TokenReverser to a Pandas Series of tokens, 137 | each token is mapped back to the string that was originally used 138 | by ReversibleTokenizer to construct the token. The same key must 139 | be used. 140 | 141 | Example: 142 | ``` 143 | s = pd.Series(['40a1e']) 144 | reverser = TokenReverser(key='secret') 145 | reverser(s) # pd.Series(['A']) 146 | ``` 147 | 148 | Attributes: 149 | key: expect a string or byte string of length exactly 32 bytes. 150 | encoding: string identifying the Python encoding used for outputs. 151 | """ 152 | 153 | identifier = "token-reverser" 154 | type_signature = "col->col" 155 | 156 | def __init__(self, key, encoding="utf-8"): 157 | typecheck.check_arg(key, (str, bytes)) 158 | typecheck.check_arg(encoding, str) 159 | super().__init__(dtype=dtypes.String) 160 | if isinstance(key, str): 161 | key = key.encode() 162 | if len(key) != 32: 163 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 164 | self.key = key 165 | self.encoding = encoding 166 | 167 | def __call__(self, series: pd.Series) -> pd.Series: 168 | """Reverse a Pandas Series of tokens. 169 | 170 | Args: 171 | series (A Pandas Series): need to be a list of strings. 172 | 173 | Return: 174 | A Pandas Series with a list of recovered strings. 
175 | """ 176 | 177 | return series.apply(self._from_token) 178 | 179 | def _from_token(self, token: str): 180 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 181 | token_bytes = bytearray.fromhex(token) 182 | tag, ciphertext = token_bytes[:16], token_bytes[16:] 183 | x = cipher.decrypt_and_verify(ciphertext, tag) 184 | return x.decode(encoding=self.encoding) 185 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformations/tokenizer_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pandas.testing as pdt 3 | import pytest 4 | 5 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 6 | from cape_dataframes.pandas.transformations import Tokenizer 7 | from cape_dataframes.pandas.transformations import TokenReverser 8 | 9 | 10 | def test_tokenizer(): 11 | transform = Tokenizer(key="secret_key") 12 | 13 | df = pd.DataFrame({"name": ["Alice", "Bob"]}) 14 | expected = pd.DataFrame( 15 | { 16 | "name": [ 17 | "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24", 18 | "dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b", 19 | ] 20 | } 21 | ) 22 | 23 | df["name"] = transform(df["name"]) 24 | 25 | pdt.assert_frame_equal(df, expected) 26 | 27 | 28 | def test_tokenizer_with_max_size(): 29 | transform = Tokenizer(max_token_len=10, key="secret_key") 30 | 31 | df = pd.DataFrame({"name": ["Alice", "Bob"]}) 32 | expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]}) 33 | 34 | df["name"] = transform(df["name"]) 35 | 36 | pdt.assert_frame_equal(df, expected) 37 | 38 | 39 | def test_reversible_tokenizer(): 40 | key = b"5" * 32 41 | plaintext = pd.DataFrame({"name": ["Alice", "Bob"]}) 42 | 43 | tokenizer = ReversibleTokenizer(key=key) 44 | tokenized_expected = pd.DataFrame( 45 | { 46 | "name": [ 47 | "c8c7e80144304276183e5bcd589db782bc5ff95309", 48 | "e0f40aea0d5c21b35967c4231b98b5b3e5338e", 49 | ] 50 | } 51 | ) 52 | tokenized = pd.DataFrame() 53 | tokenized["name"] = tokenizer(plaintext["name"]) 54 | pdt.assert_frame_equal(tokenized, tokenized_expected) 55 | 56 | reverser = TokenReverser(key=key) 57 | recovered = pd.DataFrame() 58 | recovered["name"] = reverser(tokenized["name"]) 59 | pdt.assert_frame_equal(recovered, plaintext) 60 | 61 | 62 | def test_reversible_tokenizer_string_key(): 63 | _ = ReversibleTokenizer(key="5" * 32) 64 | 65 | 66 | def test_reversible_tokenizer_insufficient_key(): 67 | with pytest.raises(ValueError): 68 | _ = ReversibleTokenizer(key=b"5" * 10) 69 | -------------------------------------------------------------------------------- /cape_dataframes/pandas/transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from cape_dataframes.pandas.transformations import base as tfm 4 | 5 | 6 | def transformer(transformation: tfm.Transformation, df: pd.DataFrame, field_name: str): 7 | df[field_name] = transformation(df[field_name]) 8 | return df 9 | -------------------------------------------------------------------------------- /cape_dataframes/policy/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.policy.data import Policy 2 | from cape_dataframes.policy.exceptions import NamedTransformNotFound 3 | from cape_dataframes.policy.exceptions import TransformNotFound 4 | from cape_dataframes.policy.policy import parse_policy 5 | from cape_dataframes.policy.policy 
import reverse 6 | 7 | __all__ = [ 8 | "parse_policy", 9 | "Policy", 10 | "NamedTransformNotFound", 11 | "TransformNotFound", 12 | "reverse", 13 | ] 14 | -------------------------------------------------------------------------------- /cape_dataframes/policy/data.py: -------------------------------------------------------------------------------- 1 | """Contains the policy classes that are initialized from a yaml policy file. 2 | 3 | There are five main classes with Policy being the top level class. A Policy contains 4 | Rules and NamedTransforms. Rules contain Actions, and each Action wraps the 5 | Transform that will be applied. 6 | 7 | Typical usage example: 8 | 9 | yaml_str = "...." 10 | d = yaml.load(yaml_str, Loader=yaml.FullLoader) 11 | 12 | # **d unpacks the dictionary produced by yaml and 13 | # passes them in as keyword arguments. 14 | policy = Policy(**d) 15 | """ 16 | 17 | from typing import List 18 | 19 | import yaml 20 | 21 | from cape_dataframes.audit import AuditLogger 22 | from cape_dataframes.utils import base64 23 | 24 | 25 | class Transform: 26 | """An actual transform that will be applied. 27 | 28 | Either name or type must be passed in here. The process to apply this 29 | transform will look at both and apply the relevant one. 30 | 31 | Attributes: 32 | field: The field this transform will be applied to. 33 | name: The name of the named transform, referenced from 34 | the top level policy object. 35 | type: The builtin transform that will be initialized. 36 | kwargs: The rest of the arguments that will be passed to the transformation. 37 | """ 38 | 39 | def __init__(self, field, name=None, type=None, **kwargs): 40 | if field == "": 41 | raise ValueError("Field must be specified for transformation") 42 | 43 | if name is None and type is None: 44 | raise ValueError( 45 | "Either named or function must be specified" 46 | + f" for transformation on field {field}" 47 | ) 48 | 49 | if name is not None and type is not None: 50 | raise ValueError( 51 | "Both named and function cannot be " 52 | + f"set for transformation on field {field}" 53 | ) 54 | 55 | self.field = field 56 | self.name = name 57 | self.type = type 58 | self.args = kwargs 59 | 60 | 61 | class Action: 62 | def __init__(self, field, transform=None): 63 | self.transform = Transform(field, **transform) 64 | 65 | 66 | class Rule: 67 | """A rule contains actionable information of a policy. 68 | 69 | Attributes: 70 | match: The match used to select a field to be transformed. 71 | actions: The actions to take on a matched field. 72 | """ 73 | 74 | def __init__(self, match, actions=[]): 75 | self.actions = [] 76 | for action in actions: 77 | if type(action) is dict: 78 | self.actions.append(Action(match["name"], **action)) 79 | # special case for dropping a column (i.e. column redaction) 80 | elif type(action) is str and action == "drop": 81 | self.actions.append( 82 | Action( 83 | match["name"], 84 | {"type": "column-redact", "columns": [match["name"]]}, 85 | ) 86 | ) 87 | 88 | self.transformations = [action.transform for action in self.actions] 89 | 90 | 91 | class NamedTransform: 92 | """A named transformation that captures the args. 93 | 94 | Attributes: 95 | name: The name of the named transformation. 96 | type: The builtin type (i.e. transform) that the named transform initializes to. 97 | kwargs: The args that are captured by the named transform.
98 | """ 99 | 100 | def __init__(self, name, type, **kwargs): 101 | if name == "": 102 | raise ValueError("Name must be specified for named transformation") 103 | 104 | if type == "": 105 | raise ValueError(f"Type must be specified for named transformation {name}") 106 | 107 | if len(kwargs) == 0: 108 | raise ValueError( 109 | f"Args must be specified for named transformation {self.name}" 110 | ) 111 | 112 | self.name = name 113 | self.type = type 114 | self.args = kwargs 115 | 116 | for key, arg in self.args.items(): 117 | # if an arg is a secret 118 | if isinstance(arg, dict) and "type" in arg and arg["type"] == "secret": 119 | if "value" not in arg: 120 | raise ValueError( 121 | "Secret named transformation arg" 122 | + f"{arg['name']} must contain a value" 123 | ) 124 | 125 | # then set the arg value to the inner value 126 | self.args[key] = bytes(base64.from_string(arg["value"])) 127 | 128 | 129 | class Policy: 130 | """Top level policy object. 131 | 132 | The top level policy object holds the all of the relevant information 133 | for applying policy to data. 134 | 135 | Attributes: 136 | label: The label of the policy. 137 | version: The version of the policy. 138 | rules: List of rules that will be applied to a data frame. 139 | transformations: The named transformations for this policy. 140 | """ 141 | 142 | def __init__( 143 | self, 144 | logger: AuditLogger = AuditLogger(), 145 | id: str = "", 146 | label: str = "", 147 | version: int = 1, 148 | rules: List[Rule] = [], 149 | transformations: List[NamedTransform] = [], 150 | ): 151 | self.id = id 152 | self.logger = logger 153 | self.label = label 154 | self.version = version 155 | 156 | self._raw_transforms = transformations 157 | self.transformations = [ 158 | NamedTransform(**transform) for transform in transformations 159 | ] 160 | 161 | if len(rules) == 0: 162 | raise ValueError( 163 | f"At least one rule must be specified for policy specification {label}" 164 | ) 165 | 166 | self._raw_rules = rules 167 | self.rules = [Rule(**rule) for rule in rules] 168 | 169 | def __repr__(self): 170 | d = { 171 | "label": self.label, 172 | "version": self.version, 173 | "transformations": self._raw_transforms, 174 | "rules": self._raw_rules, 175 | } 176 | 177 | return "Policy:\n\n" + yaml.dump(d, sort_keys=False) 178 | -------------------------------------------------------------------------------- /cape_dataframes/policy/data_test.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from cape_dataframes.policy.data import Policy 4 | from cape_dataframes.policy.policy_test_fixtures import named_with_secret_y 5 | from cape_dataframes.utils import base64 6 | 7 | y = """label: test_policy 8 | version: 1 9 | transformations: 10 | - name: plusOne 11 | type: plusN 12 | n: 1 13 | rules: 14 | - match: 15 | name: test 16 | actions: 17 | - transform: 18 | name: plusOne 19 | - transform: 20 | type: plusN 21 | n: 1 22 | - match: 23 | name: test2 24 | """ 25 | 26 | 27 | def test_policy_class(): 28 | d = yaml.load(y, Loader=yaml.FullLoader) 29 | 30 | p = Policy(**d) 31 | 32 | assert p.label == "test_policy" 33 | assert len(p.transformations) == 1 34 | 35 | named = p.transformations[0] 36 | assert named.name == "plusOne" 37 | assert named.type == "plusN" 38 | assert len(named.args) == 1 39 | 40 | assert named.args["n"] == 1 41 | 42 | rule = p.rules[0] 43 | assert len(p.rules) == 2 44 | assert len(rule.actions) == 2 45 | 46 | assert len(rule.transformations) == 2 47 | 48 | namedTransform = 
rule.transformations[0] 49 | builtinTransform = rule.transformations[1] 50 | 51 | assert namedTransform.field == "test" 52 | assert namedTransform.name == "plusOne" 53 | 54 | assert builtinTransform.field == "test" 55 | assert builtinTransform.type == "plusN" 56 | assert builtinTransform.args["n"] == 1 57 | 58 | 59 | def test_policy_with_secret(): 60 | d = yaml.load(named_with_secret_y, Loader=yaml.FullLoader) 61 | 62 | p = Policy(**d) 63 | 64 | assert p.transformations[1].args["key"] == bytes(base64.from_string("BASE")) 65 | 66 | 67 | def test_policy_repr(): 68 | d = yaml.load(y, Loader=yaml.FullLoader) 69 | 70 | p = Policy(**d) 71 | 72 | assert p.__repr__() == "Policy:\n\n" + y 73 | -------------------------------------------------------------------------------- /cape_dataframes/policy/exceptions.py: -------------------------------------------------------------------------------- 1 | class DependencyError(Exception): 2 | pass 3 | 4 | 5 | class NamedTransformNotFound(Exception): 6 | pass 7 | 8 | 9 | class TransformNotFound(Exception): 10 | pass 11 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy.py: -------------------------------------------------------------------------------- 1 | """Utils for parsing policy and applying them. 2 | 3 | The module reads in policy as yaml and then through apply_policy 4 | applies them to dataframes. 5 | 6 | Example policy yaml: 7 | 8 | label: test_policy 9 | version: 1 10 | rules: 11 | - match: 12 | name: value 13 | actions: 14 | # Tells the policy runner to apply the transformation 15 | # plusN with the specified arguments. 16 | - transform: 17 | type: plusN 18 | n: 1 19 | # Tells the policy runner to apply another plusN 20 | # transformation. 21 | - transform: 22 | type: plusN 23 | n: 2 24 | 25 | Applying policy: 26 | 27 | policy = parse_policy("policy.yaml") 28 | df = pd.DataFrame(np.ones(5,), columns=["value"]) 29 | df = apply_policy(policy, df) 30 | """ 31 | 32 | import copy 33 | import logging 34 | import types 35 | from typing import Any 36 | from typing import Callable 37 | from typing import Dict 38 | from typing import Union 39 | 40 | import pandas as pd 41 | import pyspark 42 | import requests 43 | import validators 44 | import yaml 45 | 46 | from cape_dataframes import pandas as pandas_lib 47 | from cape_dataframes import spark as spark_lib 48 | from cape_dataframes.audit import APPLY_POLICY_EVENT 49 | from cape_dataframes.audit import AuditLogger 50 | from cape_dataframes.pandas import transformations 51 | from cape_dataframes.policy import data 52 | from cape_dataframes.policy import exceptions 53 | 54 | 55 | def apply_policy(policy: data.Policy, df, inplace=False): 56 | """Applies a Policy to some DataFrame. 57 | 58 | This function is responsible for inferring the type of the DataFrame, preparing the 59 | relevant Spark or Pandas Transformations, and applying them to produce a transformed 60 | DataFrame that conforms to the Policy. 61 | 62 | Args: 63 | policy: The `Policy` object that the transformed DataFrame will conform to, e.g. 64 | as returned by `cape_dataframes.parse_policy`. 65 | df: The DataFrame object to transform according to `policies`. Must be of type 66 | pandas.DataFrame or pyspark.sql.DataFrame. 67 | inplace: Whether to mutate the `df` or produce a new one. This argument is only 68 | relevant for Pandas DataFrames, as Spark DataFrames do not support mutation. 
69 | 70 | Raises: 71 | ValueError: If df is a Spark DataFrame and inplace=True, or if df is something 72 | other than a Pandas or Spark DataFrame. 73 | DependencyError: If Spark is not configured correctly in the Python environment. 74 | TransformNotFound, NamedTransformNotFound: If the Policy contains a reference to 75 | a Transformation or NamedTransformation that is unrecognized in the 76 | Transformation registry. 77 | """ 78 | if isinstance(df, pd.DataFrame): 79 | registry = pandas_lib.registry 80 | transformer = pandas_lib.transformer 81 | dtypes = pandas_lib.dtypes 82 | if not inplace: 83 | result_df = df.copy() 84 | else: 85 | result_df = df 86 | elif not spark_lib.is_available(): 87 | raise exceptions.DependencyError 88 | elif isinstance(df, spark_lib.DataFrame): 89 | if inplace: 90 | raise ValueError( 91 | "Spark does not support DataFrame mutation, so inplace=True is invalid." 92 | ) 93 | registry = spark_lib.registry 94 | transformer = spark_lib.transformer 95 | dtypes = spark_lib.dtypes 96 | result_df = df 97 | else: 98 | raise ValueError(f"Expected df to be a DataFrame, found {type(df)}.") 99 | for rule in policy.rules: 100 | result_df = _do_transformations( 101 | policy, rule, result_df, registry, transformer, dtypes 102 | ) 103 | 104 | policy.logger.audit_log(APPLY_POLICY_EVENT, policy.id, "policy", policy.label) 105 | 106 | return result_df 107 | 108 | 109 | def parse_policy( 110 | p: Union[str, Dict[Any, Any]], logger: AuditLogger = AuditLogger() 111 | ) -> data.Policy: 112 | """Parses a policy YAML file. 113 | 114 | The passed in string can either be a path to a local file, 115 | a URL pointing to a file or a dictionary representing the policy. 116 | If it is a URL then requests attempts to download it. 117 | 118 | Args: 119 | p: a path string, a URL string or a dictionary representing the 120 | policy. 121 | 122 | Returns: 123 | The Policy object initialized by the YAML. 124 | """ 125 | if type(p) == str: 126 | if validators.url(p): 127 | yaml_data = requests.get(p).text 128 | else: 129 | with open(p) as f: 130 | yaml_data = f.read() 131 | 132 | policy = yaml.load(yaml_data, Loader=yaml.FullLoader) 133 | else: 134 | policy = p 135 | 136 | return data.Policy(logger=logger, **policy) 137 | 138 | 139 | def _maybe_replace_dtype_arg(args, dtypes): 140 | if "dtype" in args: 141 | args["dtype"] = getattr(dtypes, args["dtype"]) 142 | return args 143 | 144 | 145 | def _get_transformation( 146 | policy: data.Policy, 147 | transform: data.Transform, 148 | registry: types.ModuleType, 149 | dtypes, 150 | ): 151 | """Looks up the correct transform class. 152 | 153 | If the transform is anonymous (i.e. unnamed) then it looks it up from the 154 | transform registry. If it is a named transform it used load_named_transform 155 | to find it. 156 | 157 | Args: 158 | policy: The top level policy. 159 | transform: The specific transform to be applied. 160 | registry: The module representing the transformation registry; differs for 161 | Spark/Pandas. 162 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 163 | 164 | Returns: 165 | The initialize transform object. 166 | 167 | Raises: 168 | TransformNotFound: The builtin transform cannot be found. 169 | NamedTransformNotFound: The named transform cannot be found on the 170 | top level policy object. 171 | ValueError: If neither a function or named transform exists on the transform 172 | arg. 
173 | """ 174 | if transform.type is not None: 175 | tfm_ctor = registry.get(transform.type) 176 | if tfm_ctor is None: 177 | raise exceptions.TransformNotFound( 178 | f"Could not find builtin transform {transform.type}" 179 | ) 180 | tfm_args = _maybe_replace_dtype_arg(transform.args, dtypes) 181 | initTransform = tfm_ctor(**tfm_args) 182 | elif transform.name is not None: 183 | initTransform = _load_named_transform(policy, transform.name, registry, dtypes) 184 | else: 185 | raise ValueError( 186 | f"Expected type or name for transform with field {transform.field}" 187 | ) 188 | return initTransform 189 | 190 | 191 | def _do_transformations( 192 | policy: data.Policy, 193 | rule: data.Rule, 194 | df, 195 | registry: types.ModuleType, 196 | transformer: Callable, 197 | dtypes, 198 | ): 199 | """Applies a specific rule's transformations to a dataframe. 200 | 201 | For each transform, lookup the required transform class and then apply it 202 | to the correct column in that dataframe. 203 | 204 | Args: 205 | policy: The top level policy. 206 | rule: The specific rule to apply. 207 | df: A Pandas or Spark dataframe. 208 | registry: The module representing the transformation registry; differs for 209 | Spark/Pandas. 210 | transformer: A function mapping (Transformation, DataFrame, str) to a DataFrame 211 | that mutates a DataFrame by applying the Transformation to one of its 212 | columns. 213 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 214 | 215 | Returns: 216 | The transformed dataframe. 217 | """ 218 | 219 | for transform in rule.transformations: 220 | do_transform = _get_transformation(policy, transform, registry, dtypes) 221 | try: 222 | if do_transform.type_signature == "df->df": 223 | df = do_transform(df) 224 | else: 225 | df = transformer(do_transform, df, transform.field) 226 | except (KeyError, pyspark.sql.utils.AnalysisException): 227 | logging.warning( 228 | f"Unable to transform column {transform.field} in policy {policy.label}" 229 | ) 230 | 231 | return df 232 | 233 | 234 | def _load_named_transform( 235 | policy: data.Policy, 236 | transformLabel: str, 237 | registry: types.ModuleType, 238 | dtypes, 239 | ): 240 | """Attempts to load a named transform from the top level policy. 241 | 242 | Looks at the top level policy object for the named transform given as transformLabel 243 | and initializes it from the args pulled from the policy object. 244 | 245 | Args: 246 | policy: Top level policy object. 247 | transformLabel: The name of the named transform. 248 | registry: The module representing the transformation registry; differs for 249 | Spark/Pandas. 250 | dtypes: Passthrough; concrete dtypes to use (spark.dtypes or pandas.dtypes). 251 | 252 | Returns: 253 | The initialized transform object. 254 | 255 | Raises: 256 | NamedTransformNotFound: The named transform cannot be 257 | found in the top level policy object. 258 | DependencyError: If return_spark is True but PySpark is missing from the current 259 | environment. 
260 | """ 261 | found = False 262 | 263 | named_transforms = policy.transformations 264 | for transform in named_transforms: 265 | if transformLabel == transform.name: 266 | tfm_ctor = registry.get(transform.type) 267 | if tfm_ctor is None: 268 | raise exceptions.NamedTransformNotFound( 269 | f"Could not find transform of type {transform.type} in registry" 270 | ) 271 | tfm_args = _maybe_replace_dtype_arg(transform.args, dtypes) 272 | initTransform = tfm_ctor(**tfm_args) 273 | found = True 274 | break 275 | 276 | if not found: 277 | raise exceptions.NamedTransformNotFound( 278 | f"Could not find transform {transformLabel} in transformations block" 279 | ) 280 | 281 | return initTransform 282 | 283 | 284 | def reverse(policy: data.Policy) -> data.Policy: 285 | """Turns reversible tokenizations into token reversers 286 | 287 | If any named transformations contain a reversible tokenization transformation 288 | this helper function turns them into token reverser transformations. 289 | 290 | Args: 291 | policy: Top level policy object. 292 | 293 | Returns: 294 | The modified policy. 295 | """ 296 | new_policy = copy.deepcopy(policy) 297 | 298 | for named in new_policy.transformations: 299 | if named.type == transformations.ReversibleTokenizer.identifier: 300 | named.type = transformations.TokenReverser.identifier 301 | 302 | return new_policy 303 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pandas.testing as pdt 6 | import pytest 7 | import requests 8 | import yaml 9 | 10 | from cape_dataframes import pandas as pandas_lib 11 | from cape_dataframes import spark as spark_lib 12 | from cape_dataframes.pandas.transformations import test_utils 13 | from cape_dataframes.policy import data 14 | from cape_dataframes.policy import exceptions 15 | from cape_dataframes.policy import policy as policy_lib 16 | from cape_dataframes.policy import policy_test_fixtures as fixtures 17 | 18 | 19 | def test_parse_policy(tmp_path): 20 | d = tmp_path / "policy" 21 | 22 | d.mkdir() 23 | 24 | p = d / "policy.yaml" 25 | p.write_text(fixtures.y) 26 | 27 | policy = policy_lib.parse_policy(str(p.absolute())) 28 | 29 | assert policy.label == "test_policy" 30 | 31 | 32 | def test_parse_policy_dict(): 33 | p = yaml.load(fixtures.y, Loader=yaml.FullLoader) 34 | 35 | policy = policy_lib.parse_policy(p) 36 | 37 | assert policy.label == "test_policy" 38 | 39 | 40 | def test_named_transform_not_found(): 41 | pandas_lib.registry.register("plusN", test_utils.PlusN) 42 | d = yaml.load( 43 | fixtures.named_not_found_y("plusOne", "plusOneThousand", "plusN"), 44 | Loader=yaml.FullLoader, 45 | ) 46 | 47 | df = pd.DataFrame( 48 | np.ones( 49 | 5, 50 | ), 51 | columns=["test"], 52 | ) 53 | 54 | p = data.Policy(**d) 55 | tfm = p.rules[0].transformations[0] 56 | 57 | with pytest.raises(exceptions.NamedTransformNotFound) as e: 58 | policy_lib._get_transformation(p, tfm, df, pandas_lib.dtypes) 59 | 60 | assert str(e.value) == ( 61 | "Could not find transform plusOneThousand in transformations block" 62 | ) 63 | 64 | 65 | def test_named_transform_type_not_found(): 66 | d = yaml.load( 67 | fixtures.named_not_found_y("plusOne", "plusOne", "plusM"), 68 | Loader=yaml.FullLoader, 69 | ) 70 | p = data.Policy(**d) 71 | tfm = p.rules[0].transformations[0] 72 | 73 | with 
pytest.raises(exceptions.NamedTransformNotFound) as e: 74 | policy_lib._get_transformation(p, tfm, pandas_lib.registry, pandas_lib.dtypes) 75 | assert str(e.value) == "Could not find transform of type plusM in registry" 76 | 77 | 78 | def test_parse_policy_url(httpserver): 79 | httpserver.expect_request("/policy").respond_with_data(fixtures.y) 80 | url = httpserver.url_for("/policy") 81 | policy = policy_lib.parse_policy(url) 82 | assert policy.label == "test_policy" 83 | 84 | 85 | def test_parse_policy_invalid_url(): 86 | with pytest.raises(requests.exceptions.ConnectionError): 87 | policy_lib.parse_policy("https://notapolicy.here.com/policy") 88 | 89 | 90 | def test_parse_policy_invalid_file(): 91 | with pytest.raises(FileNotFoundError): 92 | policy_lib.parse_policy("iamnotarealthingonthisfilesystem") 93 | 94 | 95 | def test_apply_policy_pandas(): 96 | pandas_lib.registry.register("plusN", test_utils.PlusN) 97 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 98 | 99 | df = pd.DataFrame( 100 | np.ones( 101 | 5, 102 | ), 103 | columns=["test"], 104 | ) 105 | 106 | expected_df = df + 3 107 | 108 | p = data.Policy(**d) 109 | 110 | new_df = policy_lib.apply_policy(p, df) 111 | 112 | pdt.assert_frame_equal(new_df, expected_df) 113 | 114 | 115 | def test_missing_column(): 116 | pandas_lib.registry.register("plusN", test_utils.PlusN) 117 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 118 | 119 | df = pd.DataFrame( 120 | np.ones( 121 | 5, 122 | ), 123 | columns=["boat"], 124 | ) 125 | 126 | expected_df = df 127 | 128 | p = data.Policy(**d) 129 | 130 | new_df = policy_lib.apply_policy(p, df) 131 | 132 | pdt.assert_frame_equal(new_df, expected_df) 133 | 134 | 135 | def test_apply_complex_policies_pandas(): 136 | d = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader) 137 | 138 | df = pd.DataFrame( 139 | { 140 | "name": ["bob", "alice"], 141 | "val-int": [30, 50], 142 | "val-float": [32.43424, 56.64543], 143 | "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")], 144 | } 145 | ) 146 | expected_df = pd.DataFrame( 147 | { 148 | "name": [ 149 | "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149", 150 | "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb", 151 | ], 152 | "val-int": [23, 58], 153 | "val-float": [32.4, 56.6], 154 | "date": [pd.Timestamp("2018-01-01"), pd.Timestamp("2016-01-01")], 155 | } 156 | ) 157 | 158 | p = data.Policy(**d) 159 | 160 | new_df = policy_lib.apply_policy(p, df) 161 | 162 | pdt.assert_frame_equal(new_df, expected_df) 163 | 164 | 165 | def test_named_transformation_pandas(): 166 | pandas_lib.registry.register("plusN", test_utils.PlusN) 167 | d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader) 168 | 169 | df = pd.DataFrame( 170 | np.ones( 171 | 5, 172 | ), 173 | columns=["test"], 174 | ) 175 | 176 | expected_df = df + 3 177 | 178 | p = data.Policy(**d) 179 | 180 | new_df = policy_lib.apply_policy(p, df) 181 | 182 | pdt.assert_frame_equal(new_df, expected_df) 183 | 184 | 185 | def test_column_redact_pandas(): 186 | pandas_lib.registry.register("plusN", test_utils.PlusN) 187 | d = yaml.load(fixtures.redact_y, Loader=yaml.FullLoader) 188 | 189 | df = pd.DataFrame(np.ones((5, 2)), columns=["test", "apple"]) 190 | 191 | p = data.Policy(**d) 192 | 193 | new_df = policy_lib.apply_policy(p, df) 194 | 195 | expected_df = pd.DataFrame( 196 | np.ones( 197 | 5, 198 | ), 199 | columns=["test"], 200 | ) 201 | 202 | expected_df = expected_df + 3 203 | 204 | pdt.assert_frame_equal(new_df, expected_df) 205 | 206 | 207 | def 
test_apply_policy_spark(): 208 | sess = spark_lib.utils.make_session("test.policy.applyPolicies") 209 | pd_df = pd.DataFrame( 210 | np.ones( 211 | 5, 212 | ), 213 | columns=["test"], 214 | ) 215 | expected_df = pd_df + 3 216 | df = sess.createDataFrame(pd_df) 217 | 218 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 219 | d = yaml.load(fixtures.y, Loader=yaml.FullLoader) 220 | p = data.Policy(**d) 221 | new_df = policy_lib.apply_policy(p, df).toPandas() 222 | 223 | pdt.assert_frame_equal(new_df, expected_df) 224 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 225 | 226 | 227 | def test_apply_complex_policies_spark(): 228 | sess = spark_lib.utils.make_session("test.policy.applyComplexPolicies") 229 | pd_df = pd.DataFrame( 230 | { 231 | "name": ["bob", "alice"], 232 | "val-int": [30, 50], 233 | "val-float": [32.43424, 56.64543], 234 | "date": [pd.Timestamp("2018-10-15"), pd.Timestamp("2016-09-10")], 235 | } 236 | ) 237 | expected_df = pd.DataFrame( 238 | { 239 | "name": [ 240 | "db6063546d5d6c1fd3826bc0a1d8188fa0dae1a174823eac1e8e063a073bf149", 241 | "4ae0639267ad49c658e8d266aa1caa51c876ed1d7ca788a0749d5189248295eb", 242 | ], 243 | "val-int": [25, 56], 244 | "val-float": [32.4, 56.6], 245 | # TODO: when these are pd.Timestamp, Spark's date_trunc is causing 246 | # dtype erasure. We should figure out why that's happening 247 | "date": [datetime.date(2018, 1, 1), datetime.date(2016, 1, 1)], 248 | } 249 | ) 250 | df = sess.createDataFrame(pd_df) 251 | 252 | d = yaml.load(fixtures.complex_y, Loader=yaml.FullLoader) 253 | p = data.Policy(**d) 254 | new_df = policy_lib.apply_policy(p, df).toPandas() 255 | pdt.assert_frame_equal(new_df, expected_df, check_dtype=True) 256 | 257 | 258 | def test_named_transformation_spark(): 259 | sess = spark_lib.utils.make_session("test.policy.namedTransformations") 260 | pd_df = pd.DataFrame( 261 | np.ones( 262 | 5, 263 | ), 264 | columns=["test"], 265 | ) 266 | expected_df = pd_df + 3 267 | df = sess.createDataFrame(pd_df) 268 | 269 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 270 | d = yaml.load(fixtures.named_y, Loader=yaml.FullLoader) 271 | p = data.Policy(**d) 272 | new_df = policy_lib.apply_policy(p, df).toPandas() 273 | 274 | pdt.assert_frame_equal(new_df, expected_df) 275 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 276 | 277 | 278 | def test_column_redaction_spark(): 279 | sess = spark_lib.utils.make_session("test.policy.redaction") 280 | pd_df = pd.DataFrame(np.ones((5, 2)), columns=["test", "apple"]) 281 | expected_df = pd.DataFrame( 282 | np.ones( 283 | 5, 284 | ), 285 | columns=["test"], 286 | ) 287 | expected_df = expected_df + 3 288 | df = sess.createDataFrame(pd_df) 289 | 290 | spark_lib.registry.register(test_utils.PlusN.identifier, test_utils.PlusN) 291 | d = yaml.load(fixtures.redact_y, Loader=yaml.FullLoader) 292 | p = data.Policy(**d) 293 | new_df = policy_lib.apply_policy(p, df).toPandas() 294 | 295 | pdt.assert_frame_equal(new_df, expected_df) 296 | del spark_lib.registry._registry[test_utils.PlusN.identifier] 297 | 298 | 299 | def test_secret_in_named_transform(): 300 | d = yaml.load(fixtures.secret_yaml, Loader=yaml.FullLoader) 301 | 302 | df = pd.DataFrame({"name": ["bob", "alice"]}) 303 | 304 | p = data.Policy(**d) 305 | 306 | new_df = policy_lib.apply_policy(p, df) 307 | 308 | pdt.assert_frame_equal(new_df, df) 309 | 310 | 311 | def test_reverse_helper(): 312 | p = yaml.load(fixtures.reversible_yaml, Loader=yaml.FullLoader) 313 | 314 | policy 
= policy_lib.parse_policy(p) 315 | 316 | df = pd.DataFrame({"name": ["bob", "alice"]}) 317 | 318 | new_df = policy_lib.apply_policy(policy, df) 319 | 320 | new_policy = policy_lib.reverse(policy) 321 | 322 | another_df = policy_lib.apply_policy(new_policy, new_df) 323 | 324 | for transform in new_policy.transformations: 325 | assert transform.type == pandas_lib.transformations.TokenReverser.identifier 326 | 327 | pdt.assert_frame_equal(df, another_df) 328 | -------------------------------------------------------------------------------- /cape_dataframes/policy/policy_test_fixtures.py: -------------------------------------------------------------------------------- 1 | y = """ 2 | label: test_policy 3 | version: 1 4 | rules: 5 | - match: 6 | name: test 7 | actions: 8 | - transform: 9 | type: plusN 10 | n: 1 11 | - transform: 12 | type: plusN 13 | n: 2 14 | """ 15 | 16 | named_y = """ 17 | version: 1 18 | label: test_policy 19 | transformations: 20 | - name: plusOne 21 | type: plusN 22 | n: 1 23 | - name: plusTwo 24 | type: plusN 25 | n: 2 26 | rules: 27 | - match: 28 | name: test 29 | actions: 30 | - transform: 31 | name: plusOne 32 | - transform: 33 | name: plusTwo 34 | """ 35 | 36 | named_with_secret_y = """ 37 | version: 1 38 | label: test_policy 39 | transformations: 40 | - name: plusOne 41 | type: plusN 42 | n: 1 43 | - name: tokenWithSecret 44 | type: tokenizer 45 | key: 46 | type: secret 47 | name: my-key 48 | value: BASE 49 | rules: 50 | - match: 51 | name: test 52 | actions: 53 | - transform: 54 | name: plusOne 55 | - transform: 56 | name: plusTwo 57 | """ 58 | 59 | 60 | def named_not_found_y(saved_tfm, ref_tfm, tfm_type): 61 | return """ 62 | label: test_policy 63 | version: 1 64 | transformations: 65 | - name: {saved} 66 | type: {type} 67 | n: 1 68 | rules: 69 | - match: 70 | name: test 71 | actions: 72 | - transform: 73 | name: {ref} 74 | """.format( 75 | saved=saved_tfm, type=tfm_type, ref=ref_tfm 76 | ) 77 | 78 | 79 | complex_y = """ 80 | label: test_policy 81 | version: 1 82 | rules: 83 | - match: 84 | name: val-int 85 | actions: 86 | - transform: 87 | type: numeric-perturbation 88 | dtype: Integer 89 | min: -10 90 | max: 10 91 | seed: 4984 92 | - match: 93 | name: val-float 94 | actions: 95 | - transform: 96 | type: numeric-rounding 97 | dtype: Double 98 | precision: 1 99 | - match: 100 | name: name 101 | actions: 102 | - transform: 103 | type: tokenizer 104 | key: secret_key 105 | - match: 106 | name: date 107 | actions: 108 | - transform: 109 | type: date-truncation 110 | frequency: year 111 | """ 112 | 113 | 114 | redact_y = """ 115 | label: test_policy 116 | version: 1 117 | rules: 118 | - match: 119 | name: apple 120 | actions: 121 | - drop 122 | - match: 123 | name: test 124 | actions: 125 | - transform: 126 | type: plusN 127 | n: 1 128 | - transform: 129 | type: plusN 130 | n: 2 131 | """ 132 | 133 | secret_yaml = """ 134 | label: masking_policy 135 | version: 1 136 | transformations: 137 | - name: reversible 138 | type: reversible-tokenizer 139 | key: 140 | type: secret 141 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 142 | - name: reverse 143 | type: token-reverser 144 | key: 145 | type: secret 146 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 147 | rules: 148 | - match: 149 | name: name 150 | actions: 151 | - transform: 152 | name: reversible 153 | - match: 154 | name: name 155 | actions: 156 | - transform: 157 | name: reverse 158 | """ 159 | 160 | reversible_yaml = """ 161 | label: masking_policy 162 | version: 1 163 | transformations: 164 | - name: 
reversible 165 | type: reversible-tokenizer 166 | key: 167 | type: secret 168 | value: m5YNKBP-a3GMyy52457ok-4zQHqLuiB3aFD7mPTBpoc 169 | rules: 170 | - match: 171 | name: name 172 | actions: 173 | - transform: 174 | name: reversible 175 | """ 176 | -------------------------------------------------------------------------------- /cape_dataframes/spark/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | if importlib.util.find_spec("pyspark") is None: 4 | 5 | def is_available(): 6 | return False 7 | 8 | __all__ = ["is_available"] 9 | 10 | else: 11 | from pyspark.sql import DataFrame 12 | 13 | from cape_dataframes.spark import dtypes 14 | from cape_dataframes.spark import registry 15 | from cape_dataframes.spark import transformations 16 | from cape_dataframes.spark.transformer import transformer 17 | from cape_dataframes.spark.utils import configure_session 18 | from cape_dataframes.spark.utils import make_session 19 | 20 | def is_available(): 21 | return True 22 | 23 | __all__ = [ 24 | "configure_session", 25 | "DataFrame", 26 | "dtypes", 27 | "is_available", 28 | "make_session", 29 | "transformations", 30 | "transformer", 31 | "registry", 32 | ] 33 | 34 | del importlib 35 | -------------------------------------------------------------------------------- /cape_dataframes/spark/dtypes.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import types 2 | 3 | # base type 4 | DType = types.DataType 5 | # individual types 6 | String = types.StringType() 7 | Date = types.DateType() 8 | Datetime = types.TimestampType() 9 | # numeric types 10 | Float = types.FloatType() 11 | Double = types.DoubleType() 12 | Byte = types.ByteType() 13 | Short = types.ShortType() 14 | Integer = types.IntegerType() 15 | Long = types.LongType() 16 | # groups 17 | Floats = (Float, Double) 18 | Integers = (Byte, Short, Integer, Long) 19 | Numerics = Floats + Integers 20 | -------------------------------------------------------------------------------- /cape_dataframes/spark/registry.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from typing import Dict 3 | 4 | from cape_dataframes.spark.transformations import perturbation 5 | from cape_dataframes.spark.transformations import redaction 6 | from cape_dataframes.spark.transformations import rounding 7 | from cape_dataframes.spark.transformations import tokenizer 8 | 9 | TransformationCtor = Callable 10 | 11 | _registry: Dict[str, TransformationCtor] = {} 12 | 13 | 14 | def get(transformation: str) -> TransformationCtor: 15 | """Returns the constructor for the given key. 16 | 17 | Arguments: 18 | transformation: The key of transformation to retrieve. 19 | """ 20 | return _registry.get(transformation, None) 21 | 22 | 23 | def register(label: str, ctor: TransformationCtor): 24 | """Registers a new transformation constructor under the label provided. 
25 | 26 | Arguments: 27 | label: The label that will be used as the key in the registry 28 | ctor: The transformation constructor 29 | """ 30 | _registry[label] = ctor 31 | 32 | 33 | register(perturbation.DatePerturbation.identifier, perturbation.DatePerturbation) 34 | register(perturbation.NumericPerturbation.identifier, perturbation.NumericPerturbation) 35 | register(rounding.NumericRounding.identifier, rounding.NumericRounding) 36 | register(tokenizer.Tokenizer.identifier, tokenizer.Tokenizer) 37 | register(rounding.DateTruncation.identifier, rounding.DateTruncation) 38 | register(redaction.ColumnRedact.identifier, redaction.ColumnRedact) 39 | register(redaction.RowRedact.identifier, redaction.RowRedact) 40 | register(tokenizer.ReversibleTokenizer.identifier, tokenizer.ReversibleTokenizer) 41 | register(tokenizer.TokenReverser.identifier, tokenizer.TokenReverser) 42 | -------------------------------------------------------------------------------- /cape_dataframes/spark/registry_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.spark import registry 2 | from cape_dataframes.spark.transformations import base 3 | 4 | 5 | class MockTransformation(base.Transformation): 6 | identifier = "mock" 7 | 8 | def __init__(self, fake_arg): 9 | super().__init__(None) 10 | 11 | def __call__(self, x): 12 | pass 13 | 14 | 15 | def test_get(): 16 | registry.register(MockTransformation.identifier, MockTransformation) 17 | tfm_cls = registry.get("mock") 18 | args = {"fake_arg": 1} 19 | tfm_cls(**args) 20 | registry._registry.pop("mock") 21 | 22 | 23 | def test_get_missing(): 24 | tfm_cls = registry.get("plusWhat?") 25 | assert tfm_cls is None 26 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/__init__.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.spark.transformations.perturbation import DatePerturbation 2 | from cape_dataframes.spark.transformations.perturbation import NumericPerturbation 3 | from cape_dataframes.spark.transformations.redaction import ColumnRedact 4 | from cape_dataframes.spark.transformations.redaction import RowRedact 5 | from cape_dataframes.spark.transformations.rounding import DateTruncation 6 | from cape_dataframes.spark.transformations.rounding import NumericRounding 7 | from cape_dataframes.spark.transformations.tokenizer import Tokenizer 8 | 9 | __all__ = [ 10 | "DatePerturbation", 11 | "NumericPerturbation", 12 | "DateTruncation", 13 | "NumericRounding", 14 | "Tokenizer", 15 | "ColumnRedact", 16 | "RowRedact", 17 | ] 18 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from cape_dataframes.spark import dtypes 4 | 5 | 6 | class AbstractTransformation(metaclass=abc.ABCMeta): 7 | @property 8 | @abc.abstractmethod 9 | def dtype(self): 10 | pass 11 | 12 | @abc.abstractmethod 13 | def __call__(self, x): 14 | pass 15 | 16 | 17 | class Transformation(AbstractTransformation): 18 | def __init__(self, dtype: dtypes.DType): 19 | self._dtype = dtype 20 | 21 | @property 22 | def dtype(self): 23 | return self._dtype 24 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/perturbation.py: 
-------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from typing import Tuple 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pyspark import sql 8 | from pyspark.sql import functions 9 | 10 | from cape_dataframes.spark import dtypes 11 | from cape_dataframes.spark.transformations import base 12 | from cape_dataframes.utils import typecheck 13 | 14 | _FREQUENCY_TO_DELTA_FN = { 15 | "YEAR": lambda noise: pd.to_timedelta(noise * 365, unit="days"), 16 | "MONTH": lambda noise: pd.to_timedelta(noise * 30, unit="days"), 17 | "DAY": lambda noise: pd.to_timedelta(noise, unit="days"), 18 | "HOUR": lambda noise: pd.to_timedelta(noise, unit="hours"), 19 | "minutes": lambda noise: pd.to_timedelta(noise, unit="minutes"), 20 | "seconds": lambda noise: pd.to_timedelta(noise, unit="seconds"), 21 | } 22 | IntTuple = Union[int, Tuple[int, ...]] 23 | StrTuple = Union[str, Tuple[str, ...]] 24 | 25 | 26 | class NumericPerturbation(base.Transformation): 27 | """Add uniform random noise to a numeric series 28 | 29 | Mask a numeric series by adding uniform random noise to each value. 30 | The amount of noise is drawn from the interval [min, max). 31 | 32 | Attributes: 33 | dtype (dtypes.Numerics): series type 34 | min (int, float): the values generated will be greater or equal to min 35 | max (int, float): the values generated will be less than max 36 | seed (int), optional: a seed to initialize the random generator 37 | """ 38 | 39 | identifier = "numeric-perturbation" 40 | type_signature = "col->col" 41 | 42 | def __init__( 43 | self, 44 | dtype: dtypes.DType, 45 | min: (int, float), 46 | max: (int, float), 47 | seed: Optional[int] = None, 48 | ): 49 | assert dtype in dtypes.Numerics 50 | typecheck.check_arg(min, (int, float)) 51 | typecheck.check_arg(max, (int, float)) 52 | typecheck.check_arg(seed, (int, type(None))) 53 | super().__init__(dtype) 54 | self._min = min 55 | self._max = max 56 | self._seed = seed 57 | 58 | def __call__(self, x: sql.Column): 59 | uniform_noise = functions.rand(seed=self._seed) 60 | if self._seed is not None: 61 | self._seed += 1 62 | affine_noise = self._min + uniform_noise * (self._max - self._min) 63 | if self._dtype is not dtypes.Double: 64 | affine_noise = affine_noise.astype(self._dtype) 65 | return x + affine_noise 66 | 67 | 68 | class DatePerturbation(base.Transformation): 69 | """Add uniform random noise to a Pandas series of timestamps 70 | 71 | Mask a series by adding uniform random noise to the specified 72 | frequencies of timestamps. The amount of noise for each frequency 73 | is drawn from the internal [min_freq, max_freq). 74 | 75 | Note that seeds are currently not supported. 
76 | 77 | Attributes: 78 | frequency (str, str list): one or more frequencies to perturbate 79 | min (int, int list): the frequency value will be greater or equal to min 80 | max (int, int list): the frequency value will be less than max 81 | """ 82 | 83 | identifier = "date-perturbation" 84 | type_signature = "col->col" 85 | 86 | def __init__( 87 | self, 88 | frequency: StrTuple, 89 | min: IntTuple, 90 | max: IntTuple, 91 | ): 92 | super().__init__(dtypes.Date) 93 | self._frequency = _check_freq_arg(frequency) 94 | self._min = _check_minmax_arg(min) 95 | self._max = _check_minmax_arg(max) 96 | self._perturb_date = None 97 | 98 | def __call__(self, x: sql.Column): 99 | if self._perturb_date is None: 100 | self._perturb_date = self._make_perturb_udf() 101 | return self._perturb_date(x) 102 | 103 | def _make_perturb_udf(self): 104 | @functions.pandas_udf(dtypes.Date) 105 | def perturb_date(x: pd.Series) -> pd.Series: 106 | rng = np.random.default_rng() 107 | for f, mn, mx in zip(self._frequency, self._min, self._max): 108 | # TODO can we switch to a lower dtype than np.int64? 109 | noise = rng.integers(mn, mx, size=x.shape) 110 | delta_fn = _FREQUENCY_TO_DELTA_FN.get(f, None) 111 | if delta_fn is None: 112 | raise ValueError( 113 | "Frequency {} must be one of {}.".format( 114 | f, list(_FREQUENCY_TO_DELTA_FN.keys()) 115 | ) 116 | ) 117 | x += delta_fn(noise) 118 | return x 119 | 120 | return perturb_date 121 | 122 | 123 | def _check_minmax_arg(arg): 124 | """Checks that arg is an integer or a flat collection of integers.""" 125 | if not isinstance(arg, (tuple, list)): 126 | if not isinstance(arg, int): 127 | raise ValueError 128 | return [arg] 129 | else: 130 | for a in arg: 131 | if not isinstance(a, int): 132 | raise ValueError 133 | return arg 134 | 135 | 136 | def _check_freq_arg(arg): 137 | """Checks that arg is string or a flat collection of strings.""" 138 | if not isinstance(arg, (tuple, list)): 139 | if not isinstance(arg, str): 140 | raise ValueError 141 | return [arg] 142 | else: 143 | for a in arg: 144 | if not isinstance(a, str): 145 | raise ValueError 146 | return arg 147 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/perturbation_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pyspark.sql import functions 4 | 5 | from cape_dataframes.spark import dtypes 6 | from cape_dataframes.spark import utils 7 | from cape_dataframes.spark.transformations import perturbation as ptb 8 | 9 | 10 | def _make_and_apply_numeric_ptb(sess, df, dtype, min, max): 11 | df = sess.createDataFrame(df, schema=["data"]) 12 | perturb = ptb.NumericPerturbation(dtype, min=min, max=max) 13 | result_df = df.select(perturb(functions.col("data"))) 14 | return result_df.toPandas() 15 | 16 | 17 | def _make_and_apply_date_ptb(sess, df, frequency, min, max): 18 | df = sess.createDataFrame(df, schema=["data"]) 19 | perturb = ptb.DatePerturbation(frequency, min, max) 20 | result_df = df.select(perturb(functions.col("data"))) 21 | return result_df.withColumnRenamed("perturb_date(data)", "data").toPandas() 22 | 23 | 24 | def test_float_ptb_bounds(): 25 | sess = utils.make_session("test.perturbation.float.bounds") 26 | data = np.arange(6, dtype=np.float32).reshape((6, 1)) 27 | test_df = pd.DataFrame(data, columns=["data"]) 28 | lower, upper = -2, 2 29 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Float, lower, upper) 30 | result = 
result_df.values 31 | assert result.dtype == data.dtype 32 | noise = result - data 33 | lower_check = noise >= lower 34 | upper_check = noise <= upper 35 | assert lower_check.all() 36 | assert upper_check.all() 37 | 38 | 39 | def test_double_ptb_bounds(): 40 | sess = utils.make_session("test.perturbation.double.bounds") 41 | data = np.arange(6, dtype=np.float64).reshape((6, 1)) 42 | test_df = pd.DataFrame(data, columns=["data"]) 43 | lower, upper = -2, 2 44 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Double, lower, upper) 45 | result = result_df.values 46 | assert result.dtype == data.dtype 47 | noise = result - data 48 | lower_check = noise >= lower 49 | upper_check = noise <= upper 50 | assert lower_check.all() 51 | assert upper_check.all() 52 | 53 | 54 | def test_int_ptb_bounds(): 55 | sess = utils.make_session("test.perturbation.integer.bounds") 56 | data = np.arange(10, dtype=np.int32).reshape((10, 1)) 57 | test_df = pd.DataFrame(data, columns=["data"]) 58 | lower, upper = -3, 3 59 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Integer, lower, upper) 60 | result = result_df.values 61 | assert result.dtype == data.dtype 62 | noise = result - data 63 | lower_check = noise >= lower 64 | upper_check = noise <= upper 65 | assert lower_check.all() 66 | assert upper_check.all() 67 | 68 | 69 | def test_byte_ptb_bounds(): 70 | sess = utils.make_session("test.perturbation.byte.bounds") 71 | data = np.arange(10, dtype=np.int8).reshape((10, 1)) 72 | test_df = pd.DataFrame(data, columns=["data"]) 73 | lower, upper = -3, 3 74 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Byte, lower, upper) 75 | result = result_df.values 76 | assert result.dtype == data.dtype 77 | noise = result - data 78 | lower_check = noise >= lower 79 | upper_check = noise <= upper 80 | assert lower_check.all() 81 | assert upper_check.all() 82 | 83 | 84 | def test_short_ptb_bounds(): 85 | sess = utils.make_session("test.perturbation.short.bounds") 86 | data = np.arange(10, dtype=np.int16).reshape((10, 1)) 87 | test_df = pd.DataFrame(data, columns=["data"]) 88 | lower, upper = -3, 3 89 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Short, lower, upper) 90 | result = result_df.values 91 | assert result.dtype == data.dtype 92 | noise = result - data 93 | lower_check = noise >= lower 94 | upper_check = noise <= upper 95 | assert lower_check.all() 96 | assert upper_check.all() 97 | 98 | 99 | def test_integer_ptb_bounds(): 100 | sess = utils.make_session("test.perturbation.integer.bounds") 101 | data = np.arange(10, dtype=np.int32).reshape((10, 1)) 102 | test_df = pd.DataFrame(data, columns=["data"]) 103 | lower, upper = -3, 3 104 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Integer, lower, upper) 105 | result = result_df.values 106 | assert result.dtype == data.dtype 107 | noise = result - data 108 | lower_check = noise >= lower 109 | upper_check = noise <= upper 110 | assert lower_check.all() 111 | assert upper_check.all() 112 | 113 | 114 | def test_long_ptb_bounds(): 115 | sess = utils.make_session("test.perturbation.long.bounds") 116 | data = np.arange(10, dtype=np.int64).reshape((10, 1)) 117 | test_df = pd.DataFrame(data, columns=["data"]) 118 | lower, upper = -3, 3 119 | result_df = _make_and_apply_numeric_ptb(sess, test_df, dtypes.Long, lower, upper) 120 | result = result_df.values 121 | assert result.dtype == data.dtype 122 | noise = result - data 123 | lower_check = noise >= lower 124 | upper_check = noise <= upper 125 | assert 
lower_check.all() 126 | assert upper_check.all() 127 | 128 | 129 | def test_date_perturbation_singlefreq_bounds(): 130 | sess = utils.make_session("test.perturbation.date.bounds.singleFrequency") 131 | data = pd.to_datetime(["1997-03-15", "2020-06-24"]) 132 | test_df = pd.DataFrame(data, columns=["data"]) 133 | frequencies = ["YEAR", "MONTH", "DAY"] 134 | num_days = [365, 30, 1] 135 | lower, upper = -2, 2 136 | for freq, days in zip(frequencies, num_days): 137 | result_df = _make_and_apply_date_ptb(sess, test_df, freq, lower, upper) 138 | result_df = result_df.apply(pd.to_datetime) 139 | noise_df = result_df - test_df 140 | lower_check = noise_df >= pd.to_timedelta(lower * days, unit="days") 141 | upper_check = noise_df <= pd.to_timedelta(upper * days, unit="days") 142 | assert lower_check.values.all() 143 | assert upper_check.values.all() 144 | 145 | 146 | def test_date_perturbation_multifreq_bounds(): 147 | sess = utils.make_session("test.perturbation.date.bounds.singleFrequency") 148 | data = pd.to_datetime(["1997-03-15", "2020-06-24"]) 149 | test_df = pd.DataFrame(data, columns=["data"]) 150 | frequency = ("MONTH", "DAY") 151 | lower, upper = (-1, -30), (1, 30) 152 | result_df = _make_and_apply_date_ptb(sess, test_df, frequency, lower, upper) 153 | result_df = result_df.apply(pd.to_datetime) 154 | noise_df = result_df - test_df 155 | lower_check = noise_df >= pd.to_timedelta(-60, unit="days") 156 | upper_check = noise_df <= pd.to_timedelta(60, unit="days") 157 | assert lower_check.values.all() 158 | assert upper_check.values.all() 159 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/redaction.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pyspark import sql 4 | 5 | 6 | class ColumnRedact: 7 | """Redacts columns from a Spark dataframe. 8 | 9 | Attributes: 10 | columns: Which columns are redacted. 11 | """ 12 | 13 | identifier = "column-redact" 14 | type_signature = "df->df" 15 | 16 | def __init__(self, columns: List[str]): 17 | self.columns = columns 18 | 19 | def __call__(self, df: sql.DataFrame) -> sql.DataFrame: 20 | return df.drop(*self.columns) 21 | 22 | 23 | class RowRedact: 24 | """Redacts rows satisfying some condition from a Spark DataFrame. 25 | 26 | Attributes: 27 | condition: When this condition evaluates to True for a row, that row 28 | will be dropped. 
29 | """ 30 | 31 | identifier = "row-redact" 32 | type_signature = "df->df" 33 | 34 | def __init__(self, condition: str): 35 | self.condition = condition 36 | 37 | def __call__(self, df: sql.DataFrame) -> sql.DataFrame: 38 | cond = f"NOT {self.condition}" 39 | return df.filter(cond) 40 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/redaction_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.testing as pdt 4 | 5 | from cape_dataframes.spark import utils 6 | from cape_dataframes.spark.transformations import redaction as rdc 7 | 8 | 9 | def test_column_redact(): 10 | sess = utils.make_session("test.redaction.column") 11 | df = pd.DataFrame(np.ones((5, 3)), columns=["a", "b", "c"]) 12 | expected = pd.DataFrame(np.ones((5,)), columns=["a"]) 13 | test_df = sess.createDataFrame(df, schema=["a", "b", "c"]) 14 | redact = rdc.ColumnRedact(["b", "c"]) 15 | result = redact(test_df).toPandas() 16 | pdt.assert_frame_equal(result, expected) 17 | 18 | 19 | def test_row_redact(): 20 | sess = utils.make_session("test.redaction.row") 21 | df = pd.DataFrame(np.ones((5, 2)), columns=["a", "b"]) 22 | df["a"].iloc[0] = 6 23 | df["a"].iloc[3] = 6 24 | expected = pd.DataFrame(np.ones((3, 2)), columns=["a", "b"]) 25 | test_df = sess.createDataFrame(df, schema=["a", "b"]) 26 | redact = rdc.RowRedact("a > 5") 27 | result = redact(test_df).toPandas() 28 | pdt.assert_frame_equal(result, expected) 29 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/rounding.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | from cape_dataframes.spark import dtypes 5 | from cape_dataframes.spark.transformations import base 6 | from cape_dataframes.utils import typecheck 7 | 8 | 9 | class NumericRounding(base.Transformation): 10 | """Reduce the precision of a numeric series 11 | 12 | Round each value in the series to the given number 13 | of digits. 14 | 15 | Attributes: 16 | dtypes (dtypes.Numerics): series type. 17 | precision (int): set the number of digits. 18 | """ 19 | 20 | identifier = "numeric-rounding" 21 | type_signature = "col->col" 22 | 23 | def __init__(self, dtype: dtypes.DType, precision: int): 24 | if dtype not in dtypes.Numerics: 25 | raise ValueError("NumericRounding requires a Numeric dtype.") 26 | typecheck.check_arg(precision, int) 27 | super().__init__(dtype) 28 | self._precision = precision 29 | 30 | def __call__(self, x: sql.Column): 31 | return functions.round(x, scale=self._precision) 32 | 33 | 34 | class DateTruncation(base.Transformation): 35 | """Reduce the precision of a date series 36 | 37 | Truncate each date in a series to the unit (year or month) 38 | specified by frequency. 
39 | 40 | Attributes: 41 | frequency (string): expect to be 'year' or 'month' 42 | """ 43 | 44 | identifier = "date-truncation" 45 | type_signature = "col->col" 46 | 47 | def __init__(self, frequency: str): 48 | typecheck.check_arg(frequency, str) 49 | super().__init__(dtypes.Date) 50 | self._frequency = frequency.lower() 51 | 52 | def __call__(self, x: sql.Column): 53 | truncated = functions.date_trunc(self._frequency, x) 54 | return truncated.astype(self.dtype) 55 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/rounding_test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pyspark.sql import functions 6 | 7 | from cape_dataframes.spark import dtypes 8 | from cape_dataframes.spark import utils 9 | from cape_dataframes.spark.transformations import rounding as rnd 10 | 11 | 12 | # Utils 13 | def _make_and_apply_rounder(sess, df, dtype, precision): 14 | df = sess.createDataFrame(df, schema=["data"]) 15 | rounder = rnd.NumericRounding(dtype, precision) 16 | result_df = df.select(rounder(functions.col("data"))) 17 | return result_df.toPandas() 18 | 19 | 20 | def _make_float_data(dtype, precision=0, scale=0.1): 21 | data = np.arange(6, dtype=dtype).reshape((6, 1)) 22 | delta = data * scale 23 | expected = np.around(data + delta, decimals=precision) 24 | test_df = pd.DataFrame(data + delta, columns=["data"]) 25 | return test_df, expected 26 | 27 | 28 | def _make_integer_data(dtype, precision): 29 | data = np.array([123, 1234, 12345, 123456], dtype=dtype).reshape((4, 1)) 30 | expected = np.around(data, precision) 31 | test_df = pd.DataFrame(data, columns=["data"]) 32 | return test_df, expected 33 | 34 | 35 | def _make_date_data(sess): 36 | df = sess.createDataFrame([("1997-02-28",)], ["data"]) 37 | expected = np.array(datetime.date(1997, 2, 1)) 38 | return df, expected 39 | 40 | 41 | def _make_datetime_data(sess): 42 | df = sess.createDataFrame([("1997-02-28 05:02:11",)], ["data"]) 43 | expected = np.array(datetime.datetime(1997, 2, 1, 0, 0, 0)) 44 | return df, expected 45 | 46 | 47 | # Tests 48 | def test_rounding_float(): 49 | precision = 0 50 | sess = utils.make_session("test.rounding.float") 51 | test_df, expected = _make_float_data(np.float32, precision) 52 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Float, precision) 53 | result = result_df.values 54 | assert result.dtype == expected.dtype 55 | np.testing.assert_almost_equal(result, expected) 56 | 57 | 58 | def test_rounding_double(): 59 | precision = 0 60 | sess = utils.make_session("test.rounding.double") 61 | test_df, expected = _make_float_data(np.float64, precision) 62 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Double, precision) 63 | result = result_df.values 64 | assert result.dtype == expected.dtype 65 | np.testing.assert_almost_equal(result, expected) 66 | 67 | 68 | def test_rounding_integer(): 69 | precision = -2 70 | sess = utils.make_session("test.rounding.integer") 71 | test_df, expected = _make_integer_data(np.int32, precision) 72 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Integer, precision) 73 | result = result_df.values 74 | assert result.dtype == expected.dtype 75 | np.testing.assert_almost_equal(result, expected) 76 | 77 | 78 | def test_rounding_long(): 79 | precision = -2 80 | sess = utils.make_session("test.rounding.integer") 81 | test_df, expected = _make_integer_data(np.int64, 
precision) 82 | result_df = _make_and_apply_rounder(sess, test_df, dtypes.Long, precision) 83 | result = result_df.values 84 | assert result.dtype == expected.dtype 85 | np.testing.assert_almost_equal(result, expected) 86 | 87 | 88 | def test_truncate_date(): 89 | sess = utils.make_session("test.truncation.date") 90 | test_df, expected = _make_date_data(sess) 91 | truncate = rnd.DateTruncation("month") 92 | result_df = test_df.select(truncate(test_df.data)).toPandas() 93 | result = result_df.values 94 | assert result.dtype == expected.dtype 95 | np.testing.assert_equal(result, expected) 96 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/tokenizer.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import secrets 3 | 4 | import pandas as pd 5 | from Crypto.Cipher import AES 6 | from pyspark.sql import functions 7 | 8 | from cape_dataframes.spark import dtypes 9 | from cape_dataframes.spark.transformations import base 10 | from cape_dataframes.utils import typecheck 11 | 12 | 13 | class Tokenizer(base.Transformation): 14 | """Tokenizer: map a string to a token to obfuscate it. 15 | 16 | When applying the tokenizer to a Spark series of type string, 17 | each value gets mapped to a token (hexadecimal string). 18 | If a value is repeated several times across the series, it always 19 | get mapped to the same token in order to maintain the count. 20 | A value can be mapped to different tokens by setting the key to a 21 | different value. 22 | 23 | Attributes: 24 | max_token_len (int or bytes): control the token length (default 25 | length is 64) 26 | key: expect a string or byte string. if not specified, key will 27 | be set to a random byte string. 28 | """ 29 | 30 | identifier = "tokenizer" 31 | type_signature = "col->col" 32 | 33 | def __init__(self, max_token_len=None, key=None): 34 | typecheck.check_arg(max_token_len, (int, type(None))) 35 | typecheck.check_arg(key, (str, bytes, type(None))) 36 | super().__init__(dtypes.String) 37 | self._max_token_len = max_token_len 38 | if isinstance(key, str): 39 | key = key.encode() 40 | self._key = key or secrets.token_bytes(8) 41 | self._tokenize = None 42 | 43 | def __call__(self, x): 44 | if self._tokenize is None: 45 | self._tokenize = self._make_tokenize_udf() 46 | return self._tokenize(x) 47 | 48 | def _make_tokenize_udf(self): 49 | @functions.pandas_udf(dtypes.String) 50 | def to_token(x: pd.Series) -> pd.Series: 51 | return x.map(self._to_token) 52 | 53 | return to_token 54 | 55 | def _to_token(self, x: str): 56 | token = hashlib.sha256(x.encode() + self.key).hexdigest() 57 | if self._max_token_len is None: 58 | return token 59 | return token[: self._max_token_len] 60 | 61 | @property 62 | def key(self): 63 | return self._key 64 | 65 | 66 | class ReversibleTokenizer(base.Transformation): 67 | """ReversibleTokenizer: map a string to a token to obfuscate it. 68 | 69 | When applying the Tokenizer to a Spark series of type string, 70 | each value gets mapped to a token (hexadecimal string). 71 | If a value is repeated several times across the series, it always 72 | get mapped to the same token in order to maintain the count. 73 | A value can be mapped to different tokens by setting the key to a 74 | different value. 75 | 76 | This tokenizer allows tokens to be reversed to their original data 77 | when the secret key is known. 78 | 79 | Attributes: 80 | key: expect a string or byte string of length exactly 32 bytes. 
81 | encoding: string identifying the Python encoding used for inputs. 82 | """ 83 | 84 | identifier = "reversible-tokenizer" 85 | type_signature = "col->col" 86 | 87 | def __init__(self, key, encoding="utf-8"): 88 | typecheck.check_arg(key, (str, bytes)) 89 | typecheck.check_arg(encoding, str) 90 | super().__init__(dtype=dtypes.String) 91 | if isinstance(key, str): 92 | key = key.encode() 93 | if len(key) != 32: 94 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 95 | self.key = key 96 | self.encoding = encoding 97 | 98 | def __call__(self, series): 99 | @functions.pandas_udf(dtypes.String) 100 | def to_token(series: pd.Series) -> pd.Series: 101 | return series.map(self._to_token) 102 | 103 | return to_token(series) 104 | 105 | def _to_token(self, x: str): 106 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 107 | ciphertext, tag = cipher.encrypt_and_digest(x.encode(encoding=self.encoding)) 108 | assert len(tag) == 16, len(tag) 109 | token = tag.hex() + ciphertext.hex() 110 | return token 111 | 112 | 113 | class TokenReverser(base.Transformation): 114 | """TokenReverser: recover string from token. 115 | 116 | When applying the TokenReverser to a Spark series of tokens, 117 | each token is mapped back to the string that was originally used 118 | by ReversibleTokenizer to construct the token. The same key must 119 | be used. 120 | 121 | Attributes: 122 | key: expect a string or byte string of length exactly 32 bytes. 123 | encoding: string identifying the Python encoding used for outputs. 124 | """ 125 | 126 | identifier = "token-reverser" 127 | type_signature = "col->col" 128 | 129 | def __init__(self, key, encoding="utf-8"): 130 | typecheck.check_arg(key, (str, bytes)) 131 | typecheck.check_arg(encoding, str) 132 | super().__init__(dtype=dtypes.String) 133 | if isinstance(key, str): 134 | key = key.encode() 135 | if len(key) != 32: 136 | raise ValueError(f"Key must be exactly 32 bytes, got {len(key)}") 137 | self.key = key 138 | self.encoding = encoding 139 | 140 | def __call__(self, series) -> pd.Series: 141 | @functions.pandas_udf(dtypes.String) 142 | def from_token(series: pd.Series) -> pd.Series: 143 | return series.map(self._from_token) 144 | 145 | return from_token(series) 146 | 147 | def _from_token(self, token: str): 148 | cipher = AES.new(key=self.key, mode=AES.MODE_SIV) 149 | token_bytes = bytearray.fromhex(token) 150 | tag, ciphertext = token_bytes[:16], token_bytes[16:] 151 | x = cipher.decrypt_and_verify(ciphertext, tag) 152 | return x.decode(encoding=self.encoding) 153 | -------------------------------------------------------------------------------- /cape_dataframes/spark/transformations/tokenizer_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pandas.testing as pdt 3 | import pytest 4 | from pyspark.sql import functions 5 | 6 | from cape_dataframes.spark import utils 7 | from cape_dataframes.spark.transformations import tokenizer as tkn 8 | 9 | 10 | def _apply_tokenizer(sess, df, tokenizer, col_to_rename): 11 | df = sess.createDataFrame(df, schema=["name"]) 12 | result_df = df.select(tokenizer(functions.col("name"))) 13 | return result_df.withColumnRenamed(col_to_rename, "name").toPandas() 14 | 15 | 16 | def test_tokenizer_simple(): 17 | sess = utils.make_session("test.tokenizer.simple") 18 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 19 | expected = pd.DataFrame( 20 | { 21 | "name": [ 22 | "70a4b1a987767abf36463cd3e3f2b37144132e572fbb9b39f28bcaafe10d9b24", 23 | 
"dd4532a296deb4f114b1e7e88faefe4fb2b32c559ac15a8c6bcbdbcbc2aa4d4b", 24 | ] 25 | } 26 | ) 27 | key = "secret_key" 28 | df = _apply_tokenizer( 29 | sess, 30 | test_df, 31 | tkn.Tokenizer(max_token_len=None, key=key), 32 | col_to_rename="to_token(name)", 33 | ) 34 | pdt.assert_frame_equal(df, expected) 35 | 36 | 37 | def test_tokenizer_is_linkable(): 38 | sess = utils.make_session("test.tokenizer.isLinkable") 39 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 40 | key1 = "secret_key" 41 | key2 = "secret_key" 42 | df1 = _apply_tokenizer( 43 | sess, 44 | test_df, 45 | tkn.Tokenizer(max_token_len=None, key=key1), 46 | col_to_rename="to_token(name)", 47 | ) 48 | df2 = _apply_tokenizer( 49 | sess, 50 | test_df, 51 | tkn.Tokenizer(max_token_len=None, key=key2), 52 | col_to_rename="to_token(name)", 53 | ) 54 | pdt.assert_frame_equal(df1, df2) 55 | 56 | 57 | def test_tokenizer_is_not_linkable(): 58 | sess = utils.make_session("test.tokenizer.isNotLinkable") 59 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 60 | key1 = "secret_key" 61 | key2 = "not_your_secret_key" 62 | df1 = _apply_tokenizer( 63 | sess, 64 | test_df, 65 | tkn.Tokenizer(max_token_len=None, key=key1), 66 | col_to_rename="to_token(name)", 67 | ) 68 | df2 = _apply_tokenizer( 69 | sess, 70 | test_df, 71 | tkn.Tokenizer(max_token_len=None, key=key2), 72 | col_to_rename="to_token(name)", 73 | ) 74 | try: 75 | pdt.assert_frame_equal(df1, df2) 76 | raise NotImplemented # noqa: F901 77 | except AssertionError: 78 | pass 79 | except NotImplemented: 80 | raise AssertionError 81 | 82 | 83 | def test_tokenizer_with_max_token_len(): 84 | sess = utils.make_session("test.tokenizer.maxTokenLen") 85 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 86 | expected = pd.DataFrame({"name": ["70a4b1a987", "dd4532a296"]}) 87 | max_token_len = 10 88 | key = "secret_key" 89 | df = _apply_tokenizer( 90 | sess, 91 | test_df, 92 | tkn.Tokenizer(max_token_len=max_token_len, key=key), 93 | col_to_rename="to_token(name)", 94 | ) 95 | pdt.assert_frame_equal(df, expected) 96 | 97 | 98 | def test_tokenizer_no_key(): 99 | sess = utils.make_session("test.tokenizer.maxTokenLen") 100 | test_df = pd.DataFrame({"name": ["Alice", "Bob"]}) 101 | _apply_tokenizer( 102 | sess, 103 | test_df, 104 | tkn.Tokenizer(max_token_len=None, key=None), 105 | col_to_rename="to_token(name)", 106 | ) 107 | 108 | 109 | def test_reversible_tokenizer(): 110 | sess = utils.make_session("test.tokenizer.reversibleTokenizer") 111 | key = b"5" * 32 112 | plaintext = pd.DataFrame({"name": ["Alice", "Bob"]}) 113 | 114 | tokenized = _apply_tokenizer( 115 | sess, 116 | plaintext, 117 | tkn.ReversibleTokenizer(key=key), 118 | col_to_rename="to_token(name)", 119 | ) 120 | tokenized_expected = pd.DataFrame( 121 | { 122 | "name": [ 123 | "c8c7e80144304276183e5bcd589db782bc5ff95309", 124 | "e0f40aea0d5c21b35967c4231b98b5b3e5338e", 125 | ] 126 | } 127 | ) 128 | pdt.assert_frame_equal(tokenized, tokenized_expected) 129 | 130 | recovered = _apply_tokenizer( 131 | sess, 132 | tokenized, 133 | tkn.TokenReverser(key=key), 134 | col_to_rename="from_token(name)", 135 | ) 136 | pdt.assert_frame_equal(recovered, plaintext) 137 | 138 | 139 | def test_reversible_tokenizer_string_key(): 140 | _ = tkn.ReversibleTokenizer(key="5" * 32) 141 | 142 | 143 | def test_reversible_tokenizer_insufficient_key(): 144 | with pytest.raises(ValueError): 145 | _ = tkn.ReversibleTokenizer(key=b"5" * 10) 146 | -------------------------------------------------------------------------------- 
/cape_dataframes/spark/transformer.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | from cape_dataframes.spark.transformations import base as tfm 5 | 6 | 7 | def transformer(transformation: tfm.Transformation, df: sql.DataFrame, field_name: str): 8 | field_column = functions.col(field_name) 9 | return df.withColumn(field_name, transformation(field_column)) 10 | -------------------------------------------------------------------------------- /cape_dataframes/spark/utils.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from packaging import version 3 | from pyspark import sql 4 | 5 | _3_0_0_VERSION = version.Version("3.0.0") 6 | _spark_version = version.parse(pyspark.__version__) 7 | 8 | 9 | def configure_session(sess: sql.SparkSession, arrow=True): 10 | if arrow: 11 | if _spark_version >= _3_0_0_VERSION: 12 | sess.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 13 | else: 14 | sess.conf.set("spark.sql.execution.arrow.enabled", "true") 15 | return sess 16 | 17 | 18 | def make_session(name: str, arrow: bool = True): 19 | sess_builder = sql.SparkSession.builder 20 | sess_builder = sess_builder.appName(name) 21 | sess = sess_builder.getOrCreate() 22 | sess = configure_session(sess, arrow=arrow) 23 | return sess 24 | -------------------------------------------------------------------------------- /cape_dataframes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capeprivacy/cape-dataframes/ed65cece5caebcce1ac549573514834effab5ecd/cape_dataframes/utils/__init__.py -------------------------------------------------------------------------------- /cape_dataframes/utils/base64.py: -------------------------------------------------------------------------------- 1 | from base64 import urlsafe_b64decode 2 | from base64 import urlsafe_b64encode 3 | from typing import Union 4 | 5 | 6 | # This implements a similar wrapped as cape has in golang. 7 | # It stores the bytes and converts it to encoded string as needed. 8 | # The python base64 package appends padding when encoding but 9 | # in cape this causes errors to occur so must strip that off 10 | # before sending. 11 | class Base64: 12 | def __init__(self, value: Union[str, bytes]): 13 | self.bytes = value 14 | if type(value) == str: 15 | self.bytes = bytes(value, "utf-8") 16 | 17 | def __bytes__(self) -> bytes: 18 | return self.bytes 19 | 20 | # returns the base64 encoded value as a string 21 | def __str__(self) -> str: 22 | b = urlsafe_b64encode(self.bytes) 23 | b = b.strip(b"==") 24 | 25 | return str(b, "utf-8") 26 | 27 | 28 | # Returns a Base64 object from the base64 encoded string. 29 | # Adds padding when decoding so that it doesn't error. 
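# Example (illustrative, consistent with base64_test.py):
#   str(Base64("heythere")) == "aGV5dGhlcmU"   # padding stripped on encode
#   str(from_string("ABCD")) == "ABCD"          # padding re-added on decode, so round-trips cleanly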
30 | def from_string(s: str) -> Base64: 31 | b = urlsafe_b64decode(bytes(s, "utf-8") + b"==") 32 | return Base64(b) 33 | -------------------------------------------------------------------------------- /cape_dataframes/utils/base64_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils.base64 import Base64 2 | from cape_dataframes.utils.base64 import from_string 3 | 4 | 5 | def test_base64(): 6 | b64 = Base64("heythere") 7 | assert "aGV5dGhlcmU" == str(b64) 8 | 9 | 10 | def test_from_string(): 11 | s = "ABCD" 12 | b64 = from_string(s) 13 | 14 | assert s == str(b64) 15 | -------------------------------------------------------------------------------- /cape_dataframes/utils/typecheck.py: -------------------------------------------------------------------------------- 1 | def check_arg(arg, types): 2 | if not isinstance(arg, types): 3 | if not isinstance(types, (tuple, list)): 4 | types = (types,) 5 | raise ValueError("Expected one of {}, got {}.".format(types, type(arg))) 6 | -------------------------------------------------------------------------------- /cape_dataframes/utils/typecheck_test.py: -------------------------------------------------------------------------------- 1 | from cape_dataframes.utils import typecheck 2 | 3 | 4 | def _make_args_and_types(): 5 | string = "hi" 6 | integer = 4 7 | flt = 2.0 8 | lst = [string, integer, flt] 9 | tpl = (string, integer, float) 10 | args_list = (string, integer, flt, lst, tpl, None) 11 | types_list = (str, int, float, list, tuple, type(None)) 12 | return args_list, types_list 13 | 14 | 15 | def test_typecheck_args(): 16 | args_list, types_list = _make_args_and_types() 17 | # check passing 18 | for a, t in zip(args_list, types_list): 19 | typecheck.check_arg(a, t) 20 | # check failure 21 | for a, t in zip(args_list, types_list[::-1]): 22 | try: 23 | typecheck.check_arg(a, t) 24 | raise AssertionError 25 | except ValueError: 26 | pass 27 | 28 | 29 | def test_typecheck_more_types_passes(): 30 | args_list, types_list = _make_args_and_types() 31 | args = args_list[:3] 32 | types = types_list[:3] 33 | for arg in args: 34 | typecheck.check_arg(arg, types) 35 | 36 | 37 | def test_typecheck_more_types_fails(): 38 | args_list, types_list = _make_args_and_types() 39 | arg = args_list[-1] 40 | types = types_list[:3] 41 | try: 42 | typecheck.check_arg(arg, types) 43 | raise AssertionError 44 | except ValueError: 45 | pass 46 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 90% 6 | threshold: 1% 7 | base: auto 8 | patch: 9 | default: 10 | target: 93% 11 | threshold: 5% 12 | base: auto -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes overview 2 | 3 | Cape Dataframes allows you to write data privacy policies and data transformations to integrate with [Pandas](https://pandas.pydata.org/) and [Spark](https://spark.apache.org/). 4 | 5 | You can view the source code in the [Cape Dataframes GitHub Repository](https://github.com/capeprivacy/cape-dataframes). 6 | 7 | ## Use cases 8 | 9 | Review the [transformations](./transformations) and decide which are a good fit for your data science needs. 
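For example, a policy aimed at exploratory analysis could pair tokenization on an identifier column with rounding on a numeric column (a sketch; the column names and key below are placeholders):

```yaml
label: eda_policy
version: 1
rules:
  - match:
      name: user_id        # illustrative column name
    actions:
      - transform:
          type: tokenizer
          max_token_len: 10
          key: "change me"
  - match:
      name: weight         # illustrative column name
    actions:
      - transform:
          type: numeric-rounding
          dtype: Double
          precision: 1
```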
10 | 11 | The 0.1.0 release includes five transformations that provide some common privacy protections. 12 | 13 | | Use case | Text data | Numeric data | Inconsistent data 14 | | ------------- | ------------- | --------------- | ----------- 15 | | EDA | Tokenization | Rounding or pertubation | Tokenization 16 | | Analytics | Tokenization | Rounding or pertubation | - 17 | | ML development | - | Rounding or pertubation | Tokenization 18 | | ML training/serving | No transformation | No transformation | No transformation 19 | 20 | Cape Dataframes will support more use cases through additional transformations in future releases. 21 | -------------------------------------------------------------------------------- /docs/policies.md: -------------------------------------------------------------------------------- 1 | # Policies 2 | 3 | The data policy defines the data you want to change, and the [transformations](./transformations) or [redactions](./redactions) you want to apply. 4 | 5 | Cape Dataframes requires data policies in YAML format. This example describes all the available YAML objects: 6 | 7 | ``` yaml 8 | # Required. The policy name. 9 | label: test_policy 10 | # Required. The Cape Dataframes specification version. Must be 1. 11 | version: 1 12 | # Configure your named transformations. 13 | # Named transformations allow you to reuse a transformation 14 | # with a set value throughout your policy. 15 | transformations: 16 | # This named transformation uses the built-in tokenizer transformation 17 | - name: my_tokenizer 18 | type: tokenizer 19 | max_token_len: 10 20 | key: "my secret" 21 | rules: 22 | # Required. The column name. 23 | - match: 24 | name: fruit 25 | actions: 26 | # This example shows a named transformation. 27 | # It tells the policy runner to apply the my_tokenizer transformation 28 | # to all fields in the "fruit" column. 29 | - transform: 30 | name: my_tokenizer 31 | - match: 32 | name: weight 33 | actions: 34 | - transform: 35 | # This example shows an unnamed transformation. 36 | # It tells the policy runner to: 37 | # (1) Apply the transformation numeric-rounding 38 | # (2) Round to one decimal place 39 | type: numeric-rounding 40 | dtype: Double 41 | precision: 1 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Cape Dataframes API 2 | 3 | This guide provides an example of using Cape Dataframes with either Pandas or Spark. 4 | 5 | ## Prerequisites 6 | 7 | * Python 3.6 or above. 8 | * Cape Dataframes recommends using a virtual environment such as [venv](https://docs.python.org/3/library/venv.html). 9 | 10 | 11 | ## Installation 12 | 13 | You can install Cape Dataframes with pip: 14 | 15 | ```shell 16 | pip install cape-privacy 17 | ``` 18 | 19 | ## Quickstart 20 | 21 | ### Write the policy 22 | 23 | The data policy file defines the target data and permissions. It is written in YAML. Cape Dataframes reads the `.yaml` policy file and applies the policies based on your [policy application script](#write-the-policy-application-script). 24 | 25 | Create a `test-policy.yaml` file in your project, with the following content: 26 | 27 | ```yaml 28 | label: test-policy 29 | version: 1 30 | rules: 31 | # Set the column name 32 | - match: 33 | name: weight 34 | actions: 35 | - transform: 36 | # This example shows an unnamed transformation. 
37 | # It tells the policy runner to: 38 | # (1) Apply the transformation numeric-rounding 39 | # (2) Round to one decimal place 40 | type: numeric-rounding 41 | dtype: Double 42 | precision: 1 43 | ``` 44 | 45 | 46 | ### Write the policy application script 47 | 48 | To apply the policy `.yaml` to your data, you must run a script that defines which policy you apply to which data target. 49 | 50 | Create a `test-transformation.py` file in your project, with the following content: 51 | 52 | 53 | === "Pandas" 54 | ```python 55 | import cape_dataframes as cape_df 56 | import pandas as pd 57 | 58 | # Create a simple Pandas DataFrame 59 | df = pd.DataFrame([114.432, 134.622, 142.984], columns=["weight"]) 60 | # Load the privacy policy 61 | policy = cape_df.parse_policy("test-policy.yaml") 62 | # Apply the policy to the DataFrame 63 | df = cape_df.apply_policy(policy, df, inplace=False) 64 | # Output the altered data 65 | print(df.head()) 66 | ``` 67 | 68 | === "Spark" 69 | ```python 70 | import cape_dataframes as cape_df 71 | from pyspark import sql 72 | 73 | sess_builder = sql.SparkSession.builder 74 | sess_builder = sess_builder.appName('cape.examples.rounding') 75 | sess_builder = sess_builder.config('spark.sql.execution.arrow.enabled', 'true') 76 | sess = sess_builder.getOrCreate() 77 | 78 | # Create a simple Spark DataFrame 79 | df = sess.createDataFrame([114.432, 134.622, 142.984], "double").toDF("weight") 80 | # Load the privacy policy 81 | policy = cape_df.parse_policy("test-policy.yaml") 82 | # Apply the policy to the DataFrame 83 | df = cape_df.apply_policy(policy, df, inplace=False) 84 | # Output the altered data 85 | print(df.show()) 86 | ``` 87 | 88 | 89 | ### Run your transformations 90 | 91 | The quickstart example creates a dataset programatically, so you can run the policy application script and view the output: 92 | 93 | ```shell 94 | python test-transformation.py 95 | ``` 96 | 97 | 98 | ### Usage Best Practices 99 | 100 | * Ensure that you have your data collected and joined before applying transformations, especially in the case of multiple sensitive columns. 101 | * Some transformations require sensitive data to be contained in the policy files. For this reason, keep your policy files stored securely. In a future release, we will support pulling transformation keys from key storage software, such as Hashicorp Vault. 102 | * Consider using transformations as the final step in your pre-processing before creating a "clean sink" or "safe dataset". This means that you can begin your work on that clean dataset. 103 | * Experiment with the transformations directly on your data to learn how they impact your data utility. Figure out the right utility vs. privacy tradeoff for the task at hand, and amend your policy accordingly. 104 | -------------------------------------------------------------------------------- /docs/redactions.md: -------------------------------------------------------------------------------- 1 | # Redactions 2 | 3 | Redactions involve dropping the matched data. Unlike [transformations](./transformations), which modify but preserve data, redactions will change the shape of your dataframes. 4 | 5 | Cape Dataframes has one built-in redaction function. This document describes what it does, and provides an example of how to use it in your policy. 6 | 7 | !!! warning 8 | Redactions change the shape of your data. 9 | 10 | ## Column redaction 11 | 12 | The `column-redact` redaction deletes matching columns. 
13 | 14 | ```yaml 15 | - transform: 16 | type: "column-redact" 17 | # Replace with the column name you want to redact. 18 | columns: [""] 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /docs/transformations.md: -------------------------------------------------------------------------------- 1 | # Transformations 2 | 3 | Transformations are functions that alter your data, ensuring it is free of sensitive information. 4 | 5 | Cape Dataframes has five built-in transformation functions. This document describes what they do, and provides an example of how to use each transformation in your policy. 6 | 7 | ## Date perturbation 8 | 9 | The `date-perturbation` transformation adds random noise to dates. The amount of noise depends on the `min` and `max` values that you set in the policy. 10 | 11 | ``` yaml 12 | - transform: 13 | type: date-pertubation 14 | frequency: 15 | min: 16 | max: 17 | # Optional. The base number to initialize the random number generator. 18 | # Pandas only (Spark does not currently support seeding) 19 | seed: 20 | ``` 21 | 22 | 23 | ## Date truncation 24 | 25 | The `date-truncation` transformation shortens dates to a unit (year or month). Set the unit in `frequency`. 26 | 27 | ``` yaml 28 | - transform: 29 | type: date-truncation 30 | frequency: 31 | ``` 32 | 33 | ## Numeric pertubation 34 | 35 | The `numeric-pertubation` transformation adds random noise to numeric data sets. The amount of noise depends on the `min` and `max` values that you set in the policy. 36 | 37 | ``` yaml 38 | - transform: 39 | type: numeric-pertubation 40 | dtype: 41 | min: 42 | max: 43 | # Optional. The base number to initialize the random number generator. 44 | seed: 45 | ``` 46 | 47 | ## Numeric rounding 48 | 49 | The `numeric-rounding` transformation rounds numeric values to a given number of decimal places. Use `precision` to set the number of decimal places. 50 | 51 | ``` yaml 52 | - transform: 53 | type: numeric-rounding 54 | dtype: 55 | precision: 56 | ``` 57 | 58 | ## Tokenizer 59 | 60 | The `tokenizer` transformation maps a string to a token to obfuscate it. 61 | 62 | !!! warning 63 | Linkable tokenization for sensitive data is vulnerable to privacy attacks. Cape Privacy does not recommend sharing tokenized data with preserved linkability with untrusted or outside parties. Cape Python does not support anonymized transformations. 64 | 65 | ``` yaml 66 | - transform: 67 | type: tokenizer 68 | # Default is 64 69 | max_token_len: 70 | # If unspecified, Cape Dataframes uses a random byte string 71 | key: 72 | ``` 73 | 74 | ## ReversibleTokenizer 75 | 76 | The `ReversibleTokenizer` transformation maps a sting to a token to obfuscate it. However, when using the `ReversibleTokenizer`, the tokens can be reverted back to their plaintext form by using the `TokenReverser`. 77 | 78 | ```yaml 79 | - transform: 80 | type: reversible-tokenizer 81 | # If unspecified, Cape Dataframes uses a random byte string 82 | key: 83 | ``` 84 | 85 | ## TokenReverser 86 | 87 | The `TokenReverser` is designed to be used with the `ReversibleTokenizer`. The `TokenReverser` reverts tokens produced by the `ReversibleTokenizer` back to their plaintext form. 
88 | 89 | ```yaml 90 | - transform: 91 | type: token-reverser 92 | # If unspecified, Cape Dataframes uses a random byte string 93 | key: 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/tutorials/reversible-tokenization.md: -------------------------------------------------------------------------------- 1 | # Reversible Tokenizer 2 | 3 | Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data within a pandas dataframe. 4 | 5 | The `ReversibleTokenizer` will tokenize the input data so it can be used in a privacy preserving manner. 6 | 7 | The `ReversibleTokenizer` can be used in conjunction with the `TokenReverser` to recover the original data. 8 | 9 | ## Tokenizing Data 10 | 11 | The `ReversibleTokenizer` and `TokenReverser` classes can be found in the `pandas.transformations` package. 12 | 13 | ```python 14 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 15 | from cape_dataframes.pandas.transformations import TokenReverser 16 | ``` 17 | 18 | In this example, we will simply hide the names within our dataset. 19 | 20 | ```python 21 | import pandas as pd 22 | plaintext_data = pd.DataFrame({'name': ["Alice", "Bob", "Carol"], "# friends": [100, 200, 300]}) 23 | ``` 24 | 25 | You instantiate a `ReversibleTokenizer` by passing it a key. For the `TokenReverser` to be able to reverse the tokens produced by the `ReversibleTokenizer`, you must use the same key. 26 | 27 | ```python 28 | key=b"5" * 32 29 | tokenizer = ReversibleTokenizer(key=key) 30 | ``` 31 | 32 | ```python 33 | tokenized = pd.DataFrame(plaintext_data) 34 | tokenized["name"] = tokenizer(plaintext_data["name"]) 35 | ``` 36 | 37 | ## Recovering Tokens 38 | 39 | If we ever need to reveal the tokenized data, we can use the `TokenReverser` class. 40 | 41 | ```python 42 | reverser = TokenReverser(key=key) 43 | recovered = pd.DataFrame(tokenized) 44 | recovered["name"] = reverser(tokenized["name"]) 45 | ``` 46 | 47 | You can see full code for this example on [Github](https://github.com/capeprivacy/cape-dataframes/blob/master/examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.ipynb) 48 | -------------------------------------------------------------------------------- /examples/notebooks/Cape Policy for Spark - IoT Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exploring Cape Python Policy with Pandas and Cape Core\n", 8 | "\n", 9 | "This Jupyter Notebook is accompanied by our [Medium Post on Getting Started with Cape Core](https://medium.com/dropoutlabs/cape-core-privacy-and-data-science-working-together-d25a55526506). To follow along, you will need to [download the example dataset](https://capeprivacy.com/example-dataset/) and put it in a relative folder called `data` (or update the file path below). You will also need to [download the policy file](https://github.com/capeprivacy/cape-python/blob/master/examples/policy/iot_example_policy.yaml) and put it in a relative folder called `policy` or ensure you have Cape Python installed locally and change the path to use the copy in the `examples` folder.\n", 10 | "\n", 11 | "You will also need a local (or deployed version) of [Cape Core](https://github.com/capeprivacy/cape) running and have generated an API token to follow along." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "\n", 23 | "
\n", 24 | "

SparkSession - hive

\n", 25 | " \n", 26 | "
\n", 27 | "

SparkContext

\n", 28 | "\n", 29 | "

Spark UI

\n", 30 | "\n", 31 | "
\n", 32 | "
Version
\n", 33 | "
v3.0.0
\n", 34 | "
Master
\n", 35 | "
local[*]
\n", 36 | "
AppName
\n", 37 | "
PySparkShell
\n", 38 | "
\n", 39 | "
\n", 40 | " \n", 41 | "
\n", 42 | " " 43 | ], 44 | "text/plain": [ 45 | "" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "spark" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import cape_dataframes as cape_df" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df = spark.read.csv('../data/iot_example.csv', header=True)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 85 | "| timestamp| username|temperature|heartrate| build|latest| note|\n", 86 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 87 | "|2017-01-01T12:18:39| moonjuan| 26| 76|22989085-e6fe-eae...| 1| n/a|\n", 88 | "|2017-01-01T12:22:52| ylee| 29| 73|ff29e7ab-934f-f7b...| 0| test|\n", 89 | "|2017-01-01T12:32:20| alicecampos| 29| 76|547ed6d5-0e12-4c2...| 0| test|\n", 90 | "|2017-01-01T12:36:40| stevenmiller| 26| 64|e12b053c-d772-c94...| 0|update|\n", 91 | "|2017-01-01T12:40:26|robinsongabriel| 17| 80|f0bfb52c-b805-cd1...| 1| n/a|\n", 92 | "+-------------------+---------------+-----------+---------+--------------------+------+------+\n", 93 | "\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "df.sample(0.1).limit(5).show()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Privacy Concerns\n", 106 | "\n", 107 | "In this dataset which has mock data from wearable devices, we are concerned about the privacy of the individuals. It is a timeseries-based analysis, so we'd like to ensure we retain the ability to see the data of an individual change over time, but we want to provide some basic privacy protections for our exploratory data analysis and later model development activities.\n", 108 | "\n", 109 | "The following policy file provides these protections:\n", 110 | "\n", 111 | "- [Tokenization](https://docs.capeprivacy.com/libraries/cape-python/transformations/#tokenizer) of the username column with a maximum token length of 10 and a key defined in the file.\n", 112 | "- [Date Truncation](https://docs.capeprivacy.com/libraries/cape-python/transformations/#date-truncation) for the timestamp column - removing the minutes and seconds of the data but keeping the year, month, date and hour.\n", 113 | "- [Redaction](https://docs.capeprivacy.com/libraries/cape-python/redactions) of the build column, which reveals information about the device it was built on. In Cape, redaction involves dropping of the matching data so this will change the shape of your dataframes." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "label: iot_dataset_policy\r\n", 126 | "version: 1\r\n", 127 | "rules:\r\n", 128 | " - match:\r\n", 129 | " name: username\r\n", 130 | " actions:\r\n", 131 | " - transform:\r\n", 132 | " type: \"tokenizer\"\r\n", 133 | " max_token_len: 10\r\n", 134 | " key: \"Please change this :)\"\r\n", 135 | " - match:\r\n", 136 | " name: timestamp\r\n", 137 | " actions:\r\n", 138 | " - transform:\r\n", 139 | " type: \"date-truncation\"\r\n", 140 | " frequency: \"hour\"\r\n", 141 | " - match:\r\n", 142 | " name: build\r\n", 143 | " actions:\r\n", 144 | " - transform:\r\n", 145 | " type: \"column-redact\"\r\n", 146 | " columns: [\"build\"] \r\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "!cat ../policy/iot_example_policy.yaml" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### With Cape Core\n", 159 | "\n", 160 | "If you are using Cape Core and have a project setup and registered with the above policy as well as an API token, you can use the following code to download the policy from the Cape Coordinator." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "c = cape_df.Client(\"http://localhost:8080\")\n", 170 | "c.login(\"INSERT YOUR CAPE TOKEN HERE\")\n", 171 | "policy = c.get_policy(\"first-project\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Apply the parsed policy\n", 179 | "\n", 180 | "To apply the parsed policy, call the `apply_policy` function to your dataframe and sample the results." 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stderr", 190 | "output_type": "stream", 191 | "text": [ 192 | "/usr/local/spark/python/pyspark/sql/pandas/functions.py:386: UserWarning: In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.\n", 193 | " \"in the future releases. 
See SPARK-28264 for more details.\", UserWarning)\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "caped_df = cape_df.apply_policy(policy, df)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+----------+----------+-----------+---------+------+--------+\n", 211 | "| timestamp| username|temperature|heartrate|latest| note|\n", 212 | "+----------+----------+-----------+---------+------+--------+\n", 213 | "|2017-01-01|1763f4313b| 22| 83| 1| update|\n", 214 | "|2017-01-01|d0c44f5675| 12| 77| 0| wake|\n", 215 | "|2017-01-01|0a89db1e39| 12| 78| 1|interval|\n", 216 | "|2017-01-01|26594010f3| 29| 76| 0| test|\n", 217 | "|2017-01-01|37db75f0f1| 12| 71| 0| sleep|\n", 218 | "+----------+----------+-----------+---------+------+--------+\n", 219 | "\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "caped_df.sample(0.1).limit(5).show()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Send it to Sink\n", 232 | "\n", 233 | "Now it's time to send along our caped DataFrame to our clean sink or utilize it in a Spark task (for example, for analytics, EDA or machine learning). \n", 234 | "\n", 235 | "Note: You'll need to edit the database details below (or specify where you'd like the dataframe to be written." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "caped_df.write \\\n", 245 | " .format(\"jdbc\") \\\n", 246 | " .option(\"url\", \"jdbc:postgresql:dbserver\") \\\n", 247 | " .option(\"dbtable\", \"schema.tablename\") \\\n", 248 | " .option(\"user\", \"username\") \\\n", 249 | " .option(\"password\", \"password\") \\\n", 250 | " .save()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "cape-df", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | "codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.8.16" 278 | }, 279 | "vscode": { 280 | "interpreter": { 281 | "hash": "2c0eb9acd3ce9f628738cc91d7613a5d048e1a93f709104c9a35d77254cfaaac" 282 | } 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Jupyter Notebooks: Working Examples for Cape Python 2 | 3 | `cape-privacy` gives you the ability to apply several masking techniques (transformations) such as tokenization, perturbation, rounding, etc., in order to obfuscate personal information contained in your dataset. You can find out more by visiting [our documentation](https://docs.capeprivacy.com/libraries/cape-python/). 4 | 5 | ## Notebook Overview 6 | 7 | There are several posts related to these Jupyter notebooks to assist you in evaluating the code and privacy considerations. The related policy files can be found in the `policy` folder in this `examples` folder. 
You will find links to the datasets in the individual notebooks. Note that some datasets are reused, so you might need to download only once. 8 | 9 | ### Cape Core: Introduction to Collaborative Privacy and Security Policy 10 | 11 | This [overview of Cape Core software](https://medium.com/dropoutlabs/cape-core-privacy-and-data-science-working-together-d25a55526506) walks you through the use of Cape Core alongside Cape Privacy. It is a great way to get started with managing your transformations in a responsible way. Cape Core allows you to store policies centrally and coordinate them with your data science and machine learning peers. There are two notebooks related to this blog post: 12 | 13 | - [Cape Policy for Pandas - IoT Example](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Policy%20for%20Pandas%20-%20IoT%20Example.ipynb) 14 | - [Cape Policy for Spark - IoT Example](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Policy%20for%20Spark%20-%20IoT%20Example.ipynb) 15 | 16 | ### Coiled Science Thursdays: Data Privacy in Distributed Data Science 17 | 18 | The Cape team was invited to join in on [Coiled](https://coiled.io/)'s Science Thursdays to show how [Cape Python can help add privacy to distributed data science](https://coiled.io/blog/data-privacy-distributed-compute.html). This [live webinar (recording available)](https://www.youtube.com/watch?v=cIvv8EGMDY0&feature=youtu.be) walks you through the use of Cape Privacy in Spark and Pandas. There are two notebooks related to [this recording and 19 | writeup](https://coiled.io/blog/data-privacy-distributed-compute.html): 20 | 21 | - [Exploring Cape Python in an EDA Setting](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Python%20with%20Pandas%20-%20IoT%20Exploratory%20Data%20Analysis.ipynb) 22 | - [Implementing Policy in Apache Spark - Taxi Dataset](https://github.com/capeprivacy/cape-python/blob/master/examples/notebooks/Cape%20Python%20with%20PySpark%20-%20Taxi%20Dataset.ipynb) 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples/policy/iot_example_policy.yaml: -------------------------------------------------------------------------------- 1 | label: iot_examplew_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: username 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "Please change this :)" 11 | - match: 12 | name: timestamp 13 | actions: 14 | - transform: 15 | type: "date-truncation" 16 | frequency: "hour" 17 | - match: 18 | name: build 19 | actions: 20 | - transform: 21 | type: "column-redact" 22 | columns: ["build"] 23 | -------------------------------------------------------------------------------- /examples/policy/mask_personal_information.yaml: -------------------------------------------------------------------------------- 1 | label: masking_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "my secret" 11 | - match: 12 | name: age 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: -10 18 | max: 10 19 | - match: 20 | name: salary 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: -3 26 | - match: 27 | name: birthdate 28 | actions: 29 | - transform: 30 | type: "date-perturbation" 31 | frequency: ["YEAR", "MONTH", "DAY"] 32 | min: [-10, -5, -5] 33 | max: 
[10, 5, 5] 34 | - match: 35 | name: ssn 36 | actions: 37 | - transform: 38 | type: "column-redact" 39 | columns: ["ssn"] 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/policy/nyc_taxi_dataset_policy.yaml: -------------------------------------------------------------------------------- 1 | label: taxi_dataset_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: VendorID 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "Please change this :)" 11 | - match: 12 | name: passenger_count 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: 0 18 | max: 2 19 | - match: 20 | name: pickup_longitude 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: 4 26 | - match: 27 | name: pickup_latitude 28 | actions: 29 | - transform: 30 | type: "numeric-rounding" 31 | dtype: Double 32 | precision: 4 33 | - match: 34 | name: dropoff_longitude 35 | actions: 36 | - transform: 37 | type: "numeric-rounding" 38 | dtype: Double 39 | precision: 4 40 | - match: 41 | name: dropoff_latitude 42 | actions: 43 | - transform: 44 | type: "numeric-rounding" 45 | dtype: Double 46 | precision: 4 47 | - match: 48 | name: PULocationID 49 | actions: 50 | - transform: 51 | type: "column-redact" 52 | columns: ["PULocationID", "DOLocationID"] 53 | -------------------------------------------------------------------------------- /examples/policy/perturb_value_field.yaml: -------------------------------------------------------------------------------- 1 | label: perturb-ones-field 2 | version: 1 3 | rules: 4 | - match: 5 | name: ones 6 | actions: 7 | - transform: 8 | type: numeric-perturbation 9 | dtype: Integer 10 | min: -10 11 | max: 10 12 | seed: 4984 13 | -------------------------------------------------------------------------------- /examples/policy/spark_round.yaml: -------------------------------------------------------------------------------- 1 | label: spark-round-float 2 | version: 1 3 | transformations: 4 | - name: roundFloat 5 | type: numeric-rounding 6 | dtype: Float 7 | precision: 0 8 | rules: 9 | - match: 10 | name: ones 11 | actions: 12 | - transform: 13 | name: roundFloat 14 | -------------------------------------------------------------------------------- /examples/simple_transformation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import cape_dataframes as cape 5 | 6 | policy = cape.parse_policy("policy/perturb_value_field.yaml") 7 | 8 | df = pd.DataFrame( 9 | np.ones( 10 | 5, 11 | ), 12 | columns=["ones"], 13 | ) 14 | df = cape.apply_policy(policy, df) 15 | print(df.head()) 16 | -------------------------------------------------------------------------------- /examples/spark_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pyspark import sql 4 | 5 | import cape_dataframes as cape 6 | 7 | sess_builder = sql.SparkSession.builder 8 | sess_builder = sess_builder.appName("cape.examples.rounding") 9 | sess = sess_builder.getOrCreate() 10 | sess = cape.spark.configure_session(sess) 11 | 12 | pdf = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"]) 13 | df = sess.createDataFrame(pdf) 14 | df.show() 15 | 16 | policy = cape.parse_policy("policy/spark_round.yaml") 17 | result = cape.apply_policy(policy, df) 18 | result.show() 19 | 
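A note on the two Spark examples above: `spark_round.yaml` declares a named, reusable transformation (`roundFloat`) under `transformations:` and then references it by `name` from a rule, and `spark_example.py` applies that policy with `cape.apply_policy`. The sketch below is illustrative only (it is not a file in this repository, and the app name and toy DataFrame are made up to match the example above); it shows the equivalent call made directly through the `cape_dataframes.spark.transformations` API, with the same numeric-rounding parameters (dtype `Float`, precision `0`).

```python
# Illustrative sketch: the direct-API equivalent of the named "roundFloat"
# transformation declared in spark_round.yaml, applied without a policy file.
import numpy as np
import pandas as pd
from pyspark import sql
from pyspark.sql import functions

import cape_dataframes as cape
from cape_dataframes.spark import dtypes
from cape_dataframes.spark.transformations import NumericRounding

sess = sql.SparkSession.builder.appName("cape.examples.rounding.inline").getOrCreate()
sess = cape.spark.configure_session(sess)

# Same toy data as spark_example.py: five float32 values of 1.2 in a column named "ones".
pdf = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"])
df = sess.createDataFrame(pdf)

# Mirrors the policy entry: type numeric-rounding, dtype Float, precision 0.
round_float = NumericRounding(dtype=dtypes.Float, precision=0)
df.select(round_float(functions.col("ones")).alias("ones")).show()
```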
-------------------------------------------------------------------------------- /examples/tutorials/credit/README.md: -------------------------------------------------------------------------------- 1 | # Credit Risk Tutorial 2 | 3 | This tutorial was created for the blog post [Cape Python: Apply Privacy-Enhancing Techniques to Protect Sensitive Data in Pandas and Spark](https://medium.com/dropoutlabs/cape-python-apply-privacy-enhancing-techniques-to-protect-sensitive-data-in-pandas-and-spark-e0bf8c0d55db). 4 | 5 | As an example, we experiment with the public [German credit dataset](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)). We just added some fake PII information (such as name, address, etc.) and quasi-identifiers (city, salary, etc.) to make it more similar to a real dataset for which we would use the masking techniques. 6 | 7 | ## Prototype your privacy-preserving pipeline in Pandas 8 | 9 | The notebook `mask_credit_data_in_pandas.ipynb` shows you how you can prototype the masking techniques in Pandas, and then define a data privacy policy. 10 | 11 | ## Make your Spark pipeline privacy-preserving 12 | Once you have defined the data privacy policy, you can apply it to a Spark DataFrame. As an example, you can run the following script: 13 | ``` 14 | # submit the script to a Spark cluster 15 | spark-submit apply_policy_spark.py 16 | ``` 17 | -------------------------------------------------------------------------------- /examples/tutorials/credit/apply_policy_spark.py: -------------------------------------------------------------------------------- 1 | from pyspark import sql 2 | from pyspark.sql import functions 3 | 4 | import cape_dataframes as cape 5 | 6 | # Set up your SparkSession as usual, but configure it for use with Cape. 7 | # We do this because some transformations expect Arrow to be enabled.
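# Under the hood, configure_session is expected to switch the Arrow-related
# settings on for you; conceptually that is similar to calling something like
# `.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")` on the session
# created below, although the exact options applied are an implementation detail
# of cape.spark.configure_session rather than anything documented here.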
8 | sess = sql.SparkSession.builder.appName( 9 | "cape.tutorial.maskPersonalInformation" 10 | ).getOrCreate() 11 | sess = cape.spark.configure_session(sess) 12 | 13 | # Load a Spark DataFrame 14 | df = sess.read.load( 15 | "data/credit_with_pii.csv", format="csv", sep=",", inferSchema="true", header="true" 16 | ) 17 | df = df.withColumn( 18 | "Application_date", 19 | functions.to_date(functions.col("Application_date"), "yyyy-MM-dd"), 20 | ) 21 | print("Original Dataset:") 22 | print(df.show()) 23 | # Load the privacy policy and apply it to the DataFrame 24 | policy = cape.parse_policy("policy/credit_policy.yaml") 25 | df = cape.apply_policy(policy, df) 26 | 27 | print("Masked Dataset:") 28 | print(df.show()) 29 | -------------------------------------------------------------------------------- /examples/tutorials/credit/policy/credit_policy.yaml: -------------------------------------------------------------------------------- 1 | label: credit_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: Name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | - match: 10 | name: Sex 11 | actions: 12 | - transform: 13 | type: "tokenizer" 14 | max_token_len: 10 15 | - match: 16 | name: Age 17 | actions: 18 | - transform: 19 | type: "numeric-perturbation" 20 | dtype: Integer 21 | min: -5 22 | max: 5 23 | - match: 24 | name: Salary 25 | actions: 26 | - transform: 27 | type: "numeric-rounding" 28 | dtype: Double 29 | precision: -3 30 | - match: 31 | name: Application_date 32 | actions: 33 | - transform: 34 | type: "date-perturbation" 35 | frequency: DAY 36 | min: -3 37 | max: 3 38 | - match: 39 | name: City 40 | actions: 41 | - transform: 42 | type: "column-redact" 43 | columns: ["City"] 44 | - match: 45 | name: Street_address 46 | actions: 47 | - transform: 48 | type: "column-redact" 49 | columns: ["Street_address"] 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Tutorial: Mask Your Data in Pandas and Spark 2 | 3 | `cape-privacy` gives you the ability to apply several masking techniques (transformations) such as tokenization, perturbation, rounding, etc., in order to obfuscate personal information contained in your dataset. 4 | 5 | In this short tutorial, we will show you how you can prototype a masking policy on a Pandas DataFrame to then apply it on a Spark DataFrame. 6 | 7 | ## Experiment with masking techniques without a policy file 8 | 9 | In order to get familiar with the different masking techniques and identify which one would fit your needs, you can apply these transformations directly on a Pandas DataFrame through the `cape_dataframes.pandas.transformations` API without having to write the policy in a yaml file. 10 | 11 | For this example, we will use a mock dataset with the following PII fields: name, age, birthdate, salary and SSN. In order to obfuscate these different fields we will apply the following transformations: 12 | 13 | - `name`: map each name to a unique token (`Tokenizer`). It will give the ability to obfuscate the name while maintaining user count in your dataset. 14 | - `age`: add uniform random noise within the interval of `[-10, 10]` (`NumericPerturbation`). 15 | - `birthdate`: add uniform random noise to year, month and day (`DatePerturbation`). 16 | - `salary`: round each value to nearest 1,000 (`NumericRounding`). 17 | - `SSN`: redact the field from the dataset (`ColumnRedact`). 
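Before turning to the ready-made scripts, here is a minimal sketch of what those five transformations look like when applied directly through the `cape_dataframes.pandas.transformations` API. The tiny DataFrame below is made up for illustration; `experiment_pandas.py`, referenced next, does essentially the same thing using the shared `dataset.py` loader.

```python
# A minimal, illustrative sketch of the transformations listed above
# (mirrors experiment_pandas.py; the toy DataFrame is made up for this example).
import pandas as pd

from cape_dataframes.pandas import dtypes
from cape_dataframes.pandas.transformations import ColumnRedact
from cape_dataframes.pandas.transformations import DatePerturbation
from cape_dataframes.pandas.transformations import NumericPerturbation
from cape_dataframes.pandas.transformations import NumericRounding
from cape_dataframes.pandas.transformations import Tokenizer

df = pd.DataFrame(
    {
        "name": ["alice", "bob"],
        "age": [34, 55],
        "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)],
        "salary": [59234.32, 49324.53],
        "ssn": ["343554334", "656564664"],
    }
)

df["name"] = Tokenizer(max_token_len=10, key=b"my secret")(df["name"])  # unique token per name
df["age"] = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10)(df["age"])  # +/- 10 noise
df["birthdate"] = DatePerturbation(
    frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5)
)(df["birthdate"])  # noisy year/month/day
df["salary"] = NumericRounding(dtype=dtypes.Float, precision=-3)(df["salary"])  # nearest 1,000
df = ColumnRedact(columns="ssn")(df)  # drop the ssn column entirely
print(df.head())
```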
18 | 19 | You can experiment with these transformations on a Pandas DataFrame by running the following script: 20 | 21 | ``` 22 | python experiment_pandas.py 23 | ``` 24 | 25 | You can also experiment with these transformations on a Spark DataFrame with the `cape_dataframes.spark.transformations` API. 26 | 27 | ```sh 28 | python experiment_spark.py 29 | 30 | # submit the script to a Spark cluster 31 | spark-submit experiment_spark.py 32 | ``` 33 | 34 | As you will notice, the `transformations` API is standardized, so you can easily transfer the transformations applied in `Pandas` to `Spark`. 35 | 36 | ## Write your policy 37 | 38 | Once you've identified the masking techniques you'd like to apply, you can define your policy in a `yaml` file. Below, you'll find a sample of the policy corresponding to the transformations applied in `experiment_pandas.py`. You can find the full policy in `mask_personal_information.yaml`. You can select the field with `match`, then define the transformation you'd like to apply under `transform` with the appropriate arguments. The argument names in the policy file match the arguments of the `transformations` API. 39 | 40 | ```yaml 41 | label: masking_policy 42 | version: 1 43 | rules: 44 | - match: 45 | name: name 46 | actions: 47 | - transform: 48 | type: "tokenizer" 49 | max_token_len: 10 50 | key: "my secret" 51 | - match: 52 | name: age 53 | actions: 54 | - transform: 55 | type: "numeric-perturbation" 56 | dtype: Integer 57 | min: -10 58 | max: 10 59 | ``` 60 | 61 | ## Apply the policy to your Spark DataFrame 62 | 63 | You are now ready to apply the policy to your Spark DataFrame. You just need to add two functions to your PySpark job: 64 | - `cape_dataframes.parse_policy`: load and parse the policy defined in the `yaml` file. 65 | - `cape_dataframes.apply_policy`: apply the policy to a DataFrame. 66 | 67 | To mask your data in a Spark job: 68 | 69 | ```sh 70 | spark-submit apply_policy_spark.py 71 | ``` 72 | 73 | The same process works for Pandas too, in case you'd rather test or deploy with a quick prototype. 74 | 75 | ```sh 76 | python apply_policy_pandas.py 77 | ``` 78 | 79 | In your terminal, you should see the data masked! 80 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/apply_policy_pandas.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | 3 | import cape_dataframes as cape 4 | 5 | # Load the Pandas DataFrame 6 | df = load_dataset() 7 | print("Original Dataset:") 8 | print(df.head()) 9 | # Load the privacy policy and apply it to the DataFrame 10 | policy = cape.parse_policy("mask_personal_information.yaml") 11 | df = cape.apply_policy(policy, df) 12 | 13 | print("Masked Dataset:") 14 | print(df.head()) 15 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/apply_policy_spark.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | from pyspark import sql 3 | 4 | import cape_dataframes as cape 5 | 6 | # Set up your SparkSession as usual, but configure it for use with Cape. 7 | # We do this because some transformations expect Arrow to be enabled.
8 | sess = sql.SparkSession.builder.appName( 9 | "cape.tutorial.maskPersonalInformation" 10 | ).getOrCreate() 11 | sess = cape.spark.configure_session(sess) 12 | 13 | # Load a Spark DataFrame 14 | df = load_dataset(sess) 15 | print("Original Dataset:") 16 | print(df.show()) 17 | # Load the privacy policy and apply it to the DataFrame 18 | policy = cape.parse_policy("mask_personal_information.yaml") 19 | df = cape.apply_policy(policy, df) 20 | 21 | print("Masked Dataset:") 22 | print(df.show()) 23 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/dataset.py: -------------------------------------------------------------------------------- 1 | # SKIP_CI 2 | 3 | import pandas as pd 4 | 5 | 6 | def load_dataset(sess=None): 7 | dataset = pd.DataFrame( 8 | { 9 | "name": ["alice", "bob"], 10 | "age": [34, 55], 11 | "birthdate": [pd.Timestamp(1985, 2, 23), pd.Timestamp(1963, 5, 10)], 12 | "salary": [59234.32, 49324.53], 13 | "ssn": ["343554334", "656564664"], 14 | } 15 | ) 16 | if sess is not None: 17 | return sess.createDataFrame(dataset) 18 | else: 19 | return dataset 20 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/experiment_pandas.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | 3 | from cape_dataframes.pandas import dtypes 4 | from cape_dataframes.pandas.transformations import ColumnRedact 5 | from cape_dataframes.pandas.transformations import DatePerturbation 6 | from cape_dataframes.pandas.transformations import NumericPerturbation 7 | from cape_dataframes.pandas.transformations import NumericRounding 8 | from cape_dataframes.pandas.transformations import Tokenizer 9 | 10 | # Load Pandas DataFrame 11 | df = load_dataset() 12 | print("Original Dataset:") 13 | print(df.head()) 14 | 15 | # Define the transformations 16 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 17 | perturb_numric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 18 | perturb_date = DatePerturbation( 19 | frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5) 20 | ) 21 | round_numeric = NumericRounding(dtype=dtypes.Float, precision=-3) 22 | redact_column = ColumnRedact(columns="ssn") 23 | 24 | # Apply the transformations 25 | df["name"] = tokenize(df["name"]) 26 | df["age"] = perturb_numric(df["age"]) 27 | df["salary"] = round_numeric(df["salary"]) 28 | df["birthdate"] = perturb_date(df["birthdate"]) 29 | df = redact_column(df) 30 | 31 | print("Masked Dataset:") 32 | print(df.head()) 33 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/experiment_spark.py: -------------------------------------------------------------------------------- 1 | from dataset import load_dataset 2 | from pyspark import sql 3 | from pyspark.sql import functions 4 | 5 | import cape_dataframes as cape 6 | from cape_dataframes.spark import dtypes 7 | from cape_dataframes.spark.transformations import ColumnRedact 8 | from cape_dataframes.spark.transformations import DatePerturbation 9 | from cape_dataframes.spark.transformations import NumericPerturbation 10 | from cape_dataframes.spark.transformations import NumericRounding 11 | from cape_dataframes.spark.transformations import Tokenizer 12 | 13 | # Set up your SparkSession as usual, but configure it for use with Cape. 14 | # We do this because some transformations expect Arrow to be enabled. 
15 | sess = sql.SparkSession.builder.appName( 16 | "cape.tutorial.maskPersonalInformation" 17 | ).getOrCreate() 18 | sess = cape.spark.configure_session(sess) 19 | 20 | # Load Spark DataFrame 21 | df = load_dataset(sess) 22 | print("Original Dataset:") 23 | df.show() 24 | 25 | # Define the transformations 26 | tokenize = Tokenizer(max_token_len=10, key=b"my secret") 27 | perturb_numric = NumericPerturbation(dtype=dtypes.Integer, min=-10, max=10) 28 | perturb_date = DatePerturbation( 29 | frequency=("YEAR", "MONTH", "DAY"), min=(-10, -5, -5), max=(10, 5, 5) 30 | ) 31 | round_numeric = NumericRounding(dtype=dtypes.Float, precision=-3) 32 | redact_column = ColumnRedact(columns="ssn") 33 | 34 | # Apply the transformation 35 | df = redact_column(df) 36 | df = df.select( 37 | tokenize(functions.col("name")).alias("name"), 38 | perturb_numric(functions.col("age")).alias("age"), 39 | round_numeric(functions.col("salary")).alias("salary"), 40 | perturb_date(functions.col("birthdate")).alias("birthdate"), 41 | ) 42 | 43 | print("Masked Dataset:") 44 | print(df.show()) 45 | -------------------------------------------------------------------------------- /examples/tutorials/quickstart/mask_personal_information.yaml: -------------------------------------------------------------------------------- 1 | label: masking_policy 2 | version: 1 3 | rules: 4 | - match: 5 | name: name 6 | actions: 7 | - transform: 8 | type: "tokenizer" 9 | max_token_len: 10 10 | key: "my secret" 11 | - match: 12 | name: age 13 | actions: 14 | - transform: 15 | type: "numeric-perturbation" 16 | dtype: Integer 17 | min: -10 18 | max: 10 19 | - match: 20 | name: salary 21 | actions: 22 | - transform: 23 | type: "numeric-rounding" 24 | dtype: Double 25 | precision: -3 26 | - match: 27 | name: birthdate 28 | actions: 29 | - transform: 30 | type: "date-perturbation" 31 | frequency: ["YEAR", "MONTH", "DAY"] 32 | min: [-10, -5, -5] 33 | max: [10, 5, 5] 34 | - match: 35 | name: ssn 36 | actions: 37 | - transform: 38 | type: "column-redact" 39 | columns: ["ssn"] 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/README.md: -------------------------------------------------------------------------------- 1 | ## Reversible Tokenizer 2 | 3 | This directory includes an example showcasing how you can use the `ReversibleTokenizer` 4 | to tokenize data in a dataframe, as well as usage of a `TokenReverser` to recover 5 | the original data. -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reversible Tokenizer\n", 8 | "\n", 9 | "Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data within a pandas dataframe.\n", 10 | "\n", 11 | "The `ReversibleTokenizer` will tokenize the input data so it can be used in a privacy preserving manner.\n", 12 | "\n", 13 | "The `ReversibleTokenizer` can be used in conjunction with the `TokenReverser` to recover the original data." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Tokenizing Data\n", 21 | "\n", 22 | "The `ReversibleTokenizer` and `TokenReverser` classes can be found in the `pandas.transformations` package." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 63, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from cape_dataframes.pandas.transformations import ReversibleTokenizer\n", 32 | "from cape_dataframes.pandas.transformations import TokenReverser" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "The `ReversibleTokenizer` and `TokenReverser` classes both take a `key` as input.\n", 40 | "\n", 41 | "For the `TokenReverser` to be able to reverse the tokens produced by the `ReversibleTokenizer`, you must\n", 42 | "use the same key." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 64, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "b'55555555555555555555555555555555'" 54 | ] 55 | }, 56 | "execution_count": 64, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "key=b\"5\" * 32\n", 63 | "key" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In this example, we will simply hide the names within our dataset." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 71, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
name# friends
0Alice100
1Bob200
2Carol300
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " name # friends\n", 125 | "0 Alice 100\n", 126 | "1 Bob 200\n", 127 | "2 Carol 300" 128 | ] 129 | }, 130 | "execution_count": 71, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "import pandas as pd\n", 137 | "plaintext_data = pd.DataFrame({'name': [\"Alice\", \"Bob\", \"Carol\"], \"# friends\": [100, 200, 300]})\n", 138 | "plaintext_data" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "You instantiate a `ReversibleTokenizer` by passing it your key" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 72, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "" 157 | ] 158 | }, 159 | "execution_count": 72, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "tokenizer = ReversibleTokenizer(key=key)\n", 166 | "tokenizer" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Next, we can pass our dataframe to the `tokenizer`" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 73, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
\n", 185 | "\n", 198 | "\n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
name# friends
0c8c7e80144304276183e5bcd589db782bc5ff95309100
1e0f40aea0d5c21b35967c4231b98b5b3e5338e200
27bfcdf25f73a1fe7a7fcb0970976f3393ed5df5ceb300
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " name # friends\n", 228 | "0 c8c7e80144304276183e5bcd589db782bc5ff95309 100\n", 229 | "1 e0f40aea0d5c21b35967c4231b98b5b3e5338e 200\n", 230 | "2 7bfcdf25f73a1fe7a7fcb0970976f3393ed5df5ceb 300" 231 | ] 232 | }, 233 | "execution_count": 73, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "tokenized = pd.DataFrame(plaintext_data)\n", 240 | "tokenized[\"name\"] = tokenizer(plaintext_data[\"name\"])\n", 241 | "tokenized" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Recovering Tokens\n", 249 | "\n", 250 | "If we ever need to reveal the tokenized data, we can use the `TokenReverser` class." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 74, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
\n", 262 | "\n", 275 | "\n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | "
name# friends
0Alice100
1Bob200
2Carol300
\n", 301 | "
" 302 | ], 303 | "text/plain": [ 304 | " name # friends\n", 305 | "0 Alice 100\n", 306 | "1 Bob 200\n", 307 | "2 Carol 300" 308 | ] 309 | }, 310 | "execution_count": 74, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "reverser = TokenReverser(key=key)\n", 317 | "recovered = pd.DataFrame(tokenized)\n", 318 | "recovered[\"name\"] = reverser(tokenized[\"name\"])\n", 319 | "recovered" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "cape-df", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.8.16" 340 | }, 341 | "vscode": { 342 | "interpreter": { 343 | "hash": "2c0eb9acd3ce9f628738cc91d7613a5d048e1a93f709104c9a35d77254cfaaac" 344 | } 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 4 349 | } 350 | -------------------------------------------------------------------------------- /examples/tutorials/reversible_tokenizer/reversible_tokenizer_pandas.py: -------------------------------------------------------------------------------- 1 | # This is a pure python version of the notebook from this directory 2 | 3 | # Here we show an example of how you can use the `ReversibleTokenizer` to tokenize data 4 | # within a pandas dataframe. The `ReversibleTokenizer` will tokenize the input data so 5 | # it can be used in a privacy-preserving manner. The `ReversibleTokenizer` can be used 6 | # in conjunction with the `TokenReverser` to recover the original data. 7 | 8 | import pandas as pd 9 | 10 | from cape_dataframes.pandas.transformations import ReversibleTokenizer 11 | from cape_dataframes.pandas.transformations import TokenReverser 12 | 13 | # The `ReversibleTokenizer` and `TokenReverser` classes both take a `key` as input. 14 | # For the `TokenReverser` to be able to reverse the tokens produced by the 15 | # `ReversibleTokenizer`, you must use the same key. 16 | 17 | key = b"5" * 32 18 | 19 | # In this example, we will simply hide the names within our dataset. 
20 | plaintext_data = pd.DataFrame( 21 | {"name": ["Alice", "Bob", "Carol"], "# friends": [100, 200, 300]} 22 | ) 23 | print("plaintext data") 24 | print(plaintext_data) 25 | print("\n") 26 | 27 | # Tokenization logic 28 | tokenizer = ReversibleTokenizer(key=key) 29 | tokenized = pd.DataFrame(plaintext_data) 30 | tokenized["name"] = tokenizer(plaintext_data["name"]) 31 | 32 | print("tokenized data") 33 | print(tokenized) 34 | print("\n") 35 | 36 | # Reverse the tokenization 37 | reverser = TokenReverser(key=key) 38 | recovered = pd.DataFrame(tokenized) 39 | recovered["name"] = reverser(tokenized["name"]) 40 | 41 | print("reversed tokens") 42 | print(recovered) 43 | print("\n") 44 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | pandas>=1.0 2 | numpy>=1.22 3 | pycryptodome 4 | pyyaml>=5.4 5 | requests 6 | rfc3339 7 | validators 8 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | certifi==2022.12.7 8 | # via requests 9 | charset-normalizer==3.0.1 10 | # via requests 11 | decorator==5.1.1 12 | # via validators 13 | idna==3.4 14 | # via requests 15 | numpy==1.24.2 16 | # via 17 | # -r requirements/base.in 18 | # pandas 19 | pandas==1.3.5 20 | # via -r requirements/base.in 21 | pycryptodome==3.17 22 | # via -r requirements/base.in 23 | python-dateutil==2.8.2 24 | # via pandas 25 | pytz==2022.7.1 26 | # via pandas 27 | pyyaml==6.0 28 | # via -r requirements/base.in 29 | requests==2.28.2 30 | # via -r requirements/base.in 31 | rfc3339==6.2 32 | # via -r requirements/base.in 33 | six==1.16.0 34 | # via python-dateutil 35 | urllib3==1.26.14 36 | # via requests 37 | validators==0.20.0 38 | # via -r requirements/base.in 39 | -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | -c base.txt 2 | black 3 | coverage 4 | flake8 5 | flake8-black 6 | isort 7 | pytest 8 | pytest-cov 9 | pytest-httpserver 10 | responses 11 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | attrs==22.2.0 8 | # via pytest 9 | black==23.1.0 10 | # via 11 | # -r requirements/dev.in 12 | # flake8-black 13 | certifi==2022.12.7 14 | # via 15 | # -c requirements/base.txt 16 | # requests 17 | charset-normalizer==3.0.1 18 | # via 19 | # -c requirements/base.txt 20 | # requests 21 | click==8.1.3 22 | # via black 23 | coverage[toml]==7.1.0 24 | # via 25 | # -r requirements/dev.in 26 | # pytest-cov 27 | exceptiongroup==1.1.0 28 | # via pytest 29 | flake8==5.0.4 30 | # via 31 | # -r requirements/dev.in 32 | # flake8-black 33 | flake8-black==0.3.6 34 | # via -r requirements/dev.in 35 | idna==3.4 36 | # via 37 | # -c requirements/base.txt 38 | # requests 39 | iniconfig==2.0.0 40 | # via pytest 41 | isort==5.11.5 42 | # via -r requirements/dev.in 43 | markupsafe==2.1.2 44 | # via werkzeug 45 | mccabe==0.7.0 46 | # via
flake8 47 | mypy-extensions==1.0.0 48 | # via black 49 | packaging==23.0 50 | # via 51 | # black 52 | # pytest 53 | pathspec==0.11.0 54 | # via black 55 | platformdirs==3.0.0 56 | # via black 57 | pluggy==1.0.0 58 | # via pytest 59 | pycodestyle==2.9.1 60 | # via flake8 61 | pyflakes==2.5.0 62 | # via flake8 63 | pytest==7.2.1 64 | # via 65 | # -r requirements/dev.in 66 | # pytest-cov 67 | pytest-cov==4.0.0 68 | # via -r requirements/dev.in 69 | pytest-httpserver==1.0.6 70 | # via -r requirements/dev.in 71 | requests==2.28.2 72 | # via 73 | # -c requirements/base.txt 74 | # responses 75 | responses==0.22.0 76 | # via -r requirements/dev.in 77 | toml==0.10.2 78 | # via responses 79 | tomli==2.0.1 80 | # via 81 | # black 82 | # coverage 83 | # flake8-black 84 | # pytest 85 | types-toml==0.10.8.4 86 | # via responses 87 | typing-extensions==4.5.0 88 | # via black 89 | urllib3==1.26.14 90 | # via 91 | # -c requirements/base.txt 92 | # requests 93 | # responses 94 | werkzeug==2.2.3 95 | # via pytest-httpserver 96 | -------------------------------------------------------------------------------- /requirements/spark.in: -------------------------------------------------------------------------------- 1 | -c base.txt 2 | packaging 3 | pyarrow 4 | pyspark[sql]>=3.2.2 5 | -------------------------------------------------------------------------------- /requirements/spark.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # make pydep-upgrade 6 | # 7 | numpy==1.24.2 8 | # via 9 | # -c requirements/base.txt 10 | # pandas 11 | # pyarrow 12 | packaging==23.0 13 | # via -r requirements/spark.in 14 | pandas==1.3.5 15 | # via 16 | # -c requirements/base.txt 17 | # pyspark 18 | py4j==0.10.9.5 19 | # via pyspark 20 | pyarrow==11.0.0 21 | # via 22 | # -r requirements/spark.in 23 | # pyspark 24 | pyspark[sql]==3.3.1 25 | # via -r requirements/spark.in 26 | python-dateutil==2.8.2 27 | # via 28 | # -c requirements/base.txt 29 | # pandas 30 | pytz==2022.7.1 31 | # via 32 | # -c requirements/base.txt 33 | # pandas 34 | six==1.16.0 35 | # via 36 | # -c requirements/base.txt 37 | # python-dateutil 38 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:isort] 2 | line_length=88 3 | force_single_line=True 4 | 5 | [flake8] 6 | max-line-length=88 7 | extend-ignore= 8 | E203 9 | D10,D20,D40 10 | exclude=cape/connector/proto 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Installing with setuptools.""" 2 | import setuptools 3 | 4 | with open("README.md", "r", encoding="utf8") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="cape-dataframes", 9 | version="0.3.1", 10 | packages=setuptools.find_packages(), 11 | python_requires=">=3.6", 12 | license="Apache License 2.0", 13 | url="https://github.com/capeprivacy/cape-dataframes", 14 | description="Cape manages secure access to all of your data.", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | author="Cape Privacy", 18 | author_email="contact@capeprivacy.com", 19 | install_requires=[ 20 | "pandas", 21 | "pycryptodome", 22 | "pyyaml", 23 | "requests", 24 | "rfc3339", 25 | ], 26 | 
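    # Spark support is published as an optional extra; given the package name above,
    # it can be installed with, e.g., `pip install cape-dataframes[spark]`.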
extras_require={ 27 | "spark": ["pyspark >=2.4", "pyarrow >=0.15.1"], 28 | }, 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: Apache Software License", 32 | "Development Status :: 3 - Alpha", 33 | "Operating System :: OS Independent", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 35 | "Topic :: Security :: Cryptography", 36 | ], 37 | ) 38 | --------------------------------------------------------------------------------