├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── .codecov.yml
    │   ├── cicd.yml
    │   ├── pypi.yml
    │   └── release-please.yml
├── .gitignore
├── .idea
    ├── .gitignore
    ├── anjana.iml
    ├── inspectionProfiles
    │   ├── Project_Default.xml
    │   └── profiles_settings.xml
    ├── misc.xml
    ├── modules.xml
    └── vcs.xml
├── .readthedocs.yml
├── .release-please-manifest.json
├── CHANGELOG.md
├── CITATION.cff
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── anjana
    ├── __init__.py
    └── anonymity
    │   ├── __init__.py
    │   ├── _beta_likeness.py
    │   ├── _delta_disclosure.py
    │   ├── _k_anonymity.py
    │   ├── _l_diversity.py
    │   ├── _t_closeness.py
    │   └── utils
    │       ├── __init__.py
    │       └── utils.py
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── anjana.anonymity.rst
    │   ├── anjana.anonymity.utils.rst
    │   ├── anjana.rst
    │   ├── conf.py
    │   ├── get_transformation.rst
    │   ├── getting_started.rst
    │   ├── index.rst
    │   ├── intro.rst
    │   ├── modules.rst
    │   └── multiple_sa.rst
├── examples
    ├── adult.py
    ├── adult_alpha_k_anonymity.py
    ├── adult_basic_beta_likeness.py
    ├── adult_delta_disclosure.py
    ├── adult_enhanced_beta_likeness.py
    ├── adult_get_transformation.py
    ├── adult_k10.csv
    ├── adult_k_l_t.py
    ├── adult_ldiversity.py
    ├── adult_tcloseness.py
    ├── data
    │   ├── adult.csv
    │   ├── adult_k10.csv
    │   └── hospital_extended.csv
    ├── hierarchies
    │   ├── age.csv
    │   ├── country.csv
    │   ├── education.csv
    │   ├── marital.csv
    │   ├── occupation.csv
    │   ├── race.csv
    │   ├── salary.csv
    │   ├── sex.csv
    │   └── workclass.csv
    ├── hospital.py
    └── hospital_get_transformation.py
├── pyproject.toml
├── release-please-config.json
├── test-requirements.txt
├── tests
    ├── __init__.py
    ├── test_anonymity.py
    └── test_unitary.py
└── tox.ini


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "pip"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "daily"
 7 |     labels:
 8 |       - "dependencies"
 9 |     reviewers:
10 |       - "judithspd"
11 |       - "alvarolopez"
12 |     assignees:
13 |       - "judithspd"
14 | 
15 |   - package-ecosystem: "github-actions"
16 |     directory: "/"
17 |     schedule:
18 |       interval: "weekly"
19 |     reviewers:
20 |       - "judithspd"
21 |       - "alvarolopez"
22 | 
23 | 


--------------------------------------------------------------------------------
/.github/workflows/.codecov.yml:
--------------------------------------------------------------------------------
 1 | name: Code Coverage
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   run:
 5 |     runs-on: ubuntu-22.04
 6 |     steps:
 7 |       - name: Checkout code coverage
 8 |         uses: actions/checkout@v4
 9 |         with:
10 |           fetch-depth: 0
11 |       - name: Set up Python 3.10
12 |         uses: actions/setup-python@v5
13 |         with:
14 |           python-version: '3.10'
15 |       - name: Install pytest
16 |         run: pip install pytest-cov
17 |       - name: Install poetry
18 |         run: pip install poetry
19 |       - name: Configure poetry
20 |         run: poetry config virtualenvs.create false
21 |       - name: Install dependencies
22 |         run: poetry install
23 |       - name: Run tests and collect coverage
24 |         run: coverage run --omit="./tests" -m pytest
25 |       - name: Upload coverage reports to Codecov
26 |         uses: codecov/codecov-action@v5.4.3
27 |         with:
28 |           token: ${{ secrets.CODECOV_TOKEN }}
29 |           slug: IFCA-Advanced-Computing/anjana
30 | 


--------------------------------------------------------------------------------
/.github/workflows/cicd.yml:
--------------------------------------------------------------------------------
 1 | name: CI/CD Pipeline
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - "*"
 7 |   pull_request:
 8 |     branches:
 9 |       - main
10 |       
11 | jobs:
12 |   lint:
13 |     name: Lint
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - name: Checkout code
17 |         uses: actions/checkout@v4
18 | 
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: "3.10"
23 | 
24 |       - name: Install and configure poetry
25 |         run: |
26 |           pip install poetry
27 |           poetry config virtualenvs.create false
28 |           poetry install
29 | 
30 |       - name: Install dependencies
31 |         run: |
32 |           pip install --upgrade pip
33 |           pip install tox
34 | 
35 |       - name: Linting
36 |         run: tox -e bandit,black,flake8
37 | 
38 |   test:
39 |     name: Test
40 |     runs-on: ubuntu-latest
41 |     steps:
42 |       - name: Checkout code
43 |         uses: actions/checkout@v4
44 | 
45 |       - name: Set up Python
46 |         uses: actions/setup-python@v5
47 |         with:
48 |           python-version: "3.9"
49 | 
50 |       - name: Install and configure poetry
51 |         run: |
52 |           pip install poetry
53 |           poetry config virtualenvs.create false
54 |           poetry install
55 | 
56 |       - name: Install dependencies
57 |         run: |
58 |           pip install --upgrade pip
59 |           pip install tox
60 | 
61 |       - name: Testing
62 |         run: tox -e py39,py310,py311,py312
63 | 
64 |   build:
65 |     name: Build
66 |     runs-on: ubuntu-latest
67 |     steps:
68 |       - name: Checkout code
69 |         uses: actions/checkout@v4
70 | 
71 |       - name: Set up Python
72 |         uses: actions/setup-python@v5
73 |         with:
74 |           python-version: "3.10"
75 | 
76 |       - name: Install dependencies
77 |         run: |
78 |           pip install --upgrade pip
79 |           pip install poetry
80 | 
81 |       - name: Build package
82 |         run: |
83 |           poetry build
84 |           pip install dist/*.whl
85 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Package in PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | permissions:
 8 |   contents: read
 9 | 
10 | jobs:
11 |   publish:
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |       - name: Checkout
16 |         uses: actions/checkout@v4
17 | 
18 |       - name: Set up Python 3.10
19 |         uses: actions/setup-python@v5
20 |         with:
21 |           python-version: "3.10"
22 | 
23 |       - name: Install dependencies
24 |         run: |
25 |           pip install --upgrade pip
26 |           pip install build
27 |           pip install twine
28 | 
29 |       - name: Build and upload package
30 |         run: |
31 |           python -m build
32 |           python -m twine upload dist/*
33 |         env:
34 |           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
35 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
36 |           
37 | 


--------------------------------------------------------------------------------
/.github/workflows/release-please.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |   pull_request:
 6 |     branches:
 7 |       - main
 8 | 
 9 | permissions:
10 |   contents: write
11 |   pull-requests: write
12 | 
13 | name: release-please
14 | 
15 | jobs:
16 |   release-please:
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       - uses: google-github-actions/release-please-action@v4
20 |         with:
21 |           # this assumes that you have created a personal access token
22 |           # (PAT) and configured it as a GitHub action secret named
23 |           # `MY_RELEASE_PLEASE_TOKEN` (this secret name is not important).
24 |           token: ${{ secrets.MY_RELEASE_PLEASE_TOKEN }}
25 |           release-type: python
26 |           config-file: release-please-config.json
27 |           manifest-file: .release-please-manifest.json
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | 
 3 | # Packages
 4 | *.egg
 5 | *.egg-info
 6 | dist
 7 | build
 8 | eggs
 9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | lib
16 | lib64
17 | 
18 | # Sphinx
19 | docs/build
20 | 
21 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/.idea/anjana.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$">
 5 |       <excludeFolder url="file://$MODULE_DIR$/.venv" />
 6 |     </content>
 7 |     <orderEntry type="jdk" jdkName="Python 3.10 virtualenv at ~/anjana/.venv" jdkType="Python SDK" />
 8 |     <orderEntry type="sourceFolder" forTests="false" />
 9 |   </component>
10 | </module>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ignoredPackages">
 6 |         <value>
 7 |           <list size="7">
 8 |             <item index="0" class="java.lang.String" itemvalue="pandas" />
 9 |             <item index="1" class="java.lang.String" itemvalue="typer" />
10 |             <item index="2" class="java.lang.String" itemvalue="tabulate" />
11 |             <item index="3" class="java.lang.String" itemvalue="numpy" />
12 |             <item index="4" class="java.lang.String" itemvalue="pycanon" />
13 |             <item index="5" class="java.lang.String" itemvalue="typing_extensions" />
14 |             <item index="6" class="java.lang.String" itemvalue="black" />
15 |           </list>
16 |         </value>
17 |       </option>
18 |     </inspection_tool>
19 |   </profile>
20 | </component>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Black">
4 |     <option name="sdkName" value="Python 3.10 (anjana) (2)" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 virtualenv at ~/anjana/.venv" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/anjana.iml" filepath="$PROJECT_DIR$/.idea/anjana.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the OS, Python version and other tools you might need
 9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.10"
13 |     # You can also specify other tool versions:
14 |     # nodejs: "19"
15 |     # rust: "1.64"
16 |     # golang: "1.19"
17 | 
18 | # Build documentation in the "docs/" directory with Sphinx
19 | sphinx:
20 |   configuration: docs/source/conf.py
21 | 
22 | # Optionally build your docs in additional formats such as PDF and ePub
23 | # formats:
24 | #    - pdf
25 | #    - epub
26 | 
27 | # Optional but recommended, declare the Python requirements required
28 | # to build your documentation
29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
30 | python:
31 |    install:
32 |      - requirements: test-requirements.txt
33 |      
34 | 


--------------------------------------------------------------------------------
/.release-please-manifest.json:
--------------------------------------------------------------------------------
1 | {
2 |   ".": "1.1.0"
3 | }
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## [1.1.0](https://github.com/IFCA-Advanced-Computing/anjana/compare/v1.0.0...v1.1.0) (2025-01-30)
 4 | 
 5 | 
 6 | ### Features
 7 | 
 8 | * add function for applying a given transformation ([be9c7dd](https://github.com/IFCA-Advanced-Computing/anjana/commit/be9c7dd7fbf88bb74d30c7bb56279cbde08958dd))
 9 | 
10 | 
11 | ### Bug Fixes
12 | 
13 | * fix apply_transformation function ([61998c3](https://github.com/IFCA-Advanced-Computing/anjana/commit/61998c33466c2810d6d350dd4eade5a676a627ff))
14 | 
15 | 
16 | ### Documentation
17 | 
18 | * fix EU acknowledgement with link to Cordis ([18de191](https://github.com/IFCA-Advanced-Computing/anjana/commit/18de1913acb090eeba482484dc506266ee86b78f))
19 | * update getting_started.rst ([56cf748](https://github.com/IFCA-Advanced-Computing/anjana/commit/56cf74846942e93374c008921b5d2d042bdfce38))
20 | * update index.rst ([f7967ad](https://github.com/IFCA-Advanced-Computing/anjana/commit/f7967ad5884c30d926f2a3cd355d58d20f54951c))
21 | 
22 | ## [1.0.0](https://github.com/IFCA-Advanced-Computing/anjana/compare/v0.2.2...v1.0.0) (2024-08-14)
23 | 
24 | 
25 | ### ⚠ BREAKING CHANGES
26 | 
27 | * move to version 1.0.0
28 | 
29 | ### Features
30 | 
31 | * move to version 1.0.0 ([368a5ca](https://github.com/IFCA-Advanced-Computing/anjana/commit/368a5ca52886a343de2af42cb90e5df4dbc7fafd))
32 | * move to version 1.0.0 ([ed06f0e](https://github.com/IFCA-Advanced-Computing/anjana/commit/ed06f0e4ce603bc01f1332d3cfdfa85b98bb5efa))
33 | 
34 | ## [0.2.2](https://github.com/IFCA-Advanced-Computing/anjana/compare/v0.2.1...v0.2.2) (2024-05-15)
35 | 
36 | 
37 | ### Bug Fixes
38 | 
39 | * update dependencies ([7077e21](https://github.com/IFCA-Advanced-Computing/anjana/commit/7077e21be4252febb8fb033fa4cb4edd76a1d32e))
40 | 
41 | ## [0.2.0](https://github.com/IFCA-Advanced-Computing/anjana/compare/v0.1.1...v0.2.0) (2024-05-13)
42 | 
43 | 
44 | ### Features
45 | 
46 | * generate interval-based hierarchies ([8500823](https://github.com/IFCA-Advanced-Computing/anjana/commit/850082334322b14f6c5d98975aa3d93fb95a860c))
47 | 
48 | 
49 | ### Bug Fixes
50 | 
51 | * record suppression supported ([f050a6e](https://github.com/IFCA-Advanced-Computing/anjana/commit/f050a6ebaf2e13ed159e3beac7635c9b9b4ccb14))
52 | 
53 | ## [0.1.0](https://github.com/IFCA-Advanced-Computing/anjana/compare/v0.0.2...v0.1.0) (2024-05-13)
54 | 
55 | 
56 | ### Features
57 | 
58 | * generate interval-based hierarchies ([8500823](https://github.com/IFCA-Advanced-Computing/anjana/commit/850082334322b14f6c5d98975aa3d93fb95a860c))
59 | 
60 | 
61 | ### Bug Fixes
62 | 
63 | * record suppression supported ([f050a6e](https://github.com/IFCA-Advanced-Computing/anjana/commit/f050a6ebaf2e13ed159e3beac7635c9b9b4ccb14))
64 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Sáinz-Pardo Díaz"
 5 |   given-names: "Judith"
 6 |   orcid: "https://orcid.org/0000-0002-8387-578X"
 7 | - family-names: "López García"
 8 |   given-names: "Álvaro"
 9 |   orcid: "https://orcid.org/0000-0002-0013-4602"
10 | title: "ANJANA"
11 | version: 1.1.0
12 | date-released: 2025-01-30
13 | url: "https://github.com/IFCA-Advanced-Computing/anjana"
14 | identifiers:
15 |   - type: doi
16 |     value: 10.5281/zenodo.11186382
17 |   - type: doi
18 |     value: 10.1038/s41597-024-04019-z
19 | references:
20 |   - type: article
21 |     authors:
22 |       - family-names: "Sáinz-Pardo Díaz"
23 |         given-names: "Judith"
24 |         orcid: "https://orcid.org/0000-0002-8387-578X"
25 |       - family-names: "López García"
26 |         given-names: "Álvaro"
27 |         orcid: "https://orcid.org/0000-0002-0013-4602"
28 |     title: "An Open Source Python Library for Anonymizing Sensitive Data"
29 |     journal: "Scientific Data"
30 |     year: "2024"
31 |     doi: "10.1038/s41597-024-04019-z"
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribute to the development of anjana
 2 | 
 3 | If this library is useful for your research and you want to contribute to its improvement, we are happy to receive your proposals! Below you will find a guide on how you can contribute to improving anjana.
 4 | 
 5 | ### Add a new feature
 6 | 
 7 | If you are using anjana and have developed a new feature (especially implementing an additional anonymity functionality), and you want to contribute to the community, follow the steps below:
 8 | 1. Fork the repository.
 9 | 2. Clone the local repository:
10 |    
11 |    ```bash
12 |    git clone https://github.com/your-user/anjana.git
13 |    ```
14 | 3. Set up a virtual environment and install the requirements:
15 | 
16 |     ```bash
17 |     cd anjana
18 |     virtualenv .venv -p python3
19 |     source .venv/bin/activate
20 |     pip install -e .
21 |     ```
22 | 4. Create a new branch (e.g. develop).
23 | 
24 |      ```bash
25 |      git checkout -b develop
26 |      ```
27 | 5. Add the functionalities you want to contribute.
28 | 6. Include commits that are descriptive and clear about the changes made and functionality added. Make sure you create Semantic Commit Messages (conventional commits) including the use of flags such as `feat`, `fix`, `refactor`, `test`, etc.
29 | 7. Check the style and that linting is successfully executed by using `tox`.
30 | 8. Check that the code coverage is greater than 90%:
31 | 
32 |      ```bash
33 |      pytest --cov=.
34 |      ```
35 | 9. Send your code to your fork:
36 | 
37 |     ```bash
38 |     git push
39 |     ```
40 | 10. Open a [pull request](https://github.com/IFCA-Advanced-Computing/anjana/pulls) from your fork.
41 |     
42 | ### Solve a bug
43 | 
44 | If you have found a bug with some functionality of the library, it is recommended that you open an [issue](https://github.com/IFCA-Advanced-Computing/anjana/issues) so that we can solve it. You can follow the next steps: 
45 | 1. First, check that it is not an open [issue](https://github.com/IFCA-Advanced-Computing/anjana/issues) or one that has been previously resolved.
46 | 2. Give us information about the issue: clearly describe what you expected to receive and the error that occurred.
47 | 3. Describe clearly the steps that will allow us to reproduce your error, indicating the function you are using and details about the input introduced. 
48 | 4. Provide us with details about the computing environment used: operating system, version of Python used and version of anjana used.
49 | 
50 | Any additional details you consider important will help us to resolve it more quickly.
51 | 
52 | ### Suggest a new feature
53 | 
54 | If you are missing some functionality that you would like to see implemented in anjana, you can request it by opening an [issue](https://github.com/IFCA-Advanced-Computing/anjana/issues) as indicated in the previous section.
55 | 1. Check that this functionality is not included or has not been previously requested in another issue.
56 | 2. Give us information about the required functionality, including why it is important for users to have it available in anjana.
57 | 3. Include theoretical information about the new technique or feature requested, including papers supporting its usefulness.
58 | 
59 |    
60 | 
61 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2024 Judith Sainz-Pardo Diaz
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ANJANA
  2 | [![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://gitlab.ifca.es/privacy-security/anjana/-/blob/main/LICENSE)
  3 | [![codecov](https://codecov.io/gh/IFCA-Advanced-Computing/anjana/graph/badge.svg?token=AVI53GZ7YD)](https://codecov.io/gh/IFCA-Advanced-Computing/anjana)
  4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.11184467.svg)](https://doi.org/10.5281/zenodo.11184467)
  5 | ![PyPI](https://img.shields.io/pypi/v/anjana)
  6 | [![Downloads](https://static.pepy.tech/badge/anjana)](https://pepy.tech/project/anjana)
  7 | [![Documentation Status](https://readthedocs.org/projects/anjana/badge/?version=latest)](https://anjana.readthedocs.io/en/latest/?badge=latest)
  8 | [![release-please](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/release-please.yml/badge.svg)](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/release-please.yml)
  9 | [![Publish Package in PyPI](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/pypi.yml/badge.svg)](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/pypi.yml)
 10 | [![CI/CD Pipeline](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/cicd.yml/badge.svg)](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/cicd.yml)
 11 | [![Code Coverage](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/.codecov.yml/badge.svg)](https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/.codecov.yml)
 12 | ![Python version](https://img.shields.io/badge/python-3.9|3.10|3.11|3.12-blue)
 13 | 
 14 | **Anonymity as major assurance of personal data privacy:**
 15 | 
 16 | ANJANA is a Python library for anonymizing sensitive data.
 17 | 
 18 | The following anonymity techniques are implemented, based on the Python library _[pyCANON](https://github.com/IFCA-Advanced-Computing/pycanon)_:
 19 | * _k-anonymity_.
 20 | * _(α,k)-anonymity_.
 21 | * _ℓ-diversity_.
 22 | * _Entropy ℓ-diversity_.
 23 | * _Recursive (c,ℓ)-diversity_.
 24 | * _t-closeness_.
 25 | * _Basic β-likeness_.
 26 | * _Enhanced β-likeness_.
 27 | * _δ-disclosure privacy_.
 28 | 
 29 | ## Installation
 30 | First, we strongly recommend the use of a virtual environment. In linux:
 31 | ```bash
 32 | virtualenv .venv -p python3
 33 | source .venv/bin/activate
 34 | ```
 35 | 
 36 | **Using [pip](https://pypi.org/project/anjana/)**:
 37 | 
 38 | Install anjana (linux and windows):
 39 | ```bash
 40 | pip install anjana
 41 | ```
 42 | 
 43 | **Using git**:
 44 | 
 45 | Install the most updated version of anjana (linux and windows):
 46 | 
 47 | ```bash
 48 | pip install git+https://github.com/IFCA-Advanced-Computing/anjana.git
 49 | ```
 50 | 
 51 | ## Getting started
 52 | 
 53 | For anonymizing your data you need to introduce:
 54 | * The **pandas dataframe** with the data to be anonymized. Each column can contain: identifiers, quasi-identifiers or sensitive attributes.
 55 | * The **list with the names of the identifiers** in the dataframe, in order to suppress them.
 56 | * The **list with the names of the quasi-identifiers** in the dataframe.
 57 | * The **sensitive attribute** (only one) when applying techniques other than _k-anonymity_.
 58 | * The **level of anonymity to be applied**, e.g. _k_ (for _k-anonymity_), _ℓ_ (for _ℓ-diversity_), _t_ (for _t-closeness_), _β_ (for _basic or enhanced β-likeness_), etc.
 59 | * Maximum **level of record suppression** allowed (from 0 to 100, acting as the percentage of suppressed records).
 60 | * Dictionary containing one dictionary for each quasi-identifier with the **hierarchies** and the levels.
 61 | 
 62 | ### Example: apply _k-anonymity_, _ℓ-diversity_ and _t-closeness_ to the [adult dataset](https://archive.ics.uci.edu/dataset/2/adult) with some predefined hierarchies:
 63 | ```python
 64 | import pandas as pd
 65 | import anjana
 66 | from anjana.anonymity import k_anonymity, l_diversity, t_closeness
 67 | 
 68 | # Read and process the data
 69 | data = pd.read_csv("adult.csv")
 70 | data.columns = data.columns.str.strip()
 71 | cols = [
 72 |     "workclass",
 73 |     "education",
 74 |     "marital-status",
 75 |     "occupation",
 76 |     "sex",
 77 |     "native-country",
 78 | ]
 79 | for col in cols:
 80 |     data[col] = data[col].str.strip()
 81 | 
 82 | # Define the identifiers, quasi-identifiers and the sensitive attribute
 83 | quasi_ident = [
 84 |     "age",
 85 |     "education",
 86 |     "marital-status",
 87 |     "occupation",
 88 |     "sex",
 89 |     "native-country",
 90 | ]
 91 | ident = ["race"]
 92 | sens_att = "salary-class"
 93 | 
 94 | # Select the desired level of k, l and t
 95 | k = 10
 96 | l_div = 2
 97 | t = 0.5
 98 | 
 99 | # Select the suppression limit allowed
100 | supp_level = 50
101 | 
102 | # Import the hierarchies for each quasi-identifier. Define a dictionary containing them
103 | hierarchies = {
104 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
105 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
106 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
107 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
108 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
109 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
110 | }
111 | 
112 | # Apply the three functions: k-anonymity, l-diversity and t-closeness
113 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
114 | data_anon = l_diversity(
115 |     data_anon, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
116 | )
117 | data_anon = t_closeness(
118 |     data_anon, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
119 | )
120 | ```
121 | 
122 | The previous code can be executed in less than 4 seconds for the more than 30,000 records of the original dataset.
123 | 
124 | ### Define your own hierarchies
125 | 
126 | All the anonymity functions available in ANJANA receive a dictionary with the hierarchies to be applied to the quasi-identifiers. The keys of this dictionary are the names of the quasi-identifier columns to which a hierarchy is to be applied (if you do not want to generalize some QIs, and therefore no hierarchy is to be applied to them, simply do not include them in this dictionary). The value for each key (QI) is itself a dictionary in which the key 0 holds the raw column (as it is in the original dataset), the key 1 holds the first level of transformation to be applied to the values of the original column, and so on, with as many keys as hierarchy levels have been established.
127 | 
128 | For a better understanding, let's look at the following example. Suppose that we have the following simulated dataset (extracted from the [_hospital_extended.csv_](https://github.com/IFCA-Advanced-Computing/anjana/blob/main/examples/data/hospital_extended.csv) dataset used for testing purposes) with _age_, _gender_ and _city_ as quasi-identifiers, _name_ as identifier and _disease_ as sensitive attribute. Regarding the QIs, we want to apply the following hierarchies: intervals of 5 years (first level) and 10 years (second level) for _age_, and suppression as first level for both _gender_ and _city_.
129 | 
130 | | name      | age | gender | city       | disease        |
131 | |-----------|-----|--------|------------|----------------|
132 | | Ramsha    | 29  | Female | Tamil Nadu | Cancer         |
133 | | Yadu      | 24  | Female | Kerala     | Viralinfection |
134 | | Salima    | 28  | Female | Tamil Nadu | TB             |
135 | | Sunny     | 27  | Male   | Karnataka  | No illness     |
136 | | Joan      | 24  | Female | Kerala     | Heart-related  |
137 | | Bahuksana | 23  | Male   | Karnataka  | TB             |
138 | | Rambha    | 19  | Male   | Kerala     | Cancer         |
139 | | Kishor    | 29  | Male   | Karnataka  | Heart-related  |
140 | | Johnson   | 17  | Male   | Kerala     | Heart-related  |
141 | | John      | 19  | Male   | Kerala     | Viralinfection |
142 | 
143 | Then, in order to create the hierarchies we can define the following dictionary:
144 | 
145 | ```python
146 | import numpy as np
147 | 
148 | age = data['age'].values
149 | # Values: [29 24 28 27 24 23 19 29 17 19] (note that the following can be automatized)
150 | age_5years = ['[25, 30)', '[20, 25)', '[25, 30)',
151 |               '[25, 30)', '[20, 25)', '[20, 25)',
152 |               '[15, 20)', '[25, 30)', '[15, 20)', '[15, 20)']
153 | 
154 | age_10years = ['[20, 30)', '[20, 30)', '[20, 30)',
155 |                '[20, 30)', '[20, 30)', '[20, 30)',
156 |                '[10, 20)', '[20, 30)', '[10, 20)', '[10, 20)']
157 | 
158 | hierarchies = {
159 |     "age": {0: age,
160 |             1: age_5years,
161 |             2: age_10years},
162 |     "gender": {
163 |         0: data["gender"].values,
164 |         1: np.array(["*"] * len(data["gender"].values)) # Suppression
165 |     },
166 |     "city": {0: data["city"].values,
167 |              1: np.array(["*"] * len(data["city"].values))} # Suppression
168 | }
169 | ```
170 | 
171 | You can also use the function _generate_intervals()_ from _utils_ for creating the interval-based hierarchy as follows:
172 | 
173 | ```python
174 | import numpy as np
175 | from anjana.anonymity import utils
176 | 
177 | age = data['age'].values
178 | 
179 | hierarchies = {
180 |     "age": {
181 |         0: data["age"].values,
182 |         1: utils.generate_intervals(data["age"].values, 0, 100, 5),
183 |         2: utils.generate_intervals(data["age"].values, 0, 100, 10),
184 |     },
185 |     "gender": {
186 |         0: data["gender"].values,
187 |         1: np.array(["*"] * len(data["gender"].values)) # Suppression
188 |     },
189 |     "city": {0: data["city"].values,
190 |              1: np.array(["*"] * len(data["city"].values))} # Suppression
191 | }
192 | ```
193 | 
194 | 
195 | ## License
196 | This project is licensed under the [Apache 2.0 license](https://github.com/IFCA-Advanced-Computing/anjana/blob/main/LICENSE).
197 | 
198 | ## Citation
199 | If you are using _anjana_ you can cite it as follows:
200 | 
201 | ```bibtex
202 | @article{sainzpardo2024anjana,
203 |     title={An Open Source Python Library for Anonymizing Sensitive Data},
204 |     author={S{\'a}inz-Pardo D{\'\i}az, Judith and L{\'o}pez Garc{\'\i}a, {\'A}lvaro},
205 |     journal={Scientific data},
206 |     volume={11},
207 |     number={1},
208 |     pages={1289},
209 |     year={2024},
210 |     publisher={Nature Publishing Group UK London}
211 |   }
212 | ```
213 |   
214 | ## Funding and acknowledgments
215 | This work is funded by the European Union through the SIESTA project (Horizon Europe) under Grant number [101131957](https://cordis.europa.eu/project/id/101131957).
216 | <p>
217 | <img align="center" width="250" src="https://raw.githubusercontent.com/SIESTA-eu/.github/main/profile/EN-Funded.jpg">
218 | <img align="center" width="250" src="https://raw.githubusercontent.com/SIESTA-eu/.github/main/profile/logo.png">
219 | </p>
220 | 
221 | 
222 | ----
223 | **_Note: Anjana and the mythology of Cantabria_**
224 | <p align="center">
225 |     <i>
226 | "La Anjana" is a character from the mythology of Cantabria. Known as the good fairy of Cantabria, generous and protective of all people, she helps the poor, the suffering and those who stray in the forest.
227 |     </i>
228 | </p>
229 | <p align="center">
230 |     <i>
231 | - Partially extracted from: Cotera, Gustavo. Mitología de Cantabria. Ed. Tantin, Santander, 1998.
232 |     </i>
233 |     </p>
234 | </div>
235 | 
236 | 


--------------------------------------------------------------------------------
/anjana/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | """ANJANA is an open source framework for anonymizing data with different techniques."""
18 | 
19 | __version__ = "1.1.0"
20 | 


--------------------------------------------------------------------------------
/anjana/anonymity/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | """Python library for applying different anonymity techniques."""
18 | 
19 | from ._k_anonymity import k_anonymity, k_anonymity_inner, alpha_k_anonymity
20 | from ._l_diversity import l_diversity, entropy_l_diversity, recursive_c_l_diversity
21 | from ._t_closeness import t_closeness
22 | from ._beta_likeness import basic_beta_likeness, enhanced_beta_likeness
23 | from ._delta_disclosure import delta_disclosure
24 | 
25 | __all__ = [
26 |     "k_anonymity",
27 |     "k_anonymity_inner",
28 |     "alpha_k_anonymity",
29 |     "l_diversity",
30 |     "entropy_l_diversity",
31 |     "recursive_c_l_diversity",
32 |     "t_closeness",
33 |     "basic_beta_likeness",
34 |     "enhanced_beta_likeness",
35 |     "delta_disclosure",
36 | ]
37 | 


--------------------------------------------------------------------------------
/anjana/anonymity/_beta_likeness.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import pycanon
 20 | from anjana.anonymity.utils import utils
 21 | from copy import copy
 22 | from anjana.anonymity import k_anonymity_inner
 23 | from beartype import beartype
 24 | from beartype import typing
 25 | 
 26 | 
 27 | @beartype()
 28 | def basic_beta_likeness(
 29 |     data: pd.DataFrame,
 30 |     ident: typing.Union[typing.List, np.ndarray],
 31 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 32 |     sens_att: str,
 33 |     k: int,
 34 |     beta: typing.Union[float, int],
 35 |     supp_level: typing.Union[float, int],
 36 |     hierarchies: dict,
 37 | ) -> pd.DataFrame:
 38 |     """Anonymize a dataset using basic beta-likeness and k-anonymity.
 39 | 
 40 |     :param data: data under study.
 41 |     :type data: pandas dataframe
 42 | 
 43 |     :param ident: list with the name of the columns of the dataframe
 44 |         that are identifiers.
 45 |     :type ident: list of strings
 46 | 
 47 |     :param quasi_ident: list with the name of the columns of the dataframe
 48 |         that are quasi-identifiers.
 49 |     :type quasi_ident: list of strings
 50 | 
 51 |     :param sens_att: string with the name of the sensitive attribute.
 52 |     :type sens_att: string
 53 | 
 54 |     :param k: value of k for k-anonymity to be applied.
 55 |     :type k: int
 56 | 
 57 |     :param beta: value of beta for basic beta-likeness to be applied.
 58 |     :type beta: float
 59 | 
 60 |     :param supp_level: maximum level of record suppression allowed
 61 |         (from 0 to 100).
 62 |     :type supp_level: float
 63 | 
 64 |     :param hierarchies: hierarchies for generalizing the QI.
 65 |     :type hierarchies: dictionary containing one dictionary for QI
 66 |         with the hierarchies and the levels
 67 | 
 68 |     :return: anonymized data.
 69 |     :rtype: pandas dataframe
 70 |     """
 71 |     if beta < 0:
 72 |         raise ValueError(f"Invalid value of beta for beta-likeness, beta={beta}")
 73 | 
 74 |     data_kanon, supp_records, gen_level = k_anonymity_inner(
 75 |         data, ident, quasi_ident, k, supp_level, hierarchies
 76 |     )
 77 | 
 78 |     beta_real = pycanon.anonymity.basic_beta_likeness(
 79 |         data_kanon, quasi_ident, [sens_att]
 80 |     )
 81 |     quasi_ident_gen = copy(quasi_ident)
 82 | 
 83 |     if beta_real <= beta:
 84 |         print(f"The data verifies basic beta-likeness with beta={beta_real}")
 85 |         return data_kanon
 86 | 
 87 |     while beta_real > beta:
 88 |         if len(quasi_ident_gen) == 0:
 89 |             print(f"Basic beta likeness cannot be achieved for beta={beta}")
 90 |             return pd.DataFrame()
 91 | 
 92 |         qi_gen = quasi_ident_gen[
 93 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
 94 |         ]
 95 | 
 96 |         try:
 97 |             generalization_qi = utils.apply_hierarchy(
 98 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
 99 |             )
100 |             data_kanon[qi_gen] = generalization_qi
101 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
102 |         except ValueError:
103 |             if qi_gen in quasi_ident_gen:
104 |                 quasi_ident_gen.remove(qi_gen)
105 | 
106 |         beta_real = pycanon.anonymity.basic_beta_likeness(
107 |             data_kanon, quasi_ident, [sens_att]
108 |         )
109 |         if beta_real <= beta:
110 |             return data_kanon
111 | 
112 |     return data_kanon
113 | 
114 | 
115 | @beartype()
116 | def enhanced_beta_likeness(
117 |     data: pd.DataFrame,
118 |     ident: typing.Union[typing.List, np.ndarray],
119 |     quasi_ident: typing.Union[typing.List, np.ndarray],
120 |     sens_att: str,
121 |     k: int,
122 |     beta: typing.Union[float, int],
123 |     supp_level: typing.Union[float, int],
124 |     hierarchies: dict,
125 | ) -> pd.DataFrame:
126 |     """Anonymize a dataset using enhanced beta-likeness and k-anonymity.
127 | 
128 |     :param data: data under study.
129 |     :type data: pandas dataframe
130 | 
131 |     :param ident: list with the name of the columns of the dataframe
132 |         that are identifiers.
133 |     :type ident: list of strings
134 | 
135 |     :param quasi_ident: list with the name of the columns of the dataframe
136 |         that are quasi-identifiers.
137 |     :type quasi_ident: list of strings
138 | 
139 |     :param sens_att: string with the name of the sensitive attribute.
140 |     :type sens_att: string
141 | 
142 |     :param k: value of k for k-anonymity to be applied.
143 |     :type k: int
144 | 
145 |     :param beta: value of beta for enhanced beta-likeness to be applied.
146 |     :type beta: float
147 | 
148 |     :param supp_level: maximum level of record suppression allowed
149 |         (from 0 to 100).
150 |     :type supp_level: float
151 | 
152 |     :param hierarchies: hierarchies for generalizing the QI.
153 |     :type hierarchies: dictionary containing one dictionary for QI
154 |         with the hierarchies and the levels
155 | 
156 |     :return: anonymized data.
157 |     :rtype: pandas dataframe
158 |     """
159 |     if beta < 0:
160 |         raise ValueError(f"Invalid value of beta for beta-likeness, beta={beta}")
161 | 
162 |     data_kanon, supp_records, gen_level = k_anonymity_inner(
163 |         data, ident, quasi_ident, k, supp_level, hierarchies
164 |     )
165 | 
166 |     beta_real = pycanon.anonymity.enhanced_beta_likeness(
167 |         data_kanon, quasi_ident, [sens_att]
168 |     )
169 |     quasi_ident_gen = copy(quasi_ident)
170 | 
171 |     if beta_real <= beta:
172 |         print(f"The data verifies enhanced beta-likeness with beta={beta_real}")
173 |         return data_kanon
174 | 
175 |     while beta_real > beta:
176 |         if len(quasi_ident_gen) == 0:
177 |             print(f"Enhanced beta likeness cannot be achieved for beta={beta}")
178 |             return pd.DataFrame()
179 | 
180 |         qi_gen = quasi_ident_gen[
181 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
182 |         ]
183 | 
184 |         try:
185 |             generalization_qi = utils.apply_hierarchy(
186 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
187 |             )
188 |             data_kanon[qi_gen] = generalization_qi
189 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
190 |         except ValueError:
191 |             if qi_gen in quasi_ident_gen:
192 |                 quasi_ident_gen.remove(qi_gen)
193 | 
194 |         beta_real = pycanon.anonymity.enhanced_beta_likeness(
195 |             data_kanon, quasi_ident, [sens_att]
196 |         )
197 |         if beta_real <= beta:
198 |             return data_kanon
199 | 
200 |     return data_kanon
201 | 


--------------------------------------------------------------------------------
/anjana/anonymity/_delta_disclosure.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import pycanon
 20 | from anjana.anonymity.utils import utils
 21 | from copy import copy
 22 | from anjana.anonymity import k_anonymity_inner
 23 | from beartype import beartype
 24 | from beartype import typing
 25 | 
 26 | 
 27 | @beartype()
 28 | def delta_disclosure(
 29 |     data: pd.DataFrame,
 30 |     ident: typing.Union[typing.List, np.ndarray],
 31 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 32 |     sens_att: str,
 33 |     k: int,
 34 |     delta: typing.Union[float, int],
 35 |     supp_level: typing.Union[float, int],
 36 |     hierarchies: dict,
 37 | ) -> pd.DataFrame:
 38 |     """Anonymize a dataset using delta-disclosure privacy and k-anonymity.
 39 | 
 40 |     :param data: data under study.
 41 |     :type data: pandas dataframe
 42 | 
 43 |     :param ident: list with the name of the columns of the dataframe
 44 |         that are identifiers.
 45 |     :type ident: list of strings
 46 | 
 47 |     :param quasi_ident: list with the name of the columns of the dataframe
 48 |         that are quasi-identifiers.
 49 |     :type quasi_ident: list of strings
 50 | 
 51 |     :param sens_att: str with the name of the sensitive attribute.
 52 |     :type sens_att: string
 53 | 
 54 |     :param k: value of k for k-anonymity to be applied.
 55 |     :type k: int
 56 | 
 57 |     :param delta: value of delta for delta-disclosure privacy to be applied.
 58 |     :type delta: float
 59 | 
 60 |     :param supp_level: maximum level of record suppression allowed
 61 |         (from 0 to 100).
 62 |     :type supp_level: float
 63 | 
 64 |     :param hierarchies: hierarchies for generalizing the QI.
 65 |     :type hierarchies: dictionary containing one dictionary for QI
 66 |         with the hierarchies and the levels
 67 | 
 68 |     :return: anonymized data.
 69 |     :rtype: pandas dataframe
 70 |     """
 71 |     if delta < 0:
 72 |         raise ValueError(f"Invalid value of delta for delta-disclosure, delta={delta}")
 73 | 
 74 |     data_kanon, supp_records, gen_level = k_anonymity_inner(
 75 |         data, ident, quasi_ident, k, supp_level, hierarchies
 76 |     )
 77 | 
 78 |     delta_real = pycanon.anonymity.delta_disclosure(data_kanon, quasi_ident, [sens_att])
 79 |     quasi_ident_gen = copy(quasi_ident)
 80 | 
 81 |     if delta_real <= delta:
 82 |         print(f"The data verifies delta-disclosure with delta={delta_real}")
 83 |         return data_kanon
 84 | 
 85 |     while delta_real > delta:
 86 |         if len(quasi_ident_gen) == 0:
 87 |             print(f"Delta-disclosure privacy cannot be achieved for delta={delta}")
 88 |             return pd.DataFrame()
 89 | 
 90 |         qi_gen = quasi_ident_gen[
 91 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
 92 |         ]
 93 | 
 94 |         try:
 95 |             generalization_qi = utils.apply_hierarchy(
 96 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
 97 |             )
 98 |             data_kanon[qi_gen] = generalization_qi
 99 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
100 |         except ValueError:
101 |             if qi_gen in quasi_ident_gen:
102 |                 quasi_ident_gen.remove(qi_gen)
103 | 
104 |         delta_real = pycanon.anonymity.delta_disclosure(
105 |             data_kanon, quasi_ident, [sens_att]
106 |         )
107 |         if delta_real <= delta:
108 |             return data_kanon
109 | 
110 |     return data_kanon
111 | 


--------------------------------------------------------------------------------
/anjana/anonymity/_k_anonymity.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import pycanon.anonymity
 20 | from anjana.anonymity.utils import utils
 21 | from copy import copy
 22 | from beartype import beartype
 23 | from beartype import typing
 24 | 
 25 | 
 26 | @beartype
 27 | def k_anonymity(
 28 |     data: pd.DataFrame,
 29 |     ident: typing.Union[typing.List, np.ndarray],
 30 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 31 |     k: int,
 32 |     supp_level: typing.Union[float, int],
 33 |     hierarchies: dict,
 34 | ) -> pd.DataFrame:
 35 |     """Anonymize a dataset using k-anonymity.
 36 | 
 37 |     :param data: data under study.
 38 |     :type data: pandas dataframe
 39 | 
 40 |     :param ident: list with the name of the columns of the dataframe
 41 |         that are identifiers.
 42 |     :type ident: list of strings
 43 | 
 44 |     :param quasi_ident: list with the name of the columns of the dataframe
 45 |         that are quasi-identifiers.
 46 |     :type quasi_ident: list of strings
 47 | 
 48 |     :param k: desired level of k-anonymity.
 49 |     :type k: int
 50 | 
 51 |     :param supp_level: maximum level of record suppression allowed
 52 |         (from 0 to 100).
 53 |     :type supp_level: float
 54 | 
 55 |     :param hierarchies: hierarchies for generalizing the QI.
 56 |     :type hierarchies: dictionary containing one dictionary for QI
 57 |         with the hierarchies and the levels
 58 | 
 59 |     :return: anonymized data.
 60 |     :rtype: pandas dataframe
 61 |     """
 62 |     data_anon, _, _ = k_anonymity_inner(
 63 |         data, ident, quasi_ident, k, supp_level, hierarchies
 64 |     )
 65 |     return data_anon
 66 | 
 67 | 
 68 | @beartype()
 69 | def alpha_k_anonymity(
 70 |     data: pd.DataFrame,
 71 |     ident: typing.Union[typing.List, np.ndarray],
 72 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 73 |     sens_att: str,
 74 |     k: int,
 75 |     alpha: typing.Union[float, int],
 76 |     supp_level: typing.Union[float, int],
 77 |     hierarchies: dict,
 78 | ) -> pd.DataFrame:
 79 |     """Anonymize a dataset using (alpha,k)-anonymity.
 80 | 
 81 |     :param data: data under study.
 82 |     :type data: pandas dataframe
 83 | 
 84 |     :param ident: list with the name of the columns of the dataframe
 85 |         that are identifiers.
 86 |     :type ident: list of strings
 87 | 
 88 |     :param quasi_ident: list with the name of the columns of the dataframe
 89 |         that are quasi-identifiers.
 90 |     :type quasi_ident: list of strings
 91 | 
 92 |     :param sens_att: string with the name of the sensitive attribute.
 93 |     :type sens_att: string
 94 | 
 95 |     :param k: desired level of k-anonymity.
 96 |     :type k: int
 97 | 
 98 |     :param alpha: desired level of alpha for (alpha,k)-anonymity.
 99 |     :type alpha: float
100 | 
101 |     :param supp_level: maximum level of record suppression allowed
102 |         (from 0 to 100).
103 |     :type supp_level: float
104 | 
105 |     :param hierarchies: hierarchies for generalizing the QI.
106 |     :type hierarchies: dictionary containing one dictionary for QI
107 |         with the hierarchies and the levels
108 | 
109 |     :return: anonymized data.
110 |     :rtype: pandas dataframe
111 |     """
112 |     data_kanon, supp_records, gen_level = k_anonymity_inner(
113 |         data, ident, quasi_ident, k, supp_level, hierarchies
114 |     )
115 | 
116 |     if alpha > 1 or alpha < 0:
117 |         raise ValueError(
118 |             f"Invalid value of alpha for (alpha,k)-anonymity " f"alpha={alpha}"
119 |         )
120 | 
121 |     alpha_real, _ = pycanon.anonymity.alpha_k_anonymity(
122 |         data_kanon, quasi_ident, [sens_att]
123 |     )
124 |     quasi_ident_gen = copy(quasi_ident)
125 | 
126 |     while alpha_real > alpha:
127 |         if len(quasi_ident_gen) == 0:
128 |             print(f"(alpha,k)-anonymity cannot be achieved for alpha={alpha}")
129 |             return pd.DataFrame()
130 | 
131 |         qi_gen = quasi_ident_gen[
132 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
133 |         ]
134 | 
135 |         try:
136 |             generalization_qi = utils.apply_hierarchy(
137 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
138 |             )
139 |             data_kanon[qi_gen] = generalization_qi
140 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
141 |         except ValueError:
142 |             if qi_gen in quasi_ident_gen:
143 |                 quasi_ident_gen.remove(qi_gen)
144 | 
145 |         alpha_real, _ = pycanon.anonymity.alpha_k_anonymity(
146 |             data_kanon, quasi_ident, [sens_att]
147 |         )
148 | 
149 |         if alpha_real <= alpha:
150 |             return data_kanon
151 | 
152 |         equiv_class = pycanon.anonymity.utils.aux_anonymity.get_equiv_class(
153 |             data_kanon, quasi_ident
154 |         )
155 | 
156 |         k_ec = []
157 |         alpha_ec = []
158 |         for ec in equiv_class:
159 |             data_temp = data_kanon.iloc[
160 |                 pycanon.anonymity.utils.aux_functions.convert(ec)
161 |             ]
162 |             values = np.unique(data_temp[sens_att].values)
163 |             alpha_s = [
164 |                 len(data_temp[data_temp[sens_att] == s]) / len(data_temp)
165 |                 for s in values
166 |             ]
167 |             alpha_ec.append(max(alpha_s))
168 |             k_ec.append(len(ec))
169 | 
170 |         if alpha > min(alpha_ec):
171 |             if max(alpha_ec) <= alpha:
172 |                 return data_kanon
173 | 
174 |             data_ec = pd.DataFrame(
175 |                 {"equiv_class": equiv_class, "alpha": alpha_ec, "k": k_ec}
176 |             )
177 |             data_ec_alpha = data_ec[data_ec.alpha > alpha]
178 |             records_sup = sum(data_ec_alpha.k.values)
179 |             if (records_sup + supp_records) * 100 / len(data) <= supp_level:
180 |                 ec_elim = np.concatenate(
181 |                     [
182 |                         pycanon.anonymity.utils.aux_functions.convert(ec)
183 |                         for ec in data_ec_alpha.equiv_class.values
184 |                     ]
185 |                 )
186 |                 anonim_data = data_kanon.drop(ec_elim).reset_index()
187 |                 alpha_supp, _ = pycanon.anonymity.alpha_k_anonymity(
188 |                     anonim_data, quasi_ident, [sens_att]
189 |                 )
190 |                 if alpha_supp <= alpha:
191 |                     return anonim_data
192 | 
193 |     return data_kanon
194 | 
195 | 
def k_anonymity_inner(
    data: pd.DataFrame,
    ident: typing.Union[typing.List, np.ndarray],
    quasi_ident: typing.Union[typing.List, np.ndarray],
    k: int,
    supp_level: typing.Union[float, int],
    hierarchies: dict,
) -> typing.Tuple[pd.DataFrame, int, dict]:
    """Auxiliary function for applying k-anonymity.

    Works on a copy of ``data``. At each step it first tries to reach ``k``
    by suppressing the equivalence classes smaller than ``k`` (if the
    suppression budget allows it); otherwise it generalizes one more level
    the quasi-identifier with the most distinct values.

    :param data: data under study.
    :type data: pandas dataframe

    :param ident: list with the name of the columns of the dataframe
        that are identifiers.
    :type ident: list of strings

    :param quasi_ident: list with the name of the columns of the dataframe
        that are quasi-identifiers.
    :type quasi_ident: list of strings

    :param k: desired level of k-anonymity.
    :type k: int

    :param supp_level: maximum level of record suppression allowed
        (from 0 to 100).
    :type supp_level: float

    :param hierarchies: hierarchies for generalizing the QI.
    :type hierarchies: dictionary containing one dictionary for QI
        with the hierarchies and the levels

    :return: anonymized data (an empty dataframe if ``k`` cannot be
        achieved). When records are suppressed the dataframe comes from
        ``reset_index()`` and carries the previous index as an extra
        ``index`` column.
    :rtype: pandas dataframe

    :return: number of records suppressed.
    :rtype: int

    :return: level of generalization applied to each QI.
    :rtype: dict

    :raises ValueError: if ``k`` is lower than 1 or ``supp_level`` is
        outside [0, 100].
    """
    if k < 1:
        raise ValueError(f"Invalid value of k for k-anonymity k={k}")

    if supp_level > 100 or supp_level < 0:
        raise ValueError(f"Invalid value for the suppression level {supp_level}")

    data = copy(data)
    data = utils.suppress_identifiers(data, ident)
    n = len(data)

    gen_level = utils.check_gen_level(data, quasi_ident, hierarchies)

    k_real = pycanon.anonymity.k_anonymity(data, quasi_ident)
    # Work on a real list: quasi_ident may be an ndarray, which has no
    # ``remove`` method.
    quasi_ident_gen = list(quasi_ident)

    if k_real >= k:
        print(f"The data verifies k-anonymity with k={k_real}")
        return data, n - len(data), gen_level

    # Invariant at the top of every iteration: k_real < k for ``data``.
    while True:
        equiv_class = pycanon.anonymity.utils.aux_anonymity.get_equiv_class(
            data, quasi_ident
        )
        len_ec = [len(ec) for ec in equiv_class]

        # Suppression can only help if at least one class already has k
        # records.
        if k <= max(len_ec):
            data_ec = pd.DataFrame({"equiv_class": equiv_class, "k": len_ec})
            data_ec_k = data_ec[data_ec.k < k]
            records_sup = sum(data_ec_k.k.values)
            if records_sup * 100 / len(data) <= supp_level:
                ec_elim = np.concatenate(
                    [
                        pycanon.anonymity.utils.aux_functions.convert(ec)
                        for ec in data_ec_k.equiv_class.values
                    ]
                )
                anonim_data = data.drop(ec_elim).reset_index()
                supp_records = n - len(anonim_data)
                k_supp = pycanon.anonymity.k_anonymity(anonim_data, quasi_ident)
                if k_supp >= k:
                    return anonim_data, supp_records, gen_level

        if len(quasi_ident_gen) == 0:
            print(f"The anonymization cannot be carried out for the given value k={k}")
            return pd.DataFrame(), n - len(data), gen_level

        # Generalize the quasi-identifier with the most distinct values.
        qi_gen = quasi_ident_gen[
            np.argmax([len(np.unique(data[qi])) for qi in quasi_ident_gen])
        ]

        try:
            generalization_qi = utils.apply_hierarchy(
                data[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
            )
            data[qi_gen] = generalization_qi
            gen_level[qi_gen] = gen_level[qi_gen] + 1
        except ValueError:
            # The next level could not be applied: stop considering this
            # quasi-identifier for further generalization.
            if qi_gen in quasi_ident_gen:
                quasi_ident_gen.remove(qi_gen)

        k_real = pycanon.anonymity.k_anonymity(data, quasi_ident)
        if k_real >= k:
            return data, n - len(data), gen_level
307 | 


--------------------------------------------------------------------------------
/anjana/anonymity/_l_diversity.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import pycanon
 20 | from anjana.anonymity.utils import utils
 21 | from copy import copy
 22 | from anjana.anonymity import k_anonymity_inner
 23 | from beartype import beartype
 24 | from beartype import typing
 25 | 
 26 | 
 27 | @beartype()
 28 | def l_diversity(
 29 |     data: pd.DataFrame,
 30 |     ident: typing.Union[typing.List, np.ndarray],
 31 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 32 |     sens_att: str,
 33 |     k: int,
 34 |     l_div: int,
 35 |     supp_level: typing.Union[float, int],
 36 |     hierarchies: dict,
 37 | ) -> pd.DataFrame:
 38 |     """Anonymize a dataset using l-diversity.
 39 | 
 40 |     :param data: data under study.
 41 |     :type data: pandas dataframe
 42 | 
 43 |     :param ident: list with the name of the columns of the dataframe
 44 |         that are identifiers.
 45 |     :type ident: list of strings
 46 | 
 47 |     :param quasi_ident: list with the name of the columns of the dataframe
 48 |         that are quasi-identifiers.
 49 |     :type quasi_ident: list of strings
 50 | 
 51 |     :param sens_att: string with the name of the sensitive attribute.
 52 |     :type sens_att: string
 53 | 
 54 |     :param k: desired level of k-anonymity.
 55 |     :type k: int
 56 | 
 57 |     :param l_div: desired level of l-diversity.
 58 |     :type l_div: int
 59 | 
 60 |     :param supp_level: maximum level of record suppression allowed
 61 |         (from 0 to 100).
 62 |     :type supp_level: float
 63 | 
 64 |     :param hierarchies: hierarchies for generalizing the QI.
 65 |     :type hierarchies: dictionary containing one dictionary for QI
 66 |         with the hierarchies and the levels
 67 | 
 68 |     :return: anonymized data.
 69 |     :rtype: pandas dataframe
 70 |     """
 71 |     data_anon, _ = _l_diversity_inner(
 72 |         data, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
 73 |     )
 74 |     return data_anon
 75 | 
 76 | 
 77 | @beartype()
 78 | def entropy_l_diversity(
 79 |     data: pd.DataFrame,
 80 |     ident: typing.Union[typing.List, np.ndarray],
 81 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 82 |     sens_att: str,
 83 |     k: int,
 84 |     l_div: int,
 85 |     supp_level: typing.Union[float, int],
 86 |     hierarchies: dict,
 87 | ) -> pd.DataFrame:
 88 |     """Anonymize a dataset using entropy l-diversity.
 89 | 
 90 |     :param data: data under study.
 91 |     :type data: pandas dataframe
 92 | 
 93 |     :param ident: list with the name of the columns of the dataframe
 94 |         that are identifiers.
 95 |     :type ident: list of strings
 96 | 
 97 |     :param quasi_ident: list with the name of the columns of the dataframe
 98 |         that are quasi-identifiers.
 99 |     :type quasi_ident: list of strings
100 | 
101 |     :param sens_att: string with the name of the sensitive attribute.
102 |     :type sens_att: string
103 | 
104 |     :param k: desired level of k-anonymity.
105 |     :type k: int
106 | 
107 |     :param l_div: desired level of entropy l-diversity.
108 |     :type l_div: int
109 | 
110 |     :param supp_level: maximum level of record suppression allowed
111 |         (from 0 to 100).
112 |     :type supp_level: float
113 | 
114 |     :param hierarchies: hierarchies for generalizing the QI.
115 |     :type hierarchies: dictionary containing one dictionary for QI
116 |         with the hierarchies and the levels
117 | 
118 |     :return: anonymized data.
119 |     :rtype: pandas dataframe
120 |     """
121 |     data_kanon = l_diversity(
122 |         data, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
123 |     )
124 | 
125 |     l_real = pycanon.anonymity.entropy_l_diversity(data_kanon, quasi_ident, [sens_att])
126 |     quasi_ident_gen = copy(quasi_ident)
127 |     gen_level = utils.check_gen_level(data_kanon, quasi_ident, hierarchies)
128 | 
129 |     if l_real >= l_div:
130 |         print(f"The data verifies entropy l-diversity with l={l_real}")
131 |         return data_kanon
132 | 
133 |     while l_real < l_div:
134 |         if len(quasi_ident_gen) == 0:
135 |             print(f"Entropy l-diversity cannot be achieved for l={l_div}")
136 |             return pd.DataFrame()
137 | 
138 |         qi_gen = quasi_ident_gen[
139 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
140 |         ]
141 | 
142 |         try:
143 |             generalization_qi = utils.apply_hierarchy(
144 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
145 |             )
146 |             data_kanon[qi_gen] = generalization_qi
147 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
148 |         except ValueError:
149 |             if qi_gen in quasi_ident_gen:
150 |                 quasi_ident_gen.remove(qi_gen)
151 | 
152 |         l_real = pycanon.anonymity.entropy_l_diversity(
153 |             data_kanon, quasi_ident, [sens_att]
154 |         )
155 |         if l_real >= l_div:
156 |             return data_kanon
157 | 
158 |     return data_kanon
159 | 
160 | 
@beartype()
def recursive_c_l_diversity(
    data: pd.DataFrame,
    ident: typing.Union[typing.List, np.ndarray],
    quasi_ident: typing.Union[typing.List, np.ndarray],
    sens_att: str,
    k: int,
    c: int,
    l_div: int,
    supp_level: typing.Union[float, int],
    hierarchies: dict,
) -> pd.DataFrame:
    """Anonymize a dataset using recursive (c,l)-diversity.

    The data is first made l-diverse. While the measured (c, l) values do
    not reach the requested ones, the quasi-identifier with the most
    distinct values is generalized one more level and, if the suppression
    budget allows it, the equivalence classes whose c is below the
    requested one are dropped.

    :param data: data under study.
    :type data: pandas dataframe

    :param ident: list with the name of the columns of the dataframe
        that are identifiers.
    :type ident: list of strings

    :param quasi_ident: list with the name of the columns of the dataframe
        that are quasi-identifiers.
    :type quasi_ident: list of strings

    :param sens_att: string with the name of the sensitive attribute.
    :type sens_att: string

    :param k: desired level of k-anonymity.
    :type k: int

    :param c: desired value of c for recursive (c,l)-diversity.
    :type c: int

    :param l_div: desired level of l-diversity.
    :type l_div: int

    :param supp_level: maximum level of record suppression allowed
        (from 0 to 100).
    :type supp_level: float

    :param hierarchies: hierarchies for generalizing the QI.
    :type hierarchies: dictionary containing one dictionary for QI
        with the hierarchies and the levels

    :return: anonymized data (an empty dataframe if the requested values
        cannot be achieved). When records are suppressed the result comes
        from ``reset_index()`` and carries the previous index as an extra
        ``index`` column.
    :rtype: pandas dataframe

    :raises ValueError: if ``c`` is lower than 1.
    """
    if c < 1:
        raise ValueError(f"Invalid value of c for recursive (c,l)-diversity, c={c}")

    # Only the size of the original data is needed for the suppression
    # budget; the inner routines work on their own copies.
    n_records = len(data)
    data_kanon, supp_records = _l_diversity_inner(
        data, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
    )

    # _l_diversity_inner reports failure with an empty dataframe; it has no
    # quasi-identifier columns to measure or generalize, so propagate it.
    if data_kanon.empty:
        return data_kanon

    c_real, l_real = pycanon.anonymity.recursive_c_l_diversity(
        data_kanon, quasi_ident, [sens_att]
    )
    # Work on a real list: quasi_ident may be an ndarray, which has no
    # ``remove`` method.
    quasi_ident_gen = list(quasi_ident)
    gen_level = utils.check_gen_level(data_kanon, quasi_ident, hierarchies)

    if l_real >= l_div and c_real >= c:
        print(
            f"The data verifies recursive (c,l)-diversity with l={l_real}, c={c_real}"
        )
        return data_kanon

    while l_real < l_div or c_real < c:
        if len(quasi_ident_gen) == 0:
            print(
                f"Recursive (c,l)-diversity cannot be achieved for l={l_div} and c={c}"
            )
            return pd.DataFrame()

        # Generalize the quasi-identifier with the most distinct values.
        qi_gen = quasi_ident_gen[
            np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
        ]

        try:
            generalization_qi = utils.apply_hierarchy(
                data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
            )
            data_kanon[qi_gen] = generalization_qi
            gen_level[qi_gen] = gen_level[qi_gen] + 1
        except ValueError:
            # The next level could not be applied: stop considering this
            # quasi-identifier for further generalization.
            if qi_gen in quasi_ident_gen:
                quasi_ident_gen.remove(qi_gen)

        c_real, l_real = pycanon.anonymity.recursive_c_l_diversity(
            data_kanon, quasi_ident, [sens_att]
        )

        # Generalization alone was enough: no record needs to be dropped.
        if l_real >= l_div and c_real >= c:
            return data_kanon

        equiv_class = pycanon.anonymity.utils.aux_anonymity.get_equiv_class(
            data_kanon, quasi_ident
        )

        # Per equivalence class: its size and its c value, computed from the
        # sorted frequencies of the sensitive values.
        k_ec = []
        c_ec = []
        for ec in equiv_class:
            data_temp = data_kanon.iloc[
                pycanon.anonymity.utils.aux_functions.convert(ec)
            ]
            values = np.unique(data_temp[sens_att].values)
            r_ec = np.sort([len(data_temp[data_temp[sens_att] == s]) for s in values])
            # NOTE(review): the denominator is 0 if a class has fewer than
            # l_div distinct sensitive values -- confirm this cannot happen
            # after the l-diversity stage.
            c_ec.append(np.floor(r_ec[0] / sum(r_ec[(l_div - 1) :]) + 1))
            k_ec.append(len(ec))

        # The suppression decision needs the c value of *every* class, so it
        # is taken once the loop above has finished.
        if max(c_ec) < c:
            # No class reaches c: suppression would remove every record, so
            # keep generalizing instead.
            continue

        data_ec = pd.DataFrame({"equiv_class": equiv_class, "c_ec": c_ec, "k": k_ec})
        data_ec_c = data_ec[data_ec.c_ec < c]
        if data_ec_c.empty:
            # Nothing to suppress (np.concatenate fails on an empty list).
            continue

        records_sup = sum(data_ec_c.k.values)
        # The budget counts the records already suppressed in the previous
        # stages.
        if (records_sup + supp_records) * 100 / n_records <= supp_level:
            ec_elim = np.concatenate(
                [
                    pycanon.anonymity.utils.aux_functions.convert(ec)
                    for ec in data_ec_c.equiv_class.values
                ]
            )
            anonim_data = data_kanon.drop(ec_elim).reset_index()
            c_supp, l_supp = pycanon.anonymity.recursive_c_l_diversity(
                anonim_data, quasi_ident, [sens_att]
            )
            # Same acceptance criterion as for the generalized data.
            if l_supp >= l_div and c_supp >= c:
                return anonim_data

    return data_kanon
293 | 
294 | 
295 | def _l_diversity_inner(
296 |     data: pd.DataFrame,
297 |     ident: typing.Union[typing.List, np.ndarray],
298 |     quasi_ident: typing.Union[typing.List, np.ndarray],
299 |     sens_att: str,
300 |     k: int,
301 |     l_div: int,
302 |     supp_level: typing.Union[float, int],
303 |     hierarchies: dict,
304 | ) -> (pd.DataFrame, int):
305 |     """Anonymize a dataset using l-diversity.
306 | 
307 |     :param data: data under study.
308 |     :type data: pandas dataframe
309 | 
310 |     :param ident: list with the name of the columns of the dataframe
311 |         that are identifiers.
312 |     :type ident: list of strings
313 | 
314 |     :param quasi_ident: list with the name of the columns of the dataframe
315 |         that are quasi-identifiers.
316 |     :type quasi_ident: list of strings
317 | 
318 |     :param sens_att: string with the name of the sensitive attribute.
319 |     :type sens_att: string
320 | 
321 |     :param k: desired level of k-anonymity.
322 |     :type k: int
323 | 
324 |     :param l_div: desired level of l-diversity.
325 |     :type l_div: int
326 | 
327 |     :param supp_level: maximum level of record suppression allowed
328 |         (from 0 to 100).
329 |     :type supp_level: float
330 | 
331 |     :param hierarchies: hierarchies for generalizing the QI.
332 |     :type hierarchies: dictionary containing one dictionary for QI
333 |         with the hierarchies and the levels
334 | 
335 |     :return: anonymized data.
336 |     :rtype: pandas dataframe
337 | 
338 |     :return: number of records suppressed.
339 |     :rtype: int
340 |     """
341 |     if l_div < 1:
342 |         raise ValueError(f"Invalid value of l for l-diversity l={l_div}")
343 | 
344 |     data_kanon, supp_records_k, gen_level = k_anonymity_inner(
345 |         data, ident, quasi_ident, k, supp_level, hierarchies
346 |     )
347 | 
348 |     data = copy(data)
349 |     data = utils.suppress_identifiers(data, ident)
350 | 
351 |     l_real = pycanon.anonymity.l_diversity(data_kanon, quasi_ident, [sens_att])
352 |     quasi_ident_gen = copy(quasi_ident)
353 | 
354 |     if l_real >= l_div:
355 |         print(f"The data verifies l-diversity with l={l_real}")
356 |         return data_kanon, supp_records_k
357 | 
358 |     while l_real < l_div:
359 |         equiv_class = pycanon.anonymity.utils.aux_anonymity.get_equiv_class(
360 |             data_kanon, quasi_ident
361 |         )
362 |         ec_sensitivity = [
363 |             len(np.unique(data_kanon.iloc[ec][sens_att])) for ec in equiv_class
364 |         ]
365 |         k_ec = [len(ec) for ec in equiv_class]
366 | 
367 |         if l_div > max(ec_sensitivity):
368 |             data_ec = pd.DataFrame(
369 |                 {"equiv_class": equiv_class, "l": ec_sensitivity, "k": k_ec}
370 |             )
371 |             data_ec_l = data_ec[data_ec.l < l_div]
372 |             records_sup = sum(data_ec_l.k.values)
373 |             if (records_sup + supp_records_k) * 100 / len(data) <= supp_level:
374 |                 ec_elim = np.concatenate(
375 |                     [
376 |                         pycanon.anonymity.utils.aux_functions.convert(ec)
377 |                         for ec in data_ec_l.equiv_class.values
378 |                     ]
379 |                 )
380 |                 anonim_data = data_kanon.drop(ec_elim).reset_index()
381 |                 l_supp = pycanon.anonymity.l_diversity(
382 |                     anonim_data, quasi_ident, [sens_att]
383 |                 )
384 |                 supp_records_l = supp_records_k + records_sup
385 |                 if l_supp >= l_div:
386 |                     return anonim_data, supp_records_l
387 | 
388 |         if len(quasi_ident_gen) == 0:
389 |             print(f"l-diversity cannot be achieved for l={l_div}")
390 |             return pd.DataFrame(), supp_records_k
391 | 
392 |         qi_gen = quasi_ident_gen[
393 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
394 |         ]
395 | 
396 |         try:
397 |             generalization_qi = utils.apply_hierarchy(
398 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
399 |             )
400 |             data_kanon[qi_gen] = generalization_qi
401 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
402 |         except ValueError:
403 |             if qi_gen in quasi_ident_gen:
404 |                 quasi_ident_gen.remove(qi_gen)
405 | 
406 |         l_real = pycanon.anonymity.l_diversity(data_kanon, quasi_ident, [sens_att])
407 |         if l_real >= l_div:
408 |             return data_kanon, supp_records_k
409 | 
410 |     return data_kanon, supp_records_k
411 | 


--------------------------------------------------------------------------------
/anjana/anonymity/_t_closeness.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import numpy as np
 18 | import pandas as pd
 19 | import pycanon
 20 | from anjana.anonymity.utils import utils
 21 | from copy import copy
 22 | from anjana.anonymity import k_anonymity_inner
 23 | from beartype import beartype
 24 | from beartype import typing
 25 | 
 26 | 
 27 | @beartype()
 28 | def t_closeness(
 29 |     data: pd.DataFrame,
 30 |     ident: typing.Union[typing.List, np.ndarray],
 31 |     quasi_ident: typing.Union[typing.List, np.ndarray],
 32 |     sens_att: str,
 33 |     k: int,
 34 |     t: typing.Union[float, int],
 35 |     supp_level: typing.Union[float, int],
 36 |     hierarchies: dict,
 37 | ) -> pd.DataFrame:
 38 |     """Anonymize a dataset using t-closeness and k-anonymity.
 39 | 
 40 |     :param data: data under study.
 41 |     :type data: pandas dataframe
 42 | 
 43 |     :param ident: list with the name of the columns of the dataframe
 44 |         that are identifiers.
 45 |     :type ident: list of strings
 46 | 
 47 |     :param quasi_ident: list with the name of the columns of the dataframe
 48 |         that are quasi-identifiers.
 49 |     :type quasi_ident: list of strings
 50 | 
 51 |     :param sens_att: str with the name of the sensitive attribute.
 52 |     :type sens_att: string
 53 | 
 54 |     :param k: value of k for k-anonymity to be applied.
 55 |     :type k: int
 56 | 
 57 |     :param t: value of t for t-closeness to be applied.
 58 |     :type t: float
 59 | 
 60 |     :param supp_level: maximum level of record suppression allowed
 61 |         (from 0 to 100).
 62 |     :type supp_level: float
 63 | 
 64 |     :param hierarchies: hierarchies for generalizing the QI.
 65 |     :type hierarchies: dictionary containing one dictionary for QI
 66 |         with the hierarchies and the levels
 67 | 
 68 |     :return: anonymized data.
 69 |     :rtype: pandas dataframe
 70 |     """
 71 |     if t < 0 or t > 1:
 72 |         raise ValueError(f"Invalid value of t for t-closeness, t={t}")
 73 | 
 74 |     data_kanon, supp_records, gen_level = k_anonymity_inner(
 75 |         data, ident, quasi_ident, k, supp_level, hierarchies
 76 |     )
 77 | 
 78 |     t_real = pycanon.anonymity.t_closeness(data_kanon, quasi_ident, [sens_att])
 79 |     quasi_ident_gen = copy(quasi_ident)
 80 | 
 81 |     if t_real <= t:
 82 |         print(f"The data verifies t-closeness with t={t_real}")
 83 |         return data_kanon
 84 | 
 85 |     while t_real > t:
 86 |         if len(quasi_ident_gen) == 0:
 87 |             print(f"The anonymization cannot be carried out for the given value t={t}")
 88 |             return pd.DataFrame()
 89 | 
 90 |         qi_gen = quasi_ident_gen[
 91 |             np.argmax([len(np.unique(data_kanon[qi])) for qi in quasi_ident_gen])
 92 |         ]
 93 | 
 94 |         try:
 95 |             generalization_qi = utils.apply_hierarchy(
 96 |                 data_kanon[qi_gen].values, hierarchies[qi_gen], gen_level[qi_gen] + 1
 97 |             )
 98 |             data_kanon[qi_gen] = generalization_qi
 99 |             gen_level[qi_gen] = gen_level[qi_gen] + 1
100 |         except ValueError:
101 |             if qi_gen in quasi_ident_gen:
102 |                 quasi_ident_gen.remove(qi_gen)
103 | 
104 |         t_real = pycanon.anonymity.t_closeness(data_kanon, quasi_ident, [sens_att])
105 |         if t_real <= t:
106 |             return data_kanon
107 | 
108 |     return data_kanon
109 | 


--------------------------------------------------------------------------------
/anjana/anonymity/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | """Package containing auxiliary functions for performing the anonymization."""
18 | from .utils import (
19 |     suppress_identifiers,
20 |     apply_hierarchy,
21 |     check_gen_level,
22 |     get_transformation,
23 |     apply_transformation,
24 |     generate_intervals,
25 | )
26 | 
27 | __all__ = [
28 |     "suppress_identifiers",
29 |     "apply_hierarchy",
30 |     "check_gen_level",
31 |     "get_transformation",
32 |     "apply_transformation",
33 |     "generate_intervals",
34 | ]
35 | 


--------------------------------------------------------------------------------
/anjana/anonymity/utils/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | """Module with utils for the anonymization tools."""
 18 | 
 19 | import numpy as np
 20 | import pandas as pd
 21 | from beartype import beartype
 22 | from beartype import typing
 23 | from copy import copy
 24 | 
 25 | 
 26 | @beartype()
 27 | def suppress_identifiers(
 28 |     data: pd.DataFrame, ident: typing.Union[typing.List, np.ndarray]
 29 | ) -> pd.DataFrame:
 30 |     """Remove the identifiers from a dataset.
 31 | 
 32 |     :param data: data under study.
 33 |     :type data: pandas dataframe
 34 | 
 35 |     :param ident: list with the name of the columns of the dataframe
 36 |         that are identifiers.
 37 |     :type ident: list of strings
 38 | 
 39 |     :return: data with the identifiers suppressed.
 40 |     :rtype: pandas dataframe
 41 |     """
 42 |     for i in ident:
 43 |         if i not in data.columns:
 44 |             raise ValueError(f"Identifier {i} is not a column in the given dataset")
 45 |         data[i] = ["*"] * len(data)
 46 | 
 47 |     return data
 48 | 
 49 | 
 50 | @beartype()
 51 | def apply_hierarchy(
 52 |     data: typing.Union[typing.List, np.ndarray], hierarchies: dict, level: int
 53 | ) -> typing.Union[typing.List, np.ndarray]:
 54 |     """Apply the given level of a hierarchy for a quasi-identifier.
 55 | 
 56 |     :param data: data under study.
 57 |     :type data: list, numpy array
 58 | 
 59 |     :param hierarchies: hierarchies for generalizing a given QI.
 60 |     :type hierarchies: dictionary with the hierarchies and the levels
 61 | 
 62 |     :param level: level of the hierarchy to be applied.
 63 |     :type level: int
 64 | 
 65 |     :return: column with the given level of hierarchy applied.
 66 |     :rtype: numpy array
 67 |     """
 68 |     num_level = len(hierarchies.keys()) - 1
 69 |     if level > num_level:
 70 |         raise ValueError("Error, invalid hierarchy level")
 71 |     if not isinstance(hierarchies[level], pd.Series):
 72 |         hierarchies[level] = pd.Series(hierarchies[level])
 73 |     if not isinstance(hierarchies[level - 1], pd.Series):
 74 |         hierarchies[level - 1] = pd.Series(hierarchies[level - 1])
 75 | 
 76 |     pos = []
 77 |     for elem in data:
 78 |         pos.append(np.where(hierarchies[level - 1].values == elem)[0][0])
 79 |     data_anon = hierarchies[level].values[pos]
 80 |     return data_anon
 81 | 
 82 | 
 83 | @beartype()
 84 | def apply_hierarchy_current(
 85 |     data: typing.Union[typing.List, np.ndarray],
 86 |     hierarchies: dict,
 87 |     level: int,
 88 |     actual: int,
 89 | ) -> typing.Union[typing.List, np.ndarray]:
 90 |     """Apply certain level of a hierarchy for a quasi-identifier given the current one.
 91 | 
 92 |     :param data: data under study.
 93 |     :type data: list, numpy array
 94 | 
 95 |     :param hierarchies: hierarchies for generalizing a given QI.
 96 |     :type hierarchies: dictionary with the hierarchies and the levels
 97 | 
 98 |     :param level: level of the hierarchy to be applied.
 99 |     :type level: int
100 | 
101 |     :param actual: current level of the hierarchy applied.
102 |     :type actual: int
103 | 
104 |     :return: column with the given level of hierarchy applied.
105 |     :rtype: numpy array
106 |     """
107 |     num_level = len(hierarchies.keys()) - 1
108 |     if level > num_level:
109 |         raise ValueError("Error, invalid hierarchy level")
110 |     if not isinstance(hierarchies[level], pd.Series):
111 |         hierarchies[level] = pd.Series(hierarchies[level])
112 |     if not isinstance(hierarchies[actual], pd.Series):
113 |         hierarchies[actual] = pd.Series(hierarchies[actual])
114 | 
115 |     pos = []
116 |     for elem in data:
117 |         pos.append(np.where(hierarchies[actual].values == elem)[0][0])
118 |     data_anon = hierarchies[level].values[pos]
119 |     return data_anon
120 | 
121 | 
122 | @beartype()
123 | def check_gen_level(
124 |     data: pd.DataFrame,
125 |     quasi_ident: typing.Union[typing.List, np.ndarray],
126 |     hierarchies: dict,
127 | ) -> dict:
128 |     """Check the generalization level for each quasi-identifier.
129 | 
130 |     :param data: data under study.
131 |     :type data: pandas dataframe
132 | 
133 |     :param quasi_ident: list with the name of the columns of the dataframe
134 |         that are quasi-identifiers.
135 |     :type quasi_ident: list of strings
136 | 
137 |     :param hierarchies: hierarchies for generalizing the QI.
138 |     :type hierarchies: dictionary containing one dictionary for QI
139 |         with the hierarchies and the levels
140 | 
141 |     :return: level of generalization applied to each QI.
142 |     :rtype: dict
143 |     """
144 |     gen_level = {}
145 |     for qi in quasi_ident:
146 |         if qi in hierarchies.keys():
147 |             for level in hierarchies[qi].keys():
148 |                 hierarchy_level = set(hierarchies[qi][level])
149 |                 if set(data[qi].values).issubset(hierarchy_level):
150 |                     gen_level[qi] = level
151 | 
152 |     return gen_level
153 | 
154 | 
155 | @beartype()
156 | def get_transformation(
157 |     data_anon: pd.DataFrame,
158 |     quasi_ident: typing.Union[typing.List, np.ndarray],
159 |     hierarchies: dict,
160 | ) -> list:
161 |     """Get the transformation applied for anonymizing the data.
162 | 
163 |     Example: a transformation [0,1,2,0] means:
164 |     - Level 0 of generalization for th 1st QI
165 |     - Level 1 of generalization for th 2nd QI
166 |     - Level 2 of generalization for th 3rd QI
167 |     - Level 0 of generalization for the 4th QI
168 | 
169 |     :param data_anon: data under study.
170 |     :type data_anon: pandas dataframe
171 | 
172 |     :param quasi_ident: list with the name of the columns of the dataframe
173 |         that are quasi-identifiers.
174 |     :type quasi_ident: list of strings
175 | 
176 |     :param hierarchies: hierarchies for generalizing the QI.
177 |     :type hierarchies: dictionary containing one dictionary for QI
178 |         with the hierarchies and the levels
179 | 
180 |     :return: transformation applied
181 |     :rtype: list
182 |     """
183 |     gen_level = check_gen_level(data_anon, quasi_ident, hierarchies)
184 |     transformation = []
185 |     for qi in quasi_ident:
186 |         if qi in gen_level.keys():
187 |             transformation.append(gen_level[qi])
188 |         else:
189 |             transformation.append(0)
190 | 
191 |     return transformation
192 | 
193 | 
194 | @beartype()
195 | def apply_transformation(
196 |     data: pd.DataFrame,
197 |     quasi_ident: typing.Union[typing.List, np.ndarray],
198 |     hierarchies: dict,
199 |     transformation: list,
200 | ) -> pd.DataFrame:
201 |     """Apply a given transformation to the data.
202 | 
203 |     :param data: data under study.
204 |     :type data: pandas dataframe
205 | 
206 |     :param quasi_ident: list with the name of the columns of the dataframe
207 |         that are quasi-identifiers.
208 |     :type quasi_ident: list of strings
209 | 
210 |     :param hierarchies: hierarchies for generalizing the QI.
211 |     :type hierarchies: dictionary containing one dictionary for QI
212 |         with the hierarchies and the levels
213 | 
214 |     :param transformation: transformation to be applied
215 |     :type transformation: list
216 | 
217 |     :return: dataset generalized with the transformation given
218 |     :rtype: pandas dataframe
219 |     """
220 |     data_anon = copy(data)
221 |     actual_transform = check_gen_level(data_anon, quasi_ident, hierarchies)
222 |     for i, qi in enumerate(quasi_ident):
223 |         hierarchy_qi = hierarchies[qi]
224 |         level = transformation[i]
225 |         if level < 0:
226 |             raise ValueError("Error, invalid hierarchy level")
227 |         if level > max(hierarchies[qi].keys()):
228 |             raise ValueError("Error, invalid hierarchy level")
229 |         actual = actual_transform[qi]
230 |         if level != actual:
231 |             column = apply_hierarchy_current(
232 |                 data_anon[qi].values, hierarchy_qi, level, actual
233 |             )
234 |             data_anon[qi] = column
235 | 
236 |     return data_anon
237 | 
238 | 
239 | @beartype()
240 | def generate_intervals(
241 |     quasi_ident: typing.Union[typing.List, np.ndarray],
242 |     inf: typing.Union[int, float],
243 |     sup: typing.Union[int, float],
244 |     step: int,
245 | ) -> list:
246 |     """
247 |     Generate intervals as hierarchies.
248 | 
249 |     Given a quasi-identifier of numeric type, creates a list containing an
250 |     interval-based generalization (hierarchy) of the values of the quasi-identifier.
251 |     The intervals will have the length entered in step.
252 | 
253 |     :param quasi_ident: values of the quasi-identifier on which the interval-based
254 |         generalization is to be obtained
255 |     :type quasi_ident: list or numpy array
256 | 
257 |     :param inf: lower value of the set of intervals
258 |     :type inf: int or float
259 | 
260 |     :param sup: bigger value of the set of intervals
261 |     :type sup: int or float
262 | 
263 |     :param step: spacing between values of the intervals
264 |     :type step: int
265 | 
266 |     :return: list with the intervals associated with the given values
267 |     :rtype: list
268 |     """
269 |     values = np.arange(inf, sup + 1, step)
270 |     interval = []
271 |     for num in quasi_ident:
272 |         lower = np.searchsorted(values, num)
273 |         if lower == 0:
274 |             lower = 1
275 |         interval.append(f"[{values[lower - 1]}, {values[lower]})")
276 | 
277 |     return interval
278 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/anjana.anonymity.rst:
--------------------------------------------------------------------------------
 1 | anjana.anonymity package
 2 | ========================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    anjana.anonymity.utils
11 | 
12 | Module contents
13 | ---------------
14 | 
15 | .. automodule:: anjana.anonymity
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 | 


--------------------------------------------------------------------------------
/docs/source/anjana.anonymity.utils.rst:
--------------------------------------------------------------------------------
 1 | anjana.anonymity.utils package
 2 | ==============================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | anjana.anonymity.utils.utils module
 8 | -----------------------------------
 9 | 
10 | .. automodule:: anjana.anonymity.utils.utils
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | Module contents
16 | ---------------
17 | 
18 | .. automodule:: anjana.anonymity.utils
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 


--------------------------------------------------------------------------------
/docs/source/anjana.rst:
--------------------------------------------------------------------------------
 1 | anjana package
 2 | ==============
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 4
 9 | 
10 |    anjana.anonymity
11 | 
12 | Module contents
13 | ---------------
14 | 
15 | .. automodule:: anjana
16 |    :members:
17 |    :undoc-members:
18 |    :show-inheritance:
19 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | # -- Path setup --------------------------------------------------------------
 6 | 
 7 | # If extensions (or modules to document with autodoc) are in another directory,
 8 | # add these directories to sys.path here. If the directory is relative to the
 9 | # documentation root, use os.path.abspath to make it absolute, like shown here.
10 | #
11 | import os
12 | import sys
13 | 
14 | sys.path.insert(0, os.path.abspath("./../../"))
15 | 
16 | # -- Project information -----------------------------------------------------
17 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
18 | 
19 | project = "ANJANA"
20 | copyright = "2024, Spanish National Research Council (CSIC)"
21 | author = "Judith Sáinz-Pardo Díaz (CSIC)"
22 | release = "1.1.0"
23 | 
24 | # -- General configuration ---------------------------------------------------
25 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
26 | 
27 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
28 | 
29 | templates_path = ["_templates"]
30 | 
31 | source_parsers = {  # NOTE(review): deprecated since Sphinx 1.8 -- confirm
32 |     ".md": "recommonmark.parser.CommonMarkParser",  # it still takes effect
33 | }
34 | # The suffix of source filenames.
35 | source_suffix = [".rst", ".md"]
36 | 
37 | 
38 | # -- Options for HTML output -------------------------------------------------
39 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
40 | 
41 | html_theme = "furo"
42 | 
43 | # Add any paths that contain custom static files (such as style sheets) here,
44 | # relative to this directory. They are copied after the builtin static files,
45 | # so a file named "default.css" will overwrite the builtin "default.css".
46 | html_static_path = ["_static"]
47 | 


--------------------------------------------------------------------------------
/docs/source/get_transformation.rst:
--------------------------------------------------------------------------------
 1 | Transformation applied
 2 | ######################
 3 | 
 4 |    In some cases, you may need to obtain the transformation that has been performed on the set of quasi-identifiers, in order to transfer statistics on the processing performed on the data. Usually, this transformation will be denoted by a list of the same length as the number of quasi-identifiers. When performing the anonymization process, the quasi-identifiers are entered in a certain order, which will be the same as the order in which they are represented in the list with the transformation.
 5 | 
 6 | .. note::
 7 | 
 8 |    An example would be the following. Suppose we have the quasi-identifiers from the adult dataset example: *age*, *education*, *marital-status*, *occupation*, *sex* and *native-country*. If we get the transformation [4, 2, 1, 2, 0, 0], this would mean the following:
 9 |    
10 |    - Hierarchy level 4 has been applied for *age*, with level 0 being the original value in the database.
11 |    - Hierarchy level 2 has been applied for *education*.
12 |    - Hierarchy level 1 has been applied for *marital-status*.
13 |    - Hierarchy level 2 has been applied for *occupation*.
14 |    - No hierarchy has been applied for *sex* and *native-country*.
15 |    
16 |    If a quasi-identifier has been used to anonymize the data, even if no hierarchy has been included for it, it will appear in the corresponding order in the list with the transformation (with the value 0, because no generalization level has been applied).
17 |    
18 |    
19 | To obtain this transformation, the ``get_transformation()`` function from the ``utils`` submodule can be used as follows (the data and hierarchies can be found in the `examples folder of the repository`_):
20 | 
21 | .. code-block:: python 
22 | 
23 |    import pandas as pd
24 |    from anjana.anonymity import utils
25 |    from anjana.anonymity import k_anonymity, l_diversity, t_closeness
26 | 
27 |    data = pd.read_csv("data/adult.csv")
28 |    data.columns = data.columns.str.strip()
29 |    cols = [
30 |        "workclass",
31 |        "education",
32 |        "marital-status",
33 |        "occupation",
34 |        "sex",
35 |        "native-country",
36 |    ]
37 |    for col in cols:
38 |        data[col] = data[col].str.strip()
39 | 
40 |    quasi_ident = [
41 |        "age",
42 |        "education",
43 |        "marital-status",
44 |        "occupation",
45 |        "sex",
46 |        "native-country",
47 |    ]
48 |    ident = ["race"]
49 |    sens_att = "salary-class"
50 |    k = 10
51 |    supp_level = 50
52 | 
53 |    hierarchies = {
54 |        "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
55 |        "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
56 |        "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
57 |        "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
58 |        "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
59 |        "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
60 |    }
61 | 	
62 |    # Anonymize the data using k-anonymity with k=10:
63 |    data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
64 |    
65 |    # Get the transformation applied:
66 |    transformation_anon = utils.get_transformation(data_anon, quasi_ident, hierarchies)
67 |    # The transformation obtained is: [1, 0, 0, 0, 0, 0], which means that
68 |    # the QI age has been generalized using the first hierarchy level.
69 |    # No hierarchy has been applied for the other five QIs.
70 |    
71 |    
72 | .. _examples folder of the repository: https://gitlab.ifca.es/privacy-security/siesta-anonymity/-/tree/main/examples
73 | 
74 | 


--------------------------------------------------------------------------------
/docs/source/getting_started.rst:
--------------------------------------------------------------------------------
  1 | Getting started
  2 | ###############
  3 | 
  4 | Example with the `adult dataset`_, anonymizing using three techniques: k-anonymity, :math:`\ell`-diversity and t-closeness (the data and hierarchies can be found in the `examples folder of the repository`_):
  5 | 
  6 | .. code-block:: python
  7 | 
  8 |    import pandas as pd
  9 |    import anjana
 10 |    from anjana.anonymity import k_anonymity, l_diversity, t_closeness
 11 | 
 12 |    # Read and process the data
 13 |    data = pd.read_csv("adult.csv") 
 14 |    data.columns = data.columns.str.strip()
 15 |    cols = [
 16 |        "workclass",
 17 |        "education",
 18 |        "marital-status",
 19 |        "occupation",
 20 |        "sex",
 21 |        "native-country",
 22 |    ]
 23 |    for col in cols:
 24 |       data[col] = data[col].str.strip()
 25 | 
 26 |    # Define the identifiers, quasi-identifiers and the sensitive attribute
 27 |    quasi_ident = [
 28 |        "age",
 29 |        "education",
 30 |        "marital-status",
 31 |        "occupation",
 32 |        "sex",
 33 |        "native-country",
 34 |    ]
 35 |    ident = ["race"]
 36 |    sens_att = "salary-class"
 37 | 
 38 |    # Select the desired level of k, l and t
 39 |    k = 10
 40 |    l_div = 2
 41 |    t = 0.5
 42 | 
 43 |    # Select the suppression limit allowed
 44 |    supp_level = 50
 45 | 
 46 |    # Import the hierarchies for each quasi-identifier. Define a dictionary containing them
 47 |    hierarchies = {
 48 |        "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
 49 |        "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
 50 |        "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
 51 |        "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
 52 |        "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
 53 |        "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
 54 |    }
 55 | 
 56 |    # Apply the three functions: k-anonymity, l-diversity and t-closeness
 57 |    data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
 58 |    data_anon = l_diversity(
 59 |        data_anon, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
 60 |    )
 61 |    data_anon = t_closeness(
 62 |        data_anon, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
 63 |    )
 64 |    
 65 |    
 66 | .. note::
 67 |    Applying the three techniques outlined above on the given dataset (with more than 30,000 rows), and with 6 quasi-identifiers, takes less than 4 seconds.
 68 |    
 69 |    
 70 | Define your own hierarchies
 71 | ***************************
 72 | 
 73 | All the anonymity functions available in ANJANA receive a dictionary with the hierarchies to be applied to the quasi-identifiers. In particular, the keys of this dictionary are the names of the columns that are quasi-identifiers to which a hierarchy is to be applied (if you do not want to generalize some QIs, and therefore no hierarchy is to be applied to them, just do not include them in this dictionary). The value for each key (QI) is itself a dictionary in which the key 0 holds the raw column (as it is in the original dataset), the key 1 holds the first level of transformation to be applied, in relation to the values of the original column, and so on, with as many keys as hierarchy levels have been established.
 74 | 
 75 | For a better understanding, let's look at the following example. Suppose that we have the following simulated dataset (extracted from the `hospital_extended.csv`_ dataset used for testing purposes) with *age*, *gender* and *city* as quasi-identifiers, *name* as identifier and *disease* as sensitive attribute. Regarding the QIs, we want to apply the following hierarchies: intervals of 5 years (first level) and of 10 years (second level) for the *age*, and suppression as first level for both *gender* and *city*.
 76 | 
 77 | +-----------+-----+--------+------------+-----------------+
 78 | | name      | age | gender | city       | disease         |
 79 | +===========+=====+========+============+=================+
 80 | | Ramsha    | 29  | Female | Tamil Nadu | Cancer          |
 81 | +-----------+-----+--------+------------+-----------------+
 82 | | Yadu      | 24  | Female | Kerala     | Viral infection |
 83 | +-----------+-----+--------+------------+-----------------+
 84 | | Salima    | 28  | Female | Tamil Nadu | TB              |
 85 | +-----------+-----+--------+------------+-----------------+
 86 | | Sunny     | 27  | Male   | Karnataka  | No illness      |
 87 | +-----------+-----+--------+------------+-----------------+
 88 | | Joan      | 24  | Female | Kerala     | Heart-related   |
 89 | +-----------+-----+--------+------------+-----------------+
 90 | | Bahuksana | 23  | Male   | Karnataka  | TB              |
 91 | +-----------+-----+--------+------------+-----------------+
 92 | | Rambha    | 19  | Male   | Kerala     | Cancer          |
 93 | +-----------+-----+--------+------------+-----------------+
 94 | | Kishor    | 29  | Male   | Karnataka  | Heart-related   |
 95 | +-----------+-----+--------+------------+-----------------+
 96 | | Johnson   | 17  | Male   | Kerala     | Heart-related   |
 97 | +-----------+-----+--------+------------+-----------------+
 98 | | John      | 19  | Male   | Kerala     | Viral infection |
 99 | +-----------+-----+--------+------------+-----------------+
100 | 
101 | Then, in order to create the hierarchies we can define the following dictionary:
102 | 
103 | .. code-block:: python
104 | 
105 |    age = data['age'].values
106 |    # Values: [29 24 28 27 24 23 19 29 17 19] (note that the following can be automatized)
107 |    age_5years = ['[25, 30)', '[20, 25)', '[25, 30)',
108 |                  '[25, 30)', '[20, 25)', '[20, 25)',
109 |                  '[15, 20)', '[25, 30)', '[15, 20)', '[15, 20)']
110 | 
111 |    age_10years = ['[20, 30)', '[20, 30)', '[20, 30)',
112 |                   '[20, 30)', '[20, 30)', '[20, 30)',
113 |                   '[10, 20)', '[20, 30)', '[10, 20)', '[10, 20)']
114 | 
115 |    hierarchies = {
116 |        "age": {0: age,
117 |                1: age_5years,
118 |                2: age_10years},
119 |        "gender": {
120 |            0: data["gender"].values,
121 |            1: np.array(["*"] * len(data["gender"].values)) # Suppression
122 |        },
123 |        "city": {0: data["city"].values,
124 |                 1: np.array(["*"] * len(data["city"].values))} # Suppression
125 |    }
126 | 
127 | In addition, we can also use the function ``generate_intervals()`` from ``utils`` for creating the interval-based hierarchy as follows:
128 | 
129 | .. code-block:: python
130 | 
131 |     import numpy as np
132 |     from anjana.anonymity import utils
133 | 
134 |     age = data['age'].values
135 | 
136 |     hierarchies = {
137 |         "age": {
138 |             0: data["age"].values,
139 |             1: utils.generate_intervals(data["age"].values, 0, 100, 5),
140 |             2: utils.generate_intervals(data["age"].values, 0, 100, 10),
141 |         },
142 |         "gender": {
143 |             0: data["gender"].values,
144 |             1: np.array(["*"] * len(data["gender"].values)) # Suppression
145 |         },
146 |         "city": {0: data["city"].values,
147 |                  1: np.array(["*"] * len(data["city"].values))} # Suppression
148 |     }
149 | 
150 | 
151 | .. _adult dataset: https://archive.ics.uci.edu/ml/datasets/adult
152 | .. _examples folder of the repository: https://github.com/IFCA-Advanced-Computing/anjana/tree/main/examples/hierarchies
153 | .. _hospital_extended.csv: https://github.com/IFCA-Advanced-Computing/anjana/blob/main/examples/data/hospital_extended.csv
154 | 
155 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | ANJANA
 2 | =============================================================================
 3 | 
 4 | |License| |codecov| |DOI| |Downloads| |Documentation Status|
 5 | |release-please| |Publish Package in PyPI| |CI/CD Pipeline| |Code Coverage|
 6 | 
 7 | |Python version| |PyPI|
 8 | 
 9 | ANJANA is a `Python`_ library which allows the application of different anonymity
10 | techniques based on a set of identifiers, quasi-identifiers (QI) and a sensitive 
11 | attribute. It's easy to use and fast. 
12 | The following anonymity techniques can be applied:
13 | 
14 | * k-anonymity.
15 | * (:math:`\alpha`,k)-anonymity.
16 | * :math:`\ell`-diversity.
17 | * Entropy :math:`\ell`-diversity.
18 | * Recursive (c, :math:`\ell`)-diversity.
19 | * t-closeness.
20 | * Basic :math:`\beta`-likeness.
21 | * Enhanced :math:`\beta`-likeness.
22 | * :math:`\delta`-disclosure privacy.
23 | 
24 | .. _Python: https://www.python.org
25 | 
26 | User documentation
27 | ******************
28 | 
29 | .. toctree::
30 |    :maxdepth: 4
31 | 
32 |    intro
33 |    getting_started
34 |    modules
35 |    get_transformation
36 |    multiple_sa
37 |    
38 | 
39 | License
40 | ***********************
41 | 
42 | ANJANA is licensed under Apache License Version 2.0 (http://www.apache.org/licenses/)
43 | 
44 | 
45 |   
46 | Indices and tables
47 | ==================
48 | 
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 | 
53 | .. |License| image:: https://img.shields.io/badge/License-Apache_2.0-green.svg
54 |    :target: https://github.com/IFCA-Advanced-Computing/anjana/blob/main/LICENSE
55 | .. |codecov| image:: https://codecov.io/gh/IFCA-Advanced-Computing/anjana/graph/badge.svg?token=AVI53GZ7YD
56 |    :target: https://codecov.io/gh/IFCA-Advanced-Computing/anjana
57 | .. |DOI| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.11184468.svg
58 |    :target: https://doi.org/10.5281/zenodo.11184468
59 | .. |PyPI| image:: https://img.shields.io/pypi/v/anjana
60 | .. |Downloads| image:: https://static.pepy.tech/badge/anjana
61 |    :target: https://pepy.tech/project/anjana
62 | .. |Documentation Status| image:: https://readthedocs.org/projects/anjana/badge/?version=latest
63 |    :target: https://anjana.readthedocs.io/en/latest/?badge=latest
64 | .. |release-please| image:: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/release-please.yml/badge.svg
65 |    :target: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/release-please.yml
66 | .. |Publish Package in PyPI| image:: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/pypi.yml/badge.svg
67 |    :target: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/pypi.yml
68 | .. |CI/CD Pipeline| image:: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/cicd.yml/badge.svg
69 |    :target: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/cicd.yml
70 | .. |Code Coverage| image:: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/.codecov.yml/badge.svg
71 |    :target: https://github.com/IFCA-Advanced-Computing/anjana/actions/workflows/.codecov.yml
72 | .. |Python version| image:: https://img.shields.io/badge/python-3.9|3.10|3.11|3.12-blue
73 | 


--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
 1 | First steps
 2 | ###########
 3 | 
 4 | Start protecting the privacy of your data using ANJANA!
 5 | 
 6 | Install
 7 | ***********************
 8 |     
 9 | First, we strongly recommend using a virtual environment. On Linux:
10 | 
11 | .. code-block:: console
12 | 
13 |    virtualenv .venv -p python3
14 |    source .venv/bin/activate
15 | 
16 | 
17 | Install anjana (Linux and Windows) using `pip`_:
18 | 
19 | .. code-block:: console
20 | 
21 |    pip install anjana
22 | 
23 | 
24 | Install the latest version of anjana directly from the repository (Linux and Windows) using git:
25 | 
26 | .. code-block:: console
27 | 
28 |    pip install git+https://github.com/IFCA-Advanced-Computing/anjana.git
29 | 
30 | 
31 | Usage example
32 | *************
33 | 
34 | Example with the `adult dataset`_, anonymizing using (:math:`\alpha`,k)-anonymity (the data and hierarchies can be found in the `examples folder of the repository`_):
35 | 
36 | .. code-block:: python
37 | 
38 |     import pandas as pd
39 |     from anjana.anonymity import alpha_k_anonymity
40 | 
41 |     data = pd.read_csv("adult.csv")  # 32561 rows
42 |     data.columns = data.columns.str.strip()
43 |     cols = [
44 |         "workclass",
45 |         "education",
46 |         "marital-status",
47 |         "occupation",
48 |         "sex",
49 |         "native-country",
50 |     ]
51 |     
52 |     for col in cols:
53 |         data[col] = data[col].str.strip()
54 |     
55 |     ident = ["race"]
56 |     sens_att = "salary-class"
57 |     k = 10
58 |     alpha = 0.8
59 |     supp_level = 100
60 | 
61 |     hierarchies = {
62 |         "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
63 |         "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
64 |         "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
65 |         "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
66 |         "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
67 |         "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
68 |     }
69 | 
70 | 
71 |     data_anon = alpha_k_anonymity(
72 |         data, ident, quasi_ident, sens_att, k, alpha, supp_level, hierarchies
73 |     )
74 | 
75 | 
76 | 
77 | .. _adult dataset: https://archive.ics.uci.edu/ml/datasets/adult
78 | .. _examples folder of the repository: https://github.com/IFCA-Advanced-Computing/anjana/tree/main/examples
79 | .. _pip: https://pypi.org/project/anjana/
80 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | anjana
2 | ======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    anjana
8 | 


--------------------------------------------------------------------------------
/docs/source/multiple_sa.rst:
--------------------------------------------------------------------------------
1 | Multiple sensitive attributes
2 | #############################
3 | 
4 | .. note::
5 |    Currently, ANJANA supports a single sensitive attribute (SA) in the anonymization process for each of the implemented techniques. If you have more than one SA, we recommend applying the desired technique with respect to the first SA and, once the anonymized dataset is obtained, anonymizing it again with respect to another of the sensitive attributes, and so on. In addition, if any of the sensitive attributes can act as a quasi-identifier for the rest of the sensitive attributes, it can be included as a QI when applicable.
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/examples/adult.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import k_anonymity
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)  # 32561 rows
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | k = 10
45 | supp_level = 50
46 | 
47 | hierarchies = {
48 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
49 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
50 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
51 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
52 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
53 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
54 | }
55 | 
56 | start = time.time()
57 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
58 | end = time.time()
59 | print(f"Elapsed time: {end-start}")
60 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
61 | 
62 | # Elapsed time: 0.9592475891113281
63 | # Value of k calculated: 10
64 | 
65 | data_anon.to_csv("adult_k10.csv")
66 | 
67 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
68 | print(
69 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
70 | )
71 | 
72 | # Number of records suppressed: 14234
73 | # Percentage of records suppressed: 43.71487362181751 %
74 | 


--------------------------------------------------------------------------------
/examples/adult_alpha_k_anonymity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import alpha_k_anonymity
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | alpha = 0.8
47 | supp_level = 100
48 | 
49 | hierarchies = {
50 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
51 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
52 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
53 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
54 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
55 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
56 | }
57 | 
58 | start = time.time()
59 | data_anon = alpha_k_anonymity(
60 |     data, ident, quasi_ident, sens_att, k, alpha, supp_level, hierarchies
61 | )
62 | end = time.time()
63 | print(f"Elapsed time: {end - start}")
64 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
65 | alpha_cal, _ = pycanon.anonymity.alpha_k_anonymity(data_anon, quasi_ident, [sens_att])
66 | print(f"Value of alpha calculated: {alpha_cal}")
67 | 
68 | # Elapsed time: 1.1014823913574219
69 | # Value of k calculated: 10
70 | # Value of alpha calculated: 0.8
71 | 
72 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
73 | print(
74 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
75 | )
76 | # Number of records suppressed: 14234
77 | # Percentage of records suppressed: 43.71487362181751 %
78 | 


--------------------------------------------------------------------------------
/examples/adult_basic_beta_likeness.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import basic_beta_likeness
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | beta = 0.5
47 | supp_level = 100
48 | 
49 | hierarchies = {
50 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
51 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
52 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
53 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
54 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
55 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
56 | }
57 | 
58 | start = time.time()
59 | data_anon = basic_beta_likeness(
60 |     data, ident, quasi_ident, sens_att, k, beta, supp_level, hierarchies
61 | )
62 | end = time.time()
63 | print(f"Elapsed time: {end - start}")
64 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
65 | print(
66 |     f"Value of beta (basic) calculated: "
67 |     f"{pycanon.anonymity.basic_beta_likeness(data_anon, quasi_ident, [sens_att])}"
68 | )
69 | 
70 | # Elapsed time: 1.1014823913574219
71 | # Value of k calculated: 2098
72 | # Value of beta (basic) calculated: 0.41781323480116844
73 | 
74 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
75 | print(
76 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
77 | )
78 | 
79 | # Number of records suppressed: 23686
80 | # Percentage of records suppressed: 72.74346610976322 %
81 | 


--------------------------------------------------------------------------------
/examples/adult_delta_disclosure.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import delta_disclosure
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | delta = 3
47 | supp_level = 50
48 | 
49 | all_cols = [
50 |     "age",
51 |     "education",
52 |     "marital-status",
53 |     "occupation",
54 |     "sex",
55 |     "native-country",
56 |     "race",
57 |     "salary-class",
58 | ]
59 | sample = data.sample(n=15)
60 | sample = sample.loc[:, all_cols]
61 | sample.to_csv("test.csv")
62 | 
63 | hierarchies = {
64 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
65 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
66 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
67 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
68 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
69 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
70 | }
71 | 
72 | start = time.time()
73 | data_anon = delta_disclosure(
74 |     data, ident, quasi_ident, sens_att, k, delta, supp_level, hierarchies
75 | )
76 | end = time.time()
77 | print(f"Elapsed time: {end - start}")
78 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
79 | print(
80 |     f"Value of delta calculated: "
81 |     f"{pycanon.anonymity.delta_disclosure(data_anon, quasi_ident, [sens_att])}"
82 | )
83 | 
84 | # Elapsed time: 4.623609304428101
85 | # Value of k calculated: 392
86 | # Value of delta calculated: 2.159243878369523
87 | 
88 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
89 | print(
90 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
91 | )
92 | 
93 | # Number of records suppressed: 14234
94 | # Percentage of records suppressed: 43.71487362181751 %
95 | 


--------------------------------------------------------------------------------
/examples/adult_enhanced_beta_likeness.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import enhanced_beta_likeness
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | beta = 0.5
47 | supp_level = 100
48 | 
49 | hierarchies = {
50 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
51 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
52 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
53 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
54 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
55 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
56 | }
57 | 
58 | start = time.time()
59 | data_anon = enhanced_beta_likeness(
60 |     data, ident, quasi_ident, sens_att, k, beta, supp_level, hierarchies
61 | )
62 | end = time.time()
63 | print(f"Elapsed time: {end - start}")
64 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
65 | print(
66 |     f"Value of beta (enhanced) calculated: "
67 |     f"{pycanon.anonymity.enhanced_beta_likeness(data_anon, quasi_ident, [sens_att])}"
68 | )
69 | 
70 | # Elapsed time: 2.7565865516662598
71 | # Value of k calculated: 2098
72 | # Value of beta (enhanced) calculated: 0.41781323480116844
73 | 
74 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
75 | print(
76 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
77 | )
78 | 
79 | # Number of records suppressed: 23686
80 | # Percentage of records suppressed: 72.74346610976322 %
81 | 


--------------------------------------------------------------------------------
/examples/adult_get_transformation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import utils
19 | from anjana.anonymity import k_anonymity, l_diversity, t_closeness
20 | import pycanon
21 | import time
22 | 
23 | data = pd.read_csv("data/adult.csv")  # 32561 rows
24 | data.columns = data.columns.str.strip()
25 | cols = [
26 |     "workclass",
27 |     "education",
28 |     "marital-status",
29 |     "occupation",
30 |     "sex",
31 |     "native-country",
32 | ]
33 | for col in cols:
34 |     data[col] = data[col].str.strip()
35 | print(data)  # 32561 rows
36 | quasi_ident = [
37 |     "age",
38 |     "education",
39 |     "marital-status",
40 |     "occupation",
41 |     "sex",
42 |     "native-country",
43 | ]
44 | ident = ["race"]
45 | sens_att = "salary-class"
46 | k = 10
47 | l_div = 2
48 | t = 0.5
49 | supp_level = 50
50 | 
51 | hierarchies = {
52 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
53 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
54 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
55 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
56 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
57 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
58 | }
59 | 
60 | start = time.time()
61 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
62 | data_anon = l_diversity(
63 |     data_anon, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
64 | )
65 | data_anon = t_closeness(
66 |     data_anon, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
67 | )
68 | end = time.time()
69 | print(f"Elapsed time: {end-start}")
70 | print(
71 |     f"Value of k calculated: "
72 |     f"\t{pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}"
73 | )
74 | print(
75 |     f"Value of l-diversity: "
76 |     f"\t{pycanon.anonymity.l_diversity(data_anon, quasi_ident, [sens_att])}"
77 | )
78 | print(
79 |     f"Value of t-closeness: "
80 |     f"\t{pycanon.anonymity.t_closeness(data_anon, quasi_ident, [sens_att])}"
81 | )
82 | 
83 | transformation_raw = utils.get_transformation(data, quasi_ident, hierarchies)
84 | print(transformation_raw)  # [0, 0, 0, 0, 0, 0]
85 | transformation_anon = utils.get_transformation(data_anon, quasi_ident, hierarchies)
86 | print(transformation_anon)  # [4, 2, 1, 2, 0, 0]
87 | 


--------------------------------------------------------------------------------
/examples/adult_k_l_t.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import k_anonymity, l_diversity, t_closeness
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)  # 32561 rows
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | l_div = 2
47 | t = 0.5
48 | supp_level = 50
49 | 
50 | hierarchies = {
51 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
52 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
53 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
54 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
55 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
56 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
57 | }
58 | 
59 | start = time.time()
60 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
61 | data_anon = l_diversity(
62 |     data_anon, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
63 | )
64 | data_anon = t_closeness(
65 |     data_anon, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
66 | )
67 | end = time.time()
68 | print(f"Elapsed time: {end-start}")
69 | print(
70 |     f"Value of k calculated: "
71 |     f"\t{pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}"
72 | )
73 | print(
74 |     f"Value of l-diversity: "
75 |     f"\t{pycanon.anonymity.l_diversity(data_anon, quasi_ident, [sens_att])}"
76 | )
77 | print(
78 |     f"Value of t-closeness: "
79 |     f"\t{pycanon.anonymity.t_closeness(data_anon, quasi_ident, [sens_att])}"
80 | )
81 | 
82 | # Elapsed time: 3.8451220989227295
83 | # Value of k calculated: 	72
84 | # Value of l-diversity: 	2
85 | # Value of t-closeness: 	0.4737011422127644
86 | 


--------------------------------------------------------------------------------
/examples/adult_ldiversity.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Spanish National Research Council (CSIC)
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6 | # not use this file except in compliance with the License. You may obtain
  7 | # a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 14 | # License for the specific language governing permissions and limitations
 15 | # under the License.
 16 | 
 17 | import pandas as pd
 18 | from anjana.anonymity import l_diversity, entropy_l_diversity, recursive_c_l_diversity
 19 | import pycanon
 20 | import time
 21 | 
 22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
 23 | data.columns = data.columns.str.strip()
 24 | cols = [
 25 |     "workclass",
 26 |     "education",
 27 |     "marital-status",
 28 |     "occupation",
 29 |     "sex",
 30 |     "native-country",
 31 | ]
 32 | for col in cols:
 33 |     data[col] = data[col].str.strip()
 34 | print(data)
 35 | quasi_ident = [
 36 |     "age",
 37 |     "education",
 38 |     "marital-status",
 39 |     "occupation",
 40 |     "sex",
 41 |     "native-country",
 42 | ]
 43 | ident = ["race"]
 44 | sens_att = "salary-class"
 45 | k = 10
 46 | l_div = 2
 47 | supp_level = 50
 48 | 
 49 | hierarchies = {
 50 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
 51 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
 52 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
 53 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
 54 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
 55 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
 56 | }
 57 | 
 58 | start = time.time()
 59 | data_anon = l_diversity(
 60 |     data, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
 61 | )
 62 | end = time.time()
 63 | print(f"Elapsed time: {end - start}")
 64 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
 65 | print(
 66 |     f"Value of l calculated: "
 67 |     f"{pycanon.anonymity.l_diversity(data_anon, quasi_ident, [sens_att])}"
 68 | )
 69 | 
 70 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
 71 | print(
 72 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
 73 | )
 74 | 
 75 | # Number of records suppressed: 14234
 76 | # Percentage of records suppressed: 43.71487362181751 %
 77 | 
 78 | # Elapsed time: 1.1014823913574219
 79 | # Value of k calculated: 72
 80 | # Value of l calculated: 2
 81 | 
 82 | start = time.time()
 83 | data_anon = entropy_l_diversity(
 84 |     data, ident, quasi_ident, sens_att, k, l_div, supp_level, hierarchies
 85 | )
 86 | end = time.time()
 87 | print(f"Elapsed time: {end - start}")
 88 | if len(data_anon) > 1:
 89 |     print(
 90 |         f"Value of k calculated: "
 91 |         f"{pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}"
 92 |     )
 93 |     print(
 94 |         f"Value of l calculated: "
 95 |         f"{pycanon.anonymity.entropy_l_diversity(data_anon, quasi_ident, [sens_att])}"
 96 |     )
 97 | 
 98 | # Entropy l-diversity cannot be achieved for l=2
 99 | # Elapsed time: 6.262372255325317
100 | # Value of k calculated: 18327
101 | # Value of l calculated: 1
102 | c = 2
103 | start = time.time()
104 | data_anon = recursive_c_l_diversity(
105 |     data, ident, quasi_ident, sens_att, k, c, l_div, supp_level, hierarchies
106 | )
107 | end = time.time()
108 | print(f"Elapsed time: {end - start}")
109 | if len(data_anon) > 1:
110 |     print(
111 |         f"Value of k calculated: "
112 |         f"{pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}"
113 |     )
114 |     c_cal, l_cal = pycanon.anonymity.recursive_c_l_diversity(
115 |         data_anon, quasi_ident, [sens_att]
116 |     )
117 |     print(f"Values of c and l calculated: " f"c={c_cal}, l={l_cal}")
118 | 
119 | # Recursive (c,l)-diversity cannot be achieved for l=2 and c=2
120 | # Elapsed time: 5.675975561141968
121 | # Value of k calculated: 18327
122 | # Values of c and l calculated: (1, 2)
123 | 


--------------------------------------------------------------------------------
/examples/adult_tcloseness.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import pandas as pd
18 | from anjana.anonymity import t_closeness
19 | import pycanon
20 | import time
21 | 
22 | data = pd.read_csv("data/adult.csv")  # 32561 rows
23 | data.columns = data.columns.str.strip()
24 | cols = [
25 |     "workclass",
26 |     "education",
27 |     "marital-status",
28 |     "occupation",
29 |     "sex",
30 |     "native-country",
31 | ]
32 | for col in cols:
33 |     data[col] = data[col].str.strip()
34 | print(data)
35 | quasi_ident = [
36 |     "age",
37 |     "education",
38 |     "marital-status",
39 |     "occupation",
40 |     "sex",
41 |     "native-country",
42 | ]
43 | ident = ["race"]
44 | sens_att = "salary-class"
45 | k = 10
46 | t = 0.5
47 | supp_level = 50
48 | 
49 | hierarchies = {
50 |     "age": dict(pd.read_csv("hierarchies/age.csv", header=None)),
51 |     "education": dict(pd.read_csv("hierarchies/education.csv", header=None)),
52 |     "marital-status": dict(pd.read_csv("hierarchies/marital.csv", header=None)),
53 |     "occupation": dict(pd.read_csv("hierarchies/occupation.csv", header=None)),
54 |     "sex": dict(pd.read_csv("hierarchies/sex.csv", header=None)),
55 |     "native-country": dict(pd.read_csv("hierarchies/country.csv", header=None)),
56 | }
57 | 
58 | start = time.time()
59 | data_anon = t_closeness(
60 |     data, ident, quasi_ident, sens_att, k, t, supp_level, hierarchies
61 | )
62 | end = time.time()
63 | print(f"Elapsed time: {end - start}")
64 | print(f"Value of k calculated: {pycanon.anonymity.k_anonymity(data_anon, quasi_ident)}")
65 | print(
66 |     f"Value of t calculated: "
67 |     f"{pycanon.anonymity.t_closeness(data_anon, quasi_ident, [sens_att])}"
68 | )
69 | 
70 | # Elapsed time: 3.8912816047668457
71 | # Value of k calculated: 72
72 | # Value of t calculated: 0.4737011422127644
73 | 
74 | print(f"Number of records suppressed: {len(data) - len(data_anon)}")
75 | print(
76 |     f"Percentage of records suppressed: {100 * (len(data) - len(data_anon)) / len(data)} %"
77 | )
78 | 
79 | # Number of records suppressed: 14234
80 | # Percentage of records suppressed: 43.71487362181751 %
81 | 


--------------------------------------------------------------------------------
/examples/data/hospital_extended.csv:
--------------------------------------------------------------------------------
 1 | name,age,gender,city,religion,disease
 2 | Ramsha,29,Female,Tamil Nadu,Hindu,Cancer
 3 | Gabu,24,Male,Tamil Nadu,Hindu,Cancer
 4 | Sabu,23,Male,Tamil Nadu,Hindu,Cancer
 5 | Jonas,22,Male,Tamil Nadu,Hindu,Cancer
 6 | Yadu,24,Female,Kerala,Hindu,Viral infection
 7 | Salima,28,Female,Tamil Nadu,Muslim,TB
 8 | Sunny,27,Male,Karnataka,Parsi,No illness
 9 | Joan,24,Female,Kerala,Christian,Heart-related
10 | Bahuksana,23,Male,Karnataka,Buddhist,TB
11 | Rambha,19,Male,Kerala,Hindu,Cancer
12 | Kishor,29,Male,Karnataka,Hindu,Heart-related
13 | Johnson,17,Male,Kerala,Christian,Heart-related
14 | John,19,Male,Kerala,Christian,Viral infection
15 | 
16 | 


--------------------------------------------------------------------------------
/examples/hierarchies/age.csv:
--------------------------------------------------------------------------------
 1 | 17,"[15, 20[","[10, 20[","[0, 20[","[0, 40[","[0, 80[",*
 2 | 18,"[15, 20[","[10, 20[","[0, 20[","[0, 40[","[0, 80[",*
 3 | 19,"[15, 20[","[10, 20[","[0, 20[","[0, 40[","[0, 80[",*
 4 | 20,"[20, 25[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
 5 | 21,"[20, 25[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
 6 | 22,"[20, 25[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
 7 | 23,"[20, 25[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
 8 | 24,"[20, 25[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
 9 | 25,"[25, 30[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
10 | 26,"[25, 30[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
11 | 27,"[25, 30[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
12 | 28,"[25, 30[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
13 | 29,"[25, 30[","[20, 30[","[20, 40[","[0, 40[","[0, 80[",*
14 | 30,"[30, 35[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
15 | 31,"[30, 35[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
16 | 32,"[30, 35[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
17 | 33,"[30, 35[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
18 | 34,"[30, 35[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
19 | 35,"[35, 40[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
20 | 36,"[35, 40[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
21 | 37,"[35, 40[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
22 | 38,"[35, 40[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
23 | 39,"[35, 40[","[30, 40[","[20, 40[","[0, 40[","[0, 80[",*
24 | 40,"[40, 45[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
25 | 41,"[40, 45[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
26 | 42,"[40, 45[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
27 | 43,"[40, 45[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
28 | 44,"[40, 45[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
29 | 45,"[45, 50[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
30 | 46,"[45, 50[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
31 | 47,"[45, 50[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
32 | 48,"[45, 50[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
33 | 49,"[45, 50[","[40, 50[","[40, 60[","[40, 80[","[0, 80[",*
34 | 50,"[50, 55[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
35 | 51,"[50, 55[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
36 | 52,"[50, 55[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
37 | 53,"[50, 55[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
38 | 54,"[50, 55[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
39 | 55,"[55, 60[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
40 | 56,"[55, 60[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
41 | 57,"[55, 60[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
42 | 58,"[55, 60[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
43 | 59,"[55, 60[","[50, 60[","[40, 60[","[40, 80[","[0, 80[",*
44 | 60,"[60, 65[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
45 | 61,"[60, 65[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
46 | 62,"[60, 65[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
47 | 63,"[60, 65[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
48 | 64,"[60, 65[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
49 | 65,"[65, 70[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
50 | 66,"[65, 70[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
51 | 67,"[65, 70[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
52 | 68,"[65, 70[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
53 | 69,"[65, 70[","[60, 70[","[60, 80[","[40, 80[","[0, 80[",*
54 | 70,"[70, 75[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
55 | 71,"[70, 75[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
56 | 72,"[70, 75[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
57 | 73,"[70, 75[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
58 | 74,"[70, 75[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
59 | 75,"[75, 80[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
60 | 76,"[75, 80[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
61 | 77,"[75, 80[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
62 | 78,"[75, 80[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
63 | 79,"[75, 80[","[70, 80[","[60, 80[","[40, 80[","[0, 80[",*
64 | 80,>=80,>=80,>=80,>=80,>=80,*
65 | 81,>=80,>=80,>=80,>=80,>=80,*
66 | 82,>=80,>=80,>=80,>=80,>=80,*
67 | 83,>=80,>=80,>=80,>=80,>=80,*
68 | 84,>=80,>=80,>=80,>=80,>=80,*
69 | 85,>=80,>=80,>=80,>=80,>=80,*
70 | 86,>=80,>=80,>=80,>=80,>=80,*
71 | 87,>=80,>=80,>=80,>=80,>=80,*
72 | 88,>=80,>=80,>=80,>=80,>=80,*
73 | 90,>=80,>=80,>=80,>=80,>=80,*
74 | 


--------------------------------------------------------------------------------
/examples/hierarchies/country.csv:
--------------------------------------------------------------------------------
 1 | United-States,North America,*
 2 | Cambodia,Asia,*
 3 | England,Europe,*
 4 | Puerto-Rico,North America,*
 5 | Canada,North America,*
 6 | Germany,Europe,*
 7 | Outlying-US(Guam-USVI-etc),North America,*
 8 | India,Asia,*
 9 | Japan,Asia,*
10 | Greece,Europe,*
11 | South,Africa,*
12 | China,Asia,*
13 | Cuba,North America,*
14 | Iran,Asia,*
15 | Honduras,North America,*
16 | Philippines,Asia,*
17 | Italy,Europe,*
18 | Poland,Europe,*
19 | Jamaica,North America,*
20 | Vietnam,Asia,*
21 | Mexico,North America,*
22 | Portugal,Europe,*
23 | Ireland,Europe,*
24 | France,Europe,*
25 | Dominican-Republic,North America,*
26 | Laos,Asia,*
27 | Ecuador,South America,*
28 | Taiwan,Asia,*
29 | Haiti,North America,*
30 | Columbia,South America,*
31 | Hungary,Europe,*
32 | Guatemala,North America,*
33 | Nicaragua,North America,*
34 | Scotland,Europe,*
35 | Thailand,Asia,*
36 | Yugoslavia,Europe,*
37 | El-Salvador,North America,*
38 | Trinadad&Tobago,South America,*
39 | Peru,South America,*
40 | Hong,Asia,*
41 | Holand-Netherlands,Europe,*
42 | ?,Unknown,*
43 | 


--------------------------------------------------------------------------------
/examples/hierarchies/education.csv:
--------------------------------------------------------------------------------
 1 | Bachelors,Undergraduate,Higher education,*
 2 | Some-college,Undergraduate,Higher education,*
 3 | 11th,High School,Secondary education,*
 4 | HS-grad,High School,Secondary education,*
 5 | Prof-school,Professional Education,Higher education,*
 6 | Assoc-acdm,Professional Education,Higher education,*
 7 | Assoc-voc,Professional Education,Higher education,*
 8 | 9th,High School,Secondary education,*
 9 | 7th-8th,High School,Secondary education,*
10 | 12th,High School,Secondary education,*
11 | Masters,Graduate,Higher education,*
12 | 1st-4th,Primary School,Primary education,*
13 | 10th,High School,Secondary education,*
14 | Doctorate,Graduate,Higher education,*
15 | 5th-6th,Primary School,Primary education,*
16 | Preschool,Primary School,Primary education,*
17 | 


--------------------------------------------------------------------------------
/examples/hierarchies/marital.csv:
--------------------------------------------------------------------------------
1 | Married-civ-spouse,Spouse present,*
2 | Divorced,Spouse not present,*
3 | Never-married,Spouse not present,*
4 | Separated,Spouse not present,*
5 | Widowed,Spouse not present,*
6 | Married-spouse-absent,Spouse not present,*
7 | Married-AF-spouse,Spouse present,*
8 | 


--------------------------------------------------------------------------------
/examples/hierarchies/occupation.csv:
--------------------------------------------------------------------------------
 1 | Tech-support,Technical,*
 2 | Craft-repair,Technical,*
 3 | Other-service,Other,*
 4 | Sales,Nontechnical,*
 5 | Exec-managerial,Nontechnical,*
 6 | Prof-specialty,Technical,*
 7 | Handlers-cleaners,Nontechnical,*
 8 | Machine-op-inspct,Technical,*
 9 | Adm-clerical,Other,*
10 | Farming-fishing,Other,*
11 | Transport-moving,Other,*
12 | Priv-house-serv,Other,*
13 | Protective-serv,Other,*
14 | Armed-Forces,Other,*
15 | ?,Other,*
16 | 


--------------------------------------------------------------------------------
/examples/hierarchies/race.csv:
--------------------------------------------------------------------------------
1 | White,*
2 | Asian-Pac-Islander,*
3 | Amer-Indian-Eskimo,*
4 | Other,*
5 | Black,*
6 | 


--------------------------------------------------------------------------------
/examples/hierarchies/salary.csv:
--------------------------------------------------------------------------------
1 | >50K,*
2 | <=50K,*
3 | 


--------------------------------------------------------------------------------
/examples/hierarchies/sex.csv:
--------------------------------------------------------------------------------
1 | Female,*
2 | Male,*
3 | 


--------------------------------------------------------------------------------
/examples/hierarchies/workclass.csv:
--------------------------------------------------------------------------------
 1 | Private,Non-Government,*
 2 | Self-emp-not-inc,Non-Government,*
 3 | Self-emp-inc,Non-Government,*
 4 | Federal-gov,Government,*
 5 | Local-gov,Government,*
 6 | State-gov,Government,*
 7 | Without-pay,Unemployed,*
 8 | Never-worked,Unemployed,*
 9 | ?,Unknown,*
10 | 


--------------------------------------------------------------------------------
/examples/hospital.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import numpy as np
18 | import pandas as pd
19 | from anjana.anonymity import k_anonymity, l_diversity, utils, basic_beta_likeness
20 | 
21 | data = pd.read_csv("data/hospital_extended.csv")
22 | 
23 | print(data)
24 | 
25 | ident = ["name"]
26 | quasi_ident = ["age", "gender", "city"]
27 | sens_attr = "disease"
28 | k = 2
29 | supp_level = 0
30 | ages = data["age"].values
31 | hierarchies = {
32 |     "age": {
33 |         0: data["age"].values,
34 |         1: utils.generate_intervals(data["age"].values, 0, 100, 5),
35 |         2: utils.generate_intervals(data["age"].values, 0, 100, 10),
36 |     },
37 |     "gender": {
38 |         0: data["gender"].values,
39 |         1: np.array(["*"] * len(data["gender"].values)),
40 |     },
41 |     "city": {0: data["city"].values, 1: np.array(["*"] * len(data["city"].values))},
42 | }
43 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
44 | print(data_anon)
45 | print(utils.get_transformation(data_anon, quasi_ident, hierarchies))
46 | 
47 | #    name       age  gender        city   religion          disease
48 | # 0     *  [20, 30[  Female  Tamil Nadu      Hindu           Cancer
49 | # 1     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
50 | # 2     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
51 | # 3     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
52 | # 4     *  [20, 30[  Female      Kerala      Hindu  Viral infection
53 | # 5     *  [20, 30[  Female  Tamil Nadu     Muslim               TB
54 | # 6     *  [20, 30[    Male   Karnataka      Parsi       No illness
55 | # 7     *  [20, 30[  Female      Kerala  Christian    Heart-related
56 | # 8     *  [20, 30[    Male   Karnataka   Buddhist               TB
57 | # 9     *  [10, 20[    Male      Kerala      Hindu           Cancer
58 | # 10    *  [20, 30[    Male   Karnataka      Hindu    Heart-related
59 | # 11    *  [10, 20[    Male      Kerala  Christian    Heart-related
60 | # 12    *  [10, 20[    Male      Kerala  Christian  Viral infection
61 | 
62 | l_div = 2
63 | data_anon = l_diversity(
64 |     data, ident, quasi_ident, sens_attr, k, l_div, supp_level, hierarchies
65 | )
66 | print(data_anon)
67 | print(utils.get_transformation(data_anon, quasi_ident, hierarchies))
68 | 
69 | # 0     *  [20, 30[  Female    *      Hindu           Cancer
70 | # 1     *  [20, 30[    Male    *      Hindu           Cancer
71 | # 2     *  [20, 30[    Male    *      Hindu           Cancer
72 | # 3     *  [20, 30[    Male    *      Hindu           Cancer
73 | # 4     *  [20, 30[  Female    *      Hindu  Viral infection
74 | # 5     *  [20, 30[  Female    *     Muslim               TB
75 | # 6     *  [20, 30[    Male    *      Parsi       No illness
76 | # 7     *  [20, 30[  Female    *  Christian    Heart-related
77 | # 8     *  [20, 30[    Male    *   Buddhist               TB
78 | # 9     *  [10, 20[    Male    *      Hindu           Cancer
79 | # 10    *  [20, 30[    Male    *      Hindu    Heart-related
80 | # 11    *  [10, 20[    Male    *  Christian    Heart-related
81 | # 12    *  [10, 20[    Male    *  Christian  Viral infection
82 | 


--------------------------------------------------------------------------------
/examples/hospital_get_transformation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 
17 | import numpy as np
18 | import pandas as pd
19 | from anjana.anonymity import k_anonymity, l_diversity
20 | from anjana.anonymity import utils
21 | 
22 | data = pd.read_csv("data/hospital_extended.csv")
23 | 
24 | ident = ["name"]
25 | quasi_ident = ["age", "gender", "city"]
26 | sens_attr = "disease"
27 | k = 2
28 | supp_level = 0
29 | hierarchies = {
30 |     "age": dict(pd.read_csv("../examples/hierarchies/age.csv", header=None)),
31 |     "gender": {
32 |         0: data["gender"].values,
33 |         1: np.array(["*"] * len(data["gender"].values)),
34 |     },
35 |     "city": {0: data["city"].values, 1: np.array(["*"] * len(data["city"].values))},
36 | }
37 | data_anon = k_anonymity(data, ident, quasi_ident, k, supp_level, hierarchies)
38 | print(data_anon)
39 | 
40 | #    name       age  gender        city   religion          disease
41 | # 0     *  [20, 30[  Female  Tamil Nadu      Hindu           Cancer
42 | # 1     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
43 | # 2     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
44 | # 3     *  [20, 30[    Male  Tamil Nadu      Hindu           Cancer
45 | # 4     *  [20, 30[  Female      Kerala      Hindu  Viral infection
46 | # 5     *  [20, 30[  Female  Tamil Nadu     Muslim               TB
47 | # 6     *  [20, 30[    Male   Karnataka      Parsi       No illness
48 | # 7     *  [20, 30[  Female      Kerala  Christian    Heart-related
49 | # 8     *  [20, 30[    Male   Karnataka   Buddhist               TB
50 | # 9     *  [10, 20[    Male      Kerala      Hindu           Cancer
51 | # 10    *  [20, 30[    Male   Karnataka      Hindu    Heart-related
52 | # 11    *  [10, 20[    Male      Kerala  Christian    Heart-related
53 | # 12    *  [10, 20[    Male      Kerala  Christian  Viral infection
54 | 
55 | l_div = 2
56 | data_anon = l_diversity(
57 |     data, ident, quasi_ident, sens_attr, k, l_div, supp_level, hierarchies
58 | )
59 | print(data_anon)
60 | # 0     *  [20, 30[  Female    *      Hindu           Cancer
61 | # 1     *  [20, 30[    Male    *      Hindu           Cancer
62 | # 2     *  [20, 30[    Male    *      Hindu           Cancer
63 | # 3     *  [20, 30[    Male    *      Hindu           Cancer
64 | # 4     *  [20, 30[  Female    *      Hindu  Viral infection
65 | # 5     *  [20, 30[  Female    *     Muslim               TB
66 | # 6     *  [20, 30[    Male    *      Parsi       No illness
67 | # 7     *  [20, 30[  Female    *  Christian    Heart-related
68 | # 8     *  [20, 30[    Male    *   Buddhist               TB
69 | # 9     *  [10, 20[    Male    *      Hindu           Cancer
70 | # 10    *  [20, 30[    Male    *      Hindu    Heart-related
71 | # 11    *  [10, 20[    Male    *  Christian    Heart-related
72 | # 12    *  [10, 20[    Male    *  Christian  Viral infection
73 | 
74 | transformation_raw = utils.get_transformation(data, quasi_ident, hierarchies)
75 | print(transformation_raw)  # [0, 0, 0]
76 | transformation_anon = utils.get_transformation(data_anon, quasi_ident, hierarchies)
77 | print(transformation_anon)  # [2, 0, 1]
78 | 
79 | # Testing the function apply_transformation
80 | data_transform1 = utils.apply_transformation(data, quasi_ident, hierarchies, [1, 1, 1])
81 | print(data_transform1)
82 | print(utils.get_transformation(data_transform1, quasi_ident, hierarchies))  # [1, 1, 1]
83 | 
84 | data_transform2 = utils.apply_transformation(data, quasi_ident, hierarchies, [5, 1, 1])
85 | print(data_transform2)
86 | print(utils.get_transformation(data_transform2, quasi_ident, hierarchies))  # [5, 1, 1]
87 | 
88 | data_transform3 = utils.apply_transformation(
89 |     data_anon, quasi_ident, hierarchies, [5, 1, 1]
90 | )
91 | print(data_transform3)
92 | print(utils.get_transformation(data_transform3, quasi_ident, hierarchies))  # [5, 1, 1]
93 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "anjana"
 3 | version = "1.1.0"
 4 | description = "ANJANA is an open source framework for applying different anonymity techniques."
 5 | authors = [
 6 |     "Judith Sáinz-Pardo Díaz <sainzpardo@ifca.unican.es>",
 7 |     "Álvaro López García <aloga@ifca.unican.es>"
 8 | ]
 9 | maintainers = ["Judith Sáinz-Pardo Díaz <sainzpardo@ifca.unican.es>"]
10 | license = "Apache-2.0"  # SPDX identifier, the notation Poetry recommends
11 | readme = "README.md"
12 | repository = "https://gitlab.ifca.es/privacy-security/anjana"
13 | keywords = ["anonymity", "privacy"]
14 | classifiers = [
15 |     "Development Status :: 5 - Production/Stable",
16 |     "Intended Audience :: Developers",
17 |     "Intended Audience :: Education",
18 |     "Intended Audience :: Science/Research",
19 |     "License :: OSI Approved :: Apache Software License",
20 |     "Programming Language :: Python :: 3 :: Only",
21 |     "Programming Language :: Python :: 3.9",
22 |     "Programming Language :: Python :: 3.10",
23 |     "Programming Language :: Python :: 3.11",
24 |     "Programming Language :: Python :: 3.12",
25 |     "Topic :: Scientific/Engineering",
26 |     "Topic :: Scientific/Engineering :: Mathematics",
27 |     "Topic :: Security"
28 | ]
29 | 
30 | 
31 | [tool.poetry.dependencies]
32 | python = "^3.9"
33 | numpy = "2.0.2"
34 | pandas = "2.2.2"
35 | pycanon = "1.0.3"
36 | typing_extensions = "4.12.2"
37 | beartype = "0.19.0"
38 | docutils = "0.21.2"
39 | 
40 | 
41 | [tool.poetry.group.dev.dependencies]
42 | tox = "4.16.0"
43 | 
44 | 
45 | [tool.poetry.group.test.dependencies]
46 | pytest = ">=7.1.2,<9.0.0"
47 | pytest-cov = ">=4,<7"
48 | 
49 | 
50 | [tool.poetry.group.test-flake8.dependencies]
51 | flake8 = ">=4,<8"
52 | flake8-bugbear = ">=22.3,<25.0"
53 | flake8-docstrings = "^1.6"
54 | flake8-typing-imports = "^1.12"
55 | flake8-colors = "^0.1"
56 | pep8-naming = ">=0.12,<0.16"
57 | pydocstyle = "^6.1"
58 | 
59 | 
60 | [tool.poetry.group.test-black.dependencies]
61 | black = ">=22.3,<26.0"
62 | 
63 | 
64 | [tool.poetry.group.test-bandit.dependencies]
65 | bandit = "1.8.3"
66 | 
67 | 
68 | [tool.poetry.group.test-pip-missing-reqs.dependencies]
69 | pip-check-reqs = "^2.5.3"
70 | 
71 | 
72 | [tool.poetry.group.test-mypy.dependencies]
73 | mypy = "1.15.0"
74 | 
75 | 
76 | [tool.poetry.group.test-pypi.dependencies]
77 | twine = ">=4.0.2,<7.0.0"
78 | 
79 | 
80 | [build-system]
81 | requires = ["poetry-core"]
82 | build-backend = "poetry.core.masonry.api"
83 | 


--------------------------------------------------------------------------------
/release-please-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "packages": {
 3 |         ".": {
 4 |         "changelog-path": "CHANGELOG.md",
 5 |         "release-type": "simple",
 6 |         "bump-minor-pre-major": false,
 7 |         "bump-patch-for-minor-pre-major": false,
 8 |         "draft": false,
 9 |         "prerelease": false
10 |         }
11 |     },
12 |     "$schema": "https://raw.githubusercontent.com/googleapis/release-please/main/schemas/config.json"
13 | }
14 | 


--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
 1 | # anjana
 2 | pandas==2.2.3
 3 | pycanon==1.0.1.post2
 4 | tabulate==0.9.0
 5 | typing_extensions==4.12.2
 6 | beartype==0.19.0
 7 | 
 8 | # docs
 9 | sphinx==7.4.7
10 | furo==2024.5.6
11 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Spanish National Research Council (CSIC)
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
 6 | # not use this file except in compliance with the License. You may obtain
 7 | # a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | # License for the specific language governing permissions and limitations
15 | # under the License.
16 | 


--------------------------------------------------------------------------------
/tests/test_anonymity.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | from anjana import anonymity
  3 | from anjana.anonymity import utils
  4 | import pycanon
  5 | from copy import copy
  6 | import numpy as np
  7 | 
  8 | 
  9 | class TestAdult:
 10 |     data = pd.read_csv("./examples/data/adult.csv")  # 32561 rows
 11 |     data.columns = data.columns.str.strip()
 12 |     cols = [
 13 |         "workclass",
 14 |         "education",
 15 |         "marital-status",
 16 |         "occupation",
 17 |         "sex",
 18 |         "native-country",
 19 |     ]
 20 |     for col in cols:
 21 |         data[col] = data[col].str.strip()
 22 | 
 23 |     quasi_ident = [
 24 |         "age",
 25 |         "education",
 26 |         "marital-status",
 27 |         "occupation",
 28 |         "sex",
 29 |         "native-country",
 30 |     ]
 31 |     ident = ["race"]
 32 |     sens_att = "salary-class"
 33 |     k = 10
 34 |     l_div = 2
 35 |     c_div = 2
 36 |     t = 0.5
 37 |     alpha = 0.8
 38 |     beta = 0.5
 39 |     delta = 0.4
 40 |     supp_level = 50
 41 | 
 42 |     hierarchies = {
 43 |         "age": dict(pd.read_csv("./examples/hierarchies/age.csv", header=None)),
 44 |         "education": dict(
 45 |             pd.read_csv("./examples/hierarchies/education.csv", header=None)
 46 |         ),
 47 |         "marital-status": dict(
 48 |             pd.read_csv("./examples/hierarchies/marital.csv", header=None)
 49 |         ),
 50 |         "occupation": dict(
 51 |             pd.read_csv("./examples/hierarchies/occupation.csv", header=None)
 52 |         ),
 53 |         "sex": dict(pd.read_csv("./examples/hierarchies/sex.csv", header=None)),
 54 |         "native-country": dict(
 55 |             pd.read_csv("./examples/hierarchies/country.csv", header=None)
 56 |         ),
 57 |     }
 58 | 
 59 |     def test_supp_ident(self):
 60 |         data_anon = anonymity.utils.suppress_identifiers(self.data, self.ident)
 61 |         data_anon_real = copy(self.data)
 62 |         data_anon_real["race"] = "*"
 63 |         assert data_anon_real.equals(data_anon)
 64 | 
 65 |     def test_k_anon(self):
 66 |         data_anon = anonymity.k_anonymity(
 67 |             self.data,
 68 |             self.ident,
 69 |             self.quasi_ident,
 70 |             self.k,
 71 |             self.supp_level,
 72 |             self.hierarchies,
 73 |         )
 74 |         assert self.k <= pycanon.anonymity.k_anonymity(data_anon, self.quasi_ident)
 75 | 
 76 |     def test_k_anon_100sup(self):
 77 |         data_anon = anonymity.k_anonymity(
 78 |             self.data,
 79 |             self.ident,
 80 |             self.quasi_ident,
 81 |             self.k,
 82 |             100,
 83 |             self.hierarchies,
 84 |         )
 85 |         assert self.k <= pycanon.anonymity.k_anonymity(data_anon, self.quasi_ident)
 86 | 
 87 |     def test_l_div(self):
 88 |         data_anon = anonymity.l_diversity(
 89 |             self.data,
 90 |             self.ident,
 91 |             self.quasi_ident,
 92 |             self.sens_att,
 93 |             self.k,
 94 |             self.l_div,
 95 |             self.supp_level,
 96 |             self.hierarchies,
 97 |         )
 98 |         assert self.l_div <= pycanon.anonymity.l_diversity(
 99 |             data_anon, self.quasi_ident, [self.sens_att]
100 |         )
101 | 
102 |     def test_t_closs(self):
103 |         data_anon = anonymity.t_closeness(
104 |             self.data,
105 |             self.ident,
106 |             self.quasi_ident,
107 |             self.sens_att,
108 |             self.k,
109 |             self.t,
110 |             self.supp_level,
111 |             self.hierarchies,
112 |         )
113 |         assert self.t >= pycanon.anonymity.t_closeness(
114 |             data_anon, self.quasi_ident, [self.sens_att]
115 |         )
116 | 
117 |     def test_alpha_k_anon(self):
118 |         data_anon = anonymity.alpha_k_anonymity(
119 |             self.data,
120 |             self.ident,
121 |             self.quasi_ident,
122 |             self.sens_att,
123 |             self.k,
124 |             self.alpha,
125 |             self.supp_level,
126 |             self.hierarchies,
127 |         )
128 |         alpha, k = pycanon.anonymity.alpha_k_anonymity(
129 |             data_anon, self.quasi_ident, [self.sens_att]
130 |         )
131 |         assert self.alpha >= alpha and self.k <= k
132 | 
133 |     def test_basic_beta(self):
134 |         data_anon = anonymity.basic_beta_likeness(
135 |             self.data,
136 |             self.ident,
137 |             self.quasi_ident,
138 |             self.sens_att,
139 |             self.k,
140 |             self.beta,
141 |             self.supp_level,
142 |             self.hierarchies,
143 |         )
144 |         assert self.beta >= pycanon.anonymity.basic_beta_likeness(
145 |             data_anon, self.quasi_ident, [self.sens_att]
146 |         )
147 | 
148 |     def test_enhanced_beta(self):
149 |         data_anon = anonymity.enhanced_beta_likeness(
150 |             self.data,
151 |             self.ident,
152 |             self.quasi_ident,
153 |             self.sens_att,
154 |             self.k,
155 |             self.beta,
156 |             self.supp_level,
157 |             self.hierarchies,
158 |         )
159 |         assert self.beta >= pycanon.anonymity.enhanced_beta_likeness(
160 |             data_anon, self.quasi_ident, [self.sens_att]
161 |         )
162 | 
163 |     def test_delta_disclosure(self):
164 |         data_anon = anonymity.delta_disclosure(
165 |             self.data,
166 |             self.ident,
167 |             self.quasi_ident,
168 |             self.sens_att,
169 |             self.k,
170 |             self.delta,
171 |             self.supp_level,
172 |             self.hierarchies,
173 |         )
174 |         assert self.delta >= pycanon.anonymity.delta_disclosure(
175 |             data_anon, self.quasi_ident, [self.sens_att]
176 |         )
177 | 
178 |     def test_entropy_l(self):
179 |         data_anon = anonymity.entropy_l_diversity(
180 |             self.data,
181 |             self.ident,
182 |             self.quasi_ident,
183 |             self.sens_att,
184 |             self.k,
185 |             self.l_div,
186 |             self.supp_level,
187 |             self.hierarchies,
188 |         )
189 |         assert len(data_anon) == 0
190 | 
191 |     def test_entropy_l1(self):
192 |         data_anon = anonymity.entropy_l_diversity(
193 |             self.data,
194 |             self.ident,
195 |             self.quasi_ident,
196 |             self.sens_att,
197 |             self.k,
198 |             1,
199 |             self.supp_level,
200 |             self.hierarchies,
201 |         )
202 |         assert 1 == pycanon.anonymity.entropy_l_diversity(
203 |             data_anon, self.quasi_ident, [self.sens_att]
204 |         )
205 | 
206 |     def test_rec_c_l(self):
207 |         data_anon = anonymity.recursive_c_l_diversity(
208 |             self.data,
209 |             self.ident,
210 |             self.quasi_ident,
211 |             self.sens_att,
212 |             self.k,
213 |             self.l_div,
214 |             self.c_div,
215 |             self.supp_level,
216 |             self.hierarchies,
217 |         )
218 |         assert len(data_anon) == 0
219 | 
220 |     def test_basic_beta0(self):
221 |         data_anon = anonymity.basic_beta_likeness(
222 |             self.data,
223 |             self.ident,
224 |             self.quasi_ident,
225 |             self.sens_att,
226 |             self.k,
227 |             0,
228 |             self.supp_level,
229 |             self.hierarchies,
230 |         )
231 |         assert 0 == pycanon.anonymity.basic_beta_likeness(
232 |             data_anon, self.quasi_ident, [self.sens_att]
233 |         )
234 | 
235 |     def test_enhanced_beta0(self):
236 |         data_anon = anonymity.enhanced_beta_likeness(
237 |             self.data,
238 |             self.ident,
239 |             self.quasi_ident,
240 |             self.sens_att,
241 |             self.k,
242 |             0,
243 |             self.supp_level,
244 |             self.hierarchies,
245 |         )
246 |         assert 0 == pycanon.anonymity.enhanced_beta_likeness(
247 |             data_anon, self.quasi_ident, [self.sens_att]
248 |         )
249 | 
250 |     def test_basic_beta10(self):
251 |         data_anon = anonymity.basic_beta_likeness(
252 |             self.data,
253 |             self.ident,
254 |             self.quasi_ident,
255 |             self.sens_att,
256 |             self.k,
257 |             10,
258 |             self.supp_level,
259 |             self.hierarchies,
260 |         )
261 |         assert 10 >= pycanon.anonymity.basic_beta_likeness(
262 |             data_anon, self.quasi_ident, [self.sens_att]
263 |         )
264 | 
265 |     def test_enhanced_beta10(self):
266 |         data_anon = anonymity.enhanced_beta_likeness(
267 |             self.data,
268 |             self.ident,
269 |             self.quasi_ident,
270 |             self.sens_att,
271 |             self.k,
272 |             10,
273 |             self.supp_level,
274 |             self.hierarchies,
275 |         )
276 |         assert 10 >= pycanon.anonymity.enhanced_beta_likeness(
277 |             data_anon, self.quasi_ident, [self.sens_att]
278 |         )
279 | 
280 |     def test_delta0(self):
281 |         data_anon = anonymity.delta_disclosure(
282 |             self.data,
283 |             self.ident,
284 |             self.quasi_ident,
285 |             self.sens_att,
286 |             self.k,
287 |             0,
288 |             self.supp_level,
289 |             self.hierarchies,
290 |         )
291 |         assert 0 == pycanon.anonymity.delta_disclosure(
292 |             data_anon, self.quasi_ident, [self.sens_att]
293 |         )
294 | 
295 |     def test_delta10(self):
296 |         data_anon = anonymity.delta_disclosure(
297 |             self.data,
298 |             self.ident,
299 |             self.quasi_ident,
300 |             self.sens_att,
301 |             self.k,
302 |             10,
303 |             self.supp_level,
304 |             self.hierarchies,
305 |         )
306 |         assert 10 >= pycanon.anonymity.delta_disclosure(
307 |             data_anon, self.quasi_ident, [self.sens_att]
308 |         )
309 | 
310 |     def test_l_div_k1(self):
311 |         data_anon = anonymity.l_diversity(
312 |             self.data,
313 |             self.ident,
314 |             self.quasi_ident,
315 |             self.sens_att,
316 |             1,
317 |             self.l_div,
318 |             self.supp_level,
319 |             self.hierarchies,
320 |         )
321 |         assert self.l_div <= pycanon.anonymity.l_diversity(
322 |             data_anon, self.quasi_ident, [self.sens_att]
323 |         )
324 | 
325 |     def test_l_div1(self):
326 |         data_anon = anonymity.l_diversity(
327 |             self.data,
328 |             self.ident,
329 |             self.quasi_ident,
330 |             self.sens_att,
331 |             self.k,
332 |             1,
333 |             self.supp_level,
334 |             self.hierarchies,
335 |         )
336 |         assert 1 <= pycanon.anonymity.l_diversity(
337 |             data_anon, self.quasi_ident, [self.sens_att]
338 |         )
339 | 
340 |     def test_entropy_l_div1(self):
341 |         data_anon = anonymity.entropy_l_diversity(
342 |             self.data,
343 |             self.ident,
344 |             self.quasi_ident,
345 |             self.sens_att,
346 |             self.k,
347 |             1,
348 |             self.supp_level,
349 |             self.hierarchies,
350 |         )
351 |         assert 1 <= pycanon.anonymity.entropy_l_diversity(
352 |             data_anon, self.quasi_ident, [self.sens_att]
353 |         )
354 | 
355 |     def test_rec_c1_l_div2(self):
356 |         data_anon = anonymity.recursive_c_l_diversity(
357 |             self.data,
358 |             self.ident,
359 |             self.quasi_ident,
360 |             self.sens_att,
361 |             self.k,
362 |             1,
363 |             2,
364 |             self.supp_level,
365 |             self.hierarchies,
366 |         )
367 |         c_cal, l_cal = pycanon.anonymity.recursive_c_l_diversity(
368 |             data_anon, self.quasi_ident, [self.sens_att]
369 |         )
370 |         assert 1 <= c_cal and 1 <= l_cal
371 | 
372 | 
373 | class TestHospital:  # anonymization and utils checks on the hospital example data
374 |     data = pd.read_csv("./examples/data/hospital_extended.csv")
375 | 
376 |     ident = ["name"]
377 |     quasi_ident = ["age", "gender", "city"]
378 |     sens_att = "disease"
379 |     k = 2
380 |     l_div = 2
381 |     supp_level = 0
382 |     hierarchies = {
383 |         "age": {  # level 0 is the raw column; 1 and 2 come from generate_intervals
384 |             0: data["age"].values,
385 |             1: utils.generate_intervals(data["age"].values, 0, 100, 5),
386 |             2: utils.generate_intervals(data["age"].values, 0, 100, 10),
387 |         },
388 |         "gender": {  # level 1 replaces every value with "*"
389 |             0: data["gender"].values,
390 |             1: np.array(["*"] * len(data["gender"].values)),
391 |         },
392 |         "city": {0: data["city"].values, 1: np.array(["*"] * len(data["city"].values))},
393 |     }
394 | 
395 |     def test_k_anon(self):  # expects identifiers masked and age at level 2
396 |         data_anon = anonymity.k_anonymity(
397 |             self.data,
398 |             self.ident,
399 |             self.quasi_ident,
400 |             self.k,
401 |             self.supp_level,
402 |             self.hierarchies,
403 |         )
404 | 
405 |         data_anon_real = copy(self.data)  # build the expected table by hand
406 |         data_anon_real[self.ident] = "*"
407 |         hierarchy_age = self.hierarchies["age"]
408 |         pos = []  # NOTE(review): .values below assumes Series-like levels - confirm
409 |         for elem in data_anon_real["age"].values:
410 |             pos.append(np.where(hierarchy_age[0].values == elem)[0][0])
411 |         data_anon_real["age"] = hierarchy_age[2].values[pos]
412 |         assert data_anon_real.equals(data_anon)
413 | 
414 |     def test_k_anon_big(self):  # k = 30 is expected to yield an empty DataFrame
415 |         data_anon = anonymity.k_anonymity(
416 |             self.data,
417 |             self.ident,
418 |             self.quasi_ident,
419 |             30,
420 |             self.supp_level,
421 |             self.hierarchies,
422 |         )
423 | 
424 |         assert data_anon.equals(pd.DataFrame())
425 | 
426 |     def test_l_div(self):  # expects age at level 2 and city fully masked
427 |         data_anon = anonymity.l_diversity(
428 |             self.data,
429 |             self.ident,
430 |             self.quasi_ident,
431 |             self.sens_att,
432 |             self.k,
433 |             self.l_div,
434 |             self.supp_level,
435 |             self.hierarchies,
436 |         )
437 | 
438 |         data_anon_real = copy(self.data)  # build the expected table by hand
439 |         data_anon_real[self.ident] = "*"
440 |         hierarchy_age = self.hierarchies["age"]
441 |         pos = []  # NOTE(review): .values below assumes Series-like levels - confirm
442 |         for elem in data_anon_real["age"].values:
443 |             pos.append(np.where(hierarchy_age[0].values == elem)[0][0])
444 |         data_anon_real["age"] = hierarchy_age[2].values[pos]
445 |         data_anon_real["city"] = "*"
446 |         assert data_anon_real.equals(data_anon)
447 | 
448 |     def test_basic_beta0_supp0(self):  # expects an empty DataFrame
449 |         data_anon = anonymity.basic_beta_likeness(
450 |             self.data,
451 |             self.ident,
452 |             self.quasi_ident,
453 |             self.sens_att,
454 |             self.k,
455 |             0,  # beta
456 |             0,  # supp_level
457 |             self.hierarchies,
458 |         )
459 |         assert data_anon.equals(pd.DataFrame())
460 | 
461 |     def test_enhanced_beta0_supp0(self):  # expects an empty DataFrame
462 |         data_anon = anonymity.enhanced_beta_likeness(
463 |             self.data,
464 |             self.ident,
465 |             self.quasi_ident,
466 |             self.sens_att,
467 |             self.k,
468 |             0,  # beta
469 |             0,  # supp_level
470 |             self.hierarchies,
471 |         )
472 |         assert data_anon.equals(pd.DataFrame())
473 | 
474 |     def test_get_transformation(self):  # k-anonymous output maps back to [2, 0, 0]
475 |         data_anon = anonymity.k_anonymity(
476 |             self.data,
477 |             self.ident,
478 |             self.quasi_ident,
479 |             self.k,
480 |             self.supp_level,
481 |             self.hierarchies,
482 |         )
483 | 
484 |         transformation = utils.get_transformation(
485 |             data_anon, self.quasi_ident, self.hierarchies
486 |         )
487 |         assert [2, 0, 0] == transformation
488 | 
489 |     def test_get_transformation_2qi(self):  # same check with a reduced hierarchy set
490 |         hierarchies = {  # local dict with no entry for "gender"
491 |             "age": {
492 |                 0: self.data["age"].values,
493 |                 1: utils.generate_intervals(self.data["age"].values, 0, 100, 5),
494 |                 2: utils.generate_intervals(self.data["age"].values, 0, 100, 10),
495 |             },
496 |             "city": {
497 |                 0: self.data["city"].values,
498 |                 1: np.array(["*"] * len(self.data["city"].values)),
499 |             },
500 |         }
501 |         data_anon = anonymity.k_anonymity(
502 |             self.data,
503 |             self.ident,
504 |             self.quasi_ident,
505 |             self.k,
506 |             self.supp_level,
507 |             hierarchies,
508 |         )
509 | 
510 |         transformation = utils.get_transformation(
511 |             data_anon, self.quasi_ident, hierarchies
512 |         )
513 |         assert [2, 0, 0] == transformation
514 | 
515 |     def test_apply_transformation_raw(self):  # applied levels must be recovered
516 |         data_transform = utils.apply_transformation(
517 |             self.data, self.quasi_ident, self.hierarchies, [2, 0, 1]
518 |         )
519 |         assert [2, 0, 1] == utils.get_transformation(
520 |             data_transform, self.quasi_ident, self.hierarchies
521 |         )
522 | 
523 |     def test_apply_transformation_anon(self):  # same check on k-anonymized input
524 |         hierarchies = {  # local dict: "age" gets two extra levels (3 and 4)
525 |             "age": {
526 |                 0: self.data["age"].values,
527 |                 1: utils.generate_intervals(self.data["age"].values, 0, 100, 5),
528 |                 2: utils.generate_intervals(self.data["age"].values, 0, 100, 10),
529 |                 3: utils.generate_intervals(self.data["age"].values, 0, 100, 20),
530 |                 4: utils.generate_intervals(self.data["age"].values, 0, 100, 50),
531 |             },
532 |             "gender": {
533 |                 0: self.data["gender"].values,
534 |                 1: np.array(["*"] * len(self.data["gender"].values)),
535 |             },
536 |             "city": {
537 |                 0: self.data["city"].values,
538 |                 1: np.array(["*"] * len(self.data["city"].values)),
539 |             },
540 |         }
541 | 
542 |         data_anon = anonymity.k_anonymity(
543 |             self.data,
544 |             self.ident,
545 |             self.quasi_ident,
546 |             self.k,
547 |             self.supp_level,
548 |             hierarchies,
549 |         )
550 | 
551 |         data_transform = utils.apply_transformation(
552 |             data_anon, self.quasi_ident, hierarchies, [4, 1, 1]
553 |         )
554 |         assert [4, 1, 1] == utils.get_transformation(
555 |             data_transform, self.quasi_ident, hierarchies
556 |         )
557 | 
558 |     def test_generate_intervals(self):  # step 5: one interval label per age
559 |         int5 = utils.generate_intervals(self.data["age"].values, 0, 100, 5)
560 |         # [29 24 23 22 24 28 27 24 23 19 29 17 19]
561 |         real_interval = [
562 |             "[25, 30)",
563 |             "[20, 25)",
564 |             "[20, 25)",
565 |             "[20, 25)",
566 |             "[20, 25)",
567 |             "[25, 30)",
568 |             "[25, 30)",
569 |             "[20, 25)",
570 |             "[20, 25)",
571 |             "[15, 20)",
572 |             "[25, 30)",
573 |             "[15, 20)",
574 |             "[15, 20)",
575 |         ]
576 |         assert real_interval == int5
577 | 


--------------------------------------------------------------------------------
/tests/test_unitary.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import anjana
  3 | from anjana import anonymity
  4 | from anjana.anonymity import utils
  5 | import pandas as pd
  6 | import beartype
  7 | 
  8 | 
  9 | class TestInvalidValues(unittest.TestCase):
 10 |     data = pd.read_csv("./examples/data/adult.csv")  # 32561 rows
 11 |     data.columns = data.columns.str.strip()
 12 |     cols = [
 13 |         "workclass",
 14 |         "education",
 15 |         "marital-status",
 16 |         "occupation",
 17 |         "sex",
 18 |         "native-country",
 19 |     ]
 20 |     for col in cols:
 21 |         data[col] = data[col].str.strip()
 22 | 
 23 |     quasi_ident = [
 24 |         "age",
 25 |         "education",
 26 |         "marital-status",
 27 |         "occupation",
 28 |         "sex",
 29 |         "native-country",
 30 |     ]
 31 |     ident = ["race"]
 32 |     sens_att = "salary-class"
 33 | 
 34 |     hierarchies = {
 35 |         "age": dict(pd.read_csv("./examples/hierarchies/age.csv", header=None)),
 36 |         "education": dict(
 37 |             pd.read_csv("./examples/hierarchies/education.csv", header=None)
 38 |         ),
 39 |         "marital-status": dict(
 40 |             pd.read_csv("./examples/hierarchies/marital.csv", header=None)
 41 |         ),
 42 |         "occupation": dict(
 43 |             pd.read_csv("./examples/hierarchies/occupation.csv", header=None)
 44 |         ),
 45 |         "sex": dict(pd.read_csv("./examples/hierarchies/sex.csv", header=None)),
 46 |         "native-country": dict(
 47 |             pd.read_csv("./examples/hierarchies/country.csv", header=None)
 48 |         ),
 49 |     }
 50 | 
 51 |     def test_supp_identifiers(self):
 52 |         ident = ["id", "name"]
 53 |         with self.assertRaises(ValueError):
 54 |             anjana.anonymity.utils.suppress_identifiers(self.data, ident)
 55 | 
 56 |     def test_k_neg(self):
 57 |         k = -1
 58 |         supp_level = 50
 59 |         with self.assertRaises(ValueError):
 60 |             anonymity.k_anonymity(
 61 |                 self.data,
 62 |                 self.ident,
 63 |                 self.quasi_ident,
 64 |                 k,
 65 |                 supp_level,
 66 |                 self.hierarchies,
 67 |             )
 68 | 
 69 |     def test_k_0(self):
 70 |         k = 0
 71 |         supp_level = 50
 72 |         with self.assertRaises(ValueError):
 73 |             anonymity.k_anonymity(
 74 |                 self.data,
 75 |                 self.ident,
 76 |                 self.quasi_ident,
 77 |                 k,
 78 |                 supp_level,
 79 |                 self.hierarchies,
 80 |             )
 81 | 
 82 |     def test_alpha_neg(self):
 83 |         k = 2
 84 |         alpha = -1
 85 |         supp_level = 50
 86 |         with self.assertRaises(ValueError):
 87 |             anonymity.alpha_k_anonymity(
 88 |                 self.data,
 89 |                 self.ident,
 90 |                 self.quasi_ident,
 91 |                 self.sens_att,
 92 |                 k,
 93 |                 alpha,
 94 |                 supp_level,
 95 |                 self.hierarchies,
 96 |             )
 97 | 
 98 |     def test_alpha_high(self):
 99 |         k = 2
100 |         alpha = 1.5
101 |         supp_level = 50
102 |         with self.assertRaises(ValueError):
103 |             anonymity.alpha_k_anonymity(
104 |                 self.data,
105 |                 self.ident,
106 |                 self.quasi_ident,
107 |                 self.sens_att,
108 |                 k,
109 |                 alpha,
110 |                 supp_level,
111 |                 self.hierarchies,
112 |             )
113 | 
114 |     def test_supp_level_neg(self):
115 |         k = 1
116 |         supp_level = -10
117 |         with self.assertRaises(ValueError):
118 |             anonymity.k_anonymity(
119 |                 self.data,
120 |                 self.ident,
121 |                 self.quasi_ident,
122 |                 k,
123 |                 supp_level,
124 |                 self.hierarchies,
125 |             )
126 | 
127 |     def test_supp_level_high(self):
128 |         k = 1
129 |         supp_level = 110
130 |         with self.assertRaises(ValueError):
131 |             anonymity.k_anonymity(
132 |                 self.data,
133 |                 self.ident,
134 |                 self.quasi_ident,
135 |                 k,
136 |                 supp_level,
137 |                 self.hierarchies,
138 |             )
139 | 
140 |     def test_l_neg(self):
141 |         k = 1
142 |         l_div = -1
143 |         supp_level = 50
144 |         with self.assertRaises(ValueError):
145 |             anonymity.l_diversity(
146 |                 self.data,
147 |                 self.ident,
148 |                 self.quasi_ident,
149 |                 self.sens_att,
150 |                 k,
151 |                 l_div,
152 |                 supp_level,
153 |                 self.hierarchies,
154 |             )
155 | 
156 |     def test_l_0(self):
157 |         k = 1
158 |         l_div = 0
159 |         supp_level = 50
160 |         with self.assertRaises(ValueError):
161 |             anonymity.l_diversity(
162 |                 self.data,
163 |                 self.ident,
164 |                 self.quasi_ident,
165 |                 self.sens_att,
166 |                 k,
167 |                 l_div,
168 |                 supp_level,
169 |                 self.hierarchies,
170 |             )
171 | 
172 |     def test_ent_l_neg(self):
173 |         k = 1
174 |         l_div = -1
175 |         supp_level = 50
176 |         with self.assertRaises(ValueError):
177 |             anonymity.entropy_l_diversity(
178 |                 self.data,
179 |                 self.ident,
180 |                 self.quasi_ident,
181 |                 self.sens_att,
182 |                 k,
183 |                 l_div,
184 |                 supp_level,
185 |                 self.hierarchies,
186 |             )
187 | 
188 |     def test_ent_l_0(self):
189 |         k = 1
190 |         l_div = 0
191 |         supp_level = 50
192 |         with self.assertRaises(ValueError):
193 |             anonymity.entropy_l_diversity(
194 |                 self.data,
195 |                 self.ident,
196 |                 self.quasi_ident,
197 |                 self.sens_att,
198 |                 k,
199 |                 l_div,
200 |                 supp_level,
201 |                 self.hierarchies,
202 |             )
203 | 
204 |     def test_rec_l_neg(self):
205 |         k = 1
206 |         l_div = -1
207 |         c = 1
208 |         supp_level = 50
209 |         with self.assertRaises(ValueError):
210 |             anonymity.recursive_c_l_diversity(
211 |                 self.data,
212 |                 self.ident,
213 |                 self.quasi_ident,
214 |                 self.sens_att,
215 |                 k,
216 |                 c,
217 |                 l_div,
218 |                 supp_level,
219 |                 self.hierarchies,
220 |             )
221 | 
222 |     def test_rec_l_0(self):
223 |         k = 1
224 |         l_div = 0
225 |         c = 1
226 |         supp_level = 50
227 |         with self.assertRaises(ValueError):
228 |             anonymity.recursive_c_l_diversity(
229 |                 self.data,
230 |                 self.ident,
231 |                 self.quasi_ident,
232 |                 self.sens_att,
233 |                 k,
234 |                 c,
235 |                 l_div,
236 |                 supp_level,
237 |                 self.hierarchies,
238 |             )
239 | 
240 |     def test_rec_c_neg(self):
241 |         k = 1
242 |         l_div = 1
243 |         c = -1
244 |         supp_level = 50
245 |         with self.assertRaises(ValueError):
246 |             anonymity.recursive_c_l_diversity(
247 |                 self.data,
248 |                 self.ident,
249 |                 self.quasi_ident,
250 |                 self.sens_att,
251 |                 k,
252 |                 c,
253 |                 l_div,
254 |                 supp_level,
255 |                 self.hierarchies,
256 |             )
257 | 
258 |     def test_rec_c_0(self):
259 |         k = 1
260 |         l_div = 1
261 |         c = 0
262 |         supp_level = 50
263 |         with self.assertRaises(ValueError):
264 |             anonymity.recursive_c_l_diversity(
265 |                 self.data,
266 |                 self.ident,
267 |                 self.quasi_ident,
268 |                 self.sens_att,
269 |                 k,
270 |                 c,
271 |                 l_div,
272 |                 supp_level,
273 |                 self.hierarchies,
274 |             )
275 | 
276 |     def test_t_neg(self):
277 |         k = 1
278 |         t = -1.5
279 |         supp_level = 50
280 |         with self.assertRaises(ValueError):
281 |             anonymity.t_closeness(
282 |                 self.data,
283 |                 self.ident,
284 |                 self.quasi_ident,
285 |                 self.sens_att,
286 |                 k,
287 |                 t,
288 |                 supp_level,
289 |                 self.hierarchies,
290 |             )
291 | 
292 |     def test_t_high(self):
293 |         k = 1
294 |         t = 1.5
295 |         supp_level = 50
296 |         with self.assertRaises(ValueError):
297 |             anonymity.t_closeness(
298 |                 self.data,
299 |                 self.ident,
300 |                 self.quasi_ident,
301 |                 self.sens_att,
302 |                 k,
303 |                 t,
304 |                 supp_level,
305 |                 self.hierarchies,
306 |             )
307 | 
308 |     def test_basic_beta_neg(self):
309 |         k = 1
310 |         beta = -1
311 |         supp_level = 50
312 |         with self.assertRaises(ValueError):
313 |             anonymity.basic_beta_likeness(
314 |                 self.data,
315 |                 self.ident,
316 |                 self.quasi_ident,
317 |                 self.sens_att,
318 |                 k,
319 |                 beta,
320 |                 supp_level,
321 |                 self.hierarchies,
322 |             )
323 | 
324 |     def test_enhanced_beta_neg(self):
325 |         k = 1
326 |         beta = -1
327 |         supp_level = 50
328 |         with self.assertRaises(ValueError):
329 |             anonymity.enhanced_beta_likeness(
330 |                 self.data,
331 |                 self.ident,
332 |                 self.quasi_ident,
333 |                 self.sens_att,
334 |                 k,
335 |                 beta,
336 |                 supp_level,
337 |                 self.hierarchies,
338 |             )
339 | 
340 |     def test_delta_neg(self):
341 |         k = 1
342 |         delta = -1
343 |         supp_level = 50
344 |         with self.assertRaises(ValueError):
345 |             anonymity.delta_disclosure(
346 |                 self.data,
347 |                 self.ident,
348 |                 self.quasi_ident,
349 |                 self.sens_att,
350 |                 k,
351 |                 delta,
352 |                 supp_level,
353 |                 self.hierarchies,
354 |             )
355 | 
356 |     def test_kanon_data(self):
357 |         k = 1
358 |         supp_level = 50
359 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
360 |             anonymity.k_anonymity(
361 |                 "data.csv",
362 |                 self.ident,
363 |                 self.quasi_ident,
364 |                 self.sens_att,
365 |                 k,
366 |                 supp_level,
367 |                 self.hierarchies,
368 |             )
369 | 
370 |     def test_kanon_float(self):
371 |         k = 1.5
372 |         supp_level = 50
373 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
374 |             anonymity.k_anonymity(
375 |                 self.data,
376 |                 self.ident,
377 |                 self.quasi_ident,
378 |                 self.sens_att,
379 |                 k,
380 |                 supp_level,
381 |                 self.hierarchies,
382 |             )
383 | 
384 |     def test_alpha_kanon_float(self):
385 |         k = 1.5
386 |         alpha = 0.5
387 |         supp_level = 50
388 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
389 |             anonymity.k_anonymity(
390 |                 self.data,
391 |                 self.ident,
392 |                 self.quasi_ident,
393 |                 self.sens_att,
394 |                 k,
395 |                 alpha,
396 |                 supp_level,
397 |                 self.hierarchies,
398 |             )
399 | 
400 |     def test_ldiv_float(self):
401 |         k = 2
402 |         l_div = 1.5
403 |         supp_level = 50
404 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
405 |             anonymity.l_diversity(
406 |                 self.data,
407 |                 self.ident,
408 |                 self.quasi_ident,
409 |                 self.sens_att,
410 |                 k,
411 |                 l_div,
412 |                 supp_level,
413 |                 self.hierarchies,
414 |             )
415 | 
416 |     def test_entropy_ldiv_float(self):
417 |         k = 2
418 |         l_div = 1.5
419 |         supp_level = 50
420 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
421 |             anonymity.entropy_l_diversity(
422 |                 self.data,
423 |                 self.ident,
424 |                 self.quasi_ident,
425 |                 self.sens_att,
426 |                 k,
427 |                 l_div,
428 |                 supp_level,
429 |                 self.hierarchies,
430 |             )
431 | 
432 |     def test_rec_ldiv_float(self):
433 |         k = 2
434 |         c = 1
435 |         l_div = 1.5
436 |         supp_level = 50
437 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
438 |             anonymity.recursive_c_l_diversity(
439 |                 self.data,
440 |                 self.ident,
441 |                 self.quasi_ident,
442 |                 self.sens_att,
443 |                 k,
444 |                 c,
445 |                 l_div,
446 |                 supp_level,
447 |                 self.hierarchies,
448 |             )
449 | 
450 |     def test_rec_c_ldiv_float(self):
451 |         k = 2
452 |         c = 1.5
453 |         l_div = 1
454 |         supp_level = 50
455 |         with self.assertRaises(beartype.roar.BeartypeCallHintParamViolation):
456 |             anonymity.recursive_c_l_diversity(
457 |                 self.data,
458 |                 self.ident,
459 |                 self.quasi_ident,
460 |                 self.sens_att,
461 |                 k,
462 |                 c,
463 |                 l_div,
464 |                 supp_level,
465 |                 self.hierarchies,
466 |             )
467 | 
468 |     def test_return_k_anonymity(self):
469 |         k = 2
470 |         supp_level = 50
471 |         data_anon = anonymity.k_anonymity(
472 |             self.data,
473 |             self.ident,
474 |             self.quasi_ident,
475 |             k,
476 |             supp_level,
477 |             self.hierarchies,
478 |         )
479 |         assert isinstance(data_anon, pd.DataFrame)
480 | 
481 |     def test_return_l_div(self):
482 |         k = 2
483 |         l_div = 2
484 |         supp_level = 50
485 |         data_anon = anonymity.l_diversity(
486 |             self.data,
487 |             self.ident,
488 |             self.quasi_ident,
489 |             self.sens_att,
490 |             k,
491 |             l_div,
492 |             supp_level,
493 |             self.hierarchies,
494 |         )
495 |         assert isinstance(data_anon, pd.DataFrame)
496 | 
497 |     def test_return_t_closs(self):
498 |         k = 2
499 |         t = 0.8
500 |         supp_level = 50
501 |         data_anon = anonymity.t_closeness(
502 |             self.data,
503 |             self.ident,
504 |             self.quasi_ident,
505 |             self.sens_att,
506 |             k,
507 |             t,
508 |             supp_level,
509 |             self.hierarchies,
510 |         )
511 |         assert isinstance(data_anon, pd.DataFrame)
512 | 
513 |     def test_return_basic_beta(self):
514 |         k = 2
515 |         beta = 0.8
516 |         supp_level = 50
517 |         data_anon = anonymity.basic_beta_likeness(
518 |             self.data,
519 |             self.ident,
520 |             self.quasi_ident,
521 |             self.sens_att,
522 |             k,
523 |             beta,
524 |             supp_level,
525 |             self.hierarchies,
526 |         )
527 |         assert isinstance(data_anon, pd.DataFrame)
528 | 
529 |     def test_return_enhanced_beta(self):
530 |         k = 2
531 |         beta = 0.8
532 |         supp_level = 50
533 |         data_anon = anonymity.enhanced_beta_likeness(
534 |             self.data,
535 |             self.ident,
536 |             self.quasi_ident,
537 |             self.sens_att,
538 |             k,
539 |             beta,
540 |             supp_level,
541 |             self.hierarchies,
542 |         )
543 |         assert isinstance(data_anon, pd.DataFrame)
544 | 
545 |     def test_return_delta_dic(self):
546 |         k = 2
547 |         delta = 0.8
548 |         supp_level = 50
549 |         data_anon = anonymity.delta_disclosure(
550 |             self.data,
551 |             self.ident,
552 |             self.quasi_ident,
553 |             self.sens_att,
554 |             k,
555 |             delta,
556 |             supp_level,
557 |             self.hierarchies,
558 |         )
559 |         assert isinstance(data_anon, pd.DataFrame)
560 | 
561 |     def test_apply_transformation_neg(self):
562 |         with self.assertRaises(ValueError):
563 |             utils.apply_transformation(
564 |                 self.data, self.quasi_ident, self.hierarchies, [-1, 1, 1]
565 |             )
566 | 
567 |     def test_apply_transformation_out(self):
568 |         with self.assertRaises(ValueError):
569 |             utils.apply_transformation(
570 |                 self.data, self.quasi_ident, self.hierarchies, [100, 1, 1]
571 |             )
572 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
  1 | [tox]
  2 | min_version = 4.3.3
  3 | isolated_build = true
  4 | envlist = 
  5 |     py3{9, 10, 11, 12}
  6 |     flake8
  7 |     black
  8 |     bandit
  9 |     mypy
 10 |     pip-missing-reqs
 11 |     pypi
 12 | 
 13 | [gh-actions]
 14 | python =
 15 |     3.9: py39
 16 |     3.10: py310, flake8, black, bandit, pip-missing-reqs, pypi
 17 |     3.11: py311
 18 |     3.12: py312
 19 | 
 20 | [base]
 21 | python = python3.10
 22 | package = anjana
 23 | 
 24 | [pytest]
 25 | addopts = -p no:warnings
 26 | 
 27 | [testenv]
 28 | usedevelop = True
 29 | basepython = python3.10
 30 | deps =
 31 |     pytest>=7.1.2
 32 |     pytest-cov>=4.0.0
 33 | allowlist_externals =
 34 |     poetry
 35 |     find
 36 |     rm
 37 |     mkdir
 38 |     twine
 39 | setenv =
 40 |    VIRTUAL_ENV={envdir}
 41 |    LC_ALL=en_US.utf-8
 42 | commands_pre =
 43 |     poetry install --no-root --sync --with test,test-{envname}
 44 | commands =
 45 |     find . -type f -name "*.pyc" -delete
 46 |     poetry run pytest {posargs}
 47 | 
 48 | [testenv:py39]
 49 | basepython = python3.9
 50 | commands_pre =
 51 |     poetry install --no-root --sync --with test
 52 | 
 53 | [testenv:py310]
 54 | basepython = python3.10
 55 | commands_pre =
 56 |     poetry install --no-root --sync --with test
 57 | 
 58 | [testenv:py311]
 59 | basepython = python3.11
 60 | commands_pre =
 61 |     poetry install --no-root --sync --with test
 62 | 
 63 | [testenv:py312]
 64 | basepython = python3.12
 65 | commands_pre =
 66 |     poetry install --no-root --sync --with test
 67 | 
 68 | [flake8]
 69 | # Black default line length is 88
 70 | max-line-length = 88
 71 | show-source = True
 72 | builtins = _
 73 | ignore = B008, E203
 74 | exclude = 
 75 |     .venv
 76 |     .git
 77 |     .tox
 78 |     dist
 79 |     docs
 80 |     *lib/python*
 81 |     *egg
 82 |     build
 83 | 
 84 | [testenv:flake8]
 85 | basepython = {[base]python}
 86 | deps =
 87 |     flake8>=4.0,<4.1
 88 |     flake8-bugbear>=22.3,<22.4
 89 |     flake8-docstrings>=1.6,<1.7
 90 |     flake8-typing-imports>=1.12,<1.13
 91 |     flake8-colors>=0.1,<0.2
 92 |     pep8-naming>=0.12,<0.13
 93 |     pydocstyle>=6.1,<6.2
 94 | commands =
 95 |     poetry run flake8 {[base]package}
 96 | 
 97 | [testenv:black]
 98 | basepython = {[base]python}
 99 | deps =
100 |     black>=22.3,<22.4
101 | commands = poetry run black --check --diff {[base]package}
102 | 
103 | [testenv:bandit]
104 | basepython = {[base]python}
105 | deps = 
106 |     bandit>=1.7.5
107 | commands = poetry run --verbose bandit -r {[base]package} -x tests -s B110,B410
108 | 
109 | [testenv:bandit-report]
110 | basepython = {[base]python}
111 | deps = 
112 |     {[testenv:bandit]deps}
113 | commands = 
114 |     - mkdir /tmp/bandit
115 |     - poetry run bandit -r {[base]package} -x tests -s B110,B410 -f html -o /tmp/bandit/index.html
116 | 
117 | [testenv:docs]
118 | basepython = {[base]python}
119 | deps = 
120 |     -r {toxinidir}/docs/requirements.txt
121 | commands =
122 |     rm -rf docs/build
123 |     build_sphinx
124 | 
125 | [testenv:mypy]
126 | description = Static type checks
127 | basepython = {[base]python}
128 | deps = 
129 |     mypy==1.3.0
130 |     types-tabulate==0.9.0.2
131 |     pandas-stubs==2.0.1.230501
132 | commands =
133 |     poetry run mypy --config-file mypy.ini -p {[base]package}
134 | 
135 | [testenv:pip-missing-reqs]
136 | basepython = {[base]python}
137 | deps = pip_check_reqs
138 | commands =
139 |     mkdir -p tmp
140 |     poetry export -o tmp/requirements.txt
141 |     poetry run pip-missing-reqs --requirements-file tmp/requirements.txt --ignore-file={[base]package}/tests/* {[base]package}
142 |     rm tmp/requirements.txt
143 | 


--------------------------------------------------------------------------------