├── __init__.py ├── tests ├── __init__.py ├── input_dict │ ├── test_empty_dictionary.py │ ├── test_dbg_stages.py │ ├── meta_words_and_phrases_constants.py │ ├── meta_partial_constants.py │ ├── meta_include_rules.py │ ├── test_empty_meta_dict.py │ ├── meta_data_sql_condition.py │ ├── meta_data_func.py │ ├── test_sync_struct.py │ ├── test_exclude.py │ ├── test_sync_data_2.py │ ├── test_partial_exclude_tables_dict.py │ ├── mask_test.py │ ├── test_sync_data.py │ ├── test_partial_tables_dict.py │ ├── test_meta_dict_type_aliases_complex.py │ ├── test_sens_with_sql_conditions.py │ ├── meta_include_and_skip_rules.py │ ├── test_meta_dict_default_func.py │ ├── test_meta_dict.py │ ├── test.py │ └── test_meta_dict_type_aliases.py ├── expected_results │ ├── test_prepared_sens_dict_result_with_no_existing_schema.py │ ├── test_prepared_sens_dict_result_by_include_rule_expected.py │ ├── test_prepared_sens_dict_result_by_include_and_skip_rules_expected.py │ ├── test_prepared_sens_dict_result_type_aliases_expected.py │ ├── test_prepared_sens_dict_result_type_aliases_complex_expected.py │ ├── PGAnonMaskUnitTest_target_tables.result │ ├── PGAnonMaskUnitTest_source_tables.result │ ├── test_prepared_sens_dict_result_by_data_sql_condition_expected.py │ ├── test_prepared_sens_dict_result_by_words_and_phrases_constants_expected.py │ ├── test_prepared_sens_dict_result_default_func_expected.py │ ├── test_prepared_sens_dict_result_by_partial_constants_expected.py │ ├── test_prepared_sens_dict_result_expected.py │ ├── test_prepared_sens_dict_result_by_data_func_expected.py │ └── test_prepared_no_sens_dict_result_expected.py ├── config.yml └── sql │ ├── init_additional_simple_env.sql │ ├── init_simple_env.sql │ └── init_stress_env.sql ├── MANIFEST.in ├── pg_anon ├── common │ ├── __init__.py │ ├── multiprocessing_utils.py │ ├── enums.py │ └── constants.py ├── modes │ ├── __init__.py │ ├── initialization.py │ ├── view_data.py │ └── view_fields.py ├── __init__.py ├── __main__.py ├── version.py ├── logger.py └── app.py ├── rest_api ├── runners │ ├── __init__.py │ ├── direct │ │ ├── __init__.py │ │ ├── view_data.py │ │ └── view_fields.py │ └── background │ │ ├── __init__.py │ │ ├── init.py │ │ ├── base.py │ │ ├── dump.py │ │ ├── restore.py │ │ └── scan.py ├── requirements.txt ├── constants.py ├── enums.py ├── dependencies.py └── utils.py ├── setup.py ├── images ├── dbg-stage-1.png ├── dbg-stage-2.png ├── dbg-stage-3.png ├── scan_workflow.png ├── Create-dict-Terms.drawio.png └── Dump-Resore-Terms.drawio.png ├── pg_anon.py ├── requirements.txt ├── docker ├── entrypoint_dbg.sh ├── motd ├── Makefile ├── entrypoint.sh ├── README.md └── Dockerfile ├── pyproject.toml ├── docs ├── operations │ ├── init.md │ ├── view-data.md │ ├── view-fields.md │ └── scan.md ├── dicts │ ├── non-sens-dict-schema.md │ ├── tables-dictionary.md │ └── sens-dict-schema.md ├── sql-functions-library.md ├── installation-and-configuring.md ├── how-it-works.md ├── debugging.md └── faq.md └── .gitignore /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune tests/ 2 | -------------------------------------------------------------------------------- 
/pg_anon/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pg_anon/modes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rest_api/runners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/input_dict/test_empty_dictionary.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [] 3 | } 4 | -------------------------------------------------------------------------------- /pg_anon/__init__.py: -------------------------------------------------------------------------------- 1 | from .app import PgAnonApp 2 | 3 | __all__ = ["PgAnonApp"] 4 | -------------------------------------------------------------------------------- /rest_api/runners/direct/__init__.py: -------------------------------------------------------------------------------- 1 | from .view_fields import ViewFieldsRunner 2 | -------------------------------------------------------------------------------- /images/dbg-stage-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-1.png -------------------------------------------------------------------------------- /images/dbg-stage-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-2.png -------------------------------------------------------------------------------- /images/dbg-stage-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-3.png -------------------------------------------------------------------------------- /pg_anon.py: -------------------------------------------------------------------------------- 1 | from pg_anon.cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /rest_api/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.113,<1.0 2 | uvicorn[standard]>=0.38 3 | aiohttp>=3.13.2 4 | -------------------------------------------------------------------------------- /images/scan_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/scan_workflow.png -------------------------------------------------------------------------------- /pg_anon/__main__.py: -------------------------------------------------------------------------------- 1 | from pg_anon.cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /images/Create-dict-Terms.drawio.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/Create-dict-Terms.drawio.png -------------------------------------------------------------------------------- /images/Dump-Resore-Terms.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/Dump-Resore-Terms.drawio.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aioprocessing==2.0.1 2 | async-timeout==4.0.3 3 | asyncpg==0.29.0 4 | prettytable==3.17.0 5 | pyyaml==6.0.3 6 | wcwidth==0.2.14 7 | concurrent-log-handler==0.9.28 8 | -------------------------------------------------------------------------------- /docker/entrypoint_dbg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' >> /etc/bash.bashrc 5 | 6 | trap : TERM INT; sleep infinity & wait 7 | -------------------------------------------------------------------------------- /tests/input_dict/test_dbg_stages.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [], 3 | "dictionary_exclude": [ 4 | { 5 | "schema": "schm_other_1", 6 | "table": "some_tbl", 7 | } 8 | ], 9 | } -------------------------------------------------------------------------------- /rest_api/runners/background/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRunner 2 | from .init import InitRunner 3 | from .dump import DumpRunner 4 | from .scan import ScanRunner 5 | from .restore import RestoreRunner 6 | -------------------------------------------------------------------------------- /tests/input_dict/meta_words_and_phrases_constants.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_const": { 3 | "constants": [ 4 | "CompanyNameWordSens", 5 | "include CompanyNamePhrase" 6 | ] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /rest_api/runners/background/init.py: -------------------------------------------------------------------------------- 1 | from rest_api.runners.background import BaseRunner 2 | 3 | from pg_anon.common.enums import AnonMode 4 | 5 | 6 | class InitRunner(BaseRunner): 7 | mode: str = AnonMode.INIT.value 8 | -------------------------------------------------------------------------------- /tests/input_dict/meta_partial_constants.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_const": { 3 | "partial_constants": [ 4 | "_NamE_", # case insensitive test 5 | ".cOm" # case insensitive test 6 | ] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /rest_api/constants.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from pg_anon.common.constants import BASE_DIR 5 | 6 | BASE_TEMP_DIR = Path(tempfile.gettempdir()) / 'pg_anon' 7 | DUMP_STORAGE_BASE_DIR = (BASE_DIR / 'output').resolve() 8 | -------------------------------------------------------------------------------- /tests/input_dict/meta_include_rules.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | 
"schema": "schm_other_2", 5 | "table": "tbl_test_anon_functions", 6 | "fields": ["fld_5_email"] 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_with_no_existing_schema.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"not_exists_schema", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | }, 10 | ], 11 | } 12 | -------------------------------------------------------------------------------- /tests/input_dict/test_empty_meta_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { 3 | "rules": [], 4 | "constants": [] 5 | }, 6 | "skip_rules": [], 7 | "data_regex": { 8 | "rules": [] 9 | }, 10 | "data_const": { 11 | "constants": [] 12 | }, 13 | "funcs": {} 14 | } -------------------------------------------------------------------------------- /pg_anon/version.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | 3 | 4 | try: 5 | # Get version from metadata 6 | __version__ = version("pg_anon") 7 | except PackageNotFoundError: 8 | # TMP: if package is not installed, return hardcoded 9 | __version__ = "1.8.5" 10 | -------------------------------------------------------------------------------- /tests/input_dict/meta_data_sql_condition.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_sql_condition": [ 3 | { 4 | "schema": "schm_customer", 5 | "table": "customer_company", 6 | "sql_condition": 7 | """ 8 | WHERE inn is null 9 | """ 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /docker/motd: -------------------------------------------------------------------------------- 1 | ============================================= 2 | # Documentation 3 | https://github.com/TantorLabs/pg_anon/blob/master/README.md 4 | ============================================= 5 | python3 pg_anon.py --help 6 | 7 | # Run tests 8 | python3 tests/test_full.py -v 9 | ============================================= 10 | -------------------------------------------------------------------------------- /tests/input_dict/meta_data_func.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_func": { 3 | "anyelement": [ 4 | { 5 | "scan_func": "test_anon_funcs.test_check_is_company_email", 6 | "anon_func": "anon_funcs.partial_email(\"%s\")", 7 | "n_count": 1, 8 | }, 9 | ], 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_include_rule_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_2", 5 | "table": "tbl_test_anon_functions", 6 | "fields": { 7 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 8 | } 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_struct.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_2", 5 | "table":"exclude_tbl" 6 | }, 7 | { 8 | "schema":"schm_other_2", 9 | "table":"some_tbl" 10 | }, 11 | { 12 | 
"schema":"schm_mask_include_1", 13 | "table":"tbl_123" 14 | } 15 | ], 16 | "dictionary_exclude": [ 17 | { 18 | "schema_mask": "*", 19 | "table_mask": "*", 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /tests/input_dict/test_exclude.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | } 10 | ], 11 | "dictionary_exclude": [ 12 | { 13 | "schema_mask": "*", 14 | "table_mask": "*", 15 | } 16 | ], 17 | "validate_tables": [ # only this tables must contains rows 18 | { 19 | "schema": "schm_other_1", 20 | "table": "some_tbl" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_data_2.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const modified'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified 2' as val FROM schm_other_2.some_tbl" 14 | } 15 | ], 16 | "dictionary_exclude": [ 17 | { 18 | "schema_mask": "*", 19 | "table_mask": "*", 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /tests/input_dict/test_partial_exclude_tables_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "tables": [ 3 | { 4 | "schema": "public", 5 | "table": "inn_info" 6 | }, 7 | { 8 | "schema": "schm_other_1", 9 | "table_mask": "*" 10 | }, 11 | { 12 | "schema_mask": ".*customer.*", 13 | "table": "customer_manager" 14 | }, 15 | { 16 | "schema_mask": "^_SCHM", 17 | "table_mask": ".*2$" 18 | }, 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/config.yml: -------------------------------------------------------------------------------- 1 | pg-utils-versions: 2 | 15: 3 | pg_dump: "/usr/lib/postgresql/15/bin/pg_dump" 4 | pg_restore: "/usr/lib/postgresql/15/bin/pg_restore" 5 | 16: 6 | pg_dump: "/usr/lib/postgresql/16/bin/pg_dump" 7 | pg_restore: "/usr/lib/postgresql/16/bin/pg_restore" 8 | 17: 9 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 10 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 11 | default: 12 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 13 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 14 | -------------------------------------------------------------------------------- /rest_api/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, StrEnum 2 | 3 | 4 | class ScanMode(StrEnum): 5 | FULL = "full" 6 | PARTIAL = "partial" 7 | 8 | 9 | class DumpMode(StrEnum): 10 | FULL = "dump" 11 | STRUCT = "sync-struct-dump" 12 | DATA = "sync-data-dump" 13 | 14 | 15 | class RestoreMode(StrEnum): 16 | FULL = "restore" 17 | STRUCT = "sync-struct-restore" 18 | DATA = "sync-data-restore" 19 | 20 | 21 | class ResponseStatus(Enum): 22 | UNKNOWN = 1 23 | SUCCESS = 2 24 | ERROR = 3 25 | IN_PROGRESS = 4 26 | STARTING = 5 27 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := build 2 | .PHONY: build 3 | 4 | check-env: 5 | ifndef PG_VERSION 6 | 
$(error PG_VERSION is undefined) 7 | endif 8 | 9 | default: build 10 | 11 | build: 12 | if test -d pg_anon; \ 13 | then cd pg_anon && git pull; \ 14 | else git clone https://github.com/TantorLabs/pg_anon.git; \ 15 | fi 16 | 17 | docker build -t pg_anon:pg${PG_VERSION} --build-arg PG_VERSION=${PG_VERSION} . 18 | 19 | .PHONY: clean 20 | clean: 21 | rm -rf pg_anon 22 | 23 | .PHONY: prune 24 | prune: 25 | docker images prune -a 26 | docker system prune -a -f 27 | -------------------------------------------------------------------------------- /tests/input_dict/mask_test.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema_mask": "*", 5 | "table_mask": "*", 6 | "fields": { 7 | "amount": "101010" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_1", 12 | "table":"some_tbl", 13 | "fields": { 14 | "val":"'text const'" 15 | } 16 | }, 17 | { 18 | "schema_mask": "*", 19 | "table": "tbl_100", 20 | "fields": { 21 | "amount": "202020" 22 | } 23 | }, 24 | { 25 | "schema":"schm_other_2", 26 | "table":"some_tbl", 27 | "raw_sql": "SELECT id, val || ' modified' as val FROM schm_other_2.some_tbl" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_data.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_2", 5 | "table":"exclude_tbl", 6 | "fields": { 7 | "val":"'text const modified'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified 2' as val FROM schm_other_2.some_tbl" 14 | }, 15 | { 16 | "schema":"schm_mask_include_1", 17 | "table":"tbl_123", 18 | "fields": { 19 | "val":"anon_funcs.partial(val,1,'***',3)" 20 | } 21 | } 22 | ], 23 | "dictionary_exclude": [ 24 | { 25 | "schema_mask": "*", 26 | "table_mask": "*", 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /tests/input_dict/test_partial_tables_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "tables": [ 3 | { 4 | "schema": "public", 5 | "table": "inn_info" 6 | }, 7 | { 8 | "schema": "_SCHM.$complex#имя;@&* a'", 9 | "table_mask": "^_TBL" 10 | }, 11 | { 12 | "schema_mask": "^schm_other", 13 | "table": "some_tbl" 14 | }, 15 | { 16 | "schema_mask": "schm_customer", 17 | "table_mask": "*" 18 | }, 19 | { 20 | "schema_mask": "^*", # wrong regex 21 | "table_mask": "^*" # wrong regex 22 | }, 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_type_aliases_complex.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | } 7 | ], 8 | "field": { 9 | "rules": [".*"] 10 | }, 11 | "funcs": { 12 | "default": "anon_funcs.digest(\"%s\", 'default', 'md5')", 13 | "character varying (20)": "anon_funcs.digest(\"%s\", 'varchar(20)', 'md5')", 14 | "bit varying (5) ": "anon_funcs.digest(\"%s\", 'varbit(5)', 'md5')", 15 | "time (3) without time zone": "anon_funcs.digest(\"%s\", 'time(3)', 'md5')", 16 | "time (3) with time zone": "anon_funcs.digest(\"%s\", 'timetz(3)', 'md5')", 17 | "double precision": "anon_funcs.digest(\"%s\", 'float', 'md5')", 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/tests/input_dict/test_sens_with_sql_conditions.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_4", 5 | "table": "goods", 6 | "fields": { 7 | "title": "anon_funcs.digest(\"title\", 'salt_word', 'sha256')", 8 | "description": "anon_funcs.digest(\"description\", 'salt_word', 'sha256')", 9 | "quantity": "10", 10 | }, 11 | "sql_condition": 12 | """ 13 | WHERE release_date > NOW() - '15 days'::interval 14 | AND valid_until < NOW() + '15 days'::interval 15 | """ 16 | } 17 | ], 18 | "dictionary_exclude": [ 19 | { 20 | "schema_mask": "*", 21 | "table_mask": "*", 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/input_dict/meta_include_and_skip_rules.py: -------------------------------------------------------------------------------- 1 | { 2 | "skip_rules": [ 3 | { 4 | "schema_mask": "*", 5 | "table": "customer_company", 6 | "fields": ["inn"] 7 | }, 8 | { 9 | "schema_mask": "mask", 10 | "fields": ["val"] 11 | }, 12 | { 13 | "schema_mask": "*", 14 | "table_mask": "complex", 15 | "fields": ["fld_key"], 16 | }, 17 | ], 18 | "include_rules": [ 19 | { 20 | "schema_mask": "*", 21 | "fields": ["email", "inn", "phone", "val", "site"] 22 | }, 23 | { 24 | "schema_mask": "*", 25 | "table": "_TBL.$complex#имя;@&* a'", 26 | }, 27 | { 28 | "schema_mask": "mask", 29 | "table_mask": "^card", 30 | }, 31 | { 32 | "schema": "schm_other_2", 33 | "table_mask": "anon", 34 | }, 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pg_anon" 3 | version = "1.8.5" 4 | description = "PostgreSQL anonymization tool." 
5 | authors = [ 6 | {name="Tantor Labs", email="tantor@tantorlabs.ru"} 7 | ] 8 | readme = "README.md" 9 | dependencies=[ 10 | "aioprocessing==2.0.1", 11 | "asyncpg==0.29.0", 12 | "async-timeout==4.0.3", 13 | "prettytable>=3.17.0", 14 | "pyyaml (>=6.0.3,<7.0.0)", 15 | "concurrent-log-handler (>=0.9.28,<0.10.0)" 16 | ] 17 | 18 | [build-system] 19 | requires = ["setuptools>=78"] 20 | build-backend = "setuptools.build_meta" 21 | 22 | [project.scripts] 23 | pg_anon = "pg_anon.__main__:main" 24 | 25 | [tool.setuptools] 26 | include-package-data = false 27 | packages.find.include = ["pg_anon", "pg_anon.*"] 28 | packages.find.exclude = ["tests", "dict"] 29 | 30 | [tool.poetry] 31 | name = "pg_anon" 32 | version = "1.8.5" 33 | description = "" 34 | authors = ["Tantor Labs <tantor@tantorlabs.ru>"] 35 | readme = "README.md" 36 | 37 | packages = [ 38 | { include = "pg_anon" } 39 | ] 40 | 41 | [tool.poetry.dependencies] 42 | python = "^3.11" 43 | -------------------------------------------------------------------------------- /rest_api/dependencies.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from fastapi import HTTPException, Query, status 6 | 7 | from pg_anon.common.constants import RUNS_BASE_DIR 8 | 9 | 10 | def date_range_filter( 11 | date_before: Optional[date] = Query(None, description="Filter: operations before this date"), 12 | date_after: Optional[date] = Query(None, description="Filter: operations after this date"), 13 | ): 14 | if date_before and date_after and date_after > date_before: 15 | raise HTTPException( 16 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 17 | detail="`date_after` must be less than or equal to `date_before`", 18 | ) 19 | return {"date_before": date_before, "date_after": date_after} 20 | 21 | 22 | def get_operation_run_dir(internal_operation_id: str) -> Path: 23 | for run_dir in RUNS_BASE_DIR.glob(f'*/*/*/{internal_operation_id}'): 24 | return run_dir 25 | 26 | raise HTTPException( 27 | status_code=status.HTTP_404_NOT_FOUND, 28 | detail="Operation run directory not found", 29 | ) 30 | -------------------------------------------------------------------------------- /pg_anon/common/multiprocessing_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Callable 3 | 4 | import aioprocessing 5 | 6 | 7 | async def init_process(name: str, ctx, target_func: Callable, tasks: List, *args, **kwargs): 8 | from pg_anon.context import Context 9 | 10 | ctx: Context 11 | start_t = time.time() 12 | ctx.logger.info(f"================> Process [{name}] started. Input items: {len(tasks)}") 13 | queue = aioprocessing.AioQueue() 14 | 15 | p = aioprocessing.AioProcess( 16 | target=target_func, 17 | args=(name, queue, tasks, *args), 18 | kwargs=kwargs, 19 | ) 20 | p.start() 21 | res = None 22 | while True: 23 | result = await queue.coro_get() 24 | if result is None: 25 | break 26 | res = result 27 | await p.coro_join() 28 | end_t = time.time() 29 | elapsed = round(end_t - start_t, 2) 30 | result_item_log = str(len(res)) if res is not None else "0" 31 | ctx.logger.info( 32 | f"<================ Process [{name}] finished, elapsed: {elapsed} sec. 
Result {result_item_log} item(s)" 33 | ) 34 | return res 35 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | pg_ctlcluster ${PG_VERSION} main start 5 | 6 | sed -i '/listen_addresses/s/^#//g' /etc/postgresql/${PG_VERSION}/main/postgresql.conf 7 | sed -ie "s/^listen_addresses.*/listen_addresses = '127.0.0.1'/" /etc/postgresql/${PG_VERSION}/main/postgresql.conf 8 | sed -i -e '/local.*peer/s/postgres/all/' -e 's/peer\|md5/trust/g' /etc/postgresql/${PG_VERSION}/main/pg_hba.conf 9 | 10 | pg_ctlcluster ${PG_VERSION} main restart 11 | 12 | psql -c "ALTER USER postgres WITH PASSWORD 'YmTLbLTLxF'" -U postgres 13 | psql -c "CREATE USER anon_test_user WITH PASSWORD 'mYy5RexGsZ' SUPERUSER" -U postgres 14 | 15 | ln -s /usr/share/pg_anon/pg_anon.py /usr/bin/pg_anon.py 16 | 17 | cat > /usr/bin/pg_anon << EOL 18 | #!/bin/bash 19 | python3 /usr/share/pg_anon/pg_anon.py \$@ 20 | EOL 21 | 22 | chmod +x /usr/bin/pg_anon 23 | chown postgres:postgres -R /usr/share/pg_anon 24 | 25 | usermod -d /usr/share/pg_anon postgres 26 | 27 | cd /usr/share/pg_anon 28 | 29 | echo 'export PYTHONPATH=/usr/share/pg_anon' >> /etc/bash.bashrc 30 | 31 | echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' >> /etc/bash.bashrc 32 | 33 | trap : TERM INT; sleep infinity & wait 34 | -------------------------------------------------------------------------------- /pg_anon/common/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ResultCode(Enum): 5 | DONE = "done" 6 | FAIL = "fail" 7 | UNKNOWN = "unknown" 8 | 9 | 10 | class VerboseOptions(Enum): 11 | INFO = "info" 12 | DEBUG = "debug" 13 | ERROR = "error" 14 | 15 | 16 | class AnonMode(Enum): 17 | DUMP = "dump" # dump table contents to files using dictionary 18 | RESTORE = "restore" # create tables in target database and load data from files 19 | INIT = "init" # create a schema with anonymization helper functions 20 | SYNC_DATA_DUMP = "sync-data-dump" # synchronize the contents of one or more tables (dump stage) 21 | SYNC_DATA_RESTORE = "sync-data-restore" # synchronize the contents of one or more tables (restore stage) 22 | SYNC_STRUCT_DUMP = "sync-struct-dump" # synchronize the structure of one or more tables (dump stage) 23 | SYNC_STRUCT_RESTORE = "sync-struct-restore" # synchronize the structure of one or more tables (restore stage) 24 | CREATE_DICT = "create-dict" # create dictionary 25 | VIEW_FIELDS = "view-fields" # view fields 26 | VIEW_DATA = "view-data" # view data using prepared-sens-dict-file 27 | 28 | 29 | class ScanMode(Enum): 30 | FULL = "full" 31 | PARTIAL = "partial" 32 | -------------------------------------------------------------------------------- /tests/sql/init_additional_simple_env.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS test_simple.orders CASCADE; 2 | DROP TABLE IF EXISTS test_simple.clients CASCADE; 3 | 4 | CREATE TABLE test_simple.clients 5 | ( 6 | id serial, 7 | firstname character varying(32), 8 | lastname character varying(32), 9 | email character varying(64), 10 | phone character varying(32), 11 | CONSTRAINT clients_pk UNIQUE (id) 12 | ); 13 | 14 | CREATE TABLE test_simple.orders 15 | ( 16 | id serial, 17 | item_id integer NOT NULL, 18 | amount numeric(16,4) DEFAULT 0 NOT NULL, 19 | details text, 20 | status_id integer NOT NULL, 21 
| CONSTRAINT orders_pk UNIQUE (id) 22 | ); 23 | 24 | -- prepare data 25 | INSERT INTO test_simple.clients 26 | (firstname, lastname, email, phone) 27 | select 28 | 'first_name_' || v as firstname, 29 | 'last_name_' || v as lastname, 30 | 'first_name_' ||v || '.last_name_' || v || '@' || 'some_hoster_' || v || '.com' as email, 31 | 79101438060 + v as phone 32 | from generate_series(1,1512) as v; 33 | 34 | INSERT INTO test_simple.orders 35 | (item_id, amount, details, status_id) 36 | select 37 | v as item_id, 38 | floor(v * 0.7)::integer as amount, 39 | 'details_' || v as details, 40 | v % 2 41 | from generate_series(1,1512) as v; 42 | -------------------------------------------------------------------------------- /pg_anon/modes/initialization.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.constants import BASE_DIR 2 | from pg_anon.common.db_utils import create_connection 3 | from pg_anon.common.utils import exception_helper 4 | from pg_anon.context import Context 5 | 6 | 7 | class InitMode: 8 | def __init__(self, context: Context): 9 | self.context = context 10 | 11 | async def run(self) -> None: 12 | self.context.logger.info("-------------> Started init mode") 13 | 14 | async def handle_notice(connection, message): 15 | self.context.logger.info("NOTICE: %s" % message) 16 | 17 | db_conn = await create_connection(self.context.connection_params, server_settings=self.context.server_settings) 18 | db_conn.add_log_listener(handle_notice) 19 | 20 | tr = db_conn.transaction() 21 | await tr.start() 22 | 23 | try: 24 | with open(BASE_DIR / "init.sql", "r") as f: 25 | data = f.read() 26 | await db_conn.execute(data) 27 | await tr.commit() 28 | 29 | self.context.logger.info("<------------- Finished init mode") 30 | except Exception as ex: 31 | self.context.logger.error("<------------- Init failed\n" + exception_helper()) 32 | await tr.rollback() 33 | raise ex 34 | finally: 35 | await db_conn.close() 36 | -------------------------------------------------------------------------------- /tests/sql/init_simple_env.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA IF EXISTS test_simple CASCADE; 2 | CREATE SCHEMA IF NOT EXISTS test_simple; 3 | 4 | DROP TABLE IF EXISTS test_simple.customer_company CASCADE; 5 | DROP TABLE IF EXISTS test_simple.contracts CASCADE; 6 | 7 | CREATE TABLE test_simple.customer_company 8 | ( 9 | id serial, 10 | company_name character varying(32), 11 | email character varying(64), 12 | phone character varying(32), 13 | site character varying(64), 14 | inn bigint, 15 | CONSTRAINT customer_company_pkey UNIQUE (id), 16 | CONSTRAINT inn_uniq UNIQUE (inn) 17 | ); 18 | 19 | CREATE TABLE test_simple.contracts 20 | ( 21 | id serial, 22 | customer_company_id integer NOT NULL, 23 | customer_manager_id integer NOT NULL, 24 | amount numeric(16,4) DEFAULT 0 NOT NULL, 25 | details text, 26 | status_id integer NOT NULL, 27 | contract_expires timestamp, 28 | CONSTRAINT contracts_pk UNIQUE (id) 29 | ); 30 | 31 | -- prepare data 32 | INSERT INTO test_simple.customer_company 33 | (company_name, email, phone, site, inn) 34 | select 35 | 'company_name_' || v as company_name, 36 | 'info' || v || '@' || 'company_name_' || v || '.com' as email, 37 | 79101438060 + v as phone, 38 | 'company_name_' || v || '.com' as site, 39 | 10000000 + v * 10 as inn 40 | from generate_series(1,1512) as v; 41 | 42 | INSERT INTO test_simple.contracts 43 | (customer_company_id, customer_manager_id, amount, details, 
status_id, contract_expires) 44 | select 45 | v as customer_company_id, 46 | v as customer_manager_id, 47 | floor(v * 0.7)::integer as amount, 48 | 'details_' || v as details, 49 | v % 2, 50 | NOW() + (random() * (NOW() + '365 days' - NOW())) + '365 days' as contract_expires 51 | from generate_series(1,1512) as v; 52 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # pg_anon Dockerfile 2 | 3 | ## Usage 4 | 5 | Make image: 6 | 7 | ```bash 8 | cd pg_anon/docker 9 | make PG_VERSION=15 10 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 11 | ``` 12 | 13 | Push image: 14 | 15 | ```bash 16 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 17 | 18 | docker save -o pg_anon_22_10_23.tar pg_anon:pg15 19 | 20 | curl --fail -v --user 'user:password' --upload-file pg_anon_22_10_23.tar https://nexus.tantorlabs.ru/repository/tantorlabs-raw/ 21 | ``` 22 | 23 | ## Run container 24 | 25 | ```bash 26 | # If "The container name "/pg_anon" is already in use" 27 | # docker rm -f pg_anon 28 | 29 | docker run --name pg_anon -d pg_anon:pg15 30 | docker exec -it pg_anon bash 31 | chown -R postgres . 32 | su - postgres 33 | python3 tests/test_full.py -v 34 | exit 35 | 36 | # Run and mount directory from HOST to /usr/share/pg_anon 37 | docker rm -f pg_anon 38 | docker run --name pg_anon -v $PWD:/usr/share/pg_anon -d pg_anon:pg15 39 | ``` 40 | 41 | If tests raise an error like: `asyncpg.exceptions.ExternalRoutineError: program "gzip > ... *.dat.gz" failed` 42 | 43 | See: [Configure permission](https://github.com/TantorLabs/pg_anon#configure-permission) 44 | 45 | ## Load saved image 46 | 47 | ```bash 48 | docker load < pg_anon_22_9_12.tar 49 | ``` 50 | 51 | ## How to debug container 52 | 53 | ```bash 54 | docker exec -it pg_anon bash 55 | >> 56 | Error response from daemon: Container c876d... is not running 57 | 58 | docker logs c876d... 
59 | 60 | # Fix errors in entrypoint.sh 61 | # Set "ENTRYPOINT exec /entrypoint_dbg.sh" in Dockerfile 62 | 63 | docker rm -f pg_anon 64 | make PG_VERSION=15 65 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 66 | docker run --name pg_anon -d pg_anon:pg15 67 | docker exec -it pg_anon bash 68 | ``` 69 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_include_and_skip_rules_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')" 8 | } 9 | }, 10 | { 11 | "schema": "schm_customer", 12 | "table": "customer_company", 13 | "fields": { 14 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 15 | } 16 | }, 17 | { 18 | "schema": "schm_mask_ext_exclude_2", 19 | "table": "card_numbers", 20 | "fields": { 21 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 22 | "usd": "anon_funcs.noise(\"usd\", 30)", 23 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 24 | } 25 | }, 26 | { 27 | "schema": "schm_customer", 28 | "table": "customer_manager", 29 | "fields": { 30 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 31 | } 32 | }, 33 | { 34 | "schema": "public", 35 | "table": "inn_info", 36 | "fields": { 37 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 38 | } 39 | }, 40 | { 41 | "schema": "schm_other_2", 42 | "table": "tbl_test_anon_functions", 43 | "fields": { 44 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 45 | } 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /pg_anon/common/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | BASE_DIR = Path(__file__).resolve().parent.parent.parent 4 | RUNS_BASE_DIR = BASE_DIR / 'runs' 5 | 6 | LOGS_DIR_NAME = 'logs' 7 | LOGS_FILE_NAME = 'logs.log' 8 | SAVED_RUN_OPTIONS_FILE_NAME = 'run_options.json' 9 | SAVED_RUN_STATUS_FILE_NAME = 'run_status.json' 10 | SAVED_DICTS_INFO_FILE_NAME = 'saved_dicts_info.json' 11 | 12 | ANON_UTILS_DB_SCHEMA_NAME = 'anon_funcs' 13 | DEFAULT_HASH_FUNC = f"{ANON_UTILS_DB_SCHEMA_NAME}.digest(\"%s\", 'salt_word', 'md5')" 14 | 15 | SERVER_SETTINGS = { 16 | "application_name": "pg_anon", 17 | "statement_timeout": "0", 18 | "lock_timeout": "0", 19 | } 20 | 21 | TRANSACTIONS_SERVER_SETTINGS = { 22 | "idle_in_transaction_session_timeout": "0", 23 | "idle_session_timeout": "0", 24 | } 25 | 26 | DEFAULT_EXCLUDED_SCHEMAS = [ 27 | ANON_UTILS_DB_SCHEMA_NAME, 28 | "pg_catalog", 29 | "information_schema" 30 | ] 31 | 32 | BASE_TYPE_ALIASES = { 33 | "varbit": "bit varying", 34 | "bool": "boolean", 35 | 36 | "char": "character", 37 | "varchar": "character varying", 38 | 39 | "int": "integer", 40 | "int4": "integer", 41 | "int2": "smallint", 42 | "int8": "bigint", 43 | 44 | "float": "double precision", 45 | "float8": "double precision", 46 | "float4": "real", 47 | "decimal": "numeric", 48 | "dec": "numeric", 49 | 50 | "serial2": "smallserial", 51 | "serial4": "serial", 52 | "serial8": "bigserial", 53 | 54 | "time": "time", 55 | "timetz": "time with time zone", 56 | 57 | "timestamp": "timestamp", 58 | "timestamptz": "timestamp with time zone", 59 | 
} 60 | 61 | SENS_PG_TYPES = ["text", "character", "varchar", "mvarchar", "json", "integer", "bigint"] 62 | 63 | SECRET_RUN_OPTIONS = [ 64 | "db_user_password" 65 | ] 66 | 67 | TRACEBACK_LINES_COUNT = 100 68 | -------------------------------------------------------------------------------- /tests/sql/init_stress_env.sql: -------------------------------------------------------------------------------- 1 | do $$ 2 | declare 3 | count_tbls integer; 4 | test_res text; 5 | q_tbl text = 'CREATE TABLE stress.tbl_%s' 6 | '(' 7 | ' id serial,' 8 | ' customer_company_id integer NOT NULL,' 9 | ' first_name character varying(32),' 10 | ' last_name character varying(32),' 11 | ' name text,' 12 | ' email character varying(64),' 13 | ' phone character varying(32),' 14 | ' fld_datetime timestamp,' 15 | ' CONSTRAINT tbl_%s_pkey UNIQUE (id)' 16 | ');'; 17 | q_insert text = 'INSERT INTO stress.tbl_%s' 18 | '(customer_company_id, first_name, last_name, name, email, phone, fld_datetime)' 19 | ' select' 20 | ' v as customer_company_id,' 21 | ' ''first_name_'' || v as first_name,' 22 | ' ''last_name_'' || v as last_name,' 23 | ' (select array_to_string(array_agg(t.v::text), '' '')' 24 | ' from (' 25 | ' select anon_funcs.random_string(10) as v' 26 | ' from generate_series(1,100)' 27 | ' ) t) as name,' 28 | ' ''first_name_'' || v || ''@'' || ''company_name_'' || v || ''.com'' as email,' 29 | ' 79101538060 + v as phone,' 30 | ' NOW() + (random() * (NOW() + ''100 days'' - NOW())) + ''100 days''' 31 | ' from generate_series(1,1512) as v'; 32 | query text; 33 | begin 34 | execute 'DROP SCHEMA IF EXISTS stress CASCADE'; 35 | execute 'CREATE SCHEMA stress'; 36 | FOR i IN 1..10 LOOP 37 | query = format(q_tbl, i, i); 38 | --raise notice '%', query; 39 | execute query; 40 | query = format(q_insert, i); 41 | --raise notice '%', query; 42 | execute query; 43 | if i % 100 = 0 then 44 | raise notice 'i = %', i; 45 | end if; 46 | END LOOP; 47 | end$$; 48 | 49 | SELECT pg_size_pretty(pg_database_size(datname)), datname, pg_database_size(datname) as v 50 | from pg_database 51 | order by v desc; 52 | -->> 53 | -- 20 GB test_source_db_stress 21824553763 54 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_default_func.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { # must be anonymized without scanning 3 | "rules": [ 4 | "^fld_5_em", 5 | "^amount", 6 | "details$", 7 | "contract_expires$", 8 | "inn$" 9 | ], 10 | "constants": [ 11 | "usd", 12 | "имя_поля" 13 | ] 14 | }, 15 | "skip_rules": [ 16 | { 17 | "schema": "schm_mask_ext_exclude_2", 18 | "table": "card_numbers", # Optional. If no "table" then whole schema will be skipped 19 | "fields": ["val_skip"] # Optional. 
If no "fields" then whole table will be skipped 20 | } 21 | ], 22 | "data_regex": { 23 | "rules": [ 24 | r"""[A-Za-z0-9]+([._-][A-Za-z0-9]+)*@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+""", # email 25 | r"^(7?\d{10})$", # phone 7XXXXXXXXXX 26 | r"^other_ext_tbl_text", # catch "schm_mask_ext_exclude_2.other_ext_tbl_2" 27 | r"""[0-9]{3}-[0-9]{2}-[0-9]{4}""", # social Security numbers "nnn-nn-nnnn" 28 | r"""\b[0-9A-Z]{3}([^ 0-9A-Z]|\s)?[0-9]{4}\b""", # license plate numbers aaa-nnnn 29 | r"""^\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}$""", # IPV4 addresses 30 | r"""^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$""", # Dates in MM/DD/YYYY format 31 | # MasterCard numbers 5258704108753590 32 | r"""^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$""", 33 | # Visa card numbers 4563-7568-5698-4587 34 | r"""\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b""", 35 | # Any card number 36 | r"""[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}""", 37 | # URLs 38 | r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()]+|\(([^\s()]+|(\([^\s()]+\)))*\))+(?:\(([^\s()]+|(\([^\s()]+\)))*\)|[^\s`!()\[\]{};:'".,?«»“”‘’]))""", 39 | r"""[0-9]{2}-[0-9]{7}""" # INN from 1c 40 | ] 41 | }, 42 | "data_const": { 43 | "constants": [ 44 | "account", 45 | "email", 46 | "слово", 47 | "сергей" 48 | ] 49 | }, 50 | "sens_pg_types": [ 51 | "text", 52 | "integer", 53 | "bigint", 54 | "character", 55 | "json" 56 | ], 57 | "funcs": { 58 | "default": "anon_funcs.digest(\"%s\", 'by_default_func', 'sha256')", 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN apt update && \ 4 | apt install -y wget vim nano htop tree sysbench net-tools sysstat less iotop && \ 5 | apt -y install curl gpg gnupg2 apt-transport-https lsb-release ca-certificates && \ 6 | apt -y install software-properties-common && \ 7 | apt -y install python3-pip && \ 8 | apt install -y locales && locale-gen en_US.UTF-8 && \ 9 | rm -rf /tmp/* && apt purge -y --auto-remove && apt clean -y autoclean 10 | 11 | ARG PG_VERSION 12 | ARG DEBIAN_FRONTEND=noninteractive 13 | 14 | # RUN if [ "$PG_VERSION" = "13" ]; then curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc| gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg; fi 15 | # RUN if [ "$PG_VERSION" = "13" ]; then echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list; fi 16 | # RUN if [ "$PG_VERSION" = "13" ]; then apt update ; fi 17 | 18 | 19 | # ======================================= 20 | # Ubuntu 20.04 21 | RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc| gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg 22 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list 23 | RUN apt update 24 | # ======================================= 25 | 26 | RUN apt -y install postgresql-${PG_VERSION} postgresql-client-${PG_VERSION} 27 | 28 | RUN add-apt-repository ppa:deadsnakes/ppa && \ 29 | apt update && \ 30 | apt install -y python3.12 python3.12-distutils python3.12-dev 31 | 32 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 33 | 34 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 
35 | python3.12 get-pip.py && \ 36 | rm get-pip.py 37 | 38 | # Add pg_anon 39 | ADD ./pg_anon /usr/share/pg_anon 40 | 41 | RUN pip3 install -r /usr/share/pg_anon/requirements.txt 42 | 43 | EXPOSE 5432 44 | 45 | ENV PG_VERSION=${PG_VERSION} 46 | 47 | ADD entrypoint.sh /entrypoint.sh 48 | RUN chmod +x /entrypoint.sh 49 | 50 | ADD entrypoint_dbg.sh /entrypoint_dbg.sh 51 | RUN chmod +x /entrypoint_dbg.sh 52 | 53 | ADD motd /etc/motd 54 | 55 | WORKDIR /usr/share/pg_anon 56 | 57 | ENTRYPOINT exec /entrypoint.sh 58 | # ENTRYPOINT exec /entrypoint_dbg.sh 59 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_type_aliases_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | "fields": { 7 | "field_type_bit": "anon_funcs.digest(\"field_type_bit\", 'bit(5)', 'md5')", 8 | "field_type_bool": "anon_funcs.digest(\"field_type_bool\", 'bool', 'md5')", 9 | "field_type_char": "anon_funcs.digest(\"field_type_char\", 'char(5)', 'md5')", 10 | "field_type_decimal": "anon_funcs.digest(\"field_type_decimal\", 'decimal(10,2)', 'md5')", 11 | "field_type_float": "anon_funcs.digest(\"field_type_float\", 'float8', 'md5')", 12 | "field_type_float4": "anon_funcs.digest(\"field_type_float4\", 'float4', 'md5')", 13 | "field_type_float8": "anon_funcs.digest(\"field_type_float8\", 'float8', 'md5')", 14 | "field_type_int": "anon_funcs.digest(\"field_type_int\", 'int4', 'md5')", 15 | "field_type_int2": "anon_funcs.digest(\"field_type_int2\", 'int2', 'md5')", 16 | "field_type_int4": "anon_funcs.digest(\"field_type_int4\", 'int4', 'md5')", 17 | "field_type_int8": "anon_funcs.digest(\"field_type_int8\", 'int8', 'md5')", 18 | "field_type_time": "anon_funcs.digest(\"field_type_time\", 'time', 'md5')", 19 | "field_type_time_p": "anon_funcs.digest(\"field_type_time_p\", 'time(3)', 'md5')", 20 | "field_type_timestamp": "anon_funcs.digest(\"field_type_timestamp\", 'timestamp', 'md5')", 21 | "field_type_timestamp_p": "anon_funcs.digest(\"field_type_timestamp_p\", 'timestamp(3)', 'md5')", 22 | "field_type_timestamptz": "anon_funcs.digest(\"field_type_timestamptz\", 'timestamptz', 'md5')", 23 | "field_type_timestamptz_p": "anon_funcs.digest(\"field_type_timestamptz_p\", 'timestamptz(3)', 'md5')", 24 | "field_type_timetz": "anon_funcs.digest(\"field_type_timetz\", 'timetz', 'md5')", 25 | "field_type_timetz_p": "anon_funcs.digest(\"field_type_timetz_p\", 'timetz(3)', 'md5')", 26 | "field_type_varbit": "anon_funcs.digest(\"field_type_varbit\", 'varbit(5)', 'md5')", 27 | "field_type_varchar": "anon_funcs.digest(\"field_type_varchar\", 'varchar(20)', 'md5')" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_type_aliases_complex_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | "fields": { 7 | "field_type_bit": "anon_funcs.digest(\"field_type_bit\", 'default', 'md5')", 8 | "field_type_bool": "anon_funcs.digest(\"field_type_bool\", 'default', 'md5')", 9 | "field_type_char": "anon_funcs.digest(\"field_type_char\", 'default', 'md5')", 10 | "field_type_decimal": "anon_funcs.digest(\"field_type_decimal\", 'default', 'md5')", 11 | "field_type_float": 
"anon_funcs.digest(\"field_type_float\", 'float', 'md5')", 12 | "field_type_float4": "anon_funcs.digest(\"field_type_float4\", 'default', 'md5')", 13 | "field_type_float8": "anon_funcs.digest(\"field_type_float8\", 'float', 'md5')", 14 | "field_type_int": "anon_funcs.digest(\"field_type_int\", 'default', 'md5')", 15 | "field_type_int2": "anon_funcs.digest(\"field_type_int2\", 'default', 'md5')", 16 | "field_type_int4": "anon_funcs.digest(\"field_type_int4\", 'default', 'md5')", 17 | "field_type_int8": "anon_funcs.digest(\"field_type_int8\", 'default', 'md5')", 18 | "field_type_time": "anon_funcs.digest(\"field_type_time\", 'default', 'md5')", 19 | "field_type_time_p": "anon_funcs.digest(\"field_type_time_p\", 'time(3)', 'md5')", 20 | "field_type_timestamp": "anon_funcs.digest(\"field_type_timestamp\", 'default', 'md5')", 21 | "field_type_timestamp_p": "anon_funcs.digest(\"field_type_timestamp_p\", 'default', 'md5')", 22 | "field_type_timestamptz": "anon_funcs.digest(\"field_type_timestamptz\", 'default', 'md5')", 23 | "field_type_timestamptz_p": "anon_funcs.digest(\"field_type_timestamptz_p\", 'default', 'md5')", 24 | "field_type_timetz": "anon_funcs.digest(\"field_type_timetz\", 'default', 'md5')", 25 | "field_type_timetz_p": "anon_funcs.digest(\"field_type_timetz_p\", 'timetz(3)', 'md5')", 26 | "field_type_varbit": "anon_funcs.digest(\"field_type_varbit\", 'varbit(5)', 'md5')", 27 | "field_type_varchar": "anon_funcs.digest(\"field_type_varchar\", 'varchar(20)', 'md5')" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /rest_api/runners/background/base.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import List 3 | 4 | from pg_anon.common.constants import BASE_DIR 5 | from pg_anon.common.dto import PgAnonResult 6 | from rest_api.constants import BASE_TEMP_DIR 7 | from rest_api.pydantic_models import StatelessRunnerRequest 8 | from rest_api.utils import run_pg_anon_worker 9 | 10 | 11 | class BaseRunner: 12 | mode: str 13 | request: StatelessRunnerRequest 14 | operation_id: str 15 | cli_params: List[str] = None 16 | result: PgAnonResult = None 17 | 18 | def __init__(self, request: StatelessRunnerRequest): 19 | self.request = request 20 | self.operation_id = request.operation_id 21 | self.base_tmp_dir = BASE_TEMP_DIR / f'{self.operation_id}__{uuid.uuid4()}' 22 | self._prepare_cli_params() 23 | 24 | def _prepare_db_credentials_cli_params(self): 25 | self.cli_params.extend([ 26 | f'--db-host={self.request.db_connection_params.host}', 27 | f'--db-port={self.request.db_connection_params.port}', 28 | f'--db-user={self.request.db_connection_params.user_login}', 29 | f'--db-user-password={self.request.db_connection_params.user_password}', 30 | f'--db-name={self.request.db_connection_params.db_name}', 31 | ]) 32 | 33 | def _prepare_config(self): 34 | config_file_path = BASE_DIR / "config.yml" 35 | if config_file_path.exists(): 36 | self.cli_params.extend([ 37 | f"--config={str(config_file_path)}", 38 | ]) 39 | 40 | def _prepare_verbosity_cli_params(self): 41 | self.cli_params.extend([ 42 | "--debug", 43 | ]) 44 | 45 | def _prepare_other_cli_params(self): 46 | if self.request.save_dicts: 47 | self.cli_params.extend([ 48 | "--save-dicts", 49 | ]) 50 | 51 | def _prepare_cli_params(self): 52 | self.cli_params = [] 53 | self._prepare_db_credentials_cli_params() 54 | self._prepare_config() 55 | self._prepare_other_cli_params() 56 | 57 | async def run(self): 58 | if not self.mode: 
59 | raise ValueError('Mode is not set') 60 | 61 | self.result = await run_pg_anon_worker( 62 | mode=self.mode, 63 | operation_id=self.operation_id, 64 | cli_run_params=self.cli_params 65 | ) 66 | 67 | if not self.result: 68 | raise RuntimeError('Operation not completed successfully') 69 | 70 | return self.result 71 | -------------------------------------------------------------------------------- /pg_anon/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from pathlib import Path 4 | 5 | from concurrent_log_handler import ConcurrentRotatingFileHandler 6 | 7 | 8 | class Logger: 9 | _instance = None 10 | _formatter: logging.Formatter 11 | 12 | logger = None 13 | 14 | def __new__(cls): 15 | if cls._instance is not None: 16 | return cls._instance 17 | 18 | cls._instance = super().__new__(cls) 19 | cls._instance.logger = logging.getLogger('pg_anon.logger') 20 | cls._instance.logger.setLevel(logging.INFO) 21 | 22 | cls._instance._formatter = logging.Formatter( 23 | datefmt="%Y-%m-%d %H:%M:%S", 24 | fmt="%(asctime)s,%(msecs)03d - %(levelname)8s - %(message)s", 25 | ) 26 | 27 | handler = logging.StreamHandler(sys.stdout) 28 | handler.setFormatter(cls._instance._formatter) 29 | cls._instance.logger.addHandler(handler) 30 | 31 | return cls._instance 32 | 33 | def add_file_handler(self, log_dir: Path, log_file_name: str): 34 | for handler in list(self.logger.handlers): 35 | if isinstance(handler, logging.FileHandler): 36 | self.logger.removeHandler(handler) 37 | handler.close() 38 | 39 | log_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | file_handler = ConcurrentRotatingFileHandler( 42 | log_dir / log_file_name, 43 | maxBytes=10 * 1024 * 1024, 44 | backupCount=10, 45 | ) 46 | file_handler.setFormatter(self._formatter) 47 | self.logger.addHandler(file_handler) 48 | 49 | def set_log_level(self, log_level: int): 50 | self.logger.setLevel(log_level) 51 | 52 | def __del__(self): 53 | # Close all handlers when the class instance is destroyed 54 | for handler in self.logger.handlers.copy(): 55 | try: 56 | handler.acquire() 57 | handler.flush() 58 | handler.close() 59 | except Exception as e: 60 | print(f"Error closing log handler: {e}") 61 | finally: 62 | handler.release() 63 | self.logger.removeHandler(handler) 64 | 65 | 66 | def get_logger(): 67 | return Logger().logger 68 | 69 | 70 | def logger_add_file_handler(log_dir: Path, log_file_name: str): 71 | Logger().add_file_handler( 72 | log_dir=log_dir, 73 | log_file_name=log_file_name, 74 | ) 75 | 76 | 77 | def logger_set_log_level(log_level: int): 78 | Logger().set_log_level(log_level) 79 | -------------------------------------------------------------------------------- /docs/operations/init.md: -------------------------------------------------------------------------------- 1 | # 🏗️ Init 2 | 3 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 4 | 5 | ## Overview 6 | 7 | This mode creates the `anon_funcs` schema in the source database and loads the predefined SQL functions from [init.sql](../../init.sql). 8 | These functions are required for processing data in the source database. 
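Once init has finished, the loaded helpers can be called directly in SQL. A minimal sanity check, using `anon_funcs.digest` and `anon_funcs.partial_email` — two of the functions referenced by the test dictionaries in this repository (the sample email value is arbitrary):

```sql
-- Both calls should succeed once init has created the anon_funcs schema:
SELECT anon_funcs.digest('john.doe@example.com', 'salt_word', 'md5');
SELECT anon_funcs.partial_email('john.doe@example.com');
```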
9 | 10 | ## Run example 11 | 12 | ```commandline 13 | python -m pg_anon --mode=init \ 14 | --db-user=postgres \ 15 | --db-user-password=postgres \ 16 | --db-name=source_db 17 | ``` 18 | 19 | --- 20 | 21 | ## Options 22 | 23 | ### Common pg_anon options: 24 | 25 | | Option | Required | Description | 26 | |-------------|----------|--------------------------------------------------------------------------------------------------| 27 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 28 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 29 | 30 | ### Database configuration options: 31 | 32 | | Option | Required | Description | 33 | |----------------------|----------|---------------------------------------------------------------------| 34 | | `--db-host` | Yes | Database host. | 35 | | `--db-port` | Yes | Database port. | 36 | | `--db-name` | Yes | Database name. | 37 | | `--db-user` | Yes | Database user. | 38 | | `--db-user-password` | No | Database user password. | 39 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 40 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 41 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 42 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 43 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { # must be anonymized without scanning 3 | "rules": [ 4 | "^fld_5_em", 5 | "^amount", 6 | "details$", 7 | "contract_expires$", 8 | "inn$" 9 | ], 10 | "constants": [ 11 | "usd", 12 | "имя_поля" 13 | ] 14 | }, 15 | "skip_rules": [ 16 | { 17 | "schema": "schm_mask_ext_exclude_2", 18 | "table": "card_numbers", # Optional. If no "table" then whole schema will be skipped 19 | "fields": ["val_skip"] # Optional. 
If no "fields" then whole table will be skipped 20 | }, 21 | { 22 | "schema": "schm_other_3", 23 | }, 24 | ], 25 | "data_regex": { 26 | "rules": [ 27 | r"""[A-Za-z0-9]+([._-][A-Za-z0-9]+)*@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+""", # email 28 | r"^(7?\d{10})$", # phone 7XXXXXXXXXX 29 | r"^other_ext_tbl_text", # catch "schm_mask_ext_exclude_2.other_ext_tbl_2" 30 | r"""[0-9]{3}-[0-9]{2}-[0-9]{4}""", # social Security numbers "nnn-nn-nnnn" 31 | r"""\b[0-9A-Z]{3}([^ 0-9A-Z]|\s)?[0-9]{4}\b""", # license plate numbers aaa-nnnn 32 | r"""^\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}$""", # IPV4 addresses 33 | r"""^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$""", # Dates in MM/DD/YYYY format 34 | # MasterCard numbers 5258704108753590 35 | r"""^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$""", 36 | # Visa card numbers 4563-7568-5698-4587 37 | r"""\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b""", 38 | # Any card number 39 | r"""[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}""", 40 | # URLs 41 | r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()]+|\(([^\s()]+|(\([^\s()]+\)))*\))+(?:\(([^\s()]+|(\([^\s()]+\)))*\)|[^\s`!()\[\]{};:'".,?«»“”‘’]))""", 42 | r"""[0-9]{2}-[0-9]{7}""" # INN from 1c 43 | ] 44 | }, 45 | "data_const": { 46 | "constants": [ 47 | "account", 48 | "email", 49 | "слово", 50 | "сергей" 51 | ] 52 | }, 53 | "sens_pg_types": [ 54 | "text", 55 | "integer", 56 | "bigint", 57 | "varchar", 58 | "json" 59 | ], 60 | "funcs": { 61 | "text": "anon_funcs.digest(\"%s\", 'salt_word', 'md5')", 62 | "numeric": "anon_funcs.noise(\"%s\", 10)", 63 | "numeric(30,4)": "anon_funcs.noise(\"%s\", 30)", 64 | "timestamp": "anon_funcs.dnoise(\"%s\", interval '6 month')", 65 | "bigint": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')", 66 | "integer": "anon_funcs.random_int_between(1, 10)", 67 | "mvarchar": "anon_funcs.digest(\"%s\"::text, 'salt_word', 'md5')" 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /rest_api/runners/background/dump.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.enums import AnonMode 2 | from rest_api.enums import DumpMode 3 | from rest_api.pydantic_models import DumpRequest 4 | from rest_api.runners.background import BaseRunner 5 | from rest_api.utils import write_dictionary_contents 6 | 7 | 8 | class DumpRunner(BaseRunner): 9 | mode: str = AnonMode.DUMP.value 10 | request: DumpRequest 11 | full_dump_path: str 12 | 13 | def __init__(self, request: DumpRequest): 14 | super().__init__(request) 15 | self._set_mode() 16 | 17 | def _set_mode(self): 18 | if self.request.type == DumpMode.FULL: 19 | self.mode = AnonMode.DUMP.value 20 | elif self.request.type == DumpMode.STRUCT: 21 | self.mode = AnonMode.SYNC_STRUCT_DUMP.value 22 | elif self.request.type == DumpMode.DATA: 23 | self.mode = AnonMode.SYNC_DATA_DUMP.value 24 | 25 | def _prepare_dictionaries_cli_params(self): 26 | input_sens_dict_file_names = list( 27 | write_dictionary_contents(self.request.sens_dict_contents, self.base_tmp_dir).keys() 28 | ) 29 | self.cli_params.append(f"--prepared-sens-dict-file={','.join(input_sens_dict_file_names)}") 30 | 31 | if self.request.partial_tables_dict_contents: 32 | input_partial_tables_dict_file_names = list( 33 | write_dictionary_contents(self.request.partial_tables_dict_contents, 
self.base_tmp_dir).keys() 34 | ) 35 | self.cli_params.append( 36 | f"--partial-tables-dict-file={','.join(input_partial_tables_dict_file_names)}" 37 | ) 38 | 39 | if self.request.partial_tables_exclude_dict_contents: 40 | input_partial_tables_exclude_dict_file_names = list( 41 | write_dictionary_contents(self.request.partial_tables_exclude_dict_contents, self.base_tmp_dir).keys() 42 | ) 43 | self.cli_params.append( 44 | f"--partial-tables-exclude-dict-file={','.join(input_partial_tables_exclude_dict_file_names)}" 45 | ) 46 | 47 | def _prepare_dump_path_cli_params(self): 48 | self.full_dump_path = self.request.validated_output_path 49 | self.cli_params.extend([ 50 | f'--output-dir={self.full_dump_path}', 51 | '--clear-output-dir', 52 | ]) 53 | 54 | def _prepare_parallelization_cli_params(self): 55 | if self.request.proc_count: 56 | self.cli_params.append( 57 | f'--processes={self.request.proc_count}' 58 | ) 59 | 60 | if self.request.proc_conn_count: 61 | self.cli_params.append( 62 | f'--db-connections-per-process={self.request.proc_conn_count}' 63 | ) 64 | 65 | def _prepare_pg_dump_cli_params(self): 66 | if self.request.pg_dump_path: 67 | self.cli_params.append( 68 | f'--pg-dump={self.request.pg_dump_path}' 69 | ) 70 | 71 | def _prepare_cli_params(self): 72 | super()._prepare_cli_params() 73 | self._prepare_dictionaries_cli_params() 74 | self._prepare_dump_path_cli_params() 75 | self._prepare_parallelization_cli_params() 76 | self._prepare_pg_dump_cli_params() 77 | self._prepare_verbosity_cli_params() 78 | -------------------------------------------------------------------------------- /tests/input_dict/test.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified' as val FROM schm_other_2.some_tbl" 14 | }, 15 | { 16 | "schema":"public", 17 | "table":"key_value", 18 | "fields": { 19 | "fld_value":"""SQL: 20 | CASE 21 | WHEN "fld_key" ILIKE '%email%' THEN CONCAT(md5(random()::TEXT),'@domain.com') 22 | WHEN "fld_key" ILIKE '%password%' THEN md5(fld_value) 23 | WHEN "fld_key" ILIKE '%address%' THEN 'test address' 24 | WHEN "fld_key" ILIKE '%login%' THEN 'test_login' 25 | WHEN "fld_key" ILIKE '%name%' THEN 'test_name' 26 | WHEN "fld_key" ILIKE '%amount%' THEN (select anon_funcs.noise(fld_value::int, 1000.2)::text) 27 | ELSE fld_value 28 | END""" 29 | } 30 | }, 31 | { 32 | "schema":"_SCHM.$complex#имя;@&* a'", 33 | "table":"_TBL.$complex#имя;@&* a'2", 34 | "fields": { 35 | "_FLD.$complex#имя;@&* a'": "'text const'" 36 | } 37 | }, 38 | { 39 | "schema":"_SCHM.$complex#имя;@&* a'", 40 | "table":"_TBL.$complex#имя;@&* a'3", 41 | "raw_sql": """ 42 | SELECT id, fld_key, "_FLD.$complex#имя;@&* a'" || ' (modified)' as "_FLD.$complex#имя;@&* a'" 43 | FROM "_SCHM.$complex#имя;@&* a'"."_TBL.$complex#имя;@&* a'3" 44 | """ 45 | }, 46 | { 47 | "schema":"schm_other_2", 48 | "table":"tbl_test_anon_functions", 49 | "fields": { 50 | "fld_1_int": "anon_funcs.noise(fld_1_int, 2000)", 51 | "fld_2_datetime": "anon_funcs.dnoise(fld_2_datetime, interval '1 month')", 52 | "fld_3_txt": "anon_funcs.digest(fld_3_txt, 'salt', 'sha256') ", 53 | "fld_4_txt": "anon_funcs.partial(fld_4_txt,1,'***',3)", 54 | "fld_5_email": "anon_funcs.partial_email(fld_5_email)", 55 | "fld_6_txt": "anon_funcs.random_string(7)", 56 | "fld_7_zip": "anon_funcs.random_zip()", 57 
| "fld_8_datetime": """ 58 | anon_funcs.random_date_between( 59 | fld_8_datetime - interval '1 year', 60 | fld_8_datetime + interval '1 year' 61 | ) 62 | """, 63 | "fld_9_datetime": "anon_funcs.random_date()", 64 | "fld_10_int": "anon_funcs.random_int_between(fld_10_int - 1000, fld_10_int + 2000)", 65 | "fld_11_int": "anon_funcs.random_bigint_between(6000000000, 7000000000)", 66 | "fld_12_phone": "anon_funcs.random_phone('+7')", 67 | "fld_13_txt": "anon_funcs.random_hash('seed', 'sha512')", 68 | "fld_14_txt": "anon_funcs.random_in(array['a', 'b', 'c'])", 69 | "fld_15_txt": "anon_funcs.hex_to_int(fld_15_txt)::text" 70 | } 71 | }, 72 | { 73 | "schema_mask": "^schm_mask_incl", 74 | "table_mask": "^some_t", 75 | "fields": { 76 | "val": "'text const'" 77 | } 78 | }, 79 | { 80 | "schema_mask": "^schm_mask_incl", 81 | "table": "tbl_123", 82 | "fields": { 83 | "val": "'text const'" 84 | } 85 | }, 86 | { 87 | "schema": "schm_mask_include_1", 88 | "table_mask": "\w+\_\d+\_\d+", 89 | "fields": { 90 | "val": "'text const'" 91 | } 92 | } 93 | ], 94 | "dictionary_exclude": [ 95 | { 96 | "schema":"schm_other_2", 97 | "table":"exclude_tbl" 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /docs/dicts/non-sens-dict-schema.md: -------------------------------------------------------------------------------- 1 | # 📋 Non-Sensitive Dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [🔍 Scan](../operations/scan.md) | [🗂️ Meta Dictionary](meta-dict-schema.md) | [🔐 Sensitive Dictionary](sens-dict-schema.md) | 3 | 4 | The non-sensitive dictionary is used only during the [create-dict (scan) mode](../operations/scan.md) to speed up processing. 5 | It defines which fields should be treated as non-sensitive. Fields listed here are **excluded** from all sensitivity checks according to [meta-dictionary](meta-dict-schema.md) rules. 6 | 7 | This dictionary can be created manually or generated automatically using [create-dict (scan) mode](../operations/scan.md) with `--output-no-sens-dict-file` option. 8 | 9 | > ⚠️ **Note** 10 | > 11 | > If a field appears both in the [sensitive dictionary](sens-dict-schema.md) and the non-sensitive dictionary, the sensitive dictionary takes priority. 
12 | 13 | --- 14 | 15 | ## Schema 16 | ```python 17 | { 18 | "no_sens_dictionary": [ 19 | { 20 | "schema": "", 21 | "table": "", 22 | "fields": [ 23 | "", 24 | ] 25 | }, 26 | ] 27 | } 28 | ``` 29 | 30 | --- 31 | 32 | ## ⚙️ Using the Dictionary 33 | 34 | **🏛️ Example Tables Structure** 35 | 36 | | Schema | Table | Field | 37 | |-----------|-----------|------------------| 38 | | public | employees | id | 39 | | public | employees | full_name | 40 | | public | employees | email | 41 | | public | employees | hire_date | 42 | | public | salaries | employee_id | 43 | | public | salaries | monthly_salary | 44 | | public | salaries | currency | 45 | 46 | **📘 Example Non-Sensitive Dictionary** 47 | ```python 48 | { 49 | "no_sens_dictionary": [ 50 | { 51 | "schema": "public", 52 | "table": "employees", 53 | "fields": [ 54 | "id", 55 | "hire_date", 56 | ] 57 | }, 58 | { 59 | "schema": "public", 60 | "table": "salaries", 61 | "fields": [ 62 | "employee_id", 63 | "currency", 64 | ] 65 | }, 66 | ] 67 | } 68 | ``` 69 | 70 | **This dictionary matches the following table fields:** 71 | 72 | | Schema | Table | Field | Used in `create-dict (scan)` mode | 73 | |----------|------------|------------------|------------------------------------------------------------| 74 | | public | employees | id | Excluded from sensitivity checks as a "no sensitive" field | 75 | | public | employees | full_name | Fields scanned using meta-dictionary rules | 76 | | public | employees | email | Fields scanned using meta-dictionary rules | 77 | | public | employees | hire_date | Excluded from sensitivity checks as a "no sensitive" field | 78 | | public | salaries | employee_id | Excluded from sensitivity checks as a "no sensitive" field | 79 | | public | salaries | monthly_salary | Fields scanned using meta-dictionary rules | 80 | | public | salaries | currency | Excluded from sensitivity checks as a "no sensitive" field | 81 | -------------------------------------------------------------------------------- /rest_api/runners/background/restore.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.enums import AnonMode 2 | from rest_api.enums import RestoreMode 3 | from rest_api.pydantic_models import RestoreRequest 4 | from rest_api.runners.background import BaseRunner 5 | from rest_api.utils import write_dictionary_contents 6 | 7 | 8 | class RestoreRunner(BaseRunner): 9 | mode: str = AnonMode.RESTORE.value 10 | request: RestoreRequest 11 | full_input_path: str 12 | 13 | def __init__(self, request: RestoreRequest): 14 | super().__init__(request) 15 | self._set_mode() 16 | 17 | def _set_mode(self): 18 | if self.request.type == RestoreMode.FULL: 19 | self.mode = AnonMode.RESTORE.value 20 | elif self.request.type == RestoreMode.STRUCT: 21 | self.mode = AnonMode.SYNC_STRUCT_RESTORE.value 22 | elif self.request.type == RestoreMode.DATA: 23 | self.mode = AnonMode.SYNC_DATA_RESTORE.value 24 | 25 | def _prepare_dictionaries_cli_params(self): 26 | if self.request.partial_tables_dict_contents: 27 | input_partial_tables_dict_file_names = list( 28 | write_dictionary_contents(self.request.partial_tables_dict_contents, self.base_tmp_dir).keys() 29 | ) 30 | self.cli_params.append( 31 | f"--partial-tables-dict-file={','.join(input_partial_tables_dict_file_names)}" 32 | ) 33 | 34 | if self.request.partial_tables_exclude_dict_contents: 35 | input_partial_tables_exclude_dict_file_names = list( 36 | write_dictionary_contents(self.request.partial_tables_exclude_dict_contents, 
self.base_tmp_dir).keys() 37 |             ) 38 |             self.cli_params.append( 39 |                 f"--partial-tables-exclude-dict-file={','.join(input_partial_tables_exclude_dict_file_names)}" 40 |             ) 41 | 42 |     def _prepare_input_dump_path_cli_params(self): 43 |         self.full_input_path = self.request.validated_input_path 44 |         self.cli_params.extend([ 45 |             f'--input-dir={self.full_input_path}', 46 |         ]) 47 | 48 |     def _prepare_parallelization_cli_params(self): 49 |         if self.request.proc_conn_count: 50 |             self.cli_params.append( 51 |                 f'--db-connections-per-process={self.request.proc_conn_count}' 52 |             ) 53 | 54 |     def _prepare_pg_restore_cli_params(self): 55 |         if self.request.pg_restore_path: 56 |             self.cli_params.append( 57 |                 f'--pg-restore={self.request.pg_restore_path}' 58 |             ) 59 | 60 |     def _prepare_additional_cli_params(self): 61 |         if self.request.drop_custom_check_constr: 62 |             self.cli_params.append( 63 |                 '--drop-custom-check-constr' 64 |             ) 65 |         if self.request.clean_db: 66 |             self.cli_params.append( 67 |                 '--clean-db' 68 |             ) 69 |         if self.request.drop_db: 70 |             self.cli_params.append( 71 |                 '--drop-db' 72 |             ) 73 | 74 |     def _prepare_cli_params(self): 75 |         super()._prepare_cli_params() 76 |         self._prepare_dictionaries_cli_params() 77 |         self._prepare_input_dump_path_cli_params() 78 |         self._prepare_parallelization_cli_params() 79 |         self._prepare_pg_restore_cli_params() 80 |         self._prepare_additional_cli_params() 81 |         self._prepare_verbosity_cli_params() 82 | -------------------------------------------------------------------------------- /rest_api/runners/background/scan.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pg_anon.common.enums import AnonMode 4 | from rest_api.enums import ScanMode 5 | from rest_api.pydantic_models import ScanRequest 6 | from rest_api.runners.background import BaseRunner 7 | from rest_api.utils import write_dictionary_contents 8 | 9 | 10 | class ScanRunner(BaseRunner): 11 |     mode: str = AnonMode.CREATE_DICT.value 12 |     request: ScanRequest 13 |     output_sens_dict_file_name: str 14 |     output_no_sens_dict_file_name: Optional[str] = None 15 | 16 |     def _prepare_dictionaries_cli_params(self): 17 |         input_meta_dict_file_names = list( 18 |             write_dictionary_contents(self.request.meta_dict_contents, self.base_tmp_dir).keys() 19 |         ) 20 | 21 |         input_sens_dict_file_names = None 22 |         if self.request.sens_dict_contents: 23 |             input_sens_dict_file_names = list( 24 |                 write_dictionary_contents(self.request.sens_dict_contents, self.base_tmp_dir).keys() 25 |             ) 26 | 27 |         input_no_sens_dict_file_names = None 28 |         if self.request.no_sens_dict_contents: 29 |             input_no_sens_dict_file_names = list( 30 |                 write_dictionary_contents(self.request.no_sens_dict_contents, self.base_tmp_dir).keys() 31 |             ) 32 | 33 |         self.output_sens_dict_file_name = self.base_tmp_dir / 'output_sens_dict.py' 34 | 35 |         self.cli_params.extend([ 36 |             f"--meta-dict-file={','.join(input_meta_dict_file_names)}", 37 |             f"--output-sens-dict-file={self.output_sens_dict_file_name}", 38 |         ]) 39 | 40 |         if self.request.need_no_sens_dict: 41 |             self.output_no_sens_dict_file_name = self.base_tmp_dir / 'output_no_sens_dict.py' 42 |             self.cli_params.append( 43 |                 f"--output-no-sens-dict-file={self.output_no_sens_dict_file_name}", 44 |             ) 45 | 46 |         if input_sens_dict_file_names: 47 |             self.cli_params.append( 48 |                 f"--prepared-sens-dict-file={','.join(input_sens_dict_file_names)}" 49 |             ) 50 | 51 |         if input_no_sens_dict_file_names: 52 |             self.cli_params.append( 53 |
f"--prepared-no-sens-dict-file={','.join(input_no_sens_dict_file_names)}" 54 | ) 55 | 56 | def _prepare_parallelization_cli_params(self): 57 | if self.request.proc_count: 58 | self.cli_params.append( 59 | f'--processes={self.request.proc_count}' 60 | ) 61 | 62 | if self.request.proc_conn_count: 63 | self.cli_params.append( 64 | f'--db-connections-per-process={self.request.proc_conn_count}' 65 | ) 66 | 67 | def _prepare_scan_mode_cli_params(self): 68 | if self.request.type == ScanMode.PARTIAL and self.request.depth: 69 | self.cli_params.extend([ 70 | f'--scan-mode={ScanMode.PARTIAL.value}', 71 | f'--scan-partial-rows={self.request.depth}', 72 | ]) 73 | else: 74 | self.cli_params.append( 75 | f'--scan-mode={ScanMode.FULL.value}' 76 | ) 77 | 78 | def _prepare_cli_params(self): 79 | super()._prepare_cli_params() 80 | self._prepare_dictionaries_cli_params() 81 | self._prepare_parallelization_cli_params() 82 | self._prepare_scan_mode_cli_params() 83 | self._prepare_verbosity_cli_params() 84 | -------------------------------------------------------------------------------- /tests/expected_results/PGAnonMaskUnitTest_target_tables.result: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "public", 4 | "contracts", 5 | "amount", 6 | [ 7 | [ 8 | 101010.0 9 | ], 10 | [ 11 | 101010.0 12 | ], 13 | [ 14 | 101010.0 15 | ], 16 | [ 17 | 101010.0 18 | ], 19 | [ 20 | 101010.0 21 | ] 22 | ] 23 | ], 24 | [ 25 | "public", 26 | "tbl_100", 27 | "amount", 28 | [ 29 | [ 30 | 202020.0 31 | ], 32 | [ 33 | 202020.0 34 | ], 35 | [ 36 | 202020.0 37 | ], 38 | [ 39 | 202020.0 40 | ], 41 | [ 42 | 202020.0 43 | ] 44 | ] 45 | ], 46 | [ 47 | "schm_other_1", 48 | "some_tbl", 49 | "val", 50 | [ 51 | [ 52 | "text const" 53 | ], 54 | [ 55 | "text const" 56 | ], 57 | [ 58 | "text const" 59 | ], 60 | [ 61 | "text const" 62 | ], 63 | [ 64 | "text const" 65 | ] 66 | ] 67 | ], 68 | [ 69 | "schm_other_2", 70 | "some_tbl", 71 | "val", 72 | [ 73 | [ 74 | "text_val_1 modified" 75 | ], 76 | [ 77 | "text_val_2 modified" 78 | ], 79 | [ 80 | "text_val_3 modified" 81 | ], 82 | [ 83 | "text_val_4 modified" 84 | ], 85 | [ 86 | "text_val_5 modified" 87 | ] 88 | ] 89 | ], 90 | [ 91 | "schm_other_4", 92 | "partitioned_table", 93 | "amount", 94 | [ 95 | [ 96 | 101010.0 97 | ], 98 | [ 99 | 101010.0 100 | ], 101 | [ 102 | 101010.0 103 | ], 104 | [ 105 | 101010.0 106 | ], 107 | [ 108 | 101010.0 109 | ] 110 | ] 111 | ], 112 | [ 113 | "schm_other_4", 114 | "partitioned_table_2025_01", 115 | "amount", 116 | [ 117 | [ 118 | 101010.0 119 | ], 120 | [ 121 | 101010.0 122 | ] 123 | ] 124 | ], 125 | [ 126 | "schm_other_4", 127 | "partitioned_table_2025_02", 128 | "amount", 129 | [ 130 | [ 131 | 101010.0 132 | ] 133 | ] 134 | ], 135 | [ 136 | "schm_other_4", 137 | "partitioned_table_2025_03", 138 | "amount", 139 | [ 140 | [ 141 | 101010.0 142 | ] 143 | ] 144 | ], 145 | [ 146 | "schm_other_4", 147 | "partitioned_table_default", 148 | "amount", 149 | [ 150 | [ 151 | 101010.0 152 | ] 153 | ] 154 | ] 155 | ] -------------------------------------------------------------------------------- /docs/dicts/tables-dictionary.md: -------------------------------------------------------------------------------- 1 | # 📑 Tables dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [💾 Dump](../operations/dump.md) | [📂 Restore](../operations/restore.md) 3 | 4 | ## Overview 5 | The tables dictionary defines which tables participate in the partial dump and partial restore operations. 
6 | It can act as either a whitelist (include-only) or a blacklist (exclude-only). 7 | 8 | Use this dictionary when you need to: 9 | - dump or restore only specific tables 10 | - exclude unwanted tables from the dump or restore 11 | 12 | ## Schema 13 | ```python 14 | { 15 | "tables": [ 16 | { 17 | "schema": "", # Include only this schema 18 | "schema_mask": "", # Or include schemas matching regex pattern 19 | "table": "", # Include only this table 20 | "table_mask": "", # Or include tables matching regex pattern 21 | } 22 | ] 23 | } 24 | ``` 25 | > ⚠️ **Note** 26 | > - You must use either `schema` or `schema_mask` → not both. 27 | > - You must use either `table` or `table_mask` → not both. 28 | 29 | --- 30 | 31 | ## ⚙️ Using the Dictionary 32 | 33 | You can use the same dictionary in two different roles: 34 | - Whitelist — dump/restore only the matched tables 35 | - Blacklist — dump/restore all tables except the matched ones 36 | 37 | 38 | **🏛️ Example Database Structure** 39 | 40 | | Schema | Table | 41 | |-----------|-------------| 42 | | public | employees | 43 | | public | departments | 44 | | public | positions | 45 | | public | salaries | 46 | | public | users | 47 | | ecommerce | products | 48 | | ecommerce | categories | 49 | | ecommerce | orders | 50 | | ecommerce | order_items | 51 | | tenant_a | users | 52 | | tenant_a | projects | 53 | | tenant_a | tasks | 54 | | tenant_a | comments | 55 | | tenant_b | users | 56 | | tenant_b | projects | 57 | | tenant_b | tasks | 58 | | tenant_b | comments | 59 | | tenant_c | users | 60 | | tenant_c | projects | 61 | | tenant_c | tasks | 62 | | tenant_c | comments | 63 | 64 | 65 | 66 | **📘 Example Tables Dictionary** 67 | ```python 68 | { 69 | "tables": [ 70 | { 71 | "schema": "public", 72 | "table": "employees" 73 | }, 74 | { 75 | "schema": "ecommerce", 76 | "table_mask": "^orders" 77 | }, 78 | { 79 | "schema_mask": "_a$", 80 | "table": "projects" 81 | }, 82 | { 83 | "schema_mask": "*", 84 | "table_mask": "users" 85 | }, 86 | ] 87 | } 88 | ``` 89 | 90 | **This dictionary matches the following tables:** 91 | 92 | | Schema | Table | Matched by rule | 93 | |-----------|-------------|--------------------------------------------| 94 | | ecommerce | orders | `schema="ecommerce", table_mask="^orders"` | 95 | | ecommerce | order_items | `schema="ecommerce", table_mask="^orders"` | 96 | | tenant_a | projects | `schema_mask="_a$", table="projects"` | 97 | | tenant_a | users | `schema_mask="*", table_mask="users"` | 98 | | tenant_b | users | `schema_mask="*", table_mask="users"` | 99 | | tenant_c | users | `schema_mask="*", table_mask="users"` | 100 | | public | users | `schema_mask="*", table_mask="users"` | 101 | | public | employees | `schema="public", table="employees"` | 102 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_type_aliases.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | } 7 | ], 8 | "field": { 9 | "rules": [".*"] 10 | }, 11 | "funcs": { 12 | "default": "anon_funcs.digest(\"%s\", 'default', 'md5')", 13 | "bit": "anon_funcs.digest(\"%s\", 'bit', 'md5')", 14 | "varbit": "anon_funcs.digest(\"%s\", 'varbit', 'md5')", 15 | "bool": "anon_funcs.digest(\"%s\", 'bool', 'md5')", 16 | "char": "anon_funcs.digest(\"%s\", 'char', 'md5')", 17 | "varchar": "anon_funcs.digest(\"%s\", 'varchar', 'md5')", 18 | "int": "anon_funcs.digest(\"%s\", 'int', 
'md5')", 19 | "int4": "anon_funcs.digest(\"%s\", 'int4', 'md5')", 20 | "int2": "anon_funcs.digest(\"%s\", 'int2', 'md5')", 21 | "int8": "anon_funcs.digest(\"%s\", 'int8', 'md5')", 22 | "float": "anon_funcs.digest(\"%s\", 'float', 'md5')", 23 | "float8": "anon_funcs.digest(\"%s\", 'float8', 'md5')", 24 | "float4": "anon_funcs.digest(\"%s\", 'float4', 'md5')", 25 | "decimal": "anon_funcs.digest(\"%s\", 'decimal', 'md5')", 26 | "dec": "anon_funcs.digest(\"%s\", 'dec', 'md5')", 27 | "serial2": "anon_funcs.digest(\"%s\", 'serial2', 'md5')", 28 | "serial4": "anon_funcs.digest(\"%s\", 'serial4', 'md5')", 29 | "serial8": "anon_funcs.digest(\"%s\", 'serial8', 'md5')", 30 | "time": "anon_funcs.digest(\"%s\", 'time', 'md5')", 31 | "timetz": "anon_funcs.digest(\"%s\", 'timetz', 'md5')", 32 | "timestamp": "anon_funcs.digest(\"%s\", 'timestamp', 'md5')", 33 | "timestamptz": "anon_funcs.digest(\"%s\", 'timestamptz', 'md5')", 34 | "bit(4)": "anon_funcs.digest(\"%s\", 'bit(4)', 'md5')", 35 | "bit(5)": "anon_funcs.digest(\"%s\", 'bit(5)', 'md5')", 36 | "bit(6)": "anon_funcs.digest(\"%s\", 'bit(6)', 'md5')", 37 | "varbit(4)": "anon_funcs.digest(\"%s\", 'varbit(4)', 'md5')", 38 | "varbit(5)": "anon_funcs.digest(\"%s\", 'varbit(5)', 'md5')", 39 | "varbit(6)": "anon_funcs.digest(\"%s\", 'varbit(6)', 'md5')", 40 | "char(4)": "anon_funcs.digest(\"%s\", 'char(4)', 'md5')", 41 | "char(5)": "anon_funcs.digest(\"%s\", 'char(5)', 'md5')", 42 | "char(6)": "anon_funcs.digest(\"%s\", 'char(6)', 'md5')", 43 | "varchar(19)": "anon_funcs.digest(\"%s\", 'varchar(19)', 'md5')", 44 | "varchar(20)": "anon_funcs.digest(\"%s\", 'varchar(20)', 'md5')", 45 | "varchar(21)": "anon_funcs.digest(\"%s\", 'varchar(21)', 'md5')", 46 | "decimal(10,1)": "anon_funcs.digest(\"%s\", 'decimal(10,1)', 'md5')", 47 | "decimal(10,2)": "anon_funcs.digest(\"%s\", 'decimal(10,2)', 'md5')", 48 | "decimal(11,2)": "anon_funcs.digest(\"%s\", 'decimal(11,2)', 'md5')", 49 | "time(2)": "anon_funcs.digest(\"%s\", 'time(2)', 'md5')", 50 | "time(3)": "anon_funcs.digest(\"%s\", 'time(3)', 'md5')", 51 | "time(4)": "anon_funcs.digest(\"%s\", 'time(4)', 'md5')", 52 | "timestamp(2)": "anon_funcs.digest(\"%s\", 'timestamp(2)', 'md5')", 53 | "timestamp(3)": "anon_funcs.digest(\"%s\", 'timestamp(3)', 'md5')", 54 | "timestamp(4)": "anon_funcs.digest(\"%s\", 'timestamp(4)', 'md5')", 55 | "timestamptz(2)": "anon_funcs.digest(\"%s\", 'timestamptz(2)', 'md5')", 56 | "timestamptz(3)": "anon_funcs.digest(\"%s\", 'timestamptz(3)', 'md5')", 57 | "timestamptz(4)": "anon_funcs.digest(\"%s\", 'timestamptz(4)', 'md5')", 58 | "timetz(2)": "anon_funcs.digest(\"%s\", 'timetz(2)', 'md5')", 59 | "timetz(3)": "anon_funcs.digest(\"%s\", 'timetz(3)', 'md5')", 60 | "timetz(4)": "anon_funcs.digest(\"%s\", 'timetz(4)', 'md5')", 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tests/expected_results/PGAnonMaskUnitTest_source_tables.result: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "columnar_internal", 4 | "tbl_200", 5 | "id", 6 | [] 7 | ], 8 | [ 9 | "columnar_internal", 10 | "tbl_200", 11 | "val", 12 | [] 13 | ], 14 | [ 15 | "columnar_internal", 16 | "tbl_200", 17 | "val_skip", 18 | [] 19 | ], 20 | [ 21 | "public", 22 | "contracts", 23 | "amount", 24 | [ 25 | [ 26 | 0.0 27 | ], 28 | [ 29 | 1.0 30 | ], 31 | [ 32 | 2.0 33 | ], 34 | [ 35 | 2.0 36 | ], 37 | [ 38 | 3.0 39 | ] 40 | ] 41 | ], 42 | [ 43 | "public", 44 | "tbl_100", 45 | "amount", 46 | [ 47 | [ 48 | 0.1 49 | ], 50 | [ 51 | 0.2 52 | 
], 53 | [ 54 | 0.3 55 | ], 56 | [ 57 | 0.4 58 | ], 59 | [ 60 | 0.5 61 | ] 62 | ] 63 | ], 64 | [ 65 | "schm_other_1", 66 | "some_tbl", 67 | "val", 68 | [ 69 | [ 70 | "text_val_1" 71 | ], 72 | [ 73 | "text_val_2" 74 | ], 75 | [ 76 | "text_val_3" 77 | ], 78 | [ 79 | "text_val_4" 80 | ], 81 | [ 82 | "text_val_5" 83 | ] 84 | ] 85 | ], 86 | [ 87 | "schm_other_2", 88 | "some_tbl", 89 | "val", 90 | [ 91 | [ 92 | "text_val_1" 93 | ], 94 | [ 95 | "text_val_2" 96 | ], 97 | [ 98 | "text_val_3" 99 | ], 100 | [ 101 | "text_val_4" 102 | ], 103 | [ 104 | "text_val_5" 105 | ] 106 | ] 107 | ], 108 | [ 109 | "schm_other_4", 110 | "partitioned_table", 111 | "amount", 112 | [ 113 | [ 114 | 99.98 115 | ], 116 | [ 117 | 25.5 118 | ], 119 | [ 120 | 149.97 121 | ], 122 | [ 123 | 15.7 124 | ], 125 | [ 126 | 76.23 127 | ] 128 | ] 129 | ], 130 | [ 131 | "schm_other_4", 132 | "partitioned_table_2025_01", 133 | "amount", 134 | [ 135 | [ 136 | 99.98 137 | ], 138 | [ 139 | 25.5 140 | ] 141 | ] 142 | ], 143 | [ 144 | "schm_other_4", 145 | "partitioned_table_2025_02", 146 | "amount", 147 | [ 148 | [ 149 | 149.97 150 | ] 151 | ] 152 | ], 153 | [ 154 | "schm_other_4", 155 | "partitioned_table_2025_03", 156 | "amount", 157 | [ 158 | [ 159 | 15.7 160 | ] 161 | ] 162 | ], 163 | [ 164 | "schm_other_4", 165 | "partitioned_table_default", 166 | "amount", 167 | [ 168 | [ 169 | 76.23 170 | ] 171 | ] 172 | ] 173 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Service specific 2 | output/ 3 | log/ 4 | runs/ 5 | tests/output_dict/ 6 | tests/output/ 7 | tests/saved_results/ 8 | docker/pg_anon 9 | *.tar 10 | *.tar.gz 11 | venv*/* 12 | venv* 13 | tmp* 14 | tmp*/* 15 | .idea/ 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | cover/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | .pybuilder/ 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # poetry 114 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 115 | # This is especially recommended for binary packages to ensure reproducibility, and is more 116 | # commonly ignored for libraries. 117 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 118 | #poetry.lock 119 | 120 | # pdm 121 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 122 | #pdm.lock 123 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 124 | # in version control. 125 | # https://pdm.fming.dev/#use-with-ide 126 | .pdm.toml 127 | 128 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | -------------------------------------------------------------------------------- /rest_api/runners/direct/view_data.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from pg_anon.cli import build_run_options 4 | from pg_anon.common.dto import PgAnonResult 5 | from pg_anon.context import Context 6 | from pg_anon.modes.view_data import ViewDataMode 7 | from rest_api.constants import BASE_TEMP_DIR 8 | from rest_api.pydantic_models import ViewDataRequest, ViewDataContent 9 | from rest_api.utils import write_dictionary_contents 10 | 11 | 12 | class ViewDataRunner: 13 |     request: ViewDataRequest 14 |     cli_params: List[str] = None 15 |     result: PgAnonResult = None 16 |     _executor: Type[ViewDataMode] 17 | 18 |     def __init__(self, request: ViewDataRequest): 19 |         self.request = request 20 |         self._prepare_cli_params() 21 |         self._init_context() 22 |         self._init_executor() 23 | 24 |     def _prepare_db_credentials_cli_params(self): 25 |         self.cli_params.extend([ 26 |             f'--db-host={self.request.db_connection_params.host}', 27 |             f'--db-port={self.request.db_connection_params.port}', 28 |             f'--db-user={self.request.db_connection_params.user_login}', 29 |             f'--db-user-password={self.request.db_connection_params.user_password}', 30 |             f'--db-name={self.request.db_connection_params.db_name}', 31 |         ]) 32 | 33 |     def _prepare_dictionaries_cli_params(self): 34 |         self._input_sens_dict_file_names = write_dictionary_contents(self.request.sens_dict_contents, BASE_TEMP_DIR) 35 |         self.cli_params.append( 36 |             f"--prepared-sens-dict-file={','.join(self._input_sens_dict_file_names.keys())}" 37 |         ) 38 | 39 |     def _prepare_filters_cli_params(self): 40 |         self.cli_params.append( 41 |             f'--schema-name={self.request.schema_name}', 42 |         ) 43 | 44 |
self.cli_params.append( 45 | f'--table-name={self.request.table_name}', 46 | ) 47 | 48 | def _prepare_pagination_cli_params(self): 49 | if self.request.limit: 50 | self.cli_params.append( 51 | f'--limit={self.request.limit}', 52 | ) 53 | 54 | if self.request.offset: 55 | self.cli_params.append( 56 | f'--offset={self.request.offset}', 57 | ) 58 | 59 | def _prepare_json_cli_params(self): 60 | self.cli_params.append( 61 | f'--json', 62 | ) 63 | 64 | def _prepare_verbosity_cli_params(self): 65 | self.cli_params.extend([ 66 | "--verbose=debug", 67 | "--debug", 68 | ]) 69 | 70 | def _prepare_cli_params(self): 71 | self.cli_params = [] 72 | self._prepare_db_credentials_cli_params() 73 | self._prepare_dictionaries_cli_params() 74 | self._prepare_filters_cli_params() 75 | self._prepare_pagination_cli_params() 76 | self._prepare_json_cli_params() 77 | self._prepare_verbosity_cli_params() 78 | 79 | def _init_context(self): 80 | options = build_run_options(self.cli_params) 81 | self.context = Context(options) 82 | 83 | def _init_executor(self): 84 | self._executor = ViewDataMode(self.context, need_raw_data=True) 85 | 86 | def _format_output(self) -> ViewDataContent: 87 | def _format_data_to_str(records: List[List[str]]): 88 | return [[str(data) for data in record] for record in records] 89 | 90 | rows_before = _format_data_to_str(self._executor.raw_data) 91 | rows_after = _format_data_to_str(self._executor.data) 92 | 93 | return ViewDataContent( 94 | schema_name=self.request.schema_name, 95 | table_name=self.request.table_name, 96 | field_names=self._executor.raw_field_names, 97 | total_rows_count=self._executor.rows_count, 98 | rows_before=rows_before, 99 | rows_after=rows_after, 100 | ) 101 | 102 | async def run(self): 103 | await self._executor.run() 104 | await self._executor.get_rows_count() 105 | return self._format_output() 106 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_data_sql_condition_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": 
"customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 49 | } 50 | }, 51 | { 52 | "schema": "schm_mask_ext_exclude_2", 53 | "table": "card_numbers", 54 | "fields": { 55 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 57 | "usd": "anon_funcs.noise(\"usd\", 30)", 58 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 59 | } 60 | }, 61 | { 62 | "schema": "_SCHM.$complex#имя;@&* a'", 63 | "table": "_TBL.$complex#имя;@&* a'2", 64 | "fields": { 65 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 66 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 67 | } 68 | }, 69 | { 70 | "schema": "public", 71 | "table": "contracts", 72 | "fields": { 73 | "amount": "anon_funcs.noise(\"amount\", 10)", 74 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 75 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 76 | } 77 | }, 78 | { 79 | "schema": "public", 80 | "table": "inn_info", 81 | "fields": { 82 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 83 | } 84 | }, 85 | { 86 | "schema": "schm_customer", 87 | "table": "customer_company", 88 | "fields": { 89 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 90 | } 91 | }, 92 | { 93 | "schema": "schm_other_2", 94 | "table": "tbl_test_anon_functions", 95 | "fields": { 96 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 97 | } 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /docs/sql-functions-library.md: -------------------------------------------------------------------------------- 1 | # 📚 SQL Functions Library 2 | 3 | > [🏠 Home](../README.md#-documentation-index) | [🏗️ Init](operations/init.md) | [🔍 Scan](operations/scan.md) | [💾 Dump](operations/dump.md) | [🔬 View Fields](operations/view-fields.md) | [📊 View Data](operations/view-data.md) | [🗂️ Meta Dictionary](dicts/meta-dict-schema.md) | [🔐 Sensitive Dictionary](dicts/sens-dict-schema.md) 4 | 5 | ## Overview 6 | 7 | All functions are contained in the `init.sql` file. After run pg_anon in `init` mode, they will reside in the `anon_funcs` schema in the source database. 8 | If you want to write a new function, simply create it in the `anon_funcs` schema in your source database. 9 | 10 | List of some functions available for use in dictionaries: 11 | 12 | --- 13 | 14 | ## Functions list 15 | 16 | ### 1. noise 17 | Add noise to a real number: 18 | ```SQL 19 | SELECT anon_funcs.noise(100, 1.2); 20 | >> 123 21 | ``` 22 | 23 | ### 2. dnoise 24 | Add noise to a date or timestamp: 25 | ```SQL 26 | SELECT anon_funcs.dnoise('2020-02-02 10:10:10'::timestamp, interval '1 month'); 27 | >> 2020-03-02 10:10:10 28 | ``` 29 | 30 | ### 3. digest 31 | Hash a string value with a specified hash function: 32 | ```SQL 33 | SELECT anon_funcs.digest('text', 'salt', 'sha256'); 34 | >> '3353e....' 35 | ``` 36 | 37 | ### 4. partial 38 | Keep the first few characters (2nd argument) and the last few characters (4th argument) of the specified string, adding a constant (3rd argument) in between: 39 | ```SQL 40 | SELECT anon_funcs.partial('123456789', 1, '***', 3); 41 | >> 1***789 42 | ``` 43 | 44 | ### 5. 
45 | Mask an email address: 46 | ```SQL 47 | SELECT anon_funcs.partial_email('example@gmail.com'); 48 | >> ex*****@gm*****.com 49 | ``` 50 | 51 | ### 6. random_string 52 | Generate a random string of specified length: 53 | ```SQL 54 | SELECT anon_funcs.random_string(7); 55 | >> H3ZVL5P 56 | ``` 57 | 58 | ### 7. random_zip 59 | Generate a random ZIP code: 60 | ```SQL 61 | SELECT anon_funcs.random_zip(); 62 | >> 851467 63 | ``` 64 | 65 | ### 8. random_date_between 66 | Generate a random date and time within a specified range: 67 | ```SQL 68 | SELECT anon_funcs.random_date_between( 69 |     '2020-02-02 10:10:10'::timestamp, 70 |     '2022-02-05 10:10:10'::timestamp 71 | ); 72 | >> 2021-11-08 06:47:48.057 73 | ``` 74 | 75 | ### 9. random_date 76 | Generate a random date and time: 77 | ```SQL 78 | SELECT anon_funcs.random_date(); 79 | >> 1911-04-18 21:54:13.139 80 | ``` 81 | 82 | ### 10. random_int_between 83 | Generate a random integer within a specified range: 84 | ```SQL 85 | SELECT anon_funcs.random_int_between(100, 200); 86 | >> 159 87 | ``` 88 | 89 | ### 11. random_bigint_between 90 | Generate a random bigint within a specified range: 91 | ```SQL 92 | SELECT anon_funcs.random_bigint_between(6000000000, 7000000000); 93 | >> 6268278565 94 | ``` 95 | 96 | ### 12. random_phone 97 | Generate a random phone number: 98 | ```SQL 99 | SELECT anon_funcs.random_phone('+7'); 100 | >> +7297479867 101 | ``` 102 | 103 | ### 13. random_hash 104 | Generate a random hash using the specified function: 105 | ```SQL 106 | SELECT anon_funcs.random_hash('seed', 'sha512'); 107 | >> b972f895ebea9cf2f65e19abc151b8031926c4a332471dc5c40fab608950870d6dbddcd18c7e467563f9b527e63d4d13870e4961c0ff2a62f021827654ae51fd 108 | ``` 109 | 110 | ### 14. random_in 111 | Select a random element from an array: 112 | ```SQL 113 | SELECT anon_funcs.random_in(array['a', 'b', 'c']); 114 | >> a 115 | ``` 116 | 117 | ### 15. hex_to_int 118 | Convert a hexadecimal value to decimal: 119 | ```SQL 120 | SELECT anon_funcs.hex_to_int('8AB'); 121 | >> 2219 122 | ``` 123 | 124 | --- 125 | 126 | ## pgcrypto 127 | In addition to the existing functions in the `anon_funcs` schema, functions from the `pgcrypto` extension can also be used. 128 | ```sql 129 | CREATE EXTENSION IF NOT EXISTS pgcrypto; 130 | ``` 131 | 132 | Example of using encryption with base64 encoding to store the encrypted value in a text field: 133 | ```SQL 134 | SELECT encode((SELECT encrypt('data', 'password', 'bf')), 'base64'); 135 | >> cSMq9gb1vOw= 136 | 137 | SELECT decrypt( 138 |   ( 139 |     SELECT decode('cSMq9gb1vOw=', 'base64') 140 |   ), 'password', 'bf'); 141 | >> data 142 | ``` 143 | 144 | --- 145 | 146 | ## How to add your own functions 147 | You can also add new anonymization functions by adding them to the `init.sql` file and then running pg_anon in `init` mode.
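148 | 149 | For illustration, a minimal custom function might look like this (a hypothetical sketch: the function name `mask_last4` and its body are examples, not part of the shipped library): 150 | ```SQL 151 | -- Keep the last 4 characters of a value and mask the rest 152 | CREATE OR REPLACE FUNCTION anon_funcs.mask_last4(val TEXT) 153 | RETURNS TEXT AS $$ 154 |     SELECT repeat('*', greatest(length(val) - 4, 0)) || right(val, 4); 155 | $$ LANGUAGE SQL; 156 | ``` 157 | Once created, it can be referenced from a dictionary just like the built-in functions, e.g. `anon_funcs.mask_last4("%s")`.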
158 | -------------------------------------------------------------------------------- /rest_api/runners/direct/view_fields.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from pg_anon.cli import build_run_options 4 | from pg_anon.common.dto import PgAnonResult 5 | from pg_anon.context import Context 6 | from pg_anon.modes.view_fields import ViewFieldsMode 7 | from rest_api.constants import BASE_TEMP_DIR 8 | from rest_api.pydantic_models import ViewFieldsRequest, ViewFieldsContent 9 | from rest_api.utils import write_dictionary_contents 10 | 11 | 12 | class ViewFieldsRunner: 13 |     request: ViewFieldsRequest 14 |     cli_params: List[str] = None 15 |     result: PgAnonResult = None 16 |     _executor: Type[ViewFieldsMode] 17 | 18 |     def __init__(self, request: ViewFieldsRequest): 19 |         self.request = request 20 |         self._prepare_cli_params() 21 |         self._init_context() 22 |         self._init_executor() 23 | 24 |     def _prepare_db_credentials_cli_params(self): 25 |         self.cli_params.extend([ 26 |             f'--db-host={self.request.db_connection_params.host}', 27 |             f'--db-port={self.request.db_connection_params.port}', 28 |             f'--db-user={self.request.db_connection_params.user_login}', 29 |             f'--db-user-password={self.request.db_connection_params.user_password}', 30 |             f'--db-name={self.request.db_connection_params.db_name}', 31 |         ]) 32 | 33 |     def _prepare_dictionaries_cli_params(self): 34 |         self._input_sens_dict_file_names = write_dictionary_contents(self.request.sens_dict_contents, BASE_TEMP_DIR) 35 |         self.cli_params.append( 36 |             f"--prepared-sens-dict-file={','.join(self._input_sens_dict_file_names.keys())}" 37 |         ) 38 | 39 |     def _prepare_filters_cli_params(self): 40 |         if self.request.schema_name: 41 |             self.cli_params.append( 42 |                 f'--schema-name={self.request.schema_name}', 43 |             ) 44 | 45 |         if self.request.schema_mask: 46 |             self.cli_params.append( 47 |                 f'--schema-mask={self.request.schema_mask}', 48 |             ) 49 | 50 |         if self.request.table_name: 51 |             self.cli_params.append( 52 |                 f'--table-name={self.request.table_name}', 53 |             ) 54 | 55 |         if self.request.table_mask: 56 |             self.cli_params.append( 57 |                 f'--table-mask={self.request.table_mask}', 58 |             ) 59 | 60 |         if self.request.view_only_sensitive_fields: 61 |             self.cli_params.append( 62 |                 '--view-only-sensitive-fields', 63 |             ) 64 | 65 |     def _prepare_limit_cli_params(self): 66 |         if self.request.fields_limit_count: 67 |             self.cli_params.append( 68 |                 f'--fields-count={self.request.fields_limit_count}', 69 |             ) 70 | 71 |     def _prepare_json_cli_params(self): 72 |         self.cli_params.append( 73 |             '--json', 74 |         ) 75 | 76 |     def _prepare_verbosity_cli_params(self): 77 |         self.cli_params.extend([ 78 |             "--verbose=debug", 79 |             "--debug", 80 |         ]) 81 | 82 |     def _prepare_cli_params(self): 83 |         self.cli_params = [] 84 |         self._prepare_db_credentials_cli_params() 85 |         self._prepare_dictionaries_cli_params() 86 |         self._prepare_filters_cli_params() 87 |         self._prepare_limit_cli_params() 88 |         self._prepare_json_cli_params() 89 |         self._prepare_verbosity_cli_params() 90 | 91 |     def _init_context(self): 92 |         options = build_run_options(self.cli_params) 93 |         self.context = Context(options) 94 | 95 |     def _init_executor(self): 96 |         self._executor = ViewFieldsMode(self.context) 97 | 98 |     def _format_output(self) -> List[ViewFieldsContent]: 99 |         result = [] 100 |         for field in self._executor.fields: 101 |             dict_data = None 102 |             if field.dict_file_name != self._executor.empty_data_filler: 103 |                 dict_data =
self._input_sens_dict_file_names[field.dict_file_name] 104 | 105 | field_rule = None 106 | if field.rule != self._executor.empty_data_filler: 107 | field_rule = field.rule 108 | 109 | result.append( 110 | ViewFieldsContent( 111 | schema_name=field.nspname, 112 | table_name=field.relname, 113 | field_name=field.column_name, 114 | type=field.type, 115 | dict_data=dict_data, 116 | rule=field_rule, 117 | ) 118 | ) 119 | 120 | return result 121 | 122 | async def run(self): 123 | await self._executor.run() 124 | return self._format_output() 125 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_words_and_phrases_constants_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "public", 30 | "table": "tbl_constants", 31 | "fields": { 32 | "phrases_sens_2": "anon_funcs.digest(\"phrases_sens_2\", 'salt_word', 'md5')", 33 | "phrases_sens_1": "anon_funcs.digest(\"phrases_sens_1\", 'salt_word', 'md5')", 34 | "words_sens": "anon_funcs.digest(\"words_sens\", 'salt_word', 'md5')" 35 | } 36 | }, 37 | { 38 | "schema": "schm_customer", 39 | "table": "customer_manager", 40 | "fields": { 41 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_mask_ext_exclude_2", 46 | "table": "card_numbers", 47 | "fields": { 48 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 49 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 50 | "usd": "anon_funcs.noise(\"usd\", 30)", 51 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 52 | } 53 | }, 54 | { 55 | "schema": "_SCHM.$complex#имя;@&* a'", 56 | "table": "_TBL.$complex#имя;@&* a'3", 57 | "fields": { 58 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 59 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 60 | } 61 | }, 62 | { 63 | "schema": "_SCHM.$complex#имя;@&* a'", 64 | "table": "_TBL.$complex#имя;@&* a'2", 65 | "fields": { 66 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 67 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 68 | } 69 | }, 70 | { 71 | "schema": "schm_customer", 72 | "table": "customer_company", 73 | "fields": { 74 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 75 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 76 | } 77 | }, 78 | { 79 | "schema": "schm_mask_ext_exclude_2", 80 | "table": "other_ext_tbl_2", 81 | "fields": { 82 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')", 83 | "val_2": 
"anon_funcs.digest(\"val_2\", 'salt_word', 'md5')" 84 | } 85 | }, 86 | { 87 | "schema": "public", 88 | "table": "contracts", 89 | "fields": { 90 | "amount": "anon_funcs.noise(\"amount\", 10)", 91 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 92 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 93 | } 94 | }, 95 | { 96 | "schema": "public", 97 | "table": "inn_info", 98 | "fields": { 99 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 100 | } 101 | }, 102 | { 103 | "schema": "schm_other_2", 104 | "table": "tbl_test_anon_functions", 105 | "fields": { 106 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 107 | } 108 | } 109 | ] 110 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_default_func_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'by_default_func', 'sha256')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'by_default_func', 'sha256')", 24 | "amount": "anon_funcs.digest(\"amount\", 'by_default_func', 'sha256')", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'by_default_func', 'sha256')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'by_default_func', 'sha256')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'by_default_func', 'sha256')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": "customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'by_default_func', 'sha256')" 49 | } 50 | }, 51 | { 52 | "schema": "schm_mask_ext_exclude_2", 53 | "table": "card_numbers", 54 | "fields": { 55 | "val": "anon_funcs.digest(\"val\", 'by_default_func', 'sha256')", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'by_default_func', 'sha256')", 57 | "usd": "anon_funcs.digest(\"usd\", 'by_default_func', 'sha256')", 58 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'by_default_func', 'sha256')" 59 | } 60 | }, 61 | { 62 | "schema": "_SCHM.$complex#имя;@&* a'", 63 | "table": "_TBL.$complex#имя;@&* a'2", 64 | "fields": { 65 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 66 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 67 | } 68 | }, 69 | { 70 | "schema": "schm_other_3", 71 | "table": "data_types_test", 72 | "fields": { 73 | "field_type_int8": 
"anon_funcs.digest(\"field_type_int8\", 'by_default_func', 'sha256')" 74 | } 75 | }, 76 | { 77 | "schema": "schm_customer", 78 | "table": "customer_company", 79 | "fields": { 80 | "phone": "anon_funcs.digest(\"phone\", 'by_default_func', 'sha256')", 81 | "inn": "anon_funcs.digest(\"inn\", 'by_default_func', 'sha256')" 82 | } 83 | }, 84 | { 85 | "schema": "public", 86 | "table": "contracts", 87 | "fields": { 88 | "amount": "anon_funcs.digest(\"amount\", 'by_default_func', 'sha256')", 89 | "contract_expires": "anon_funcs.digest(\"contract_expires\", 'by_default_func', 'sha256')", 90 | "details": "anon_funcs.digest(\"details\", 'by_default_func', 'sha256')" 91 | } 92 | }, 93 | { 94 | "schema": "public", 95 | "table": "inn_info", 96 | "fields": { 97 | "inn": "anon_funcs.digest(\"inn\", 'by_default_func', 'sha256')" 98 | } 99 | }, 100 | { 101 | "schema": "schm_other_2", 102 | "table": "tbl_test_anon_functions", 103 | "fields": { 104 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'by_default_func', 'sha256')" 105 | } 106 | } 107 | ] 108 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_partial_constants_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_customer", 30 | "table": "customer_company", 31 | "fields": { 32 | "site": "anon_funcs.digest(\"site\", 'salt_word', 'md5')", 33 | "company_name": "anon_funcs.digest(\"company_name\", 'salt_word', 'md5')", 34 | "email": "anon_funcs.digest(\"email\", 'salt_word', 'md5')", 35 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 36 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 37 | } 38 | }, 39 | { 40 | "schema": "schm_customer", 41 | "table": "customer_manager", 42 | "fields": { 43 | "last_name": "anon_funcs.digest(\"last_name\", 'salt_word', 'md5')", 44 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 45 | "email": "anon_funcs.digest(\"email\", 'salt_word', 'md5')", 46 | "first_name": "anon_funcs.digest(\"first_name\", 'salt_word', 'md5')" 47 | } 48 | }, 49 | { 50 | "schema": "schm_mask_ext_exclude_2", 51 | "table": "other_ext_tbl_2", 52 | "fields": { 53 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 54 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 55 | } 56 | }, 57 | { 58 | "schema": "_SCHM.$complex#имя;@&* a'", 59 | "table": "_TBL.$complex#имя;@&* a'3", 60 | "fields": { 61 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 62 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 63 | } 64 | }, 65 | { 66 | 
"schema": "schm_mask_ext_exclude_2", 67 | "table": "card_numbers", 68 | "fields": { 69 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 70 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 71 | "usd": "anon_funcs.noise(\"usd\", 30)", 72 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 73 | } 74 | }, 75 | { 76 | "schema": "_SCHM.$complex#имя;@&* a'", 77 | "table": "_TBL.$complex#имя;@&* a'2", 78 | "fields": { 79 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 80 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 81 | } 82 | }, 83 | { 84 | "schema": "public", 85 | "table": "contracts", 86 | "fields": { 87 | "amount": "anon_funcs.noise(\"amount\", 10)", 88 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 89 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 90 | } 91 | }, 92 | { 93 | "schema": "public", 94 | "table": "inn_info", 95 | "fields": { 96 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 97 | } 98 | }, 99 | { 100 | "schema": "schm_other_2", 101 | "table": "tbl_test_anon_functions", 102 | "fields": { 103 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 104 | } 105 | } 106 | ] 107 | } -------------------------------------------------------------------------------- /docs/installation-and-configuring.md: -------------------------------------------------------------------------------- 1 | # 💽 Installation & Configuration 2 | > [🏠 Home](../README.md#-documentation-index) | [⚙️ How it works](how-it-works.md) | [💬 FAQ](faq.md) 3 | 4 | ## Before you install 5 | pg_anon provides 2 ways to run: **CLI** and **REST API** 6 | 7 | The REST API service is optional to install. This service is designed to integrate `pg_anon` functionality into any system or pipelines via HTTP requests. 8 | It works just as a thin wrapper around the CLI version of `pg_anon`. REST API calls prepare CLI parameters and run the CLI version of pg_anon in the background. 9 | 10 | It doesn’t keep state or store data in a database, so it can be scaled easily without extra setup. 11 | 12 | However, this means that the system that integrates pg_anon must implement its own storage for dictionaries, dump tasks, and restore tasks. 13 | 14 | > ⚠️ **Note** 15 | > 16 | > Not suitable for fully autonomous operation. 17 | > 18 | > All operation runs logs and info will be stored in the directory `/path_to_pg_anon/runs`. 19 | > All dumps will be stored in the directory `/path_to_pg_anon/output`. 20 | > If the REST API service is scaled, you must create a symlink to this directory on a shared disk. 21 | > This is required because restore operations also read dumps from `/path_to_pg_anon/output`. 22 | 23 | --- 24 | 25 | ## Linux 26 | 27 | 1. Install Python 3 if it is not installed: `sudo apt-get install python3.11` (for Ubuntu), `sudo yum install python311` (for Redhat/Centos) 28 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 29 | 3. Go to the project directory: `cd pg_anon` 30 | 4. Set up a virtual environment: 31 | - Install the virtual environment: `python3 -m venv venv` 32 | - Activate the virtual environment: `source venv/bin/activate` 33 | 5. Install the dependencies: `pip install -r requirements.txt` 34 | 6. Optional, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 35 | 36 | ## Windows 37 | 38 | 1. 
Install Python 3 if it is not installed: Download it from the official [Python website](https://www.python.org/downloads/) 39 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 40 | 3. Go to the project directory: `cd pg_anon` 41 | 4. Set up a virtual environment: 42 | - Create the virtual environment: `py -m venv venv` 43 | - Activate the virtual environment: `.\venv\Scripts\activate` 44 | 5. Install the dependencies: `pip install -r requirements.txt` 45 | 6. Optionally, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 46 | 47 | ## macOS 48 | 49 | 1. Install Python 3 if it is not installed: 50 | - Install [Homebrew](https://brew.sh/) 51 | - [`brew install python@3.11`](https://formulae.brew.sh/formula/python@3.11) 52 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 53 | 3. Go to the project directory: `cd pg_anon` 54 | 4. Set up a virtual environment: 55 | - Create the virtual environment: `python3 -m venv venv` 56 | - Activate the virtual environment: `source venv/bin/activate` 57 | 5. Install the dependencies: `pip install -r requirements.txt` 58 | 6. Optionally, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 59 | 60 | --- 61 | 62 | ## Configuring pg_anon 63 | 64 | To specify custom `pg_dump` and `pg_restore` utilities, use the `--pg-dump` and `--pg-restore` parameters. 65 | 66 | Advanced configuration is also available: 67 | - CLI: pass the `--config` run parameter 68 | - REST API: the config must be placed at `/path_to_pg_anon/config.yml` 69 | 70 | This parameter accepts a YAML file in the following format, where the `<pg_version>` placeholders stand for concrete PostgreSQL major versions: 71 | ```yaml 72 | pg-utils-versions: 73 | <pg_version>: 74 | pg_dump: "/path/to/<pg_version>/pg_dump" 75 | pg_restore: "/path/to/<pg_version>/pg_restore" 76 | <another_pg_version>: 77 | pg_dump: "/path/to/<another_pg_version>/pg_dump" 78 | pg_restore: "/path/to/<another_pg_version>/pg_restore" 79 | default: 80 | pg_dump: "/path/to/default_postgres_version/pg_dump" 81 | pg_restore: "/path/to/default_postgres_version/pg_restore" 82 | ``` 83 | 84 | For example, you can specify a configuration for PostgreSQL 15 and 17: 85 | 86 | ```yaml 87 | pg-utils-versions: 88 | 15: 89 | pg_dump: "/usr/lib/postgresql/15/bin/pg_dump" 90 | pg_restore: "/usr/lib/postgresql/15/bin/pg_restore" 91 | 17: 92 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 93 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 94 | default: 95 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 96 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 97 | ``` 98 | 99 | If the current PostgreSQL version does not match any version in this config, the utilities from the default section will be used. 100 | For example, `pg_anon` can be run with this config on Postgres 16. In this case, `pg_dump 17` and `pg_restore 17` will be used. 
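To make the version-matching behavior concrete, here is a minimal Python sketch of the lookup described above. It is illustrative only and is not pg_anon's actual implementation; the function name and the string-keyed config layout are assumptions:

```python
# Minimal sketch: resolve pg_dump/pg_restore paths for a server version,
# falling back to the "default" section when there is no exact match.
def resolve_pg_utils(config: dict, server_version: str) -> dict:
    versions = config["pg-utils-versions"]
    major = server_version.split(".")[0]  # e.g. "16.2" -> "16"
    return versions.get(major, versions["default"])

# With the example config above, a PostgreSQL 16 server resolves to the
# "default" entry, i.e. the version 17 utilities.
```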
101 | 102 | --- 103 | 104 | ## Running REST API 105 | Run the service with: 106 | ```sh 107 | python -m uvicorn rest_api.api:app --host 0.0.0.0 --port 8000 --workers=3 108 | ``` 109 | - Recommended worker count = `2 * CPU_CORES + 1` 110 | - The service's OpenAPI documentation will be available at http://0.0.0.0:8000/docs#/ 111 | - See also the [API documentation](api.md) 112 | -------------------------------------------------------------------------------- /docs/how-it-works.md: -------------------------------------------------------------------------------- 1 | # How it works 2 | > [🏠 Home](../README.md#-documentation-index) | [💬 FAQ](faq.md) 3 | 4 | ## Anonymization (masking) 5 | 6 | The diagram below illustrates how data is transferred from the **source DB** to the **target DB**. 7 | 8 | The source database contains sensitive information and is typically located in a production environment with strictly limited access. 9 | 10 | ![Dump-Resore-Terms.drawio.png](../images/Dump-Resore-Terms.drawio.png) 11 | 12 | A trusted administrator runs pg_anon with credentials for the **source DB**. 13 | Using the prepared and approved sensitive dictionary, pg_anon creates an anonymized dump in the specified directory. 14 | The dictionary must be created in advance and validated by the security team. 15 | 16 | The resulting dump directory is then transferred to the host of the target database. 17 | Compression during transfer is unnecessary because the dump files are already compressed. 18 | 19 | Once the directory is placed on the target host, the restore process is started using target database credentials. 20 | The target database must be created beforehand using `CREATE DATABASE` and must be empty. 21 | 22 | After a successful restore, the anonymized database is ready for development or testing. Any number of employees can safely use it without risking exposure of sensitive data. 23 | 24 | --- 25 | 26 | ## What does pg_anon do internally during dump and restore? The simplest representation. 27 | 28 | ### For example, we have data that we want to anonymize: 29 | 30 | 1. Create the `source` table: 31 | 32 | ```SQL 33 | create table users ( 34 | id bigserial, 35 | email text, 36 | login text 37 | ); 38 | 39 | -- Checking the contents of the source table 40 | select * from users; 41 | ``` 42 | ```output 43 | >> 44 | id | email | login 45 | ----+---------+------- 46 | ``` 47 | 48 | 2. Populating the `source` table: 49 | 50 | ```SQL 51 | insert into users (email, login) 52 | select 53 | 'user' || generate_series(1001, 1020) || '@example.com', 54 | 'user' || generate_series(1001, 1020); 55 | 56 | -- Checking the contents of the source table 57 | select * from users; 58 | ``` 59 | ```output 60 | >> 61 | id | email | login 62 | ----+----------------------+---------- 63 | 1 | user1001@example.com | user1001 64 | 2 | user1002@example.com | user1002 65 | ... 66 | ``` 67 | 68 | **The 'email' field contains `sensitive data`. We need to `anonymize` it.** 69 | 70 | 71 | ### What is the process of creating a dump with masking? 72 | 73 | 1. Data `dump` from the `source` table to a CSV file (without masking): 74 | 75 | ```SQL 76 | copy ( 77 | select * 78 | from users 79 | ) to '/tmp/users.csv' with csv; 80 | ``` 81 | ```output 82 | cat /tmp/users.csv 83 | >> 84 | 1,user1001@example.com,user1001 85 | 2,user1002@example.com,user1002 86 | ... 87 | ``` 88 | 89 | 2. 
`Masking` the contents of the `source` table: 90 | 91 | ```SQL 92 | select 93 | id, 94 | md5(email) || '@abc.com' as email, -- hashing the email (masking rule in prepared sens dict file) 95 | login 96 | from users; 97 | ``` 98 | ```output 99 | >> 100 | id | email | login 101 | ----+------------------------------------------+---------- 102 | 1 | 385513d80895c4c5e19c91d1df9eacae@abc.com | user1001 103 | 2 | 9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com | user1002 104 | ... 105 | ``` 106 | 107 | 3. Data `dump` from the `source` table to a CSV file (with `masking`): 108 | 109 | ```SQL 110 | copy ( 111 | select 112 | id, 113 | md5(email) || '@abc.com' as email, -- hashing the email (masking rule in prepared sens dict file) 114 | login 115 | from users 116 | ) to '/tmp/users_anonymized.csv' with csv; 117 | ``` 118 | ```output 119 | cat /tmp/users_anonymized.csv 120 | >> 121 | 1,385513d80895c4c5e19c91d1df9eacae@abc.com,user1001 122 | 2,9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com,user1002 123 | ... 124 | ``` 125 | 126 | **The `prepared sens dict file` contains masking rules such as hashing.** 127 | 128 | ### What is the process for restoring a masked dump? 129 | 130 | 1. Reproducing the structure. Creating the `target` table: 131 | 132 | ```SQL 133 | create table users_anonymized ( 134 | id bigserial, 135 | email text, 136 | login text 137 | ); 138 | 139 | -- Checking the contents of the target table 140 | select * from users_anonymized; 141 | ``` 142 | ```output 143 | >> 144 | id | email | login 145 | ----+---------+------- 146 | ``` 147 | 148 | 2. Loading data from the `source` table `dump` (CSV file) into the `target` table: 149 | 150 | ```SQL 151 | copy users_anonymized 152 | from '/tmp/users_anonymized.csv' 153 | with csv; 154 | 155 | -- Checking the contents of the target table 156 | select * from users_anonymized; 157 | ``` 158 | ```output 159 | >> 160 | id | email | login 161 | ----+------------------------------------------+---------- 162 | 1 | 385513d80895c4c5e19c91d1df9eacae@abc.com | user1001 163 | 2 | 9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com | user1002 164 | ... 
165 | ``` 166 | 167 | ### Differences between pg_anon's actual behavior and this simplified representation: 168 | - `pg_anon` operates on the entire database (not only one table) 169 | - `pg_anon` uses `.bin.gz` files to save data (not CSV) 170 | - Masking rules are provided to `pg_anon` via a `prepared sens dict file` 171 | 172 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "_SCHM.$complex#имя;@&* a'", 13 | "table": "_TBL.$complex#имя;@&* a'2", 14 | "fields": { 15 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "_SCHM.$complex#имя;@&* a'", 21 | "table": "_TBL.$complex#имя;@&* a'3", 22 | "fields": { 23 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 24 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 25 | } 26 | }, 27 | { 28 | "schema": "public", 29 | "table": "contracts", 30 | "fields": { 31 | "amount": "anon_funcs.noise(\"amount\", 10)", 32 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 33 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "public", 38 | "table": "inn_info", 39 | "fields": { 40 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 41 | } 42 | }, 43 | { 44 | "schema": "public", 45 | "table": "key_value", 46 | "fields": { 47 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')", 48 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')" 49 | } 50 | }, 51 | { 52 | "schema": "public", 53 | "table": "tbl_100", 54 | "fields": { 55 | "amount": "anon_funcs.noise(\"amount\", 30)", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 57 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 58 | } 59 | }, 60 | { 61 | "schema": "schm_customer", 62 | "table": "customer_company", 63 | "fields": { 64 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')", 65 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 66 | } 67 | }, 68 | { 69 | "schema": "schm_customer", 70 | "table": "customer_manager", 71 | "fields": { 72 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 73 | } 74 | }, 75 | { 76 | "schema": "schm_mask_ext_exclude_2", 77 | "table": "card_numbers", 78 | "fields": { 79 | "usd": "anon_funcs.noise(\"usd\", 30)", 80 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 81 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 82 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 83 | } 84 | }, 85 | { 86 | "schema": "schm_mask_ext_exclude_2", 87 | "table": "other_ext_tbl_2", 88 | "fields": { 89 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')", 90 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')" 91 | } 92 | }, 93 | { 94 | "schema": "schm_other_2", 95 | "table": 
"tbl_test_anon_functions", 96 | "fields": { 97 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 98 | } 99 | }, 100 | { 101 | "schema": "schm_other_4", 102 | "table": "partitioned_table", 103 | "fields": { 104 | "amount": "anon_funcs.noise(\"amount\", 10)" 105 | } 106 | }, 107 | { 108 | "schema": "schm_other_4", 109 | "table": "partitioned_table_2025_01", 110 | "fields": { 111 | "amount": "anon_funcs.noise(\"amount\", 10)" 112 | } 113 | }, 114 | { 115 | "schema": "schm_other_4", 116 | "table": "partitioned_table_2025_02", 117 | "fields": { 118 | "amount": "anon_funcs.noise(\"amount\", 10)" 119 | } 120 | }, 121 | { 122 | "schema": "schm_other_4", 123 | "table": "partitioned_table_2025_03", 124 | "fields": { 125 | "amount": "anon_funcs.noise(\"amount\", 10)" 126 | } 127 | }, 128 | { 129 | "schema": "schm_other_4", 130 | "table": "partitioned_table_default", 131 | "fields": { 132 | "amount": "anon_funcs.noise(\"amount\", 10)" 133 | } 134 | } 135 | ] 136 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_data_func_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": "customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 49 | "email": "anon_funcs.partial_email(\"email\")" 50 | } 51 | }, 52 | { 53 | "schema": "schm_mask_ext_exclude_2", 54 | "table": "card_numbers", 55 | "fields": { 56 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 57 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 58 | "usd": "anon_funcs.noise(\"usd\", 30)", 59 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 60 | } 61 | }, 62 | { 63 | "schema": "_SCHM.$complex#имя;@&* a'", 64 | "table": "_TBL.$complex#имя;@&* a'2", 65 | "fields": { 66 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 67 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 68 | } 69 | }, 70 | { 71 | "schema": "schm_customer", 72 | 
"table": "customer_company", 73 | "fields": { 74 | "email": "anon_funcs.partial_email(\"email\")", 75 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 76 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 77 | } 78 | }, 79 | { 80 | "schema": "public", 81 | "table": "contracts", 82 | "fields": { 83 | "amount": "anon_funcs.noise(\"amount\", 10)", 84 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 85 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 86 | } 87 | }, 88 | { 89 | "schema": "public", 90 | "table": "inn_info", 91 | "fields": { 92 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 93 | } 94 | }, 95 | { 96 | "schema": "schm_other_2", 97 | "table": "tbl_test_anon_functions", 98 | "fields": { 99 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 100 | } 101 | }, 102 | { 103 | "schema": "schm_other_4", 104 | "table": "partitioned_table", 105 | "fields": { 106 | "amount": "anon_funcs.noise(\"amount\", 10)" 107 | } 108 | }, 109 | { 110 | "schema": "schm_other_4", 111 | "table": "partitioned_table_2025_01", 112 | "fields": { 113 | "amount": "anon_funcs.noise(\"amount\", 10)" 114 | } 115 | }, 116 | { 117 | "schema": "schm_other_4", 118 | "table": "partitioned_table_2025_02", 119 | "fields": { 120 | "amount": "anon_funcs.noise(\"amount\", 10)" 121 | } 122 | }, 123 | { 124 | "schema": "schm_other_4", 125 | "table": "partitioned_table_2025_03", 126 | "fields": { 127 | "amount": "anon_funcs.noise(\"amount\", 10)" 128 | } 129 | }, 130 | { 131 | "schema": "schm_other_4", 132 | "table": "partitioned_table_default", 133 | "fields": { 134 | "amount": "anon_funcs.noise(\"amount\", 10)" 135 | } 136 | } 137 | ] 138 | } -------------------------------------------------------------------------------- /rest_api/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import shutil 4 | from collections import deque 5 | from pathlib import Path 6 | from typing import List, Optional, Dict, Union 7 | 8 | import aioprocessing 9 | 10 | from pg_anon.cli import run_pg_anon 11 | from pg_anon.common.dto import PgAnonResult 12 | from pg_anon.common.utils import validate_exists_mode, simple_slugify 13 | from rest_api.constants import DUMP_STORAGE_BASE_DIR 14 | from rest_api.pydantic_models import DictionaryContent, DictionaryMetadata 15 | 16 | 17 | def get_full_dump_path(dump_path: str) -> str: 18 | full_dump_path = Path(DUMP_STORAGE_BASE_DIR / dump_path.lstrip("/")).resolve() 19 | if not str(full_dump_path).startswith(str(DUMP_STORAGE_BASE_DIR)) or full_dump_path == DUMP_STORAGE_BASE_DIR: 20 | raise ValueError(f"Invalid path: {dump_path}") 21 | 22 | return str(full_dump_path) 23 | 24 | 25 | def write_dictionary_contents(dictionary_contents: List[DictionaryContent], base_dir: Path) -> Dict[str, DictionaryMetadata]: 26 | file_names = {} 27 | base_dir.mkdir(parents=True, exist_ok=True) 28 | 29 | for dictionary_content in dictionary_contents: 30 | file_name = (base_dir / simple_slugify(dictionary_content.name)).with_suffix('.py') 31 | with open(file_name, "w") as out_file: 32 | out_file.write(dictionary_content.content) 33 | 34 | file_names[str(file_name)] = DictionaryMetadata( 35 | name=dictionary_content.name, 36 | additional_info=dictionary_content.additional_info, 37 | ) 38 | 39 | return file_names 40 | 41 | 42 | def read_dictionary_contents(file_path: Union[str, Path]) -> str: 43 | with open(file_path, 
"r") as dictionary_file: 44 | data = dictionary_file.read() 45 | 46 | return data 47 | 48 | 49 | def read_json_file(file_path: Union[str, Path]) -> Dict: 50 | with open(file_path, "r") as file: 51 | data = json.loads(file.read()) 52 | 53 | return data 54 | 55 | 56 | def read_logs_from_tail(logs_path: Union[str, Path], lines_count: int) -> List[str]: 57 | def log_sort_key(file_path: Path): 58 | parts = file_path.name.split(".") 59 | try: 60 | return int(parts[-1]) 61 | except ValueError: 62 | return 0 63 | 64 | log_files = sorted(logs_path.glob("*"), key=log_sort_key) 65 | 66 | result_lines = deque(maxlen=lines_count) 67 | block_size = 1024 68 | for log_file in log_files: 69 | if len(result_lines) >= lines_count: 70 | break 71 | buffer = bytearray() 72 | 73 | with log_file.open("rb") as f: 74 | f.seek(0, 2) 75 | pointer = f.tell() 76 | 77 | while pointer > 0 and len(result_lines) < lines_count: 78 | read_size = min(block_size, pointer) 79 | pointer -= read_size 80 | f.seek(pointer) 81 | buffer[:0] = f.read(read_size) 82 | log_lines = buffer.split(b"\n") 83 | for idx, line in enumerate(reversed(log_lines[1:])): 84 | if idx == 0 and line == b"": 85 | continue 86 | 87 | result_lines.appendleft(line.decode("utf-8", errors="replace")) 88 | if len(result_lines) >= lines_count: 89 | break 90 | 91 | buffer = log_lines[0] 92 | 93 | if buffer and len(result_lines) < lines_count: 94 | result_lines.appendleft(buffer.decode("utf-8", errors="replace")) 95 | 96 | return list(result_lines) 97 | 98 | 99 | def delete_folder(folder_path: Path): 100 | try: 101 | shutil.rmtree(folder_path) 102 | print(f"Folder {folder_path} deleted successfully.") 103 | except Exception as e: 104 | print(f"Error deleting folder {folder_path}: {str(e)}") 105 | 106 | 107 | def run_pg_anon_subprocess_wrapper(queue: aioprocessing.AioQueue, cli_run_params: List[str]): 108 | loop = asyncio.new_event_loop() 109 | asyncio.set_event_loop(loop) 110 | 111 | try: 112 | # Выполняем асинхронную функцию внутри нового event loop 113 | result = loop.run_until_complete( 114 | run_pg_anon(cli_run_params) 115 | ) 116 | queue.put(result) 117 | except Exception as ex: 118 | print(ex) 119 | finally: 120 | queue.put(None) # Завершаем процесс 121 | queue.close() 122 | loop.close() 123 | 124 | 125 | async def run_pg_anon_worker(mode: str, operation_id: str, cli_run_params: List[str]) -> Optional[PgAnonResult]: 126 | if not validate_exists_mode(mode): 127 | raise ValueError(f'Invalid mode: {mode}') 128 | 129 | application_name_suffix = f'worker__{mode}__{operation_id}' 130 | cli_run_params.extend([ 131 | f'--mode={mode}', 132 | f'--application-name-suffix={application_name_suffix}', 133 | ]) 134 | 135 | queue = aioprocessing.AioQueue() 136 | 137 | p = aioprocessing.AioProcess( 138 | name=f"pg_anon_{application_name_suffix}", 139 | target=run_pg_anon_subprocess_wrapper, 140 | args=(queue, cli_run_params), 141 | ) 142 | p.start() 143 | 144 | result = None 145 | while True: 146 | coro_result = await queue.coro_get() 147 | if coro_result is None: 148 | break 149 | result = coro_result 150 | await p.coro_join() 151 | 152 | return result 153 | 154 | 155 | def normalize_headers(headers: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: 156 | if not headers: 157 | return None 158 | 159 | headers = {k.lower(): v for k, v in headers.items()} 160 | headers.setdefault('content-type', 'application/json') 161 | return headers 162 | -------------------------------------------------------------------------------- /docs/operations/view-data.md: 
-------------------------------------------------------------------------------- 1 | # 📊 View Data 2 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | ## Overview 5 | 6 | This mode displays anonymized table data without creating a dump. 7 | 8 | ## Prerequisites 9 | - The `anon_funcs` schema with anonymization functions must already exist. See [init mode](init.md). 10 | - A sensitive dictionary containing data about database fields and their anonymization rules must be prepared beforehand. See [create-dict (scan) mode](scan.md). 11 | 12 | ## Run example 13 | 14 | ```commandline 15 | python pg_anon.py --mode=view-data \ 16 | --db-host=127.0.0.1 \ 17 | --db-user=postgres \ 18 | --db-user-password=postgres \ 19 | --db-name=source_db \ 20 | --prepared-sens-dict-file=sens_dict.py \ 21 | --schema-name=public \ 22 | --table-name=users \ 23 | --limit=10 \ 24 | --offset=0 25 | ``` 26 | 27 | --- 28 | 29 | ## Options 30 | 31 | ### Common pg_anon options: 32 | 33 | | Option | Required | Description | 34 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 35 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 36 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 37 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 38 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 39 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 40 | 41 | 42 | ### Database configuration options: 43 | 44 | | Option | Required | Description | 45 | |----------------------|----------|---------------------------------------------------------------------| 46 | | `--db-host` | Yes | Database host. | 47 | | `--db-port` | Yes | Database port. | 48 | | `--db-name` | Yes | Database name. | 49 | | `--db-user` | Yes | Database user. | 50 | | `--db-user-password` | No | Database user password. | 51 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 52 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 53 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 54 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 55 | 56 | 57 | ### View-data mode options: 58 | 59 | | Option | Required | Description | 60 | |-----------------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 61 | | `--prepared-sens-dict-file` | Yes | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), generated by the [create-dict (scan) mode](scan.md) or created manually. If rules collide, the rules from the last file in the list take priority. | 62 | | `--schema-name` | Yes | Schema name. | 63 | | `--table-name` | Yes | Table name. | 64 | | `--limit` | No | Number of rows to display. 
(default: 100) | 65 | | `--offset` | No | Row offset for pagination. (default: 0) | 66 | | `--json` | No | Outputs results in JSON format instead of a table. | 67 | -------------------------------------------------------------------------------- /pg_anon/app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pg_anon.common.constants import ANON_UTILS_DB_SCHEMA_NAME, SAVED_RUN_STATUS_FILE_NAME, SAVED_RUN_OPTIONS_FILE_NAME 4 | from pg_anon.common.db_utils import check_anon_utils_db_schema_exists, get_pg_version 5 | from pg_anon.common.dto import PgAnonResult, RunOptions 6 | from pg_anon.common.enums import AnonMode 7 | from pg_anon.common.utils import check_pg_util, exception_helper, save_json_file 8 | from pg_anon.context import Context 9 | from pg_anon.modes.create_dict import CreateDictMode 10 | from pg_anon.modes.dump import DumpMode 11 | from pg_anon.modes.initialization import InitMode 12 | from pg_anon.modes.restore import RestoreMode 13 | from pg_anon.modes.view_data import ViewDataMode 14 | from pg_anon.modes.view_fields import ViewFieldsMode 15 | from pg_anon.version import __version__ 16 | 17 | 18 | class PgAnonApp: 19 | 20 | def __init__(self, options: RunOptions): 21 | run_dir = Path(options.run_dir) 22 | run_dir.mkdir(parents=True, exist_ok=True) 23 | save_json_file(run_dir / SAVED_RUN_OPTIONS_FILE_NAME, options.to_dict()) 24 | 25 | self.context = Context(options) 26 | self.result = PgAnonResult() 27 | self._skip_check_postgres_utils = self.context.options.mode in ( 28 | AnonMode.INIT, 29 | AnonMode.CREATE_DICT, 30 | AnonMode.VIEW_FIELDS, 31 | AnonMode.VIEW_DATA, 32 | ) 33 | 34 | def _bootstrap(self): 35 | self.context.logger.info( 36 | "============> Started pg_anon (v%s) in mode: %s" 37 | % (__version__, self.context.options.mode.value) 38 | ) 39 | if self.context.options.debug: 40 | params_info = "#--------------- Run options\n" 41 | params_info += self.context.options.to_json() 42 | params_info += "\n#-----------------------------------" 43 | self.context.logger.debug(params_info) 44 | 45 | async def _set_postgres_utils(self): 46 | pg_version = await get_pg_version(self.context.connection_params, server_settings=self.context.server_settings) 47 | self.context.set_postgres_version(pg_version) 48 | self.context.logger.info(f"Target DB version: {pg_version}") 49 | self.context.logger.info(f"pg_dump path: {self.context.pg_dump}") 50 | self.context.logger.info(f"pg_restore path: {self.context.pg_restore}") 51 | 52 | def _check_postgres_utils(self): 53 | if self._skip_check_postgres_utils: 54 | self.context.logger.info("Skipping check that postgres utilities exist") 55 | return 56 | 57 | self.context.logger.info("Checking that postgres utilities exist") 58 | 59 | pg_dump_exists = check_pg_util(self.context, self.context.pg_dump, "pg_dump") 60 | pg_restore_exists = check_pg_util(self.context, self.context.pg_restore, "pg_restore") 61 | 62 | if not pg_dump_exists or not pg_restore_exists: 63 | raise RuntimeError('pg_dump or pg_restore not found') 64 | 65 | async def _check_initialization(self): 66 | if self.context.options.mode in ( 67 | AnonMode.CREATE_DICT, 68 | AnonMode.DUMP, 69 | AnonMode.SYNC_DATA_DUMP, 70 | AnonMode.SYNC_STRUCT_DUMP, 71 | ): 72 | anon_utils_schema_exists = await check_anon_utils_db_schema_exists( 73 | connection_params=self.context.connection_params, 74 | server_settings=self.context.server_settings 75 | ) 76 | if not anon_utils_schema_exists: 77 | raise ValueError( 78 | f"Schema 
'{ANON_UTILS_DB_SCHEMA_NAME}' does not exist. You need to run init first, via '--mode=init'" 79 | ) 80 | 81 | def _get_mode(self): 82 | if self.context.options.mode in (AnonMode.DUMP, AnonMode.SYNC_DATA_DUMP, AnonMode.SYNC_STRUCT_DUMP): 83 | return DumpMode(self.context) 84 | 85 | if self.context.options.mode in (AnonMode.RESTORE, AnonMode.SYNC_DATA_RESTORE, AnonMode.SYNC_STRUCT_RESTORE): 86 | return RestoreMode(self.context) 87 | 88 | if self.context.options.mode == AnonMode.INIT: 89 | return InitMode(self.context) 90 | 91 | if self.context.options.mode == AnonMode.CREATE_DICT: 92 | return CreateDictMode(self.context) 93 | 94 | if self.context.options.mode == AnonMode.VIEW_FIELDS: 95 | return ViewFieldsMode(self.context) 96 | 97 | if self.context.options.mode == AnonMode.VIEW_DATA: 98 | return ViewDataMode(self.context) 99 | 100 | raise RuntimeError("Unknown mode: " + self.context.options.mode.value) 101 | 102 | async def run(self) -> PgAnonResult: 103 | self._bootstrap() 104 | self.result.start(self.context.options) 105 | try: 106 | await self._set_postgres_utils() 107 | self._check_postgres_utils() 108 | await self._check_initialization() 109 | 110 | mode = self._get_mode() 111 | self.result.result_data = await mode.run() 112 | self.result.complete() 113 | except Exception as exc: 114 | self.context.logger.error(exception_helper(show_traceback=True)) 115 | self.result.fail(exc) 116 | finally: 117 | self.context.logger.info( 118 | f"<============ Finished pg_anon in mode: {self.context.options.mode.value}, " 119 | f"result_code = {self.result.result_code.value}, " 120 | f"elapsed: {self.result.elapsed} sec" 121 | ) 122 | save_json_file(Path(self.context.options.run_dir) / SAVED_RUN_STATUS_FILE_NAME, self.result.to_dict()) 123 | 124 | return self.result 125 | 126 | async def validate_target_tables(self) -> PgAnonResult: 127 | result = PgAnonResult() 128 | result.start(self.context.options) 129 | 130 | try: 131 | await RestoreMode.validate_restore(self.context) 132 | result.complete() 133 | except Exception: 134 | self.context.logger.error(exception_helper(show_traceback=True)) 135 | result.fail() 136 | finally: 137 | return result 138 | -------------------------------------------------------------------------------- /pg_anon/modes/view_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict, Optional 3 | 4 | from prettytable import PrettyTable, SINGLE_BORDER 5 | 6 | from pg_anon.common.db_utils import get_fields_list, create_connection, get_rows_count, get_dump_query 7 | from pg_anon.common.utils import exception_helper, get_dict_rule_for_table 8 | from pg_anon.context import Context 9 | 10 | 11 | class ViewDataMode: 12 | context: Context 13 | _limit: int 14 | _offset: int 15 | _schema_name: str 16 | _table_name: str 17 | table_rule: Dict 18 | raw_field_names: List[str] = None 19 | field_names: List[str] = None 20 | rows_count: int = 0 21 | query: str 22 | data: List[List[str]] = None 23 | raw_query: Optional[str] = None 24 | raw_data: Optional[List[List[str]]] = None 25 | table: PrettyTable = None 26 | _need_raw_data: bool = False 27 | 28 | def __init__(self, context: Context, need_raw_data: bool = False): 29 | self.context = context 30 | self._limit = context.options.limit 31 | self._offset = context.options.offset 32 | self._schema_name = context.options.schema_name 33 | self._table_name = context.options.table_name 34 | self.field_names = [] 35 | self.raw_field_names = [] 36 | self.data = [] 37 | 
self.raw_data = [] 38 | self._need_raw_data = need_raw_data 39 | 40 | async def _get_fields_for_view(self) -> None: 41 | """ 42 | Get field names and all fields for view-data mode 43 | """ 44 | fields_list = await get_fields_list( 45 | connection_params=self.context.connection_params, 46 | server_settings=self.context.server_settings, 47 | table_schema=self._schema_name, 48 | table_name=self._table_name 49 | ) 50 | for field in fields_list: 51 | field_name = field["column_name"] 52 | self.raw_field_names.append(field_name) 53 | 54 | if self.table_rule and field_name in self.table_rule["fields"]: 55 | self.field_names.append('* ' + field_name) 56 | else: 57 | self.field_names.append(field_name) 58 | 59 | async def _get_data_for_view(self, query: str) -> List[List[str]]: 60 | db_conn = await create_connection(self.context.connection_params, server_settings=self.context.server_settings) 61 | table_result = await db_conn.fetch(query) 62 | await db_conn.close() 63 | 64 | data = [[record[field_name] for field_name in self.raw_field_names] for record in table_result] 65 | return data 66 | 67 | async def get_rows_count(self): 68 | self.rows_count = await get_rows_count( 69 | connection_params=self.context.connection_params, 70 | server_settings=self.context.server_settings, 71 | schema_name=self._schema_name, 72 | table_name=self._table_name 73 | ) 74 | return self.rows_count 75 | 76 | def _prepare_table(self) -> None: 77 | self.table = PrettyTable(self.field_names) 78 | self.table.set_style(SINGLE_BORDER) 79 | for row in self.data: 80 | self.table.add_row(row) 81 | 82 | def _prepare_json(self) -> None: 83 | result = {field: [] for field in self.field_names} 84 | 85 | for field_values in self.data: 86 | for field, value in zip(self.field_names, field_values): 87 | result[field].append(value) 88 | 89 | self.json = json.dumps(result, default=lambda x: str(x), ensure_ascii=False) 90 | 91 | async def _output_fields(self) -> None: 92 | 93 | await self._get_fields_for_view() 94 | if not self.field_names: 95 | raise ValueError("No field names for view!") 96 | 97 | self.data = await self._get_data_for_view(self.query) 98 | if not self.data: 99 | raise ValueError("No data found for view!") 100 | 101 | if self._need_raw_data: 102 | self.raw_data = await self._get_data_for_view(self.raw_query) 103 | 104 | if self.context.options.json: 105 | self._prepare_json() 106 | print(self.json) 107 | else: 108 | self._prepare_table() 109 | print(self.table) 110 | 111 | async def _prepare_queries(self): 112 | 113 | query_without_limit = await get_dump_query( 114 | ctx=self.context, 115 | table_schema=self._schema_name, 116 | table_name=self._table_name, 117 | table_rule=self.table_rule, 118 | nulls_last=True 119 | ) 120 | self.query = query_without_limit + f" LIMIT {self._limit} OFFSET {self._offset}" 121 | 122 | if self._need_raw_data: 123 | query_without_limit = await get_dump_query( 124 | ctx=self.context, 125 | table_schema=self._schema_name, 126 | table_name=self._table_name, 127 | table_rule=None, 128 | nulls_last=True 129 | ) 130 | self.raw_query = query_without_limit + f" LIMIT {self._limit} OFFSET {self._offset}" 131 | 132 | async def run(self) -> None: 133 | self.context.logger.info("-------------> Started view_data mode") 134 | 135 | try: 136 | if self._limit < 1: 137 | raise ValueError("Processing fields limit must be greater than zero!") 138 | if self._offset < 0: 139 | raise ValueError("Processing fields offset must be greater than or equal to zero!") 140 | 141 | self.context.read_prepared_dict() 
142 | self.table_rule = get_dict_rule_for_table( 143 | dictionary_rules=self.context.prepared_dictionary_obj["dictionary"], 144 | schema=self._schema_name, 145 | table=self._table_name, 146 | ) 147 | 148 | await self._prepare_queries() 149 | await self._output_fields() 150 | 151 | self.context.logger.info("<------------- Finished view_data mode") 152 | except Exception as ex: 153 | self.context.logger.error("<------------- view_data failed\n" + exception_helper()) 154 | raise ex 155 | -------------------------------------------------------------------------------- /docs/operations/view-fields.md: -------------------------------------------------------------------------------- 1 | # 🔬 View Fields 2 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | ## Overview 5 | 6 | This mode displays how database fields match the anonymization rules. 7 | 8 | ## Prerequisites 9 | - The `anon_funcs` schema with anonymization functions must already exist. See [init mode](init.md). 10 | - A sensitive dictionary containing data about database fields and their anonymization rules must be prepared beforehand. See [create-dict (scan) mode](scan.md). 11 | 12 | ## Run example 13 | 14 | ```commandline 15 | python pg_anon.py --mode=view-fields \ 16 | --db-host=127.0.0.1 \ 17 | --db-user=postgres \ 18 | --db-user-password=postgres \ 19 | --db-name=source_db \ 20 | --prepared-sens-dict-file=sens_dict.py 21 | ``` 22 | 23 | > ⚠️ **Note** 24 | > 25 | > This mode can process only a limited number of fields when no filters are applied, for performance reasons. 26 | > 27 | > This limit is controlled by the `--fields-count` option (default: 5000 fields). 28 | > To avoid hitting this limit, increase the `--fields-count` value or use filter options: `--schema-name`, `--schema-mask`, `--table-name`, `--table-mask`. 29 | 30 | --- 31 | 32 | ## Options 33 | 34 | ### Common pg_anon options: 35 | 36 | | Option | Required | Description | 37 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 38 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 39 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 40 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 41 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 42 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 43 | 44 | 45 | ### Database configuration options: 46 | 47 | | Option | Required | Description | 48 | |----------------------|----------|---------------------------------------------------------------------| 49 | | `--db-host` | Yes | Database host. | 50 | | `--db-port` | Yes | Database port. | 51 | | `--db-name` | Yes | Database name. | 52 | | `--db-user` | Yes | Database user. | 53 | | `--db-user-password` | No | Database user password. | 54 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 55 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. 
| 56 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 57 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 58 | 59 | 60 | ### View-fields mode options: 61 | 62 | | Option | Required | Description | 63 | |--------------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 64 | | `--prepared-sens-dict-file` | Yes | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), generated by the [create-dict (scan) mode](scan.md) or created manually. If rules collide, the rules from the last file in the list take priority. | 65 | | `--view-only-sensitive-fields` | No | Displays only sensitive fields. (default: all fields) | 66 | | `--fields-count` | No | Maximum number of fields to process for output. (default: 5000) | 67 | | `--schema-name` | No | Filter by schema name. | 68 | | `--schema-mask` | No | Filter by schema name using a regular expression. | 69 | | `--table-name` | No | Filter by table name. | 70 | | `--table-mask` | No | Filter by table name using a regular expression. | 71 | | `--json` | No | Outputs results in JSON format instead of a table. | 72 | -------------------------------------------------------------------------------- /docs/debugging.md: -------------------------------------------------------------------------------- 1 | # 🛠️ Debug stages for anonymization process 2 | 3 | > [🏠 Home](../README.md#-documentation-index) | [💾 Dump](operations/dump.md) | [📂 Restore](operations/restore.md) | [⚙️ How it works](how-it-works.md) | [💬 FAQ](faq.md) 4 | 5 | ## Overview 6 | 7 | The debug stages allow you to test and troubleshoot the anonymization workflow without performing a full dump or restore, saving significant time and resources. 8 | 9 | Each stage emulates a specific part of the anonymization pipeline: 10 | 11 | - **Stage 1 — Validate Dict** 12 | 13 | Validates the sensitive dictionary and checks SQL logic without exporting any data. 14 | 15 | - **Stage 2 — Validate Data** 16 | 17 | Performs anonymization checks on real data with a limited sample (LIMIT 100) using a prepared database schema. 18 | 19 | - **Stage 3 — Validate Full** 20 | 21 | Executes the full anonymization logic with data sampling (LIMIT 100), but without requiring a prepared database. 22 | 23 | These stages help you quickly debug rules, anonymization functions, SQL conditions, and dictionary configuration before running a full anonymized dump/restore process. 24 | 25 | --- 26 | 27 | ## Stage 1: Validate dict 28 | 29 | This stage validates the dictionary, shows the tables, and runs SQL queries without exporting data to disk or to a database. 30 | If the program runs without errors, the stage is passed. 
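Since this stage writes nothing, success is signalled by the process exit code alone. A minimal shell check (illustrative only; it reuses the same command as the example below):

```commandline
python pg_anon.py --mode=dump \
  --db-host=127.0.0.1 \
  --db-user=postgres \
  --db-user-password=postgres \
  --db-name=test_source_db \
  --output-dir=test_dbg_stages \
  --prepared-sens-dict-file=test_dbg_stages.py \
  --clear-output-dir \
  --dbg-stage-1-validate-dict \
  && echo "stage 1 passed" || echo "stage 1 failed"
```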
31 | 32 | ![dbg-stage-1.png](../images/dbg-stage-1.png) 33 | 34 | ```commandline 35 | python pg_anon.py --mode=dump \ 36 | --db-host=127.0.0.1 \ 37 | --db-user=postgres \ 38 | --db-user-password=postgres \ 39 | --db-name=test_source_db \ 40 | --output-dir=test_dbg_stages \ 41 | --prepared-sens-dict-file=test_dbg_stages.py \ 42 | --clear-output-dir \ 43 | --verbose=debug \ 44 | --debug \ 45 | --dbg-stage-1-validate-dict 46 | ``` 47 | --- 48 | 49 | ## Stage 2: Validate data 50 | 51 | This stage validates data, shows the tables, and runs SQL queries with data export limited to 100 rows, against a prepared database. 52 | It requires a database containing the full structure (pre-data section only) described in `--prepared-sens-dict-file`. 53 | 54 | 55 | 56 | - If you want to create a database with the required structure, just run: 57 | 58 | One-time structure dump: 59 | 60 | ```commandline 61 | python pg_anon.py --mode=sync-struct-dump \ 62 | --db-host=127.0.0.1 \ 63 | --db-user=postgres \ 64 | --db-user-password=postgres \ 65 | --db-name=test_source_db \ 66 | --output-dir=test_stage_2 \ 67 | --prepared-sens-dict-file=test_dbg_stages.py \ 68 | --clear-output-dir \ 69 | --verbose=debug \ 70 | --debug \ 71 | --dbg-stage-3-validate-full 72 | ``` 73 | 74 | Then restore the structure as many times as you need: 75 | 76 | ```commandline 77 | su - postgres -c "psql -U postgres -d postgres -c \"DROP DATABASE IF EXISTS test_target_db_7\"" 78 | su - postgres -c "psql -U postgres -d postgres -c \"CREATE DATABASE test_target_db_7\"" 79 | python pg_anon.py --mode=sync-struct-restore \ 80 | --db-host=127.0.0.1 \ 81 | --db-user=postgres \ 82 | --db-user-password=postgres \ 83 | --db-name=test_target_db_7 \ 84 | --input-dir=test_stage_2 \ 85 | --verbose=debug \ 86 | --debug 87 | ``` 88 | 89 | - Validate data stage in dump: 90 | 91 | ![dbg-stage-2.png](../images/dbg-stage-2.png) 92 | 93 | ```commandline 94 | python pg_anon.py --mode=dump \ 95 | --db-host=127.0.0.1 \ 96 | --db-user=postgres \ 97 | --db-user-password=postgres \ 98 | --db-name=test_source_db \ 99 | --output-dir=test_dbg_stages \ 100 | --prepared-sens-dict-file=test_dbg_stages.py \ 101 | --clear-output-dir \ 102 | --verbose=debug \ 103 | --debug \ 104 | --dbg-stage-2-validate-data 105 | ``` 106 | 107 | - Validate data stage in data-restore: 108 | 109 | ```commandline 110 | python pg_anon.py --mode=sync-data-restore \ 111 | --db-host=127.0.0.1 \ 112 | --db-user=postgres \ 113 | --db-user-password=postgres \ 114 | --db-name=test_target_db_7 \ 115 | --input-dir=test_dbg_stages \ 116 | --verbose=debug \ 117 | --debug 118 | 119 | # And for example view all data in every table: 120 | su - postgres -c "psql -U postgres -d test_target_db_7 -c \"SELECT * FROM public.contracts\"" 121 | ``` 122 | --- 123 | 124 | ## Stage 3: Validate full 125 | 126 | ![dbg-stage-3.png](../images/dbg-stage-3.png) 127 | 128 | Runs the full logic with `LIMIT 100` in the SQL queries. 
In this stage you don't need a prepared database; just run: 129 | 130 | ```commandline 131 | su - postgres -c "psql -U postgres -d postgres -c \"DROP DATABASE IF EXISTS test_target_db_8\"" 132 | su - postgres -c "psql -U postgres -d postgres -c \"CREATE DATABASE test_target_db_8\"" 133 | ``` 134 | 135 | - Validate full stage in dump: 136 | 137 | ```commandline 138 | python pg_anon.py --mode=dump \ 139 | --db-host=127.0.0.1 \ 140 | --db-user=postgres \ 141 | --db-user-password=postgres \ 142 | --db-name=test_source_db \ 143 | --output-dir=test_dbg_stages \ 144 | --prepared-sens-dict-file=test_dbg_stages.py \ 145 | --clear-output-dir \ 146 | --verbose=debug \ 147 | --debug \ 148 | --dbg-stage-3-validate-full 149 | ``` 150 | 151 | - Validate full stage in restore: 152 | 153 | ```commandline 154 | python pg_anon.py --mode=restore \ 155 | --db-host=127.0.0.1 \ 156 | --db-user=postgres \ 157 | --db-user-password=postgres \ 158 | --db-name=test_target_db_8 \ 159 | --input-dir=test_dbg_stages \ 160 | --verbose=debug \ 161 | --debug 162 | 163 | # And for example view all data in every table: 164 | su - postgres -c "psql -U postgres -d test_target_db_8 -c \"SELECT * FROM public.contracts\"" 165 | ``` 166 | -------------------------------------------------------------------------------- /docs/operations/scan.md: -------------------------------------------------------------------------------- 1 | # 🔍 Scan 2 | > [🏠 Home](../../README.md#-operations) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | --- 5 | 6 | ## Overview 7 | The **scan** operation analyzes your PostgreSQL database to detect potentially sensitive data and generate dictionary files. 8 | These files are used for dumps and for repeated scans. 9 | 10 | --- 11 | 12 | ## Prerequisites: 13 | - Manually created [meta-dictionary](../dicts/meta-dict-schema.md) 14 | - The `init` mode has already been run for the source database 15 | 16 | ## Usage: 17 | To scan the source database and create a dictionary for dumping, run pg_anon in `create-dict` mode. 18 | You need: 19 | - a **meta-dictionary** file with scan rules. 20 | 21 | ```commandline 22 | python pg_anon.py --mode=create-dict \ 23 | --db-user=postgres \ 24 | --db-user-password=postgres \ 25 | --db-name=test_source_db \ 26 | --meta-dict-file=test_meta_dict.py \ 27 | --prepared-sens-dict-file=test_sens_dict_output_previous_use.py \ 28 | --prepared-no-sens-dict-file=test_no_sens_dict_output_previous_use.py \ 29 | --output-sens-dict-file=test_sens_dict_output.py \ 30 | --output-no-sens-dict-file=test_no_sens_dict_output.py \ 31 | --processes=2 32 | ``` 33 | 34 | --- 35 | 36 | ## Options 37 | 38 | ### Common pg_anon options: 39 | 40 | | Option | Required | Description | 41 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 42 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 43 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 44 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 45 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 46 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. 
(default: false) | 47 | 48 | 49 | ### Database configuration options: 50 | 51 | | Option | Required | Description | 52 | |----------------------|----------|---------------------------------------------------------------------| 53 | | `--db-host` | Yes | Database host. | 54 | | `--db-port` | Yes | Database port. | 55 | | `--db-name` | Yes | Database name. | 56 | | `--db-user` | Yes | Database user. | 57 | | `--db-user-password` | No | Database user password. | 58 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 59 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 60 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 61 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 62 | 63 | 64 | ### Create-dict (scan) mode options 65 | 66 | | Option | Required | Description | 67 | |--------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 68 | | `--meta-dict-file` | Yes | Input file or list of files containing the [meta-dictionary](../dicts/meta-dict-schema.md), prepared manually. If rules collide, the rules from the last file in the list take priority. | 69 | | `--prepared-sens-dict-file` | No | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), obtained from a previous run via the `--output-sens-dict-file` option or prepared manually. If rules collide, the rules from the last file in the list take priority. | 70 | | `--prepared-no-sens-dict-file` | No | Input file or list of files containing the [not sensitive dictionary](../dicts/non-sens-dict-schema.md), obtained from a previous run via the `--output-no-sens-dict-file` option or prepared manually. If rules collide, the rules from the last file in the list take priority. | 71 | | `--output-sens-dict-file` | Yes | Output file path for saving the sensitive dictionary. | 72 | | `--output-no-sens-dict-file` | No | Output file path for saving the not sensitive dictionary. | 73 | | `--scan-mode` | No | Defines whether to scan all data or only part of it: "full" or "partial" (default: "partial"). | 74 | | `--scan-partial-rows` | No | In `--scan-mode=partial`, defines the number of rows to scan (default: 10000). The actual row count can be smaller after reducing to unique values. | 75 | | `--save-dicts` | No | Duplicates all input and output dictionaries into the `runs` directory. This can be useful for debugging or integration purposes. 
| 76 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_no_sens_dict_result_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "no_sens_dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'2", 6 | "fields": [ 7 | "id" 8 | ] 9 | }, 10 | { 11 | "schema": "_SCHM.$complex#имя;@&* a'", 12 | "table": "_TBL.$complex#имя;@&* a'3", 13 | "fields": [ 14 | "id" 15 | ] 16 | }, 17 | { 18 | "schema": "columnar_internal", 19 | "table": "tbl_200", 20 | "fields": [ 21 | "id", 22 | "val", 23 | "val_skip" 24 | ] 25 | }, 26 | { 27 | "schema": "public", 28 | "table": "contracts", 29 | "fields": [ 30 | "customer_company_id", 31 | "customer_manager_id", 32 | "status" 33 | ] 34 | }, 35 | { 36 | "schema": "public", 37 | "table": "inn_info", 38 | "fields": [ 39 | "company_info" 40 | ] 41 | }, 42 | { 43 | "schema": "public", 44 | "table": "tbl_100", 45 | "fields": [ 46 | "num_val", 47 | "val", 48 | "val_skip" 49 | ] 50 | }, 51 | { 52 | "schema": "public", 53 | "table": "tbl_constants", 54 | "fields": [ 55 | "phrases_no_sens_1", 56 | "phrases_no_sens_2", 57 | "phrases_sens_1", 58 | "phrases_sens_2", 59 | "words_no_sens_1", 60 | "words_no_sens_2", 61 | "words_sens" 62 | ] 63 | }, 64 | { 65 | "schema": "schm_customer", 66 | "table": "customer_company", 67 | "fields": [ 68 | "company_name", 69 | "email", 70 | "site" 71 | ] 72 | }, 73 | { 74 | "schema": "schm_customer", 75 | "table": "customer_manager", 76 | "fields": [ 77 | "customer_company_id", 78 | "email", 79 | "first_name", 80 | "last_name" 81 | ] 82 | }, 83 | { 84 | "schema": "schm_mask_exclude_1", 85 | "table": "other_tbl", 86 | "fields": [ 87 | "val" 88 | ] 89 | }, 90 | { 91 | "schema": "schm_mask_exclude_1", 92 | "table": "some_tbl", 93 | "fields": [ 94 | "val" 95 | ] 96 | }, 97 | { 98 | "schema": "schm_mask_ext_exclude_2", 99 | "table": "card_numbers", 100 | "fields": [ 101 | "num_val" 102 | ] 103 | }, 104 | { 105 | "schema": "schm_mask_ext_exclude_2", 106 | "table": "some_ext_tbl", 107 | "fields": [ 108 | "val" 109 | ] 110 | }, 111 | { 112 | "schema": "schm_mask_ext_include_2", 113 | "table": "other_ext_tbl", 114 | "fields": [ 115 | "val" 116 | ] 117 | }, 118 | { 119 | "schema": "schm_mask_ext_include_2", 120 | "table": "some_ext_tbl", 121 | "fields": [ 122 | "val" 123 | ] 124 | }, 125 | { 126 | "schema": "schm_mask_include_1", 127 | "table": "other_tbl", 128 | "fields": [ 129 | "val" 130 | ] 131 | }, 132 | { 133 | "schema": "schm_mask_include_1", 134 | "table": "some_tbl", 135 | "fields": [ 136 | "val" 137 | ] 138 | }, 139 | { 140 | "schema": "schm_mask_include_1", 141 | "table": "tbl_123", 142 | "fields": [ 143 | "val" 144 | ] 145 | }, 146 | { 147 | "schema": "schm_mask_include_1", 148 | "table": "tbl_123_456", 149 | "fields": [ 150 | "val" 151 | ] 152 | }, 153 | { 154 | "schema": "schm_other_1", 155 | "table": "some_tbl", 156 | "fields": [ 157 | "val" 158 | ] 159 | }, 160 | { 161 | "schema": "schm_other_2", 162 | "table": "exclude_tbl", 163 | "fields": [ 164 | "val" 165 | ] 166 | }, 167 | { 168 | "schema": "schm_other_2", 169 | "table": "some_tbl", 170 | "fields": [ 171 | "val" 172 | ] 173 | }, 174 | { 175 | "schema": "schm_other_2", 176 | "table": "tbl_test_anon_functions", 177 | "fields": [ 178 | "fld_10_int", 179 | "fld_11_int", 180 | "fld_12_phone", 181 | "fld_13_txt", 182 | "fld_14_txt", 183 | "fld_15_txt", 184 | "fld_1_int", 185 | "fld_2_datetime", 186 | "fld_3_txt", 187 | 
"fld_4_txt", 188 | "fld_6_txt", 189 | "fld_7_zip", 190 | "fld_8_datetime", 191 | "fld_9_datetime" 192 | ] 193 | }, 194 | { 195 | "schema": "schm_other_4", 196 | "table": "goods", 197 | "fields": [ 198 | "created_at", 199 | "description", 200 | "quantity", 201 | "release_date", 202 | "title", 203 | "type_id", 204 | "valid_until" 205 | ] 206 | }, 207 | { 208 | "schema": "schm_other_4", 209 | "table": "partitioned_table", 210 | "fields": [ 211 | "created_at", 212 | "product_id", 213 | "quantity", 214 | "region_code" 215 | ] 216 | }, 217 | { 218 | "schema": "schm_other_4", 219 | "table": "partitioned_table_2025_01", 220 | "fields": [ 221 | "created_at", 222 | "product_id", 223 | "quantity", 224 | "region_code" 225 | ] 226 | }, 227 | { 228 | "schema": "schm_other_4", 229 | "table": "partitioned_table_2025_02", 230 | "fields": [ 231 | "created_at", 232 | "product_id", 233 | "quantity", 234 | "region_code" 235 | ] 236 | }, 237 | { 238 | "schema": "schm_other_4", 239 | "table": "partitioned_table_2025_03", 240 | "fields": [ 241 | "created_at", 242 | "product_id", 243 | "quantity", 244 | "region_code" 245 | ] 246 | }, 247 | { 248 | "schema": "schm_other_4", 249 | "table": "partitioned_table_default", 250 | "fields": [ 251 | "created_at", 252 | "product_id", 253 | "quantity", 254 | "region_code" 255 | ] 256 | } 257 | ] 258 | } -------------------------------------------------------------------------------- /pg_anon/modes/view_fields.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict 3 | 4 | from prettytable import PrettyTable, SINGLE_BORDER 5 | 6 | from pg_anon.common.db_utils import get_scan_fields_list, get_scan_fields_count 7 | from pg_anon.common.dto import FieldInfo 8 | from pg_anon.common.utils import exception_helper, get_dict_rule_for_table 9 | from pg_anon.context import Context 10 | 11 | 12 | class ViewFieldsMode: 13 | context: Context 14 | _processing_fields_limit: int = 5000 15 | _filter_dict_rule: Dict = None 16 | fields: List[FieldInfo] = None 17 | table: PrettyTable = None 18 | json: str = None 19 | fields_cut_by_limits: bool = False 20 | empty_data_filler: str = '---' 21 | 22 | def __init__(self, context: Context): 23 | self.context = context 24 | if context.options.fields_count is not None: 25 | self._processing_fields_limit = context.options.fields_count 26 | self._init_filter_dict_rule() 27 | 28 | def _init_filter_dict_rule(self): 29 | self._filter_dict_rule = {} 30 | has_schema: bool = False 31 | has_table: bool = False 32 | 33 | if self.context.options.schema_name: 34 | self._filter_dict_rule["schema"] = self.context.options.schema_name 35 | has_schema = True 36 | 37 | if self.context.options.schema_mask: 38 | self._filter_dict_rule["schema_mask"] = self.context.options.schema_mask 39 | has_schema = True 40 | 41 | if self.context.options.table_name: 42 | self._filter_dict_rule["table"] = self.context.options.table_name 43 | has_table = True 44 | 45 | if self.context.options.table_mask: 46 | self._filter_dict_rule["table_mask"] = self.context.options.table_mask 47 | has_table = True 48 | 49 | if has_schema and not has_table: 50 | self._filter_dict_rule["table_mask"] = '*' 51 | 52 | if not has_schema and has_table: 53 | self._filter_dict_rule["schema_mask"] = '*' 54 | 55 | def _check_by_filters(self, field: FieldInfo) -> bool: 56 | return bool(get_dict_rule_for_table( 57 | dictionary_rules=[self._filter_dict_rule], 58 | schema=field.nspname, 59 | table=field.relname, 60 | )) 61 | 62 | async def 
_get_fields_for_view(self) -> List[FieldInfo]: 63 | """ 64 | Get scanning fields for view mode 65 | :return: list of fields for view mode 66 | """ 67 | fields_list = await get_scan_fields_list( 68 | connection_params=self.context.connection_params, 69 | server_settings=self.context.server_settings, 70 | limit=self._processing_fields_limit 71 | ) 72 | 73 | result = [] 74 | for field in fields_list: 75 | field_info = FieldInfo(**field) 76 | if not self._filter_dict_rule or self._check_by_filters(field_info): 77 | result.append(field_info) 78 | 79 | return result 80 | 81 | async def _make_notice_fields_cut_by_limits(self): 82 | fields_count = await get_scan_fields_count( 83 | connection_params=self.context.connection_params, 84 | server_settings=self.context.server_settings 85 | ) 86 | 87 | if fields_count > self._processing_fields_limit and not self.context.options.json: 88 | print(f'You are trying to view too many fields ({fields_count} fields).' 89 | f' Only the first {self._processing_fields_limit} fields will be processed for output.' 90 | f' Use the arguments --schema-name, --schema-mask, --table-name, --table-mask to reduce the number of fields.' 91 | f' You can also use --fields-count to extend the limit.') 92 | self.fields_cut_by_limits = True 93 | 94 | def _prepare_fields_for_view(self): 95 | fields_with_find_rules = [] 96 | 97 | for field in self.fields.copy(): 98 | include_rule = get_dict_rule_for_table( 99 | dictionary_rules=self.context.prepared_dictionary_obj["dictionary"], 100 | schema=field.nspname, 101 | table=field.relname, 102 | ) 103 | 104 | if include_rule: 105 | if field.column_name in include_rule.get('fields', {}): 106 | field.rule = include_rule['fields'][field.column_name] 107 | field.dict_file_name = include_rule["dict_file_name"] 108 | fields_with_find_rules.append(field) 109 | continue 110 | elif include_rule.get('raw_sql'): 111 | field.rule = include_rule['raw_sql'] 112 | field.dict_file_name = include_rule["dict_file_name"] 113 | fields_with_find_rules.append(field) 114 | continue 115 | 116 | if not self.context.options.view_only_sensitive_fields: 117 | field.rule = self.empty_data_filler 118 | field.dict_file_name = self.empty_data_filler 119 | fields_with_find_rules.append(field) 120 | 121 | self.fields = fields_with_find_rules 122 | 123 | def _prepare_table(self): 124 | self.table = PrettyTable([ 125 | 'schema', 126 | 'table', 127 | 'field', 128 | 'type', 129 | 'dict_file_name', 130 | 'rule', 131 | ], align='l') 132 | self.table.set_style(SINGLE_BORDER) 133 | 134 | for field in self.fields: 135 | self.table.add_row([ 136 | field.nspname, 137 | field.relname, 138 | field.column_name, 139 | field.type, 140 | field.dict_file_name, 141 | field.rule, 142 | ]) 143 | 144 | def _prepare_json(self): 145 | self.json = json.dumps([{ 146 | 'schema': field.nspname, 147 | 'table': field.relname, 148 | 'field': field.column_name, 149 | 'type': field.type, 150 | 'dict_file_name': field.dict_file_name, 151 | 'rule': field.rule, 152 | } for field in self.fields], ensure_ascii=False) 153 | 154 | async def _output_fields(self): 155 | await self._make_notice_fields_cut_by_limits() 156 | 157 | self.fields = await self._get_fields_for_view() 158 | if not self.fields: 159 | raise ValueError("No fields found for view!") 160 | 161 | self._prepare_fields_for_view() 162 | 163 | if not self.fields: 164 | raise ValueError("No fields left for view after applying dictionary rules!") 165 | 166 | if self.context.options.json: 167 | self._prepare_json() 168 | print(self.json) 169 | else: 170 | self._prepare_table() 171 | print(self.table) 172 | 173 |
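    # run() ties the helpers above together: it validates the processing
    # limit, loads the prepared dictionary (keeping the source dict file
    # name for each rule so it can be shown in the dict_file_name column),
    # and then prints the matched fields as JSON or as a pretty table.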
async def run(self) -> None: 174 | self.context.logger.info("-------------> Started view_fields mode") 175 | 176 | try: 177 | if self._processing_fields_limit < 1: 178 | raise ValueError("Processing fields limit must be greater than zero!") 179 | self.context.read_prepared_dict(save_dict_file_name_for_each_rule=True) 180 | if not self.context.prepared_dictionary_obj.get("dictionary"): 181 | raise ValueError("Prepared dictionary is empty!") 182 | await self._output_fields() 183 | 184 | self.context.logger.info("<------------- Finished view_fields mode") 185 | except Exception as ex: 186 | self.context.logger.error("<------------- view_fields failed\n" + exception_helper()) 187 | raise ex 188 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | > [🏠 Home](../README.md#-documentation-index) | [⚙️ How it works](how-it-works.md) 3 | 4 | ### 1. Where can I find operation logs and launch parameters? 5 | All run data is stored in the `/path/to/pg_anon/runs` directory. 6 | Inside, the structure is: `///`. 7 | 8 | Each operation folder contains: 9 | - a `logs` directory with all log files 10 | - a `run_options.json` file with all parameters used to run `pg_anon` 11 | 12 | If the `--save-dicts` option was used, the folders `input` and `output` will also appear. 13 | They contain all input and output dictionaries for that run. 14 | 15 | --- 16 | 17 | ### 2. Can I restore a pg_anon dump using pg_dump? 18 | 19 | **No.** The pg_anon dump format is not compatible with pg_dump due to the specifics of anonymization. 20 | 21 | For the same reason, a regular backup created with pg_dump cannot be restored using pg_anon. 22 | 23 | --- 24 | 25 | ### 3. Does pg_anon modify the structure or data of the source database during scan, dump, view-data, or view-fields? 26 | 27 | pg_anon does **not** modify either the structure or the data of the source database. 28 | 29 | The only thing pg_anon adds is the `anon_funcs` schema, which is required for its internal operations. 30 | 31 | --- 32 | 33 | ### 4. Can I use custom functions for scanning? 34 | 35 | **Yes.** The meta-dictionary has a [`data_func`](dicts/meta-dict-schema.md#6-section-data_func) section. 36 | In this section, you can use any custom SQL function for sensitivity validation. 37 | 38 | This allows you to implement checks using full-text search or any other SQL capabilities. 39 | 40 | Such functions must follow this template: 41 | 42 | ```sql 43 | CREATE OR REPLACE FUNCTION <schema_name>.<function_name>( 44 | value TEXT, 45 | schema_name TEXT, 46 | table_name TEXT, 47 | field_name TEXT 48 | ) 49 | RETURNS boolean AS $$ 50 | BEGIN 51 | <detection logic that returns true or false>; 52 | END; 53 | $$ LANGUAGE plpgsql; 54 | ``` 55 | 56 | --- 57 | 58 | ### 5. Can I use custom functions for anonymization? 59 | 60 | **Yes.** You can use any functions and values available in the source database. 61 | 62 | You must ensure that anonymized values match the field format. 63 | For example, if the field type is `varchar(15)`, you must **manually** ensure the generated value does not exceed 15 characters. 64 | 65 | If the format is violated, the dump may be created successfully, but restoring it may fail. 66 | 67 | For such cases, you can also use the [`data_func`](dicts/meta-dict-schema.md#6-section-data_func) section with a scan function that checks the field length and an anonymization function tailored to that specific length.
68 | 69 | For example, the scan function below matches only fields that are declared as exactly 20 characters long and contain emails: 70 | ```sql 71 | CREATE OR REPLACE FUNCTION my_scan_funcs.is_email_field_with_len_20_chars( 72 | value TEXT, 73 | schema_name TEXT, 74 | table_name TEXT, 75 | field_name TEXT 76 | ) 77 | RETURNS boolean AS $$ 78 | DECLARE 79 | max_len integer; 80 | is_email boolean; 81 | BEGIN 82 | SELECT c.character_maximum_length 83 | INTO max_len 84 | FROM information_schema.columns c 85 | WHERE c.table_schema = $2 86 | AND c.table_name = $3 87 | AND c.column_name = $4; 88 | 89 | -- field length must be 20 characters 90 | if max_len != 20 then 91 | return false; 92 | end if; 93 | 94 | -- value must be not null for comparison 95 | if $1 is null then 96 | return false; 97 | end if; 98 | 99 | -- check email format by regexp 100 | return $1 ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'; 101 | END; 102 | $$ LANGUAGE plpgsql; 103 | ``` 104 | 105 | The meta-dict rule below can be used to detect email fields with a length of 20 characters and anonymize them while preserving both format and length. 106 | ```python 107 | { 108 | "data_func": { 109 | "varchar": [ 110 | { 111 | "scan_func": "my_scan_funcs.is_email_field_with_len_20_chars", 112 | "anon_func": "lower(anon_funcs.random_string(9)) || '@secret.com'", 113 | "n_count": 10 114 | } 115 | ] 116 | } 117 | } 118 | ``` 119 | 120 | --- 121 | 122 | ### 6. Is the scanning stage required? 123 | 124 | **No**. You can create all required dictionaries manually or reuse previously generated dictionaries. 125 | 126 | --- 127 | 128 | ### 7. Why load sensitive and non-sensitive dictionaries during scanning? 129 | 130 | They are used only to speed up scanning. 131 | 132 | These dictionaries act as a cache, allowing pg_anon to immediately know which fields are sensitive and which are not. 133 | 134 | This way, repeated scans of the same database will run very quickly. 135 | 136 | If new fields appear that are not present in the dictionaries, pg_anon will evaluate them using the rules from the meta-dictionary. 137 | 138 | --- 139 | 140 | ### 8. When should I use `--config` with a configuration file? 141 | 142 | If you plan to use pg_anon with different PostgreSQL major versions, you should define a config file. 143 | 144 | It is much easier to configure this once rather than repeatedly passing paths to pg_dump and pg_restore. 145 | 146 | If you always use a single PostgreSQL version, the system pg_dump and pg_restore will be used, and a config file is unnecessary. 147 | 148 | --- 149 | 150 | ### 9. Can I split one large dictionary into multiple smaller ones? 151 | 152 | **Yes**. All dictionary-related parameters accept lists of files. 153 | 154 | At startup, pg_anon merges them into a single dictionary internally. 155 | 156 | This makes it easy to separate different groups of rules into different files and combine them as needed. 157 | This is especially helpful for the meta-dictionary, which contains many optional sections. 158 | 159 | --- 160 | 161 | ### 10. Restore error: "Database is not empty" 162 | 163 | Restore mode checks that the target database is empty. 164 | 165 | This is done to prevent accidental data loss in the target database. 166 | 167 | If needed, use the `--drop-db` or `--clean-db` options during restore. 168 | 169 | --- 170 | 171 | ### 11. Restore error: "Database is being accessed by other users" 172 | 173 | When using the `--drop-db` option, the target database will be recreated using `DROP DATABASE` and `CREATE DATABASE`.
174 | 175 | If there are active connections, the `DROP DATABASE` command cannot be executed. 176 | 177 | You must terminate all active sessions and run the restore operation again. 178 | 179 | --- 180 | 181 | ### 12. Difference between options `--drop-db` and `--clean-db` for restore mode 182 | 183 | - `--drop-db` - recreates the target database using the `DROP DATABASE` and `CREATE DATABASE` commands, and then runs the restore process on the empty database. 184 | - `--clean-db` - performs a restore similar to `pg_restore --clean --if-exists`. It creates missing tables from the backup in the target database, and it preserves extra tables that exist in the target DB but are not contained in the backup being restored. This option does not require an empty target database. 185 | 186 | --- 187 | 188 | ### 13. Determining Optimal Process and Connection Counts 189 | 190 | To configure optimal values, first identify these system parameters: 191 | - `max_connections` - the maximum number of connections allowed by your PostgreSQL database 192 | - CPU core count 193 | - Reserved connections (typically 3-10 for maintenance/admin connections) 194 | 195 | Important Considerations: 196 | - Exceeding `max_connections` may cause pg_anon failures and affect other database applications 197 | - Ensure sufficient connection headroom for other services 198 | 199 | #### Recommended Configuration: 200 | 201 | Process Count 202 | ```bash 203 | --processes = CPU cores 204 | ``` 205 | Database Connections per Process 206 | ```bash 207 | --db-connections-per-process ≤ (max_connections - reserved_connections) / --processes 208 | ``` 209 | 210 | #### Example Calculation: 211 | - CPU cores: 4 212 | - max_connections: 100 213 | - reserved_connections: 5 214 | - --processes: 4 215 | - --db-connections-per-process: (100 - 5) / 4 ≈ 23.75 → 23 216 | - **Verification:** 4 processes × 23 connections = 92 total connections (within 100 limit) 217 | -------------------------------------------------------------------------------- /docs/dicts/sens-dict-schema.md: -------------------------------------------------------------------------------- 1 | # 📋 Sensitive Dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [🔍 Scan](../operations/scan.md) | [💾 Dump](../operations/dump.md) | [🔬 View Fields](../operations/view-fields.md) | [📊 View Data](../operations/view-data.md) | [🗂️ Meta Dictionary](meta-dict-schema.md) | [📋 Non-sensitive Dictionary](non-sens-dict-schema.md) 3 | 4 | ## Overview 5 | The sensitive dictionary defines explicit anonymization rules for fields. 6 | It is used in four operation modes, and its behavior differs slightly across them: 7 | 8 | 1. [💾 Dump mode](../operations/dump.md) 9 | 10 | Fields listed in the dictionary are anonymized using the defined rules. 11 | All other fields are dumped as-is. 12 | 13 | 2. [🔍 Create-dict (scan) mode](../operations/scan.md) 14 | 15 | Fields listed in the sensitive dictionary are treated as known **sensitive** fields, 16 | which skips sensitivity detection for them. 17 | This speeds up the scanning process. 18 | 19 | 3. [🔬 View fields mode](../operations/view-fields.md) 20 | 21 | Shows which anonymization rules would be applied to fields. 22 | 23 | 4. [📊 View data mode](../operations/view-data.md) 24 | 25 | Shows how the rules would affect sample data, without performing a dump. 26 | 27 | This dictionary can be created manually or generated automatically using [create-dict (scan) mode](../operations/scan.md).
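For orientation, here is a minimal sketch of such a dictionary with a single rule; the schema, table, and field names are hypothetical, and the anonymization expression reuses the `anon_funcs.digest` helper shown in the full example later in this document:

```python
{
    "dictionary": [
        {
            "schema": "public",
            "table": "users",  # hypothetical table
            "fields": {
                # replace each value with a salted SHA-256 digest
                "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'sha256')",
            },
        }
    ]
}
```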
28 | 29 | > ⚠️ **Note** 30 | > 31 | > If a field appears both in the sensitive dictionary and the [non-sensitive](non-sens-dict-schema.md) dictionary, the sensitive dictionary takes priority. 32 | 33 | 34 | --- 35 | 36 | ## Schema 37 | ```python 38 | { 39 | "dictionary": [ 40 | { 41 | "schema": "<schema_name>", 42 | "table": "<table_name>", 43 | "fields": { 44 | "<field_name>": "<anonymization rule: raw SQL or anon_funcs expression>", 45 | }, 46 | "sql_condition": # Optional. Condition in raw SQL format for filtering the data to dump. (This section is ignored in create-dict (scan) mode.) 47 | """ 48 | <raw SQL condition> 49 | """ 50 | } 51 | ], 52 | # Optional section. It is used to exclude schemas and tables from the data dump. 53 | "dictionary_exclude": [ 54 | { 55 | "schema": "<schema_name>", # Exclude only this schema 56 | "schema_mask": "<regex pattern>", # Or exclude schemas matching the regex pattern 57 | "table": "<table_name>", # Exclude only this table 58 | "table_mask": "<regex pattern>", # Or exclude tables matching the regex pattern 59 | } 60 | ] 61 | } 62 | ``` 63 | > ⚠️ **Note** 64 | > - `sql_condition` in the `dictionary` section is optional. It can be used to dump only a subset of the data, for example, only the last week of a table's rows. 65 | > - `dictionary_exclude` is an optional section. If a table appears in both the "dictionary_exclude" and "dictionary" sections, the table will still be dumped. This can be used for partial dumps and for debugging the anonymization process. 66 | > - In `dictionary_exclude`, you must use either `schema` or `schema_mask` → not both. 67 | > - In `dictionary_exclude`, you must use either `table` or `table_mask` → not both. 68 | 69 | --- 70 | 71 | ## ⚙️ Using the Dictionary 72 | 73 | **🏛️ Example Database Structure** 74 | 75 | | Schema | Table | Field | 76 | |-----------|-----------|------------------| 77 | | public | employees | id | 78 | | public | employees | full_name | 79 | | public | employees | email | 80 | | public | employees | hire_date | 81 | | public | salaries | employee_id | 82 | | public | salaries | monthly_salary | 83 | | public | salaries | currency | 84 | | ecommerce | orders | product_id | 85 | | ecommerce | orders | count | 86 | | ecommerce | orders | client_name | 87 | | ecommerce | orders | delivery_address | 88 | | ecommerce | orders | created | 89 | | ecommerce | orders | status | 90 | | tenant_a | projects | title | 91 | | tenant_a | projects | description | 92 | | tenant_b | projects | title | 93 | | tenant_b | projects | description | 94 | | tenant_c | projects | title | 95 | | tenant_c | projects | description | 96 | 97 | 98 | 99 | **📘 Example Sensitive Dictionary** 100 | ```python 101 | { 102 | "dictionary": [ 103 | { 104 | "schema": "public", 105 | "table": "employees", 106 | "fields": { 107 | "full_name": "anon_funcs.digest(\"full_name\", 'salt_word', 'sha256')", # hashing employee names 108 | "email": "md5(\"email\") || '@abc.com'", # hashing employee emails while preserving email format 109 | }, 110 | }, 111 | { 112 | "schema": "public", 113 | "table": "salaries", 114 | "fields": { 115 | "monthly_salary": "10000", # sets one constant value for the field in all rows 116 | }, 117 | }, 118 | { 119 | "schema": "ecommerce", 120 | "table": "orders", 121 | "fields": { 122 | "client_name": "anon_funcs.digest(\"client_name\", 'salt_word', 'sha256')", 123 | "delivery_address": "anon_funcs.digest(\"delivery_address\", 'salt_word', 'sha256')", 124 | }, 125 | "sql_condition": # Dumping only the orders completed within the last week 126 | """ 127 | WHERE created > NOW() - '7 days'::interval 128 | AND status = 'done' 129 | """ 130 | } 131 | ], 132 | # Excluding all tables from schemas `tenant_a`, `tenant_b`, `tenant_c` 133
| "dictionary_exclude": [ 134 | { 135 | "schema_mask": "tenant_.*", 136 | "table_mask": "*", 137 | } 138 | ] 139 | } 140 | ``` 141 | 142 | **This dictionary matches the following table fields:** 143 | 144 | | Schema | Table | Field | Used in `dump` mode | Used in `create-dict (scan)` mode | 145 | |--------------|-----------|------------------|---------------------------|---------------------------------------------------------| 146 | | public | employees | id | Dumped as is | Fields scanned using meta-dictionary rules | 147 | | public | employees | full_name | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 148 | | public | employees | email | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 149 | | public | employees | hire_date | Dumped as is | Fields scanned using meta-dictionary rules | 150 | | public | salaries | employee_id | Dumped as is | Fields scanned using meta-dictionary rules | 151 | | public | salaries | monthly_salary | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 152 | | public | salaries | currency | Dumped as is | Fields scanned using meta-dictionary rules | 153 | | ecommerce | orders | product_id | Dumped as is | Fields scanned using meta-dictionary rules | 154 | | ecommerce | orders | client_name | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 155 | | ecommerce | orders | delivery_address | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 156 | | ecommerce | orders | count | Dumped as is | Fields scanned using meta-dictionary rules | 157 | | ecommerce | orders | created | Dumped as is | Fields scanned using meta-dictionary rules | 158 | | ecommerce | orders | status | Dumped as is | Fields scanned using meta-dictionary rules | 159 | --------------------------------------------------------------------------------