├── __init__.py ├── tests ├── __init__.py ├── input_dict │ ├── test_empty_dictionary.py │ ├── test_dbg_stages.py │ ├── meta_words_and_phrases_constants.py │ ├── meta_partial_constants.py │ ├── meta_include_rules.py │ ├── test_empty_meta_dict.py │ ├── meta_data_sql_condition.py │ ├── meta_data_func.py │ ├── test_sync_struct.py │ ├── test_exclude.py │ ├── test_sync_data_2.py │ ├── test_partial_exclude_tables_dict.py │ ├── mask_test.py │ ├── test_sync_data.py │ ├── test_partial_tables_dict.py │ ├── test_meta_dict_type_aliases_complex.py │ ├── test_sens_with_sql_conditions.py │ ├── meta_include_and_skip_rules.py │ ├── test_meta_dict_default_func.py │ ├── test_meta_dict.py │ ├── test.py │ └── test_meta_dict_type_aliases.py ├── expected_results │ ├── test_prepared_sens_dict_result_with_no_existing_schema.py │ ├── test_prepared_sens_dict_result_by_include_rule_expected.py │ ├── test_prepared_sens_dict_result_by_include_and_skip_rules_expected.py │ ├── test_prepared_sens_dict_result_type_aliases_expected.py │ ├── test_prepared_sens_dict_result_type_aliases_complex_expected.py │ ├── PGAnonMaskUnitTest_target_tables.result │ ├── PGAnonMaskUnitTest_source_tables.result │ ├── test_prepared_sens_dict_result_by_data_sql_condition_expected.py │ ├── test_prepared_sens_dict_result_by_words_and_phrases_constants_expected.py │ ├── test_prepared_sens_dict_result_default_func_expected.py │ ├── test_prepared_sens_dict_result_by_partial_constants_expected.py │ ├── test_prepared_sens_dict_result_expected.py │ ├── test_prepared_sens_dict_result_by_data_func_expected.py │ └── test_prepared_no_sens_dict_result_expected.py ├── config.yml └── sql │ ├── init_additional_simple_env.sql │ ├── init_simple_env.sql │ └── init_stress_env.sql ├── MANIFEST.in ├── pg_anon ├── common │ ├── __init__.py │ ├── multiprocessing_utils.py │ ├── enums.py │ └── constants.py ├── modes │ ├── __init__.py │ ├── initialization.py │ ├── view_data.py │ └── view_fields.py ├── __init__.py ├── __main__.py ├── version.py ├── logger.py └── app.py ├── rest_api ├── runners │ ├── __init__.py │ ├── direct │ │ ├── __init__.py │ │ ├── view_data.py │ │ └── view_fields.py │ └── background │ │ ├── __init__.py │ │ ├── init.py │ │ ├── base.py │ │ ├── dump.py │ │ ├── restore.py │ │ └── scan.py ├── requirements.txt ├── constants.py ├── enums.py ├── dependencies.py └── utils.py ├── setup.py ├── images ├── dbg-stage-1.png ├── dbg-stage-2.png ├── dbg-stage-3.png ├── scan_workflow.png ├── Create-dict-Terms.drawio.png └── Dump-Resore-Terms.drawio.png ├── pg_anon.py ├── requirements.txt ├── docker ├── entrypoint_dbg.sh ├── motd ├── Makefile ├── entrypoint.sh ├── README.md └── Dockerfile ├── pyproject.toml ├── docs ├── operations │ ├── init.md │ ├── view-data.md │ ├── view-fields.md │ └── scan.md ├── dicts │ ├── non-sens-dict-schema.md │ ├── tables-dictionary.md │ └── sens-dict-schema.md ├── sql-functions-library.md ├── installation-and-configuring.md ├── how-it-works.md ├── debugging.md └── faq.md └── .gitignore /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune tests/ 2 | -------------------------------------------------------------------------------- 
/pg_anon/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pg_anon/modes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rest_api/runners/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/input_dict/test_empty_dictionary.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [] 3 | } 4 | -------------------------------------------------------------------------------- /pg_anon/__init__.py: -------------------------------------------------------------------------------- 1 | from .app import PgAnonApp 2 | 3 | __all__ = ["PgAnonApp"] 4 | -------------------------------------------------------------------------------- /rest_api/runners/direct/__init__.py: -------------------------------------------------------------------------------- 1 | from .view_fields import ViewFieldsRunner 2 | -------------------------------------------------------------------------------- /images/dbg-stage-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-1.png -------------------------------------------------------------------------------- /images/dbg-stage-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-2.png -------------------------------------------------------------------------------- /images/dbg-stage-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/dbg-stage-3.png -------------------------------------------------------------------------------- /pg_anon.py: -------------------------------------------------------------------------------- 1 | from pg_anon.cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /rest_api/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.113,<1.0 2 | uvicorn[standard]>=0.38 3 | aiohttp>=3.13.2 4 | -------------------------------------------------------------------------------- /images/scan_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/scan_workflow.png -------------------------------------------------------------------------------- /pg_anon/__main__.py: -------------------------------------------------------------------------------- 1 | from pg_anon.cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /images/Create-dict-Terms.drawio.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/Create-dict-Terms.drawio.png -------------------------------------------------------------------------------- /images/Dump-Resore-Terms.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TantorLabs/pg_anon/HEAD/images/Dump-Resore-Terms.drawio.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aioprocessing==2.0.1 2 | async-timeout==4.0.3 3 | asyncpg==0.29.0 4 | prettytable==3.17.0 5 | pyyaml==6.0.3 6 | wcwidth==0.2.14 7 | concurrent-log-handler==0.9.28 8 | -------------------------------------------------------------------------------- /docker/entrypoint_dbg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' >> /etc/bash.bashrc 5 | 6 | trap : TERM INT; sleep infinity & wait 7 | -------------------------------------------------------------------------------- /tests/input_dict/test_dbg_stages.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [], 3 | "dictionary_exclude": [ 4 | { 5 | "schema": "schm_other_1", 6 | "table": "some_tbl", 7 | } 8 | ], 9 | } -------------------------------------------------------------------------------- /rest_api/runners/background/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRunner 2 | from .init import InitRunner 3 | from .dump import DumpRunner 4 | from .scan import ScanRunner 5 | from .restore import RestoreRunner 6 | -------------------------------------------------------------------------------- /tests/input_dict/meta_words_and_phrases_constants.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_const": { 3 | "constants": [ 4 | "CompanyNameWordSens", 5 | "include CompanyNamePhrase" 6 | ] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /rest_api/runners/background/init.py: -------------------------------------------------------------------------------- 1 | from rest_api.runners.background import BaseRunner 2 | 3 | from pg_anon.common.enums import AnonMode 4 | 5 | 6 | class InitRunner(BaseRunner): 7 | mode: str = AnonMode.INIT.value 8 | -------------------------------------------------------------------------------- /tests/input_dict/meta_partial_constants.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_const": { 3 | "partial_constants": [ 4 | "_NamE_", # case insensitive test 5 | ".cOm" # case insensitive test 6 | ] 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /rest_api/constants.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from pg_anon.common.constants import BASE_DIR 5 | 6 | BASE_TEMP_DIR = Path(tempfile.gettempdir()) / 'pg_anon' 7 | DUMP_STORAGE_BASE_DIR = (BASE_DIR / 'output').resolve() 8 | -------------------------------------------------------------------------------- /tests/input_dict/meta_include_rules.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | 
"schema": "schm_other_2", 5 | "table": "tbl_test_anon_functions", 6 | "fields": ["fld_5_email"] 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_with_no_existing_schema.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"not_exists_schema", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | }, 10 | ], 11 | } 12 | -------------------------------------------------------------------------------- /tests/input_dict/test_empty_meta_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { 3 | "rules": [], 4 | "constants": [] 5 | }, 6 | "skip_rules": [], 7 | "data_regex": { 8 | "rules": [] 9 | }, 10 | "data_const": { 11 | "constants": [] 12 | }, 13 | "funcs": {} 14 | } -------------------------------------------------------------------------------- /pg_anon/version.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version, PackageNotFoundError 2 | 3 | 4 | try: 5 | # Get version from metadata 6 | __version__ = version("pg_anon") 7 | except PackageNotFoundError: 8 | # TMP: if package is not installed, return hardcoded 9 | __version__ = "1.8.5" 10 | -------------------------------------------------------------------------------- /tests/input_dict/meta_data_sql_condition.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_sql_condition": [ 3 | { 4 | "schema": "schm_customer", 5 | "table": "customer_company", 6 | "sql_condition": 7 | """ 8 | WHERE inn is null 9 | """ 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /docker/motd: -------------------------------------------------------------------------------- 1 | ============================================= 2 | # Documentation 3 | https://github.com/TantorLabs/pg_anon/blob/master/README.md 4 | ============================================= 5 | python3 pg_anon.py --help 6 | 7 | # Run tests 8 | python3 tests/test_full.py -v 9 | ============================================= 10 | -------------------------------------------------------------------------------- /tests/input_dict/meta_data_func.py: -------------------------------------------------------------------------------- 1 | { 2 | "data_func": { 3 | "anyelement": [ 4 | { 5 | "scan_func": "test_anon_funcs.test_check_is_company_email", 6 | "anon_func": "anon_funcs.partial_email(\"%s\")", 7 | "n_count": 1, 8 | }, 9 | ], 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_include_rule_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_2", 5 | "table": "tbl_test_anon_functions", 6 | "fields": { 7 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 8 | } 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_struct.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_2", 5 | "table":"exclude_tbl" 6 | }, 7 | { 8 | "schema":"schm_other_2", 9 | "table":"some_tbl" 10 | }, 11 | { 12 | 
"schema":"schm_mask_include_1", 13 | "table":"tbl_123" 14 | } 15 | ], 16 | "dictionary_exclude": [ 17 | { 18 | "schema_mask": "*", 19 | "table_mask": "*", 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /tests/input_dict/test_exclude.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | } 10 | ], 11 | "dictionary_exclude": [ 12 | { 13 | "schema_mask": "*", 14 | "table_mask": "*", 15 | } 16 | ], 17 | "validate_tables": [ # only this tables must contains rows 18 | { 19 | "schema": "schm_other_1", 20 | "table": "some_tbl" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_data_2.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const modified'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified 2' as val FROM schm_other_2.some_tbl" 14 | } 15 | ], 16 | "dictionary_exclude": [ 17 | { 18 | "schema_mask": "*", 19 | "table_mask": "*", 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /tests/input_dict/test_partial_exclude_tables_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "tables": [ 3 | { 4 | "schema": "public", 5 | "table": "inn_info" 6 | }, 7 | { 8 | "schema": "schm_other_1", 9 | "table_mask": "*" 10 | }, 11 | { 12 | "schema_mask": ".*customer.*", 13 | "table": "customer_manager" 14 | }, 15 | { 16 | "schema_mask": "^_SCHM", 17 | "table_mask": ".*2$" 18 | }, 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/config.yml: -------------------------------------------------------------------------------- 1 | pg-utils-versions: 2 | 15: 3 | pg_dump: "/usr/lib/postgresql/15/bin/pg_dump" 4 | pg_restore: "/usr/lib/postgresql/15/bin/pg_restore" 5 | 16: 6 | pg_dump: "/usr/lib/postgresql/16/bin/pg_dump" 7 | pg_restore: "/usr/lib/postgresql/16/bin/pg_restore" 8 | 17: 9 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 10 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 11 | default: 12 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 13 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 14 | -------------------------------------------------------------------------------- /rest_api/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, StrEnum 2 | 3 | 4 | class ScanMode(StrEnum): 5 | FULL = "full" 6 | PARTIAL = "partial" 7 | 8 | 9 | class DumpMode(StrEnum): 10 | FULL = "dump" 11 | STRUCT = "sync-struct-dump" 12 | DATA = "sync-data-dump" 13 | 14 | 15 | class RestoreMode(StrEnum): 16 | FULL = "restore" 17 | STRUCT = "sync-struct-restore" 18 | DATA = "sync-data-restore" 19 | 20 | 21 | class ResponseStatus(Enum): 22 | UNKNOWN = 1 23 | SUCCESS = 2 24 | ERROR = 3 25 | IN_PROGRESS = 4 26 | STARTING = 5 27 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := build 2 | .PHONY: build 3 | 4 | check-env: 5 | ifndef PG_VERSION 6 | 
$(error PG_VERSION is undefined) 7 | endif 8 | 9 | default: build 10 | 11 | build: 12 | if test -d pg_anon; \ 13 | then cd pg_anon && git pull; \ 14 | else git clone https://github.com/TantorLabs/pg_anon.git; \ 15 | fi 16 | 17 | docker build -t pg_anon:pg${PG_VERSION} --build-arg PG_VERSION=${PG_VERSION} . 18 | 19 | .PHONY: clean 20 | clean: 21 | rm -rf pg_anon 22 | 23 | .PHONY: prune 24 | prune: 25 | docker images prune -a 26 | docker system prune -a -f 27 | -------------------------------------------------------------------------------- /tests/input_dict/mask_test.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema_mask": "*", 5 | "table_mask": "*", 6 | "fields": { 7 | "amount": "101010" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_1", 12 | "table":"some_tbl", 13 | "fields": { 14 | "val":"'text const'" 15 | } 16 | }, 17 | { 18 | "schema_mask": "*", 19 | "table": "tbl_100", 20 | "fields": { 21 | "amount": "202020" 22 | } 23 | }, 24 | { 25 | "schema":"schm_other_2", 26 | "table":"some_tbl", 27 | "raw_sql": "SELECT id, val || ' modified' as val FROM schm_other_2.some_tbl" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /tests/input_dict/test_sync_data.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_2", 5 | "table":"exclude_tbl", 6 | "fields": { 7 | "val":"'text const modified'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified 2' as val FROM schm_other_2.some_tbl" 14 | }, 15 | { 16 | "schema":"schm_mask_include_1", 17 | "table":"tbl_123", 18 | "fields": { 19 | "val":"anon_funcs.partial(val,1,'***',3)" 20 | } 21 | } 22 | ], 23 | "dictionary_exclude": [ 24 | { 25 | "schema_mask": "*", 26 | "table_mask": "*", 27 | } 28 | ] 29 | } -------------------------------------------------------------------------------- /tests/input_dict/test_partial_tables_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "tables": [ 3 | { 4 | "schema": "public", 5 | "table": "inn_info" 6 | }, 7 | { 8 | "schema": "_SCHM.$complex#имя;@&* a'", 9 | "table_mask": "^_TBL" 10 | }, 11 | { 12 | "schema_mask": "^schm_other", 13 | "table": "some_tbl" 14 | }, 15 | { 16 | "schema_mask": "schm_customer", 17 | "table_mask": "*" 18 | }, 19 | { 20 | "schema_mask": "^*", # wrong regex 21 | "table_mask": "^*" # wrong regex 22 | }, 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_type_aliases_complex.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | } 7 | ], 8 | "field": { 9 | "rules": [".*"] 10 | }, 11 | "funcs": { 12 | "default": "anon_funcs.digest(\"%s\", 'default', 'md5')", 13 | "character varying (20)": "anon_funcs.digest(\"%s\", 'varchar(20)', 'md5')", 14 | "bit varying (5) ": "anon_funcs.digest(\"%s\", 'varbit(5)', 'md5')", 15 | "time (3) without time zone": "anon_funcs.digest(\"%s\", 'time(3)', 'md5')", 16 | "time (3) with time zone": "anon_funcs.digest(\"%s\", 'timetz(3)', 'md5')", 17 | "double precision": "anon_funcs.digest(\"%s\", 'float', 'md5')", 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/tests/input_dict/test_sens_with_sql_conditions.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_4", 5 | "table": "goods", 6 | "fields": { 7 | "title": "anon_funcs.digest(\"title\", 'salt_word', 'sha256')", 8 | "description": "anon_funcs.digest(\"description\", 'salt_word', 'sha256')", 9 | "quantity": "10", 10 | }, 11 | "sql_condition": 12 | """ 13 | WHERE release_date > NOW() - '15 days'::interval 14 | AND valid_until < NOW() + '15 days'::interval 15 | """ 16 | } 17 | ], 18 | "dictionary_exclude": [ 19 | { 20 | "schema_mask": "*", 21 | "table_mask": "*", 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /tests/input_dict/meta_include_and_skip_rules.py: -------------------------------------------------------------------------------- 1 | { 2 | "skip_rules": [ 3 | { 4 | "schema_mask": "*", 5 | "table": "customer_company", 6 | "fields": ["inn"] 7 | }, 8 | { 9 | "schema_mask": "mask", 10 | "fields": ["val"] 11 | }, 12 | { 13 | "schema_mask": "*", 14 | "table_mask": "complex", 15 | "fields": ["fld_key"], 16 | }, 17 | ], 18 | "include_rules": [ 19 | { 20 | "schema_mask": "*", 21 | "fields": ["email", "inn", "phone", "val", "site"] 22 | }, 23 | { 24 | "schema_mask": "*", 25 | "table": "_TBL.$complex#имя;@&* a'", 26 | }, 27 | { 28 | "schema_mask": "mask", 29 | "table_mask": "^card", 30 | }, 31 | { 32 | "schema": "schm_other_2", 33 | "table_mask": "anon", 34 | }, 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pg_anon" 3 | version = "1.8.5" 4 | description = "PostgreSQL anonymization tool." 
5 | authors = [ 6 | {name="Tantor Labs", email="tantor@tantorlabs.ru"} 7 | ] 8 | readme = "README.md" 9 | dependencies=[ 10 | "aioprocessing==2.0.1", 11 | "asyncpg==0.29.0", 12 | "async-timeout==4.0.3", 13 | "prettytable>=3.17.0", 14 | "pyyaml (>=6.0.3,<7.0.0)", 15 | "concurrent-log-handler (>=0.9.28,<0.10.0)" 16 | ] 17 | 18 | [build-system] 19 | requires = ["setuptools>=78"] 20 | build-backend = "setuptools.build_meta" 21 | 22 | [project.scripts] 23 | pg_anon = "pg_anon.__main__:main" 24 | 25 | [tool.setuptools] 26 | include-package-data = false 27 | packages.find.include = ["pg_anon", "pg_anon.*"] 28 | packages.find.exclude = ["tests", "dict"] 29 | 30 | [tool.poetry] 31 | name = "pg_anon" 32 | version = "1.8.5" 33 | description = "" 34 | authors = ["Tantor Labs <tantor@tantorlabs.ru>"] 35 | readme = "README.md" 36 | 37 | packages = [ 38 | { include = "pg_anon" } 39 | ] 40 | 41 | [tool.poetry.dependencies] 42 | python = "^3.11" 43 | -------------------------------------------------------------------------------- /rest_api/dependencies.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from fastapi import HTTPException, Query, status 6 | 7 | from pg_anon.common.constants import RUNS_BASE_DIR 8 | 9 | 10 | def date_range_filter( 11 | date_before: Optional[date] = Query(None, description="Filter: operations before this date"), 12 | date_after: Optional[date] = Query(None, description="Filter: operations after this date"), 13 | ): 14 | if date_before and date_after and date_after > date_before: 15 | raise HTTPException( 16 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 17 | detail="`date_after` must be less than or equal to `date_before`", 18 | ) 19 | return {"date_before": date_before, "date_after": date_after} 20 | 21 | 22 | def get_operation_run_dir(internal_operation_id: str) -> Path: 23 | for run_dir in RUNS_BASE_DIR.glob(f'*/*/*/{internal_operation_id}'): 24 | return run_dir 25 | 26 | raise HTTPException( 27 | status_code=status.HTTP_404_NOT_FOUND, 28 | detail="Operation run directory not found", 29 | ) 30 | -------------------------------------------------------------------------------- /pg_anon/common/multiprocessing_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Callable 3 | 4 | import aioprocessing 5 | 6 | 7 | async def init_process(name: str, ctx, target_func: Callable, tasks: List, *args, **kwargs): 8 | from pg_anon.context import Context 9 | 10 | ctx: Context 11 | start_t = time.time() 12 | ctx.logger.info(f"================> Process [{name}] started. Input items: {len(tasks)}") 13 | queue = aioprocessing.AioQueue() 14 | 15 | p = aioprocessing.AioProcess( 16 | target=target_func, 17 | args=(name, queue, tasks, *args), 18 | kwargs=kwargs, 19 | ) 20 | p.start() 21 | res = None 22 | while True: 23 | result = await queue.coro_get() 24 | if result is None: 25 | break 26 | res = result 27 | await p.coro_join() 28 | end_t = time.time() 29 | elapsed = round(end_t - start_t, 2) 30 | result_item_log = str(len(res)) if res is not None else "0" 31 | ctx.logger.info( 32 | f"<================ Process [{name}] finished, elapsed: {elapsed} sec. 
Result {result_item_log} item(s)" 33 | ) 34 | return res 35 | -------------------------------------------------------------------------------- /docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | pg_ctlcluster ${PG_VERSION} main start 5 | 6 | sed -i '/listen_addresses/s/^#//g' /etc/postgresql/${PG_VERSION}/main/postgresql.conf 7 | sed -ie "s/^listen_addresses.*/listen_addresses = '127.0.0.1'/" /etc/postgresql/${PG_VERSION}/main/postgresql.conf 8 | sed -i -e '/local.*peer/s/postgres/all/' -e 's/peer\|md5/trust/g' /etc/postgresql/${PG_VERSION}/main/pg_hba.conf 9 | 10 | pg_ctlcluster ${PG_VERSION} main restart 11 | 12 | psql -c "ALTER USER postgres WITH PASSWORD 'YmTLbLTLxF'" -U postgres 13 | psql -c "CREATE USER anon_test_user WITH PASSWORD 'mYy5RexGsZ' SUPERUSER" -U postgres 14 | 15 | ln -s /usr/share/pg_anon/pg_anon.py /usr/bin/pg_anon.py 16 | 17 | cat > /usr/bin/pg_anon << EOL 18 | #!/bin/bash 19 | python3 /usr/share/pg_anon/pg_anon.py \$@ 20 | EOL 21 | 22 | chmod +x /usr/bin/pg_anon 23 | chown postgres:postgres -R /usr/share/pg_anon 24 | 25 | usermod -d /usr/share/pg_anon postgres 26 | 27 | cd /usr/share/pg_anon 28 | 29 | echo 'export PYTHONPATH=/usr/share/pg_anon' >> /etc/bash.bashrc 30 | 31 | echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' >> /etc/bash.bashrc 32 | 33 | trap : TERM INT; sleep infinity & wait 34 | -------------------------------------------------------------------------------- /pg_anon/common/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ResultCode(Enum): 5 | DONE = "done" 6 | FAIL = "fail" 7 | UNKNOWN = "unknown" 8 | 9 | 10 | class VerboseOptions(Enum): 11 | INFO = "info" 12 | DEBUG = "debug" 13 | ERROR = "error" 14 | 15 | 16 | class AnonMode(Enum): 17 | DUMP = "dump" # dump table contents to files using dictionary 18 | RESTORE = "restore" # create tables in target database and load data from files 19 | INIT = "init" # create a schema with anonymization helper functions 20 | SYNC_DATA_DUMP = "sync-data-dump" # synchronize the contents of one or more tables (dump stage) 21 | SYNC_DATA_RESTORE = "sync-data-restore" # synchronize the contents of one or more tables (restore stage) 22 | SYNC_STRUCT_DUMP = "sync-struct-dump" # synchronize the structure of one or more tables (dump stage) 23 | SYNC_STRUCT_RESTORE = "sync-struct-restore" # synchronize the structure of one or more tables (restore stage) 24 | CREATE_DICT = "create-dict" # create dictionary 25 | VIEW_FIELDS = "view-fields" # view fields 26 | VIEW_DATA = "view-data" # view data using prepared-sens-dict-file 27 | 28 | 29 | class ScanMode(Enum): 30 | FULL = "full" 31 | PARTIAL = "partial" 32 | -------------------------------------------------------------------------------- /tests/sql/init_additional_simple_env.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS test_simple.orders CASCADE; 2 | DROP TABLE IF EXISTS test_simple.clients CASCADE; 3 | 4 | CREATE TABLE test_simple.clients 5 | ( 6 | id serial, 7 | firstname character varying(32), 8 | lastname character varying(32), 9 | email character varying(64), 10 | phone character varying(32), 11 | CONSTRAINT clients_pk UNIQUE (id) 12 | ); 13 | 14 | CREATE TABLE test_simple.orders 15 | ( 16 | id serial, 17 | item_id integer NOT NULL, 18 | amount numeric(16,4) DEFAULT 0 NOT NULL, 19 | details text, 20 | status_id integer NOT NULL, 21 
| CONSTRAINT orders_pk UNIQUE (id) 22 | ); 23 | 24 | -- prepare data 25 | INSERT INTO test_simple.clients 26 | (firstname, lastname, email, phone) 27 | select 28 | 'first_name_' || v as firstname, 29 | 'last_name_' || v as lastname, 30 | 'first_name_' ||v || '.last_name_' || v || '@' || 'some_hoster_' || v || '.com' as email, 31 | 79101438060 + v as phone 32 | from generate_series(1,1512) as v; 33 | 34 | INSERT INTO test_simple.orders 35 | (item_id, amount, details, status_id) 36 | select 37 | v as item_id, 38 | floor(v * 0.7)::integer as amount, 39 | 'details_' || v as details, 40 | v % 2 41 | from generate_series(1,1512) as v; 42 | -------------------------------------------------------------------------------- /pg_anon/modes/initialization.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.constants import BASE_DIR 2 | from pg_anon.common.db_utils import create_connection 3 | from pg_anon.common.utils import exception_helper 4 | from pg_anon.context import Context 5 | 6 | 7 | class InitMode: 8 | def __init__(self, context: Context): 9 | self.context = context 10 | 11 | async def run(self) -> None: 12 | self.context.logger.info("-------------> Started init mode") 13 | 14 | async def handle_notice(connection, message): 15 | self.context.logger.info("NOTICE: %s" % message) 16 | 17 | db_conn = await create_connection(self.context.connection_params, server_settings=self.context.server_settings) 18 | db_conn.add_log_listener(handle_notice) 19 | 20 | tr = db_conn.transaction() 21 | await tr.start() 22 | 23 | try: 24 | with open(BASE_DIR / "init.sql", "r") as f: 25 | data = f.read() 26 | await db_conn.execute(data) 27 | await tr.commit() 28 | 29 | self.context.logger.info("<------------- Finished init mode") 30 | except Exception as ex: 31 | self.context.logger.error("<------------- Init failed\n" + exception_helper()) 32 | await tr.rollback() 33 | raise ex 34 | finally: 35 | await db_conn.close() 36 | -------------------------------------------------------------------------------- /tests/sql/init_simple_env.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA IF EXISTS test_simple CASCADE; 2 | CREATE SCHEMA IF NOT EXISTS test_simple; 3 | 4 | DROP TABLE IF EXISTS test_simple.customer_company CASCADE; 5 | DROP TABLE IF EXISTS test_simple.contracts CASCADE; 6 | 7 | CREATE TABLE test_simple.customer_company 8 | ( 9 | id serial, 10 | company_name character varying(32), 11 | email character varying(64), 12 | phone character varying(32), 13 | site character varying(64), 14 | inn bigint, 15 | CONSTRAINT customer_company_pkey UNIQUE (id), 16 | CONSTRAINT inn_uniq UNIQUE (inn) 17 | ); 18 | 19 | CREATE TABLE test_simple.contracts 20 | ( 21 | id serial, 22 | customer_company_id integer NOT NULL, 23 | customer_manager_id integer NOT NULL, 24 | amount numeric(16,4) DEFAULT 0 NOT NULL, 25 | details text, 26 | status_id integer NOT NULL, 27 | contract_expires timestamp, 28 | CONSTRAINT contracts_pk UNIQUE (id) 29 | ); 30 | 31 | -- prepare data 32 | INSERT INTO test_simple.customer_company 33 | (company_name, email, phone, site, inn) 34 | select 35 | 'company_name_' || v as company_name, 36 | 'info' || v || '@' || 'company_name_' || v || '.com' as email, 37 | 79101438060 + v as phone, 38 | 'company_name_' || v || '.com' as site, 39 | 10000000 + v * 10 as inn 40 | from generate_series(1,1512) as v; 41 | 42 | INSERT INTO test_simple.contracts 43 | (customer_company_id, customer_manager_id, amount, details, 
status_id, contract_expires) 44 | select 45 | v as customer_company_id, 46 | v as customer_manager_id, 47 | floor(v * 0.7)::integer as amount, 48 | 'details_' || v as details, 49 | v % 2, 50 | NOW() + (random() * (NOW() + '365 days' - NOW())) + '365 days' as contract_expires 51 | from generate_series(1,1512) as v; 52 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # pg_anon Dockerfile 2 | 3 | ## Usage 4 | 5 | Make image: 6 | 7 | ```bash 8 | cd pg_anon/docker 9 | make PG_VERSION=15 10 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 11 | ``` 12 | 13 | Push image: 14 | 15 | ```bash 16 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 17 | 18 | docker save -o pg_anon_22_10_23.tar pg_anon:pg15 19 | 20 | curl --fail -v --user 'user:password' --upload-file pg_anon_22_10_23.tar https://nexus.tantorlabs.ru/repository/tantorlabs-raw/ 21 | ``` 22 | 23 | ## Run container 24 | 25 | ```bash 26 | # If "The container name "/pg_anon" is already in use" 27 | # docker rm -f pg_anon 28 | 29 | docker run --name pg_anon -d pg_anon:pg15 30 | docker exec -it pg_anon bash 31 | chown -R postgres . 32 | su - postgres 33 | python3 tests/test_full.py -v 34 | exit 35 | 36 | # Run and mount directory from HOST to /usr/share/pg_anon 37 | docker rm -f pg_anon 38 | docker run --name pg_anon -v $PWD:/usr/share/pg_anon -d pg_anon:pg15 39 | ``` 40 | 41 | If tests raise an error like: `asyncpg.exceptions.ExternalRoutineError: program "gzip > ... *.dat.gz" failed` 42 | 43 | See: [Configure permission](https://github.com/TantorLabs/pg_anon#configure-permission) 44 | 45 | ## Load saved image 46 | 47 | ```bash 48 | docker load < pg_anon_22_9_12.tar 49 | ``` 50 | 51 | ## How to debug container 52 | 53 | ```bash 54 | docker exec -it pg_anon bash 55 | >> 56 | Error response from daemon: Container c876d... is not running 57 | 58 | docker logs c876d... 
59 | 60 | # Fix errors in entrypoint.sh 61 | # Set "ENTRYPOINT exec /entrypoint_dbg.sh" in Dockerfile 62 | 63 | docker rm -f pg_anon 64 | make PG_VERSION=15 65 | docker tag $(docker images -q | head -n 1) pg_anon:pg15 66 | docker run --name pg_anon -d pg_anon:pg15 67 | docker exec -it pg_anon bash 68 | ``` 69 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_include_and_skip_rules_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')" 8 | } 9 | }, 10 | { 11 | "schema": "schm_customer", 12 | "table": "customer_company", 13 | "fields": { 14 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 15 | } 16 | }, 17 | { 18 | "schema": "schm_mask_ext_exclude_2", 19 | "table": "card_numbers", 20 | "fields": { 21 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 22 | "usd": "anon_funcs.noise(\"usd\", 30)", 23 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 24 | } 25 | }, 26 | { 27 | "schema": "schm_customer", 28 | "table": "customer_manager", 29 | "fields": { 30 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 31 | } 32 | }, 33 | { 34 | "schema": "public", 35 | "table": "inn_info", 36 | "fields": { 37 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 38 | } 39 | }, 40 | { 41 | "schema": "schm_other_2", 42 | "table": "tbl_test_anon_functions", 43 | "fields": { 44 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 45 | } 46 | } 47 | ] 48 | } -------------------------------------------------------------------------------- /pg_anon/common/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | BASE_DIR = Path(__file__).resolve().parent.parent.parent 4 | RUNS_BASE_DIR = BASE_DIR / 'runs' 5 | 6 | LOGS_DIR_NAME = 'logs' 7 | LOGS_FILE_NAME = 'logs.log' 8 | SAVED_RUN_OPTIONS_FILE_NAME = 'run_options.json' 9 | SAVED_RUN_STATUS_FILE_NAME = 'run_status.json' 10 | SAVED_DICTS_INFO_FILE_NAME = 'saved_dicts_info.json' 11 | 12 | ANON_UTILS_DB_SCHEMA_NAME = 'anon_funcs' 13 | DEFAULT_HASH_FUNC = f"{ANON_UTILS_DB_SCHEMA_NAME}.digest(\"%s\", 'salt_word', 'md5')" 14 | 15 | SERVER_SETTINGS = { 16 | "application_name": "pg_anon", 17 | "statement_timeout": "0", 18 | "lock_timeout": "0", 19 | } 20 | 21 | TRANSACTIONS_SERVER_SETTINGS = { 22 | "idle_in_transaction_session_timeout": "0", 23 | "idle_session_timeout": "0", 24 | } 25 | 26 | DEFAULT_EXCLUDED_SCHEMAS = [ 27 | ANON_UTILS_DB_SCHEMA_NAME, 28 | "pg_catalog", 29 | "information_schema" 30 | ] 31 | 32 | BASE_TYPE_ALIASES = { 33 | "varbit": "bit varying", 34 | "bool": "boolean", 35 | 36 | "char": "character", 37 | "varchar": "character varying", 38 | 39 | "int": "integer", 40 | "int4": "integer", 41 | "int2": "smallint", 42 | "int8": "bigint", 43 | 44 | "float": "double precision", 45 | "float8": "double precision", 46 | "float4": "real", 47 | "decimal": "numeric", 48 | "dec": "numeric", 49 | 50 | "serial2": "smallserial", 51 | "serial4": "serial", 52 | "serial8": "bigserial", 53 | 54 | "time": "time", 55 | "timetz": "time with time zone", 56 | 57 | "timestamp": "timestamp", 58 | "timestamptz": "timestamp with time zone", 59 | 
} 60 | 61 | SENS_PG_TYPES = ["text", "character", "varchar", "mvarchar", "json", "integer", "bigint"] 62 | 63 | SECRET_RUN_OPTIONS = [ 64 | "db_user_password" 65 | ] 66 | 67 | TRACEBACK_LINES_COUNT = 100 68 | -------------------------------------------------------------------------------- /tests/sql/init_stress_env.sql: -------------------------------------------------------------------------------- 1 | do $$ 2 | declare 3 | count_tbls integer; 4 | test_res text; 5 | q_tbl text = 'CREATE TABLE stress.tbl_%s' 6 | '(' 7 | ' id serial,' 8 | ' customer_company_id integer NOT NULL,' 9 | ' first_name character varying(32),' 10 | ' last_name character varying(32),' 11 | ' name text,' 12 | ' email character varying(64),' 13 | ' phone character varying(32),' 14 | ' fld_datetime timestamp,' 15 | ' CONSTRAINT tbl_%s_pkey UNIQUE (id)' 16 | ');'; 17 | q_insert text = 'INSERT INTO stress.tbl_%s' 18 | '(customer_company_id, first_name, last_name, name, email, phone, fld_datetime)' 19 | ' select' 20 | ' v as customer_company_id,' 21 | ' ''first_name_'' || v as first_name,' 22 | ' ''last_name_'' || v as last_name,' 23 | ' (select array_to_string(array_agg(t.v::text), '' '')' 24 | ' from (' 25 | ' select anon_funcs.random_string(10) as v' 26 | ' from generate_series(1,100)' 27 | ' ) t) as name,' 28 | ' ''first_name_'' || v || ''@'' || ''company_name_'' || v || ''.com'' as email,' 29 | ' 79101538060 + v as phone,' 30 | ' NOW() + (random() * (NOW() + ''100 days'' - NOW())) + ''100 days''' 31 | ' from generate_series(1,1512) as v'; 32 | query text; 33 | begin 34 | execute 'DROP SCHEMA IF EXISTS stress CASCADE'; 35 | execute 'CREATE SCHEMA stress'; 36 | FOR i IN 1..10 LOOP 37 | query = format(q_tbl, i, i); 38 | --raise notice '%', query; 39 | execute query; 40 | query = format(q_insert, i); 41 | --raise notice '%', query; 42 | execute query; 43 | if i % 100 = 0 then 44 | raise notice 'i = %', i; 45 | end if; 46 | END LOOP; 47 | end$$; 48 | 49 | SELECT pg_size_pretty(pg_database_size(datname)), datname, pg_database_size(datname) as v 50 | from pg_database 51 | order by v desc; 52 | -->> 53 | -- 20 GB test_source_db_stress 21824553763 54 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_default_func.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { # must be anonymized without scanning 3 | "rules": [ 4 | "^fld_5_em", 5 | "^amount", 6 | "details$", 7 | "contract_expires$", 8 | "inn$" 9 | ], 10 | "constants": [ 11 | "usd", 12 | "имя_поля" 13 | ] 14 | }, 15 | "skip_rules": [ 16 | { 17 | "schema": "schm_mask_ext_exclude_2", 18 | "table": "card_numbers", # Optional. If no "table" then whole schema will be skipped 19 | "fields": ["val_skip"] # Optional. 
If no "fields" then whole table will be skipped 20 | } 21 | ], 22 | "data_regex": { 23 | "rules": [ 24 | r"""[A-Za-z0-9]+([._-][A-Za-z0-9]+)*@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+""", # email 25 | r"^(7?\d{10})$", # phone 7XXXXXXXXXX 26 | r"^other_ext_tbl_text", # catch "schm_mask_ext_exclude_2.other_ext_tbl_2" 27 | r"""[0-9]{3}-[0-9]{2}-[0-9]{4}""", # social Security numbers "nnn-nn-nnnn" 28 | r"""\b[0-9A-Z]{3}([^ 0-9A-Z]|\s)?[0-9]{4}\b""", # license plate numbers aaa-nnnn 29 | r"""^\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}$""", # IPV4 addresses 30 | r"""^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$""", # Dates in MM/DD/YYYY format 31 | # MasterCard numbers 5258704108753590 32 | r"""^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$""", 33 | # Visa card numbers 4563-7568-5698-4587 34 | r"""\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b""", 35 | # Any card number 36 | r"""[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}""", 37 | # URLs 38 | r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()]+|\(([^\s()]+|(\([^\s()]+\)))*\))+(?:\(([^\s()]+|(\([^\s()]+\)))*\)|[^\s`!()\[\]{};:'".,?«»“”‘’]))""", 39 | r"""[0-9]{2}-[0-9]{7}""" # INN from 1c 40 | ] 41 | }, 42 | "data_const": { 43 | "constants": [ 44 | "account", 45 | "email", 46 | "слово", 47 | "сергей" 48 | ] 49 | }, 50 | "sens_pg_types": [ 51 | "text", 52 | "integer", 53 | "bigint", 54 | "character", 55 | "json" 56 | ], 57 | "funcs": { 58 | "default": "anon_funcs.digest(\"%s\", 'by_default_func', 'sha256')", 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN apt update && \ 4 | apt install -y wget vim nano htop tree sysbench net-tools sysstat less iotop && \ 5 | apt -y install curl gpg gnupg2 apt-transport-https lsb-release ca-certificates && \ 6 | apt -y install software-properties-common && \ 7 | apt -y install python3-pip && \ 8 | apt install -y locales && locale-gen en_US.UTF-8 && \ 9 | rm -rf /tmp/* && apt purge -y --auto-remove && apt clean -y autoclean 10 | 11 | ARG PG_VERSION 12 | ARG DEBIAN_FRONTEND=noninteractive 13 | 14 | # RUN if [ "$PG_VERSION" = "13" ]; then curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc| gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg; fi 15 | # RUN if [ "$PG_VERSION" = "13" ]; then echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list; fi 16 | # RUN if [ "$PG_VERSION" = "13" ]; then apt update ; fi 17 | 18 | 19 | # ======================================= 20 | # Ubuntu 20.04 21 | RUN curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc| gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg 22 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list 23 | RUN apt update 24 | # ======================================= 25 | 26 | RUN apt -y install postgresql-${PG_VERSION} postgresql-client-${PG_VERSION} 27 | 28 | RUN add-apt-repository ppa:deadsnakes/ppa && \ 29 | apt update && \ 30 | apt install -y python3.12 python3.12-distutils python3.12-dev 31 | 32 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 33 | 34 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 
35 | python3.12 get-pip.py && \ 36 | rm get-pip.py 37 | 38 | # Add pg_anon 39 | ADD ./pg_anon /usr/share/pg_anon 40 | 41 | RUN pip3 install -r /usr/share/pg_anon/requirements.txt 42 | 43 | EXPOSE 5432 44 | 45 | ENV PG_VERSION=${PG_VERSION} 46 | 47 | ADD entrypoint.sh /entrypoint.sh 48 | RUN chmod +x /entrypoint.sh 49 | 50 | ADD entrypoint_dbg.sh /entrypoint_dbg.sh 51 | RUN chmod +x /entrypoint_dbg.sh 52 | 53 | ADD motd /etc/motd 54 | 55 | WORKDIR /usr/share/pg_anon 56 | 57 | ENTRYPOINT exec /entrypoint.sh 58 | # ENTRYPOINT exec /entrypoint_dbg.sh 59 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_type_aliases_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | "fields": { 7 | "field_type_bit": "anon_funcs.digest(\"field_type_bit\", 'bit(5)', 'md5')", 8 | "field_type_bool": "anon_funcs.digest(\"field_type_bool\", 'bool', 'md5')", 9 | "field_type_char": "anon_funcs.digest(\"field_type_char\", 'char(5)', 'md5')", 10 | "field_type_decimal": "anon_funcs.digest(\"field_type_decimal\", 'decimal(10,2)', 'md5')", 11 | "field_type_float": "anon_funcs.digest(\"field_type_float\", 'float8', 'md5')", 12 | "field_type_float4": "anon_funcs.digest(\"field_type_float4\", 'float4', 'md5')", 13 | "field_type_float8": "anon_funcs.digest(\"field_type_float8\", 'float8', 'md5')", 14 | "field_type_int": "anon_funcs.digest(\"field_type_int\", 'int4', 'md5')", 15 | "field_type_int2": "anon_funcs.digest(\"field_type_int2\", 'int2', 'md5')", 16 | "field_type_int4": "anon_funcs.digest(\"field_type_int4\", 'int4', 'md5')", 17 | "field_type_int8": "anon_funcs.digest(\"field_type_int8\", 'int8', 'md5')", 18 | "field_type_time": "anon_funcs.digest(\"field_type_time\", 'time', 'md5')", 19 | "field_type_time_p": "anon_funcs.digest(\"field_type_time_p\", 'time(3)', 'md5')", 20 | "field_type_timestamp": "anon_funcs.digest(\"field_type_timestamp\", 'timestamp', 'md5')", 21 | "field_type_timestamp_p": "anon_funcs.digest(\"field_type_timestamp_p\", 'timestamp(3)', 'md5')", 22 | "field_type_timestamptz": "anon_funcs.digest(\"field_type_timestamptz\", 'timestamptz', 'md5')", 23 | "field_type_timestamptz_p": "anon_funcs.digest(\"field_type_timestamptz_p\", 'timestamptz(3)', 'md5')", 24 | "field_type_timetz": "anon_funcs.digest(\"field_type_timetz\", 'timetz', 'md5')", 25 | "field_type_timetz_p": "anon_funcs.digest(\"field_type_timetz_p\", 'timetz(3)', 'md5')", 26 | "field_type_varbit": "anon_funcs.digest(\"field_type_varbit\", 'varbit(5)', 'md5')", 27 | "field_type_varchar": "anon_funcs.digest(\"field_type_varchar\", 'varchar(20)', 'md5')" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_type_aliases_complex_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | "fields": { 7 | "field_type_bit": "anon_funcs.digest(\"field_type_bit\", 'default', 'md5')", 8 | "field_type_bool": "anon_funcs.digest(\"field_type_bool\", 'default', 'md5')", 9 | "field_type_char": "anon_funcs.digest(\"field_type_char\", 'default', 'md5')", 10 | "field_type_decimal": "anon_funcs.digest(\"field_type_decimal\", 'default', 'md5')", 11 | "field_type_float": 
"anon_funcs.digest(\"field_type_float\", 'float', 'md5')", 12 | "field_type_float4": "anon_funcs.digest(\"field_type_float4\", 'default', 'md5')", 13 | "field_type_float8": "anon_funcs.digest(\"field_type_float8\", 'float', 'md5')", 14 | "field_type_int": "anon_funcs.digest(\"field_type_int\", 'default', 'md5')", 15 | "field_type_int2": "anon_funcs.digest(\"field_type_int2\", 'default', 'md5')", 16 | "field_type_int4": "anon_funcs.digest(\"field_type_int4\", 'default', 'md5')", 17 | "field_type_int8": "anon_funcs.digest(\"field_type_int8\", 'default', 'md5')", 18 | "field_type_time": "anon_funcs.digest(\"field_type_time\", 'default', 'md5')", 19 | "field_type_time_p": "anon_funcs.digest(\"field_type_time_p\", 'time(3)', 'md5')", 20 | "field_type_timestamp": "anon_funcs.digest(\"field_type_timestamp\", 'default', 'md5')", 21 | "field_type_timestamp_p": "anon_funcs.digest(\"field_type_timestamp_p\", 'default', 'md5')", 22 | "field_type_timestamptz": "anon_funcs.digest(\"field_type_timestamptz\", 'default', 'md5')", 23 | "field_type_timestamptz_p": "anon_funcs.digest(\"field_type_timestamptz_p\", 'default', 'md5')", 24 | "field_type_timetz": "anon_funcs.digest(\"field_type_timetz\", 'default', 'md5')", 25 | "field_type_timetz_p": "anon_funcs.digest(\"field_type_timetz_p\", 'timetz(3)', 'md5')", 26 | "field_type_varbit": "anon_funcs.digest(\"field_type_varbit\", 'varbit(5)', 'md5')", 27 | "field_type_varchar": "anon_funcs.digest(\"field_type_varchar\", 'varchar(20)', 'md5')" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /rest_api/runners/background/base.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import List 3 | 4 | from pg_anon.common.constants import BASE_DIR 5 | from pg_anon.common.dto import PgAnonResult 6 | from rest_api.constants import BASE_TEMP_DIR 7 | from rest_api.pydantic_models import StatelessRunnerRequest 8 | from rest_api.utils import run_pg_anon_worker 9 | 10 | 11 | class BaseRunner: 12 | mode: str 13 | request: StatelessRunnerRequest 14 | operation_id: str 15 | cli_params: List[str] = None 16 | result: PgAnonResult = None 17 | 18 | def __init__(self, request: StatelessRunnerRequest): 19 | self.request = request 20 | self.operation_id = request.operation_id 21 | self.base_tmp_dir = BASE_TEMP_DIR / f'{self.operation_id}__{uuid.uuid4()}' 22 | self._prepare_cli_params() 23 | 24 | def _prepare_db_credentials_cli_params(self): 25 | self.cli_params.extend([ 26 | f'--db-host={self.request.db_connection_params.host}', 27 | f'--db-port={self.request.db_connection_params.port}', 28 | f'--db-user={self.request.db_connection_params.user_login}', 29 | f'--db-user-password={self.request.db_connection_params.user_password}', 30 | f'--db-name={self.request.db_connection_params.db_name}', 31 | ]) 32 | 33 | def _prepare_config(self): 34 | config_file_path = BASE_DIR / "config.yml" 35 | if config_file_path.exists(): 36 | self.cli_params.extend([ 37 | f"--config={str(config_file_path)}", 38 | ]) 39 | 40 | def _prepare_verbosity_cli_params(self): 41 | self.cli_params.extend([ 42 | "--debug", 43 | ]) 44 | 45 | def _prepare_other_cli_params(self): 46 | if self.request.save_dicts: 47 | self.cli_params.extend([ 48 | "--save-dicts", 49 | ]) 50 | 51 | def _prepare_cli_params(self): 52 | self.cli_params = [] 53 | self._prepare_db_credentials_cli_params() 54 | self._prepare_config() 55 | self._prepare_other_cli_params() 56 | 57 | async def run(self): 58 | if not self.mode: 
59 | raise ValueError('Mode is not set') 60 | 61 | self.result = await run_pg_anon_worker( 62 | mode=self.mode, 63 | operation_id=self.operation_id, 64 | cli_run_params=self.cli_params 65 | ) 66 | 67 | if not self.result: 68 | raise RuntimeError('Operation not completed successfully') 69 | 70 | return self.result 71 | -------------------------------------------------------------------------------- /pg_anon/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from pathlib import Path 4 | 5 | from concurrent_log_handler import ConcurrentRotatingFileHandler 6 | 7 | 8 | class Logger: 9 | _instance = None 10 | _formatter: logging.Formatter 11 | 12 | logger = None 13 | 14 | def __new__(cls): 15 | if cls._instance is not None: 16 | return cls._instance 17 | 18 | cls._instance = super().__new__(cls) 19 | cls._instance.logger = logging.getLogger('pg_anon.logger') 20 | cls._instance.logger.setLevel(logging.INFO) 21 | 22 | cls._instance._formatter = logging.Formatter( 23 | datefmt="%Y-%m-%d %H:%M:%S", 24 | fmt="%(asctime)s,%(msecs)03d - %(levelname)8s - %(message)s", 25 | ) 26 | 27 | handler = logging.StreamHandler(sys.stdout) 28 | handler.setFormatter(cls._instance._formatter) 29 | cls._instance.logger.addHandler(handler) 30 | 31 | return cls._instance 32 | 33 | def add_file_handler(self, log_dir: Path, log_file_name: str): 34 | for handler in list(self.logger.handlers): 35 | if isinstance(handler, logging.FileHandler): 36 | self.logger.removeHandler(handler) 37 | handler.close() 38 | 39 | log_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | file_handler = ConcurrentRotatingFileHandler( 42 | log_dir / log_file_name, 43 | maxBytes=10 * 1024 * 1024, 44 | backupCount=10, 45 | ) 46 | file_handler.setFormatter(self._formatter) 47 | self.logger.addHandler(file_handler) 48 | 49 | def set_log_level(self, log_level: int): 50 | self.logger.setLevel(log_level) 51 | 52 | def __del__(self): 53 | # Close all handlers when the class instance is destroyed 54 | for handler in self.logger.handlers.copy(): 55 | try: 56 | handler.acquire() 57 | handler.flush() 58 | handler.close() 59 | except Exception as e: 60 | print(f"Error closing log handler: {e}") 61 | finally: 62 | handler.release() 63 | self.logger.removeHandler(handler) 64 | 65 | 66 | def get_logger(): 67 | return Logger().logger 68 | 69 | 70 | def logger_add_file_handler(log_dir: Path, log_file_name: str): 71 | Logger().add_file_handler( 72 | log_dir=log_dir, 73 | log_file_name=log_file_name, 74 | ) 75 | 76 | 77 | def logger_set_log_level(log_level: int): 78 | Logger().set_log_level(log_level) 79 | -------------------------------------------------------------------------------- /docs/operations/init.md: -------------------------------------------------------------------------------- 1 | # 🏗️ Init 2 | 3 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 4 | 5 | ## Overview 6 | 7 | This mode creates the `anon_funcs` schema in the source database and loads the predefined SQL functions from [init.sql](../../init.sql). 8 | These functions are required for processing data in the source database. 
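Once init has finished, the loaded helpers can be called directly in SQL. A minimal sanity check, using `anon_funcs.digest` and `anon_funcs.partial_email` — two of the functions referenced by the test dictionaries in this repository (the sample email value is arbitrary):

```sql
-- Both calls should succeed once init has created the anon_funcs schema:
SELECT anon_funcs.digest('john.doe@example.com', 'salt_word', 'md5');
SELECT anon_funcs.partial_email('john.doe@example.com');
```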
9 | 10 | ## Run example 11 | 12 | ```commandline 13 | python -m pg_anon --mode=init \ 14 | --db-user=postgres \ 15 | --db-user-password=postgres \ 16 | --db-name=source_db 17 | ``` 18 | 19 | --- 20 | 21 | ## Options 22 | 23 | ### Common pg_anon options: 24 | 25 | | Option | Required | Description | 26 | |-------------|----------|--------------------------------------------------------------------------------------------------| 27 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 28 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 29 | 30 | ### Database configuration options: 31 | 32 | | Option | Required | Description | 33 | |----------------------|----------|---------------------------------------------------------------------| 34 | | `--db-host` | Yes | Database host. | 35 | | `--db-port` | Yes | Database port. | 36 | | `--db-name` | Yes | Database name. | 37 | | `--db-user` | Yes | Database user. | 38 | | `--db-user-password` | No | Database user password. | 39 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 40 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 41 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 42 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 43 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict.py: -------------------------------------------------------------------------------- 1 | { 2 | "field": { # must be anonymized without scanning 3 | "rules": [ 4 | "^fld_5_em", 5 | "^amount", 6 | "details$", 7 | "contract_expires$", 8 | "inn$" 9 | ], 10 | "constants": [ 11 | "usd", 12 | "имя_поля" 13 | ] 14 | }, 15 | "skip_rules": [ 16 | { 17 | "schema": "schm_mask_ext_exclude_2", 18 | "table": "card_numbers", # Optional. If no "table" then whole schema will be skipped 19 | "fields": ["val_skip"] # Optional. 
If no "fields" then whole table will be skipped 20 | }, 21 | { 22 | "schema": "schm_other_3", 23 | }, 24 | ], 25 | "data_regex": { 26 | "rules": [ 27 | r"""[A-Za-z0-9]+([._-][A-Za-z0-9]+)*@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+""", # email 28 | r"^(7?\d{10})$", # phone 7XXXXXXXXXX 29 | r"^other_ext_tbl_text", # catch "schm_mask_ext_exclude_2.other_ext_tbl_2" 30 | r"""[0-9]{3}-[0-9]{2}-[0-9]{4}""", # social Security numbers "nnn-nn-nnnn" 31 | r"""\b[0-9A-Z]{3}([^ 0-9A-Z]|\s)?[0-9]{4}\b""", # license plate numbers aaa-nnnn 32 | r"""^\d{1,3}[.]\d{1,3}[.]\d{1,3}[.]\d{1,3}$""", # IPV4 addresses 33 | r"""^([1][12]|[0]?[1-9])[\/-]([3][01]|[12]\d|[0]?[1-9])[\/-](\d{4}|\d{2})$""", # Dates in MM/DD/YYYY format 34 | # MasterCard numbers 5258704108753590 35 | r"""^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$""", 36 | # Visa card numbers 4563-7568-5698-4587 37 | r"""\b([4]\d{3}[\s]\d{4}[\s]\d{4}[\s]\d{4}|[4]\d{3}[-]\d{4}[-]\d{4}[-]\d{4}|[4]\d{3}[.]\d{4}[.]\d{4}[.]\d{4}|[4]\d{3}\d{4}\d{4}\d{4})\b""", 38 | # Any card number 39 | r"""[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}""", 40 | # URLs 41 | r"""(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()]+|\(([^\s()]+|(\([^\s()]+\)))*\))+(?:\(([^\s()]+|(\([^\s()]+\)))*\)|[^\s`!()\[\]{};:'".,?«»“”‘’]))""", 42 | r"""[0-9]{2}-[0-9]{7}""" # INN from 1c 43 | ] 44 | }, 45 | "data_const": { 46 | "constants": [ 47 | "account", 48 | "email", 49 | "слово", 50 | "сергей" 51 | ] 52 | }, 53 | "sens_pg_types": [ 54 | "text", 55 | "integer", 56 | "bigint", 57 | "varchar", 58 | "json" 59 | ], 60 | "funcs": { 61 | "text": "anon_funcs.digest(\"%s\", 'salt_word', 'md5')", 62 | "numeric": "anon_funcs.noise(\"%s\", 10)", 63 | "numeric(30,4)": "anon_funcs.noise(\"%s\", 30)", 64 | "timestamp": "anon_funcs.dnoise(\"%s\", interval '6 month')", 65 | "bigint": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')", 66 | "integer": "anon_funcs.random_int_between(1, 10)", 67 | "mvarchar": "anon_funcs.digest(\"%s\"::text, 'salt_word', 'md5')" 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /rest_api/runners/background/dump.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.enums import AnonMode 2 | from rest_api.enums import DumpMode 3 | from rest_api.pydantic_models import DumpRequest 4 | from rest_api.runners.background import BaseRunner 5 | from rest_api.utils import write_dictionary_contents 6 | 7 | 8 | class DumpRunner(BaseRunner): 9 | mode: str = AnonMode.DUMP.value 10 | request: DumpRequest 11 | full_dump_path: str 12 | 13 | def __init__(self, request: DumpRequest): 14 | super().__init__(request) 15 | self._set_mode() 16 | 17 | def _set_mode(self): 18 | if self.request.type == DumpMode.FULL: 19 | self.mode = AnonMode.DUMP.value 20 | elif self.request.type == DumpMode.STRUCT: 21 | self.mode = AnonMode.SYNC_STRUCT_DUMP.value 22 | elif self.request.type == DumpMode.DATA: 23 | self.mode = AnonMode.SYNC_DATA_DUMP.value 24 | 25 | def _prepare_dictionaries_cli_params(self): 26 | input_sens_dict_file_names = list( 27 | write_dictionary_contents(self.request.sens_dict_contents, self.base_tmp_dir).keys() 28 | ) 29 | self.cli_params.append(f"--prepared-sens-dict-file={','.join(input_sens_dict_file_names)}") 30 | 31 | if self.request.partial_tables_dict_contents: 32 | input_partial_tables_dict_file_names = list( 33 | write_dictionary_contents(self.request.partial_tables_dict_contents, 
self.base_tmp_dir).keys() 34 | ) 35 | self.cli_params.append( 36 | f"--partial-tables-dict-file={','.join(input_partial_tables_dict_file_names)}" 37 | ) 38 | 39 | if self.request.partial_tables_exclude_dict_contents: 40 | input_partial_tables_exclude_dict_file_names = list( 41 | write_dictionary_contents(self.request.partial_tables_exclude_dict_contents, self.base_tmp_dir).keys() 42 | ) 43 | self.cli_params.append( 44 | f"--partial-tables-exclude-dict-file={','.join(input_partial_tables_exclude_dict_file_names)}" 45 | ) 46 | 47 | def _prepare_dump_path_cli_params(self): 48 | self.full_dump_path = self.request.validated_output_path 49 | self.cli_params.extend([ 50 | f'--output-dir={self.full_dump_path}', 51 | '--clear-output-dir', 52 | ]) 53 | 54 | def _prepare_parallelization_cli_params(self): 55 | if self.request.proc_count: 56 | self.cli_params.append( 57 | f'--processes={self.request.proc_count}' 58 | ) 59 | 60 | if self.request.proc_conn_count: 61 | self.cli_params.append( 62 | f'--db-connections-per-process={self.request.proc_conn_count}' 63 | ) 64 | 65 | def _prepare_pg_dump_cli_params(self): 66 | if self.request.pg_dump_path: 67 | self.cli_params.append( 68 | f'--pg-dump={self.request.pg_dump_path}' 69 | ) 70 | 71 | def _prepare_cli_params(self): 72 | super()._prepare_cli_params() 73 | self._prepare_dictionaries_cli_params() 74 | self._prepare_dump_path_cli_params() 75 | self._prepare_parallelization_cli_params() 76 | self._prepare_pg_dump_cli_params() 77 | self._prepare_verbosity_cli_params() 78 | -------------------------------------------------------------------------------- /tests/input_dict/test.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema":"schm_other_1", 5 | "table":"some_tbl", 6 | "fields": { 7 | "val":"'text const'" 8 | } 9 | }, 10 | { 11 | "schema":"schm_other_2", 12 | "table":"some_tbl", 13 | "raw_sql": "SELECT id, val || ' modified' as val FROM schm_other_2.some_tbl" 14 | }, 15 | { 16 | "schema":"public", 17 | "table":"key_value", 18 | "fields": { 19 | "fld_value":"""SQL: 20 | CASE 21 | WHEN "fld_key" ILIKE '%email%' THEN CONCAT(md5(random()::TEXT),'@domain.com') 22 | WHEN "fld_key" ILIKE '%password%' THEN md5(fld_value) 23 | WHEN "fld_key" ILIKE '%address%' THEN 'test address' 24 | WHEN "fld_key" ILIKE '%login%' THEN 'test_login' 25 | WHEN "fld_key" ILIKE '%name%' THEN 'test_name' 26 | WHEN "fld_key" ILIKE '%amount%' THEN (select anon_funcs.noise(fld_value::int, 1000.2)::text) 27 | ELSE fld_value 28 | END""" 29 | } 30 | }, 31 | { 32 | "schema":"_SCHM.$complex#имя;@&* a'", 33 | "table":"_TBL.$complex#имя;@&* a'2", 34 | "fields": { 35 | "_FLD.$complex#имя;@&* a'": "'text const'" 36 | } 37 | }, 38 | { 39 | "schema":"_SCHM.$complex#имя;@&* a'", 40 | "table":"_TBL.$complex#имя;@&* a'3", 41 | "raw_sql": """ 42 | SELECT id, fld_key, "_FLD.$complex#имя;@&* a'" || ' (modified)' as "_FLD.$complex#имя;@&* a'" 43 | FROM "_SCHM.$complex#имя;@&* a'"."_TBL.$complex#имя;@&* a'3" 44 | """ 45 | }, 46 | { 47 | "schema":"schm_other_2", 48 | "table":"tbl_test_anon_functions", 49 | "fields": { 50 | "fld_1_int": "anon_funcs.noise(fld_1_int, 2000)", 51 | "fld_2_datetime": "anon_funcs.dnoise(fld_2_datetime, interval '1 month')", 52 | "fld_3_txt": "anon_funcs.digest(fld_3_txt, 'salt', 'sha256') ", 53 | "fld_4_txt": "anon_funcs.partial(fld_4_txt,1,'***',3)", 54 | "fld_5_email": "anon_funcs.partial_email(fld_5_email)", 55 | "fld_6_txt": "anon_funcs.random_string(7)", 56 | "fld_7_zip": "anon_funcs.random_zip()", 57 
| "fld_8_datetime": """ 58 | anon_funcs.random_date_between( 59 | fld_8_datetime - interval '1 year', 60 | fld_8_datetime + interval '1 year' 61 | ) 62 | """, 63 | "fld_9_datetime": "anon_funcs.random_date()", 64 | "fld_10_int": "anon_funcs.random_int_between(fld_10_int - 1000, fld_10_int + 2000)", 65 | "fld_11_int": "anon_funcs.random_bigint_between(6000000000, 7000000000)", 66 | "fld_12_phone": "anon_funcs.random_phone('+7')", 67 | "fld_13_txt": "anon_funcs.random_hash('seed', 'sha512')", 68 | "fld_14_txt": "anon_funcs.random_in(array['a', 'b', 'c'])", 69 | "fld_15_txt": "anon_funcs.hex_to_int(fld_15_txt)::text" 70 | } 71 | }, 72 | { 73 | "schema_mask": "^schm_mask_incl", 74 | "table_mask": "^some_t", 75 | "fields": { 76 | "val": "'text const'" 77 | } 78 | }, 79 | { 80 | "schema_mask": "^schm_mask_incl", 81 | "table": "tbl_123", 82 | "fields": { 83 | "val": "'text const'" 84 | } 85 | }, 86 | { 87 | "schema": "schm_mask_include_1", 88 | "table_mask": "\w+\_\d+\_\d+", 89 | "fields": { 90 | "val": "'text const'" 91 | } 92 | } 93 | ], 94 | "dictionary_exclude": [ 95 | { 96 | "schema":"schm_other_2", 97 | "table":"exclude_tbl" 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /docs/dicts/non-sens-dict-schema.md: -------------------------------------------------------------------------------- 1 | # 📋 Non-Sensitive Dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [🔍 Scan](../operations/scan.md) | [🗂️ Meta Dictionary](meta-dict-schema.md) | [🔐 Sensitive Dictionary](sens-dict-schema.md) | 3 | 4 | The non-sensitive dictionary is used only during the [create-dict (scan) mode](../operations/scan.md) to speed up processing. 5 | It defines which fields should be treated as non-sensitive. Fields listed here are **excluded** from all sensitivity checks according to [meta-dictionary](meta-dict-schema.md) rules. 6 | 7 | This dictionary can be created manually or generated automatically using [create-dict (scan) mode](../operations/scan.md) with `--output-no-sens-dict-file` option. 8 | 9 | > ⚠️ **Note** 10 | > 11 | > If a field appears both in the [sensitive dictionary](sens-dict-schema.md) and the non-sensitive dictionary, the sensitive dictionary takes priority. 
12 | 13 | --- 14 | 15 | ## Schema 16 | ```python 17 | { 18 | "no_sens_dictionary": [ 19 | { 20 | "schema": "", 21 | "table": "", 22 | "fields": [ 23 | "", 24 | ] 25 | }, 26 | ] 27 | } 28 | ``` 29 | 30 | --- 31 | 32 | ## ⚙️ Using the Dictionary 33 | 34 | **🏛️ Example Tables Structure** 35 | 36 | | Schema | Table | Field | 37 | |-----------|-----------|------------------| 38 | | public | employees | id | 39 | | public | employees | full_name | 40 | | public | employees | email | 41 | | public | employees | hire_date | 42 | | public | salaries | employee_id | 43 | | public | salaries | monthly_salary | 44 | | public | salaries | currency | 45 | 46 | **📘 Example Non-Sensitive Dictionary** 47 | ```python 48 | { 49 | "no_sens_dictionary": [ 50 | { 51 | "schema": "public", 52 | "table": "employees", 53 | "fields": [ 54 | "id", 55 | "hire_date", 56 | ] 57 | }, 58 | { 59 | "schema": "public", 60 | "table": "salaries", 61 | "fields": [ 62 | "employee_id", 63 | "currency", 64 | ] 65 | }, 66 | ] 67 | } 68 | ``` 69 | 70 | **This dictionary matches the following table fields:** 71 | 72 | | Schema | Table | Field | Used in `create-dict (scan)` mode | 73 | |----------|------------|------------------|------------------------------------------------------------| 74 | | public | employees | id | Excluded from sensitivity checks as a "no sensitive" field | 75 | | public | employees | full_name | Fields scanned using meta-dictionary rules | 76 | | public | employees | email | Fields scanned using meta-dictionary rules | 77 | | public | employees | hire_date | Excluded from sensitivity checks as a "no sensitive" field | 78 | | public | salaries | employee_id | Excluded from sensitivity checks as a "no sensitive" field | 79 | | public | salaries | monthly_salary | Fields scanned using meta-dictionary rules | 80 | | public | salaries | currency | Excluded from sensitivity checks as a "no sensitive" field | 81 | -------------------------------------------------------------------------------- /rest_api/runners/background/restore.py: -------------------------------------------------------------------------------- 1 | from pg_anon.common.enums import AnonMode 2 | from rest_api.enums import RestoreMode 3 | from rest_api.pydantic_models import RestoreRequest 4 | from rest_api.runners.background import BaseRunner 5 | from rest_api.utils import write_dictionary_contents 6 | 7 | 8 | class RestoreRunner(BaseRunner): 9 | mode: str = AnonMode.RESTORE.value 10 | request: RestoreRequest 11 | full_input_path: str 12 | 13 | def __init__(self, request: RestoreRequest): 14 | super().__init__(request) 15 | self._set_mode() 16 | 17 | def _set_mode(self): 18 | if self.request.type == RestoreMode.FULL: 19 | self.mode = AnonMode.RESTORE.value 20 | elif self.request.type == RestoreMode.STRUCT: 21 | self.mode = AnonMode.SYNC_STRUCT_RESTORE.value 22 | elif self.request.type == RestoreMode.DATA: 23 | self.mode = AnonMode.SYNC_DATA_RESTORE.value 24 | 25 | def _prepare_dictionaries_cli_params(self): 26 | if self.request.partial_tables_dict_contents: 27 | input_partial_tables_dict_file_names = list( 28 | write_dictionary_contents(self.request.partial_tables_dict_contents, self.base_tmp_dir).keys() 29 | ) 30 | self.cli_params.append( 31 | f"--partial-tables-dict-file={','.join(input_partial_tables_dict_file_names)}" 32 | ) 33 | 34 | if self.request.partial_tables_exclude_dict_contents: 35 | input_partial_tables_exclude_dict_file_names = list( 36 | write_dictionary_contents(self.request.partial_tables_exclude_dict_contents, 
self.base_tmp_dir).keys() 37 |             ) 38 |             self.cli_params.append( 39 |                 f"--partial-tables-exclude-dict-file={','.join(input_partial_tables_exclude_dict_file_names)}" 40 |             ) 41 | 42 |     def _prepare_input_dump_path_cli_params(self): 43 |         self.full_input_path = self.request.validated_input_path 44 |         self.cli_params.extend([ 45 |             f'--input-dir={self.full_input_path}', 46 |         ]) 47 | 48 |     def _prepare_parallelization_cli_params(self): 49 |         if self.request.proc_conn_count: 50 |             self.cli_params.append( 51 |                 f'--db-connections-per-process={self.request.proc_conn_count}' 52 |             ) 53 | 54 |     def _prepare_pg_restore_cli_params(self): 55 |         if self.request.pg_restore_path: 56 |             self.cli_params.append( 57 |                 f'--pg-restore={self.request.pg_restore_path}' 58 |             ) 59 | 60 |     def _prepare_additional_cli_params(self): 61 |         if self.request.drop_custom_check_constr: 62 |             self.cli_params.append( 63 |                 '--drop-custom-check-constr' 64 |             ) 65 |         if self.request.clean_db: 66 |             self.cli_params.append( 67 |                 '--clean-db' 68 |             ) 69 |         if self.request.drop_db: 70 |             self.cli_params.append( 71 |                 '--drop-db' 72 |             ) 73 | 74 |     def _prepare_cli_params(self): 75 |         super()._prepare_cli_params() 76 |         self._prepare_dictionaries_cli_params() 77 |         self._prepare_input_dump_path_cli_params() 78 |         self._prepare_parallelization_cli_params() 79 |         self._prepare_pg_restore_cli_params() 80 |         self._prepare_additional_cli_params() 81 |         self._prepare_verbosity_cli_params() 82 | -------------------------------------------------------------------------------- /rest_api/runners/background/scan.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pg_anon.common.enums import AnonMode 4 | from rest_api.enums import ScanMode 5 | from rest_api.pydantic_models import ScanRequest 6 | from rest_api.runners.background import BaseRunner 7 | from rest_api.utils import write_dictionary_contents 8 | 9 | 10 | class ScanRunner(BaseRunner): 11 |     mode: str = AnonMode.CREATE_DICT.value 12 |     request: ScanRequest 13 |     output_sens_dict_file_name: str 14 |     output_no_sens_dict_file_name: Optional[str] = None 15 | 16 |     def _prepare_dictionaries_cli_params(self): 17 |         input_meta_dict_file_names = list( 18 |             write_dictionary_contents(self.request.meta_dict_contents, self.base_tmp_dir).keys() 19 |         ) 20 | 21 |         input_sens_dict_file_names = None 22 |         if self.request.sens_dict_contents: 23 |             input_sens_dict_file_names = list( 24 |                 write_dictionary_contents(self.request.sens_dict_contents, self.base_tmp_dir).keys() 25 |             ) 26 | 27 |         input_no_sens_dict_file_names = None 28 |         if self.request.no_sens_dict_contents: 29 |             input_no_sens_dict_file_names = list( 30 |                 write_dictionary_contents(self.request.no_sens_dict_contents, self.base_tmp_dir).keys() 31 |             ) 32 | 33 |         self.output_sens_dict_file_name = self.base_tmp_dir / 'output_sens_dict.py' 34 | 35 |         self.cli_params.extend([ 36 |             f"--meta-dict-file={','.join(input_meta_dict_file_names)}", 37 |             f"--output-sens-dict-file={self.output_sens_dict_file_name}", 38 |         ]) 39 | 40 |         if self.request.need_no_sens_dict: 41 |             self.output_no_sens_dict_file_name = self.base_tmp_dir / 'output_no_sens_dict.py' 42 |             self.cli_params.append( 43 |                 f"--output-no-sens-dict-file={self.output_no_sens_dict_file_name}", 44 |             ) 45 | 46 |         if input_sens_dict_file_names: 47 |             self.cli_params.append( 48 |                 f"--prepared-sens-dict-file={','.join(input_sens_dict_file_names)}" 49 |             ) 50 | 51 |         if input_no_sens_dict_file_names: 52 |             self.cli_params.append( 53 |
f"--prepared-no-sens-dict-file={','.join(input_no_sens_dict_file_names)}" 54 | ) 55 | 56 | def _prepare_parallelization_cli_params(self): 57 | if self.request.proc_count: 58 | self.cli_params.append( 59 | f'--processes={self.request.proc_count}' 60 | ) 61 | 62 | if self.request.proc_conn_count: 63 | self.cli_params.append( 64 | f'--db-connections-per-process={self.request.proc_conn_count}' 65 | ) 66 | 67 | def _prepare_scan_mode_cli_params(self): 68 | if self.request.type == ScanMode.PARTIAL and self.request.depth: 69 | self.cli_params.extend([ 70 | f'--scan-mode={ScanMode.PARTIAL.value}', 71 | f'--scan-partial-rows={self.request.depth}', 72 | ]) 73 | else: 74 | self.cli_params.append( 75 | f'--scan-mode={ScanMode.FULL.value}' 76 | ) 77 | 78 | def _prepare_cli_params(self): 79 | super()._prepare_cli_params() 80 | self._prepare_dictionaries_cli_params() 81 | self._prepare_parallelization_cli_params() 82 | self._prepare_scan_mode_cli_params() 83 | self._prepare_verbosity_cli_params() 84 | -------------------------------------------------------------------------------- /tests/expected_results/PGAnonMaskUnitTest_target_tables.result: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "public", 4 | "contracts", 5 | "amount", 6 | [ 7 | [ 8 | 101010.0 9 | ], 10 | [ 11 | 101010.0 12 | ], 13 | [ 14 | 101010.0 15 | ], 16 | [ 17 | 101010.0 18 | ], 19 | [ 20 | 101010.0 21 | ] 22 | ] 23 | ], 24 | [ 25 | "public", 26 | "tbl_100", 27 | "amount", 28 | [ 29 | [ 30 | 202020.0 31 | ], 32 | [ 33 | 202020.0 34 | ], 35 | [ 36 | 202020.0 37 | ], 38 | [ 39 | 202020.0 40 | ], 41 | [ 42 | 202020.0 43 | ] 44 | ] 45 | ], 46 | [ 47 | "schm_other_1", 48 | "some_tbl", 49 | "val", 50 | [ 51 | [ 52 | "text const" 53 | ], 54 | [ 55 | "text const" 56 | ], 57 | [ 58 | "text const" 59 | ], 60 | [ 61 | "text const" 62 | ], 63 | [ 64 | "text const" 65 | ] 66 | ] 67 | ], 68 | [ 69 | "schm_other_2", 70 | "some_tbl", 71 | "val", 72 | [ 73 | [ 74 | "text_val_1 modified" 75 | ], 76 | [ 77 | "text_val_2 modified" 78 | ], 79 | [ 80 | "text_val_3 modified" 81 | ], 82 | [ 83 | "text_val_4 modified" 84 | ], 85 | [ 86 | "text_val_5 modified" 87 | ] 88 | ] 89 | ], 90 | [ 91 | "schm_other_4", 92 | "partitioned_table", 93 | "amount", 94 | [ 95 | [ 96 | 101010.0 97 | ], 98 | [ 99 | 101010.0 100 | ], 101 | [ 102 | 101010.0 103 | ], 104 | [ 105 | 101010.0 106 | ], 107 | [ 108 | 101010.0 109 | ] 110 | ] 111 | ], 112 | [ 113 | "schm_other_4", 114 | "partitioned_table_2025_01", 115 | "amount", 116 | [ 117 | [ 118 | 101010.0 119 | ], 120 | [ 121 | 101010.0 122 | ] 123 | ] 124 | ], 125 | [ 126 | "schm_other_4", 127 | "partitioned_table_2025_02", 128 | "amount", 129 | [ 130 | [ 131 | 101010.0 132 | ] 133 | ] 134 | ], 135 | [ 136 | "schm_other_4", 137 | "partitioned_table_2025_03", 138 | "amount", 139 | [ 140 | [ 141 | 101010.0 142 | ] 143 | ] 144 | ], 145 | [ 146 | "schm_other_4", 147 | "partitioned_table_default", 148 | "amount", 149 | [ 150 | [ 151 | 101010.0 152 | ] 153 | ] 154 | ] 155 | ] -------------------------------------------------------------------------------- /docs/dicts/tables-dictionary.md: -------------------------------------------------------------------------------- 1 | # 📑 Tables dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [💾 Dump](../operations/dump.md) | [📂 Restore](../operations/restore.md) 3 | 4 | ## Overview 5 | The tables dictionary defines which tables participate in the partial dump and partial restore operations. 
6 | It can act as either a whitelist (include-only) or a blacklist (exclude-only). 7 | 8 | Use this dictionary when you need to: 9 | - dump or restore only specific tables 10 | - exclude unwanted tables from the dump or restore 11 | 12 | ## Schema 13 | ```python 14 | { 15 | "tables": [ 16 | { 17 | "schema": "", # Include only this schema 18 | "schema_mask": "", # Or include schemas matching regex pattern 19 | "table": "", # Include only this table 20 | "table_mask": "", # Or include tables matching regex pattern 21 | } 22 | ] 23 | } 24 | ``` 25 | > ⚠️ **Note** 26 | > - You must use either `schema` or `schema_mask` → not both. 27 | > - You must use either `table` or `table_mask` → not both. 28 | 29 | --- 30 | 31 | ## ⚙️ Using the Dictionary 32 | 33 | You can use the same dictionary in two different roles: 34 | - Whitelist — dump/restore only the matched tables 35 | - Blacklist — dump/restore all tables except the matched ones 36 | 37 | 38 | **🏛️ Example Database Structure** 39 | 40 | | Schema | Table | 41 | |-----------|-------------| 42 | | public | employees | 43 | | public | departments | 44 | | public | positions | 45 | | public | salaries | 46 | | public | users | 47 | | ecommerce | products | 48 | | ecommerce | categories | 49 | | ecommerce | orders | 50 | | ecommerce | order_items | 51 | | tenant_a | users | 52 | | tenant_a | projects | 53 | | tenant_a | tasks | 54 | | tenant_a | comments | 55 | | tenant_b | users | 56 | | tenant_b | projects | 57 | | tenant_b | tasks | 58 | | tenant_b | comments | 59 | | tenant_c | users | 60 | | tenant_c | projects | 61 | | tenant_c | tasks | 62 | | tenant_c | comments | 63 | 64 | 65 | 66 | **📘 Example Tables Dictionary** 67 | ```python 68 | { 69 | "tables": [ 70 | { 71 | "schema": "public", 72 | "table": "employees" 73 | }, 74 | { 75 | "schema": "ecommerce", 76 | "table_mask": "^orders" 77 | }, 78 | { 79 | "schema_mask": "_a$", 80 | "table": "projects" 81 | }, 82 | { 83 | "schema_mask": "*", 84 | "table_mask": "users" 85 | }, 86 | ] 87 | } 88 | ``` 89 | 90 | **This dictionary matches the following tables:** 91 | 92 | | Schema | Table | Matched by rule | 93 | |-----------|-------------|--------------------------------------------| 94 | | ecommerce | orders | `schema="ecommerce", table_mask="^orders"` | 95 | | ecommerce | order_items | `schema="ecommerce", table_mask="^orders"` | 96 | | tenant_a | projects | `schema_mask="_a$", table="projects"` | 97 | | tenant_a | users | `schema_mask="*", table_mask="users"` | 98 | | tenant_b | users | `schema_mask="*", table_mask="users"` | 99 | | tenant_c | users | `schema_mask="*", table_mask="users"` | 100 | | public | users | `schema_mask="*", table_mask="users"` | 101 | | public | employees | `schema="public", table="employees"` | 102 | -------------------------------------------------------------------------------- /tests/input_dict/test_meta_dict_type_aliases.py: -------------------------------------------------------------------------------- 1 | { 2 | "include_rules": [ 3 | { 4 | "schema": "schm_other_3", 5 | "table": "data_types_test", 6 | } 7 | ], 8 | "field": { 9 | "rules": [".*"] 10 | }, 11 | "funcs": { 12 | "default": "anon_funcs.digest(\"%s\", 'default', 'md5')", 13 | "bit": "anon_funcs.digest(\"%s\", 'bit', 'md5')", 14 | "varbit": "anon_funcs.digest(\"%s\", 'varbit', 'md5')", 15 | "bool": "anon_funcs.digest(\"%s\", 'bool', 'md5')", 16 | "char": "anon_funcs.digest(\"%s\", 'char', 'md5')", 17 | "varchar": "anon_funcs.digest(\"%s\", 'varchar', 'md5')", 18 | "int": "anon_funcs.digest(\"%s\", 'int', 
'md5')", 19 | "int4": "anon_funcs.digest(\"%s\", 'int4', 'md5')", 20 | "int2": "anon_funcs.digest(\"%s\", 'int2', 'md5')", 21 | "int8": "anon_funcs.digest(\"%s\", 'int8', 'md5')", 22 | "float": "anon_funcs.digest(\"%s\", 'float', 'md5')", 23 | "float8": "anon_funcs.digest(\"%s\", 'float8', 'md5')", 24 | "float4": "anon_funcs.digest(\"%s\", 'float4', 'md5')", 25 | "decimal": "anon_funcs.digest(\"%s\", 'decimal', 'md5')", 26 | "dec": "anon_funcs.digest(\"%s\", 'dec', 'md5')", 27 | "serial2": "anon_funcs.digest(\"%s\", 'serial2', 'md5')", 28 | "serial4": "anon_funcs.digest(\"%s\", 'serial4', 'md5')", 29 | "serial8": "anon_funcs.digest(\"%s\", 'serial8', 'md5')", 30 | "time": "anon_funcs.digest(\"%s\", 'time', 'md5')", 31 | "timetz": "anon_funcs.digest(\"%s\", 'timetz', 'md5')", 32 | "timestamp": "anon_funcs.digest(\"%s\", 'timestamp', 'md5')", 33 | "timestamptz": "anon_funcs.digest(\"%s\", 'timestamptz', 'md5')", 34 | "bit(4)": "anon_funcs.digest(\"%s\", 'bit(4)', 'md5')", 35 | "bit(5)": "anon_funcs.digest(\"%s\", 'bit(5)', 'md5')", 36 | "bit(6)": "anon_funcs.digest(\"%s\", 'bit(6)', 'md5')", 37 | "varbit(4)": "anon_funcs.digest(\"%s\", 'varbit(4)', 'md5')", 38 | "varbit(5)": "anon_funcs.digest(\"%s\", 'varbit(5)', 'md5')", 39 | "varbit(6)": "anon_funcs.digest(\"%s\", 'varbit(6)', 'md5')", 40 | "char(4)": "anon_funcs.digest(\"%s\", 'char(4)', 'md5')", 41 | "char(5)": "anon_funcs.digest(\"%s\", 'char(5)', 'md5')", 42 | "char(6)": "anon_funcs.digest(\"%s\", 'char(6)', 'md5')", 43 | "varchar(19)": "anon_funcs.digest(\"%s\", 'varchar(19)', 'md5')", 44 | "varchar(20)": "anon_funcs.digest(\"%s\", 'varchar(20)', 'md5')", 45 | "varchar(21)": "anon_funcs.digest(\"%s\", 'varchar(21)', 'md5')", 46 | "decimal(10,1)": "anon_funcs.digest(\"%s\", 'decimal(10,1)', 'md5')", 47 | "decimal(10,2)": "anon_funcs.digest(\"%s\", 'decimal(10,2)', 'md5')", 48 | "decimal(11,2)": "anon_funcs.digest(\"%s\", 'decimal(11,2)', 'md5')", 49 | "time(2)": "anon_funcs.digest(\"%s\", 'time(2)', 'md5')", 50 | "time(3)": "anon_funcs.digest(\"%s\", 'time(3)', 'md5')", 51 | "time(4)": "anon_funcs.digest(\"%s\", 'time(4)', 'md5')", 52 | "timestamp(2)": "anon_funcs.digest(\"%s\", 'timestamp(2)', 'md5')", 53 | "timestamp(3)": "anon_funcs.digest(\"%s\", 'timestamp(3)', 'md5')", 54 | "timestamp(4)": "anon_funcs.digest(\"%s\", 'timestamp(4)', 'md5')", 55 | "timestamptz(2)": "anon_funcs.digest(\"%s\", 'timestamptz(2)', 'md5')", 56 | "timestamptz(3)": "anon_funcs.digest(\"%s\", 'timestamptz(3)', 'md5')", 57 | "timestamptz(4)": "anon_funcs.digest(\"%s\", 'timestamptz(4)', 'md5')", 58 | "timetz(2)": "anon_funcs.digest(\"%s\", 'timetz(2)', 'md5')", 59 | "timetz(3)": "anon_funcs.digest(\"%s\", 'timetz(3)', 'md5')", 60 | "timetz(4)": "anon_funcs.digest(\"%s\", 'timetz(4)', 'md5')", 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tests/expected_results/PGAnonMaskUnitTest_source_tables.result: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "columnar_internal", 4 | "tbl_200", 5 | "id", 6 | [] 7 | ], 8 | [ 9 | "columnar_internal", 10 | "tbl_200", 11 | "val", 12 | [] 13 | ], 14 | [ 15 | "columnar_internal", 16 | "tbl_200", 17 | "val_skip", 18 | [] 19 | ], 20 | [ 21 | "public", 22 | "contracts", 23 | "amount", 24 | [ 25 | [ 26 | 0.0 27 | ], 28 | [ 29 | 1.0 30 | ], 31 | [ 32 | 2.0 33 | ], 34 | [ 35 | 2.0 36 | ], 37 | [ 38 | 3.0 39 | ] 40 | ] 41 | ], 42 | [ 43 | "public", 44 | "tbl_100", 45 | "amount", 46 | [ 47 | [ 48 | 0.1 49 | ], 50 | [ 51 | 0.2 52 | 
], 53 | [ 54 | 0.3 55 | ], 56 | [ 57 | 0.4 58 | ], 59 | [ 60 | 0.5 61 | ] 62 | ] 63 | ], 64 | [ 65 | "schm_other_1", 66 | "some_tbl", 67 | "val", 68 | [ 69 | [ 70 | "text_val_1" 71 | ], 72 | [ 73 | "text_val_2" 74 | ], 75 | [ 76 | "text_val_3" 77 | ], 78 | [ 79 | "text_val_4" 80 | ], 81 | [ 82 | "text_val_5" 83 | ] 84 | ] 85 | ], 86 | [ 87 | "schm_other_2", 88 | "some_tbl", 89 | "val", 90 | [ 91 | [ 92 | "text_val_1" 93 | ], 94 | [ 95 | "text_val_2" 96 | ], 97 | [ 98 | "text_val_3" 99 | ], 100 | [ 101 | "text_val_4" 102 | ], 103 | [ 104 | "text_val_5" 105 | ] 106 | ] 107 | ], 108 | [ 109 | "schm_other_4", 110 | "partitioned_table", 111 | "amount", 112 | [ 113 | [ 114 | 99.98 115 | ], 116 | [ 117 | 25.5 118 | ], 119 | [ 120 | 149.97 121 | ], 122 | [ 123 | 15.7 124 | ], 125 | [ 126 | 76.23 127 | ] 128 | ] 129 | ], 130 | [ 131 | "schm_other_4", 132 | "partitioned_table_2025_01", 133 | "amount", 134 | [ 135 | [ 136 | 99.98 137 | ], 138 | [ 139 | 25.5 140 | ] 141 | ] 142 | ], 143 | [ 144 | "schm_other_4", 145 | "partitioned_table_2025_02", 146 | "amount", 147 | [ 148 | [ 149 | 149.97 150 | ] 151 | ] 152 | ], 153 | [ 154 | "schm_other_4", 155 | "partitioned_table_2025_03", 156 | "amount", 157 | [ 158 | [ 159 | 15.7 160 | ] 161 | ] 162 | ], 163 | [ 164 | "schm_other_4", 165 | "partitioned_table_default", 166 | "amount", 167 | [ 168 | [ 169 | 76.23 170 | ] 171 | ] 172 | ] 173 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Service specific 2 | output/ 3 | log/ 4 | runs/ 5 | tests/output_dict/ 6 | tests/output/ 7 | tests/saved_results/ 8 | docker/pg_anon 9 | *.tar 10 | *.tar.gz 11 | venv*/* 12 | venv* 13 | tmp* 14 | tmp*/* 15 | .idea/ 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | cover/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | .pybuilder/ 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # poetry 114 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 115 | # This is especially recommended for binary packages to ensure reproducibility, and is more 116 | # commonly ignored for libraries. 117 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 118 | #poetry.lock 119 | 120 | # pdm 121 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 122 | #pdm.lock 123 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 124 | # in version control. 125 | # https://pdm.fming.dev/#use-with-ide 126 | .pdm.toml 127 | 128 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | -------------------------------------------------------------------------------- /rest_api/runners/direct/view_data.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from pg_anon.cli import build_run_options 4 | from pg_anon.common.dto import PgAnonResult 5 | from pg_anon.context import Context 6 | from pg_anon.modes.view_data import ViewDataMode 7 | from rest_api.constants import BASE_TEMP_DIR 8 | from rest_api.pydantic_models import ViewDataRequest, ViewDataContent 9 | from rest_api.utils import write_dictionary_contents 10 | 11 | 12 | class ViewDataRunner: 13 |     request: ViewDataRequest 14 |     cli_params: List[str] = None 15 |     result: PgAnonResult = None 16 |     _executor: Type[ViewDataMode] 17 | 18 |     def __init__(self, request: ViewDataRequest): 19 |         self.request = request 20 |         self._prepare_cli_params() 21 |         self._init_context() 22 |         self._init_executor() 23 | 24 |     def _prepare_db_credentials_cli_params(self): 25 |         self.cli_params.extend([ 26 |             f'--db-host={self.request.db_connection_params.host}', 27 |             f'--db-port={self.request.db_connection_params.port}', 28 |             f'--db-user={self.request.db_connection_params.user_login}', 29 |             f'--db-user-password={self.request.db_connection_params.user_password}', 30 |             f'--db-name={self.request.db_connection_params.db_name}', 31 |         ]) 32 | 33 |     def _prepare_dictionaries_cli_params(self): 34 |         self._input_sens_dict_file_names = write_dictionary_contents(self.request.sens_dict_contents, BASE_TEMP_DIR) 35 |         self.cli_params.append( 36 |             f"--prepared-sens-dict-file={','.join(self._input_sens_dict_file_names.keys())}" 37 |         ) 38 | 39 |     def _prepare_filters_cli_params(self): 40 |         self.cli_params.append( 41 |             f'--schema-name={self.request.schema_name}', 42 |         ) 43 | 44 |
self.cli_params.append( 45 | f'--table-name={self.request.table_name}', 46 | ) 47 | 48 | def _prepare_pagination_cli_params(self): 49 | if self.request.limit: 50 | self.cli_params.append( 51 | f'--limit={self.request.limit}', 52 | ) 53 | 54 | if self.request.offset: 55 | self.cli_params.append( 56 | f'--offset={self.request.offset}', 57 | ) 58 | 59 | def _prepare_json_cli_params(self): 60 | self.cli_params.append( 61 | f'--json', 62 | ) 63 | 64 | def _prepare_verbosity_cli_params(self): 65 | self.cli_params.extend([ 66 | "--verbose=debug", 67 | "--debug", 68 | ]) 69 | 70 | def _prepare_cli_params(self): 71 | self.cli_params = [] 72 | self._prepare_db_credentials_cli_params() 73 | self._prepare_dictionaries_cli_params() 74 | self._prepare_filters_cli_params() 75 | self._prepare_pagination_cli_params() 76 | self._prepare_json_cli_params() 77 | self._prepare_verbosity_cli_params() 78 | 79 | def _init_context(self): 80 | options = build_run_options(self.cli_params) 81 | self.context = Context(options) 82 | 83 | def _init_executor(self): 84 | self._executor = ViewDataMode(self.context, need_raw_data=True) 85 | 86 | def _format_output(self) -> ViewDataContent: 87 | def _format_data_to_str(records: List[List[str]]): 88 | return [[str(data) for data in record] for record in records] 89 | 90 | rows_before = _format_data_to_str(self._executor.raw_data) 91 | rows_after = _format_data_to_str(self._executor.data) 92 | 93 | return ViewDataContent( 94 | schema_name=self.request.schema_name, 95 | table_name=self.request.table_name, 96 | field_names=self._executor.raw_field_names, 97 | total_rows_count=self._executor.rows_count, 98 | rows_before=rows_before, 99 | rows_after=rows_after, 100 | ) 101 | 102 | async def run(self): 103 | await self._executor.run() 104 | await self._executor.get_rows_count() 105 | return self._format_output() 106 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_data_sql_condition_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": 
"customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 49 | } 50 | }, 51 | { 52 | "schema": "schm_mask_ext_exclude_2", 53 | "table": "card_numbers", 54 | "fields": { 55 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 57 | "usd": "anon_funcs.noise(\"usd\", 30)", 58 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 59 | } 60 | }, 61 | { 62 | "schema": "_SCHM.$complex#имя;@&* a'", 63 | "table": "_TBL.$complex#имя;@&* a'2", 64 | "fields": { 65 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 66 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 67 | } 68 | }, 69 | { 70 | "schema": "public", 71 | "table": "contracts", 72 | "fields": { 73 | "amount": "anon_funcs.noise(\"amount\", 10)", 74 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 75 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 76 | } 77 | }, 78 | { 79 | "schema": "public", 80 | "table": "inn_info", 81 | "fields": { 82 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 83 | } 84 | }, 85 | { 86 | "schema": "schm_customer", 87 | "table": "customer_company", 88 | "fields": { 89 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 90 | } 91 | }, 92 | { 93 | "schema": "schm_other_2", 94 | "table": "tbl_test_anon_functions", 95 | "fields": { 96 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 97 | } 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /docs/sql-functions-library.md: -------------------------------------------------------------------------------- 1 | # 📚 SQL Functions Library 2 | 3 | > [🏠 Home](../README.md#-documentation-index) | [🏗️ Init](operations/init.md) | [🔍 Scan](operations/scan.md) | [💾 Dump](operations/dump.md) | [🔬 View Fields](operations/view-fields.md) | [📊 View Data](operations/view-data.md) | [🗂️ Meta Dictionary](dicts/meta-dict-schema.md) | [🔐 Sensitive Dictionary](dicts/sens-dict-schema.md) 4 | 5 | ## Overview 6 | 7 | All functions are contained in the `init.sql` file. After run pg_anon in `init` mode, they will reside in the `anon_funcs` schema in the source database. 8 | If you want to write a new function, simply create it in the `anon_funcs` schema in your source database. 9 | 10 | List of some functions available for use in dictionaries: 11 | 12 | --- 13 | 14 | ## Functions list 15 | 16 | ### 1. noise 17 | Add noise to a real number: 18 | ```SQL 19 | SELECT anon_funcs.noise(100, 1.2); 20 | >> 123 21 | ``` 22 | 23 | ### 2. dnoise 24 | Add noise to a date or timestamp: 25 | ```SQL 26 | SELECT anon_funcs.dnoise('2020-02-02 10:10:10'::timestamp, interval '1 month'); 27 | >> 2020-03-02 10:10:10 28 | ``` 29 | 30 | ### 3. digest 31 | Hash a string value with a specified hash function: 32 | ```SQL 33 | SELECT anon_funcs.digest('text', 'salt', 'sha256'); 34 | >> '3353e....' 35 | ``` 36 | 37 | ### 4. partial 38 | Keep the first few characters (2nd argument) and the last few characters (4th argument) of the specified string, adding a constant (3rd argument) in between: 39 | ```SQL 40 | SELECT anon_funcs.partial('123456789', 1, '***', 3); 41 | >> 1***789 42 | ``` 43 | 44 | ### 5. 
45 | Mask an email address: 46 | ```SQL 47 | SELECT anon_funcs.partial_email('example@gmail.com'); 48 | >> ex*****@gm*****.com 49 | ``` 50 | 51 | ### 6. random_string 52 | Generate a random string of specified length: 53 | ```SQL 54 | SELECT anon_funcs.random_string(7); 55 | >> H3ZVL5P 56 | ``` 57 | 58 | ### 7. random_zip 59 | Generate a random ZIP code: 60 | ```SQL 61 | SELECT anon_funcs.random_zip(); 62 | >> 851467 63 | ``` 64 | 65 | ### 8. random_date_between 66 | Generate a random date and time within a specified range: 67 | ```SQL 68 | SELECT anon_funcs.random_date_between( 69 |     '2020-02-02 10:10:10'::timestamp, 70 |     '2022-02-05 10:10:10'::timestamp 71 | ); 72 | >> 2021-11-08 06:47:48.057 73 | ``` 74 | 75 | ### 9. random_date 76 | Generate a random date and time: 77 | ```SQL 78 | SELECT anon_funcs.random_date(); 79 | >> 1911-04-18 21:54:13.139 80 | ``` 81 | 82 | ### 10. random_int_between 83 | Generate a random integer within a specified range: 84 | ```SQL 85 | SELECT anon_funcs.random_int_between(100, 200); 86 | >> 159 87 | ``` 88 | 89 | ### 11. random_bigint_between 90 | Generate a random bigint within a specified range: 91 | ```SQL 92 | SELECT anon_funcs.random_bigint_between(6000000000, 7000000000); 93 | >> 6268278565 94 | ``` 95 | 96 | ### 12. random_phone 97 | Generate a random phone number: 98 | ```SQL 99 | SELECT anon_funcs.random_phone('+7'); 100 | >> +7297479867 101 | ``` 102 | 103 | ### 13. random_hash 104 | Generate a random hash using the specified function: 105 | ```SQL 106 | SELECT anon_funcs.random_hash('seed', 'sha512'); 107 | >> b972f895ebea9cf2f65e19abc151b8031926c4a332471dc5c40fab608950870d6dbddcd18c7e467563f9b527e63d4d13870e4961c0ff2a62f021827654ae51fd 108 | ``` 109 | 110 | ### 14. random_in 111 | Select a random element from an array: 112 | ```SQL 113 | SELECT anon_funcs.random_in(array['a', 'b', 'c']); 114 | >> a 115 | ``` 116 | 117 | ### 15. hex_to_int 118 | Convert a hexadecimal value to decimal: 119 | ```SQL 120 | SELECT anon_funcs.hex_to_int('8AB'); 121 | >> 2219 122 | ``` 123 | 124 | --- 125 | 126 | ## pgcrypto 127 | In addition to the existing functions in the `anon_funcs` schema, functions from the `pgcrypto` extension can also be used. 128 | ```sql 129 | CREATE EXTENSION IF NOT EXISTS pgcrypto; 130 | ``` 131 | 132 | Example of using encryption with base64 encoding to store the encrypted value in a text field: 133 | ```SQL 134 | SELECT encode((SELECT encrypt('data', 'password', 'bf')), 'base64'); 135 | >> cSMq9gb1vOw= 136 | 137 | SELECT decrypt( 138 |   ( 139 |     SELECT decode('cSMq9gb1vOw=', 'base64') 140 |   ), 'password', 'bf'); 141 | >> data 142 | ``` 143 | 144 | --- 145 | 146 | ## How to add your own functions 147 | You can also add new anonymization functions by adding them to the `init.sql` file and then running pg_anon in `init` mode.
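148 | 149 | For illustration, a minimal custom function might look like this (a hypothetical sketch: the function name `mask_last4` and its body are examples, not part of the shipped library): 150 | ```SQL 151 | -- Keep the last 4 characters of a value and mask the rest 152 | CREATE OR REPLACE FUNCTION anon_funcs.mask_last4(val TEXT) 153 | RETURNS TEXT AS $$ 154 |     SELECT repeat('*', greatest(length(val) - 4, 0)) || right(val, 4); 155 | $$ LANGUAGE SQL; 156 | ``` 157 | Once created, it can be referenced from a dictionary just like the built-in functions, e.g. `anon_funcs.mask_last4("%s")`.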
158 | -------------------------------------------------------------------------------- /rest_api/runners/direct/view_fields.py: -------------------------------------------------------------------------------- 1 | from typing import List, Type 2 | 3 | from pg_anon.cli import build_run_options 4 | from pg_anon.common.dto import PgAnonResult 5 | from pg_anon.context import Context 6 | from pg_anon.modes.view_fields import ViewFieldsMode 7 | from rest_api.constants import BASE_TEMP_DIR 8 | from rest_api.pydantic_models import ViewFieldsRequest, ViewFieldsContent 9 | from rest_api.utils import write_dictionary_contents 10 | 11 | 12 | class ViewFieldsRunner: 13 |     request: ViewFieldsRequest 14 |     cli_params: List[str] = None 15 |     result: PgAnonResult = None 16 |     _executor: Type[ViewFieldsMode] 17 | 18 |     def __init__(self, request: ViewFieldsRequest): 19 |         self.request = request 20 |         self._prepare_cli_params() 21 |         self._init_context() 22 |         self._init_executor() 23 | 24 |     def _prepare_db_credentials_cli_params(self): 25 |         self.cli_params.extend([ 26 |             f'--db-host={self.request.db_connection_params.host}', 27 |             f'--db-port={self.request.db_connection_params.port}', 28 |             f'--db-user={self.request.db_connection_params.user_login}', 29 |             f'--db-user-password={self.request.db_connection_params.user_password}', 30 |             f'--db-name={self.request.db_connection_params.db_name}', 31 |         ]) 32 | 33 |     def _prepare_dictionaries_cli_params(self): 34 |         self._input_sens_dict_file_names = write_dictionary_contents(self.request.sens_dict_contents, BASE_TEMP_DIR) 35 |         self.cli_params.append( 36 |             f"--prepared-sens-dict-file={','.join(self._input_sens_dict_file_names.keys())}" 37 |         ) 38 | 39 |     def _prepare_filters_cli_params(self): 40 |         if self.request.schema_name: 41 |             self.cli_params.append( 42 |                 f'--schema-name={self.request.schema_name}', 43 |             ) 44 | 45 |         if self.request.schema_mask: 46 |             self.cli_params.append( 47 |                 f'--schema-mask={self.request.schema_mask}', 48 |             ) 49 | 50 |         if self.request.table_name: 51 |             self.cli_params.append( 52 |                 f'--table-name={self.request.table_name}', 53 |             ) 54 | 55 |         if self.request.table_mask: 56 |             self.cli_params.append( 57 |                 f'--table-mask={self.request.table_mask}', 58 |             ) 59 | 60 |         if self.request.view_only_sensitive_fields: 61 |             self.cli_params.append( 62 |                 '--view-only-sensitive-fields', 63 |             ) 64 | 65 |     def _prepare_limit_cli_params(self): 66 |         if self.request.fields_limit_count: 67 |             self.cli_params.append( 68 |                 f'--fields-count={self.request.fields_limit_count}', 69 |             ) 70 | 71 |     def _prepare_json_cli_params(self): 72 |         self.cli_params.append( 73 |             '--json', 74 |         ) 75 | 76 |     def _prepare_verbosity_cli_params(self): 77 |         self.cli_params.extend([ 78 |             "--verbose=debug", 79 |             "--debug", 80 |         ]) 81 | 82 |     def _prepare_cli_params(self): 83 |         self.cli_params = [] 84 |         self._prepare_db_credentials_cli_params() 85 |         self._prepare_dictionaries_cli_params() 86 |         self._prepare_filters_cli_params() 87 |         self._prepare_limit_cli_params() 88 |         self._prepare_json_cli_params() 89 |         self._prepare_verbosity_cli_params() 90 | 91 |     def _init_context(self): 92 |         options = build_run_options(self.cli_params) 93 |         self.context = Context(options) 94 | 95 |     def _init_executor(self): 96 |         self._executor = ViewFieldsMode(self.context) 97 | 98 |     def _format_output(self) -> List[ViewFieldsContent]: 99 |         result = [] 100 |         for field in self._executor.fields: 101 |             dict_data = None 102 |             if field.dict_file_name != self._executor.empty_data_filler: 103 |                 dict_data =
self._input_sens_dict_file_names[field.dict_file_name] 104 | 105 | field_rule = None 106 | if field.rule != self._executor.empty_data_filler: 107 | field_rule = field.rule 108 | 109 | result.append( 110 | ViewFieldsContent( 111 | schema_name=field.nspname, 112 | table_name=field.relname, 113 | field_name=field.column_name, 114 | type=field.type, 115 | dict_data=dict_data, 116 | rule=field_rule, 117 | ) 118 | ) 119 | 120 | return result 121 | 122 | async def run(self): 123 | await self._executor.run() 124 | return self._format_output() 125 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_words_and_phrases_constants_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "public", 30 | "table": "tbl_constants", 31 | "fields": { 32 | "phrases_sens_2": "anon_funcs.digest(\"phrases_sens_2\", 'salt_word', 'md5')", 33 | "phrases_sens_1": "anon_funcs.digest(\"phrases_sens_1\", 'salt_word', 'md5')", 34 | "words_sens": "anon_funcs.digest(\"words_sens\", 'salt_word', 'md5')" 35 | } 36 | }, 37 | { 38 | "schema": "schm_customer", 39 | "table": "customer_manager", 40 | "fields": { 41 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_mask_ext_exclude_2", 46 | "table": "card_numbers", 47 | "fields": { 48 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 49 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 50 | "usd": "anon_funcs.noise(\"usd\", 30)", 51 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 52 | } 53 | }, 54 | { 55 | "schema": "_SCHM.$complex#имя;@&* a'", 56 | "table": "_TBL.$complex#имя;@&* a'3", 57 | "fields": { 58 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 59 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 60 | } 61 | }, 62 | { 63 | "schema": "_SCHM.$complex#имя;@&* a'", 64 | "table": "_TBL.$complex#имя;@&* a'2", 65 | "fields": { 66 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 67 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 68 | } 69 | }, 70 | { 71 | "schema": "schm_customer", 72 | "table": "customer_company", 73 | "fields": { 74 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 75 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 76 | } 77 | }, 78 | { 79 | "schema": "schm_mask_ext_exclude_2", 80 | "table": "other_ext_tbl_2", 81 | "fields": { 82 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')", 83 | "val_2": 
"anon_funcs.digest(\"val_2\", 'salt_word', 'md5')" 84 | } 85 | }, 86 | { 87 | "schema": "public", 88 | "table": "contracts", 89 | "fields": { 90 | "amount": "anon_funcs.noise(\"amount\", 10)", 91 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 92 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 93 | } 94 | }, 95 | { 96 | "schema": "public", 97 | "table": "inn_info", 98 | "fields": { 99 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 100 | } 101 | }, 102 | { 103 | "schema": "schm_other_2", 104 | "table": "tbl_test_anon_functions", 105 | "fields": { 106 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 107 | } 108 | } 109 | ] 110 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_default_func_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'by_default_func', 'sha256')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'by_default_func', 'sha256')", 24 | "amount": "anon_funcs.digest(\"amount\", 'by_default_func', 'sha256')", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'by_default_func', 'sha256')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'by_default_func', 'sha256')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'by_default_func', 'sha256')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": "customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'by_default_func', 'sha256')" 49 | } 50 | }, 51 | { 52 | "schema": "schm_mask_ext_exclude_2", 53 | "table": "card_numbers", 54 | "fields": { 55 | "val": "anon_funcs.digest(\"val\", 'by_default_func', 'sha256')", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'by_default_func', 'sha256')", 57 | "usd": "anon_funcs.digest(\"usd\", 'by_default_func', 'sha256')", 58 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'by_default_func', 'sha256')" 59 | } 60 | }, 61 | { 62 | "schema": "_SCHM.$complex#имя;@&* a'", 63 | "table": "_TBL.$complex#имя;@&* a'2", 64 | "fields": { 65 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'by_default_func', 'sha256')", 66 | "fld_key": "anon_funcs.digest(\"fld_key\", 'by_default_func', 'sha256')" 67 | } 68 | }, 69 | { 70 | "schema": "schm_other_3", 71 | "table": "data_types_test", 72 | "fields": { 73 | "field_type_int8": 
"anon_funcs.digest(\"field_type_int8\", 'by_default_func', 'sha256')" 74 | } 75 | }, 76 | { 77 | "schema": "schm_customer", 78 | "table": "customer_company", 79 | "fields": { 80 | "phone": "anon_funcs.digest(\"phone\", 'by_default_func', 'sha256')", 81 | "inn": "anon_funcs.digest(\"inn\", 'by_default_func', 'sha256')" 82 | } 83 | }, 84 | { 85 | "schema": "public", 86 | "table": "contracts", 87 | "fields": { 88 | "amount": "anon_funcs.digest(\"amount\", 'by_default_func', 'sha256')", 89 | "contract_expires": "anon_funcs.digest(\"contract_expires\", 'by_default_func', 'sha256')", 90 | "details": "anon_funcs.digest(\"details\", 'by_default_func', 'sha256')" 91 | } 92 | }, 93 | { 94 | "schema": "public", 95 | "table": "inn_info", 96 | "fields": { 97 | "inn": "anon_funcs.digest(\"inn\", 'by_default_func', 'sha256')" 98 | } 99 | }, 100 | { 101 | "schema": "schm_other_2", 102 | "table": "tbl_test_anon_functions", 103 | "fields": { 104 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'by_default_func', 'sha256')" 105 | } 106 | } 107 | ] 108 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_partial_constants_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_customer", 30 | "table": "customer_company", 31 | "fields": { 32 | "site": "anon_funcs.digest(\"site\", 'salt_word', 'md5')", 33 | "company_name": "anon_funcs.digest(\"company_name\", 'salt_word', 'md5')", 34 | "email": "anon_funcs.digest(\"email\", 'salt_word', 'md5')", 35 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 36 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 37 | } 38 | }, 39 | { 40 | "schema": "schm_customer", 41 | "table": "customer_manager", 42 | "fields": { 43 | "last_name": "anon_funcs.digest(\"last_name\", 'salt_word', 'md5')", 44 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 45 | "email": "anon_funcs.digest(\"email\", 'salt_word', 'md5')", 46 | "first_name": "anon_funcs.digest(\"first_name\", 'salt_word', 'md5')" 47 | } 48 | }, 49 | { 50 | "schema": "schm_mask_ext_exclude_2", 51 | "table": "other_ext_tbl_2", 52 | "fields": { 53 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 54 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 55 | } 56 | }, 57 | { 58 | "schema": "_SCHM.$complex#имя;@&* a'", 59 | "table": "_TBL.$complex#имя;@&* a'3", 60 | "fields": { 61 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 62 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 63 | } 64 | }, 65 | { 66 | 
"schema": "schm_mask_ext_exclude_2", 67 | "table": "card_numbers", 68 | "fields": { 69 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 70 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 71 | "usd": "anon_funcs.noise(\"usd\", 30)", 72 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 73 | } 74 | }, 75 | { 76 | "schema": "_SCHM.$complex#имя;@&* a'", 77 | "table": "_TBL.$complex#имя;@&* a'2", 78 | "fields": { 79 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 80 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 81 | } 82 | }, 83 | { 84 | "schema": "public", 85 | "table": "contracts", 86 | "fields": { 87 | "amount": "anon_funcs.noise(\"amount\", 10)", 88 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 89 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 90 | } 91 | }, 92 | { 93 | "schema": "public", 94 | "table": "inn_info", 95 | "fields": { 96 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 97 | } 98 | }, 99 | { 100 | "schema": "schm_other_2", 101 | "table": "tbl_test_anon_functions", 102 | "fields": { 103 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 104 | } 105 | } 106 | ] 107 | } -------------------------------------------------------------------------------- /docs/installation-and-configuring.md: -------------------------------------------------------------------------------- 1 | # 💽 Installation & Configuration 2 | > [🏠 Home](../README.md#-documentation-index) | [⚙️ How it works](how-it-works.md) | [💬 FAQ](faq.md) 3 | 4 | ## Before you install 5 | pg_anon provides 2 ways to run: **CLI** and **REST API** 6 | 7 | The REST API service is optional to install. This service is designed to integrate `pg_anon` functionality into any system or pipelines via HTTP requests. 8 | It works just as a thin wrapper around the CLI version of `pg_anon`. REST API calls prepare CLI parameters and run the CLI version of pg_anon in the background. 9 | 10 | It doesn’t keep state or store data in a database, so it can be scaled easily without extra setup. 11 | 12 | However, this means that the system that integrates pg_anon must implement its own storage for dictionaries, dump tasks, and restore tasks. 13 | 14 | > ⚠️ **Note** 15 | > 16 | > Not suitable for fully autonomous operation. 17 | > 18 | > All operation runs logs and info will be stored in the directory `/path_to_pg_anon/runs`. 19 | > All dumps will be stored in the directory `/path_to_pg_anon/output`. 20 | > If the REST API service is scaled, you must create a symlink to this directory on a shared disk. 21 | > This is required because restore operations also read dumps from `/path_to_pg_anon/output`. 22 | 23 | --- 24 | 25 | ## Linux 26 | 27 | 1. Install Python 3 if it is not installed: `sudo apt-get install python3.11` (for Ubuntu), `sudo yum install python311` (for Redhat/Centos) 28 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 29 | 3. Go to the project directory: `cd pg_anon` 30 | 4. Set up a virtual environment: 31 | - Install the virtual environment: `python3 -m venv venv` 32 | - Activate the virtual environment: `source venv/bin/activate` 33 | 5. Install the dependencies: `pip install -r requirements.txt` 34 | 6. Optional, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 35 | 36 | ## Windows 37 | 38 | 1. 
Install Python 3 if it is not installed: Download it from the official [Python website](https://www.python.org/downloads/) 39 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 40 | 3. Go to the project directory: `cd pg_anon` 41 | 4. Set up a virtual environment: 42 | - Create the virtual environment: `py -m venv venv` 43 | - Activate the virtual environment: `.\venv\Scripts\activate` 44 | 5. Install the dependencies: `pip install -r requirements.txt` 45 | 6. Optionally, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 46 | 47 | ## macOS 48 | 49 | 1. Install Python 3 if it is not installed: 50 | - Install [Homebrew](https://brew.sh/) 51 | - [`brew install python@3.11`](https://formulae.brew.sh/formula/python@3.11) 52 | 2. Clone the repository: `git clone https://github.com/TantorLabs/pg_anon.git` 53 | 3. Go to the project directory: `cd pg_anon` 54 | 4. Set up a virtual environment: 55 | - Create the virtual environment: `python3 -m venv venv` 56 | - Activate the virtual environment: `source venv/bin/activate` 57 | 5. Install the dependencies: `pip install -r requirements.txt` 58 | 6. Optionally, if you want to use the REST API service, install its dependencies: `pip install -r rest_api/requirements.txt` 59 | 60 | --- 61 | 62 | ## Configuring pg_anon 63 | 64 | To specify custom `pg_dump` and `pg_restore` utilities, use the `--pg-dump` and `--pg-restore` parameters. 65 | 66 | Advanced configuration is also available: 67 | - CLI: pass the `--config` run parameter 68 | - REST API: the config must be placed at `/path_to_pg_anon/config.yml` 69 | 70 | This parameter accepts a YAML file in the following format, where the `<pg_version>` placeholders stand for concrete PostgreSQL major versions: 71 | ```yaml 72 | pg-utils-versions: 73 | <pg_version>: 74 | pg_dump: "/path/to/<pg_version>/pg_dump" 75 | pg_restore: "/path/to/<pg_version>/pg_restore" 76 | <another_pg_version>: 77 | pg_dump: "/path/to/<another_pg_version>/pg_dump" 78 | pg_restore: "/path/to/<another_pg_version>/pg_restore" 79 | default: 80 | pg_dump: "/path/to/default_postgres_version/pg_dump" 81 | pg_restore: "/path/to/default_postgres_version/pg_restore" 82 | ``` 83 | 84 | For example, you can specify a configuration for PostgreSQL 15 and 17: 85 | 86 | ```yaml 87 | pg-utils-versions: 88 | 15: 89 | pg_dump: "/usr/lib/postgresql/15/bin/pg_dump" 90 | pg_restore: "/usr/lib/postgresql/15/bin/pg_restore" 91 | 17: 92 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 93 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 94 | default: 95 | pg_dump: "/usr/lib/postgresql/17/bin/pg_dump" 96 | pg_restore: "/usr/lib/postgresql/17/bin/pg_restore" 97 | ``` 98 | 99 | If the current PostgreSQL version does not match any version in this config, the utilities from the default section will be used. 100 | For example, `pg_anon` can be run with this config on Postgres 16. In this case, `pg_dump 17` and `pg_restore 17` will be used. 
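To make the version-matching behavior concrete, here is a minimal Python sketch of the lookup described above. It is illustrative only and is not pg_anon's actual implementation; the function name and the string-keyed config layout are assumptions:

```python
# Minimal sketch: resolve pg_dump/pg_restore paths for a server version,
# falling back to the "default" section when there is no exact match.
def resolve_pg_utils(config: dict, server_version: str) -> dict:
    versions = config["pg-utils-versions"]
    major = server_version.split(".")[0]  # e.g. "16.2" -> "16"
    return versions.get(major, versions["default"])

# With the example config above, a PostgreSQL 16 server resolves to the
# "default" entry, i.e. the version 17 utilities.
```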
101 | 102 | --- 103 | 104 | ## Running REST API 105 | Run the service with: 106 | ```sh 107 | python -m uvicorn rest_api.api:app --host 0.0.0.0 --port 8000 --workers=3 108 | ``` 109 | - Recommended worker count = `2 * CPU_CORES + 1` 110 | - The service's OpenAPI documentation will be available at http://0.0.0.0:8000/docs#/ 111 | - See also the [API documentation](api.md) 112 | -------------------------------------------------------------------------------- /docs/how-it-works.md: -------------------------------------------------------------------------------- 1 | # How it works 2 | > [🏠 Home](../README.md#-documentation-index) | [💬 FAQ](faq.md) 3 | 4 | ## Anonymization (masking) 5 | 6 | The diagram below illustrates how data is transferred from the **source DB** to the **target DB**. 7 | 8 | The source database contains sensitive information and is typically located in a production environment with strictly limited access. 9 | 10 | ![Dump-Resore-Terms.drawio.png](../images/Dump-Resore-Terms.drawio.png) 11 | 12 | A trusted administrator runs pg_anon with credentials for the **source DB**. 13 | Using the prepared and approved sensitive dictionary, pg_anon creates an anonymized dump in the specified directory. 14 | The dictionary must be created in advance and validated by the security team. 15 | 16 | The resulting dump directory is then transferred to the host of the target database. 17 | Compression during transfer is unnecessary because the dump files are already compressed. 18 | 19 | Once the directory is placed on the target host, the restore process is started using target database credentials. 20 | The target database must be created beforehand using `CREATE DATABASE` and must be empty. 21 | 22 | After a successful restore, the anonymized database is ready for development or testing. Any number of employees can safely use it without risking exposure of sensitive data. 23 | 24 | --- 25 | 26 | ## What does pg_anon do internally during dump and restore? The simplest representation. 27 | 28 | ### For example, we have data that we want to anonymize: 29 | 30 | 1. Create the `source` table: 31 | 32 | ```SQL 33 | create table users ( 34 | id bigserial, 35 | email text, 36 | login text 37 | ); 38 | 39 | -- Checking the contents of the source table 40 | select * from users; 41 | ``` 42 | ```output 43 | >> 44 | id | email | login 45 | ----+---------+------- 46 | ``` 47 | 48 | 2. Populating the `source` table: 49 | 50 | ```SQL 51 | insert into users (email, login) 52 | select 53 | 'user' || generate_series(1001, 1020) || '@example.com', 54 | 'user' || generate_series(1001, 1020); 55 | 56 | -- Checking the contents of the source table 57 | select * from users; 58 | ``` 59 | ```output 60 | >> 61 | id | email | login 62 | ----+----------------------+---------- 63 | 1 | user1001@example.com | user1001 64 | 2 | user1002@example.com | user1002 65 | ... 66 | ``` 67 | 68 | **The 'email' field contains `sensitive data`. We need to `anonymize` it.** 69 | 70 | 71 | ### What is the process of creating a dump with masking? 72 | 73 | 1. Data `dump` from the `source` table to a CSV file (without masking): 74 | 75 | ```SQL 76 | copy ( 77 | select * 78 | from users 79 | ) to '/tmp/users.csv' with csv; 80 | ``` 81 | ```output 82 | cat /tmp/users.csv 83 | >> 84 | 1,user1001@example.com,user1001 85 | 2,user1002@example.com,user1002 86 | ... 87 | ``` 88 | 89 | 2. 
`Masking` the contents of the `source` table: 90 | 91 | ```SQL 92 | select 93 | id, 94 | md5(email) || '@abc.com' as email, -- hashing the email (masking rule in prepared sens dict file) 95 | login 96 | from users; 97 | ``` 98 | ```output 99 | >> 100 | id | email | login 101 | ----+------------------------------------------+---------- 102 | 1 | 385513d80895c4c5e19c91d1df9eacae@abc.com | user1001 103 | 2 | 9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com | user1002 104 | ... 105 | ``` 106 | 107 | 3. Data `dump` from the `source` table to a CSV file (with `masking`): 108 | 109 | ```SQL 110 | copy ( 111 | select 112 | id, 113 | md5(email) || '@abc.com' as email, -- hashing the email (masking rule in prepared sens dict file) 114 | login 115 | from users 116 | ) to '/tmp/users_anonymized.csv' with csv; 117 | ``` 118 | ```output 119 | cat /tmp/users_anonymized.csv 120 | >> 121 | 1,385513d80895c4c5e19c91d1df9eacae@abc.com,user1001 122 | 2,9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com,user1002 123 | ... 124 | ``` 125 | 126 | **The `prepared sens dict file` contains masking rules such as hashing.** 127 | 128 | ### What is the process for restoring a masked dump? 129 | 130 | 1. Reproducing the structure. Creating the `target` table: 131 | 132 | ```SQL 133 | create table users_anonymized ( 134 | id bigserial, 135 | email text, 136 | login text 137 | ); 138 | 139 | -- Checking the contents of the target table 140 | select * from users_anonymized; 141 | ``` 142 | ```output 143 | >> 144 | id | email | login 145 | ----+---------+------- 146 | ``` 147 | 148 | 2. Loading data from the `source` table `dump` (CSV file) into the `target` table: 149 | 150 | ```SQL 151 | copy users_anonymized 152 | from '/tmp/users_anonymized.csv' 153 | with csv; 154 | 155 | -- Checking the contents of the target table 156 | select * from users_anonymized; 157 | ``` 158 | ```output 159 | >> 160 | id | email | login 161 | ----+------------------------------------------+---------- 162 | 1 | 385513d80895c4c5e19c91d1df9eacae@abc.com | user1001 163 | 2 | 9f4c0c30f85b0353c4d5fe3c9cc633e3@abc.com | user1002 164 | ... 
165 | ``` 166 | 167 | ### Differences between pg_anon's actual behavior and this simplified representation: 168 | - `pg_anon` operates on the entire database (not only one table) 169 | - `pg_anon` uses `.bin.gz` files to save data (not CSV) 170 | - Masking rules are provided to `pg_anon` via a `prepared sens dict file` 171 | 172 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "_SCHM.$complex#имя;@&* a'", 13 | "table": "_TBL.$complex#имя;@&* a'2", 14 | "fields": { 15 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "_SCHM.$complex#имя;@&* a'", 21 | "table": "_TBL.$complex#имя;@&* a'3", 22 | "fields": { 23 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 24 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 25 | } 26 | }, 27 | { 28 | "schema": "public", 29 | "table": "contracts", 30 | "fields": { 31 | "amount": "anon_funcs.noise(\"amount\", 10)", 32 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 33 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "public", 38 | "table": "inn_info", 39 | "fields": { 40 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 41 | } 42 | }, 43 | { 44 | "schema": "public", 45 | "table": "key_value", 46 | "fields": { 47 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')", 48 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')" 49 | } 50 | }, 51 | { 52 | "schema": "public", 53 | "table": "tbl_100", 54 | "fields": { 55 | "amount": "anon_funcs.noise(\"amount\", 30)", 56 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 57 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 58 | } 59 | }, 60 | { 61 | "schema": "schm_customer", 62 | "table": "customer_company", 63 | "fields": { 64 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')", 65 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 66 | } 67 | }, 68 | { 69 | "schema": "schm_customer", 70 | "table": "customer_manager", 71 | "fields": { 72 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')" 73 | } 74 | }, 75 | { 76 | "schema": "schm_mask_ext_exclude_2", 77 | "table": "card_numbers", 78 | "fields": { 79 | "usd": "anon_funcs.noise(\"usd\", 30)", 80 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 81 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 82 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 83 | } 84 | }, 85 | { 86 | "schema": "schm_mask_ext_exclude_2", 87 | "table": "other_ext_tbl_2", 88 | "fields": { 89 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')", 90 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')" 91 | } 92 | }, 93 | { 94 | "schema": "schm_other_2", 95 | "table": 
"tbl_test_anon_functions", 96 | "fields": { 97 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 98 | } 99 | }, 100 | { 101 | "schema": "schm_other_4", 102 | "table": "partitioned_table", 103 | "fields": { 104 | "amount": "anon_funcs.noise(\"amount\", 10)" 105 | } 106 | }, 107 | { 108 | "schema": "schm_other_4", 109 | "table": "partitioned_table_2025_01", 110 | "fields": { 111 | "amount": "anon_funcs.noise(\"amount\", 10)" 112 | } 113 | }, 114 | { 115 | "schema": "schm_other_4", 116 | "table": "partitioned_table_2025_02", 117 | "fields": { 118 | "amount": "anon_funcs.noise(\"amount\", 10)" 119 | } 120 | }, 121 | { 122 | "schema": "schm_other_4", 123 | "table": "partitioned_table_2025_03", 124 | "fields": { 125 | "amount": "anon_funcs.noise(\"amount\", 10)" 126 | } 127 | }, 128 | { 129 | "schema": "schm_other_4", 130 | "table": "partitioned_table_default", 131 | "fields": { 132 | "amount": "anon_funcs.noise(\"amount\", 10)" 133 | } 134 | } 135 | ] 136 | } -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_sens_dict_result_by_data_func_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'", 6 | "fields": { 7 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 8 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 9 | } 10 | }, 11 | { 12 | "schema": "public", 13 | "table": "key_value", 14 | "fields": { 15 | "fld_value": "anon_funcs.digest(\"fld_value\", 'salt_word', 'md5')", 16 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 17 | } 18 | }, 19 | { 20 | "schema": "public", 21 | "table": "tbl_100", 22 | "fields": { 23 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 24 | "amount": "anon_funcs.noise(\"amount\", 30)", 25 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 26 | } 27 | }, 28 | { 29 | "schema": "schm_mask_ext_exclude_2", 30 | "table": "other_ext_tbl_2", 31 | "fields": { 32 | "val_2": "anon_funcs.digest(\"val_2\", 'salt_word', 'md5')", 33 | "val_1": "anon_funcs.digest(\"val_1\", 'salt_word', 'md5')" 34 | } 35 | }, 36 | { 37 | "schema": "_SCHM.$complex#имя;@&* a'", 38 | "table": "_TBL.$complex#имя;@&* a'3", 39 | "fields": { 40 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 41 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 42 | } 43 | }, 44 | { 45 | "schema": "schm_customer", 46 | "table": "customer_manager", 47 | "fields": { 48 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 49 | "email": "anon_funcs.partial_email(\"email\")" 50 | } 51 | }, 52 | { 53 | "schema": "schm_mask_ext_exclude_2", 54 | "table": "card_numbers", 55 | "fields": { 56 | "val": "anon_funcs.digest(\"val\", 'salt_word', 'md5')", 57 | "другое_поле": "anon_funcs.digest(\"другое_поле\", 'salt_word', 'md5')", 58 | "usd": "anon_funcs.noise(\"usd\", 30)", 59 | "имя_поля": "anon_funcs.digest(\"имя_поля\", 'salt_word', 'md5')" 60 | } 61 | }, 62 | { 63 | "schema": "_SCHM.$complex#имя;@&* a'", 64 | "table": "_TBL.$complex#имя;@&* a'2", 65 | "fields": { 66 | "_FLD.$complex#имя;@&* a'": "anon_funcs.digest(\"_FLD.$complex#имя;@&* a'\", 'salt_word', 'md5')", 67 | "fld_key": "anon_funcs.digest(\"fld_key\", 'salt_word', 'md5')" 68 | } 69 | }, 70 | { 71 | "schema": "schm_customer", 72 | 
"table": "customer_company", 73 | "fields": { 74 | "email": "anon_funcs.partial_email(\"email\")", 75 | "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'md5')", 76 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 77 | } 78 | }, 79 | { 80 | "schema": "public", 81 | "table": "contracts", 82 | "fields": { 83 | "amount": "anon_funcs.noise(\"amount\", 10)", 84 | "contract_expires": "anon_funcs.dnoise(\"contract_expires\", interval '6 month')", 85 | "details": "anon_funcs.digest(\"details\", 'salt_word', 'md5')" 86 | } 87 | }, 88 | { 89 | "schema": "public", 90 | "table": "inn_info", 91 | "fields": { 92 | "inn": "LPAD((10000000 + ROW_NUMBER() OVER (ORDER BY inn))::TEXT, 8, '0')" 93 | } 94 | }, 95 | { 96 | "schema": "schm_other_2", 97 | "table": "tbl_test_anon_functions", 98 | "fields": { 99 | "fld_5_email": "anon_funcs.digest(\"fld_5_email\", 'salt_word', 'md5')" 100 | } 101 | }, 102 | { 103 | "schema": "schm_other_4", 104 | "table": "partitioned_table", 105 | "fields": { 106 | "amount": "anon_funcs.noise(\"amount\", 10)" 107 | } 108 | }, 109 | { 110 | "schema": "schm_other_4", 111 | "table": "partitioned_table_2025_01", 112 | "fields": { 113 | "amount": "anon_funcs.noise(\"amount\", 10)" 114 | } 115 | }, 116 | { 117 | "schema": "schm_other_4", 118 | "table": "partitioned_table_2025_02", 119 | "fields": { 120 | "amount": "anon_funcs.noise(\"amount\", 10)" 121 | } 122 | }, 123 | { 124 | "schema": "schm_other_4", 125 | "table": "partitioned_table_2025_03", 126 | "fields": { 127 | "amount": "anon_funcs.noise(\"amount\", 10)" 128 | } 129 | }, 130 | { 131 | "schema": "schm_other_4", 132 | "table": "partitioned_table_default", 133 | "fields": { 134 | "amount": "anon_funcs.noise(\"amount\", 10)" 135 | } 136 | } 137 | ] 138 | } -------------------------------------------------------------------------------- /rest_api/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import shutil 4 | from collections import deque 5 | from pathlib import Path 6 | from typing import List, Optional, Dict, Union 7 | 8 | import aioprocessing 9 | 10 | from pg_anon.cli import run_pg_anon 11 | from pg_anon.common.dto import PgAnonResult 12 | from pg_anon.common.utils import validate_exists_mode, simple_slugify 13 | from rest_api.constants import DUMP_STORAGE_BASE_DIR 14 | from rest_api.pydantic_models import DictionaryContent, DictionaryMetadata 15 | 16 | 17 | def get_full_dump_path(dump_path: str) -> str: 18 | full_dump_path = Path(DUMP_STORAGE_BASE_DIR / dump_path.lstrip("/")).resolve() 19 | if not str(full_dump_path).startswith(str(DUMP_STORAGE_BASE_DIR)) or full_dump_path == DUMP_STORAGE_BASE_DIR: 20 | raise ValueError(f"Invalid path: {dump_path}") 21 | 22 | return str(full_dump_path) 23 | 24 | 25 | def write_dictionary_contents(dictionary_contents: List[DictionaryContent], base_dir: Path) -> Dict[str, DictionaryMetadata]: 26 | file_names = {} 27 | base_dir.mkdir(parents=True, exist_ok=True) 28 | 29 | for dictionary_content in dictionary_contents: 30 | file_name = (base_dir / simple_slugify(dictionary_content.name)).with_suffix('.py') 31 | with open(file_name, "w") as out_file: 32 | out_file.write(dictionary_content.content) 33 | 34 | file_names[str(file_name)] = DictionaryMetadata( 35 | name=dictionary_content.name, 36 | additional_info=dictionary_content.additional_info, 37 | ) 38 | 39 | return file_names 40 | 41 | 42 | def read_dictionary_contents(file_path: Union[str, Path]) -> str: 43 | with open(file_path, 
"r") as dictionary_file: 44 | data = dictionary_file.read() 45 | 46 | return data 47 | 48 | 49 | def read_json_file(file_path: Union[str, Path]) -> Dict: 50 | with open(file_path, "r") as file: 51 | data = json.loads(file.read()) 52 | 53 | return data 54 | 55 | 56 | def read_logs_from_tail(logs_path: Union[str, Path], lines_count: int) -> List[str]: 57 | def log_sort_key(file_path: Path): 58 | parts = file_path.name.split(".") 59 | try: 60 | return int(parts[-1]) 61 | except ValueError: 62 | return 0 63 | 64 | log_files = sorted(logs_path.glob("*"), key=log_sort_key) 65 | 66 | result_lines = deque(maxlen=lines_count) 67 | block_size = 1024 68 | for log_file in log_files: 69 | if len(result_lines) >= lines_count: 70 | break 71 | buffer = bytearray() 72 | 73 | with log_file.open("rb") as f: 74 | f.seek(0, 2) 75 | pointer = f.tell() 76 | 77 | while pointer > 0 and len(result_lines) < lines_count: 78 | read_size = min(block_size, pointer) 79 | pointer -= read_size 80 | f.seek(pointer) 81 | buffer[:0] = f.read(read_size) 82 | log_lines = buffer.split(b"\n") 83 | for idx, line in enumerate(reversed(log_lines[1:])): 84 | if idx == 0 and line == b"": 85 | continue 86 | 87 | result_lines.appendleft(line.decode("utf-8", errors="replace")) 88 | if len(result_lines) >= lines_count: 89 | break 90 | 91 | buffer = log_lines[0] 92 | 93 | if buffer and len(result_lines) < lines_count: 94 | result_lines.appendleft(buffer.decode("utf-8", errors="replace")) 95 | 96 | return list(result_lines) 97 | 98 | 99 | def delete_folder(folder_path: Path): 100 | try: 101 | shutil.rmtree(folder_path) 102 | print(f"Folder {folder_path} deleted successfully.") 103 | except Exception as e: 104 | print(f"Error deleting folder {folder_path}: {str(e)}") 105 | 106 | 107 | def run_pg_anon_subprocess_wrapper(queue: aioprocessing.AioQueue, cli_run_params: List[str]): 108 | loop = asyncio.new_event_loop() 109 | asyncio.set_event_loop(loop) 110 | 111 | try: 112 | # Выполняем асинхронную функцию внутри нового event loop 113 | result = loop.run_until_complete( 114 | run_pg_anon(cli_run_params) 115 | ) 116 | queue.put(result) 117 | except Exception as ex: 118 | print(ex) 119 | finally: 120 | queue.put(None) # Завершаем процесс 121 | queue.close() 122 | loop.close() 123 | 124 | 125 | async def run_pg_anon_worker(mode: str, operation_id: str, cli_run_params: List[str]) -> Optional[PgAnonResult]: 126 | if not validate_exists_mode(mode): 127 | raise ValueError(f'Invalid mode: {mode}') 128 | 129 | application_name_suffix = f'worker__{mode}__{operation_id}' 130 | cli_run_params.extend([ 131 | f'--mode={mode}', 132 | f'--application-name-suffix={application_name_suffix}', 133 | ]) 134 | 135 | queue = aioprocessing.AioQueue() 136 | 137 | p = aioprocessing.AioProcess( 138 | name=f"pg_anon_{application_name_suffix}", 139 | target=run_pg_anon_subprocess_wrapper, 140 | args=(queue, cli_run_params), 141 | ) 142 | p.start() 143 | 144 | result = None 145 | while True: 146 | coro_result = await queue.coro_get() 147 | if coro_result is None: 148 | break 149 | result = coro_result 150 | await p.coro_join() 151 | 152 | return result 153 | 154 | 155 | def normalize_headers(headers: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: 156 | if not headers: 157 | return None 158 | 159 | headers = {k.lower(): v for k, v in headers.items()} 160 | headers.setdefault('content-type', 'application/json') 161 | return headers 162 | -------------------------------------------------------------------------------- /docs/operations/view-data.md: 
-------------------------------------------------------------------------------- 1 | # 📊 View Data 2 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | ## Overview 5 | 6 | This mode displays anonymized table data without creating a dump. 7 | 8 | ## Prerequisites 9 | - The `anon_funcs` schema with anonymization functions must already exist. See [init mode](init.md). 10 | - A sensitive dictionary containing data about database fields and their anonymization rules must be prepared beforehand. See [create-dict (scan) mode](scan.md). 11 | 12 | ## Run example 13 | 14 | ```commandline 15 | python pg_anon.py --mode=view-data \ 16 | --db-host=127.0.0.1 \ 17 | --db-user=postgres \ 18 | --db-user-password=postgres \ 19 | --db-name=source_db \ 20 | --prepared-sens-dict-file=sens_dict.py \ 21 | --schema-name=public \ 22 | --table-name=users \ 23 | --limit=10 \ 24 | --offset=0 25 | ``` 26 | 27 | --- 28 | 29 | ## Options 30 | 31 | ### Common pg_anon options: 32 | 33 | | Option | Required | Description | 34 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 35 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 36 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 37 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 38 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 39 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 40 | 41 | 42 | ### Database configuration options: 43 | 44 | | Option | Required | Description | 45 | |----------------------|----------|---------------------------------------------------------------------| 46 | | `--db-host` | Yes | Database host. | 47 | | `--db-port` | Yes | Database port. | 48 | | `--db-name` | Yes | Database name. | 49 | | `--db-user` | Yes | Database user. | 50 | | `--db-user-password` | No | Database user password. | 51 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 52 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 53 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 54 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 55 | 56 | 57 | ### View-data mode options: 58 | 59 | | Option | Required | Description | 60 | |-----------------------------|----------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 61 | | `--prepared-sens-dict-file` | Yes | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), generated by the [create-dict (scan) mode](scan.md) or created manually. If rules collide, the rules from the last file in the list take priority. | 62 | | `--schema-name` | Yes | Schema name. | 63 | | `--table-name` | Yes | Table name. | 64 | | `--limit` | No | Number of rows to display. 
(default: 100) | 65 | | `--offset` | No | Row offset for pagination. (default: 0) | 66 | | `--json` | No | Outputs results in JSON format instead of a table. | 67 | -------------------------------------------------------------------------------- /pg_anon/app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pg_anon.common.constants import ANON_UTILS_DB_SCHEMA_NAME, SAVED_RUN_STATUS_FILE_NAME, SAVED_RUN_OPTIONS_FILE_NAME 4 | from pg_anon.common.db_utils import check_anon_utils_db_schema_exists, get_pg_version 5 | from pg_anon.common.dto import PgAnonResult, RunOptions 6 | from pg_anon.common.enums import AnonMode 7 | from pg_anon.common.utils import check_pg_util, exception_helper, save_json_file 8 | from pg_anon.context import Context 9 | from pg_anon.modes.create_dict import CreateDictMode 10 | from pg_anon.modes.dump import DumpMode 11 | from pg_anon.modes.initialization import InitMode 12 | from pg_anon.modes.restore import RestoreMode 13 | from pg_anon.modes.view_data import ViewDataMode 14 | from pg_anon.modes.view_fields import ViewFieldsMode 15 | from pg_anon.version import __version__ 16 | 17 | 18 | class PgAnonApp: 19 | 20 | def __init__(self, options: RunOptions): 21 | run_dir = Path(options.run_dir) 22 | run_dir.mkdir(parents=True, exist_ok=True) 23 | save_json_file(run_dir / SAVED_RUN_OPTIONS_FILE_NAME, options.to_dict()) 24 | 25 | self.context = Context(options) 26 | self.result = PgAnonResult() 27 | self._skip_check_postgres_utils = self.context.options.mode in ( 28 | AnonMode.INIT, 29 | AnonMode.CREATE_DICT, 30 | AnonMode.VIEW_FIELDS, 31 | AnonMode.VIEW_DATA, 32 | ) 33 | 34 | def _bootstrap(self): 35 | self.context.logger.info( 36 | "============> Started pg_anon (v%s) in mode: %s" 37 | % (__version__, self.context.options.mode.value) 38 | ) 39 | if self.context.options.debug: 40 | params_info = "#--------------- Run options\n" 41 | params_info += self.context.options.to_json() 42 | params_info += "\n#-----------------------------------" 43 | self.context.logger.debug(params_info) 44 | 45 | async def _set_postgres_utils(self): 46 | pg_version = await get_pg_version(self.context.connection_params, server_settings=self.context.server_settings) 47 | self.context.set_postgres_version(pg_version) 48 | self.context.logger.info(f"Target DB version: {pg_version}") 49 | self.context.logger.info(f"pg_dump path: {self.context.pg_dump}") 50 | self.context.logger.info(f"pg_restore path: {self.context.pg_restore}") 51 | 52 | def _check_postgres_utils(self): 53 | if self._skip_check_postgres_utils: 54 | self.context.logger.info("Skipping check that postgres utilities exist") 55 | return 56 | 57 | self.context.logger.info("Checking that postgres utilities exist") 58 | 59 | pg_dump_exists = check_pg_util(self.context, self.context.pg_dump, "pg_dump") 60 | pg_restore_exists = check_pg_util(self.context, self.context.pg_restore, "pg_restore") 61 | 62 | if not pg_dump_exists or not pg_restore_exists: 63 | raise RuntimeError('pg_dump or pg_restore not found') 64 | 65 | async def _check_initialization(self): 66 | if self.context.options.mode in ( 67 | AnonMode.CREATE_DICT, 68 | AnonMode.DUMP, 69 | AnonMode.SYNC_DATA_DUMP, 70 | AnonMode.SYNC_STRUCT_DUMP, 71 | ): 72 | anon_utils_schema_exists = await check_anon_utils_db_schema_exists( 73 | connection_params=self.context.connection_params, 74 | server_settings=self.context.server_settings 75 | ) 76 | if not anon_utils_schema_exists: 77 | raise ValueError( 78 | f"Schema 
'{ANON_UTILS_DB_SCHEMA_NAME}' does not exist. You need to run init first, via '--mode=init'" 79 | ) 80 | 81 | def _get_mode(self): 82 | if self.context.options.mode in (AnonMode.DUMP, AnonMode.SYNC_DATA_DUMP, AnonMode.SYNC_STRUCT_DUMP): 83 | return DumpMode(self.context) 84 | 85 | if self.context.options.mode in (AnonMode.RESTORE, AnonMode.SYNC_DATA_RESTORE, AnonMode.SYNC_STRUCT_RESTORE): 86 | return RestoreMode(self.context) 87 | 88 | if self.context.options.mode == AnonMode.INIT: 89 | return InitMode(self.context) 90 | 91 | if self.context.options.mode == AnonMode.CREATE_DICT: 92 | return CreateDictMode(self.context) 93 | 94 | if self.context.options.mode == AnonMode.VIEW_FIELDS: 95 | return ViewFieldsMode(self.context) 96 | 97 | if self.context.options.mode == AnonMode.VIEW_DATA: 98 | return ViewDataMode(self.context) 99 | 100 | raise RuntimeError("Unknown mode: " + self.context.options.mode.value) 101 | 102 | async def run(self) -> PgAnonResult: 103 | self._bootstrap() 104 | self.result.start(self.context.options) 105 | try: 106 | await self._set_postgres_utils() 107 | self._check_postgres_utils() 108 | await self._check_initialization() 109 | 110 | mode = self._get_mode() 111 | self.result.result_data = await mode.run() 112 | self.result.complete() 113 | except Exception as exc: 114 | self.context.logger.error(exception_helper(show_traceback=True)) 115 | self.result.fail(exc) 116 | finally: 117 | self.context.logger.info( 118 | f"<============ Finished pg_anon in mode: {self.context.options.mode.value}, " 119 | f"result_code = {self.result.result_code.value}, " 120 | f"elapsed: {self.result.elapsed} sec" 121 | ) 122 | save_json_file(Path(self.context.options.run_dir) / SAVED_RUN_STATUS_FILE_NAME, self.result.to_dict()) 123 | 124 | return self.result 125 | 126 | async def validate_target_tables(self) -> PgAnonResult: 127 | result = PgAnonResult() 128 | result.start(self.context.options) 129 | 130 | try: 131 | await RestoreMode.validate_restore(self.context) 132 | result.complete() 133 | except Exception: 134 | self.context.logger.error(exception_helper(show_traceback=True)) 135 | result.fail() 136 | finally: 137 | return result 138 | -------------------------------------------------------------------------------- /pg_anon/modes/view_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict, Optional 3 | 4 | from prettytable import PrettyTable, SINGLE_BORDER 5 | 6 | from pg_anon.common.db_utils import get_fields_list, create_connection, get_rows_count, get_dump_query 7 | from pg_anon.common.utils import exception_helper, get_dict_rule_for_table 8 | from pg_anon.context import Context 9 | 10 | 11 | class ViewDataMode: 12 | context: Context 13 | _limit: int 14 | _offset: int 15 | _schema_name: str 16 | _table_name: str 17 | table_rule: Dict 18 | raw_field_names: List[str] = None 19 | field_names: List[str] = None 20 | rows_count: int = 0 21 | query: str 22 | data: List[List[str]] = None 23 | raw_query: Optional[str] = None 24 | raw_data: Optional[List[List[str]]] = None 25 | table: PrettyTable = None 26 | _need_raw_data: bool = False 27 | 28 | def __init__(self, context: Context, need_raw_data: bool = False): 29 | self.context = context 30 | self._limit = context.options.limit 31 | self._offset = context.options.offset 32 | self._schema_name = context.options.schema_name 33 | self._table_name = context.options.table_name 34 | self.field_names = [] 35 | self.raw_field_names = [] 36 | self.data = [] 37 | 
self.raw_data = [] 38 | self._need_raw_data = need_raw_data 39 | 40 | async def _get_fields_for_view(self) -> None: 41 | """ 42 | Get field names and all fields for view-data mode 43 | """ 44 | fields_list = await get_fields_list( 45 | connection_params=self.context.connection_params, 46 | server_settings=self.context.server_settings, 47 | table_schema=self._schema_name, 48 | table_name=self._table_name 49 | ) 50 | for field in fields_list: 51 | field_name = field["column_name"] 52 | self.raw_field_names.append(field_name) 53 | 54 | if self.table_rule and field_name in self.table_rule["fields"]: 55 | self.field_names.append('* ' + field_name) 56 | else: 57 | self.field_names.append(field_name) 58 | 59 | async def _get_data_for_view(self, query: str) -> List[List[str]]: 60 | db_conn = await create_connection(self.context.connection_params, server_settings=self.context.server_settings) 61 | table_result = await db_conn.fetch(query) 62 | await db_conn.close() 63 | 64 | data = [[record[field_name] for field_name in self.raw_field_names] for record in table_result] 65 | return data 66 | 67 | async def get_rows_count(self): 68 | self.rows_count = await get_rows_count( 69 | connection_params=self.context.connection_params, 70 | server_settings=self.context.server_settings, 71 | schema_name=self._schema_name, 72 | table_name=self._table_name 73 | ) 74 | return self.rows_count 75 | 76 | def _prepare_table(self) -> None: 77 | self.table = PrettyTable(self.field_names) 78 | self.table.set_style(SINGLE_BORDER) 79 | for row in self.data: 80 | self.table.add_row(row) 81 | 82 | def _prepare_json(self) -> None: 83 | result = {field: [] for field in self.field_names} 84 | 85 | for field_values in self.data: 86 | for field, value in zip(self.field_names, field_values): 87 | result[field].append(value) 88 | 89 | self.json = json.dumps(result, default=lambda x: str(x), ensure_ascii=False) 90 | 91 | async def _output_fields(self) -> None: 92 | 93 | await self._get_fields_for_view() 94 | if not self.field_names: 95 | raise ValueError("No field names for view!") 96 | 97 | self.data = await self._get_data_for_view(self.query) 98 | if not self.data: 99 | raise ValueError("No data found for view!") 100 | 101 | if self._need_raw_data: 102 | self.raw_data = await self._get_data_for_view(self.raw_query) 103 | 104 | if self.context.options.json: 105 | self._prepare_json() 106 | print(self.json) 107 | else: 108 | self._prepare_table() 109 | print(self.table) 110 | 111 | async def _prepare_queries(self): 112 | 113 | query_without_limit = await get_dump_query( 114 | ctx=self.context, 115 | table_schema=self._schema_name, 116 | table_name=self._table_name, 117 | table_rule=self.table_rule, 118 | nulls_last=True 119 | ) 120 | self.query = query_without_limit + f" LIMIT {self._limit} OFFSET {self._offset}" 121 | 122 | if self._need_raw_data: 123 | query_without_limit = await get_dump_query( 124 | ctx=self.context, 125 | table_schema=self._schema_name, 126 | table_name=self._table_name, 127 | table_rule=None, 128 | nulls_last=True 129 | ) 130 | self.raw_query = query_without_limit + f" LIMIT {self._limit} OFFSET {self._offset}" 131 | 132 | async def run(self) -> None: 133 | self.context.logger.info("-------------> Started view_data mode") 134 | 135 | try: 136 | if self._limit < 1: 137 | raise ValueError("Processing fields limit must be greater than zero!") 138 | if self._offset < 0: 139 | raise ValueError("Processing fields offset must be greater than or equal to zero!") 140 | 141 | self.context.read_prepared_dict() 
142 | self.table_rule = get_dict_rule_for_table( 143 | dictionary_rules=self.context.prepared_dictionary_obj["dictionary"], 144 | schema=self._schema_name, 145 | table=self._table_name, 146 | ) 147 | 148 | await self._prepare_queries() 149 | await self._output_fields() 150 | 151 | self.context.logger.info("<------------- Finished view_data mode") 152 | except Exception as ex: 153 | self.context.logger.error("<------------- view_data failed\n" + exception_helper()) 154 | raise ex 155 | -------------------------------------------------------------------------------- /docs/operations/view-fields.md: -------------------------------------------------------------------------------- 1 | # 🔬 View Fields 2 | > [🏠 Home](../../README.md#-operations) | [🔍 Scan](scan.md) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | ## Overview 5 | 6 | This mode displays how database fields match the anonymization rules. 7 | 8 | ## Prerequisites 9 | - The `anon_funcs` schema with anonymization functions must already exist. See [init mode](init.md). 10 | - A sensitive dictionary containing data about database fields and their anonymization rules must be prepared beforehand. See [create-dict (scan) mode](scan.md). 11 | 12 | ## Run example 13 | 14 | ```commandline 15 | python pg_anon.py --mode=view-fields \ 16 | --db-host=127.0.0.1 \ 17 | --db-user=postgres \ 18 | --db-user-password=postgres \ 19 | --db-name=source_db \ 20 | --prepared-sens-dict-file=sens_dict.py 21 | ``` 22 | 23 | > ⚠️ **Note** 24 | > 25 | > This mode can process only a limited number of fields when no filters are applied, for performance reasons. 26 | > 27 | > This limit is controlled by the `--fields-count` option (default: 5000 fields). 28 | > To avoid hitting this limit, increase the `--fields-count` value or use filter options: `--schema-name`, `--schema-mask`, `--table-name`, `--table-mask`. 29 | 30 | --- 31 | 32 | ## Options 33 | 34 | ### Common pg_anon options: 35 | 36 | | Option | Required | Description | 37 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 38 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 39 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 40 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 41 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 42 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. (default: false) | 43 | 44 | 45 | ### Database configuration options: 46 | 47 | | Option | Required | Description | 48 | |----------------------|----------|---------------------------------------------------------------------| 49 | | `--db-host` | Yes | Database host. | 50 | | `--db-port` | Yes | Database port. | 51 | | `--db-name` | Yes | Database name. | 52 | | `--db-user` | Yes | Database user. | 53 | | `--db-user-password` | No | Database user password. | 54 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 55 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. 
| 56 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 57 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 58 | 59 | 60 | ### View-fields mode options: 61 | 62 | | Option | Required | Description | 63 | |--------------------------------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 64 | | `--prepared-sens-dict-file` | Yes | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), generated by the [create-dict (scan) mode](scan.md) or created manually. If rules collide, the rules from the last file in the list take priority. | 65 | | `--view-only-sensitive-fields` | No | Displays only sensitive fields. (default: all fields) | 66 | | `--fields-count` | No | Maximum number of fields to process for output. (default: 5000) | 67 | | `--schema-name` | No | Filter by schema name. | 68 | | `--schema-mask` | No | Filter by schema name using a regular expression. | 69 | | `--table-name` | No | Filter by table name. | 70 | | `--table-mask` | No | Filter by table name using a regular expression. | 71 | | `--json` | No | Outputs results in JSON format instead of a table. | 72 | -------------------------------------------------------------------------------- /docs/debugging.md: -------------------------------------------------------------------------------- 1 | # 🛠️ Debug stages for anonymization process 2 | 3 | > [🏠 Home](../README.md#-documentation-index) | [💾 Dump](operations/dump.md) | [📂 Restore](operations/restore.md) | [⚙️ How it works](how-it-works.md) | [💬 FAQ](faq.md) 4 | 5 | ## Overview 6 | 7 | The debug stages allow you to test and troubleshoot the anonymization workflow without performing a full dump or restore, saving significant time and resources. 8 | 9 | Each stage emulates a specific part of the anonymization pipeline: 10 | 11 | - **Stage 1 — Validate Dict** 12 | 13 | Validates the sensitive dictionary and checks SQL logic without exporting any data. 14 | 15 | - **Stage 2 — Validate Data** 16 | 17 | Performs anonymization checks on real data with a limited sample (LIMIT 100) using a prepared database schema. 18 | 19 | - **Stage 3 — Validate Full** 20 | 21 | Executes the full anonymization logic with data sampling (LIMIT 100), but without requiring a prepared database. 22 | 23 | These stages help you quickly debug rules, anonymization functions, SQL conditions, and dictionary configuration before running a full anonymized dump/restore process. 24 | 25 | --- 26 | 27 | ## Stage 1: Validate dict 28 | 29 | This stage validates the dictionary, shows the tables, and runs SQL queries without exporting data to disk or to a database. 30 | If the program runs without errors, the stage is passed. 
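Since this stage writes nothing, success is signalled by the process exit code alone. A minimal shell check (illustrative only; it reuses the same command as the example below):

```commandline
python pg_anon.py --mode=dump \
  --db-host=127.0.0.1 \
  --db-user=postgres \
  --db-user-password=postgres \
  --db-name=test_source_db \
  --output-dir=test_dbg_stages \
  --prepared-sens-dict-file=test_dbg_stages.py \
  --clear-output-dir \
  --dbg-stage-1-validate-dict \
  && echo "stage 1 passed" || echo "stage 1 failed"
```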
31 | 32 | ![dbg-stage-1.png](../images/dbg-stage-1.png) 33 | 34 | ```commandline 35 | python pg_anon.py --mode=dump \ 36 | --db-host=127.0.0.1 \ 37 | --db-user=postgres \ 38 | --db-user-password=postgres \ 39 | --db-name=test_source_db \ 40 | --output-dir=test_dbg_stages \ 41 | --prepared-sens-dict-file=test_dbg_stages.py \ 42 | --clear-output-dir \ 43 | --verbose=debug \ 44 | --debug \ 45 | --dbg-stage-1-validate-dict 46 | ``` 47 | --- 48 | 49 | ## Stage 2: Validate data 50 | 51 | This stage validates data, shows the tables, and runs SQL queries with data export limited to 100 rows, against a prepared database. 52 | It requires a database containing the full structure (pre-data section only) described in `--prepared-sens-dict-file`. 53 | 54 | 55 | 56 | - If you want to create a database with the required structure, just run: 57 | 58 | One-time structure dump: 59 | 60 | ```commandline 61 | python pg_anon.py --mode=sync-struct-dump \ 62 | --db-host=127.0.0.1 \ 63 | --db-user=postgres \ 64 | --db-user-password=postgres \ 65 | --db-name=test_source_db \ 66 | --output-dir=test_stage_2 \ 67 | --prepared-sens-dict-file=test_dbg_stages.py \ 68 | --clear-output-dir \ 69 | --verbose=debug \ 70 | --debug \ 71 | --dbg-stage-3-validate-full 72 | ``` 73 | 74 | Then restore the structure as many times as you need: 75 | 76 | ```commandline 77 | su - postgres -c "psql -U postgres -d postgres -c \"DROP DATABASE IF EXISTS test_target_db_7\"" 78 | su - postgres -c "psql -U postgres -d postgres -c \"CREATE DATABASE test_target_db_7\"" 79 | python pg_anon.py --mode=sync-struct-restore \ 80 | --db-host=127.0.0.1 \ 81 | --db-user=postgres \ 82 | --db-user-password=postgres \ 83 | --db-name=test_target_db_7 \ 84 | --input-dir=test_stage_2 \ 85 | --verbose=debug \ 86 | --debug 87 | ``` 88 | 89 | - Validate data stage in dump: 90 | 91 | ![dbg-stage-2.png](../images/dbg-stage-2.png) 92 | 93 | ```commandline 94 | python pg_anon.py --mode=dump \ 95 | --db-host=127.0.0.1 \ 96 | --db-user=postgres \ 97 | --db-user-password=postgres \ 98 | --db-name=test_source_db \ 99 | --output-dir=test_dbg_stages \ 100 | --prepared-sens-dict-file=test_dbg_stages.py \ 101 | --clear-output-dir \ 102 | --verbose=debug \ 103 | --debug \ 104 | --dbg-stage-2-validate-data 105 | ``` 106 | 107 | - Validate data stage in data-restore: 108 | 109 | ```commandline 110 | python pg_anon.py --mode=sync-data-restore \ 111 | --db-host=127.0.0.1 \ 112 | --db-user=postgres \ 113 | --db-user-password=postgres \ 114 | --db-name=test_target_db_7 \ 115 | --input-dir=test_dbg_stages \ 116 | --verbose=debug \ 117 | --debug 118 | 119 | # And for example view all data in every table: 120 | su - postgres -c "psql -U postgres -d test_target_db_7 -c \"SELECT * FROM public.contracts\"" 121 | ``` 122 | --- 123 | 124 | ## Stage 3: Validate full 125 | 126 | ![dbg-stage-3.png](../images/dbg-stage-3.png) 127 | 128 | Runs the full logic with `LIMIT 100` in the SQL queries. 
In this stage you don't need a prepared database; just run: 129 | 130 | ```commandline 131 | su - postgres -c "psql -U postgres -d postgres -c \"DROP DATABASE IF EXISTS test_target_db_8\"" 132 | su - postgres -c "psql -U postgres -d postgres -c \"CREATE DATABASE test_target_db_8\"" 133 | ``` 134 | 135 | - Validate full stage in dump: 136 | 137 | ```commandline 138 | python pg_anon.py --mode=dump \ 139 | --db-host=127.0.0.1 \ 140 | --db-user=postgres \ 141 | --db-user-password=postgres \ 142 | --db-name=test_source_db \ 143 | --output-dir=test_dbg_stages \ 144 | --prepared-sens-dict-file=test_dbg_stages.py \ 145 | --clear-output-dir \ 146 | --verbose=debug \ 147 | --debug \ 148 | --dbg-stage-3-validate-full 149 | ``` 150 | 151 | - Validate full stage in restore: 152 | 153 | ```commandline 154 | python pg_anon.py --mode=restore \ 155 | --db-host=127.0.0.1 \ 156 | --db-user=postgres \ 157 | --db-user-password=postgres \ 158 | --db-name=test_target_db_8 \ 159 | --input-dir=test_dbg_stages \ 160 | --verbose=debug \ 161 | --debug 162 | 163 | # And for example view all data in every table: 164 | su - postgres -c "psql -U postgres -d test_target_db_8 -c \"SELECT * FROM public.contracts\"" 165 | ``` 166 | -------------------------------------------------------------------------------- /docs/operations/scan.md: -------------------------------------------------------------------------------- 1 | # 🔍 Scan 2 | > [🏠 Home](../../README.md#-operations) | [💾 Dump](dump.md) | [📂 Restore](restore.md) | [🔬 View Fields](view-fields.md) | [📊 View Data](view-data.md) | [📚 SQL Functions Library](../sql-functions-library.md) 3 | 4 | --- 5 | 6 | ## Overview 7 | The **scan** operation analyzes your PostgreSQL database to detect potentially sensitive data and generate dictionary files. 8 | These files are used for dumps and for repeated scans. 9 | 10 | --- 11 | 12 | ## Prerequisites: 13 | - Manually created [meta-dictionary](../dicts/meta-dict-schema.md) 14 | - The `init` mode has already been run for the source database 15 | 16 | ## Usage: 17 | To scan the source database and create a dictionary for dumping, run pg_anon in `create-dict` mode. 18 | You need: 19 | - a **meta-dictionary** file with scan rules. 20 | 21 | ```commandline 22 | python pg_anon.py --mode=create-dict \ 23 | --db-user=postgres \ 24 | --db-user-password=postgres \ 25 | --db-name=test_source_db \ 26 | --meta-dict-file=test_meta_dict.py \ 27 | --prepared-sens-dict-file=test_sens_dict_output_previous_use.py \ 28 | --prepared-no-sens-dict-file=test_no_sens_dict_output_previous_use.py \ 29 | --output-sens-dict-file=test_sens_dict_output.py \ 30 | --output-no-sens-dict-file=test_no_sens_dict_output.py \ 31 | --processes=2 32 | ``` 33 | 34 | --- 35 | 36 | ## Options 37 | 38 | ### Common pg_anon options: 39 | 40 | | Option | Required | Description | 41 | |--------------------------------|----------|----------------------------------------------------------------------------------------------------| 42 | | `--config` | No | Path to the config file that can specify `pg_dump` and `pg_restore` utilities. (default: none) | 43 | | `--processes` | No | Number of processes used for multiprocessing operations. (default: 4) | 44 | | `--db-connections-per-process` | No | Number of database connections per process for I/O operations. (default: 4) | 45 | | `--verbose` | No | Sets the log verbosity level: `info`, `debug`, `error`. (default: info) | 46 | | `--debug` | No | Enables debug mode (equivalent to `--verbose=debug`) and adds extra debug logs. 
(default: false) | 47 | 48 | 49 | ### Database configuration options: 50 | 51 | | Option | Required | Description | 52 | |----------------------|----------|---------------------------------------------------------------------| 53 | | `--db-host` | Yes | Database host. | 54 | | `--db-port` | Yes | Database port. | 55 | | `--db-name` | Yes | Database name. | 56 | | `--db-user` | Yes | Database user. | 57 | | `--db-user-password` | No | Database user password. | 58 | | `--db-passfile` | No | Path to a file containing the password used for authentication. | 59 | | `--db-ssl-key-file` | No | Path to the client SSL key file for secure connections. | 60 | | `--db-ssl-cert-file` | No | Path to the client SSL certificate file. | 61 | | `--db-ssl-ca-file` | No | Path to the CA certificate used to verify the server’s certificate. | 62 | 63 | 64 | ### Create-dict (scan) mode options 65 | 66 | | Option | Required | Description | 67 | |--------------------------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 68 | | `--meta-dict-file` | Yes | Input file or list of files containing the [meta-dictionary](../dicts/meta-dict-schema.md), prepared manually. If rules collide, the rules from the last file in the list take priority. | 69 | | `--prepared-sens-dict-file` | No | Input file or list of files containing the [sensitive dictionary](../dicts/sens-dict-schema.md), obtained from a previous run via the `--output-sens-dict-file` option or prepared manually. If rules collide, the rules from the last file in the list take priority. | 70 | | `--prepared-no-sens-dict-file` | No | Input file or list of files containing the [not sensitive dictionary](../dicts/non-sens-dict-schema.md), obtained from a previous run via the `--output-no-sens-dict-file` option or prepared manually. If rules collide, the rules from the last file in the list take priority. | 71 | | `--output-sens-dict-file` | Yes | Output file path for saving the sensitive dictionary. | 72 | | `--output-no-sens-dict-file` | No | Output file path for saving the not sensitive dictionary. | 73 | | `--scan-mode` | No | Defines whether to scan all data or only part of it: "full" or "partial" (default: "partial"). | 74 | | `--scan-partial-rows` | No | In `--scan-mode=partial`, defines the number of rows to scan (default: 10000). The actual row count can be smaller after reducing to unique values. | 75 | | `--save-dicts` | No | Duplicates all input and output dictionaries into the `runs` directory. This can be useful for debugging or integration purposes. 
| 76 | -------------------------------------------------------------------------------- /tests/expected_results/test_prepared_no_sens_dict_result_expected.py: -------------------------------------------------------------------------------- 1 | { 2 | "no_sens_dictionary": [ 3 | { 4 | "schema": "_SCHM.$complex#имя;@&* a'", 5 | "table": "_TBL.$complex#имя;@&* a'2", 6 | "fields": [ 7 | "id" 8 | ] 9 | }, 10 | { 11 | "schema": "_SCHM.$complex#имя;@&* a'", 12 | "table": "_TBL.$complex#имя;@&* a'3", 13 | "fields": [ 14 | "id" 15 | ] 16 | }, 17 | { 18 | "schema": "columnar_internal", 19 | "table": "tbl_200", 20 | "fields": [ 21 | "id", 22 | "val", 23 | "val_skip" 24 | ] 25 | }, 26 | { 27 | "schema": "public", 28 | "table": "contracts", 29 | "fields": [ 30 | "customer_company_id", 31 | "customer_manager_id", 32 | "status" 33 | ] 34 | }, 35 | { 36 | "schema": "public", 37 | "table": "inn_info", 38 | "fields": [ 39 | "company_info" 40 | ] 41 | }, 42 | { 43 | "schema": "public", 44 | "table": "tbl_100", 45 | "fields": [ 46 | "num_val", 47 | "val", 48 | "val_skip" 49 | ] 50 | }, 51 | { 52 | "schema": "public", 53 | "table": "tbl_constants", 54 | "fields": [ 55 | "phrases_no_sens_1", 56 | "phrases_no_sens_2", 57 | "phrases_sens_1", 58 | "phrases_sens_2", 59 | "words_no_sens_1", 60 | "words_no_sens_2", 61 | "words_sens" 62 | ] 63 | }, 64 | { 65 | "schema": "schm_customer", 66 | "table": "customer_company", 67 | "fields": [ 68 | "company_name", 69 | "email", 70 | "site" 71 | ] 72 | }, 73 | { 74 | "schema": "schm_customer", 75 | "table": "customer_manager", 76 | "fields": [ 77 | "customer_company_id", 78 | "email", 79 | "first_name", 80 | "last_name" 81 | ] 82 | }, 83 | { 84 | "schema": "schm_mask_exclude_1", 85 | "table": "other_tbl", 86 | "fields": [ 87 | "val" 88 | ] 89 | }, 90 | { 91 | "schema": "schm_mask_exclude_1", 92 | "table": "some_tbl", 93 | "fields": [ 94 | "val" 95 | ] 96 | }, 97 | { 98 | "schema": "schm_mask_ext_exclude_2", 99 | "table": "card_numbers", 100 | "fields": [ 101 | "num_val" 102 | ] 103 | }, 104 | { 105 | "schema": "schm_mask_ext_exclude_2", 106 | "table": "some_ext_tbl", 107 | "fields": [ 108 | "val" 109 | ] 110 | }, 111 | { 112 | "schema": "schm_mask_ext_include_2", 113 | "table": "other_ext_tbl", 114 | "fields": [ 115 | "val" 116 | ] 117 | }, 118 | { 119 | "schema": "schm_mask_ext_include_2", 120 | "table": "some_ext_tbl", 121 | "fields": [ 122 | "val" 123 | ] 124 | }, 125 | { 126 | "schema": "schm_mask_include_1", 127 | "table": "other_tbl", 128 | "fields": [ 129 | "val" 130 | ] 131 | }, 132 | { 133 | "schema": "schm_mask_include_1", 134 | "table": "some_tbl", 135 | "fields": [ 136 | "val" 137 | ] 138 | }, 139 | { 140 | "schema": "schm_mask_include_1", 141 | "table": "tbl_123", 142 | "fields": [ 143 | "val" 144 | ] 145 | }, 146 | { 147 | "schema": "schm_mask_include_1", 148 | "table": "tbl_123_456", 149 | "fields": [ 150 | "val" 151 | ] 152 | }, 153 | { 154 | "schema": "schm_other_1", 155 | "table": "some_tbl", 156 | "fields": [ 157 | "val" 158 | ] 159 | }, 160 | { 161 | "schema": "schm_other_2", 162 | "table": "exclude_tbl", 163 | "fields": [ 164 | "val" 165 | ] 166 | }, 167 | { 168 | "schema": "schm_other_2", 169 | "table": "some_tbl", 170 | "fields": [ 171 | "val" 172 | ] 173 | }, 174 | { 175 | "schema": "schm_other_2", 176 | "table": "tbl_test_anon_functions", 177 | "fields": [ 178 | "fld_10_int", 179 | "fld_11_int", 180 | "fld_12_phone", 181 | "fld_13_txt", 182 | "fld_14_txt", 183 | "fld_15_txt", 184 | "fld_1_int", 185 | "fld_2_datetime", 186 | "fld_3_txt", 187 | 
"fld_4_txt", 188 | "fld_6_txt", 189 | "fld_7_zip", 190 | "fld_8_datetime", 191 | "fld_9_datetime" 192 | ] 193 | }, 194 | { 195 | "schema": "schm_other_4", 196 | "table": "goods", 197 | "fields": [ 198 | "created_at", 199 | "description", 200 | "quantity", 201 | "release_date", 202 | "title", 203 | "type_id", 204 | "valid_until" 205 | ] 206 | }, 207 | { 208 | "schema": "schm_other_4", 209 | "table": "partitioned_table", 210 | "fields": [ 211 | "created_at", 212 | "product_id", 213 | "quantity", 214 | "region_code" 215 | ] 216 | }, 217 | { 218 | "schema": "schm_other_4", 219 | "table": "partitioned_table_2025_01", 220 | "fields": [ 221 | "created_at", 222 | "product_id", 223 | "quantity", 224 | "region_code" 225 | ] 226 | }, 227 | { 228 | "schema": "schm_other_4", 229 | "table": "partitioned_table_2025_02", 230 | "fields": [ 231 | "created_at", 232 | "product_id", 233 | "quantity", 234 | "region_code" 235 | ] 236 | }, 237 | { 238 | "schema": "schm_other_4", 239 | "table": "partitioned_table_2025_03", 240 | "fields": [ 241 | "created_at", 242 | "product_id", 243 | "quantity", 244 | "region_code" 245 | ] 246 | }, 247 | { 248 | "schema": "schm_other_4", 249 | "table": "partitioned_table_default", 250 | "fields": [ 251 | "created_at", 252 | "product_id", 253 | "quantity", 254 | "region_code" 255 | ] 256 | } 257 | ] 258 | } -------------------------------------------------------------------------------- /pg_anon/modes/view_fields.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Dict 3 | 4 | from prettytable import PrettyTable, SINGLE_BORDER 5 | 6 | from pg_anon.common.db_utils import get_scan_fields_list, get_scan_fields_count 7 | from pg_anon.common.dto import FieldInfo 8 | from pg_anon.common.utils import exception_helper, get_dict_rule_for_table 9 | from pg_anon.context import Context 10 | 11 | 12 | class ViewFieldsMode: 13 | context: Context 14 | _processing_fields_limit: int = 5000 15 | _filter_dict_rule: Dict = None 16 | fields: List[FieldInfo] = None 17 | table: PrettyTable = None 18 | json: str = None 19 | fields_cut_by_limits: bool = False 20 | empty_data_filler: str = '---' 21 | 22 | def __init__(self, context: Context): 23 | self.context = context 24 | if context.options.fields_count is not None: 25 | self._processing_fields_limit = context.options.fields_count 26 | self._init_filter_dict_rule() 27 | 28 | def _init_filter_dict_rule(self): 29 | self._filter_dict_rule = {} 30 | has_schema: bool = False 31 | has_table: bool = False 32 | 33 | if self.context.options.schema_name: 34 | self._filter_dict_rule["schema"] = self.context.options.schema_name 35 | has_schema = True 36 | 37 | if self.context.options.schema_mask: 38 | self._filter_dict_rule["schema_mask"] = self.context.options.schema_mask 39 | has_schema = True 40 | 41 | if self.context.options.table_name: 42 | self._filter_dict_rule["table"] = self.context.options.table_name 43 | has_table = True 44 | 45 | if self.context.options.table_mask: 46 | self._filter_dict_rule["table_mask"] = self.context.options.table_mask 47 | has_table = True 48 | 49 | if has_schema and not has_table: 50 | self._filter_dict_rule["table_mask"] = '*' 51 | 52 | if not has_schema and has_table: 53 | self._filter_dict_rule["schema_mask"] = '*' 54 | 55 | def _check_by_filters(self, field: FieldInfo) -> bool: 56 | return bool(get_dict_rule_for_table( 57 | dictionary_rules=[self._filter_dict_rule], 58 | schema=field.nspname, 59 | table=field.relname, 60 | )) 61 | 62 | async def 
_get_fields_for_view(self) -> List[FieldInfo]: 63 | """ 64 | Get scanning fields for view mode 65 | :return: list of fields for view mode 66 | """ 67 | fields_list = await get_scan_fields_list( 68 | connection_params=self.context.connection_params, 69 | server_settings=self.context.server_settings, 70 | limit=self._processing_fields_limit 71 | ) 72 | 73 | result = [] 74 | for field in fields_list: 75 | field_info = FieldInfo(**field) 76 | if not self._filter_dict_rule or self._check_by_filters(field_info): 77 | result.append(field_info) 78 | 79 | return result 80 | 81 | async def _make_notice_fields_cut_by_limits(self): 82 | fields_count = await get_scan_fields_count( 83 | connection_params=self.context.connection_params, 84 | server_settings=self.context.server_settings 85 | ) 86 | 87 | if fields_count > self._processing_fields_limit and not self.context.options.json: 88 | print(f'You are trying to view too many fields ({fields_count} fields).' 89 | f' Only the first {self._processing_fields_limit} fields will be processed for output.' 90 | f' Use the arguments --schema-name, --schema-mask, --table-name, --table-mask to reduce the number of fields.' 91 | f' You can also use --fields-count to extend the limit.') 92 | self.fields_cut_by_limits = True 93 | 94 | def _prepare_fields_for_view(self): 95 | fields_with_find_rules = [] 96 | 97 | for field in self.fields.copy(): 98 | include_rule = get_dict_rule_for_table( 99 | dictionary_rules=self.context.prepared_dictionary_obj["dictionary"], 100 | schema=field.nspname, 101 | table=field.relname, 102 | ) 103 | 104 | if include_rule: 105 | if field.column_name in include_rule.get('fields', {}): 106 | field.rule = include_rule['fields'][field.column_name] 107 | field.dict_file_name = include_rule["dict_file_name"] 108 | fields_with_find_rules.append(field) 109 | continue 110 | elif include_rule.get('raw_sql'): 111 | field.rule = include_rule['raw_sql'] 112 | field.dict_file_name = include_rule["dict_file_name"] 113 | fields_with_find_rules.append(field) 114 | continue 115 | 116 | if not self.context.options.view_only_sensitive_fields: 117 | field.rule = self.empty_data_filler 118 | field.dict_file_name = self.empty_data_filler 119 | fields_with_find_rules.append(field) 120 | 121 | self.fields = fields_with_find_rules 122 | 123 | def _prepare_table(self): 124 | self.table = PrettyTable([ 125 | 'schema', 126 | 'table', 127 | 'field', 128 | 'type', 129 | 'dict_file_name', 130 | 'rule', 131 | ], align='l') 132 | self.table.set_style(SINGLE_BORDER) 133 | 134 | for field in self.fields: 135 | self.table.add_row([ 136 | field.nspname, 137 | field.relname, 138 | field.column_name, 139 | field.type, 140 | field.dict_file_name, 141 | field.rule, 142 | ]) 143 | 144 | def _prepare_json(self): 145 | self.json = json.dumps([{ 146 | 'schema': field.nspname, 147 | 'table': field.relname, 148 | 'field': field.column_name, 149 | 'type': field.type, 150 | 'dict_file_name': field.dict_file_name, 151 | 'rule': field.rule, 152 | } for field in self.fields], ensure_ascii=False) 153 | 154 | async def _output_fields(self): 155 | await self._make_notice_fields_cut_by_limits() 156 | 157 | self.fields = await self._get_fields_for_view() 158 | if not self.fields: 159 | raise ValueError("No fields found for view!") 160 | 161 | self._prepare_fields_for_view() 162 | 163 | if not self.fields: 164 | raise ValueError("No fields left for view after applying dictionary rules!") 165 | 166 | if self.context.options.json: 167 | self._prepare_json() 168 | print(self.json) 169 | else: 170 | self._prepare_table() 171 | print(self.table) 172 | 173 |
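    # run() ties the helpers above together: it validates the processing
    # limit, loads the prepared dictionary (keeping the source dict file
    # name for each rule so it can be shown in the dict_file_name column),
    # and then prints the matched fields as JSON or as a pretty table.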
async def run(self) -> None: 174 | self.context.logger.info("-------------> Started view_fields mode") 175 | 176 | try: 177 | if self._processing_fields_limit < 1: 178 | raise ValueError("Processing fields limit must be greater than zero!") 179 | self.context.read_prepared_dict(save_dict_file_name_for_each_rule=True) 180 | if not self.context.prepared_dictionary_obj.get("dictionary"): 181 | raise ValueError("Prepared dictionary is empty!") 182 | await self._output_fields() 183 | 184 | self.context.logger.info("<------------- Finished view_fields mode") 185 | except Exception as ex: 186 | self.context.logger.error("<------------- view_fields failed\n" + exception_helper()) 187 | raise ex 188 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | > [🏠 Home](../README.md#-documentation-index) | [⚙️ How it works](how-it-works.md) 3 | 4 | ### 1. Where can I find operation logs and launch parameters? 5 | All run data is stored in the `/path/to/pg_anon/runs` directory. 6 | Inside, the structure is: `///`. 7 | 8 | Each operation folder contains: 9 | - a `logs` directory with all log files 10 | - a `run_options.json` file with all parameters used to run `pg_anon` 11 | 12 | If the `--save-dicts` option was used, the folders `input` and `output` will also appear. 13 | They contain all input and output dictionaries for that run. 14 | 15 | --- 16 | 17 | ### 2. Can I restore a pg_anon dump using pg_dump? 18 | 19 | **No.** The pg_anon dump format is not compatible with pg_dump due to the specifics of anonymization. 20 | 21 | For the same reason, a regular backup created with pg_dump cannot be restored using pg_anon. 22 | 23 | --- 24 | 25 | ### 3. Does pg_anon modify the structure or data of the source database during scan, dump, view-data, or view-fields? 26 | 27 | pg_anon does **not** modify either the structure or the data of the source database. 28 | 29 | The only thing pg_anon adds is the `anon_funcs` schema, which is required for its internal operations. 30 | 31 | --- 32 | 33 | ### 4. Can I use custom functions for scanning? 34 | 35 | **Yes.** The meta-dictionary has a [`data_func`](dicts/meta-dict-schema.md#6-section-data_func) section. 36 | In this section, you can use any custom SQL function for sensitivity validation. 37 | 38 | This allows you to implement checks using full-text search or any other SQL capabilities. 39 | 40 | Such functions must follow this template: 41 | 42 | ```sql 43 | CREATE OR REPLACE FUNCTION <schema_name>.<function_name>( 44 | value TEXT, 45 | schema_name TEXT, 46 | table_name TEXT, 47 | field_name TEXT 48 | ) 49 | RETURNS boolean AS $$ 50 | BEGIN 51 | <detection logic that returns true or false>; 52 | END; 53 | $$ LANGUAGE plpgsql; 54 | ``` 55 | 56 | --- 57 | 58 | ### 5. Can I use custom functions for anonymization? 59 | 60 | **Yes.** You can use any functions and values available in the source database. 61 | 62 | You must ensure that anonymized values match the field format. 63 | For example, if the field type is `varchar(15)`, you must **manually** ensure the generated value does not exceed 15 characters. 64 | 65 | If the format is violated, the dump may be created successfully, but restoring it may fail. 66 | 67 | For such cases, you can also use the [`data_func`](dicts/meta-dict-schema.md#6-section-data_func) section with a scan function that checks the field length and an anonymization function tailored to that specific length.
68 | 69 | For example, the scan function below matches only fields that are declared as exactly 20 characters long and contain emails: 70 | ```sql 71 | CREATE OR REPLACE FUNCTION my_scan_funcs.is_email_field_with_len_20_chars( 72 | value TEXT, 73 | schema_name TEXT, 74 | table_name TEXT, 75 | field_name TEXT 76 | ) 77 | RETURNS boolean AS $$ 78 | DECLARE 79 | max_len integer; 80 | is_email boolean; 81 | BEGIN 82 | SELECT c.character_maximum_length 83 | INTO max_len 84 | FROM information_schema.columns c 85 | WHERE c.table_schema = $2 86 | AND c.table_name = $3 87 | AND c.column_name = $4; 88 | 89 | -- field length must be 20 characters 90 | if max_len != 20 then 91 | return false; 92 | end if; 93 | 94 | -- value must be not null for comparison 95 | if $1 is null then 96 | return false; 97 | end if; 98 | 99 | -- check email format by regexp 100 | return $1 ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'; 101 | END; 102 | $$ LANGUAGE plpgsql; 103 | ``` 104 | 105 | The meta-dict rule below can be used to detect email fields with a length of 20 characters and anonymize them while preserving both format and length. 106 | ```python 107 | { 108 | "data_func": { 109 | "varchar": [ 110 | { 111 | "scan_func": "my_scan_funcs.is_email_field_with_len_20_chars", 112 | "anon_func": "lower(anon_funcs.random_string(9)) || '@secret.com'", 113 | "n_count": 10 114 | } 115 | ] 116 | } 117 | } 118 | ``` 119 | 120 | --- 121 | 122 | ### 6. Is the scanning stage required? 123 | 124 | **No**. You can create all required dictionaries manually or reuse previously generated dictionaries. 125 | 126 | --- 127 | 128 | ### 7. Why load sensitive and non-sensitive dictionaries during scanning? 129 | 130 | They are used only to speed up scanning. 131 | 132 | These dictionaries act as a cache, allowing pg_anon to immediately know which fields are sensitive and which are not. 133 | 134 | This way, repeated scans of the same database will run very quickly. 135 | 136 | If new fields appear that are not present in the dictionaries, pg_anon will evaluate them using the rules from the meta-dictionary. 137 | 138 | --- 139 | 140 | ### 8. When should I use `--config` with a configuration file? 141 | 142 | If you plan to use pg_anon with different PostgreSQL major versions, you should define a config file. 143 | 144 | It is much easier to configure this once rather than repeatedly passing paths to pg_dump and pg_restore. 145 | 146 | If you always use a single PostgreSQL version, the system pg_dump and pg_restore will be used, and a config file is unnecessary. 147 | 148 | --- 149 | 150 | ### 9. Can I split one large dictionary into multiple smaller ones? 151 | 152 | **Yes**. All dictionary-related parameters accept lists of files. 153 | 154 | At startup, pg_anon merges them into a single dictionary internally. 155 | 156 | This makes it easy to separate different groups of rules into different files and combine them as needed. 157 | This is especially helpful for the meta-dictionary, which contains many optional sections. 158 | 159 | --- 160 | 161 | ### 10. Restore error: "Database is not empty" 162 | 163 | Restore mode checks that the target database is empty. 164 | 165 | This is done to prevent accidental data loss in the target database. 166 | 167 | If needed, use the `--drop-db` or `--clean-db` options during restore. 168 | 169 | --- 170 | 171 | ### 11. Restore error: "Database is being accessed by other users" 172 | 173 | When using the `--drop-db` option, the target database will be recreated using `DROP DATABASE` and `CREATE DATABASE`.
174 | 175 | If there are active connections, the `DROP DATABASE` command cannot be executed. 176 | 177 | You must terminate all active sessions and run the restore operation again. 178 | 179 | --- 180 | 181 | ### 12. Difference between options `--drop-db` and `--clean-db` for restore mode 182 | 183 | - `--drop-db` - recreates the target database using the `DROP DATABASE` and `CREATE DATABASE` commands, and then runs the restore process on the empty database. 184 | - `--clean-db` - performs a restore similar to `pg_restore --clean --if-exists`. It creates missing tables from the backup in the target database, and it preserves extra tables that exist in the target DB but are not contained in the backup being restored. This option does not require an empty target database. 185 | 186 | --- 187 | 188 | ### 13. Determining Optimal Process and Connection Counts 189 | 190 | To configure optimal values, first identify these system parameters: 191 | - `max_connections` - the maximum number of connections allowed by your PostgreSQL database 192 | - CPU core count 193 | - Reserved connections (typically 3-10 for maintenance/admin connections) 194 | 195 | Important Considerations: 196 | - Exceeding `max_connections` may cause pg_anon failures and affect other database applications 197 | - Ensure sufficient connection headroom for other services 198 | 199 | #### Recommended Configuration: 200 | 201 | Process Count 202 | ```bash 203 | --processes = CPU cores 204 | ``` 205 | Database Connections per Process 206 | ```bash 207 | --db-connections-per-process ≤ (max_connections - reserved_connections) / --processes 208 | ``` 209 | 210 | #### Example Calculation: 211 | - CPU cores: 4 212 | - max_connections: 100 213 | - reserved_connections: 5 214 | - --processes: 4 215 | - --db-connections-per-process: (100 - 5) / 4 ≈ 23.75 → 23 216 | - **Verification:** 4 processes × 23 connections = 92 total connections (within 100 limit) 217 | -------------------------------------------------------------------------------- /docs/dicts/sens-dict-schema.md: -------------------------------------------------------------------------------- 1 | # 📋 Sensitive Dictionary 2 | > [🏠 Home](../../README.md#-dictionary-schemas) | [🔍 Scan](../operations/scan.md) | [💾 Dump](../operations/dump.md) | [🔬 View Fields](../operations/view-fields.md) | [📊 View Data](../operations/view-data.md) | [🗂️ Meta Dictionary](meta-dict-schema.md) | [📋 Non-sensitive Dictionary](non-sens-dict-schema.md) 3 | 4 | ## Overview 5 | The sensitive dictionary defines explicit anonymization rules for fields. 6 | It is used in four operation modes, and its behavior differs slightly across them: 7 | 8 | 1. [💾 Dump mode](../operations/dump.md) 9 | 10 | Fields listed in the dictionary are anonymized using the defined rules. 11 | All other fields are dumped as-is. 12 | 13 | 2. [🔍 Create-dict (scan) mode](../operations/scan.md) 14 | 15 | Fields listed in the sensitive dictionary are treated as known **sensitive** fields, 16 | which skips sensitivity detection for them. 17 | This speeds up the scanning process. 18 | 19 | 3. [🔬 View fields mode](../operations/view-fields.md) 20 | 21 | Shows which anonymization rules would be applied to fields. 22 | 23 | 4. [📊 View data mode](../operations/view-data.md) 24 | 25 | Shows how the rules would affect sample data, without performing a dump. 26 | 27 | This dictionary can be created manually or generated automatically using [create-dict (scan) mode](../operations/scan.md).
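For orientation, here is a minimal sketch of such a dictionary with a single rule; the schema, table, and field names are hypothetical, and the anonymization expression reuses the `anon_funcs.digest` helper shown in the full example later in this document:

```python
{
    "dictionary": [
        {
            "schema": "public",
            "table": "users",  # hypothetical table
            "fields": {
                # replace each value with a salted SHA-256 digest
                "phone": "anon_funcs.digest(\"phone\", 'salt_word', 'sha256')",
            },
        }
    ]
}
```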
28 | 29 | > ⚠️ **Note** 30 | > 31 | > If a field appears both in the sensitive dictionary and the [non-sensitive](non-sens-dict-schema.md) dictionary, the sensitive dictionary takes priority. 32 | 33 | 34 | --- 35 | 36 | ## Schema 37 | ```python 38 | { 39 | "dictionary": [ 40 | { 41 | "schema": "<schema_name>", 42 | "table": "<table_name>", 43 | "fields": { 44 | "<field_name>": "<anonymization rule: raw SQL or anon_funcs expression>", 45 | }, 46 | "sql_condition": # Optional. Condition in raw SQL format for filtering the data to dump. (This section is ignored in create-dict (scan) mode.) 47 | """ 48 | <raw SQL condition> 49 | """ 50 | } 51 | ], 52 | # Optional section. It is used to exclude schemas and tables from the data dump. 53 | "dictionary_exclude": [ 54 | { 55 | "schema": "<schema_name>", # Exclude only this schema 56 | "schema_mask": "<regex pattern>", # Or exclude schemas matching the regex pattern 57 | "table": "<table_name>", # Exclude only this table 58 | "table_mask": "<regex pattern>", # Or exclude tables matching the regex pattern 59 | } 60 | ] 61 | } 62 | ``` 63 | > ⚠️ **Note** 64 | > - `sql_condition` in the `dictionary` section is optional. It can be used to dump only a subset of the data, for example, only the last week of a table's rows. 65 | > - `dictionary_exclude` is an optional section. If a table appears in both the "dictionary_exclude" and "dictionary" sections, the table will still be dumped. This can be used for partial dumps and for debugging the anonymization process. 66 | > - In `dictionary_exclude`, you must use either `schema` or `schema_mask` → not both. 67 | > - In `dictionary_exclude`, you must use either `table` or `table_mask` → not both. 68 | 69 | --- 70 | 71 | ## ⚙️ Using the Dictionary 72 | 73 | **🏛️ Example Database Structure** 74 | 75 | | Schema | Table | Field | 76 | |-----------|-----------|------------------| 77 | | public | employees | id | 78 | | public | employees | full_name | 79 | | public | employees | email | 80 | | public | employees | hire_date | 81 | | public | salaries | employee_id | 82 | | public | salaries | monthly_salary | 83 | | public | salaries | currency | 84 | | ecommerce | orders | product_id | 85 | | ecommerce | orders | count | 86 | | ecommerce | orders | client_name | 87 | | ecommerce | orders | delivery_address | 88 | | ecommerce | orders | created | 89 | | ecommerce | orders | status | 90 | | tenant_a | projects | title | 91 | | tenant_a | projects | description | 92 | | tenant_b | projects | title | 93 | | tenant_b | projects | description | 94 | | tenant_c | projects | title | 95 | | tenant_c | projects | description | 96 | 97 | 98 | 99 | **📘 Example Sensitive Dictionary** 100 | ```python 101 | { 102 | "dictionary": [ 103 | { 104 | "schema": "public", 105 | "table": "employees", 106 | "fields": { 107 | "full_name": "anon_funcs.digest(\"full_name\", 'salt_word', 'sha256')", # hashing employee names 108 | "email": "md5(\"email\") || '@abc.com'", # hashing employee emails while preserving email format 109 | }, 110 | }, 111 | { 112 | "schema": "public", 113 | "table": "salaries", 114 | "fields": { 115 | "monthly_salary": "10000", # sets one constant value for the field in all rows 116 | }, 117 | }, 118 | { 119 | "schema": "ecommerce", 120 | "table": "orders", 121 | "fields": { 122 | "client_name": "anon_funcs.digest(\"client_name\", 'salt_word', 'sha256')", 123 | "delivery_address": "anon_funcs.digest(\"delivery_address\", 'salt_word', 'sha256')", 124 | }, 125 | "sql_condition": # Dumping only the orders completed within the last week 126 | """ 127 | WHERE created > NOW() - '7 days'::interval 128 | AND status = 'done' 129 | """ 130 | } 131 | ], 132 | # Excluding all tables from schemas `tenant_a`, `tenant_b`, `tenant_c` 133
| "dictionary_exclude": [ 134 | { 135 | "schema_mask": "tenant_.*", 136 | "table_mask": "*", 137 | } 138 | ] 139 | } 140 | ``` 141 | 142 | **This dictionary matches the following table fields:** 143 | 144 | | Schema | Table | Field | Used in `dump` mode | Used in `create-dict (scan)` mode | 145 | |--------------|-----------|------------------|---------------------------|---------------------------------------------------------| 146 | | public | employees | id | Dumped as is | Fields scanned using meta-dictionary rules | 147 | | public | employees | full_name | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 148 | | public | employees | email | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 149 | | public | employees | hire_date | Dumped as is | Fields scanned using meta-dictionary rules | 150 | | public | salaries | employee_id | Dumped as is | Fields scanned using meta-dictionary rules | 151 | | public | salaries | monthly_salary | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 152 | | public | salaries | currency | Dumped as is | Fields scanned using meta-dictionary rules | 153 | | ecommerce | orders | product_id | Dumped as is | Fields scanned using meta-dictionary rules | 154 | | ecommerce | orders | client_name | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 155 | | ecommerce | orders | delivery_address | Dumped with anonymization | Excluded from sensitivity checks as a "sensitive" field | 156 | | ecommerce | orders | count | Dumped as is | Fields scanned using meta-dictionary rules | 157 | | ecommerce | orders | created | Dumped as is | Fields scanned using meta-dictionary rules | 158 | | ecommerce | orders | status | Dumped as is | Fields scanned using meta-dictionary rules | 159 | --------------------------------------------------------------------------------