├── .editorconfig ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── database_sanitizer ├── __init__.py ├── __main__.py ├── config.py ├── dump │ ├── __init__.py │ ├── mysql.py │ └── postgres.py ├── sanitizers │ ├── __init__.py │ ├── constant.py │ ├── derived.py │ ├── string.py │ ├── times.py │ └── user.py ├── session.py ├── tests │ ├── __init__.py │ ├── test_config.py │ ├── test_dump.py │ ├── test_dump_mysql.py │ ├── test_dump_postgres.py │ ├── test_main.py │ ├── test_sanitizers_constant.py │ ├── test_sanitizers_derived.py │ ├── test_sanitizers_string.py │ ├── test_sanitizers_times.py │ ├── test_sanitizers_user.py │ ├── test_session.py │ ├── test_utils_mysql.py │ └── test_utils_postgres.py └── utils │ ├── __init__.py │ ├── mysql.py │ └── postgres.py ├── requirements-test.txt ├── setup.cfg └── setup.py /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | 9 | [*.{md,py}] 10 | indent_style = space 11 | indent_size = 4 12 | line_length = 79 13 | 14 | [*.yml] 15 | indent_style = space 16 | indent_size = 2 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg 3 | *.egg-info/ 4 | __pycache__ 5 | /.coverage 6 | /.eggs 7 | /.idea 8 | /.pytest_cache 9 | /build 10 | /coverage.xml 11 | /dist 12 | /htmlcov 13 | /venv 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | cache: pip 4 | python: 5 | - "2.7" 6 | - "3.4" 7 | - "3.5" 8 | - "3.6" 9 | install: 10 | - pip install -r requirements-test.txt 11 | script: 12 | - py.test -vvv --cov database_sanitizer --cov-report=term-missing 
13 | after_success: 14 | - curl -s https://codecov.io/bash | bash 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Anders Innovations 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Database sanitation tool 2 | 3 | [![pypi][pypi-image]][pypi-url] 4 | [![travis][travis-image]][travis-url] 5 | [![codecov][codecov-image]][codecov-url] 6 | 7 | [pypi-image]: https://badge.fury.io/py/database-sanitizer.svg 8 | [pypi-url]: https://pypi.org/project/database-sanitizer/ 9 | [travis-image]: https://travis-ci.org/andersinno/python-database-sanitizer.svg?branch=master 10 | [travis-url]: https://travis-ci.org/andersinno/python-database-sanitizer 11 | [codecov-image]: https://codecov.io/gh/andersinno/python-database-sanitizer/branch/master/graph/badge.svg 12 | [codecov-url]: https://codecov.io/gh/andersinno/python-database-sanitizer 13 | 14 | `database-sanitizer` is a tool which retrieves a database dump from 15 | a relational database and performs sanitation on the retrieved data 16 | according to rules defined in a configuration file. Currently the 17 | sanitation tool supports both [PostgreSQL] and [MySQL] databases. 18 | 19 | [PostgreSQL]: https://postgres.org 20 | [MySQL]: https://mysql.com 21 | 22 | ## Installation 23 | 24 | `database-sanitizer` can be installed from [PyPI] with [pip] like this: 25 | 26 | ```bash 27 | $ pip install database-sanitizer 28 | ``` 29 | 30 | If you are using MySQL, you need to install the package like this 31 | instead, so that additional requirements are included: 32 | 33 | ```bash 34 | $ pip install database-sanitizer[MySQL] 35 | ``` 36 | 37 | [PyPI]: https://pypi.org 38 | [pip]: https://pip.pypa.io/en/stable/ 39 | 40 | ## Usage 41 | 42 | Once the package has been installed, `database-sanitizer` can be used 43 | like this: 44 | 45 | ```bash 46 | $ database-sanitizer <DATABASE-URL> 47 | ``` 48 | 49 | Command line argument `DATABASE-URL` needs to be provided so the tool 50 | knows how to retrieve the dump from the database. 
With PostgreSQL, it 51 | would be something like this: 52 | 53 | ```bash 54 | $ database-sanitizer postgres://user:password@host/database 55 | ``` 56 | 57 | However, unless a configuration file is provided, no sanitation will be 58 | performed on the retrieved database dump, which leads us to the next 59 | section which will be... 60 | 61 | ## Configuration 62 | 63 | Rules for the sanitation can be given in a configuration file written in 64 | [YAML]. Path to the configuration file is then given to the command line 65 | utility with `--config` argument (`-c` for shorthand) like this: 66 | 67 | [YAML]: http://yaml.org 68 | 69 | ```bash 70 | $ database-sanitizer -c config.yml postgres://user:password@host/database 71 | ``` 72 | 73 | The configuration file uses the following kind of syntax: 74 | 75 | ```YAML 76 | config: 77 | addons: 78 | - some.other.package 79 | - yet.another.package 80 | extra_parameters: # These parameters will be passed to the dump tool CLI 81 | mysqldump: 82 | - "--single-transaction" # Included by default 83 | pg_dump: 84 | - "--exclude-table=something" 85 | strategy: 86 | user: 87 | first_name: name.first_name 88 | last_name: name.last_name 89 | secret_key: string.empty 90 | access_log: skip_rows 91 | ``` 92 | 93 | In the example configuration above, there are first listed two "addon 94 | packages", which are names of Python packages where the sanitizer will 95 | be looking for sanitizer functions. They are completely optional and can 96 | be omitted, in which case only sanitizer functions defined in a package 97 | called `sanitizers` and built-in sanitizers will be used instead. 98 | 99 | It's also possible to define extra parameters to pass to the dump tool ( 100 | `mysqldump` or `pg_dump`). By default, `mysqldump` will include the 101 | `--single-transaction` extra parameter. You can disable this by defining the 102 | extra parameters in the config file explicitly, e.g. with an empty array `[]`. 
103 | 104 | The `strategy` portion of the configuration contains the actual 105 | sanitation rules. First you define the name of the database table (in the 106 | example that would be `user`) followed by the column names in that table, 107 | each one mapped to a sanitation function name. The name of the 108 | sanitation function consists of two parts separated from each other by 109 | a dot: Python module name and name of the actual function, which will 110 | be prefixed with `sanitize_`, so `name.first_name` would be a function 111 | called `sanitize_first_name` in a file called `name.py`. 112 | 113 | Table content can be left out completely from the sanitized dump by 114 | setting table strategy to `skip_rows` (check `access_log` table in the 115 | example config). This will leave out all `INSERT INTO` (MySQL) or `COPY` 116 | (PostgreSQL) statements from the sanitized dump file. `CREATE TABLE` 117 | statements will not be removed. 118 | -------------------------------------------------------------------------------- /database_sanitizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/__init__.py -------------------------------------------------------------------------------- /database_sanitizer/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import argparse 6 | import codecs 7 | import os 8 | import sys 9 | 10 | import six 11 | 12 | from .config import Configuration 13 | from .dump import run 14 | 15 | 16 | def main(argv=sys.argv): 17 | parser = argparse.ArgumentParser( 18 | prog=(argv[0] if len(argv) else "database-sanitizer"), 19 | description="Sanitizes contents of databases.", 20 | ) 21 | parser.add_argument( 22 | "--config", 23 | "-c", 24 | type=str, 25 | 
__all__ = ("Configuration", "ConfigurationError")

#: Value which, when used as a table's strategy, drops all rows of that table.
SKIP_ROWS_CONFIG_VALUE = "skip_rows"
#: Extra `mysqldump` parameters used when the configuration defines none.
MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
#: Extra `pg_dump` parameters used when the configuration defines none.
PG_DUMP_DEFAULT_PARAMETERS = []


class ConfigurationError(ValueError):
    """
    Custom exception type used to indicate configuration file errors.
    """


class Configuration(object):
    """
    Object representation of database sanitizer configuration, usually read
    from a YAML file.
    """
    def __init__(self):
        # Maps "<table>.<column>" keys into sanitizer callables.
        self.sanitizers = {}
        # Names of tables whose rows are left out of the dump entirely.
        self.skip_rows_for_tables = []
        # Extra Python packages searched for sanitizer modules.
        self.addon_packages = []
        # Extra command line parameters for the dump tools.
        self.mysqldump_params = []
        self.pg_dump_params = []

    @classmethod
    def from_file(cls, filename):
        """
        Reads configuration from given path to a file in local file system and
        returns parsed version of it.

        :param filename: Path to the YAML file in local file system where the
                         configuration will be read from.
        :type filename: str

        :return: Configuration instance parsed from given configuration file.
        :rtype: Configuration

        :raises ConfigurationError: If the file content is not a valid
                                    sanitizer configuration.
        """
        instance = cls()

        with open(filename, "rb") as file_stream:
            config_data = yaml.safe_load(file_stream)

        instance.load(config_data)

        return instance

    def load(self, config_data):
        """
        Loads sanitizers according to rulesets defined in given already parsed
        configuration file.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If the data is not a dictionary or any of
                                    its sections is malformed.
        """
        if not isinstance(config_data, dict):
            raise ConfigurationError(
                "Configuration data is %s instead of dict." % (
                    type(config_data),
                )
            )

        self.load_addon_packages(config_data)
        self.load_sanitizers(config_data)
        self.load_dump_extra_parameters(config_data)

    def load_dump_extra_parameters(self, config_data):
        """
        Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
        parameters should be added to the mysqldump and/or pg_dump command call
        when taking a dump.

        A missing or empty (null) "config" or "config.extra_parameters"
        section means that the default parameters are used.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If a section or a parameter list has a
                                    wrong type.
        """
        # An empty `config:` key is parsed as None by YAML. Treat it the same
        # way as a missing section, like `load_addon_packages` does.
        section_config = config_data.get("config")
        if section_config is None:
            section_config = {}
        if not isinstance(section_config, dict):
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_extra_parameters = section_config.get("extra_parameters")
        if section_extra_parameters is None:
            section_extra_parameters = {}
        if not isinstance(section_extra_parameters, dict):
            raise ConfigurationError(
                "'config.extra_parameters' is %s instead of dict" % (
                    type(section_extra_parameters),
                ),
            )

        mysqldump_params = section_extra_parameters.get(
            "mysqldump", MYSQLDUMP_DEFAULT_PARAMETERS)
        if not isinstance(mysqldump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.mysqldump' is %s instead of list" % (
                    type(mysqldump_params),
                ),
            )

        pg_dump_params = section_extra_parameters.get(
            "pg_dump", PG_DUMP_DEFAULT_PARAMETERS)
        if not isinstance(pg_dump_params, list):
            raise ConfigurationError(
                "'config.extra_parameters.pg_dump' is %s instead of list" % (
                    type(pg_dump_params),
                ),
            )

        # Store copies, so that mutating the parameters of one configuration
        # can never alter the shared module level defaults.
        self.mysqldump_params = list(mysqldump_params)
        self.pg_dump_params = list(pg_dump_params)

    def load_addon_packages(self, config_data):
        """
        Loads the module paths from which the configuration will attempt to
        load sanitizers from. These must be stored as a list of strings under
        "config.addons" section of the configuration data.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "config" is not a dictionary or
                                    "config.addons" is not a list of strings.
        """
        section_config = config_data.get("config")
        if section_config is None:
            return
        if not isinstance(section_config, dict):
            raise ConfigurationError(
                "'config' is %s instead of dict" % (
                    type(section_config),
                ),
            )

        section_addons = section_config.get("addons", [])
        if not isinstance(section_addons, list):
            raise ConfigurationError(
                "'config.addons' is %s instead of list" % (
                    type(section_addons),
                ),
            )

        # Validate every item before storing anything.
        for index, module_path in enumerate(section_addons):
            if not isinstance(module_path, str):
                raise ConfigurationError(
                    "Item %d in 'config.addons' is %s instead of string" % (
                        index,
                        type(module_path),
                    ),
                )

        self.addon_packages = list(section_addons)

    def load_sanitizers(self, config_data):
        """
        Loads sanitizers possibly defined in the configuration under dictionary
        called "strategy", which should contain mapping of database tables with
        column names mapped into sanitizer function names.

        :param config_data: Already parsed configuration data, as dictionary.
        :type config_data: dict[str,any]

        :raises ConfigurationError: If "strategy" or any table in it has a
                                    wrong type, or a sanitizer is not found.
        """
        section_strategy = config_data.get("strategy")
        if section_strategy is None:
            return
        # Anything else than a mapping of tables is an error. This includes
        # the bare "skip_rows" string, which is only valid as the strategy of
        # a single table and cannot be iterated as a mapping.
        if not isinstance(section_strategy, dict):
            raise ConfigurationError(
                "'strategy' is %s instead of dict" % (
                    type(section_strategy),
                ),
            )

        for table_name, column_data in six.iteritems(section_strategy):
            if column_data == SKIP_ROWS_CONFIG_VALUE:
                self.skip_rows_for_tables.append(table_name)
                continue

            # Table without any column rules.
            if column_data is None:
                continue

            if not isinstance(column_data, dict):
                raise ConfigurationError(
                    "'strategy.%s' is %s instead of dict" % (
                        table_name,
                        type(column_data),
                    ),
                )

            for column_name, sanitizer_name in six.iteritems(column_data):
                # Column explicitly left without a sanitizer.
                if sanitizer_name is None:
                    continue

                if not isinstance(sanitizer_name, str):
                    raise ConfigurationError(
                        "'strategy.%s.%s' is %s instead of string" % (
                            table_name,
                            column_name,
                            type(sanitizer_name),
                        ),
                    )

                sanitizer_callback = self.find_sanitizer(sanitizer_name)
                sanitizer_key = "%s.%s" % (table_name, column_name)
                self.sanitizers[sanitizer_key] = sanitizer_callback

    def find_sanitizer(self, name):
        """
        Searches for a sanitizer function with given name. The name should
        contain two parts separated from each other with a dot, the first
        part being the module name while the second being name of the function
        contained in the module, when it's being prefixed with "sanitize_".

        The lookup process consists of three attempts, which are:

        1. First package to look the module will be top level package called
           "sanitizers".
        2. Module will be looked under the "addon" packages, if they have been
           defined.
        3. Finally the sanitation function will be looked from the builtin
           sanitizers located in "database_sanitizer.sanitizers" package.

        If none of these provide any results, ConfigurationError will be
        thrown.

        :param name: "Full name" of the sanitation function containing name
                     of the module as well as name of the function.
        :type name: str

        :return: First function which can be imported with the given name.
        :rtype: callable

        :raises ConfigurationError: If the name has no module part or no
                                    matching sanitizer function exists.
        """
        # Split the sanitizer name into two parts, one containing the Python
        # module name, while second containing portion of the function name
        # we are looking for.
        name_parts = name.split(".")
        if len(name_parts) < 2:
            raise ConfigurationError(
                "Unable to separate module name from function name in '%s'" % (
                    name,
                ),
            )

        module_name_suffix = ".".join(name_parts[:-1])
        function_name = "sanitize_%s" % (name_parts[-1],)

        # Packages to search, in priority order: the custom top level
        # "sanitizers" package, then the addon packages and finally the
        # builtin sanitizers.
        candidate_packages = ["sanitizers"]
        candidate_packages.extend(self.addon_packages)
        candidate_packages.append("database_sanitizer.sanitizers")

        for package_name in candidate_packages:
            callback = self.find_sanitizer_from_module(
                module_name="%s.%s" % (package_name, module_name_suffix),
                function_name=function_name,
            )
            if callback:
                return callback

        # Give up.
        raise ConfigurationError("Unable to find sanitizer called '%s'" % (
            name,
        ))

    @staticmethod
    def find_sanitizer_from_module(module_name, function_name):
        """
        Attempts to find sanitizer function from given module. If the module
        cannot be imported, or function with given name does not exist in it,
        nothing will be returned by this method. Otherwise the found sanitizer
        function will be returned.

        :param module_name: Name of the module to import the function from.
        :type module_name: str

        :param function_name: Name of the function to look for inside the
                              module.
        :type function_name: str

        :return: Sanitizer function found from the module, if it can be
                 imported and it indeed contains function with the given name.
                 Otherwise None will be returned instead.
        :rtype: callback|None

        :raises ConfigurationError: If the name exists in the module but is
                                    not callable.
        """
        # NOTE: Any ImportError is treated as "module not found", including
        # one raised by a broken import inside an existing module.
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            return None

        # Look for the function inside the module. At this point it could be
        # pretty much anything.
        callback = getattr(module, function_name, None)

        # Function does not exist in this module? Give up.
        if callback is None:
            return None

        # It's actually callable function? Return it.
        if callable(callback):
            return callback

        # Sanitizer seems to be something else than a function. Throw an
        # exception to report such problem.
        raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
            function_name,
            module_name,
            type(callback),
        ))

    def get_sanitizer_for(self, table_name, column_name):
        """
        Get sanitizer for given table and column name.

        :param table_name: Name of the database table.
        :type table_name: str

        :param column_name: Name of the database column.
        :type column_name: str

        :return: Sanitizer function or None if nothing is configured
        :rtype: Optional[Callable[[Optional[str]], Optional[str]]]
        """
        sanitizer_key = "%s.%s" % (table_name, column_name)
        return self.sanitizers.get(sanitizer_key)

    def sanitize(self, table_name, column_name, value):
        """
        Sanitizes given value extracted from the database according to the
        sanitation configuration.

        TODO: Add support for dates, booleans and other types found in SQL than
        string.

        :param table_name: Name of the database table from which the value is
                           from.
        :type table_name: str

        :param column_name: Name of the database column from which the value is
                            from.
        :type column_name: str

        :param value: Value from the database, either in text form or None if
                      the value is null.
        :type value: str|None

        :return: Sanitized version of the given value, or the value itself
                 when no sanitizer is configured for the column.
        :rtype: str|None
        """
        sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
        return sanitizer_callback(value) if sanitizer_callback else value
def run(url, output, config):
    """
    Dumps the database behind the given URL and writes a sanitized copy of
    the dump into the given stream, one line at a time.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
                   written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the URL scheme has no registered dump module.
    """
    database_url = urlparse.urlparse(url)
    scheme = database_url.scheme

    # The URL scheme selects which dump implementation is used.
    module_path = SUPPORTED_DATABASE_MODULES.get(scheme)
    if not module_path:
        raise ValueError("Unsupported database scheme: '%s'" % (scheme,))

    dump_module = importlib.import_module(module_path)

    # Reset the session before sanitizing anything.
    session.reset()

    sanitized_lines = dump_module.sanitize(url=database_url, config=config)
    for sanitized_line in sanitized_lines:
        output.write(sanitized_line + "\n")
statements produced by the 18 | #: `mysqldump` utility, even when extended inserts have been enabled. 19 | INSERT_INTO_PATTERN = re.compile( 20 | r"^INSERT INTO `(?P[^`]*)`" 21 | r" \((?P.*)\)" 22 | r" VALUES (?P.*);$" 23 | ) 24 | 25 | 26 | #: Regular expression which matches various kinds of MySQL literals. 27 | VALUE_PATTERN = re.compile( 28 | r""" 29 | # Group 1: 30 | ( 31 | '(?:[^']|''|\\')*(?[^\"]*)\".\"(?P
[^\"]*)\" " 14 | r"\((?P.*)\) " 15 | r"FROM stdin;$" 16 | ) 17 | 18 | 19 | def sanitize(url, config): 20 | """ 21 | Obtains dump of an Postgres database by executing `pg_dump` command and 22 | sanitizes it's output. 23 | 24 | :param url: URL to the database which is going to be sanitized, parsed by 25 | Python's URL parser. 26 | :type url: six.moves.urllib.parse.ParseResult 27 | 28 | :param config: Optional sanitizer configuration to be used for sanitation 29 | of the values stored in the database. 30 | :type config: database_sanitizer.config.Configuration|None 31 | """ 32 | if url.scheme not in ("postgres", "postgresql", "postgis"): 33 | raise ValueError("Unsupported database type: '%s'" % (url.scheme,)) 34 | 35 | extra_params = PG_DUMP_DEFAULT_PARAMETERS 36 | if config: 37 | extra_params = config.pg_dump_params 38 | 39 | process = subprocess.Popen( 40 | ( 41 | "pg_dump", 42 | # Force output to be UTF-8 encoded. 43 | "--encoding=utf-8", 44 | # Quote all table and column names, just in case. 45 | "--quote-all-identifiers", 46 | # Luckily `pg_dump` supports DB URLs, so we can just pass it the 47 | # URL as argument to the command. 48 | "--dbname", 49 | url.geturl().replace('postgis://', 'postgresql://'), 50 | ) + tuple(extra_params), 51 | stdout=subprocess.PIPE, 52 | ) 53 | 54 | sanitize_value_line = None 55 | current_table = None 56 | current_table_columns = None 57 | skip_table = False 58 | 59 | for line in codecs.getreader("utf-8")(process.stdout): 60 | # Eat the trailing new line. 61 | line = line.rstrip("\n") 62 | 63 | # Are we currently in middle of `COPY` statement? 64 | if current_table: 65 | # Backslash following a dot marks end of an `COPY` statement. 66 | if line == "\\.": 67 | current_table = None 68 | current_table_columns = None 69 | if not skip_table: 70 | yield "\\." 
71 | skip_table = False 72 | continue 73 | 74 | if skip_table: 75 | continue 76 | 77 | if not sanitize_value_line: 78 | yield line 79 | continue 80 | 81 | yield sanitize_value_line(line) 82 | continue 83 | 84 | # Is the line beginning of `COPY` statement? 85 | copy_line_match = COPY_LINE_PATTERN.match(line) 86 | if not copy_line_match: 87 | yield line 88 | continue 89 | 90 | current_table = copy_line_match.group("table") 91 | current_table_columns = parse_column_names(copy_line_match.group("columns")) 92 | 93 | # Skip `COPY` statement if table rows are configured 94 | # to be skipped. 95 | if config and current_table in config.skip_rows_for_tables: 96 | skip_table = True 97 | continue 98 | 99 | sanitize_value_line = get_value_line_sanitizer( 100 | config, current_table, current_table_columns) 101 | 102 | yield line 103 | 104 | 105 | def get_value_line_sanitizer(config, table, columns): 106 | if not config: 107 | return None 108 | 109 | def get_sanitizer(column): 110 | sanitizer = config.get_sanitizer_for(table, column) 111 | 112 | if not sanitizer: 113 | return _identity 114 | 115 | def decode_sanitize_encode(value): 116 | return encode_copy_value(sanitizer(decode_copy_value(value))) 117 | 118 | return decode_sanitize_encode 119 | 120 | sanitizers = [get_sanitizer(column) for column in columns] 121 | 122 | if all(x is _identity for x in sanitizers): 123 | return None 124 | 125 | def sanitize_line(line): 126 | values = line.split('\t') 127 | if len(values) != len(columns): 128 | raise ValueError("Mismatch between column names and values.") 129 | return '\t'.join( 130 | sanitizer(value) 131 | for (sanitizer, value) in zip(sanitizers, values)) 132 | 133 | return sanitize_line 134 | 135 | 136 | def _identity(x): 137 | return x 138 | 139 | 140 | def parse_column_names(text): 141 | """ 142 | Extracts column names from a string containing quoted and comma separated 143 | column names. 
144 | 145 | :param text: Line extracted from `COPY` statement containing quoted and 146 | comma separated column names. 147 | :type text: str 148 | 149 | :return: Tuple containing just the column names. 150 | :rtype: tuple[str] 151 | """ 152 | return tuple( 153 | re.sub(r"^\"(.*)\"$", r"\1", column_name.strip()) 154 | for column_name in text.split(",") 155 | ) 156 | 157 | 158 | def parse_values(text): 159 | """ 160 | Parses line following `COPY` statement containing values for a single row 161 | in the table, in custom Postgres format. 162 | 163 | :param text: Line following `COPY` statement containing values. 164 | :type text: str 165 | 166 | :return: Column values extracted from the given line. 167 | :rtype: tuple[str|None] 168 | """ 169 | return tuple(decode_copy_value(value) for value in text.split("\t")) 170 | -------------------------------------------------------------------------------- /database_sanitizer/sanitizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/sanitizers/__init__.py -------------------------------------------------------------------------------- /database_sanitizer/sanitizers/constant.py: -------------------------------------------------------------------------------- 1 | def sanitize_null(value): 2 | return None 3 | 4 | 5 | def sanitize_empty_json_dict(value): 6 | return '{}' 7 | 8 | 9 | def sanitize_empty_json_list(value): 10 | return '[]' 11 | 12 | 13 | def sanitize_invalid_django_password(value): 14 | return '!' 
def sanitize_empty(value):
    """
    Built-in sanitizer which turns any non-null value into an empty string.
    Null values are kept as null.
    """
    if value is None:
        return None
    return ""


def sanitize_zfill(value):
    """
    Built-in sanitizer which replaces every character of the original value
    with a zero, keeping the length. Null values are kept as null.
    """
    if value is None:
        return None
    return "0" * len(value)
28 | """ 29 | if not value: 30 | return value 31 | return ''.join(random.choice(CHARACTERS) for _ in range(len(value))) 32 | -------------------------------------------------------------------------------- /database_sanitizer/sanitizers/times.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | 4 | TEN_YEARS_AS_SECONDS = 10 * 365 * 24 * 3600 5 | 6 | 7 | def sanitize_random_past_timestamp(value): 8 | num = random.randint(0, TEN_YEARS_AS_SECONDS * 1000) 9 | delta = datetime.timedelta(seconds=(num / 1000.0)) 10 | dt = datetime.datetime.now() - delta 11 | return dt.isoformat() 12 | -------------------------------------------------------------------------------- /database_sanitizer/sanitizers/user.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | from six import text_type 4 | 5 | from database_sanitizer.session import hash_text_to_int, hash_text_to_ints 6 | 7 | 8 | def sanitize_email(value): 9 | if not value: 10 | return value 11 | (num1, num2, num3) = hash_text_to_ints(value.strip(), [16, 16, 32]) 12 | given_name = given_names[num1 % given_names_count] 13 | surname = surnames[num2 % surnames_count] 14 | case_convert = (text_type.lower if num3 % 8 > 0 else lambda x: x) 15 | return '{first}.{last}@x{num:x}.sanitized.net'.format( 16 | first=case_convert(given_name), 17 | last=case_convert(surname).replace("'", ''), 18 | num=num3) 19 | 20 | 21 | def sanitize_username(value): 22 | if not value: 23 | return value 24 | (num1, num2) = hash_text_to_ints(value, [16, 32]) 25 | return '{}{:x}'.format(given_names[num1 % given_names_count].lower(), num2) 26 | 27 | 28 | def sanitize_full_name_en_gb(value): 29 | if not value: 30 | return value 31 | (num1, num2) = hash_text_to_ints(value.strip().lower(), [16, 16]) 32 | return '{} {}'.format( 33 | given_names[num1 % given_names_count], surnames[num2 % surnames_count]) 34 | 35 | 
def sanitize_given_name_en_gb(value):
    """
    Replace a given name with one picked from ``given_names`` by the
    session hash of the stripped and lower-cased value.

    Falsy values are returned as they are.
    """
    if value:
        seed = hash_text_to_int(value.strip().lower())
        return given_names[seed % given_names_count]
    return value


def sanitize_surname_en_gb(value):
    """
    Replace a surname with one picked from ``surnames`` by the session
    hash of the stripped and lower-cased value.

    Falsy values are returned as they are.
    """
    if value:
        seed = hash_text_to_int(value.strip().lower())
        return surnames[seed % surnames_count]
    return value


# Whitespace separated table; it is split into a list of names below.
given_names = """
Aaron Abbie Abdul Abigail Adam Adrian Aimee Alan Albert Alex
Alexander Alexandra Alice Alison Allan Amanda Amber Amelia Amy Andrea
Andrew Angela Ann Anna Anne Annette Anthony Antony Arthur Ashleigh
Ashley Barbara Barry Ben Benjamin Bernard Beth Bethan Bethany Beverley
Billy Bradley Brandon Brenda Brett Brian Bruce Bryan Callum Cameron Carl
Carly Carol Carole Caroline Carolyn Catherine Charlene Charles Charlie
Charlotte Chelsea Cheryl Chloe Christian Christine Christopher Claire
Clare Clifford Clive Colin Connor Conor Craig Dale Damian Damien Daniel
Danielle Danny Darren David Dawn Dean Deborah Debra Declan Denis Denise
Dennis Derek Diana Diane Dominic Donald Donna Dorothy Douglas Duncan
Dylan Edward Eileen Elaine Eleanor Elizabeth Ellie Elliot Elliott Emily
Emma Eric Fiona Frances Francesca Francis Frank Frederick Gail Gareth
Garry Gary Gavin Gemma Geoffrey George Georgia Georgina Gerald Geraldine
Gerard Gillian Glen Glenn Gordon Grace Graeme Graham Gregory Guy Hannah
Harriet Harry Hayley Hazel Heather Helen Henry Hilary Hollie Holly
Howard Hugh Iain Ian Irene Jack Jacob Jacqueline Jade Jake James Jamie
Jane Janet Janice Jasmine Jason Jay Jayne Jean Jeffrey Jemma Jenna
Jennifer Jeremy Jessica Jill Joan Joanna Joanne Jodie Joe Joel John
Jonathan Jordan Joseph Josephine Josh Joshua Joyce Judith Julia Julian
Julie June Justin Karen Karl Kate Katherine Kathleen Kathryn Katie Katy
Kayleigh Keith Kelly Kenneth Kerry Kevin Kieran Kim Kimberley Kirsty
Kyle Laura Lauren Lawrence Leah Leanne Lee Leigh Leon Leonard Lesley
Leslie Lewis Liam Linda Lindsey Lisa Lorraine Louis Louise Lucy Luke
Lydia Lynda Lynn Lynne Malcolm Mandy Marc Marcus Margaret Maria Marian
Marie Marilyn Marion Mark Martin Martyn Mary Mathew Matthew Maureen
Maurice Max Megan Melanie Melissa Michael Michelle Mitchell Mohamed
Mohammad Mohammed Molly Naomi Natalie Natasha Nathan Neil Nicholas
Nicola Nicole Nigel Norman Oliver Olivia Owen Paige Pamela Patricia
Patrick Paul Paula Pauline Peter Philip Phillip Rachael Rachel Raymond
Rebecca Reece Rhys Richard Ricky Rita Robert Robin Roger Ronald Rosemary
Rosie Ross Roy Russell Ruth Ryan Sally Sam Samantha Samuel Sandra Sara
Sarah Scott Sean Shane Shannon Sharon Shaun Sheila Shirley Sian Simon
Sophie Stacey Stanley Stephanie Stephen Steven Stewart Stuart Susan
Suzanne Sylvia Terence Teresa Terry Thomas Timothy Tina Toby Tom Tony
Tracey Tracy Trevor Valerie Vanessa Victor Victoria Vincent Wayne Wendy
William Yvonne Zoe
""".strip().split()


# Some surnames contain an apostrophe (e.g. O'Blake).
surnames = """
Abbott Adams Ahmed Akhtar Alexander Ali Allan Allen Anderson Andrews
Archer Armstrong Arnold Ashton Atkins Atkinson Austin Bailey Baker
Baldwin Ball Banks Barber Barker Barlow Barnes Barnett Barrett Barry
Bartlett Barton Bates Baxter Begum Bell Bennett Benson Bentley Berry
Bevan Bibi Birch Bird Bishop Black Blackburn Bolton Bond Booth Bowen
Boyle Bradley Bradshaw Brady Bray Brennan Briggs Brookes Brooks Brown
Browne Bruce Bryan Bryant Bull Burgess Burke Burns Burrows Burton
Butcher Butler Byrne Cameron Campbell Carey Carpenter Carr Carroll
Carter Cartwright Chadwick Chambers Chan Chandler Chapman Charlton Clark
Clarke Clayton Clements Coates Cole Coleman Coles Collier Collins
Connolly Connor Conway Cook Cooke Cooper Cox Craig Crawford Cross
Cunningham Curtis Dale Daly Daniels Davey Davidson Davies Davis Davison
Dawson Day Dean Dennis Dickinson Dixon Dobson Dodd Doherty Donnelly
Douglas Doyle Duffy Duncan Dunn Dyer Edwards Elliott Ellis Evans Farmer
Farrell Faulkner Ferguson Field Finch Fisher Fitzgerald Fleming Fletcher
Flynn Ford Forster Foster Fowler Fox Francis Franklin Fraser Freeman
French Frost Fry Fuller Gallagher Gardiner Gardner Garner George Gibbons
Gibbs Gibson Gilbert Giles Gill Glover Goddard Godfrey Goodwin Gordon
Gough Gould Graham Grant Gray Green Greenwood Gregory Griffin Griffiths
Hale Hall Hamilton Hammond Hancock Hanson Harding Hardy Hargreaves
Harper Harris Harrison Hart Hartley Harvey Hawkins Hayes Haynes Hayward
Heath Henderson Henry Herbert Hewitt Hicks Higgins Hill Hilton Hodgson
Holden Holland Holloway Holmes Holt Hooper Hope Hopkins Horton Houghton
Howard Howarth Howe Howell Howells Hudson Hughes Humphreys Humphries
Hunt Hunter Hurst Hussain Hutchinson Hyde Ingram Iqbal Jackson James
Jarvis Jenkins Jennings John Johnson Johnston Jones Jordan Joyce Kaur
Kay Kelly Kemp Kennedy Kent Kerr Khan King Kirby Kirk Knight Knowles
Lamb Lambert Lane Law Lawrence Lawson Leach Lee Lees Leonard Lewis
Little Lloyd Long Lord Lowe Lucas Lynch Lyons Macdonald Mahmood Mann
Manning Marsden Marsh Marshall Martin Mason Matthews May McCarthy
McDonald McKenzie McLean Mellor Metcalfe Miah Middleton Miles Miller
Mills Mistry Mitchell Moore Moran Morgan Morley Morris Morrison Morton
Moss Murphy Murray Myers Nash Naylor Nelson Newman Newton Nicholls
Nicholson Nixon Noble Nolan Norman Norris North Norton O'Blake O'Buckley
O'Chamberlain O'Hobbs O'Thompson Oliver Osborne Owen Owens Page Palmer
Parker Parkes Parkin Parkinson Parry Parsons Patel Patterson Payne
Peacock Pearce Pearson Perkins Perry Peters Phillips Pickering Pollard
Poole Pope Porter Potter Potts Powell Power Pratt Preston Price
Pritchard Pugh Quinn Rahman Randall Read Reed Rees Reeves Reid Reynolds
Rhodes Rice Richards Richardson Riley Roberts Robertson Robinson Robson
Rogers Rose Ross Rowe Rowley Russell Ryan Sanders Sanderson Saunders
Savage Schofield Scott Shah Sharp Sharpe Shaw Shepherd Sheppard Short
Simmons Simpson Sims Sinclair Singh Skinner Slater Smart Smith Spencer
Stanley Steele Stephens Stephenson Stevens Stevenson Stewart Stokes
Stone Storey Sullivan Summers Sutton Swift Sykes Talbot Taylor Thomas
Thomson Thornton Thorpe Todd Tomlinson Townsend Tucker Turnbull Turner
Tyler Vaughan Vincent Wade Walker Wall Wallace Wallis Walsh Walters
Walton Ward Warner Warren Waters Watkins Watson Watts Webb Webster Welch
Wells West Weston Wheeler White Whitehead Whitehouse Whittaker Wilkins
Wilkinson Williams Williamson Willis Wilson Winter Wong Wood Woods
Woodward Wright Wyatt Yates Young
""".strip().split()

# Table sizes, computed once for the modulo look-ups of the sanitizers.
given_names_count, surnames_count = len(given_names), len(surnames)
"""
API to sanitation session.

Sanitation session allows having a state within a single sanitation
process.

One important thing stored to the session is a secret key which is
generated to a new random value for each sanitation session, but it
stays constant during the whole sanitation process.  Its value is never
revealed, so the one-way hashes generated with it cannot be reproduced
afterwards.  I.e. during the sanitation session it's possible to do
``hash(C) -> H`` for any clear text C, but it is not possible to check
if H is the hashed value of C after the sanitation session has ended.
"""

import hashlib
import hmac
import os
import sys
import threading

if sys.version_info >= (3, 6):
    from typing import Callable, Optional, Sequence  # noqa


SECRET_KEY_BITS = 128


_thread_local_storage = threading.local()


def hash_text_to_int(value, bit_length=32):
    # type: (str, int) -> int
    """
    Hash a text value to an integer.

    Generates an integer number based on the hash derived with
    `hash_text` from the given text value.

    :param value: Text value to hash
    :param bit_length:
        Number of bits to use from the hash value.  The hash is consumed
        one hexadecimal digit (four bits) at a time, so the length is
        rounded down to a multiple of four.
    :return: Integer value within ``0 <= result < 2**bit_length``
    :raises ValueError:
        If the bit length is negative or larger than the hash value.
    """
    return _hex_fields_to_ints(hash_text(value), (bit_length,))[0]


def hash_text_to_ints(value, bit_lengths=(16, 16, 16, 16)):
    # type: (str, Sequence[int]) -> Sequence[int]
    """
    Hash a text value to a sequence of integers.

    Generates a sequence of integer values with given bit-lengths
    similarly to `hash_text_to_int`, but allowing generating many
    separate numbers with a single call.  The numbers are cut from
    consecutive, non-overlapping parts of a single hash value.

    :param value: Text value to hash
    :param bit_lengths:
        Tuple of bit lengths for the resulting integers.  Defines also the
        length of the result tuple.  Each length is rounded down to a
        multiple of four.
    :return:
        Tuple of ``n`` integers ``(R_1, ... R_n)`` with the requested
        bit-lengths ``(L_1, ..., L_n)`` and values ranging within
        ``0 <= R_i < 2**L_i`` for each ``i``.
    :raises ValueError:
        If a bit length is negative or the lengths add up to more than
        the hash value contains.
    """
    return _hex_fields_to_ints(hash_text(value), bit_lengths)


def _hex_fields_to_ints(hex_digest, bit_lengths):
    # type: (str, Sequence[int]) -> Sequence[int]
    """
    Cut consecutive fields off a hex digest and parse each as an integer.
    """
    fields = []
    start = 0
    for bit_length in bit_lengths:
        if bit_length < 0:
            raise ValueError(
                "Bit length must not be negative: {!r}".format(bit_length))
        end = start + bit_length // 4  # One hex digit carries four bits
        if end > len(hex_digest):
            # Fail loudly rather than hand out fewer bits than requested.
            raise ValueError(
                "Requested {} hex digits, but the hash has only {}".format(
                    end, len(hex_digest)))
        digits = hex_digest[start:end]
        # int('') raises, but a zero-width field can only hold zero.
        fields.append(int(digits, 16) if digits else 0)
        start = end
    return tuple(fields)


def hash_text(value, hasher=hashlib.sha256, encoding='utf-8'):
    # type: (str, Callable, str) -> str
    """
    Generate a hash for a text value.

    The hash will be generated by encoding the text to bytes with given
    encoding and then generating a hash with HMAC using the session
    secret as the key and the given hash function.

    :param value: Text value to hash
    :param hasher: Hash function to use, SHA256 by default
    :param encoding: Encoding to use, UTF-8 by default
    :return: Hexadecimal presentation of the hash as a string
    """
    return hash_bytes(value.encode(encoding), hasher)


def hash_bytes(value, hasher=hashlib.sha256):
    # type: (bytes, Callable) -> str
    """
    Generate a hash for a bytes value.

    The hash will be generated by generating a hash with HMAC using the
    session secret as the key and the given hash function.

    :param value: Bytes value to hash
    :param hasher: Hash function to use, SHA256 by default
    :return: Hexadecimal presentation of the hash as a string
    """
    return hmac.new(get_secret(), value, hasher).hexdigest()


def get_secret():
    # type: () -> bytes
    """
    Get session specific secret key.

    A new key is generated if the current thread does not have one yet
    or if it has been cleared with `reset`.

    :return: Session key as bytes
    """
    if not getattr(_thread_local_storage, 'secret_key', None):
        _initialize_session()
    return _thread_local_storage.secret_key  # type: ignore


def reset(secret_key=None):
    # type: (Optional[bytes]) -> None
    """
    Reset the session.

    By default, this resets the value of the secret to None so that, if
    an earlier sanitation process was run on the same thread, the next
    call that needs the secret key of the session will generate a new
    value for it.

    This may also be used to set a predefined value for the secret key.

    :param secret_key:
        Value to set as the new session secret key or None if a new one
        should be generated as soon as one is needed.
    """
    _thread_local_storage.secret_key = secret_key


def _initialize_session():
    # type: () -> None
    """
    Generate a new session key and store it to thread local storage.
    """
    _thread_local_storage.secret_key = os.urandom(SECRET_KEY_BITS // 8)
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# /database_sanitizer/tests/test_config.py
# ---------------------------------------------------------------------------

from collections import namedtuple

import mock
import pytest

from .. import config
from ..config import Configuration, ConfigurationError


@mock.patch.object(config, 'open')
@mock.patch('yaml.safe_load')
def test_from_file(mocked_yaml_load, mocked_open):
    """from_file opens the file as binary and passes it to yaml.safe_load."""
    mocked_yaml_load.return_value = {}

    Configuration.from_file('filename.yml')

    assert mocked_open.call_args == (('filename.yml', 'rb'), {})
    opened_file = mocked_open.return_value.__enter__.return_value
    assert mocked_yaml_load.call_args == ((opened_file,), {})


def test_load_config_data_must_be_dict():
    """load accepts a dict and rejects a string."""
    config = Configuration()
    config.load({})
    with pytest.raises(ConfigurationError):
        config.load(config_data="test")


def test_load_dump_extra_parameters():
    """Extra dump parameters are validated and replace the defaults."""
    config = Configuration()

    config.load_dump_extra_parameters({})
    assert config.mysqldump_params == ["--single-transaction"]
    assert config.pg_dump_params == []

    with pytest.raises(ConfigurationError):
        config.load_dump_extra_parameters({"config": "test"})

    config.load_dump_extra_parameters({"config": {}})
    assert config.mysqldump_params == ["--single-transaction"]
    assert config.pg_dump_params == []

    with pytest.raises(ConfigurationError):
        config.load_dump_extra_parameters({"config": {
            "extra_parameters": "test"
        }})

    with pytest.raises(ConfigurationError):
        config.load_dump_extra_parameters({"config": {
            "extra_parameters": [True]
        }})

    with pytest.raises(ConfigurationError):
        config.load_dump_extra_parameters({"config": {
            "extra_parameters": {
                "mysqldump": "hernekeitto",
            },
        }})

    with pytest.raises(ConfigurationError):
        config.load_dump_extra_parameters({"config": {
            "extra_parameters": {
                "pg_dump": "viina",
            },
        }})

    config.load_dump_extra_parameters({"config": {
        "extra_parameters": {
            "mysqldump": ["--double-transaction"],
            "pg_dump": ["--exclude-table=something"],
        },
    }})
    assert config.mysqldump_params == ["--double-transaction"]
    assert config.pg_dump_params == ["--exclude-table=something"]


def test_load_addon_packages():
    """Addon packages must be given as a list of strings."""
    config = Configuration()

    config.load_addon_packages({})
    assert config.addon_packages == []

    with pytest.raises(ConfigurationError):
        config.load_addon_packages({"config": "test"})

    config.load_addon_packages({"config": {}})
    assert config.addon_packages == []

    with pytest.raises(ConfigurationError):
        config.load_addon_packages({"config": {"addons": "test"}})

    with pytest.raises(ConfigurationError):
        config.load_addon_packages({"config": {"addons": [True]}})

    config.load_addon_packages({"config": {
        "addons": [
            "test1",
            "test2",
            "test3",
        ],
    }})
    assert config.addon_packages == ["test1", "test2", "test3"]


def test_load_sanitizers():
    """Sanitizers are registered as "table.column" for configured columns."""
    config = Configuration()

    with pytest.raises(ConfigurationError):
        config.load_sanitizers({"strategy": "test"})

    with pytest.raises(ConfigurationError):
        config.load_sanitizers({"strategy": {"test": "test"}})

    def mock_find_sanitizer(*args):
        return lambda value: value

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer",
                    side_effect=mock_find_sanitizer):
        with pytest.raises(ConfigurationError):
            config.load_sanitizers({"strategy": {"table1": {"column1": True}}})

        config.load_sanitizers({"strategy": {
            "table1": {
                "column1": None,
                "column2": "test.test",
            },
            "table2": {
                "column1": "test.test",
            },
            "table3": None,
        }})

    assert "table1.column1" not in config.sanitizers
    assert "table1.column2" in config.sanitizers
    assert "table2.column1" in config.sanitizers


def test_table_skip_rows_configuration():
    """A table configured as "skip_rows" ends up in skip_rows_for_tables."""
    config = Configuration()

    with pytest.raises(ConfigurationError):
        config.load_sanitizers({"strategy": "test"})

    def mock_find_sanitizer(*args):
        return lambda value: value

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer",
                    side_effect=mock_find_sanitizer):

        config.load_sanitizers({"strategy": {
            "table1": "skip_rows",
            "table2": {
                "column1": "test",
            }
        }})

    assert "table2.column1" in config.sanitizers
    assert "table1" in config.skip_rows_for_tables


def test_find_sanitizer():
    """find_sanitizer tries several module names and fails if none match."""
    config = Configuration()

    with pytest.raises(ConfigurationError):
        config.find_sanitizer("test")

    def mock_find_sanitizer_from_module1(module_name, function_name):
        assert module_name == "sanitizers.test"
        assert function_name == "sanitize_test"
        return lambda value: value

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
                    side_effect=mock_find_sanitizer_from_module1):
        assert config.find_sanitizer("test.test") is not None

    def mock_find_sanitizer_from_module2(module_name, function_name):
        assert module_name in ("sanitizers.test", "addon.test")
        assert function_name == "sanitize_test"
        if module_name.startswith("addon."):
            return lambda value: value
        else:
            return None

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
                    side_effect=mock_find_sanitizer_from_module2):
        config.addon_packages = ("addon",)
        assert config.find_sanitizer("test.test") is not None

    def mock_find_sanitizer_from_module3(module_name, function_name):
        assert module_name in (
            "sanitizers.test",
            "addon.test",
            "database_sanitizer.sanitizers.test",
        )
        assert function_name == "sanitize_test"
        if module_name.startswith("database_sanitizer."):
            return lambda value: value
        else:
            return None

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
                    side_effect=mock_find_sanitizer_from_module3):
        assert config.find_sanitizer("test.test") is not None

    def mock_find_sanitizer_from_module4(module_name, function_name):
        return None

    with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
                    side_effect=mock_find_sanitizer_from_module4):
        with pytest.raises(ConfigurationError):
            config.find_sanitizer("test.test")


def test_find_sanitizer_from_module():
    """Import errors and missing functions give None, non-callables raise."""
    def mock_import1(module_name):
        assert module_name == "test"
        raise ImportError("Should be catched")

    with mock.patch("importlib.import_module", side_effect=mock_import1):
        assert Configuration.find_sanitizer_from_module("test", "test") is None

    mock_module_type = namedtuple("mock_module", ("test",))

    def mock_import2(module_name):
        assert module_name == "test"
        return mock_module_type(test=None)

    with mock.patch("importlib.import_module", side_effect=mock_import2):
        assert Configuration.find_sanitizer_from_module("test", "test") is None

    def mock_import3(module_name):
        assert module_name == "test"
        return mock_module_type(test=lambda value: value)

    with mock.patch("importlib.import_module", side_effect=mock_import3):
        assert Configuration.find_sanitizer_from_module("test", "test") is not None

    def mock_import4(module_name):
        assert module_name == "test"
        return mock_module_type(test="test")

    with mock.patch("importlib.import_module", side_effect=mock_import4):
        with pytest.raises(ConfigurationError):
            Configuration.find_sanitizer_from_module("test", "test")


def test_sanitize():
    """sanitize applies the column's sanitizer or passes the value through."""
    config = Configuration()
    config.sanitizers["a.a"] = lambda value: value.upper()
    config.sanitizers["a.b"] = lambda value: value[::-1]

    assert config.sanitize("a", "a", "test") == "TEST"
    assert config.sanitize("a", "b", "test") == "tset"
    assert config.sanitize("a", "c", "test") == "test"


# ---------------------------------------------------------------------------
# /database_sanitizer/tests/test_dump.py
# ---------------------------------------------------------------------------
import subprocess
from io import BytesIO, StringIO

import mock
import pytest

from database_sanitizer import dump
from database_sanitizer.config import Configuration

# Expected ``subprocess.Popen`` call for each supported database URL.
EXPECTED_POPEN_KWARGS = {
    'mysql://User:Pass@HostName/Db': {
        'args': (
            'mysqldump --complete-insert --extended-insert'
            ' --net_buffer_length=10240 -h hostname -u User Db'
            ' --single-transaction'
        ).split(),
        'env': {'MYSQL_PWD': 'Pass'},
        'stdout': subprocess.PIPE,
    },
    'postgres:///Db': {
        'args': tuple((
            'pg_dump --encoding=utf-8 --quote-all-identifiers'
            ' --dbname postgres:///Db').split()),
        'stdout': subprocess.PIPE,
    },
}

# The other two schemes expect the same call with a postgresql:// URL.
for url in ['postgresql:///Db', 'postgis:///Db']:
    EXPECTED_POPEN_KWARGS[url] = EXPECTED_POPEN_KWARGS['postgres:///Db'].copy()
    EXPECTED_POPEN_KWARGS[url]['args'] = tuple(
        ' '.join(EXPECTED_POPEN_KWARGS[url]['args'])
        .replace('postgres', 'postgresql').split())


@pytest.mark.parametrize('url', list(EXPECTED_POPEN_KWARGS))
@mock.patch('subprocess.Popen')
def test_run(mocked_popen, url):
    """dump.run starts the expected dump command for each URL scheme."""
    mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
    output = StringIO()
    config = None
    dump.run(url, output, config)

    # Work on a copy: the pop() below must not strip 'args' from the
    # module-level fixture, which would break a second run of this test.
    expected_popen_kwargs = dict(EXPECTED_POPEN_KWARGS[url])
    (popen_args, popen_kwargs) = mocked_popen.call_args
    # The command may be passed either positionally or as ``args=``.
    expected_popen_args = (
        (expected_popen_kwargs.pop('args'),) if popen_args else ())
    assert popen_args == expected_popen_args
    assert popen_kwargs == expected_popen_kwargs


@mock.patch('subprocess.Popen')
def test_run_with_mysql_extra_params(mocked_popen):
    """Configured mysqldump parameters replace the default extra one."""
    mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
    output = StringIO()

    url = "mysql://User:Pass@HostName/Db"
    config = Configuration()
    config.load({
        "config": {
            "extra_parameters": {
                "mysqldump": ["--double-transaction"]
            }
        }
    })

    dump.run(url, output, config)

    expected = {
        'args': (
            'mysqldump --complete-insert --extended-insert'
            ' --net_buffer_length=10240 -h hostname -u User Db'
            ' --double-transaction'
        ).split(),
        'env': {'MYSQL_PWD': 'Pass'},
        'stdout': subprocess.PIPE,
    }

    (popen_args, popen_kwargs) = mocked_popen.call_args
    expected_popen_args = (
        (expected.pop('args'),) if popen_args else ())
    assert popen_args == expected_popen_args
    assert popen_kwargs == expected


@mock.patch('subprocess.Popen')
def test_run_with_pg_dump_extra_params(mocked_popen):
    """Configured pg_dump parameters are appended to the command."""
    mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
    output = StringIO()

    url = "postgres:///Db"
    config = Configuration()
    config.load({
        "config": {
            "extra_parameters": {
                "pg_dump": ["--exclude-table=something"]
            }
        }
    })

    dump.run(url, output, config)

    expected = {
        'args': tuple((
            'pg_dump --encoding=utf-8 --quote-all-identifiers'
            ' --dbname postgres:///Db'
            ' --exclude-table=something'
        ).split()),
        'stdout': subprocess.PIPE,
    }

    (popen_args, popen_kwargs) = mocked_popen.call_args
    expected_popen_args = (
        (expected.pop('args'),) if popen_args else ())
    assert popen_args == expected_popen_args
    assert popen_kwargs == expected


@mock.patch('subprocess.Popen') 119 | def test_run_unknown_scheme(mocked_popen): 120 | with pytest.raises(ValueError) as excinfo: 121 | dump.run('unknown:///db', None, None) 122 | assert str(excinfo.value) == "Unsupported database scheme: 'unknown'" 123 | mocked_popen.assert_not_called() 124 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_dump_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import io 6 | 7 | import pytest 8 | from six.moves.urllib import parse as urlparse 9 | 10 | from ..config import Configuration 11 | from ..dump.mysql import ( 12 | parse_column_names, 13 | parse_values, 14 | sanitize, 15 | sanitize_from_stream, 16 | ) 17 | 18 | MOCK_MYSQLDUMP_OUTPUT = b""" 19 | --- Fake MySQL database dump 20 | 21 | DROP TABLE IF EXISTS `test`; 22 | 23 | CREATE TABLE `test` ( 24 | `id` int(11) NOT NULL AUTO_INCREMENT, 25 | `created_at` date NOT NULL, 26 | `notes` varchar(255) NOT NULL, 27 | PRIMARY KEY (`id`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 29 | 30 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \ 31 | (1,'2018-01-01','Test data 1'),\ 32 | (2,'2018-01-02','Test data 2'),\ 33 | (3,'2018-01-03','Test data 3'); 34 | 35 | --- Final line after `INSERT INTO` statement. 36 | """ 37 | 38 | MOCK_MYSQLDUMP_OUTPUT_WITH_U2028 = b""" 39 | --- Fake MySQL database dump 40 | 41 | DROP TABLE IF EXISTS `test`; 42 | 43 | CREATE TABLE `test` ( 44 | `id` int(11) NOT NULL AUTO_INCREMENT, 45 | `created_at` date NOT NULL, 46 | `notes` varchar(255) NOT NULL, 47 | PRIMARY KEY (`id`) 48 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 49 | 50 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \ 51 | (1,'2018-01-01','Test \xe2\x80\xa8 data 1'),\ 52 | (2,'2018-01-02','Test data 2'),\ 53 | (3,'2018-01-03','Test data 3'); 54 | 55 | --- Final line after `INSERT INTO` statement. 
56 | """ 57 | 58 | 59 | INVALID_MOCK_MYSQLDUMP_OUTPUT = b""" 60 | --- Fake MySQL database dump 61 | 62 | DROP TABLE IF EXISTS `test`; 63 | 64 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES (1),(2),(3); 65 | 66 | --- Final line after `INSERT INTO` statement. 67 | """ 68 | 69 | 70 | def test_sanitize_wrong_scheme(): 71 | url = urlparse.urlparse("http://localhost/test") 72 | with pytest.raises(ValueError): 73 | list(sanitize(url, None)) 74 | 75 | 76 | def test_sanitize_from_stream(): 77 | stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT) 78 | config = Configuration() 79 | config.sanitizers["test.notes"] = lambda value: "Sanitized" 80 | dump_output_lines = list(sanitize_from_stream(stream, config)) 81 | 82 | assert "--- Fake MySQL database dump" in dump_output_lines 83 | assert "--- Final line after `INSERT INTO` statement." in dump_output_lines 84 | assert """INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \ 85 | (1,'2018-01-01','Sanitized'),\ 86 | (2,'2018-01-02','Sanitized'),\ 87 | (3,'2018-01-03','Sanitized');\ 88 | """ in dump_output_lines 89 | 90 | def test_sanitize_with_u2028_from_stream(): 91 | stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT_WITH_U2028) 92 | config = Configuration() 93 | config.sanitizers["test.notes"] = lambda value: "Sanitized" 94 | dump_output_lines = list(sanitize_from_stream(stream, config)) 95 | 96 | assert "--- Fake MySQL database dump" in dump_output_lines 97 | assert "--- Final line after `INSERT INTO` statement." 
in dump_output_lines 98 | assert """INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \ 99 | (1,'2018-01-01','Sanitized'),\ 100 | (2,'2018-01-02','Sanitized'),\ 101 | (3,'2018-01-03','Sanitized');\ 102 | """ in dump_output_lines 103 | 104 | 105 | def test_skip_table_rows(): 106 | stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT) 107 | config = Configuration() 108 | config.skip_rows_for_tables.append('test') 109 | 110 | output = list(sanitize_from_stream(stream, config)) 111 | 112 | assert output == [ 113 | '', 114 | '--- Fake MySQL database dump', 115 | '', 116 | 'DROP TABLE IF EXISTS `test`;', 117 | '', 118 | 'CREATE TABLE `test` (', 119 | '`id` int(11) NOT NULL AUTO_INCREMENT,', 120 | '`created_at` date NOT NULL,', 121 | '`notes` varchar(255) NOT NULL,', 122 | 'PRIMARY KEY (`id`)', 123 | ') ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;', 124 | '', 125 | '', 126 | '--- Final line after `INSERT INTO` statement.', 127 | ] 128 | 129 | 130 | def test_sanitizer_invalid_input(): 131 | stream = io.BytesIO(INVALID_MOCK_MYSQLDUMP_OUTPUT) 132 | config = Configuration() 133 | config.sanitizers["test.notes"] = lambda value: "Sanitized" 134 | 135 | with pytest.raises(ValueError): 136 | list(sanitize_from_stream(stream, config)) 137 | 138 | 139 | @pytest.mark.parametrize( 140 | "text,expected_column_names", 141 | ( 142 | ("`test`", ("test",)), 143 | ("`test`, `test`", ("test", "test")), 144 | ("`test`,`test`", ("test", "test")), 145 | ), 146 | ) 147 | def test_parse_column_names(text, expected_column_names): 148 | assert parse_column_names(text) == expected_column_names 149 | 150 | 151 | @pytest.mark.parametrize( 152 | "text,expected_values", 153 | ( 154 | ("('test'),('test')", (("test",), ("test",))), 155 | ("(1,2),(3,4),", ((1, 2), (3, 4))), 156 | ("(TRUE),(FALSE),(NULL)", ((True,), (False,), (None,))), 157 | ("(x')", ()), # Invalid data 158 | ), 159 | ) 160 | def test_parse_values(text, expected_values): 161 | assert tuple(parse_values(text)) == expected_values 162 | 163 | 164 | 
@pytest.mark.parametrize('config_type', [ 165 | 'no-config', 'empty-config', 'single-column-config']) 166 | @pytest.mark.parametrize('data_label', ['ok', 'invalid']) 167 | def test_optimizations(config_type, data_label): 168 | if config_type == 'no-config': 169 | config = None 170 | decoder_call_count = 0 171 | else: 172 | config = Configuration() 173 | if config_type == 'empty-config': 174 | decoder_call_count = 0 175 | else: 176 | assert config_type == 'single-column-config' 177 | config.sanitizers["test.notes"] = (lambda x: x) 178 | decoder_call_count = 3 # Number of rows in test table 179 | 180 | data = { 181 | 'ok': MOCK_MYSQLDUMP_OUTPUT, 182 | 'invalid': INVALID_MOCK_MYSQLDUMP_OUTPUT, 183 | }[data_label] 184 | 185 | should_raise = ( 186 | config_type == 'single-column-config' 187 | and data_label == 'invalid') 188 | 189 | dump_stream = io.BytesIO(data) 190 | if should_raise: 191 | with pytest.raises(ValueError): 192 | list(sanitize_from_stream(dump_stream, config)) 193 | else: 194 | expected_output = data.decode('utf-8').splitlines() 195 | result = list(sanitize_from_stream(dump_stream, config)) 196 | assert result == expected_output 197 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_dump_postgres.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import io 6 | from collections import namedtuple 7 | 8 | import mock 9 | import pytest 10 | from six.moves.urllib import parse as urlparse 11 | 12 | from ..config import Configuration 13 | from ..dump import postgres as dump_postgres 14 | from ..dump.postgres import parse_column_names, parse_values, sanitize 15 | from ..utils.postgres import decode_copy_value 16 | 17 | MOCK_PG_DUMP_OUTPUT = b""" 18 | --- Fake PostgreSQL database dump 19 | 20 | COMMENT ON SCHEMA "public" IS 'standard public schema'; 21 | 22 | CREATE TABLE 
"public"."test" ( 23 | "id" integer NOT NULL, 24 | "created_at" timestamp with time zone NOT NULL, 25 | "notes" character varying(255) NOT NULL 26 | ); 27 | 28 | COPY "public"."test" ("id", "created_at", "notes") FROM stdin; 29 | 1\t2018-01-01 00:00:00\tTest data 1 30 | 2\t2018-01-02 00:00:00\tTest data 2 31 | 3\t2018-01-03 00:00:00\tTest data 3 32 | \\. 33 | 34 | --- Final line after `COPY` statement 35 | """.strip() 36 | 37 | 38 | INVALID_MOCK_PG_DUMP_OUTPUT = b""" 39 | --- Fake PostgreSQL database dump 40 | 41 | COMMENT ON SCHEMA "public" IS 'standard public schema'; 42 | 43 | COPY "public"."test" ("id", "created_at", "notes") FROM stdin; 44 | 1\t2018-01-01 00:00:00 Test data 1 45 | 2\t2018-01-02 00:00:00 Test data 2 46 | 3\t2018-01-03 00:00:00 Test data 3 47 | \\. 48 | 49 | --- Final line after `COPY` statement 50 | """.strip() 51 | 52 | 53 | def create_mock_popen(mock_pg_dump_output): 54 | def mock_popen(cmd_args, stdout): 55 | mock_pipe_type = namedtuple("mock_pipe", ("stdout",)) 56 | mock_stdout = io.BytesIO(mock_pg_dump_output) 57 | return mock_pipe_type(stdout=mock_stdout) 58 | return mock_popen 59 | 60 | 61 | def test_sanitize(): 62 | url = urlparse.urlparse("postgres://localhost/test") 63 | config = Configuration() 64 | config.sanitizers["test.notes"] = lambda value: "Sanitized" 65 | 66 | with mock.patch("subprocess.Popen", side_effect=create_mock_popen(MOCK_PG_DUMP_OUTPUT)): 67 | dump_output_lines = list(sanitize(url, config)) 68 | 69 | assert "--- Fake PostgreSQL database dump" in dump_output_lines 70 | assert "--- Final line after `COPY` statement" in dump_output_lines 71 | assert "2\t2018-01-02 00:00:00\tSanitized" in dump_output_lines 72 | 73 | 74 | def test_skip_table_rows(): 75 | url = urlparse.urlparse("postgres://localhost/test") 76 | config = Configuration() 77 | config.skip_rows_for_tables.append('test') 78 | 79 | with mock.patch("subprocess.Popen", 80 | side_effect=create_mock_popen(MOCK_PG_DUMP_OUTPUT)): 81 | output = list(sanitize(url, 
config)) 82 | 83 | assert output == [ 84 | '--- Fake PostgreSQL database dump', 85 | '', 86 | 'COMMENT ON SCHEMA "public" IS \'standard public schema\';', 87 | '', 88 | 'CREATE TABLE "public"."test" (', 89 | '"id" integer NOT NULL,', 90 | '"created_at" timestamp with time zone NOT NULL,', 91 | '"notes" character varying(255) NOT NULL', 92 | ');', 93 | '', 94 | '', 95 | '--- Final line after `COPY` statement' 96 | ] 97 | 98 | 99 | def test_sanitizer_invalid_input(): 100 | url = urlparse.urlparse("postgres://localhost/test") 101 | 102 | config = Configuration() 103 | config.sanitizers["test.notes"] = lambda value: "Sanitized" 104 | 105 | with mock.patch("subprocess.Popen", side_effect=create_mock_popen(INVALID_MOCK_PG_DUMP_OUTPUT)): 106 | with pytest.raises(ValueError): 107 | # Yes, we need the list() function there to eat the yields. 108 | list(sanitize(url, config)) 109 | 110 | 111 | def test_sanitizer_invalid_scheme(): 112 | url = urlparse.urlparse("http://localhost/test") 113 | with pytest.raises(ValueError): 114 | list(sanitize(url, None)) 115 | 116 | 117 | @pytest.mark.parametrize( 118 | "text,expected_column_names", 119 | ( 120 | ("\"test\"", ("test",)), 121 | ("\"test\",\"test\"", ("test", "test")), 122 | ("\"test\", \"test\"", ("test", "test")), 123 | ) 124 | ) 125 | def test_parse_column_names(text, expected_column_names): 126 | assert parse_column_names(text) == expected_column_names 127 | 128 | 129 | @pytest.mark.parametrize( 130 | "text,expected_values", 131 | ( 132 | ("Test", ("Test",)), 133 | ("Test\tTest", ("Test", "Test")), 134 | ("Test\tTest\t", ("Test", "Test", "")), 135 | ("\\N", (None,)), 136 | ) 137 | ) 138 | def test_parse_values(text, expected_values): 139 | assert parse_values(text) == expected_values 140 | 141 | 142 | @pytest.mark.parametrize('config_type', [ 143 | 'no-config', 'empty-config', 'single-column-config']) 144 | @pytest.mark.parametrize('data_label', ['ok', 'invalid']) 145 | def test_optimizations(config_type, data_label): 146 | 
if config_type == 'no-config': 147 | config = None 148 | decoder_call_count = 0 149 | else: 150 | config = Configuration() 151 | if config_type == 'empty-config': 152 | decoder_call_count = 0 153 | else: 154 | assert config_type == 'single-column-config' 155 | config.sanitizers["test.notes"] = (lambda x: x) 156 | decoder_call_count = 3 # Number of rows in test table 157 | 158 | data = { 159 | 'ok': MOCK_PG_DUMP_OUTPUT, 160 | 'invalid': INVALID_MOCK_PG_DUMP_OUTPUT, 161 | }[data_label] 162 | 163 | should_raise = ( 164 | config_type == 'single-column-config' 165 | and data_label == 'invalid') 166 | 167 | url = urlparse.urlparse("postgres://localhost/test") 168 | with mock.patch("subprocess.Popen", side_effect=create_mock_popen(data)): 169 | with mock.patch.object(dump_postgres, 'decode_copy_value') as decoder: 170 | decoder.side_effect = decode_copy_value 171 | if should_raise: 172 | with pytest.raises(ValueError): 173 | list(sanitize(url, config)) 174 | else: 175 | expected_output = data.decode('utf-8').splitlines() 176 | assert list(sanitize(url, config)) == expected_output 177 | assert decoder.call_count == decoder_call_count 178 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_main.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import mock 4 | import pytest 5 | import six 6 | 7 | from database_sanitizer import __main__ 8 | 9 | main = __main__.main 10 | 11 | 12 | @mock.patch.object(__main__, 'run') 13 | def test_main_without_args(mocked_run, capsys): 14 | with pytest.raises(SystemExit) as excinfo: 15 | main(['SANI']) 16 | assert excinfo.value.code == 2 17 | 18 | captured = capsys.readouterr() 19 | assert captured.out == '' 20 | assert captured.err.splitlines() == [ 21 | 'usage: SANI [-h] [--config CONFIG] [--output OUTPUT] url', 22 | 'SANI: error: the following arguments are required: url' if six.PY3 23 | else 
'SANI: error: too few arguments', 24 | ] 25 | assert not mocked_run.called 26 | 27 | 28 | @mock.patch.object(__main__, 'run') 29 | def test_main_with_url(mocked_run, capsys): 30 | main(['SANI', 'some://url']) 31 | 32 | # Output should be empty 33 | captured = capsys.readouterr() 34 | assert captured.out == '' 35 | assert captured.err == '' 36 | 37 | # The run function should have been called with the URL 38 | (run_call_args, run_call_kwargs) = mocked_run.call_args 39 | assert run_call_args == () 40 | assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'} 41 | assert run_call_kwargs['config'] is None 42 | assert run_call_kwargs['url'] == 'some://url' 43 | 44 | 45 | @pytest.mark.parametrize('optname', ['-c', '--config']) 46 | @mock.patch.object(__main__, 'run') 47 | @mock.patch.object(__main__, 'Configuration') 48 | def test_main_with_config(mocked_conf, mocked_run, capsys, optname): 49 | main(['SANI', optname, 'config_file.yml', 'some://url']) 50 | 51 | # Output should be empty 52 | captured = capsys.readouterr() 53 | assert captured.out == '' 54 | assert captured.err == '' 55 | 56 | # Configuration should have been created with Configuration.from_file 57 | (fromfile_args, fromfile_kwargs) = mocked_conf.from_file.call_args 58 | assert fromfile_args == ('config_file.yml',) 59 | assert fromfile_kwargs == {} 60 | 61 | # The run function should have been called with the config and URL 62 | (run_call_args, run_call_kwargs) = mocked_run.call_args 63 | assert run_call_args == () 64 | assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'} 65 | assert run_call_kwargs['config'] == mocked_conf.from_file.return_value 66 | assert run_call_kwargs['url'] == 'some://url' 67 | 68 | 69 | @pytest.mark.parametrize('optname', ['-o', '--output']) 70 | @mock.patch.object(__main__, 'run') 71 | @mock.patch.object(__main__, 'open') 72 | def test_main_with_output(mocked_open, mocked_run, capsys, optname): 73 | main(['SANI', optname, 'output_file.sql', 'some://url']) 74 | 
75 | # Output should be empty 76 | captured = capsys.readouterr() 77 | assert captured.out == '' 78 | assert captured.err == '' 79 | 80 | # Output file should have been opened 81 | (open_args, open_kwargs) = mocked_open.call_args 82 | assert open_args == ('output_file.sql', 'w') 83 | assert open_kwargs == {} 84 | 85 | # The run function should have been called with the output and URL 86 | (run_call_args, run_call_kwargs) = mocked_run.call_args 87 | assert run_call_args == () 88 | assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'} 89 | assert run_call_kwargs['config'] is None 90 | assert run_call_kwargs['output'] == mocked_open.return_value 91 | assert run_call_kwargs['url'] == 'some://url' 92 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_sanitizers_constant.py: -------------------------------------------------------------------------------- 1 | from database_sanitizer.sanitizers import constant 2 | 3 | 4 | def test_sanitize_null(): 5 | assert constant.sanitize_null(None) is None 6 | assert constant.sanitize_null('') is None 7 | assert constant.sanitize_null('whatever') is None 8 | assert constant.sanitize_null('test') is None 9 | 10 | 11 | def test_sanitize_invalid_django_password(): 12 | assert constant.sanitize_invalid_django_password(None) == '!' 13 | assert constant.sanitize_invalid_django_password('') == '!' 14 | assert constant.sanitize_invalid_django_password('whatever') == '!' 15 | assert constant.sanitize_invalid_django_password('test') == '!' 
16 | 17 | 18 | def test_sanitize_empty_json_dict(): 19 | assert constant.sanitize_empty_json_dict(None) == '{}' 20 | assert constant.sanitize_empty_json_dict('') == '{}' 21 | assert constant.sanitize_empty_json_dict('whatever') == '{}' 22 | assert constant.sanitize_empty_json_dict('test') == '{}' 23 | 24 | 25 | def test_sanitize_empty_json_list(): 26 | assert constant.sanitize_empty_json_list(None) == '[]' 27 | assert constant.sanitize_empty_json_list('') == '[]' 28 | assert constant.sanitize_empty_json_list('whatever') == '[]' 29 | assert constant.sanitize_empty_json_list('test') == '[]' 30 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_sanitizers_derived.py: -------------------------------------------------------------------------------- 1 | from database_sanitizer import session 2 | from database_sanitizer.sanitizers import derived 3 | 4 | 5 | def setup_module(): 6 | session.reset(b'not-so-secret-key') 7 | 8 | 9 | def test_sanitize_uuid4(): 10 | assert derived.sanitize_uuid4(None) is None 11 | assert derived.sanitize_uuid4('') == '' 12 | assert derived.sanitize_uuid4('0') == ( 13 | 'e3a5862f-cffb-4d89-ab3e-5563b27e287a') 14 | assert derived.sanitize_uuid4('00000000000000000000000000000000') == ( 15 | '00000000-0000-0000-0000-000000000000') 16 | assert derived.sanitize_uuid4('00000000-0000-0000-0000-000000000000') == ( 17 | '00000000-0000-0000-0000-000000000000') 18 | assert derived.sanitize_uuid4('e3a5862f-cffb-4d89-ab3e-5563b27e287a') == ( 19 | '88b0225e-6090-459a-999d-9b3a3ab28c53') 20 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_sanitizers_string.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import mock 6 | import pytest 7 | 8 | from ..sanitizers.string import sanitize_empty, sanitize_random, 
sanitize_zfill 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "input_value,expected_output", 13 | ( 14 | ("foo", ""), 15 | ("bar", ""), 16 | ("", ""), 17 | (" ", ""), 18 | (None, None), 19 | ), 20 | ) 21 | def test_sanitize_empty(input_value, expected_output): 22 | assert sanitize_empty(input_value) == expected_output 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "input_value,expected_output", 27 | ( 28 | ("foo", "000"), 29 | ("test test", "000000000"), 30 | ("", ""), 31 | (None, None) 32 | ), 33 | ) 34 | def test_sanitize_zfill(input_value, expected_output): 35 | return sanitize_zfill(input_value) == expected_output 36 | 37 | 38 | @mock.patch('random.choice', return_value='x') 39 | def test_sanitize_random(mocked_random_choice): 40 | assert sanitize_random(None) is None 41 | assert sanitize_random('') == '' 42 | assert sanitize_random('a') == 'x' 43 | assert sanitize_random('hello') == 'xxxxx' 44 | assert sanitize_random('hello world') == 'xxxxxxxxxxx' 45 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_sanitizers_times.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import mock 4 | 5 | from database_sanitizer.sanitizers import times 6 | 7 | 8 | class _FakeDateTime(datetime.datetime): 9 | @staticmethod 10 | def now(): 11 | return datetime.datetime(2018, 1, 1, 12, 00, 00) 12 | 13 | 14 | @mock.patch('random.randint', return_value=42005) 15 | @mock.patch.object(datetime, 'datetime', _FakeDateTime) 16 | def test_sanitize_random_past_timestamp(randint_mock): 17 | assert times.sanitize_random_past_timestamp('old') == ( 18 | '2018-01-01T11:59:17.995000') 19 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_sanitizers_user.py: -------------------------------------------------------------------------------- 1 | from database_sanitizer import session 2 | from 
database_sanitizer.sanitizers import user 3 | 4 | 5 | def setup_module(): 6 | session.reset(b'not-so-secret-key') 7 | 8 | 9 | def test_sanitize_email(): 10 | assert user.sanitize_email(None) is None 11 | assert user.sanitize_email('') == '' 12 | assert user.sanitize_email('test@example.com') == ( 13 | 'zoe.burke@xce13103b.sanitized.net') 14 | assert user.sanitize_email('test2@example.com') == ( 15 | 'Melanie.Pratt@x4feb7f40.sanitized.net') 16 | assert user.sanitize_email('test@example.com') == ( 17 | 'zoe.burke@xce13103b.sanitized.net') 18 | assert user.sanitize_email('test3@example.com') == ( 19 | 'irene.archer@x3d2e92ec.sanitized.net') 20 | assert user.sanitize_email(' test3@example.com ') == ( 21 | 'irene.archer@x3d2e92ec.sanitized.net') 22 | 23 | 24 | def test_sanitize_username(): 25 | assert user.sanitize_username(None) is None 26 | assert user.sanitize_username('') == '' 27 | assert user.sanitize_username('John.Doe') == 'billyda979417' 28 | assert user.sanitize_username('JaneSmith') == 'helena34a7a0b' 29 | assert user.sanitize_username('john-smith') == 'arthurc5a84ec' 30 | assert user.sanitize_username('john-smith ') == 'douglas8d3b8d5e' 31 | assert user.sanitize_username('john smith ') == 'katyfdab90cc' 32 | 33 | 34 | def test_sanitize_full_name_en_gb(): 35 | assert user.sanitize_full_name_en_gb(None) is None 36 | assert user.sanitize_full_name_en_gb('') == '' 37 | assert user.sanitize_full_name_en_gb('John Doe') == 'Francis Walker' 38 | assert user.sanitize_full_name_en_gb('Jane Smith') == 'Declan Burke' 39 | assert user.sanitize_full_name_en_gb('John Smith') == 'Lawrence Norton' 40 | assert user.sanitize_full_name_en_gb('john smith ') == 'Lawrence Norton' 41 | 42 | 43 | def test_sanitize_given_name_en_gb(): 44 | assert user.sanitize_given_name_en_gb(None) is None 45 | assert user.sanitize_given_name_en_gb('') == '' 46 | assert user.sanitize_given_name_en_gb('John') == 'Cheryl' 47 | assert user.sanitize_given_name_en_gb('Jane') == 'Andrea' 48 | assert 
user.sanitize_given_name_en_gb('Foo bar') == 'Elliott' 49 | assert user.sanitize_given_name_en_gb(' Foo BAR ') == 'Elliott' 50 | 51 | 52 | def test_sanitize_surname_en_gb(): 53 | assert user.sanitize_surname_en_gb(None) is None 54 | assert user.sanitize_surname_en_gb('') == '' 55 | assert user.sanitize_surname_en_gb('Doe') == 'Bibi' 56 | assert user.sanitize_surname_en_gb('Smith') == 'Duffy' 57 | assert user.sanitize_surname_en_gb('Anderson') == 'Hodgson' 58 | assert user.sanitize_surname_en_gb('andersOn ') == 'Hodgson' 59 | 60 | 61 | def test_sanitize_email_resets_on_session_reset(): 62 | assert user.sanitize_email('test@example.com') == ( 63 | 'zoe.burke@xce13103b.sanitized.net') 64 | session.reset() 65 | assert user.sanitize_email('test@example.com') != ( 66 | 'zoe.burke@xce13103b.sanitized.net') 67 | session.reset(b'not-so-secret-key') 68 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_session.py: -------------------------------------------------------------------------------- 1 | from database_sanitizer import session 2 | 3 | 4 | def setup_module(): 5 | session.reset(b'not-so-secret-key') 6 | 7 | 8 | def test_hash_text_to_int(): 9 | assert session.hash_text_to_int('hello') == 4100462238 10 | 11 | 12 | def test_hash_text_to_ints(): 13 | assert session.hash_text_to_ints('hello', [4, 8, 16]) == (15, 70, 33129) 14 | 15 | 16 | def test_hash_text(): 17 | assert session.hash_text('hello') == ( 18 | 'f468169e17f4dd5d7318bd6099a4e657ceb0a978cddb4f3382be0da7121659bb') 19 | 20 | 21 | def test_hash_bytes(): 22 | assert session.hash_bytes(b'hello') == ( 23 | 'f468169e17f4dd5d7318bd6099a4e657ceb0a978cddb4f3382be0da7121659bb') 24 | 25 | 26 | def test_get_secret(): 27 | assert session.get_secret() == b'not-so-secret-key' 28 | 29 | 30 | def test_reset(): 31 | old_key = session.get_secret() 32 | session.reset() 33 | new_key = session.get_secret() 34 | assert new_key != old_key 35 | 
session.reset(b'not-so-secret-key') 36 | assert session.get_secret() == b'not-so-secret-key' 37 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_utils_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import pytest 6 | from six.moves.urllib import parse as urlparse 7 | 8 | from ..utils.mysql import ( 9 | decode_mysql_literal, 10 | decode_mysql_string_literal, 11 | get_mysqldump_args_and_env_from_url, 12 | unescape_single_character, 13 | ) 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "url", 18 | ( 19 | "mysql://test:test@localhost/test", 20 | "mysql://localhost:1234/test", 21 | "mysql://localhost", 22 | ), 23 | ) 24 | def test_get_mysqldump_args_and_env_from_url(url): 25 | parsed_url = urlparse.urlparse(url) 26 | 27 | if not parsed_url.path: 28 | with pytest.raises(ValueError): 29 | get_mysqldump_args_and_env_from_url(url=parsed_url) 30 | return 31 | 32 | args, env = get_mysqldump_args_and_env_from_url(url=parsed_url) 33 | 34 | assert isinstance(args, list) 35 | assert isinstance(env, dict) 36 | 37 | assert len(args) > 0 38 | assert "--complete-insert" in args 39 | assert "--extended-insert" in args 40 | assert "--net_buffer_length=10240" in args 41 | assert args[-1] == parsed_url.path[1:] 42 | 43 | if parsed_url.username: 44 | index = args.index("-u") 45 | assert args[index + 1] == parsed_url.username 46 | 47 | if parsed_url.password: 48 | assert env["MYSQL_PWD"] == parsed_url.password 49 | 50 | 51 | @pytest.mark.parametrize( 52 | "text,expected_value", 53 | ( 54 | ("NULL", None), 55 | ("TRUE", True), 56 | ("FALSE", False), 57 | ("12", 12), 58 | ("12.5", 12.5), 59 | ("'test'", "test"), 60 | ), 61 | ) 62 | def test_decode_mysql_literal(text, expected_value): 63 | assert decode_mysql_literal(text) == expected_value 64 | 65 | 66 | def test_decode_mysql_literal_invalid_input(): 67 
| with pytest.raises(ValueError): 68 | decode_mysql_literal("ERROR") 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "text,expected_output", 73 | ( 74 | ("'test'", "test"), 75 | ("'test\\ntest'", "test\ntest"), 76 | ("'\\0'", "\000"), 77 | ("'foo", None), 78 | ("foo'", None), 79 | ("foo", None), 80 | ), 81 | ) 82 | def test_decode_mysql_string_literal(text, expected_output): 83 | if expected_output is None: 84 | with pytest.raises(AssertionError): 85 | decode_mysql_string_literal(text) 86 | else: 87 | assert decode_mysql_string_literal(text) == expected_output 88 | 89 | 90 | @pytest.mark.parametrize( 91 | "text,expected_output", 92 | ( 93 | ("\\\\", "\\"), 94 | ("\\n", "\n"), 95 | ("\\r", "\r"), 96 | ("\\0", "\000"), 97 | ("\\Z", "\032"), 98 | ("\\'", "'"), 99 | ('\\"', '"'), 100 | ), 101 | ) 102 | def test_unescape_single_character(text, expected_output): 103 | class MockRegexpMatch(object): 104 | 105 | def __init__(self, text): 106 | self.text = text 107 | 108 | def group(self, index): 109 | assert index == 0 110 | return self.text 111 | 112 | assert unescape_single_character(MockRegexpMatch(text)) == expected_output 113 | -------------------------------------------------------------------------------- /database_sanitizer/tests/test_utils_postgres.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import pytest 6 | 7 | from ..utils.postgres import ( 8 | DECODE_MAP, 9 | POSTGRES_COPY_NULL_VALUE, 10 | decode_copy_value, 11 | encode_copy_value, 12 | ) 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "input_value,expected_value", 17 | ( 18 | ("", ""), 19 | (POSTGRES_COPY_NULL_VALUE, None), 20 | ("Test", "Test"), 21 | ("\\\\", "\\"), 22 | ("\\b", "\b"), 23 | ("\\f", "\f"), 24 | ("\\n", "\n"), 25 | ("\\r", "\r"), 26 | ("\\t", "\t"), 27 | ("\\v", "\v"), 28 | ("\\xff", "\xff"), 29 | ("\\123", "\123"), 30 | ("Test\\r\\nTest", "Test\r\nTest"), 31 | ) 32 | ) 33 | 
def test_decode_copy_value(input_value, expected_value): 34 | assert decode_copy_value(input_value) == expected_value 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "input_value,expected_value", 39 | ( 40 | ("", ""), 41 | (None, POSTGRES_COPY_NULL_VALUE), 42 | ("Test", "Test"), 43 | ("\\", "\\\\"), 44 | ("\b", "\\b"), 45 | ("\f", "\\f"), 46 | ("\n", "\\n"), 47 | ("\r", "\\r"), 48 | ("\t", "\\t"), 49 | ("\v", "\\v"), 50 | ("\xff", "\xff"), 51 | ("\123", "\123"), 52 | ("Test\r\nTest", "Test\\r\\nTest"), 53 | ) 54 | ) 55 | def test_encode_copy_value(input_value, expected_value): 56 | assert encode_copy_value(input_value) == expected_value 57 | 58 | 59 | def test_invalid_escape_sequence(): 60 | with pytest.raises(ValueError): 61 | decode_copy_value("\\") 62 | with pytest.raises(ValueError): 63 | decode_copy_value("\\X") 64 | 65 | 66 | def test_decode_map_contents(): 67 | assert DECODE_MAP['\\b'] == '\b' 68 | assert DECODE_MAP['\\n'] == '\n' 69 | assert DECODE_MAP['\\t'] == '\t' 70 | assert DECODE_MAP['\\\\'] == '\\' 71 | assert DECODE_MAP['\\0'] == '\0' 72 | assert DECODE_MAP['\\74'] == '\74' 73 | assert DECODE_MAP['\\x0'] == '\0' 74 | assert DECODE_MAP['\\xa'] == '\x0a' 75 | assert DECODE_MAP['\\xA'] == '\x0a' 76 | assert DECODE_MAP['\\x00'] == '\0' 77 | assert DECODE_MAP['\\xa3'] == '\xa3' 78 | assert DECODE_MAP['\\xA3'] == '\xa3' 79 | assert DECODE_MAP['\\xAb'] == '\xab' 80 | assert DECODE_MAP['\\xaB'] == '\xab' 81 | assert DECODE_MAP['\\xff'] == '\xff' 82 | 83 | assert '\\' not in DECODE_MAP, "Unterminated escape is not mapped" 84 | assert '\\z' not in DECODE_MAP, "Invalid escape sequences are not mapped" 85 | 86 | assert len(DECODE_MAP) == 1097 87 | -------------------------------------------------------------------------------- /database_sanitizer/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/utils/__init__.py -------------------------------------------------------------------------------- /database_sanitizer/utils/mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import re 6 | 7 | import pymysql 8 | import six 9 | 10 | 11 | def get_mysqldump_args_and_env_from_url(url): 12 | """ 13 | Constructs list of command line arguments and dictionary of environment 14 | variables that can be given to `mysqldump` executable to obtain database 15 | dump of the database described in given URL. 16 | 17 | :param url: Parsed database URL. 18 | :type url: urllib.urlparse.ParseResult 19 | 20 | :return: List of command line arguments as well as dictionary of 21 | environment variables that can be used to launch the MySQL dump 22 | process to obtain dump of the database. 23 | :rtype: tuple[list[str],dict[str,str]] 24 | """ 25 | args = [ 26 | # Without this, `INSERT INTO` statements will exclude column names from 27 | # the output, which are required for sanitation. 28 | "--complete-insert", 29 | 30 | # This enables use of "extended inserts" where multiple rows of a table 31 | # are included in a single `INSERT INTO` statement (contents of the 32 | # entire table even, if it's within limits). We use it to increase the 33 | # performance of the sanitation and to decrease the dump size. 34 | "--extended-insert", 35 | 36 | # This makes `mysqldump` attempt to limit the size of a single line 37 | # to about 10 kilobytes (10240 bytes). We use it to reduce memory consumption. 38 | "--net_buffer_length=10240", 39 | 40 | # Hostname of the database to connect into, should be always present in 41 | # the parsed database URL.
42 | "-h", 43 | url.hostname, 44 | ] 45 | env = {} 46 | 47 | if url.port is not None: 48 | args.extend(("-P", six.text_type(url.port))) 49 | 50 | if url.username: 51 | args.extend(("-u", url.username)) 52 | 53 | if url.password: 54 | env["MYSQL_PWD"] = url.password 55 | 56 | if len(url.path) < 2 or not url.path.startswith("/"): 57 | raise ValueError("Name of the database is missing from the URL") 58 | 59 | args.append(url.path[1:]) 60 | 61 | return args, env 62 | 63 | 64 | MYSQL_NULL_PATTERN = re.compile(r"^NULL$", re.IGNORECASE) 65 | MYSQL_BOOLEAN_PATTERN = re.compile(r"^(TRUE|FALSE)$", re.IGNORECASE) 66 | MYSQL_FLOAT_PATTERN = re.compile(r"^[+-]?\d*\.\d+([eE][+-]?\d+)?$") 67 | MYSQL_INT_PATTERN = re.compile(r"^\d+$") 68 | MYSQL_STRING_PATTERN = re.compile(r"'(?:[^']|''|\\')*(?=2.0.0 2 | pytest>=3.6.2 3 | pytest-cov 4 | -e .[MySQL] 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = database-sanitizer 3 | version = 1.1.0 4 | description = Sanitizes contents of a database. 
5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/andersinno/python-database-sanitizer 8 | license = MIT 9 | license_file = LICENSE 10 | 11 | [options] 12 | zip_safe = true 13 | include_package_data = true 14 | packages = find: 15 | install_requires = 16 | PyYAML>=3.12 17 | six>=1.11.0 18 | 19 | [options.extras_require] 20 | MySQL = PyMySQL 21 | 22 | [options.packages.find] 23 | exclude = 24 | database_sanitizer.tests 25 | 26 | [options.entry_points] 27 | console_scripts = 28 | database-sanitizer = database_sanitizer.__main__:main 29 | 30 | [bdist_wheel] 31 | universal = 1 32 | 33 | [isort] 34 | multi_line_output = 3 35 | include_trailing_comma = yes 36 | skip = .tox,dist,venv 37 | not_skip = __init__.py 38 | known_first_party = database_sanitizer 39 | default_section = THIRDPARTY 40 | 41 | [coverage:run] 42 | branch = yes 43 | 44 | [coverage:report] 45 | precision = 2 46 | omit = */tests/* 47 | exclude_lines = 48 | ^ *main() 49 | ^ *from typing import 50 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import setuptools 6 | 7 | if __name__ == "__main__": 8 | setuptools.setup(setup_requires=["setuptools>=34.0"]) 9 | --------------------------------------------------------------------------------