├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── database_sanitizer
    ├── __init__.py
    ├── __main__.py
    ├── config.py
    ├── dump
    │   ├── __init__.py
    │   ├── mysql.py
    │   └── postgres.py
    ├── sanitizers
    │   ├── __init__.py
    │   ├── constant.py
    │   ├── derived.py
    │   ├── string.py
    │   ├── times.py
    │   └── user.py
    ├── session.py
    ├── tests
    │   ├── __init__.py
    │   ├── test_config.py
    │   ├── test_dump.py
    │   ├── test_dump_mysql.py
    │   ├── test_dump_postgres.py
    │   ├── test_main.py
    │   ├── test_sanitizers_constant.py
    │   ├── test_sanitizers_derived.py
    │   ├── test_sanitizers_string.py
    │   ├── test_sanitizers_times.py
    │   ├── test_sanitizers_user.py
    │   ├── test_session.py
    │   ├── test_utils_mysql.py
    │   └── test_utils_postgres.py
    └── utils
    │   ├── __init__.py
    │   ├── mysql.py
    │   └── postgres.py
├── requirements-test.txt
├── setup.cfg
└── setup.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | charset = utf-8
 5 | end_of_line = lf
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | 
 9 | [*.{md,py}]
10 | indent_style = space
11 | indent_size = 4
12 | line_length = 79
13 | 
14 | [*.yml]
15 | indent_style = space
16 | indent_size = 2
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.egg
 3 | *.egg-info/
 4 | __pycache__
 5 | /.coverage
 6 | /.eggs
 7 | /.idea
 8 | /.pytest_cache
 9 | /build
10 | /coverage.xml
11 | /dist
12 | /htmlcov
13 | /venv
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: python
 3 | cache: pip
 4 | python:
 5 |   - "2.7"
 6 |   - "3.4"
 7 |   - "3.5"
 8 |   - "3.6"
 9 | install:
10 |   - pip install -r requirements-test.txt
11 | script:
12 |   - py.test -vvv --cov database_sanitizer --cov-report=term-missing
13 | after_success:
14 |   - curl -s https://codecov.io/bash | bash
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Anders Innovations
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Database sanitation tool
  2 | 
  3 | [![pypi][pypi-image]][pypi-url]
  4 | [![travis][travis-image]][travis-url]
  5 | [![codecov][codecov-image]][codecov-url]
  6 | 
  7 | [pypi-image]: https://badge.fury.io/py/database-sanitizer.svg
  8 | [pypi-url]: https://pypi.org/project/database-sanitizer/
  9 | [travis-image]: https://travis-ci.org/andersinno/python-database-sanitizer.svg?branch=master
 10 | [travis-url]: https://travis-ci.org/andersinno/python-database-sanitizer
 11 | [codecov-image]: https://codecov.io/gh/andersinno/python-database-sanitizer/branch/master/graph/badge.svg
 12 | [codecov-url]: https://codecov.io/gh/andersinno/python-database-sanitizer
 13 | 
 14 | `database-sanitizer` is a tool which retrieves a database dump from a
 15 | relational database and performs sanitation on the retrieved data
 16 | according to rules defined in a configuration file. Currently the
 17 | sanitation tool supports both [PostgreSQL] and [MySQL] databases.
 18 | 
 19 | [PostgreSQL]: https://postgres.org
 20 | [MySQL]: https://mysql.com
 21 | 
 22 | ## Installation
 23 | 
 24 | `database-sanitizer` can be installed from [PyPI] with [pip] like this:
 25 | 
 26 | ```bash
 27 | $ pip install database-sanitizer
 28 | ```
 29 | 
 30 | If you are using MySQL, you need to install the package like this
 31 | instead, so that additional requirements are included:
 32 | 
 33 | ```bash
 34 | $ pip install database-sanitizer[MySQL]
 35 | ```
 36 | 
 37 | [PyPI]: https://pypi.org
 38 | [pip]: https://pip.pypa.io/en/stable/
 39 | 
 40 | ## Usage
 41 | 
 42 | Once the package has been installed, `database-sanitizer` can be used
 43 | like this:
 44 | 
 45 | ```bash
 46 | $ database-sanitizer <DATABASE-URL>
 47 | ```
 48 | 
 49 | Command line argument `DATABASE-URL` needs to be provided so the tool
 50 | knows how to retrieve the dump from the database. With PostgreSQL, it
 51 | would be something like this:
 52 | 
 53 | ```bash
 54 | $ database-sanitizer postgres://user:password@host/database
 55 | ```
 56 | 
 57 | However, unless a configuration file is provided, no sanitation will be
 58 | performed on the retrieved database dump, which leads us to the next
 59 | section which will be...
 60 | 
 61 | ## Configuration
 62 | 
 63 | Rules for the sanitation can be given in a configuration file written in
 64 | [YAML]. Path to the configuration file is then given to the command line
 65 | utility with `--config` argument (`-c` for shorthand) like this:
 66 | 
 67 | [YAML]: http://yaml.org
 68 | 
 69 | ```bash
 70 | $ database-sanitizer -c config.yml postgres://user:password@host/database
 71 | ```
 72 | 
 73 | The configuration file uses the following kind of syntax:
 74 | 
 75 | ```YAML
 76 | config:
 77 |   addons:
 78 |     - some.other.package
 79 |     - yet.another.package
 80 |   extra_parameters: # These parameters will be passed to the dump tool CLI
 81 |     mysqldump:
 82 |       - "--single-transaction" # Included by default
 83 |     pg_dump:
 84 |       - "--exclude-table=something"
 85 | strategy:
 86 |   user:
 87 |     first_name: name.first_name
 88 |     last_name: name.last_name
 89 |     secret_key: string.empty
 90 |   access_log: skip_rows
 91 | ```
 92 | 
 93 | The example configuration above first lists two "addon packages", which
 94 | are names of Python packages where the sanitizer will be looking for
 95 | sanitizer functions. They are completely optional and can be omitted,
 96 | in which case only sanitizer functions defined in a package called
 97 | `sanitizers` and the built-in sanitizers will be used instead.
 98 | 
 99 | It's also possible to define extra parameters to pass to the dump tool
100 | (`mysqldump` or `pg_dump`). By default, `mysqldump` will include the
101 | `--single-transaction` extra parameter. You can disable this by defining the
102 | extra parameters in the config file explicitly, e.g. with an empty array `[]`.
103 | 
104 | The `strategy` portion of the configuration contains the actual
105 | sanitation rules. First you define the name of the database table (in
106 | the example that would be `user`), followed by the column names in that
107 | table, each one mapped to a sanitation function name. The name of the
108 | sanitation function consists of two parts separated from each other by
109 | a dot: Python module name and name of the actual function, which will
110 | be prefixed with `sanitize_`, so `name.first_name` would be a function
111 | called `sanitize_first_name` in a file called `name.py`.
112 | 
113 | Table content can be left out completely from the sanitized dump by
114 | setting table strategy to `skip_rows` (check `access_log` table in the
115 | example config). This will leave out all `INSERT INTO` (MySQL) or `COPY`
116 | (PostgreSQL) statements from the sanitized dump file. `CREATE TABLE`
117 | statements will not be removed.
118 | 


--------------------------------------------------------------------------------
/database_sanitizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/__init__.py


--------------------------------------------------------------------------------
/database_sanitizer/__main__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import unicode_literals
 4 | 
 5 | import argparse
 6 | import codecs
 7 | import os
 8 | import sys
 9 | 
10 | import six
11 | 
12 | from .config import Configuration
13 | from .dump import run
14 | 
15 | 
def main(argv=None):
    """
    Command line entry point of the database sanitizer.

    Parses the command line, optionally loads the sanitizer configuration
    and writes the sanitized database dump either into the requested output
    file or, when no output file is given, into standard output.

    :param argv: Command line arguments, program name included. When
                 omitted, `sys.argv` is used as it is at the time of the
                 call (not as it was when this module was imported).
    :type argv: list[str]|None
    """
    # Imported locally because the module level imports do not include it.
    import io

    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=(argv[0] if argv else "database-sanitizer"),
        description="Sanitizes contents of databases.",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        dest="config",
        help="Path to the sanitizer configuration file.",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        dest="output",
        help=(
            "Path to the file where the sanitized database will be written "
            "into. If omitted, standard output will be used instead."
        ),
    )
    parser.add_argument(
        "url",
        help="Database URL to which to connect into and sanitize contents.",
    )

    args = parser.parse_args(args=argv[1:])
    output = sys.stdout
    if six.PY2:
        # Python 2 standard output accepts bytes, so wrap it into a writer
        # which encodes the text lines as UTF-8.
        output = codecs.getwriter("utf-8")(output)
    config = None

    if args.config:
        # Make modules located next to the configuration file importable
        # before the configuration is loaded.
        conf_dir = os.path.realpath(os.path.dirname(args.config))
        sys.path.insert(0, conf_dir)
        config = Configuration.from_file(args.config)
    if args.output:
        # Always write the dump as UTF-8 instead of relying on the platform
        # default encoding, so that non-ASCII content neither fails nor
        # depends on the locale. Opened only after the configuration has
        # been loaded, so a broken configuration does not truncate the file.
        output = io.open(args.output, "w", encoding="utf-8")

    try:
        run(
            url=args.url,
            output=output,
            config=config,
        )
    finally:
        # Only close what was opened here; standard output is left alone.
        if args.output:
            output.close()
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     main()
69 | 


--------------------------------------------------------------------------------
/database_sanitizer/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import importlib
  6 | 
  7 | import six
  8 | import yaml
  9 | 
 10 | __all__ = ("Configuration", "ConfigurationError")
 11 | 
 12 | SKIP_ROWS_CONFIG_VALUE = "skip_rows"
 13 | MYSQLDUMP_DEFAULT_PARAMETERS = ["--single-transaction"]
 14 | PG_DUMP_DEFAULT_PARAMETERS = []
 15 | 
 16 | 
 17 | class ConfigurationError(ValueError):
 18 |     """
 19 |     Custom exception type used to indicate configuration file errors.
 20 |     """
 21 | 
 22 | 
 23 | class Configuration(object):
 24 |     """
 25 |     Object representation of database sanitizer configuration, usually read
 26 |     from a YAML file.
 27 |     """
 28 |     def __init__(self):
 29 |         self.sanitizers = {}
 30 |         self.skip_rows_for_tables = []
 31 |         self.addon_packages = []
 32 |         self.mysqldump_params = []
 33 |         self.pg_dump_params = []
 34 | 
 35 |     @classmethod
 36 |     def from_file(cls, filename):
 37 |         """
 38 |         Reads configuration from given path to a file in local file system and
 39 |         returns parsed version of it.
 40 | 
 41 |         :param filename: Path to the YAML file in local file system where the
 42 |                          configuration will be read from.
 43 |         :type filename: str
 44 | 
 45 |         :return: Configuration instance parsed from given configuration file.
 46 |         :rtype: Configuration
 47 |         """
 48 |         instance = cls()
 49 | 
 50 |         with open(filename, "rb") as file_stream:
 51 |             config_data = yaml.safe_load(file_stream)
 52 | 
 53 |         instance.load(config_data)
 54 | 
 55 |         return instance
 56 | 
 57 |     def load(self, config_data):
 58 |         """
 59 |         Loads sanitizers according to rulesets defined in given already parsed
 60 |         configuration file.
 61 | 
 62 |         :param config_data: Already parsed configuration data, as dictionary.
 63 |         :type config_data: dict[str,any]
 64 |         """
 65 |         if not isinstance(config_data, dict):
 66 |             raise ConfigurationError(
 67 |                 "Configuration data is %s instead of dict." % (
 68 |                     type(config_data),
 69 |                 )
 70 |             )
 71 | 
 72 |         self.load_addon_packages(config_data)
 73 |         self.load_sanitizers(config_data)
 74 |         self.load_dump_extra_parameters(config_data)
 75 | 
 76 |     def load_dump_extra_parameters(self, config_data):
 77 |         """
 78 |         Loads extra parameters for mysqldump and/or pg_dump CLI usage. These
 79 |         parameters should be added to the mysqldump and/or pg_dump command call
 80 |         when taking a dump.
 81 | 
 82 |         :param config_data: Already parsed configuration data, as dictionary.
 83 |         :type config_data: dict[str,any]
 84 |         """
 85 |         section_config = config_data.get("config", {})
 86 |         if not isinstance(section_config, dict):
 87 |             raise ConfigurationError(
 88 |                 "'config' is %s instead of dict" % (
 89 |                     type(section_config),
 90 |                 ),
 91 |             )
 92 | 
 93 |         section_extra_parameters = section_config.get("extra_parameters", {})
 94 |         if not isinstance(section_extra_parameters, dict):
 95 |             raise ConfigurationError(
 96 |                 "'config.extra_parameters' is %s instead of dict" % (
 97 |                     type(section_extra_parameters),
 98 |                 ),
 99 |             )
100 | 
101 |         mysqldump_params = section_extra_parameters.get("mysqldump", MYSQLDUMP_DEFAULT_PARAMETERS)
102 |         if not isinstance(mysqldump_params, list):
103 |             raise ConfigurationError(
104 |                 "'config.extra_parameters.mysqldump' is %s instead of list" % (
105 |                     type(mysqldump_params),
106 |                 ),
107 |             )
108 | 
109 |         pg_dump_params = section_extra_parameters.get("pg_dump", PG_DUMP_DEFAULT_PARAMETERS)
110 |         if not isinstance(pg_dump_params, list):
111 |             raise ConfigurationError(
112 |                 "'config.extra_parameters.pg_dump' is %s instead of list" % (
113 |                     type(pg_dump_params),
114 |                 ),
115 |             )
116 | 
117 |         self.mysqldump_params = mysqldump_params
118 |         self.pg_dump_params = pg_dump_params
119 | 
120 |     def load_addon_packages(self, config_data):
121 |         """
122 |         Loads the module paths from which the configuration will attempt to
123 |         load sanitizers from. These must be stored as a list of strings under
124 |         "config.addons" section of the configuration data.
125 | 
126 |         :param config_data: Already parsed configuration data, as dictionary.
127 |         :type config_data: dict[str,any]
128 |         """
129 |         section_config = config_data.get("config")
130 |         if not isinstance(section_config, dict):
131 |             if section_config is None:
132 |                 return
133 |             raise ConfigurationError(
134 |                 "'config' is %s instead of dict" % (
135 |                     type(section_config),
136 |                 ),
137 |             )
138 | 
139 |         section_addons = section_config.get("addons", [])
140 |         if not isinstance(section_addons, list):
141 |             raise ConfigurationError(
142 |                 "'config.addons' is %s instead of list" % (
143 |                     type(section_addons),
144 |                 ),
145 |             )
146 | 
147 |         for index, module_path in enumerate(section_addons):
148 |             if not isinstance(module_path, str):
149 |                 raise ConfigurationError(
150 |                     "Item %d in 'config.addons' is %s instead of string" % (
151 |                         index,
152 |                         type(module_path),
153 |                     ),
154 |                 )
155 | 
156 |         self.addon_packages = list(section_addons)
157 | 
158 |     def load_sanitizers(self, config_data):
159 |         """
160 |         Loads sanitizers possibly defined in the configuration under dictionary
161 |         called "strategy", which should contain mapping of database tables with
162 |         column names mapped into sanitizer function names.
163 | 
164 |         :param config_data: Already parsed configuration data, as dictionary.
165 |         :type config_data: dict[str,any]
166 |         """
167 |         section_strategy = config_data.get("strategy")
168 |         if not isinstance(section_strategy, dict):
169 |             if section_strategy is None:
170 |                 return
171 |             if section_strategy != SKIP_ROWS_CONFIG_VALUE:
172 |                 raise ConfigurationError(
173 |                     "'strategy' is %s instead of dict" % (
174 |                         type(section_strategy),
175 |                     ),
176 |                 )
177 | 
178 |         for table_name, column_data in six.iteritems(section_strategy):
179 |             if column_data == SKIP_ROWS_CONFIG_VALUE:
180 |                 self.skip_rows_for_tables.append(table_name)
181 |                 continue
182 | 
183 |             if not isinstance(column_data, dict):
184 |                 if column_data is None:
185 |                     continue
186 |                 raise ConfigurationError(
187 |                     "'strategy.%s' is %s instead of dict" % (
188 |                         table_name,
189 |                         type(column_data),
190 |                     ),
191 |                 )
192 | 
193 |             for column_name, sanitizer_name in six.iteritems(column_data):
194 |                 if sanitizer_name is None:
195 |                     continue
196 | 
197 |                 if not isinstance(sanitizer_name, str):
198 |                     raise ConfigurationError(
199 |                         "'strategy.%s.%s' is %s instead of string" % (
200 |                             table_name,
201 |                             column_name,
202 |                             type(sanitizer_name),
203 |                         ),
204 |                     )
205 | 
206 |                 sanitizer_callback = self.find_sanitizer(sanitizer_name)
207 |                 sanitizer_key = "%s.%s" % (table_name, column_name)
208 |                 self.sanitizers[sanitizer_key] = sanitizer_callback
209 | 
210 |     def find_sanitizer(self, name):
211 |         """
212 |         Searches for a sanitizer function with given name. The name should
213 |         contain two parts separated from each other with a dot, the first
214 |         part being the module name while the second being name of the function
215 |         contained in the module, when it's being prefixed with "sanitize_".
216 | 
217 |         The lookup process consists from three attempts, which are:
218 | 
219 |         1. First package to look the module will be top level package called
220 |            "sanitizers".
221 |         2. Module will be looked under the "addon" packages, if they have been
222 |            defined.
223 |         3. Finally the sanitation function will be looked from the builtin
224 |            sanitizers located in "database_sanitizer.sanitizers" package.
225 | 
226 |         If none of these provide any results, ConfigurationError will be
227 |         thrown.
228 | 
229 |         :param name: "Full name" of the sanitation function containing name
230 |                      of the module as well as name of the function.
231 |         :type name: str
232 | 
233 |         :return: First function which can be imported with the given name.
234 |         :rtype: callable
235 |         """
236 |         # Split the sanitizer name into two parts, one containing the Python
237 |         # module name, while second containing portion of the function name
238 |         # we are looking for.
239 |         name_parts = name.split(".")
240 |         if len(name_parts) < 2:
241 |             raise ConfigurationError(
242 |                 "Unable to separate module name from function name in '%s'" % (
243 |                     name,
244 |                 ),
245 |             )
246 | 
247 |         module_name_suffix = ".".join(name_parts[:-1])
248 |         function_name = "sanitize_%s" % (name_parts[-1],)
249 | 
250 |         # Phase 1: Look for custom sanitizer under a top level package called
251 |         # "sanitizers".
252 |         module_name = "sanitizers.%s" % (module_name_suffix,)
253 |         callback = self.find_sanitizer_from_module(
254 |             module_name=module_name,
255 |             function_name=function_name,
256 |         )
257 |         if callback:
258 |             return callback
259 | 
260 |         # Phase 2: Look for the sanitizer under "addon" packages, if any of
261 |         # such have been defined.
262 |         for addon_package_name in self.addon_packages:
263 |             module_name = "%s.%s" % (
264 |                 addon_package_name,
265 |                 module_name_suffix,
266 |             )
267 |             callback = self.find_sanitizer_from_module(
268 |                 module_name=module_name,
269 |                 function_name=function_name,
270 |             )
271 |             if callback:
272 |                 return callback
273 | 
274 |         # Phase 3: Look from builtin sanitizers.
275 |         module_name = "database_sanitizer.sanitizers.%s" % (module_name_suffix,)
276 |         callback = self.find_sanitizer_from_module(
277 |             module_name=module_name,
278 |             function_name=function_name,
279 |         )
280 |         if callback:
281 |             return callback
282 | 
283 |         # Give up.
284 |         raise ConfigurationError("Unable to find sanitizer called '%s'" % (
285 |             name,
286 |         ))
287 | 
288 |     @staticmethod
289 |     def find_sanitizer_from_module(module_name, function_name):
290 |         """
291 |         Attempts to find sanitizer function from given module. If the module
292 |         cannot be imported, or function with given name does not exist in it,
293 |         nothing will be returned by this method. Otherwise the found sanitizer
294 |         function will be returned.
295 | 
296 |         :param module_name: Name of the module to import the function from.
297 |         :type module_name: str
298 | 
299 |         :param function_name: Name of the function to look for inside the
300 |                               module.
301 |         :type function_name: str
302 | 
303 |         :return: Sanitizer function found from the module, if it can be
304 |                  imported and it indeed contains function with the given name.
305 |                  Otherwise None will be returned instead.
306 |         :rtype: callback|None
307 |         """
308 |         try:
309 |             module = importlib.import_module(module_name)
310 |         except ImportError:
311 |             return None
312 | 
313 |         # Look for the function inside the module. At this point it could be
314 |         # pretty much anything.
315 |         callback = getattr(module, function_name, None)
316 | 
317 |         # Function does not exist in this module? Give up.
318 |         if callback is None:
319 |             return None
320 | 
321 |         # It's actually callable function? Return it.
322 |         if callable(callback):
323 |             return callback
324 | 
325 |         # Sanitizer seems to be something else than a function. Throw an
326 |         # exception to report such problem.
327 |         raise ConfigurationError("'%s' in '%s' is %s instead of function" % (
328 |             function_name,
329 |             module_name,
330 |             type(callback),
331 |         ))
332 | 
333 |     def get_sanitizer_for(self, table_name, column_name):
334 |         """
335 |         Get sanitizer for given table and column name.
336 | 
337 |         :param table_name: Name of the database table.
338 |         :type table_name: str
339 | 
340 |         :param column_name: Name of the database column.
341 |         :type column_name: str
342 | 
343 |         :return: Sanitizer function or None if nothing is configured
344 |         :rtype: Optional[Callable[[Optional[str]], Optional[str]]]
345 |         """
346 |         sanitizer_key = "%s.%s" % (table_name, column_name)
347 |         return self.sanitizers.get(sanitizer_key)
348 | 
349 |     def sanitize(self, table_name, column_name, value):
350 |         """
351 |         Sanitizes given value extracted from the database according to the
352 |         sanitation configuration.
353 | 
354 |         TODO: Add support for dates, booleans and other types found in SQL than
355 |         string.
356 | 
357 |         :param table_name: Name of the database table from which the value is
358 |                            from.
359 |         :type table_name: str
360 | 
361 |         :param column_name: Name of the database column from which the value is
362 |                             from.
363 |         :type column_name: str
364 | 
365 |         :param value: Value from the database, either in text form or None if
366 |                       the value is null.
367 |         :type value: str|None
368 | 
369 |         :return: Sanitized version of the given value.
370 |         :rtype: str|None
371 |         """
372 |         sanitizer_callback = self.get_sanitizer_for(table_name, column_name)
373 |         return sanitizer_callback(value) if sanitizer_callback else value
374 | 


--------------------------------------------------------------------------------
/database_sanitizer/dump/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import unicode_literals
 4 | 
 5 | import importlib
 6 | 
 7 | from six.moves.urllib import parse as urlparse
 8 | 
 9 | from .. import session
10 | 
#: Maps supported database URL schemes into paths of the modules which
#: implement the dump sanitation for that kind of database.
SUPPORTED_DATABASE_MODULES = {
    "mysql": "database_sanitizer.dump.mysql",
    "postgres": "database_sanitizer.dump.postgres",
    "postgresql": "database_sanitizer.dump.postgres",
    "postgis": "database_sanitizer.dump.postgres",
}


# Register supported database schemes. The `uses_netloc` list belongs to
# the URL parsing module and is therefore shared by the whole process, so
# schemes already found from it are not added again, should this module be
# executed more than once.
for scheme in SUPPORTED_DATABASE_MODULES:
    if scheme not in urlparse.uses_netloc:
        urlparse.uses_netloc.append(scheme)


def run(url, output, config):
    """
    Extracts database dump from given database URL and outputs sanitized
    copy of it into given stream.

    :param url: URL to the database which is to be sanitized.
    :type url: str

    :param output: Stream where sanitized copy of the database dump will be
                   written into.
    :type output: file

    :param config: Optional sanitizer configuration to be used for sanitation
                   of the values stored in the database.
    :type config: database_sanitizer.config.Configuration|None

    :raises ValueError: If the scheme of the URL is not one of the keys of
                        `SUPPORTED_DATABASE_MODULES`.
    """
    parsed_url = urlparse.urlparse(url)
    db_module_path = SUPPORTED_DATABASE_MODULES.get(parsed_url.scheme)
    if not db_module_path:
        raise ValueError(
            "Unsupported database scheme: '%s'" % (parsed_url.scheme,)
        )
    # The database specific module is imported only after its scheme has
    # been requested.
    db_module = importlib.import_module(db_module_path)
    session.reset()
    for line in db_module.sanitize(url=parsed_url, config=config):
        output.write(line + "\n")
48 | 


--------------------------------------------------------------------------------
/database_sanitizer/dump/mysql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import codecs
  6 | import re
  7 | import subprocess
  8 | import io
  9 | 
 10 | from ..utils.mysql import (
 11 |     decode_mysql_literal,
 12 |     encode_mysql_literal,
 13 |     get_mysqldump_args_and_env_from_url,
 14 | )
 15 | from ..config import MYSQLDUMP_DEFAULT_PARAMETERS
 16 | 
#: Regular expression which matches `INSERT INTO` statements produced by the
#: `mysqldump` utility, even when extended inserts have been enabled.
#:
#: The whole line must be a single statement. Named groups: `table` is the
#: backtick quoted table name without the backticks, `columns` is the raw
#: text between the parentheses of the column list and `values` is
#: everything between `VALUES ` and the terminating semicolon.
INSERT_INTO_PATTERN = re.compile(
    r"^INSERT INTO `(?P<table>[^`]*)`"
    r" \((?P<columns>.*)\)"
    r" VALUES (?P<values>.*);$"
)


#: Regular expression which matches various kinds of MySQL literals.
#:
#: Group 1 is the literal itself: either a single quoted string (in which
#: quotes may appear doubled or escaped with a backslash, and whose closing
#: quote must not be directly preceded by a backslash) or a run of
#: characters containing no quotes, commas or parentheses. Group 2 is the
#: delimiter following the literal.
#:
#: NOTE(review): because of the lookbehind, a string literal ending with an
#: escaped backslash (`'...\\'`) looks like it would not be matched as
#: a whole -- confirm against real `mysqldump` output.
VALUE_PATTERN = re.compile(
    r"""
    # Group 1:
    (
        '(?:[^']|''|\\')*(?<![\\])'     # String literal
        |                               # or...
        [^',()]+                        # NULL, TRUE, etc.
    )
    # Group 2:
    (
        [,)]                            # Comma or closing parenthesis.
    )
    """,
    re.VERBOSE,
)
 42 | 
 43 | 
 44 | def sanitize(url, config):
 45 |     """
 46 |     Obtains dump of MySQL database by executing `mysqldump` command and
 47 |     sanitizes it output.
 48 | 
 49 |     :param url: URL to the database which is going to be sanitized, parsed by
 50 |                 Python's URL parser.
 51 |     :type url: urllib.urlparse.ParseResult
 52 | 
 53 |     :param config: Optional sanitizer configuration to be used for sanitation
 54 |                    of the values stored in the database.
 55 |     :type config: database_sanitizer.config.Configuration|None
 56 |     """
 57 |     if url.scheme != "mysql":
 58 |         raise ValueError("Unsupported database type: '%s'" % (url.scheme,))
 59 | 
 60 |     args, env = get_mysqldump_args_and_env_from_url(url=url)
 61 | 
 62 |     extra_params = MYSQLDUMP_DEFAULT_PARAMETERS
 63 |     if config:
 64 |         extra_params = config.mysqldump_params
 65 | 
 66 |     process = subprocess.Popen(
 67 |         args=["mysqldump"] + args + extra_params,
 68 |         env=env,
 69 |         stdout=subprocess.PIPE,
 70 |     )
 71 | 
 72 |     return sanitize_from_stream(stream=process.stdout, config=config)
 73 | 
 74 | 
 75 | def sanitize_from_stream(stream, config):
 76 |     """
 77 |     Reads dump of MySQL database from given stream and sanitizes it.
 78 | 
 79 |     :param stream: Stream where the database dump is expected to be available
 80 |                    from, such as stdout of `mysqldump` process.
 81 |     :type stream: file
 82 | 
 83 |     :param config: Optional sanitizer configuration to be used for sanitation
 84 |                    of the values stored in the database.
 85 |     :type config: database_sanitizer.config.Configuration|None
 86 |     """
 87 |     for line in io.TextIOWrapper(stream, encoding="utf-8"):
 88 |         # Eat the trailing new line.
 89 |         line = line.rstrip("\n")
 90 | 
 91 |         # If there is no configuration it means that there are no sanitizers
 92 |         # available.
 93 |         if not config:
 94 |             yield line
 95 |             continue
 96 | 
 97 |         # Does the line contain `INSERT INTO` statement? If not, use the line
 98 |         # as-is and continue into next one.
 99 |         insert_into_match = INSERT_INTO_PATTERN.match(line)
100 |         if not insert_into_match:
101 |             yield line
102 |             continue
103 | 
104 |         table_name = insert_into_match.group("table")
105 |         column_names = parse_column_names(insert_into_match.group("columns"))
106 | 
107 |         # Skip `INSERT INTO` statement if table rows are configured
108 |         # to be skipped.
109 |         if table_name in config.skip_rows_for_tables:
110 |             continue
111 | 
112 |         # Collect sanitizers possibly used for this table and place them into
113 |         # a dictionary from which we can look them up by index later.
114 |         sanitizers = {}
115 |         for index, column_name in enumerate(column_names):
116 |             sanitizer = config.get_sanitizer_for(
117 |                 table_name=table_name,
118 |                 column_name=column_name,
119 |             )
120 |             if sanitizer:
121 |                 sanitizers[index] = sanitizer
122 | 
123 |         # If this table has no sanitizers available, use the line as-is and
124 |         # continue into next line.
125 |         if len(sanitizers) == 0:
126 |             yield line
127 |             continue
128 | 
129 |         # Constructs list of tuples containing sanitized column names.
130 |         sanitized_value_tuples = []
131 |         for values in parse_values(insert_into_match.group("values")):
132 |             if len(column_names) != len(values):
133 |                 raise ValueError("Mismatch between column names and values")
134 |             sanitized_values = []
135 |             for index, value in enumerate(values):
136 |                 sanitizer_callback = sanitizers.get(index)
137 |                 if sanitizer_callback:
138 |                     value = sanitizer_callback(value)
139 |                 sanitized_values.append(encode_mysql_literal(value))
140 |             sanitized_value_tuples.append(sanitized_values)
141 | 
142 |         # Finally create new `INSERT INTO` statement from the sanitized values.
143 |         yield "INSERT INTO `%s` (%s) VALUES %s;" % (
144 |             table_name,
145 |             ", ".join("`" + column_name + "`" for column_name in column_names),
146 |             ",".join(
147 |                 "(" + ",".join(value_tuple) + ")"
148 |                 for value_tuple in sanitized_value_tuples
149 |             ),
150 |         )
151 | 
152 | 
def parse_column_names(text):
    """
    Splits the column list of an `INSERT INTO` statement into plain column
    names.

    The text is split at commas, surrounding whitespace is removed from
    each piece and a pair of enclosing backticks, if present, is dropped.

    :param text: Text extracted from MySQL's `INSERT INTO` statement
                 containing quoted and comma separated column names.
    :type text: str

    :return: Tuple containing just the column names.
    :rtype: tuple[str]
    """
    backtick_quoted = re.compile(r"^`(.*)`$")
    names = []
    for piece in text.split(","):
        names.append(backtick_quoted.sub(r"\1", piece.strip()))
    return tuple(names)
169 | 
170 | 
def parse_values(text):
    """
    Parses the rows of an extended format `INSERT INTO` statement.

    The text is scanned one literal at a time. Every literal is decoded and
    collected into the current row, and the row is yielded as a tuple once
    its closing parenthesis is reached. Scanning stops quietly at the first
    position where no literal can be recognized.

    :param text: Text extracted from MySQL's `INSERT INTO` statement
                 containing quoted and comma separated column values. Must
                 begin with an opening parenthesis.
    :type text: str

    :return: Generator yielding one tuple of decoded values per row.
    """
    assert text.startswith("(")
    cursor = 1  # Step over the parenthesis which opens the first row.
    end = len(text)
    row = []
    while cursor < end:
        found = VALUE_PATTERN.match(text, cursor)
        if found is None:
            return
        literal, delimiter = found.groups()
        row.append(decode_mysql_literal(literal.strip()))
        # The match covers the literal and its one character delimiter.
        cursor = found.end()
        if delimiter == ")":
            yield tuple(row)
            row = []
            # Skip comma and open parenthesis ",(" preceding the next row.
            cursor += 2
197 | 


--------------------------------------------------------------------------------
/database_sanitizer/dump/postgres.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import codecs
  6 | import re
  7 | import subprocess
  8 | 
  9 | from ..utils.postgres import decode_copy_value, encode_copy_value
 10 | from ..config import PG_DUMP_DEFAULT_PARAMETERS
 11 | 
 12 | COPY_LINE_PATTERN = re.compile(
 13 |     r"^COPY \"(?P<schema>[^\"]*)\".\"(?P<table>[^\"]*)\" "
 14 |     r"\((?P<columns>.*)\) "
 15 |     r"FROM stdin;$"
 16 | )
 17 | 
 18 | 
 19 | def sanitize(url, config):
 20 |     """
 21 |     Obtains dump of an Postgres database by executing `pg_dump` command and
 22 |     sanitizes it's output.
 23 | 
 24 |     :param url: URL to the database which is going to be sanitized, parsed by
 25 |                 Python's URL parser.
 26 |     :type url: six.moves.urllib.parse.ParseResult
 27 | 
 28 |     :param config: Optional sanitizer configuration to be used for sanitation
 29 |                    of the values stored in the database.
 30 |     :type config: database_sanitizer.config.Configuration|None
 31 |     """
 32 |     if url.scheme not in ("postgres", "postgresql", "postgis"):
 33 |         raise ValueError("Unsupported database type: '%s'" % (url.scheme,))
 34 | 
 35 |     extra_params = PG_DUMP_DEFAULT_PARAMETERS
 36 |     if config:
 37 |         extra_params = config.pg_dump_params
 38 | 
 39 |     process = subprocess.Popen(
 40 |         (
 41 |             "pg_dump",
 42 |             # Force output to be UTF-8 encoded.
 43 |             "--encoding=utf-8",
 44 |             # Quote all table and column names, just in case.
 45 |             "--quote-all-identifiers",
 46 |             # Luckily `pg_dump` supports DB URLs, so we can just pass it the
 47 |             # URL as argument to the command.
 48 |             "--dbname",
 49 |             url.geturl().replace('postgis://', 'postgresql://'),
 50 |          ) + tuple(extra_params),
 51 |         stdout=subprocess.PIPE,
 52 |     )
 53 | 
 54 |     sanitize_value_line = None
 55 |     current_table = None
 56 |     current_table_columns = None
 57 |     skip_table = False
 58 | 
 59 |     for line in codecs.getreader("utf-8")(process.stdout):
 60 |         # Eat the trailing new line.
 61 |         line = line.rstrip("\n")
 62 | 
 63 |         # Are we currently in middle of `COPY` statement?
 64 |         if current_table:
 65 |             # Backslash following a dot marks end of an `COPY` statement.
 66 |             if line == "\\.":
 67 |                 current_table = None
 68 |                 current_table_columns = None
 69 |                 if not skip_table:
 70 |                     yield "\\."
 71 |                 skip_table = False
 72 |                 continue
 73 | 
 74 |             if skip_table:
 75 |                 continue
 76 | 
 77 |             if not sanitize_value_line:
 78 |                 yield line
 79 |                 continue
 80 | 
 81 |             yield sanitize_value_line(line)
 82 |             continue
 83 | 
 84 |         # Is the line beginning of `COPY` statement?
 85 |         copy_line_match = COPY_LINE_PATTERN.match(line)
 86 |         if not copy_line_match:
 87 |             yield line
 88 |             continue
 89 | 
 90 |         current_table = copy_line_match.group("table")
 91 |         current_table_columns = parse_column_names(copy_line_match.group("columns"))
 92 | 
 93 |         # Skip `COPY` statement if table rows are configured
 94 |         # to be skipped.
 95 |         if config and current_table in config.skip_rows_for_tables:
 96 |             skip_table = True
 97 |             continue
 98 | 
 99 |         sanitize_value_line = get_value_line_sanitizer(
100 |             config, current_table, current_table_columns)
101 | 
102 |         yield line
103 | 
104 | 
def get_value_line_sanitizer(config, table, columns):
    """
    Builds a function which sanitizes one data line of a `COPY` statement.

    :param config: Sanitizer configuration, or a false value when there is
                   none.
    :param table: Name of the table the data lines belong to.
    :param columns: Column names of the table, in the order the values
                    appear on a data line.

    :return: Function taking a tab separated data line and returning it with
             the sanitized values, or None when there is no configuration or
             none of the columns has a sanitizer. The returned function
             raises ValueError if the line has different number of values
             than there are columns.
    """
    if not config:
        return None

    def wrap(sanitizer):
        # Values are decoded from the `COPY` format before sanitation and
        # encoded back after it.
        def decode_sanitize_encode(value):
            return encode_copy_value(sanitizer(decode_copy_value(value)))

        return decode_sanitize_encode

    converters = []
    for column in columns:
        found = config.get_sanitizer_for(table, column)
        converters.append(wrap(found) if found else _identity)

    # No column needs sanitation, so the lines can be used as they are.
    if not any(converter is not _identity for converter in converters):
        return None

    def sanitize_line(line):
        fields = line.split('\t')
        if len(fields) != len(columns):
            raise ValueError("Mismatch between column names and values.")
        sanitized = (
            converter(field) for (converter, field) in zip(converters, fields)
        )
        return '\t'.join(sanitized)

    return sanitize_line
134 | 
135 | 
136 | def _identity(x):
137 |     return x
138 | 
139 | 
def parse_column_names(text):
    """
    Splits the column list of a `COPY` statement into plain column names.

    The text is split at commas, surrounding whitespace is removed from
    each piece and a pair of enclosing double quotes, if present, is
    dropped.

    :param text: Text extracted from `COPY` statement containing quoted and
                 comma separated column names.
    :type text: str

    :return: Tuple containing just the column names.
    :rtype: tuple[str]
    """
    double_quoted = re.compile(r"^\"(.*)\"$")
    names = []
    for piece in text.split(","):
        names.append(double_quoted.sub(r"\1", piece.strip()))
    return tuple(names)
156 | 
157 | 
def parse_values(text):
    """
    Decodes the values of one data line following a `COPY` statement.

    The line is split at tab characters and every piece is decoded with
    `decode_copy_value`.

    :param text: Line following `COPY` statement containing values.
    :type text: str

    :return: Column values extracted from the given line.
    :rtype: tuple[str|None]
    """
    raw_values = text.split("\t")
    return tuple(map(decode_copy_value, raw_values))
170 | 


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/sanitizers/__init__.py


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/constant.py:
--------------------------------------------------------------------------------
 1 | def sanitize_null(value):
 2 |     return None
 3 | 
 4 | 
 5 | def sanitize_empty_json_dict(value):
 6 |     return '{}'
 7 | 
 8 | 
 9 | def sanitize_empty_json_list(value):
10 |     return '[]'
11 | 
12 | 
def sanitize_invalid_django_password(value):
    """
    Replaces any value with a single exclamation mark.
    """
    return '!'
15 | 


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/derived.py:
--------------------------------------------------------------------------------
 1 | import uuid
 2 | 
 3 | from database_sanitizer.session import hash_text
 4 | 
 5 | NIL_UUID = '00000000-0000-0000-0000-000000000000'
 6 | NIL_UUID_WITHOUT_DASHES = NIL_UUID.replace('-', '')
 7 | 
 8 | 
 9 | def sanitize_uuid4(value):
10 |     if not value:
11 |         return value
12 |     if value.replace('-', '') == NIL_UUID_WITHOUT_DASHES:
13 |         return NIL_UUID
14 |     return str(uuid.UUID(hash_text(value)[:32], version=4))
15 | 


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/string.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import absolute_import, unicode_literals
 4 | 
 5 | import random
 6 | import string
 7 | 
#: Alphabet from which `sanitize_random` picks its replacement characters:
#: ASCII letters of both cases followed by the decimal digits.
CHARACTERS = string.ascii_letters + string.digits
 9 | 
10 | 
def sanitize_empty(value):
    """
    Built-in sanitizer which replaces the original value with empty string,
    unless the original value is None, which is kept as None.
    """
    if value is None:
        return None
    return ""
16 | 
17 | 
def sanitize_zfill(value):
    """
    Built-in sanitizer which replaces the original value with as many zeros
    as the original value has characters. None is kept as None.
    """
    if value is None:
        return None
    return "0" * len(value)
23 | 
24 | 
def sanitize_random(value):
    """
    Built-in sanitizer which replaces the original value with a random
    string of the same length. Empty values are returned as they are.
    """
    if value:
        length = len(value)
        picked = [random.choice(CHARACTERS) for _ in range(length)]
        return ''.join(picked)
    return value
32 | 


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/times.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import random
 3 | 
# Number of seconds in ten years of 365 days each (leap days are ignored).
TEN_YEARS_AS_SECONDS = 10 * 365 * 24 * 3600
 5 | 
 6 | 
 7 | def sanitize_random_past_timestamp(value):
 8 |     num = random.randint(0, TEN_YEARS_AS_SECONDS * 1000)
 9 |     delta = datetime.timedelta(seconds=(num / 1000.0))
10 |     dt = datetime.datetime.now() - delta
11 |     return dt.isoformat()
12 | 


--------------------------------------------------------------------------------
/database_sanitizer/sanitizers/user.py:
--------------------------------------------------------------------------------
  1 | from __future__ import unicode_literals
  2 | 
  3 | from six import text_type
  4 | 
  5 | from database_sanitizer.session import hash_text_to_int, hash_text_to_ints
  6 | 
  7 | 
  8 | def sanitize_email(value):
  9 |     if not value:
 10 |         return value
 11 |     (num1, num2, num3) = hash_text_to_ints(value.strip(), [16, 16, 32])
 12 |     given_name = given_names[num1 % given_names_count]
 13 |     surname = surnames[num2 % surnames_count]
 14 |     case_convert = (text_type.lower if num3 % 8 > 0 else lambda x: x)
 15 |     return '{first}.{last}@x{num:x}.sanitized.net'.format(
 16 |         first=case_convert(given_name),
 17 |         last=case_convert(surname).replace("'", ''),
 18 |         num=num3)
 19 | 
 20 | 
 21 | def sanitize_username(value):
 22 |     if not value:
 23 |         return value
 24 |     (num1, num2) = hash_text_to_ints(value, [16, 32])
 25 |     return '{}{:x}'.format(given_names[num1 % given_names_count].lower(), num2)
 26 | 
 27 | 
 28 | def sanitize_full_name_en_gb(value):
 29 |     if not value:
 30 |         return value
 31 |     (num1, num2) = hash_text_to_ints(value.strip().lower(), [16, 16])
 32 |     return '{} {}'.format(
 33 |         given_names[num1 % given_names_count], surnames[num2 % surnames_count])
 34 | 
 35 | 
 36 | def sanitize_given_name_en_gb(value):
 37 |     if not value:
 38 |         return value
 39 |     num = hash_text_to_int(value.strip().lower())
 40 |     return given_names[num % given_names_count]
 41 | 
 42 | 
 43 | def sanitize_surname_en_gb(value):
 44 |     if not value:
 45 |         return value
 46 |     num = hash_text_to_int(value.strip().lower())
 47 |     return surnames[num % surnames_count]
 48 | 
 49 | 
# Given names from which the sanitizers of this module pick replacements.
# The whitespace separated text is split into a list of names.
given_names = """
Aaron Abbie Abdul Abigail Adam Adrian Aimee Alan Albert Alex
Alexander Alexandra Alice Alison Allan Amanda Amber Amelia Amy Andrea
Andrew Angela Ann Anna Anne Annette Anthony Antony Arthur Ashleigh
Ashley Barbara Barry Ben Benjamin Bernard Beth Bethan Bethany Beverley
Billy Bradley Brandon Brenda Brett Brian Bruce Bryan Callum Cameron Carl
Carly Carol Carole Caroline Carolyn Catherine Charlene Charles Charlie
Charlotte Chelsea Cheryl Chloe Christian Christine Christopher Claire
Clare Clifford Clive Colin Connor Conor Craig Dale Damian Damien Daniel
Danielle Danny Darren David Dawn Dean Deborah Debra Declan Denis Denise
Dennis Derek Diana Diane Dominic Donald Donna Dorothy Douglas Duncan
Dylan Edward Eileen Elaine Eleanor Elizabeth Ellie Elliot Elliott Emily
Emma Eric Fiona Frances Francesca Francis Frank Frederick Gail Gareth
Garry Gary Gavin Gemma Geoffrey George Georgia Georgina Gerald Geraldine
Gerard Gillian Glen Glenn Gordon Grace Graeme Graham Gregory Guy Hannah
Harriet Harry Hayley Hazel Heather Helen Henry Hilary Hollie Holly
Howard Hugh Iain Ian Irene Jack Jacob Jacqueline Jade Jake James Jamie
Jane Janet Janice Jasmine Jason Jay Jayne Jean Jeffrey Jemma Jenna
Jennifer Jeremy Jessica Jill Joan Joanna Joanne Jodie Joe Joel John
Jonathan Jordan Joseph Josephine Josh Joshua Joyce Judith Julia Julian
Julie June Justin Karen Karl Kate Katherine Kathleen Kathryn Katie Katy
Kayleigh Keith Kelly Kenneth Kerry Kevin Kieran Kim Kimberley Kirsty
Kyle Laura Lauren Lawrence Leah Leanne Lee Leigh Leon Leonard Lesley
Leslie Lewis Liam Linda Lindsey Lisa Lorraine Louis Louise Lucy Luke
Lydia Lynda Lynn Lynne Malcolm Mandy Marc Marcus Margaret Maria Marian
Marie Marilyn Marion Mark Martin Martyn Mary Mathew Matthew Maureen
Maurice Max Megan Melanie Melissa Michael Michelle Mitchell Mohamed
Mohammad Mohammed Molly Naomi Natalie Natasha Nathan Neil Nicholas
Nicola Nicole Nigel Norman Oliver Olivia Owen Paige Pamela Patricia
Patrick Paul Paula Pauline Peter Philip Phillip Rachael Rachel Raymond
Rebecca Reece Rhys Richard Ricky Rita Robert Robin Roger Ronald Rosemary
Rosie Ross Roy Russell Ruth Ryan Sally Sam Samantha Samuel Sandra Sara
Sarah Scott Sean Shane Shannon Sharon Shaun Sheila Shirley Sian Simon
Sophie Stacey Stanley Stephanie Stephen Steven Stewart Stuart Susan
Suzanne Sylvia Terence Teresa Terry Thomas Timothy Tina Toby Tom Tony
Tracey Tracy Trevor Valerie Vanessa Victor Victoria Vincent Wayne Wendy
William Yvonne Zoe
""".strip().split()
 88 | 
 89 | 
# Surnames from which the sanitizers of this module pick replacements.
# The whitespace separated text is split into a list of names. Some of the
# names contain an apostrophe, which `sanitize_email` removes.
surnames = """
Abbott Adams Ahmed Akhtar Alexander Ali Allan Allen Anderson Andrews
Archer Armstrong Arnold Ashton Atkins Atkinson Austin Bailey Baker
Baldwin Ball Banks Barber Barker Barlow Barnes Barnett Barrett Barry
Bartlett Barton Bates Baxter Begum Bell Bennett Benson Bentley Berry
Bevan Bibi Birch Bird Bishop Black Blackburn Bolton Bond Booth Bowen
Boyle Bradley Bradshaw Brady Bray Brennan Briggs Brookes Brooks Brown
Browne Bruce Bryan Bryant Bull Burgess Burke Burns Burrows Burton
Butcher Butler Byrne Cameron Campbell Carey Carpenter Carr Carroll
Carter Cartwright Chadwick Chambers Chan Chandler Chapman Charlton Clark
Clarke Clayton Clements Coates Cole Coleman Coles Collier Collins
Connolly Connor Conway Cook Cooke Cooper Cox Craig Crawford Cross
Cunningham Curtis Dale Daly Daniels Davey Davidson Davies Davis Davison
Dawson Day Dean Dennis Dickinson Dixon Dobson Dodd Doherty Donnelly
Douglas Doyle Duffy Duncan Dunn Dyer Edwards Elliott Ellis Evans Farmer
Farrell Faulkner Ferguson Field Finch Fisher Fitzgerald Fleming Fletcher
Flynn Ford Forster Foster Fowler Fox Francis Franklin Fraser Freeman
French Frost Fry Fuller Gallagher Gardiner Gardner Garner George Gibbons
Gibbs Gibson Gilbert Giles Gill Glover Goddard Godfrey Goodwin Gordon
Gough Gould Graham Grant Gray Green Greenwood Gregory Griffin Griffiths
Hale Hall Hamilton Hammond Hancock Hanson Harding Hardy Hargreaves
Harper Harris Harrison Hart Hartley Harvey Hawkins Hayes Haynes Hayward
Heath Henderson Henry Herbert Hewitt Hicks Higgins Hill Hilton Hodgson
Holden Holland Holloway Holmes Holt Hooper Hope Hopkins Horton Houghton
Howard Howarth Howe Howell Howells Hudson Hughes Humphreys Humphries
Hunt Hunter Hurst Hussain Hutchinson Hyde Ingram Iqbal Jackson James
Jarvis Jenkins Jennings John Johnson Johnston Jones Jordan Joyce Kaur
Kay Kelly Kemp Kennedy Kent Kerr Khan King Kirby Kirk Knight Knowles
Lamb Lambert Lane Law Lawrence Lawson Leach Lee Lees Leonard Lewis
Little Lloyd Long Lord Lowe Lucas Lynch Lyons Macdonald Mahmood Mann
Manning Marsden Marsh Marshall Martin Mason Matthews May McCarthy
McDonald McKenzie McLean Mellor Metcalfe Miah Middleton Miles Miller
Mills Mistry Mitchell Moore Moran Morgan Morley Morris Morrison Morton
Moss Murphy Murray Myers Nash Naylor Nelson Newman Newton Nicholls
Nicholson Nixon Noble Nolan Norman Norris North Norton O'Blake O'Buckley
O'Chamberlain O'Hobbs O'Thompson Oliver Osborne Owen Owens Page Palmer
Parker Parkes Parkin Parkinson Parry Parsons Patel Patterson Payne
Peacock Pearce Pearson Perkins Perry Peters Phillips Pickering Pollard
Poole Pope Porter Potter Potts Powell Power Pratt Preston Price
Pritchard Pugh Quinn Rahman Randall Read Reed Rees Reeves Reid Reynolds
Rhodes Rice Richards Richardson Riley Roberts Robertson Robinson Robson
Rogers Rose Ross Rowe Rowley Russell Ryan Sanders Sanderson Saunders
Savage Schofield Scott Shah Sharp Sharpe Shaw Shepherd Sheppard Short
Simmons Simpson Sims Sinclair Singh Skinner Slater Smart Smith Spencer
Stanley Steele Stephens Stephenson Stevens Stevenson Stewart Stokes
Stone Storey Sullivan Summers Sutton Swift Sykes Talbot Taylor Thomas
Thomson Thornton Thorpe Todd Tomlinson Townsend Tucker Turnbull Turner
Tyler Vaughan Vincent Wade Walker Wall Wallace Wallis Walsh Walters
Walton Ward Warner Warren Waters Watkins Watson Watts Webb Webster Welch
Wells West Weston Wheeler White Whitehead Whitehouse Whittaker Wilkins
Wilkinson Williams Williamson Willis Wilson Winter Wong Wood Woods
Woodward Wright Wyatt Yates Young
""".strip().split()
143 | 
# Lengths of the name lists, used as the modulus when a hash derived number
# is turned into an index of a name.
given_names_count = len(given_names)
surnames_count = len(surnames)
146 | 


--------------------------------------------------------------------------------
/database_sanitizer/session.py:
--------------------------------------------------------------------------------
  1 | """
  2 | API to sanitation session.
  3 | 
  4 | Sanitation session allows having a state within a single sanitation
  5 | process.
  6 | 
One important thing stored in the session is a secret key. A new random
value is generated for each sanitation session, but it stays constant
during the whole sanitation process. Its value is never revealed, which
makes it possible to generate one-way hashes that cannot be reproduced
afterwards. I.e. during the sanitation session it's possible to do
``hash(C) -> H`` for any clear text C, but it is not possible to check
whether H is the hashed value of C after the sanitation session has
ended.
 15 | """
 16 | 
 17 | import hashlib
 18 | import hmac
 19 | import random
 20 | import sys
 21 | import threading
 22 | 
 23 | from six import int2byte
 24 | 
 25 | if sys.version_info >= (3, 6):
 26 |     from typing import Callable, Optional, Sequence  # noqa
 27 | 
 28 | 
#: Length of the generated session secret key, in bits.
SECRET_KEY_BITS = 128


#: Thread-local storage which holds the `secret_key` of the session.
_thread_local_storage = threading.local()
 33 | 
 34 | 
 35 | def hash_text_to_int(value, bit_length=32):
 36 |     # type: (str, int) -> int
 37 |     """
 38 |     Hash a text value to an integer.
 39 | 
 40 |     Generates an integer number based on the hash derived with
 41 |     `hash_text` from the given text value.
 42 | 
 43 |     :param bit_length: Number of bits to use from the hash value.
 44 |     :return: Integer value within ``0 <= result < 2**bit_length``
 45 |     """
 46 |     hash_value = hash_text(value)
 47 |     return int(hash_value[0:(bit_length // 4)], 16)
 48 | 
 49 | 
 50 | def hash_text_to_ints(value, bit_lengths=(16, 16, 16, 16)):
 51 |     # type: (str, Sequence[int]) -> Sequence[int]
 52 |     """
 53 |     Hash a text value to a sequence of integers.
 54 | 
 55 |     Generates a sequence of integer values with given bit-lengths
 56 |     similarly to `hash_text_to_int`, but allowing generating many
 57 |     separate numbers with a single call.
 58 | 
 59 |     :param bit_lengths:
 60 |       Tuple of bit lengths for the resulting integers.  Defines also the
 61 |       length of the result tuple.
 62 |     :return:
 63 |       Tuple of ``n`` integers ``(R_1, ... R_n)`` with the requested
 64 |       bit-lengths ``(L_1, ..., L_n)`` and values ranging within
 65 |       ``0 <= R_i < 2**L_i`` for each ``i``.
 66 |     """
 67 |     hash_value = hash_text(value)
 68 |     hex_lengths = [x // 4 for x in bit_lengths]
 69 |     hex_ranges = (
 70 |         (sum(hex_lengths[0:i]), sum(hex_lengths[0:(i + 1)]))
 71 |         for i in range(len(hex_lengths)))
 72 |     return tuple(int(hash_value[a:b], 16) for (a, b) in hex_ranges)
 73 | 
 74 | 
 75 | def hash_text(value, hasher=hashlib.sha256, encoding='utf-8'):
 76 |     # type: (str, Callable, str) -> str
 77 |     """
 78 |     Generate a hash for a text value.
 79 | 
 80 |     The hash will be generated by encoding the text to bytes with given
 81 |     encoding and then generating a hash with HMAC using the session
 82 |     secret as the key and the given hash function.
 83 | 
 84 |     :param value: Text value to hash
 85 |     :param hasher: Hash function to use, SHA256 by default
 86 |     :param encoding: Encoding to use, UTF-8 by default
 87 |     :return: Hexadecimal presentation of the hash as a string
 88 |     """
 89 |     return hash_bytes(value.encode(encoding), hasher)
 90 | 
 91 | 
 92 | def hash_bytes(value, hasher=hashlib.sha256):
 93 |     # type: (bytes, Callable) -> str
 94 |     """
 95 |     Generate a hash for a bytes value.
 96 | 
 97 |     The hash will be generated by generating a hash with HMAC using the
 98 |     session secret as the key and the given hash function.
 99 | 
100 |     :param value: Bytes value to hash
101 |     :param hasher: Hash function to use.
102 |     :return: Hexadecimal presentation of the hash as a string
103 |     """
104 |     return hmac.new(get_secret(), value, hasher).hexdigest()
105 | 
106 | 
def get_secret():
    # type: () -> bytes
    """
    Get session specific secret key.

    A new key is generated first if the current thread has no key, or has
    an empty one.

    :return: Session key as bytes
    """
    current = getattr(_thread_local_storage, 'secret_key', None)
    if current:
        return current
    _initialize_session()
    return _thread_local_storage.secret_key  # type: ignore
117 | 
118 | 
def reset(secret_key=None):
    # type: (Optional[bytes]) -> None
    """
    Reset the session.

    Called without arguments, this clears the secret key of the current
    thread, so that a new one gets generated the next time it is needed.
    That way a sanitation process does not share its key with an earlier
    one which ran on the same thread.

    This may also be used to set a predefined value for the secret key.

    :param secret_key:
      Value to set as the new session secret key or None if a new one
      should be generated as soon as one is needed.
    """
    setattr(_thread_local_storage, 'secret_key', secret_key)
136 | 
137 | 
def _initialize_session():
    # type: () -> None
    """
    Generate a new session key and store it to thread local storage.

    The key consists of ``SECRET_KEY_BITS // 8`` bytes drawn from
    `random.SystemRandom`.
    """
    system_random = random.SystemRandom()
    key_length = SECRET_KEY_BITS // 8
    key_bytes = [
        int2byte(system_random.randint(0, 255)) for _ in range(key_length)
    ]
    _thread_local_storage.secret_key = b''.join(key_bytes)
147 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/tests/__init__.py


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from collections import namedtuple
  4 | 
  5 | import mock
  6 | import pytest
  7 | 
  8 | from .. import config
  9 | from ..config import Configuration, ConfigurationError
 10 | 
 11 | 
 12 | @mock.patch.object(config, 'open')
 13 | @mock.patch('yaml.safe_load')
 14 | def test_from_file(mocked_yaml_load, mocked_open):
 15 |     mocked_yaml_load.return_value = {}
 16 | 
 17 |     Configuration.from_file('filename.yml')
 18 | 
 19 |     assert mocked_open.call_args == (('filename.yml', 'rb'), {})
 20 |     opened_file = mocked_open.return_value.__enter__.return_value
 21 |     assert mocked_yaml_load.call_args == ((opened_file,), {})
 22 | 
 23 | 
 24 | def test_load_config_data_must_be_dict():
 25 |     config = Configuration()
 26 |     config.load({})
 27 |     with pytest.raises(ConfigurationError):
 28 |         config.load(config_data="test")
 29 | 
 30 | 
 31 | def test_load_dump_extra_parameters():
 32 |     config = Configuration()
 33 | 
 34 |     config.load_dump_extra_parameters({})
 35 |     assert config.mysqldump_params == ["--single-transaction"]
 36 |     assert config.pg_dump_params == []
 37 | 
 38 |     with pytest.raises(ConfigurationError):
 39 |         config.load_dump_extra_parameters({"config": "test"})
 40 | 
 41 |     config.load_dump_extra_parameters({"config": {}})
 42 |     assert config.mysqldump_params == ["--single-transaction"]
 43 |     assert config.pg_dump_params == []
 44 | 
 45 |     with pytest.raises(ConfigurationError):
 46 |         config.load_dump_extra_parameters({"config": {
 47 |             "extra_parameters": "test"
 48 |         }})
 49 | 
 50 |     with pytest.raises(ConfigurationError):
 51 |         config.load_dump_extra_parameters({"config": {
 52 |             "extra_parameters": [True]
 53 |         }})
 54 | 
 55 |     with pytest.raises(ConfigurationError):
 56 |         config.load_dump_extra_parameters({"config": {
 57 |             "extra_parameters": {
 58 |                 "mysqldump": "hernekeitto",
 59 |             },
 60 |         }})
 61 | 
 62 |     with pytest.raises(ConfigurationError):
 63 |         config.load_dump_extra_parameters({"config": {
 64 |             "extra_parameters": {
 65 |                 "pg_dump": "viina",
 66 |             },
 67 |         }})
 68 | 
 69 |     config.load_dump_extra_parameters({"config": {
 70 |         "extra_parameters": {
 71 |             "mysqldump": ["--double-transaction"],
 72 |             "pg_dump": ["--exclude-table=something"],
 73 |         },
 74 |     }})
 75 |     assert config.mysqldump_params == ["--double-transaction"]
 76 |     assert config.pg_dump_params == ["--exclude-table=something"]
 77 | 
 78 | 
 79 | def test_load_addon_packages():
 80 |     config = Configuration()
 81 |     # No "config" section: no add-on packages.
 82 |     config.load_addon_packages({})
 83 |     assert config.addon_packages == []
 84 |     # "config" must not be a plain string.
 85 |     with pytest.raises(ConfigurationError):
 86 |         config.load_addon_packages({"config": "test"})
 87 |     # An empty "config" mapping is fine and adds nothing.
 88 |     config.load_addon_packages({"config": {}})
 89 |     assert config.addon_packages == []
 90 |     # "addons" must not be a plain string ...
 91 |     with pytest.raises(ConfigurationError):
 92 |         config.load_addon_packages({"config": {"addons": "test"}})
 93 |     # ... and a list holding a non-string (True) is rejected too.
 94 |     with pytest.raises(ConfigurationError):
 95 |         config.load_addon_packages({"config": {"addons": [True]}})
 96 |     # A list of names is stored in the given order.
 97 |     config.load_addon_packages({"config": {
 98 |         "addons": [
 99 |             "test1",
100 |             "test2",
101 |             "test3",
102 |         ],
103 |     }})
104 |     assert config.addon_packages == ["test1", "test2", "test3"]
105 | 
106 | 
107 | def test_load_sanitizers():
108 |     config = Configuration()
109 |     # "strategy" must not be a plain string.
110 |     with pytest.raises(ConfigurationError):
111 |         config.load_sanitizers({"strategy": "test"})
112 |     # A table entry set to the string "test" is rejected as well.
113 |     with pytest.raises(ConfigurationError):
114 |         config.load_sanitizers({"strategy": {"test": "test"}})
115 |     # Stub for find_sanitizer: every name resolves to an identity function.
116 |     def mock_find_sanitizer(*args):
117 |         return lambda value: value
118 |     # The sanitizer lookup is stubbed for the remaining checks.
119 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer",
120 |                     side_effect=mock_find_sanitizer):
121 |         with pytest.raises(ConfigurationError):
122 |             config.load_sanitizers({"strategy": {"table1": {"column1": True}}})
123 |         # A column may be a sanitizer name or None; a table may be None.
124 |         config.load_sanitizers({"strategy": {
125 |             "table1": {
126 |                 "column1": None,
127 |                 "column2": "test.test",
128 |             },
129 |             "table2": {
130 |                 "column1": "test.test",
131 |             },
132 |             "table3": None,
133 |         }})
134 |     # Only the columns that name a sanitizer are registered.
135 |     assert "table1.column1" not in config.sanitizers
136 |     assert "table1.column2" in config.sanitizers
137 |     assert "table2.column1" in config.sanitizers
138 | 
139 | 
140 | def test_table_skip_rows_configuration():
141 |     config = Configuration()
142 |     # "strategy" must not be a plain string.
143 |     with pytest.raises(ConfigurationError):
144 |         config.load_sanitizers({"strategy": "test"})
145 |     # Stub for find_sanitizer: every name resolves to an identity function.
146 |     def mock_find_sanitizer(*args):
147 |         return lambda value: value
148 |     # Load a strategy with the sanitizer lookup stubbed out.
149 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer",
150 |                     side_effect=mock_find_sanitizer):
151 |         # "skip_rows" as a table value lists the table for row skipping.
152 |         config.load_sanitizers({"strategy": {
153 |             "table1": "skip_rows",
154 |             "table2": {
155 |                 "column1": "test",
156 |             }
157 |         }})
158 |     # table2 gets its column sanitizer; table1 is in skip_rows_for_tables.
159 |     assert "table2.column1" in config.sanitizers
160 |     assert "table1" in config.skip_rows_for_tables
161 | 
162 | 
163 | def test_find_sanitizer():
164 |     config = Configuration()
165 |     # The bare name "test" is rejected with ConfigurationError.
166 |     with pytest.raises(ConfigurationError):
167 |         config.find_sanitizer("test")
168 |     # Stub 1: "test.test" must be queried as sanitizers.test / sanitize_test.
169 |     def mock_find_sanitizer_from_module1(module_name, function_name):
170 |         assert module_name == "sanitizers.test"
171 |         assert function_name == "sanitize_test"
172 |         return lambda value: value
173 |     # With stub 1 in place the lookup succeeds.
174 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
175 |                     side_effect=mock_find_sanitizer_from_module1):
176 |         assert config.find_sanitizer("test.test") is not None
177 |     # Stub 2: only module names starting with "addon." provide the function.
178 |     def mock_find_sanitizer_from_module2(module_name, function_name):
179 |         assert module_name in ("sanitizers.test", "addon.test")
180 |         assert function_name == "sanitize_test"
181 |         if module_name.startswith("addon."):
182 |             return lambda value: value
183 |         else:
184 |             return None
185 |     # "addon" is registered as an add-on package before the lookup.
186 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
187 |                     side_effect=mock_find_sanitizer_from_module2):
188 |         config.addon_packages = ("addon",)
189 |         assert config.find_sanitizer("test.test") is not None
190 |     # Stub 3: only database_sanitizer.sanitizers.test provides the function.
191 |     def mock_find_sanitizer_from_module3(module_name, function_name):
192 |         assert module_name in (
193 |             "sanitizers.test",
194 |             "addon.test",
195 |             "database_sanitizer.sanitizers.test",
196 |         )
197 |         assert function_name == "sanitize_test"
198 |         if module_name.startswith("database_sanitizer."):
199 |             return lambda value: value
200 |         else:
201 |             return None
202 |     # Succeeds only if database_sanitizer.sanitizers.test gets queried.
203 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
204 |                     side_effect=mock_find_sanitizer_from_module3):
205 |         assert config.find_sanitizer("test.test") is not None
206 |     # Stub 4: no module provides the function.
207 |     def mock_find_sanitizer_from_module4(module_name, function_name):
208 |         return None
209 |     # When every queried module returns None, ConfigurationError is raised.
210 |     with mock.patch("database_sanitizer.config.Configuration.find_sanitizer_from_module",
211 |                     side_effect=mock_find_sanitizer_from_module4):
212 |         with pytest.raises(ConfigurationError):
213 |             config.find_sanitizer("test.test")
214 | 
215 | 
216 | def test_find_sanitizer_from_module():
217 |     def mock_import1(module_name):
218 |         assert module_name == "test"
219 |         raise ImportError("Should be catched")
220 |     # Case 1: an ImportError from import_module results in None.
221 |     with mock.patch("importlib.import_module", side_effect=mock_import1):
222 |         assert Configuration.find_sanitizer_from_module("test", "test") is None
223 |     # Stand-in for a module object with one attribute called "test".
224 |     mock_module_type = namedtuple("mock_module", ("test",))
225 |     # Case 2: the attribute exists but is None ...
226 |     def mock_import2(module_name):
227 |         assert module_name == "test"
228 |         return mock_module_type(test=None)
229 |     # ... and the lookup returns None.
230 |     with mock.patch("importlib.import_module", side_effect=mock_import2):
231 |         assert Configuration.find_sanitizer_from_module("test", "test") is None
232 |     # Case 3: the attribute is a callable ...
233 |     def mock_import3(module_name):
234 |         assert module_name == "test"
235 |         return mock_module_type(test=lambda value: value)
236 |     # ... and the lookup returns something other than None.
237 |     with mock.patch("importlib.import_module", side_effect=mock_import3):
238 |         assert Configuration.find_sanitizer_from_module("test", "test") is not None
239 |     # Case 4: the attribute is a string, i.e. not callable ...
240 |     def mock_import4(module_name):
241 |         assert module_name == "test"
242 |         return mock_module_type(test="test")
243 |     # ... and the lookup raises ConfigurationError.
244 |     with mock.patch("importlib.import_module", side_effect=mock_import4):
245 |         with pytest.raises(ConfigurationError):
246 |             Configuration.find_sanitizer_from_module("test", "test")
247 | 
248 | 
249 | def test_sanitize():
250 |     config = Configuration()
251 |     config.sanitizers["a.a"] = lambda value: value.upper()
252 |     config.sanitizers["a.b"] = lambda value: value[::-1]
253 |     # a.a and a.b use their registered function; a.c has none -> unchanged.
254 |     assert config.sanitize("a", "a", "test") == "TEST"
255 |     assert config.sanitize("a", "b", "test") == "tset"
256 |     assert config.sanitize("a", "c", "test") == "test"
257 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_dump.py:
--------------------------------------------------------------------------------
  1 | import subprocess
  2 | from io import BytesIO, StringIO
  3 | 
  4 | import mock
  5 | import pytest
  6 | 
  7 | from database_sanitizer import dump
  8 | from database_sanitizer.config import Configuration
  9 | 
 10 | EXPECTED_POPEN_KWARGS = {  # expected subprocess.Popen call, keyed by URL
 11 |     'mysql://User:Pass@HostName/Db': {
 12 |         'args': (
 13 |             'mysqldump --complete-insert --extended-insert'
 14 |             ' --net_buffer_length=10240 -h hostname -u User Db'
 15 |             ' --single-transaction'
 16 |         ).split(),
 17 |         'env': {'MYSQL_PWD': 'Pass'},
 18 |         'stdout': subprocess.PIPE,
 19 |     },
 20 |     'postgres:///Db': {
 21 |         'args': tuple((
 22 |             'pg_dump --encoding=utf-8 --quote-all-identifiers'
 23 |             ' --dbname postgres:///Db').split()),
 24 |         'stdout': subprocess.PIPE,
 25 |     },
 26 | }
 27 | # Both alias URLs expect the same call with --dbname postgresql:///Db.
 28 | for url in ['postgresql:///Db', 'postgis:///Db']:
 29 |     EXPECTED_POPEN_KWARGS[url] = EXPECTED_POPEN_KWARGS['postgres:///Db'].copy()
 30 |     EXPECTED_POPEN_KWARGS[url]['args'] = tuple(
 31 |         ' '.join(EXPECTED_POPEN_KWARGS[url]['args'])
 32 |         .replace('postgres', 'postgresql').split())
 33 | 
 34 | 
 35 | @pytest.mark.parametrize('url', list(EXPECTED_POPEN_KWARGS))
 36 | @mock.patch('subprocess.Popen')
 37 | def test_run(mocked_popen, url):
 38 |     mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
 39 |     output = StringIO()
 40 |     config = None
 41 |     dump.run(url, output, config)
 42 | 
 43 |     expected_popen_kwargs = EXPECTED_POPEN_KWARGS[url]
 44 |     (popen_args, popen_kwargs) = mocked_popen.call_args
 45 |     expected_popen_args = (
 46 |         (expected_popen_kwargs.pop('args'),) if popen_args else ())
 47 |     assert popen_args == expected_popen_args
 48 |     assert popen_kwargs == expected_popen_kwargs
 49 | 
 50 | 
 51 | @mock.patch('subprocess.Popen')
 52 | def test_run_with_mysql_extra_params(mocked_popen):
 53 |     mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
 54 |     output = StringIO()
 55 |     # Configure one extra mysqldump argument.
 56 |     url = "mysql://User:Pass@HostName/Db"
 57 |     config = Configuration()
 58 |     config.load({
 59 |         "config": {
 60 |             "extra_parameters": {
 61 |                 "mysqldump": ["--double-transaction"]
 62 |             }
 63 |         }
 64 |     })
 65 |     # Run the dump; subprocess.Popen is the mock injected by the decorator.
 66 |     dump.run(url, output, config)
 67 |     # The expected argv ends with the configured argument.
 68 |     expected = {
 69 |         'args': (
 70 |             'mysqldump --complete-insert --extended-insert'
 71 |             ' --net_buffer_length=10240 -h hostname -u User Db'
 72 |             ' --double-transaction'
 73 |         ).split(),
 74 |         'env': {'MYSQL_PWD': 'Pass'},
 75 |         'stdout': subprocess.PIPE,
 76 |     }
 77 |     # Accept argv passed either positionally or as the args= keyword.
 78 |     (popen_args, popen_kwargs) = mocked_popen.call_args
 79 |     expected_popen_args = (
 80 |         (expected.pop('args'),) if popen_args else ())
 81 |     assert popen_args == expected_popen_args
 82 |     assert popen_kwargs == expected
 83 | 
 84 | 
 85 | @mock.patch('subprocess.Popen')
 86 | def test_run_with_pg_dump_extra_params(mocked_popen):
 87 |     mocked_popen.return_value.stdout = BytesIO(b'INPUT DUMP')
 88 |     output = StringIO()
 89 |     # Configure one extra pg_dump argument.
 90 |     url = "postgres:///Db"
 91 |     config = Configuration()
 92 |     config.load({
 93 |         "config": {
 94 |             "extra_parameters": {
 95 |                 "pg_dump": ["--exclude-table=something"]
 96 |             }
 97 |         }
 98 |     })
 99 |     # Run the dump; subprocess.Popen is the mock injected by the decorator.
100 |     dump.run(url, output, config)
101 |     # The expected argv ends with the configured argument.
102 |     expected = {
103 |         'args': tuple((
104 |             'pg_dump --encoding=utf-8 --quote-all-identifiers'
105 |             ' --dbname postgres:///Db'
106 |             ' --exclude-table=something'
107 |         ).split()),
108 |         'stdout': subprocess.PIPE,
109 |     }
110 |     # Accept argv passed either positionally or as the args= keyword.
111 |     (popen_args, popen_kwargs) = mocked_popen.call_args
112 |     expected_popen_args = (
113 |         (expected.pop('args'),) if popen_args else ())
114 |     assert popen_args == expected_popen_args
115 |     assert popen_kwargs == expected
116 | 
117 | 
118 | @mock.patch('subprocess.Popen')
119 | def test_run_unknown_scheme(mocked_popen):
120 |     with pytest.raises(ValueError) as excinfo:
121 |         dump.run('unknown:///db', None, None)
122 |     assert str(excinfo.value) == "Unsupported database scheme: 'unknown'"
123 |     mocked_popen.assert_not_called()  # no subprocess was spawned
124 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_dump_mysql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import io
  6 | 
  7 | import pytest
  8 | from six.moves.urllib import parse as urlparse
  9 | 
 10 | from ..config import Configuration
 11 | from ..dump.mysql import (
 12 |     parse_column_names,
 13 |     parse_values,
 14 |     sanitize,
 15 |     sanitize_from_stream,
 16 | )
 17 | 
 18 | MOCK_MYSQLDUMP_OUTPUT = b"""
 19 | --- Fake MySQL database dump
 20 | 
 21 | DROP TABLE IF EXISTS `test`;
 22 | 
 23 | CREATE TABLE `test` (
 24 | `id` int(11) NOT NULL AUTO_INCREMENT,
 25 | `created_at` date NOT NULL,
 26 | `notes` varchar(255) NOT NULL,
 27 | PRIMARY KEY (`id`)
 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
 29 | 
 30 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \
 31 | (1,'2018-01-01','Test data 1'),\
 32 | (2,'2018-01-02','Test data 2'),\
 33 | (3,'2018-01-03','Test data 3');
 34 | 
 35 | --- Final line after `INSERT INTO` statement.
 36 | """  # backslash-newline joins the INSERT into one physical line
 37 | 
 38 | MOCK_MYSQLDUMP_OUTPUT_WITH_U2028 = b"""
 39 | --- Fake MySQL database dump
 40 | 
 41 | DROP TABLE IF EXISTS `test`;
 42 | 
 43 | CREATE TABLE `test` (
 44 | `id` int(11) NOT NULL AUTO_INCREMENT,
 45 | `created_at` date NOT NULL,
 46 | `notes` varchar(255) NOT NULL,
 47 | PRIMARY KEY (`id`)
 48 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
 49 | 
 50 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \
 51 | (1,'2018-01-01','Test \xe2\x80\xa8 data 1'),\
 52 | (2,'2018-01-02','Test data 2'),\
 53 | (3,'2018-01-03','Test data 3');
 54 | 
 55 | --- Final line after `INSERT INTO` statement.
 56 | """  # row 1 embeds U+2028 (UTF-8 bytes e2 80 a8) in its notes value
 57 | 
 58 | 
 59 | INVALID_MOCK_MYSQLDUMP_OUTPUT = b"""
 60 | --- Fake MySQL database dump
 61 | 
 62 | DROP TABLE IF EXISTS `test`;
 63 | 
 64 | INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES (1),(2),(3);
 65 | 
 66 | --- Final line after `INSERT INTO` statement.
 67 | """  # each row carries one value although three columns are named
 68 | 
 69 | 
 70 | def test_sanitize_wrong_scheme():
 71 |     url = urlparse.urlparse("http://localhost/test")  # non-MySQL scheme
 72 |     with pytest.raises(ValueError):
 73 |         list(sanitize(url, None))
 74 | 
 75 | 
 76 | def test_sanitize_from_stream():
 77 |     stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT)
 78 |     config = Configuration()
 79 |     config.sanitizers["test.notes"] = lambda value: "Sanitized"
 80 |     dump_output_lines = list(sanitize_from_stream(stream, config))
 81 |     # Comment lines pass through; every `notes` value becomes 'Sanitized'.
 82 |     assert "--- Fake MySQL database dump" in dump_output_lines
 83 |     assert "--- Final line after `INSERT INTO` statement." in dump_output_lines
 84 |     assert """INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \
 85 | (1,'2018-01-01','Sanitized'),\
 86 | (2,'2018-01-02','Sanitized'),\
 87 | (3,'2018-01-03','Sanitized');\
 88 | """ in dump_output_lines
 89 | 
 90 | def test_sanitize_with_u2028_from_stream():
 91 |     stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT_WITH_U2028)
 92 |     config = Configuration()
 93 |     config.sanitizers["test.notes"] = lambda value: "Sanitized"
 94 |     dump_output_lines = list(sanitize_from_stream(stream, config))
 95 |     # Row 1 contains U+2028; the INSERT must still come out as one line.
 96 |     assert "--- Fake MySQL database dump" in dump_output_lines
 97 |     assert "--- Final line after `INSERT INTO` statement." in dump_output_lines
 98 |     assert """INSERT INTO `test` (`id`, `created_at`, `notes`) VALUES \
 99 | (1,'2018-01-01','Sanitized'),\
100 | (2,'2018-01-02','Sanitized'),\
101 | (3,'2018-01-03','Sanitized');\
102 | """ in dump_output_lines
103 | 
104 | 
105 | def test_skip_table_rows():
106 |     stream = io.BytesIO(MOCK_MYSQLDUMP_OUTPUT)
107 |     config = Configuration()
108 |     config.skip_rows_for_tables.append('test')
109 |     # Table `test` is now listed for row skipping.
110 |     output = list(sanitize_from_stream(stream, config))
111 |     # Everything but the INSERT statement is emitted unchanged.
112 |     assert output == [
113 |         '',
114 |         '--- Fake MySQL database dump',
115 |         '',
116 |         'DROP TABLE IF EXISTS `test`;',
117 |         '',
118 |         'CREATE TABLE `test` (',
119 |         '`id` int(11) NOT NULL AUTO_INCREMENT,',
120 |         '`created_at` date NOT NULL,',
121 |         '`notes` varchar(255) NOT NULL,',
122 |         'PRIMARY KEY (`id`)',
123 |         ') ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;',
124 |         '',
125 |         '',
126 |         '--- Final line after `INSERT INTO` statement.',
127 |     ]
128 | 
129 | 
130 | def test_sanitizer_invalid_input():
131 |     stream = io.BytesIO(INVALID_MOCK_MYSQLDUMP_OUTPUT)
132 |     config = Configuration()
133 |     config.sanitizers["test.notes"] = lambda value: "Sanitized"
134 |     # The dump's rows hold 1 value for 3 named columns -> ValueError.
135 |     with pytest.raises(ValueError):
136 |         list(sanitize_from_stream(stream, config))
137 | 
138 | 
139 | @pytest.mark.parametrize(
140 |     "text,expected_column_names",
141 |     (
142 |         ("`test`", ("test",)),  # single column
143 |         ("`test`, `test`", ("test", "test")),  # comma + space
144 |         ("`test`,`test`", ("test", "test")),  # comma only
145 |     ),
146 | )
147 | def test_parse_column_names(text, expected_column_names):
148 |     assert parse_column_names(text) == expected_column_names
149 | 
150 | 
151 | @pytest.mark.parametrize(
152 |     "text,expected_values",
153 |     (
154 |         ("('test'),('test')", (("test",), ("test",))),
155 |         ("(1,2),(3,4),", ((1, 2), (3, 4))),  # trailing comma accepted
156 |         ("(TRUE),(FALSE),(NULL)", ((True,), (False,), (None,))),
157 |         ("(x')", ()),  # Invalid data: yields no rows
158 |     ),
159 | )
160 | def test_parse_values(text, expected_values):
161 |     assert tuple(parse_values(text)) == expected_values
162 | 
163 | 
164 | @pytest.mark.parametrize('config_type', [
165 |     'no-config', 'empty-config', 'single-column-config'])
166 | @pytest.mark.parametrize('data_label', ['ok', 'invalid'])
167 | def test_optimizations(config_type, data_label):
168 |     if config_type == 'no-config':
169 |         config = None
170 |         decoder_call_count = 0
171 |     else:
172 |         config = Configuration()
173 |         if config_type == 'empty-config':
174 |             decoder_call_count = 0
175 |         else:
176 |             assert config_type == 'single-column-config'
177 |             config.sanitizers["test.notes"] = (lambda x: x)
178 |             decoder_call_count = 3  # Number of rows in test table
179 |     # NOTE(review): decoder_call_count is assigned above but never checked.
180 |     data = {
181 |         'ok': MOCK_MYSQLDUMP_OUTPUT,
182 |         'invalid': INVALID_MOCK_MYSQLDUMP_OUTPUT,
183 |     }[data_label]
184 |     # Only a configured column sanitizer is expected to trip on bad rows.
185 |     should_raise = (
186 |         config_type == 'single-column-config'
187 |         and data_label == 'invalid')
188 |     # In every other combination the dump must come out line for line.
189 |     dump_stream = io.BytesIO(data)
190 |     if should_raise:
191 |         with pytest.raises(ValueError):
192 |             list(sanitize_from_stream(dump_stream, config))
193 |     else:
194 |         expected_output = data.decode('utf-8').splitlines()
195 |         result = list(sanitize_from_stream(dump_stream, config))
196 |         assert result == expected_output
197 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_dump_postgres.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import io
  6 | from collections import namedtuple
  7 | 
  8 | import mock
  9 | import pytest
 10 | from six.moves.urllib import parse as urlparse
 11 | 
 12 | from ..config import Configuration
 13 | from ..dump import postgres as dump_postgres
 14 | from ..dump.postgres import parse_column_names, parse_values, sanitize
 15 | from ..utils.postgres import decode_copy_value
 16 | 
 17 | MOCK_PG_DUMP_OUTPUT = b"""
 18 | --- Fake PostgreSQL database dump
 19 | 
 20 | COMMENT ON SCHEMA "public" IS 'standard public schema';
 21 | 
 22 | CREATE TABLE "public"."test" (
 23 | "id" integer NOT NULL,
 24 | "created_at" timestamp with time zone NOT NULL,
 25 | "notes" character varying(255) NOT NULL
 26 | );
 27 | 
 28 | COPY "public"."test" ("id", "created_at", "notes") FROM stdin;
 29 | 1\t2018-01-01 00:00:00\tTest data 1
 30 | 2\t2018-01-02 00:00:00\tTest data 2
 31 | 3\t2018-01-03 00:00:00\tTest data 3
 32 | \\.
 33 | 
 34 | --- Final line after `COPY` statement
 35 | """.strip()  # rows hold 3 tab-separated fields; a lone \. line follows
 36 | 
 37 | 
 38 | INVALID_MOCK_PG_DUMP_OUTPUT = b"""
 39 | --- Fake PostgreSQL database dump
 40 | 
 41 | COMMENT ON SCHEMA "public" IS 'standard public schema';
 42 | 
 43 | COPY "public"."test" ("id", "created_at", "notes") FROM stdin;
 44 | 1\t2018-01-01 00:00:00 Test data 1
 45 | 2\t2018-01-02 00:00:00 Test data 2
 46 | 3\t2018-01-03 00:00:00 Test data 3
 47 | \\.
 48 | 
 49 | --- Final line after `COPY` statement
 50 | """.strip()  # rows have 2 tab-separated fields for 3 named columns
 51 | 
 52 | 
 53 | def create_mock_popen(mock_pg_dump_output):
 54 |     def mock_popen(cmd_args, stdout):
 55 |         mock_pipe_type = namedtuple("mock_pipe", ("stdout",))
 56 |         mock_stdout = io.BytesIO(mock_pg_dump_output)
 57 |         return mock_pipe_type(stdout=mock_stdout)
 58 |     return mock_popen
 59 | 
 60 | 
 61 | def test_sanitize():
 62 |     url = urlparse.urlparse("postgres://localhost/test")
 63 |     config = Configuration()
 64 |     config.sanitizers["test.notes"] = lambda value: "Sanitized"
 65 |     # subprocess.Popen is replaced by a stub that replays the fake dump.
 66 |     with mock.patch("subprocess.Popen", side_effect=create_mock_popen(MOCK_PG_DUMP_OUTPUT)):
 67 |         dump_output_lines = list(sanitize(url, config))
 68 |     # Comment lines pass through; the notes field of row 2 is replaced.
 69 |     assert "--- Fake PostgreSQL database dump" in dump_output_lines
 70 |     assert "--- Final line after `COPY` statement" in dump_output_lines
 71 |     assert "2\t2018-01-02 00:00:00\tSanitized" in dump_output_lines
 72 | 
 73 | 
 74 | def test_skip_table_rows():
 75 |     url = urlparse.urlparse("postgres://localhost/test")
 76 |     config = Configuration()
 77 |     config.skip_rows_for_tables.append('test')
 78 |     # Table `test` is now listed for row skipping.
 79 |     with mock.patch("subprocess.Popen",
 80 |                     side_effect=create_mock_popen(MOCK_PG_DUMP_OUTPUT)):
 81 |         output = list(sanitize(url, config))
 82 |     # The COPY statement, its rows and the \. terminator are all absent.
 83 |     assert output == [
 84 |         '--- Fake PostgreSQL database dump',
 85 |         '',
 86 |         'COMMENT ON SCHEMA "public" IS \'standard public schema\';',
 87 |         '',
 88 |         'CREATE TABLE "public"."test" (',
 89 |         '"id" integer NOT NULL,',
 90 |         '"created_at" timestamp with time zone NOT NULL,',
 91 |         '"notes" character varying(255) NOT NULL',
 92 |         ');',
 93 |         '',
 94 |         '',
 95 |         '--- Final line after `COPY` statement'
 96 |     ]
 97 | 
 98 | 
 99 | def test_sanitizer_invalid_input():
100 |     url = urlparse.urlparse("postgres://localhost/test")
101 |     # A sanitizer is registered for a column of the table in the dump.
102 |     config = Configuration()
103 |     config.sanitizers["test.notes"] = lambda value: "Sanitized"
104 |     # The invalid dump's rows have fewer fields than the COPY header names.
105 |     with mock.patch("subprocess.Popen", side_effect=create_mock_popen(INVALID_MOCK_PG_DUMP_OUTPUT)):
106 |         with pytest.raises(ValueError):
107 |             # Yes, we need the list() function there to eat the yields.
108 |             list(sanitize(url, config))
109 | 
110 | 
111 | def test_sanitizer_invalid_scheme():
112 |     url = urlparse.urlparse("http://localhost/test")  # non-Postgres scheme
113 |     with pytest.raises(ValueError):
114 |         list(sanitize(url, None))
115 | 
116 | 
117 | @pytest.mark.parametrize(
118 |     "text,expected_column_names",
119 |     (
120 |         ("\"test\"", ("test",)),  # single column
121 |         ("\"test\",\"test\"", ("test", "test")),  # comma only
122 |         ("\"test\", \"test\"", ("test", "test")),  # comma + space
123 |     )
124 | )
125 | def test_parse_column_names(text, expected_column_names):
126 |     assert parse_column_names(text) == expected_column_names
127 | 
128 | 
129 | @pytest.mark.parametrize(
130 |     "text,expected_values",
131 |     (
132 |         ("Test", ("Test",)),
133 |         ("Test\tTest", ("Test", "Test")),  # tab separates fields
134 |         ("Test\tTest\t", ("Test", "Test", "")),  # trailing tab: empty field
135 |         ("\\N", (None,)),  # backslash-N parses to None
136 |     )
137 | )
138 | def test_parse_values(text, expected_values):
139 |     assert parse_values(text) == expected_values
140 | 
141 | 
142 | @pytest.mark.parametrize('config_type', [
143 |     'no-config', 'empty-config', 'single-column-config'])
144 | @pytest.mark.parametrize('data_label', ['ok', 'invalid'])
145 | def test_optimizations(config_type, data_label):
146 |     if config_type == 'no-config':
147 |         config = None
148 |         decoder_call_count = 0
149 |     else:
150 |         config = Configuration()
151 |         if config_type == 'empty-config':
152 |             decoder_call_count = 0
153 |         else:
154 |             assert config_type == 'single-column-config'
155 |             config.sanitizers["test.notes"] = (lambda x: x)
156 |             decoder_call_count = 3  # Number of rows in test table
157 |     # decoder_call_count: decode_copy_value calls expected when nothing raises.
158 |     data = {
159 |         'ok': MOCK_PG_DUMP_OUTPUT,
160 |         'invalid': INVALID_MOCK_PG_DUMP_OUTPUT,
161 |     }[data_label]
162 |     # Only a configured column sanitizer is expected to trip on bad rows.
163 |     should_raise = (
164 |         config_type == 'single-column-config'
165 |         and data_label == 'invalid')
166 |     # The real decoder is wrapped in a mock so that its calls get counted.
167 |     url = urlparse.urlparse("postgres://localhost/test")
168 |     with mock.patch("subprocess.Popen", side_effect=create_mock_popen(data)):
169 |         with mock.patch.object(dump_postgres, 'decode_copy_value') as decoder:
170 |             decoder.side_effect = decode_copy_value
171 |             if should_raise:
172 |                 with pytest.raises(ValueError):
173 |                     list(sanitize(url, config))
174 |             else:
175 |                 expected_output = data.decode('utf-8').splitlines()
176 |                 assert list(sanitize(url, config)) == expected_output
177 |                 assert decoder.call_count == decoder_call_count
178 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_main.py:
--------------------------------------------------------------------------------
 1 | from __future__ import unicode_literals
 2 | 
 3 | import mock
 4 | import pytest
 5 | import six
 6 | 
 7 | from database_sanitizer import __main__
 8 | 
 9 | main = __main__.main
10 | 
11 | 
12 | @mock.patch.object(__main__, 'run')
13 | def test_main_without_args(mocked_run, capsys):
14 |     with pytest.raises(SystemExit) as excinfo:
15 |         main(['SANI'])
16 |     assert excinfo.value.code == 2
17 |     # Usage and error go to stderr; the wording differs between Python 2 and 3.
18 |     captured = capsys.readouterr()
19 |     assert captured.out == ''
20 |     assert captured.err.splitlines() == [
21 |         'usage: SANI [-h] [--config CONFIG] [--output OUTPUT] url',
22 |         'SANI: error: the following arguments are required: url' if six.PY3
23 |         else 'SANI: error: too few arguments',
24 |     ]
25 |     assert not mocked_run.called
26 | 
27 | 
28 | @mock.patch.object(__main__, 'run')
29 | def test_main_with_url(mocked_run, capsys):
30 |     main(['SANI', 'some://url'])
31 | 
32 |     # Output should be empty
33 |     captured = capsys.readouterr()
34 |     assert captured.out == ''
35 |     assert captured.err == ''
36 | 
37 |     # The run function should have been called with the URL
38 |     (run_call_args, run_call_kwargs) = mocked_run.call_args
39 |     assert run_call_args == ()
40 |     assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'}
41 |     assert run_call_kwargs['config'] is None  # no config option given
42 |     assert run_call_kwargs['url'] == 'some://url'
43 | 
44 | 
45 | @pytest.mark.parametrize('optname', ['-c', '--config'])
46 | @mock.patch.object(__main__, 'run')
47 | @mock.patch.object(__main__, 'Configuration')  # innermost -> first arg
48 | def test_main_with_config(mocked_conf, mocked_run, capsys, optname):
49 |     main(['SANI', optname, 'config_file.yml', 'some://url'])
50 | 
51 |     # Output should be empty
52 |     captured = capsys.readouterr()
53 |     assert captured.out == ''
54 |     assert captured.err == ''
55 | 
56 |     # Configuration should have been created with Configuration.from_file
57 |     (fromfile_args, fromfile_kwargs) = mocked_conf.from_file.call_args
58 |     assert fromfile_args == ('config_file.yml',)
59 |     assert fromfile_kwargs == {}
60 | 
61 |     # The run function should have been called with the config and URL
62 |     (run_call_args, run_call_kwargs) = mocked_run.call_args
63 |     assert run_call_args == ()
64 |     assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'}
65 |     assert run_call_kwargs['config'] == mocked_conf.from_file.return_value
66 |     assert run_call_kwargs['url'] == 'some://url'
67 | 
68 | 
69 | @pytest.mark.parametrize('optname', ['-o', '--output'])
70 | @mock.patch.object(__main__, 'run')
71 | @mock.patch.object(__main__, 'open')  # innermost -> first arg
72 | def test_main_with_output(mocked_open, mocked_run, capsys, optname):
73 |     main(['SANI', optname, 'output_file.sql', 'some://url'])
74 | 
75 |     # Output should be empty
76 |     captured = capsys.readouterr()
77 |     assert captured.out == ''
78 |     assert captured.err == ''
79 | 
80 |     # Output file should have been opened
81 |     (open_args, open_kwargs) = mocked_open.call_args
82 |     assert open_args == ('output_file.sql', 'w')  # opened for writing
83 |     assert open_kwargs == {}
84 | 
85 |     # The run function should have been called with the output and URL
86 |     (run_call_args, run_call_kwargs) = mocked_run.call_args
87 |     assert run_call_args == ()
88 |     assert set(run_call_kwargs.keys()) == {'config', 'output', 'url'}
89 |     assert run_call_kwargs['config'] is None
90 |     assert run_call_kwargs['output'] == mocked_open.return_value
91 |     assert run_call_kwargs['url'] == 'some://url'
92 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_sanitizers_constant.py:
--------------------------------------------------------------------------------
 1 | from database_sanitizer.sanitizers import constant
 2 | 
 3 | 
 4 | def test_sanitize_null():
 5 |     assert constant.sanitize_null(None) is None
 6 |     assert constant.sanitize_null('') is None
 7 |     assert constant.sanitize_null('whatever') is None
 8 |     assert constant.sanitize_null('test') is None
 9 | 
10 | 
11 | def test_sanitize_invalid_django_password():
12 |     assert constant.sanitize_invalid_django_password(None) == '!'
13 |     assert constant.sanitize_invalid_django_password('') == '!'
14 |     assert constant.sanitize_invalid_django_password('whatever') == '!'
15 |     assert constant.sanitize_invalid_django_password('test') == '!'
16 | 
17 | 
18 | def test_sanitize_empty_json_dict():
19 |     assert constant.sanitize_empty_json_dict(None) == '{}'
20 |     assert constant.sanitize_empty_json_dict('') == '{}'
21 |     assert constant.sanitize_empty_json_dict('whatever') == '{}'
22 |     assert constant.sanitize_empty_json_dict('test') == '{}'
23 | 
24 | 
25 | def test_sanitize_empty_json_list():
26 |     assert constant.sanitize_empty_json_list(None) == '[]'
27 |     assert constant.sanitize_empty_json_list('') == '[]'
28 |     assert constant.sanitize_empty_json_list('whatever') == '[]'
29 |     assert constant.sanitize_empty_json_list('test') == '[]'
30 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_sanitizers_derived.py:
--------------------------------------------------------------------------------
 1 | from database_sanitizer import session
 2 | from database_sanitizer.sanitizers import derived
 3 | 
 4 | 
 5 | def setup_module():  # pytest hook: runs once before this module's tests
 6 |     session.reset(b'not-so-secret-key')
 7 | 
 8 | 
 9 | def test_sanitize_uuid4():
10 |     assert derived.sanitize_uuid4(None) is None  # None stays None
11 |     assert derived.sanitize_uuid4('') == ''  # empty string stays empty
12 |     assert derived.sanitize_uuid4('0') == (
13 |         'e3a5862f-cffb-4d89-ab3e-5563b27e287a')
14 |     assert derived.sanitize_uuid4('00000000000000000000000000000000') == (
15 |         '00000000-0000-0000-0000-000000000000')  # dashes get added
16 |     assert derived.sanitize_uuid4('00000000-0000-0000-0000-000000000000') == (
17 |         '00000000-0000-0000-0000-000000000000')
18 |     assert derived.sanitize_uuid4('e3a5862f-cffb-4d89-ab3e-5563b27e287a') == (
19 |         '88b0225e-6090-459a-999d-9b3a3ab28c53')
20 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_sanitizers_string.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import unicode_literals
 4 | 
 5 | import mock
 6 | import pytest
 7 | 
 8 | from ..sanitizers.string import sanitize_empty, sanitize_random, sanitize_zfill
 9 | 
10 | 
@pytest.mark.parametrize(
    "value,expected",
    [
        ("foo", ""),
        ("bar", ""),
        ("", ""),
        ("   ", ""),
        (None, None),
    ],
)
def test_sanitize_empty(value, expected):
    """Strings are expected to become '' while None stays None."""
    sanitized = sanitize_empty(value)
    assert sanitized == expected
23 | 
24 | 
@pytest.mark.parametrize(
    "input_value,expected_output",
    (
        ("foo", "000"),
        ("test test", "000000000"),
        ("", ""),
        (None, None)
    ),
)
def test_sanitize_zfill(input_value, expected_output):
    """Compare sanitize_zfill output with the expected value for each case.

    The comparison has to be asserted: returning its result (as this test
    used to do) is ignored by pytest, so the test could never fail.
    """
    assert sanitize_zfill(input_value) == expected_output
36 | 
37 | 
@mock.patch('random.choice', return_value='x')
def test_sanitize_random(mocked_random_choice):
    """With random.choice pinned to 'x', check the expected outputs."""
    assert sanitize_random(None) is None

    expectations = (
        ('', ''),
        ('a', 'x'),
        ('hello', 'xxxxx'),
        ('hello world', 'xxxxxxxxxxx'),
    )
    for original, sanitized in expectations:
        assert sanitize_random(original) == sanitized
45 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_sanitizers_times.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | 
 3 | import mock
 4 | 
 5 | from database_sanitizer.sanitizers import times
 6 | 
 7 | 
 8 | class _FakeDateTime(datetime.datetime):
 9 |     @staticmethod
10 |     def now():
11 |         return datetime.datetime(2018, 1, 1, 12, 00, 00)
12 | 
13 | 
@mock.patch('random.randint', return_value=42005)
@mock.patch.object(datetime, 'datetime', _FakeDateTime)
def test_sanitize_random_past_timestamp(randint_mock):
    """Check the output with now() frozen and random.randint pinned."""
    sanitized = times.sanitize_random_past_timestamp('old')
    assert sanitized == '2018-01-01T11:59:17.995000'
19 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_sanitizers_user.py:
--------------------------------------------------------------------------------
 1 | from database_sanitizer import session
 2 | from database_sanitizer.sanitizers import user
 3 | 
 4 | 
 5 | def setup_module():
 6 |     session.reset(b'not-so-secret-key')
 7 | 
 8 | 
 9 | def test_sanitize_email():
10 |     assert user.sanitize_email(None) is None
11 |     assert user.sanitize_email('') == ''
12 |     assert user.sanitize_email('test@example.com') == (
13 |         'zoe.burke@xce13103b.sanitized.net')
14 |     assert user.sanitize_email('test2@example.com') == (
15 |         'Melanie.Pratt@x4feb7f40.sanitized.net')
16 |     assert user.sanitize_email('test@example.com') == (
17 |         'zoe.burke@xce13103b.sanitized.net')
18 |     assert user.sanitize_email('test3@example.com') == (
19 |         'irene.archer@x3d2e92ec.sanitized.net')
20 |     assert user.sanitize_email(' test3@example.com  ') == (
21 |         'irene.archer@x3d2e92ec.sanitized.net')
22 | 
23 | 
def test_sanitize_username():
    """Check sanitize_username against fixed outputs for the module's key."""
    assert user.sanitize_username(None) is None

    expectations = (
        ('', ''),
        ('John.Doe', 'billyda979417'),
        ('JaneSmith', 'helena34a7a0b'),
        ('john-smith', 'arthurc5a84ec'),
        # Unlike the e-mail test, a trailing space gives a different result.
        ('john-smith ', 'douglas8d3b8d5e'),
        ('john smith ', 'katyfdab90cc'),
    )
    for username, sanitized in expectations:
        assert user.sanitize_username(username) == sanitized
32 | 
33 | 
def test_sanitize_full_name_en_gb():
    """Check sanitize_full_name_en_gb against fixed expected outputs."""
    assert user.sanitize_full_name_en_gb(None) is None

    expectations = (
        ('', ''),
        ('John Doe', 'Francis Walker'),
        ('Jane Smith', 'Declan Burke'),
        ('John Smith', 'Lawrence Norton'),
        # Different case and trailing space, same expected result.
        ('john smith ', 'Lawrence Norton'),
    )
    for full_name, sanitized in expectations:
        assert user.sanitize_full_name_en_gb(full_name) == sanitized
41 | 
42 | 
def test_sanitize_given_name_en_gb():
    """Check sanitize_given_name_en_gb against fixed expected outputs."""
    assert user.sanitize_given_name_en_gb(None) is None

    expectations = (
        ('', ''),
        ('John', 'Cheryl'),
        ('Jane', 'Andrea'),
        ('Foo bar', 'Elliott'),
        # Different case and surrounding spaces, same expected result.
        ('  Foo BAR ', 'Elliott'),
    )
    for given_name, sanitized in expectations:
        assert user.sanitize_given_name_en_gb(given_name) == sanitized
50 | 
51 | 
def test_sanitize_surname_en_gb():
    """Check sanitize_surname_en_gb against fixed expected outputs."""
    assert user.sanitize_surname_en_gb(None) is None

    expectations = (
        ('', ''),
        ('Doe', 'Bibi'),
        ('Smith', 'Duffy'),
        ('Anderson', 'Hodgson'),
        # Different case and trailing space, same expected result.
        ('andersOn ', 'Hodgson'),
    )
    for surname, sanitized in expectations:
        assert user.sanitize_surname_en_gb(surname) == sanitized
59 | 
60 | 
def test_sanitize_email_resets_on_session_reset():
    """Resetting the session must change the sanitized e-mail address.

    The fixed key is restored in a ``finally`` block so that a failing
    assertion cannot leave a random secret in place for the tests that run
    afterwards.
    """
    with_fixed_key = 'zoe.burke@xce13103b.sanitized.net'
    assert user.sanitize_email('test@example.com') == with_fixed_key

    try:
        # No argument: the session is reset without the fixed key.
        session.reset()
        assert user.sanitize_email('test@example.com') != with_fixed_key
    finally:
        session.reset(b'not-so-secret-key')
68 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_session.py:
--------------------------------------------------------------------------------
 1 | from database_sanitizer import session
 2 | 
 3 | 
 4 | def setup_module():
 5 |     session.reset(b'not-so-secret-key')
 6 | 
 7 | 
 8 | def test_hash_text_to_int():
 9 |     assert session.hash_text_to_int('hello') == 4100462238
10 | 
11 | 
def test_hash_text_to_ints():
    """hash_text_to_ints must give the known tuple for the test key."""
    hashed = session.hash_text_to_ints('hello', [4, 8, 16])
    assert hashed == (15, 70, 33129)
14 | 
15 | 
def test_hash_text():
    """hash_text('hello') must give the known hex digest for the test key."""
    expected_digest = (
        'f468169e17f4dd5d7318bd6099a4e657ceb0a978cddb4f3382be0da7121659bb')
    assert session.hash_text('hello') == expected_digest
19 | 
20 | 
def test_hash_bytes():
    """hash_bytes(b'hello') must give the known hex digest for the test key."""
    expected_digest = (
        'f468169e17f4dd5d7318bd6099a4e657ceb0a978cddb4f3382be0da7121659bb')
    assert session.hash_bytes(b'hello') == expected_digest
24 | 
25 | 
def test_get_secret():
    """get_secret must return the key the session was reset with."""
    current_secret = session.get_secret()
    assert current_secret == b'not-so-secret-key'
28 | 
29 | 
def test_reset():
    """reset() must replace the secret, and reset(key) must install ``key``.

    The fixed key is restored in a ``finally`` block so that a failing
    assertion cannot leave a random secret in place for the tests that run
    afterwards.
    """
    old_key = session.get_secret()
    try:
        session.reset()
        new_key = session.get_secret()
        assert new_key != old_key
    finally:
        session.reset(b'not-so-secret-key')

    assert session.get_secret() == b'not-so-secret-key'
37 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_utils_mysql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import pytest
  6 | from six.moves.urllib import parse as urlparse
  7 | 
  8 | from ..utils.mysql import (
  9 |     decode_mysql_literal,
 10 |     decode_mysql_string_literal,
 11 |     get_mysqldump_args_and_env_from_url,
 12 |     unescape_single_character,
 13 | )
 14 | 
 15 | 
 16 | @pytest.mark.parametrize(
 17 |     "url",
 18 |     (
 19 |         "mysql://test:test@localhost/test",
 20 |         "mysql://localhost:1234/test",
 21 |         "mysql://localhost",
 22 |     ),
 23 | )
 24 | def test_get_mysqldump_args_and_env_from_url(url):
 25 |     parsed_url = urlparse.urlparse(url)
 26 | 
 27 |     if not parsed_url.path:
 28 |         with pytest.raises(ValueError):
 29 |             get_mysqldump_args_and_env_from_url(url=parsed_url)
 30 |         return
 31 | 
 32 |     args, env = get_mysqldump_args_and_env_from_url(url=parsed_url)
 33 | 
 34 |     assert isinstance(args, list)
 35 |     assert isinstance(env, dict)
 36 | 
 37 |     assert len(args) > 0
 38 |     assert "--complete-insert" in args
 39 |     assert "--extended-insert" in args
 40 |     assert "--net_buffer_length=10240" in args
 41 |     assert args[-1] == parsed_url.path[1:]
 42 | 
 43 |     if parsed_url.username:
 44 |         index = args.index("-u")
 45 |         assert args[index + 1] == parsed_url.username
 46 | 
 47 |     if parsed_url.password:
 48 |         assert env["MYSQL_PWD"] == parsed_url.password
 49 | 
 50 | 
 51 | @pytest.mark.parametrize(
 52 |     "text,expected_value",
 53 |     (
 54 |         ("NULL", None),
 55 |         ("TRUE", True),
 56 |         ("FALSE", False),
 57 |         ("12", 12),
 58 |         ("12.5", 12.5),
 59 |         ("'test'", "test"),
 60 |     ),
 61 | )
 62 | def test_decode_mysql_literal(text, expected_value):
 63 |     assert decode_mysql_literal(text) == expected_value
 64 | 
 65 | 
 66 | def test_decode_mysql_literal_invalid_input():
 67 |     with pytest.raises(ValueError):
 68 |         decode_mysql_literal("ERROR")
 69 | 
 70 | 
 71 | @pytest.mark.parametrize(
 72 |     "text,expected_output",
 73 |     (
 74 |         ("'test'", "test"),
 75 |         ("'test\\ntest'", "test\ntest"),
 76 |         ("'\\0'", "\000"),
 77 |         ("'foo", None),
 78 |         ("foo'", None),
 79 |         ("foo", None),
 80 |     ),
 81 | )
 82 | def test_decode_mysql_string_literal(text, expected_output):
 83 |     if expected_output is None:
 84 |         with pytest.raises(AssertionError):
 85 |             decode_mysql_string_literal(text)
 86 |     else:
 87 |         assert decode_mysql_string_literal(text) == expected_output
 88 | 
 89 | 
 90 | @pytest.mark.parametrize(
 91 |     "text,expected_output",
 92 |     (
 93 |         ("\\\\", "\\"),
 94 |         ("\\n", "\n"),
 95 |         ("\\r", "\r"),
 96 |         ("\\0", "\000"),
 97 |         ("\\Z", "\032"),
 98 |         ("\\'", "'"),
 99 |         ('\\"', '"'),
100 |     ),
101 | )
102 | def test_unescape_single_character(text, expected_output):
103 |     class MockRegexpMatch(object):
104 | 
105 |         def __init__(self, text):
106 |             self.text = text
107 | 
108 |         def group(self, index):
109 |             assert index == 0
110 |             return self.text
111 | 
112 |     assert unescape_single_character(MockRegexpMatch(text)) == expected_output
113 | 


--------------------------------------------------------------------------------
/database_sanitizer/tests/test_utils_postgres.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from __future__ import unicode_literals
 4 | 
 5 | import pytest
 6 | 
 7 | from ..utils.postgres import (
 8 |     DECODE_MAP,
 9 |     POSTGRES_COPY_NULL_VALUE,
10 |     decode_copy_value,
11 |     encode_copy_value,
12 | )
13 | 
14 | 
@pytest.mark.parametrize(
    "encoded,decoded",
    [
        ("", ""),
        (POSTGRES_COPY_NULL_VALUE, None),
        ("Test", "Test"),
        ("\\\\", "\\"),
        ("\\b", "\b"),
        ("\\f", "\f"),
        ("\\n", "\n"),
        ("\\r", "\r"),
        ("\\t", "\t"),
        ("\\v", "\v"),
        ("\\xff", "\xff"),
        ("\\123", "\123"),
        ("Test\\r\\nTest", "Test\r\nTest"),
    ],
)
def test_decode_copy_value(encoded, decoded):
    """Each COPY-encoded sample must decode to the given value."""
    result = decode_copy_value(encoded)
    assert result == decoded
35 | 
36 | 
@pytest.mark.parametrize(
    "raw,encoded",
    [
        ("", ""),
        (None, POSTGRES_COPY_NULL_VALUE),
        ("Test", "Test"),
        ("\\", "\\\\"),
        ("\b", "\\b"),
        ("\f", "\\f"),
        ("\n", "\\n"),
        ("\r", "\\r"),
        ("\t", "\\t"),
        ("\v", "\\v"),
        ("\xff", "\xff"),
        ("\123", "\123"),
        ("Test\r\nTest", "Test\\r\\nTest"),
    ],
)
def test_encode_copy_value(raw, encoded):
    """Each sample value must encode to the given COPY representation."""
    result = encode_copy_value(raw)
    assert result == encoded
57 | 
58 | 
def test_invalid_escape_sequence():
    """A lone backslash and an unknown escape must both raise ValueError."""
    for malformed in ("\\", "\\X"):
        with pytest.raises(ValueError):
            decode_copy_value(malformed)
64 | 
65 | 
def test_decode_map_contents():
    """Spot-check DECODE_MAP entries, its excluded keys and its size."""
    expected_entries = (
        ('\\b', '\b'),
        ('\\n', '\n'),
        ('\\t', '\t'),
        ('\\\\', '\\'),
        ('\\0', '\0'),
        ('\\74', '\74'),
        ('\\x0', '\0'),
        ('\\xa', '\x0a'),
        ('\\xA', '\x0a'),
        ('\\x00', '\0'),
        ('\\xa3', '\xa3'),
        ('\\xA3', '\xa3'),
        ('\\xAb', '\xab'),
        ('\\xaB', '\xab'),
        ('\\xff', '\xff'),
    )
    for sequence, character in expected_entries:
        assert DECODE_MAP[sequence] == character

    assert '\\' not in DECODE_MAP,  "Unterminated escape is not mapped"
    assert '\\z' not in DECODE_MAP,  "Invalid escape sequences are not mapped"

    assert len(DECODE_MAP) == 1097
87 | 


--------------------------------------------------------------------------------
/database_sanitizer/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersinno/python-database-sanitizer/66b7146441914e9dfd8ab31596d5ebc61bf0e04a/database_sanitizer/utils/__init__.py


--------------------------------------------------------------------------------
/database_sanitizer/utils/mysql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from __future__ import unicode_literals
  4 | 
  5 | import re
  6 | 
  7 | import pymysql
  8 | import six
  9 | 
 10 | 
 11 | def get_mysqldump_args_and_env_from_url(url):
 12 |     """
 13 |     Constructs list of command line arguments and dictionary of environment
 14 |     variables that can be given to `mysqldump` executable to obtain database
 15 |     dump of the database described in given URL.
 16 | 
 17 |     :param url: Parsed database URL.
 18 |     :type url: urllib.urlparse.ParseResult
 19 | 
 20 |     :return: List of command line arguments as well as dictionary of
 21 |              environment variables that can be used to launch the MySQL dump
 22 |              process to obtain dump of the database.
 23 |     :rtype: tuple[list[str],dict[str,str]]
 24 |     """
 25 |     args = [
 26 |         # Without this, `INSERT INTO` statements will exclude column names from
 27 |         # the output, which are required for sanitation.
 28 |         "--complete-insert",
 29 | 
 30 |         # This enables use for "exteded inserts" where multiple rows of a table
 31 |         # are included in a single `INSERT INTO` statement (contents of the
 32 |         # entire table even, if it's within limits). We use it to increase the
 33 |         # performance of the sanitation and to decrease the dump size.
 34 |         "--extended-insert",
 35 | 
 36 |         # This makes the `mysqldump` to attempt to limit size of a single line
 37 |         # into 10 megabytes. We use it to reduce memory consumption.
 38 |         "--net_buffer_length=10240",
 39 | 
 40 |         # Hostname of the database to connect into, should be always present in
 41 |         # the parsed database URL.
 42 |         "-h",
 43 |         url.hostname,
 44 |     ]
 45 |     env = {}
 46 | 
 47 |     if url.port is not None:
 48 |         args.extend(("-P", six.text_type(url.port)))
 49 | 
 50 |     if url.username:
 51 |         args.extend(("-u", url.username))
 52 | 
 53 |     if url.password:
 54 |         env["MYSQL_PWD"] = url.password
 55 | 
 56 |     if len(url.path) < 2 or not url.path.startswith("/"):
 57 |         raise ValueError("Name of the database is missing from the URL")
 58 | 
 59 |     args.append(url.path[1:])
 60 | 
 61 |     return args, env
 62 | 
 63 | 
 64 | MYSQL_NULL_PATTERN = re.compile(r"^NULL$", re.IGNORECASE)
 65 | MYSQL_BOOLEAN_PATTERN = re.compile(r"^(TRUE|FALSE)$", re.IGNORECASE)
 66 | MYSQL_FLOAT_PATTERN = re.compile(r"^[+-]?\d*\.\d+([eE][+-]?\d+)?$")
 67 | MYSQL_INT_PATTERN = re.compile(r"^\d+$")
 68 | MYSQL_STRING_PATTERN = re.compile(r"'(?:[^']|''|\\')*(?<![\\])'")
 69 | 
 70 | 
 71 | def decode_mysql_literal(text):
 72 |     """
 73 |     Attempts to decode given MySQL literal into Python value.
 74 | 
 75 |     :param text: Value to be decoded, as MySQL literal.
 76 |     :type text: str
 77 | 
 78 |     :return: Python version of the given MySQL literal.
 79 |     :rtype: any
 80 |     """
 81 |     if MYSQL_NULL_PATTERN.match(text):
 82 |         return None
 83 | 
 84 |     if MYSQL_BOOLEAN_PATTERN.match(text):
 85 |         return text.lower() == "true"
 86 | 
 87 |     if MYSQL_FLOAT_PATTERN.match(text):
 88 |         return float(text)
 89 | 
 90 |     if MYSQL_INT_PATTERN.match(text):
 91 |         return int(text)
 92 | 
 93 |     if MYSQL_STRING_PATTERN.match(text):
 94 |         return decode_mysql_string_literal(text)
 95 | 
 96 |     raise ValueError("Unable to decode given value: %r" % (text,))
 97 | 
 98 | 
 99 | MYSQL_STRING_ESCAPE_SEQUENCE_PATTERN = re.compile(r"\\(.)")
100 | MYSQL_STRING_ESCAPE_SEQUENCE_MAPPING = {
101 |     "\\0": "\000",
102 |     "\\b": "\b",
103 |     "\\n": "\n",
104 |     "\\r": "\r",
105 |     "\\t": "\t",
106 |     "\\Z": "\032",
107 | }
108 | 
109 | 
110 | def decode_mysql_string_literal(text):
111 |     """
112 |     Removes quotes and decodes escape sequences from given MySQL string literal
113 |     returning the result.
114 | 
115 |     :param text: MySQL string literal, with the quotes still included.
116 |     :type text: str
117 | 
118 |     :return: Given string literal with quotes removed and escape sequences
119 |              decoded.
120 |     :rtype: str
121 |     """
122 |     assert text.startswith("'")
123 |     assert text.endswith("'")
124 | 
125 |     # Ditch quotes from the string literal.
126 |     text = text[1:-1]
127 | 
128 |     return MYSQL_STRING_ESCAPE_SEQUENCE_PATTERN.sub(
129 |         unescape_single_character,
130 |         text,
131 |     )
132 | 
133 | 
134 | def unescape_single_character(match):
135 |     """
136 |     Unescape a single escape sequence found from a MySQL string literal,
137 |     according to the rules defined at:
138 |     https://dev.mysql.com/doc/refman/5.6/en/string-literals.html#character-escape-sequences
139 | 
140 |     :param match: Regular expression match object.
141 | 
142 |     :return: Unescaped version of given escape sequence.
143 |     :rtype: str
144 |     """
145 |     value = match.group(0)
146 |     assert value.startswith("\\")
147 |     return MYSQL_STRING_ESCAPE_SEQUENCE_MAPPING.get(value) or value[1:]
148 | 
149 | 
def encode_mysql_literal(value):
    """
    Encodes given Python value as a MySQL literal that can be placed inside an
    `INSERT INTO` statement.

    :param value: Python value to encode.
    :type value: any

    :return: MySQL literal produced by PyMySQL's `escape_item`.
    :rtype: str
    """
    charset = "utf-8"
    literal = pymysql.converters.escape_item(value, charset)
    return literal
162 | 


--------------------------------------------------------------------------------
/database_sanitizer/utils/postgres.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Contains utilities for working with Postgres `COPY` command, mainly encoding
  4 | and decoding values in the custom format used by Postgres.
  5 | 
  6 | Documentation about copy command and the text format used by it can be found
  7 | from:
  8 | https://www.postgresql.org/docs/9.2/static/sql-copy.html
  9 | 
 10 | For decoding we use a regular expression to find the escape sequences
 11 | and invoke `unescape_single_character` function for each occurrence.
 12 | Allowed escape sequences are precalculated into `DECODE_MAP` to make the
 13 | lookups faster.
 14 | 
 15 | For encoding we use a string translation table `ENCODE_TRANSLATE_TABLE`,
 16 | which maps the "forbidden" characters to escape sequences.  This is used
 17 | with `str.translate`, which is a very fast way to escape characters.
 18 | """
 19 | 
 20 | from __future__ import unicode_literals
 21 | 
 22 | import itertools
 23 | import re
 24 | 
 25 | import six
 26 | 
 27 | #: Representation of NULL value in Postgres COPY statement.
 28 | POSTGRES_COPY_NULL_VALUE = "\\N"
 29 | 
 30 | ENCODE_MAP = {
 31 |     '\\': '\\\\',
 32 |     '\b': '\\b',
 33 |     '\f': '\\f',
 34 |     '\n': '\\n',
 35 |     '\r': '\\r',
 36 |     '\t': '\\t',
 37 |     '\v': '\\v',
 38 | }
 39 | 
 40 | ENCODE_TRANSLATE_TABLE = [
 41 |     ENCODE_MAP.get(six.unichr(n), six.unichr(n))
 42 |     for n in range(256)
 43 | ]
 44 | 
 45 | DECODE_REGEX = re.compile(r"""
 46 | \\                 # a backslash
 47 | (?:                # followed by one of these (in non-capturing parenthesis):
 48 |     [0-7]{1,3}         # 1, 2 or 3 octal digits
 49 |     |                  # or
 50 |     x[0-9a-fA-F]{1,2}  # 'x' followed by 1 or 2 hexadecimal digits
 51 |     |                  # or
 52 |     .                  # any character
 53 |     |                  # or
 54 |     \Z                 # end of string
 55 | )
 56 | """, re.VERBOSE)
 57 | 
 58 | 
 59 | def decode_copy_value(value):
 60 |     """
 61 |     Decodes value received as part of Postgres `COPY` command.
 62 | 
 63 |     :param value: Value to decode.
 64 |     :type value: str
 65 | 
 66 |     :return: Either None if the value is NULL string, or the given value where
 67 |              escape sequences have been decoded from.
 68 |     :rtype: str|None
 69 |     """
 70 |     # Test for null values first.
 71 |     if value == POSTGRES_COPY_NULL_VALUE:
 72 |         return None
 73 | 
 74 |     # If there is no backslash present, there's nothing to decode.
 75 |     #
 76 |     # This early return provides a little speed-up, because it's very
 77 |     # common to not have anything to decode and then simple search for
 78 |     # backslash is faster than the regex sub below.
 79 |     if '\\' not in value:
 80 |         return value
 81 | 
 82 |     return DECODE_REGEX.sub(unescape_single_character, value)
 83 | 
 84 | 
 85 | def unescape_single_character(match):
 86 |     """
 87 |     Unescape a single escape sequence found by regular expression.
 88 | 
 89 |     :param match: Regular expression match object
 90 |     :rtype: str
 91 |     :raises: ValueError if the escape sequence is invalid
 92 |     """
 93 |     try:
 94 |         return DECODE_MAP[match.group(0)]
 95 |     except KeyError:
 96 |         value = match.group(0)
 97 |         if value == '\\':
 98 |             raise ValueError("Unterminated escape sequence encountered")
 99 | 
100 |         raise ValueError(
101 |             "Unrecognized escape sequence encountered: {}".format(value))
102 | 
103 | 
def encode_copy_value(value):
    """
    Encodes a value so that it can be written as data of a Postgres `COPY`
    statement.

    :param value: Value to encode.
    :type value: str|None

    :return: The NULL marker for None, otherwise the value with the
             characters of `ENCODE_TRANSLATE_TABLE` escaped.
    :rtype: str
    """
    return (
        POSTGRES_COPY_NULL_VALUE
        if value is None
        else value.translate(ENCODE_TRANSLATE_TABLE)
    )
119 | 
120 | 
def _generate_decode_map():
    """
    Builds the lookup table from escape sequences to decoded characters.

    :return: Mapping which contains the inverse of `ENCODE_MAP` and an entry
             for every octal (1-3 digits) and hexadecimal (1-2 digits)
             escape sequence.
    :rtype: dict[str,str]
    """
    # Start from the named escapes by swapping keys and values of ENCODE_MAP.
    mapping = dict(
        (escaped, raw)
        for (raw, escaped) in ENCODE_MAP.items()
    )

    # Numeric escapes: (radix, leading text, allowed digits, digit counts)
    numeric_forms = (
        (8, '\\', '01234567', (1, 2, 3)),
        (16, '\\x', '0123456789abcdefABCDEF', (1, 2)),
    )
    for (radix, lead, alphabet, widths) in numeric_forms:
        for width in widths:
            for combination in itertools.product(alphabet, repeat=width):
                literal = ''.join(combination)
                code_point = int(literal, base=radix)
                mapping[lead + literal] = six.unichr(code_point)

    return mapping


#: Escape sequence to character lookup table, built once at import time.
DECODE_MAP = _generate_decode_map()
146 | 


--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | mock>=2.0.0
2 | pytest>=3.6.2
3 | pytest-cov
4 | -e .[MySQL]
5 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = database-sanitizer
 3 | version = 1.1.0
 4 | description = Sanitizes contents of a database.
 5 | long_description = file: README.md
 6 | long_description_content_type = text/markdown
 7 | url = https://github.com/andersinno/python-database-sanitizer
 8 | license = MIT
 9 | license_file = LICENSE
10 | 
11 | [options]
12 | zip_safe = true
13 | include_package_data = true
14 | packages = find:
15 | install_requires =
16 |     PyYAML>=3.12
17 |     six>=1.11.0
18 | 
19 | [options.extras_require]
20 | MySQL = PyMySQL
21 | 
22 | [options.packages.find]
23 | exclude =
24 |     database_sanitizer.tests
25 | 
26 | [options.entry_points]
27 | console_scripts =
28 |     database-sanitizer = database_sanitizer.__main__:main
29 | 
30 | [bdist_wheel]
31 | universal = 1
32 | 
33 | [isort]
34 | multi_line_output = 3
35 | include_trailing_comma = yes
36 | skip = .tox,dist,venv
37 | not_skip = __init__.py
38 | known_first_party = database_sanitizer
39 | default_section = THIRDPARTY
40 | 
41 | [coverage:run]
42 | branch = yes
43 | 
44 | [coverage:report]
45 | precision = 2
46 | omit = */tests/*
47 | exclude_lines =
48 |     ^ *main()
49 |     ^ *from typing import
50 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from __future__ import unicode_literals
4 | 
5 | import setuptools
6 | 
7 | if __name__ == "__main__":
8 |     setuptools.setup(setup_requires=["setuptools>=34.0"])
9 | 


--------------------------------------------------------------------------------