├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── BUG_REPORT.md
│   │   ├── FEATURE_REQUEST.md
│   │   ├── QUESTION.md
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   └── workflows
│       ├── ci.yml
│       └── pythonpublish.yml
├── .gitignore
├── .pylintrc
├── CHANGELOG.md
├── LICENSE
├── README.md
├── sample_config.json
├── sample_logging.conf
├── setup.py
├── tests
│   ├── __init__.py
│   ├── integration
│   │   ├── __init__.py
│   │   ├── resources
│   │   │   ├── catalog.json
│   │   │   ├── invalid_config.json
│   │   │   ├── invalid_messages.json
│   │   │   ├── messages.json
│   │   │   ├── streams_with_changing_schema.json
│   │   │   ├── streams_with_object.json
│   │   │   └── valid_config.json
│   │   └── test_integrations.py
│   └── unit
│       ├── __init__.py
│       ├── test_init.py
│       ├── test_transform.py
│       └── test_utils.py
└── transform_field
    ├── __init__.py
    ├── errors.py
    ├── timings.py
    ├── transform.py
    └── utils.py

--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @transferwise/analytics-platform
2 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/BUG_REPORT.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a bug report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Prepare the data as '...'
16 | 2. Run the command '....'
17 | 3. See error
18 | 
19 | **Expected behavior**
20 | A clear and concise description of what you expected to happen.
21 | 
22 | **Screenshots**
23 | If applicable, add screenshots to help explain your problem.
24 | 
25 | **Your environment**
26 | - Version, e.g. branch/commit #/release/tag
27 | 
28 | **Additional context**
29 | Add any other context about the problem here.
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/QUESTION.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask anything about this project 4 | title: '' 5 | labels: help wanted 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Your question** 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: PipelineWise Community Slack channel 4 | url: https://singer-io.slack.com/messages/pipelinewise 5 | about: Open discussion about PipelineWise 6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Problem 2 | 3 | _Describe the problem your PR is trying to solve_ 4 | 5 | ## Proposed changes 6 | 7 | _Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request. 8 | If it fixes a bug or resolves a feature request, be sure to link to that issue._ 9 | 10 | 11 | ## Types of changes 12 | 13 | What types of changes does your code introduce to pipelinewise-transform-field? 14 | _Put an `x` in the boxes that apply_ 15 | 16 | - [ ] Bugfix (non-breaking change which fixes an issue) 17 | - [ ] New feature (non-breaking change which adds functionality) 18 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 19 | - [ ] Documentation Update (if none of the other choices apply) 20 | 21 | 22 | ## Checklist 23 | 24 | - [ ] I have read the [CONTRIBUTING](https://github.com/transferwise/pipelinewise/blob/master/CONTRIBUTING.md) doc. 25 | - [ ] Description above provides context of the change 26 | - [ ] I have added tests that prove my fix is effective or that my feature works 27 | - [ ] Unit tests for changes (not needed for documentation changes) 28 | - [ ] CI checks pass with my changes 29 | - [ ] Bumping version in `setup.py` is an individual PR and not mixed with feature or bugfix PRs 30 | - [ ] Commit message/PR title starts with `[AP-NNNN]` (if applicable. AP-NNNN = JIRA ID) 31 | - [ ] Branch name starts with `AP-NNN` (if applicable. 
AP-NNN = JIRA ID)
32 | - [ ] Commits follow "[How to write a good git commit message](http://chris.beams.io/posts/git-commit/)"
33 | - [ ] Relevant documentation is updated, including usage instructions
34 | 
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # This is an automatically generated base configuration
 2 | # For further configuration options and tuning:
 3 | # https://docs.github.com/en/free-pro-team@latest/github/administering-a-repository/configuration-options-for-dependency-updates
 4 | 
 5 | version: 2
 6 | updates:
 7 |   - package-ecosystem: "pip"
 8 |     directory: "/"
 9 |     schedule:
10 |       interval: "weekly"
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: [master]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | concurrency:
12 |   group: ci-${{ github.head_ref }}
13 |   cancel-in-progress: true
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 |     strategy:
20 |       fail-fast: true
21 |       matrix:
22 |         python-version: [3.6, 3.7, 3.8]
23 | 
24 |     steps:
25 |       - name: Checking out repo
26 |         uses: actions/checkout@v2
27 | 
28 |       - name: Set up Python ${{ matrix.python-version }}
29 |         uses: actions/setup-python@v2
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 | 
33 |       - name: set LOGGING_CONF_FILE env
34 |         run: |
35 |           export LOGGING_CONF_FILE=$(pwd)/sample_logging.conf
36 |           echo "LOGGING_CONF_FILE=$LOGGING_CONF_FILE" >> $GITHUB_ENV
37 | 
38 |       - name: Install dependencies
39 |         run: |
40 |           pip install --upgrade pip setuptools
41 |           pip install .[test]
42 | 
43 |       - name: Check if pylint is happy
44 |         run: pylint transform_field
45 | 
46 |       - name: Run Unit Tests with min coverage
47 |         run: pytest --cov=transform_field --cov-fail-under=65 -v tests/unit
48 | 
49 |       - name: Run Integration Tests with min coverage
50 |         run: pytest --cov-fail-under=73 -v tests/integration
--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package to PyPi
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v2
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v1
14 |         with:
15 |           python-version: '3.x'
16 |       - name: Install dependencies
17 |         run: |
18 |           python -m pip install --upgrade pip
19 |           pip install setuptools wheel twine
20 |       - name: Build and publish
21 |         env:
22 |           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 |         run: |
25 |           python setup.py sdist bdist_wheel
26 |           twine upload dist/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # IDE
 2 | .vscode
 3 | .idea/*
 4 | 
 5 | 
 6 | # Python
 7 | __pycache__/
 8 | *.py[cod]
 9 | *$py.class
10 | .virtualenvs
11 | *.egg-info/
12 | *~
13 | dist/
14 | 
15 | # Singer JSON files
16 | properties.json
17 | config.json
18 | state.json
19 | 
20 | *.db
21 | .DS_Store
22 | venv
23 | env
24 | blog_old.md
25 | node_modules
26 | *.pyc
27 | tmp
28 | 
29 | # Docs
30 | docs/_build/
31 | docs/_templates/
32 | 
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | # Based on Apache 2.0 licensed code from https://github.com/ClusterHQ/flocker
 2 | 
 3 | [MASTER]
 4 | 
 5 | # Specify a configuration file.
 6 | #rcfile=
 7 | 
 8 | # Python code to execute, usually for sys.path manipulation such as
 9 | # pygtk.require().
10 | # init-hook=
11 | 
12 | # Add files or directories to the blacklist. They should be base names, not paths.
13 | ignore=
14 | 
15 | # Pickle collected data for later comparisons.
16 | persistent=no
17 | 
18 | # List of plugins (as comma separated values of python module names) to load,
19 | # usually to register additional checkers.
20 | load-plugins=
21 | 
22 | # Use multiple processes to speed up Pylint.
23 | # DO NOT CHANGE THIS: values >1 hide results!
24 | jobs=1
25 | 
26 | # Allow loading of arbitrary C extensions. Extensions are imported into the
27 | # active Python interpreter and may run arbitrary code.
28 | unsafe-load-any-extension=no
29 | 
30 | # A comma-separated list of package or module names from where C extensions may
31 | # be loaded. Extensions are loaded into the active Python interpreter and may
32 | # run arbitrary code
33 | extension-pkg-whitelist=ujson
34 | 
35 | # Allow optimization of some AST trees. This will activate a peephole AST
36 | # optimizer, which will apply various small optimizations. For instance, it can
37 | # be used to obtain the result of joining multiple strings with the addition
38 | # operator. Joining a lot of strings can lead to a maximum recursion error in
39 | # Pylint and this flag can prevent that. It has one side effect: the resulting
40 | # AST will be different from the real one.
41 | optimize-ast=no
42 | 
43 | 
44 | [MESSAGES CONTROL]
45 | 
46 | # Only show warnings with the listed confidence levels. Leave empty to show
47 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
48 | confidence=
49 | 
50 | # Enable the message, report, category or checker with the given id(s). You can
51 | # either give multiple identifiers separated by commas (,) or use this option
52 | # multiple times. See also the "--disable" option for examples.
53 | disable=wrong-import-order,
54 |         broad-except,
55 |         missing-module-docstring,
56 |         duplicate-code, # not useful until a major code refactoring
57 | 
58 | 
59 | enable=import-error,
60 |        import-self,
61 |        reimported,
62 |        wildcard-import,
63 |        misplaced-future,
64 |        deprecated-module,
65 |        unpacking-non-sequence,
66 |        invalid-all-object,
67 |        undefined-all-variable,
68 |        used-before-assignment,
69 |        cell-var-from-loop,
70 |        global-variable-undefined,
71 |        redefine-in-handler,
72 |        unused-import,
73 |        unused-wildcard-import,
74 |        global-variable-not-assigned,
75 |        undefined-loop-variable,
76 |        global-statement,
77 |        global-at-module-level,
78 |        bad-open-mode,
79 |        redundant-unittest-assert,
80 |        boolean-datetime,
81 |        deprecated-method,
82 |        anomalous-unicode-escape-in-string,
83 |        anomalous-backslash-in-string,
84 |        not-in-loop,
85 |        continue-in-finally,
86 |        abstract-class-instantiated,
87 |        star-needs-assignment-target,
88 |        duplicate-argument-name,
89 |        return-in-init,
90 |        too-many-star-expressions,
91 |        nonlocal-and-global,
92 |        return-outside-function,
93 |        return-arg-in-generator,
94 |        invalid-star-assignment-target,
95 |        bad-reversed-sequence,
96 |        nonexistent-operator,
97 |        yield-outside-function,
98 |        init-is-generator,
99 |        nonlocal-without-binding,
100 |        lost-exception,
101 |        assert-on-tuple,
102 |        dangerous-default-value,
103 |        duplicate-key,
104 |        useless-else-on-loop,
105 |        expression-not-assigned,
106 |        confusing-with-statement,
107 |        unnecessary-lambda,
108 |        pointless-statement,
109 |        pointless-string-statement,
110 |        unnecessary-pass,
111 |        unreachable,
112 |        eval-used,
113 |        exec-used,
114 |        using-constant-test,
115 |        bad-super-call,
116 |        missing-super-argument,
117 |        slots-on-old-class,
118 |        super-on-old-class,
119 |        property-on-old-class,
120 |        not-an-iterable,
121 |        not-a-mapping,
122 |        format-needs-mapping,
123 |        truncated-format-string,
124 |        missing-format-string-key,
125 |        mixed-format-string,
126 |        too-few-format-args,
127 |        bad-str-strip-call,
128 |        too-many-format-args,
129 |        bad-format-character,
130 |        format-combined-specification,
131 |        bad-format-string-key,
132 |        bad-format-string,
133 |        missing-format-attribute,
134 |        missing-format-argument-key,
135 |        unused-format-string-argument,
136 |        unused-format-string-key,
137 |        invalid-format-index,
138 |        bad-indentation,
139 |        mixed-indentation,
140 |        unnecessary-semicolon,
141 |        lowercase-l-suffix,
142 |        invalid-encoded-data,
143 |        unpacking-in-except,
144 |        import-star-module-level,
145 |        long-suffix,
146 |        old-octal-literal,
147 |        old-ne-operator,
148 |        backtick,
149 |        old-raise-syntax,
150 |        metaclass-assignment,
151 |        next-method-called,
152 |        dict-iter-method,
153 |        dict-view-method,
154 |        indexing-exception,
155 |        raising-string,
156 |        using-cmp-argument,
157 |        cmp-method,
158 |        coerce-method,
159 |        delslice-method,
160 |        getslice-method,
161 |        hex-method,
162 |        nonzero-method,
163 |        oct-method,
164 |        setslice-method,
165 |        old-division,
166 |        logging-format-truncated,
167 |        logging-too-few-args,
168 |        logging-too-many-args,
169 |        logging-unsupported-format,
170 |        logging-format-interpolation,
171 |        invalid-unary-operand-type,
172 |        unsupported-binary-operation,
173 |        not-callable,
174 |        redundant-keyword-arg,
175 |        assignment-from-no-return,
176 |        assignment-from-none,
177 |        not-context-manager,
178 |        repeated-keyword,
179 |        missing-kwoa,
180 |        no-value-for-parameter,
181 |        invalid-sequence-index,
182 |        invalid-slice-index,
183 |        unexpected-keyword-arg,
184 |        unsupported-membership-test,
185 | 
unsubscriptable-object, 186 | access-member-before-definition, 187 | method-hidden, 188 | assigning-non-slot, 189 | duplicate-bases, 190 | inconsistent-mro, 191 | inherit-non-class, 192 | invalid-slots, 193 | invalid-slots-object, 194 | no-method-argument, 195 | no-self-argument, 196 | unexpected-special-method-signature, 197 | non-iterator-returned, 198 | arguments-differ, 199 | signature-differs, 200 | bad-staticmethod-argument, 201 | non-parent-init-called, 202 | bad-except-order, 203 | catching-non-exception, 204 | bad-exception-context, 205 | notimplemented-raised, 206 | raising-bad-type, 207 | raising-non-exception, 208 | misplaced-bare-raise, 209 | duplicate-except, 210 | nonstandard-exception, 211 | binary-op-exception, 212 | bare-except, 213 | not-async-context-manager, 214 | yield-inside-async-function 215 | 216 | # Needs investigation: 217 | # abstract-method (might be indicating a bug? probably not though) 218 | # protected-access (requires some refactoring) 219 | # attribute-defined-outside-init (requires some refactoring) 220 | # super-init-not-called (requires some cleanup) 221 | 222 | # Things we'd like to enable someday: 223 | # redefined-builtin (requires a bunch of work to clean up our code first) 224 | # redefined-outer-name (requires a bunch of work to clean up our code first) 225 | # undefined-variable (re-enable when pylint fixes https://github.com/PyCQA/pylint/issues/760) 226 | # no-name-in-module (giving us spurious warnings https://github.com/PyCQA/pylint/issues/73) 227 | # unused-argument (need to clean up or code a lot, e.g. prefix unused_?) 228 | # function-redefined (@overload causes lots of spurious warnings) 229 | # too-many-function-args (@overload causes spurious warnings... I think) 230 | # parameter-unpacking (needed for eventual Python 3 compat) 231 | # print-statement (needed for eventual Python 3 compat) 232 | # filter-builtin-not-iterating (Python 3) 233 | # map-builtin-not-iterating (Python 3) 234 | # range-builtin-not-iterating (Python 3) 235 | # zip-builtin-not-iterating (Python 3) 236 | # many others relevant to Python 3 237 | # unused-variable (a little work to cleanup, is all) 238 | 239 | # ... 240 | [REPORTS] 241 | 242 | # Set the output format. Available formats are text, parseable, colorized, msvs 243 | # (visual studio) and html. You can also give a reporter class, eg 244 | # mypackage.mymodule.MyReporterClass. 245 | output-format=parseable 246 | 247 | # Put messages in a separate file for each module / package specified on the 248 | # command line instead of printing them on stdout. Reports (if any) will be 249 | # written in a file name "pylint_global.[txt|html]". 250 | files-output=no 251 | 252 | # Tells whether to display a full report or only the messages 253 | reports=no 254 | 255 | # Python expression which should return a note less than 10 (10 is the highest 256 | # note). You have access to the variables errors warning, statement which 257 | # respectively contain the number of errors / warnings messages and the total 258 | # number of statements analyzed. This is used by the global evaluation report 259 | # (RP0004). 260 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 261 | 262 | # Template used to display messages. This is a python new-style format string 263 | # used to format the message information. 
See doc for all details
264 | #msg-template=
265 | 
266 | 
267 | [LOGGING]
268 | 
269 | # Logging modules to check that the string format arguments are in logging
270 | # function parameter format
271 | logging-modules=logging
272 | 
273 | 
274 | [FORMAT]
275 | 
276 | # Maximum number of characters on a single line.
277 | max-line-length=120
278 | 
279 | # Regexp for a line that is allowed to be longer than the limit.
280 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
281 | 
282 | # Allow the body of an if to be on the same line as the test if there is no
283 | # else.
284 | single-line-if-stmt=no
285 | 
286 | # List of optional constructs for which whitespace checking is disabled. `dict-
287 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
288 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
289 | # `empty-line` allows space-only lines.
290 | no-space-check=trailing-comma,dict-separator
291 | 
292 | # Maximum number of lines in a module
293 | max-module-lines=1000
294 | 
295 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
296 | # tab).
297 | indent-string='    '
298 | 
299 | # Number of spaces of indent required inside a hanging or continued line.
300 | indent-after-paren=4
301 | 
302 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
303 | expected-line-ending-format=
304 | 
305 | 
306 | [TYPECHECK]
307 | 
308 | # Tells whether missing members accessed in mixin class should be ignored. A
309 | # mixin class is detected if its name ends with "mixin" (case insensitive).
310 | ignore-mixin-members=yes
311 | 
312 | # List of module names for which member attributes should not be checked
313 | # (useful for modules/projects where namespaces are manipulated during runtime
314 | # and thus existing member attributes cannot be deduced by static analysis).
315 | # It supports qualified module names, as well as Unix pattern matching.
316 | ignored-modules=
317 | 
318 | # List of class names for which member attributes should not be checked
319 | # (useful for classes with attributes dynamically set). This supports
320 | # qualified names.
321 | ignored-classes=
322 | 
323 | # List of members which are set dynamically and missed by pylint inference
324 | # system, and so shouldn't trigger E1101 when accessed. Python regular
325 | # expressions are accepted.
326 | generated-members=
327 | 
328 | 
329 | [VARIABLES]
330 | 
331 | # Tells whether we should check for unused import in __init__ files.
332 | init-import=no
333 | 
334 | # A regular expression matching the name of dummy variables (i.e. expectedly
335 | # not used).
336 | dummy-variables-rgx=_$|dummy
337 | 
338 | # List of additional names supposed to be defined in builtins. Remember that
339 | # you should avoid defining new builtins when possible.
340 | additional-builtins=
341 | 
342 | # List of strings which can identify a callback function by name. A callback
343 | # name must start or end with one of those strings.
344 | callbacks=cb_,_cb
345 | 
346 | 
347 | [SIMILARITIES]
348 | 
349 | # Minimum lines number of a similarity.
350 | min-similarity-lines=4
351 | 
352 | # Ignore comments when computing similarities.
353 | ignore-comments=yes
354 | 
355 | # Ignore docstrings when computing similarities.
356 | ignore-docstrings=yes
357 | 
358 | # Ignore imports when computing similarities.
359 | ignore-imports=no
360 | 
361 | 
362 | [SPELLING]
363 | 
364 | # Spelling dictionary name. Available dictionaries: none.
To make it working 365 | # install python-enchant package. 366 | spelling-dict= 367 | 368 | # List of comma separated words that should not be checked. 369 | spelling-ignore-words= 370 | 371 | # A path to a file that contains private dictionary; one word per line. 372 | spelling-private-dict-file= 373 | 374 | # Tells whether to store unknown words to indicated private dictionary in 375 | # --spelling-private-dict-file option instead of raising a message. 376 | spelling-store-unknown-words=no 377 | 378 | 379 | [MISCELLANEOUS] 380 | 381 | # List of note tags to take in consideration, separated by a comma. 382 | notes=FIXME,XXX 383 | 384 | 385 | [BASIC] 386 | 387 | # List of builtins function names that should not be used, separated by a comma 388 | bad-functions=map,filter,input 389 | 390 | # Good variable names which should always be accepted, separated by a comma 391 | good-names=i,j,k,ex,Run,_ 392 | 393 | # Bad variable names which should always be refused, separated by a comma 394 | bad-names=foo,bar,baz,toto,tutu,tata 395 | 396 | # Colon-delimited sets of names that determine each other's naming style when 397 | # the name regexes allow several styles. 398 | name-group= 399 | 400 | # Include a hint for the correct naming format with invalid-name 401 | include-naming-hint=no 402 | 403 | # Regular expression matching correct function names 404 | function-rgx=[a-z_][a-z0-9_]{2,40}$ 405 | 406 | # Naming hint for function names 407 | function-name-hint=[a-z_][a-z0-9_]{2,40}$ 408 | 409 | # Regular expression matching correct variable names 410 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 411 | 412 | # Naming hint for variable names 413 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 414 | 415 | # Regular expression matching correct constant names 416 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 417 | 418 | # Naming hint for constant names 419 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 420 | 421 | # Regular expression matching correct attribute names 422 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 423 | 424 | # Naming hint for attribute names 425 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 426 | 427 | # Regular expression matching correct argument names 428 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 429 | 430 | # Naming hint for argument names 431 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 432 | 433 | # Regular expression matching correct class attribute names 434 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 435 | 436 | # Naming hint for class attribute names 437 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 438 | 439 | # Regular expression matching correct inline iteration names 440 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 441 | 442 | # Naming hint for inline iteration names 443 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 444 | 445 | # Regular expression matching correct class names 446 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 447 | 448 | # Naming hint for class names 449 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 450 | 451 | # Regular expression matching correct module names 452 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 453 | 454 | # Naming hint for module names 455 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 456 | 457 | # Regular expression matching correct method names 458 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 459 | 460 | # Naming hint for method names 461 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 462 | 463 | # Regular expression which should only match function or class names that do 464 | # not require a docstring. 
465 | no-docstring-rgx=^_
466 | 
467 | # Minimum line length for functions/classes that require docstrings, shorter
468 | # ones are exempt.
469 | docstring-min-length=-1
470 | 
471 | 
472 | [ELIF]
473 | 
474 | # Maximum number of nested blocks for function / method body
475 | max-nested-blocks=5
476 | 
477 | 
478 | [IMPORTS]
479 | 
480 | # Deprecated modules which should not be used, separated by a comma
481 | deprecated-modules=regsub,TERMIOS,Bastion,rexec
482 | 
483 | # Create a graph of every (i.e. internal and external) dependency in the
484 | # given file (report RP0402 must not be disabled)
485 | import-graph=
486 | 
487 | # Create a graph of external dependencies in the given file (report RP0402 must
488 | # not be disabled)
489 | ext-import-graph=
490 | 
491 | # Create a graph of internal dependencies in the given file (report RP0402 must
492 | # not be disabled)
493 | int-import-graph=
494 | 
495 | 
496 | [DESIGN]
497 | 
498 | # Maximum number of arguments for function / method
499 | max-args=7
500 | 
501 | # Argument names that match this expression will be ignored. Defaults to names
502 | # with a leading underscore
503 | ignored-argument-names=_.*
504 | 
505 | # Maximum number of locals for function / method body
506 | max-locals=15
507 | 
508 | # Maximum number of return / yield for function / method body
509 | max-returns=6
510 | 
511 | # Maximum number of branches for function / method body
512 | max-branches=12
513 | 
514 | # Maximum number of statements in function / method body
515 | max-statements=50
516 | 
517 | # Maximum number of parents for a class (see R0901).
518 | max-parents=7
519 | 
520 | # Maximum number of attributes for a class (see R0902).
521 | max-attributes=7
522 | 
523 | # Minimum number of public methods for a class (see R0903).
524 | min-public-methods=2
525 | 
526 | # Maximum number of public methods for a class (see R0904).
527 | max-public-methods=20
528 | 
529 | # Maximum number of boolean expressions in an if statement
530 | max-bool-expr=5
531 | 
532 | 
533 | [CLASSES]
534 | 
535 | # List of method names used to declare (i.e. assign) instance attributes.
536 | defining-attr-methods=__init__,__new__,setUp
537 | 
538 | # List of valid names for the first argument in a class method.
539 | valid-classmethod-first-arg=cls
540 | 
541 | # List of valid names for the first argument in a metaclass class method.
542 | valid-metaclass-classmethod-first-arg=mcs
543 | 
544 | # List of member names, which should be excluded from the protected access
545 | # warning.
546 | exclude-protected=_asdict,_fields,_replace,_source,_make
547 | 
548 | 
549 | [EXCEPTIONS]
550 | 
551 | # Exceptions that will emit a warning when being caught. Defaults to
552 | # "Exception"
553 | overgeneral-exceptions=Exception
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # CHANGELOG
 2 | 
 3 | ## 2.3.0 (2021-12-16)
 4 | ### Added
 5 | - Transformation of specific fields in object/array type properties in `RECORD` using XPath syntax.
 6 | - Conditions on specific fields in object/array type properties in `RECORD`.
 7 | 
 8 | ## 2.2.0 (2021-09-17)
 9 | ### Added
10 | - New transformation MASK-STRING-SKIP-ENDS-n. The transformation masks the string except its first and last n characters.
11 | 
12 | ## 2.1.0 (2021-03-11)
13 | ### Added
14 | - `--validate` flag to do one-off validation of the transformation config using a given catalog file.
15 | 
16 | ### Changed
17 | - The transformation config is now validated at runtime whenever a new `SCHEMA` message is received.
18 | 
19 | 
20 | ## 2.0.0 (2020-03-17)
21 | 
22 | ### Changed
23 | - Stop trimming transformed values
24 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Copyright 2012 The Obvious Corporation and contributors.
  2 | 
  3 | 
  4 | Licensed under the Apache License, Version 2.0 (the "License");
  5 | you may not use this file except in compliance with the License.
  6 | You may obtain a copy of the License at
  7 | 
  8 |     http://www.apache.org/licenses/LICENSE-2.0
  9 | 
 10 | Unless required by applicable law or agreed to in writing, software
 11 | distributed under the License is distributed on an "AS IS" BASIS,
 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | See the License for the specific language governing permissions and
 14 | limitations under the License.
 15 | 
 16 | ```
 17 | -------------------------------------------------------------------------
 18 | Apache License
 19 | Version 2.0, January 2004
 20 | http://www.apache.org/licenses/
 21 | 
 22 | 
 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 24 | 
 25 | 1. Definitions.
 26 | 
 27 | "License" shall mean the terms and conditions for use, reproduction,
 28 | and distribution as defined by Sections 1 through 9 of this document.
 29 | 
 30 | "Licensor" shall mean the copyright owner or entity authorized by
 31 | the copyright owner that is granting the License.
 32 | 
 33 | "Legal Entity" shall mean the union of the acting entity and all
 34 | other entities that control, are controlled by, or are under common
 35 | control with that entity. For the purposes of this definition,
 36 | "control" means (i) the power, direct or indirect, to cause the
 37 | direction or management of such entity, whether by contract or
 38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
 39 | outstanding shares, or (iii) beneficial ownership of such entity.
 40 | 
 41 | "You" (or "Your") shall mean an individual or Legal Entity
 42 | exercising permissions granted by this License.
 43 | 
 44 | "Source" form shall mean the preferred form for making modifications,
 45 | including but not limited to software source code, documentation
 46 | source, and configuration files.
 47 | 
 48 | "Object" form shall mean any form resulting from mechanical
 49 | transformation or translation of a Source form, including but
 50 | not limited to compiled object code, generated documentation,
 51 | and conversions to other media types.
 52 | 
 53 | "Work" shall mean the work of authorship, whether in Source or
 54 | Object form, made available under the License, as indicated by a
 55 | copyright notice that is included in or attached to the work
 56 | (an example is provided in the Appendix below).
 57 | 
 58 | "Derivative Works" shall mean any work, whether in Source or Object
 59 | form, that is based on (or derived from) the Work and for which the
 60 | editorial revisions, annotations, elaborations, or other modifications
 61 | represent, as a whole, an original work of authorship. For the purposes
 62 | of this License, Derivative Works shall not include works that remain
 63 | separable from, or merely link (or bind by name) to the interfaces of,
 64 | the Work and Derivative Works thereof.
65 | 66 | "Contribution" shall mean any work of authorship, including 67 | the original version of the Work and any modifications or additions 68 | to that Work or Derivative Works thereof, that is intentionally 69 | submitted to Licensor for inclusion in the Work by the copyright owner 70 | or by an individual or Legal Entity authorized to submit on behalf of 71 | the copyright owner. For the purposes of this definition, "submitted" 72 | means any form of electronic, verbal, or written communication sent 73 | to the Licensor or its representatives, including but not limited to 74 | communication on electronic mailing lists, source code control systems, 75 | and issue tracking systems that are managed by, or on behalf of, the 76 | Licensor for the purpose of discussing and improving the Work, but 77 | excluding communication that is conspicuously marked or otherwise 78 | designated in writing by the copyright owner as "Not a Contribution." 79 | 80 | "Contributor" shall mean Licensor and any individual or Legal Entity 81 | on behalf of whom a Contribution has been received by Licensor and 82 | subsequently incorporated within the Work. 83 | 84 | 2. Grant of Copyright License. Subject to the terms and conditions of 85 | this License, each Contributor hereby grants to You a perpetual, 86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 87 | copyright license to reproduce, prepare Derivative Works of, 88 | publicly display, publicly perform, sublicense, and distribute the 89 | Work and such Derivative Works in Source or Object form. 90 | 91 | 3. Grant of Patent License. Subject to the terms and conditions of 92 | this License, each Contributor hereby grants to You a perpetual, 93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 94 | (except as stated in this section) patent license to make, have made, 95 | use, offer to sell, sell, import, and otherwise transfer the Work, 96 | where such license applies only to those patent claims licensable 97 | by such Contributor that are necessarily infringed by their 98 | Contribution(s) alone or by combination of their Contribution(s) 99 | with the Work to which such Contribution(s) was submitted. If You 100 | institute patent litigation against any entity (including a 101 | cross-claim or counterclaim in a lawsuit) alleging that the Work 102 | or a Contribution incorporated within the Work constitutes direct 103 | or contributory patent infringement, then any patent licenses 104 | granted to You under this License for that Work shall terminate 105 | as of the date such litigation is filed. 106 | 107 | 4. Redistribution. 
You may reproduce and distribute copies of the 108 | Work or Derivative Works thereof in any medium, with or without 109 | modifications, and in Source or Object form, provided that You 110 | meet the following conditions: 111 | 112 | (a) You must give any other recipients of the Work or 113 | Derivative Works a copy of this License; and 114 | 115 | (b) You must cause any modified files to carry prominent notices 116 | stating that You changed the files; and 117 | 118 | (c) You must retain, in the Source form of any Derivative Works 119 | that You distribute, all copyright, patent, trademark, and 120 | attribution notices from the Source form of the Work, 121 | excluding those notices that do not pertain to any part of 122 | the Derivative Works; and 123 | 124 | (d) If the Work includes a "NOTICE" text file as part of its 125 | distribution, then any Derivative Works that You distribute must 126 | include a readable copy of the attribution notices contained 127 | within such NOTICE file, excluding those notices that do not 128 | pertain to any part of the Derivative Works, in at least one 129 | of the following places: within a NOTICE text file distributed 130 | as part of the Derivative Works; within the Source form or 131 | documentation, if provided along with the Derivative Works; or, 132 | within a display generated by the Derivative Works, if and 133 | wherever such third-party notices normally appear. The contents 134 | of the NOTICE file are for informational purposes only and 135 | do not modify the License. You may add Your own attribution 136 | notices within Derivative Works that You distribute, alongside 137 | or as an addendum to the NOTICE text from the Work, provided 138 | that such additional attribution notices cannot be construed 139 | as modifying the License. 140 | 141 | You may add Your own copyright statement to Your modifications and 142 | may provide additional or different license terms and conditions 143 | for use, reproduction, or distribution of Your modifications, or 144 | for any such Derivative Works as a whole, provided Your use, 145 | reproduction, and distribution of the Work otherwise complies with 146 | the conditions stated in this License. 147 | 148 | 5. Submission of Contributions. Unless You explicitly state otherwise, 149 | any Contribution intentionally submitted for inclusion in the Work 150 | by You to the Licensor shall be under the terms and conditions of 151 | this License, without any additional terms or conditions. 152 | Notwithstanding the above, nothing herein shall supersede or modify 153 | the terms of any separate license agreement you may have executed 154 | with Licensor regarding such Contributions. 155 | 156 | 6. Trademarks. This License does not grant permission to use the trade 157 | names, trademarks, service marks, or product names of the Licensor, 158 | except as required for reasonable and customary use in describing the 159 | origin of the Work and reproducing the content of the NOTICE file. 160 | 161 | 7. Disclaimer of Warranty. Unless required by applicable law or 162 | agreed to in writing, Licensor provides the Work (and each 163 | Contributor provides its Contributions) on an "AS IS" BASIS, 164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 165 | implied, including, without limitation, any warranties or conditions 166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 167 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
168 | appropriateness of using or redistributing the Work and assume any
169 | risks associated with Your exercise of permissions under this License.
170 | 
171 | 8. Limitation of Liability. In no event and under no legal theory,
172 | whether in tort (including negligence), contract, or otherwise,
173 | unless required by applicable law (such as deliberate and grossly
174 | negligent acts) or agreed to in writing, shall any Contributor be
175 | liable to You for damages, including any direct, indirect, special,
176 | incidental, or consequential damages of any character arising as a
177 | result of this License or out of the use or inability to use the
178 | Work (including but not limited to damages for loss of goodwill,
179 | work stoppage, computer failure or malfunction, or any and all
180 | other commercial damages or losses), even if such Contributor
181 | has been advised of the possibility of such damages.
182 | 
183 | 9. Accepting Warranty or Additional Liability. While redistributing
184 | the Work or Derivative Works thereof, You may choose to offer,
185 | and charge a fee for, acceptance of support, warranty, indemnity,
186 | or other liability obligations and/or rights consistent with this
187 | License. However, in accepting such obligations, You may act only
188 | on Your own behalf and on Your sole responsibility, not on behalf
189 | of any other Contributor, and only if You agree to indemnify,
190 | defend, and hold each Contributor harmless for any liability
191 | incurred by, or claims asserted against, such Contributor by reason
192 | of your accepting any such warranty or additional liability.
193 | 
194 | END OF TERMS AND CONDITIONS
195 | ```
196 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Notice
 2 | To better serve Wise business and customer needs, the PipelineWise codebase needs to shrink.
 3 | We have made the difficult decision that, going forward, many components of PipelineWise will be removed or incorporated into the main repo.
 4 | The last version before this decision is [v0.64.1](https://github.com/transferwise/pipelinewise/tree/v0.64.1)
 5 | 
 6 | We thank everyone in the open-source community who, over the past six years, has helped make PipelineWise a robust product for the heterogeneous replication of many terabytes of data, daily.
 7 | 
 8 | # pipelinewise-transform-field
 9 | 
10 | [![PyPI version](https://badge.fury.io/py/pipelinewise-transform-field.svg)](https://badge.fury.io/py/pipelinewise-transform-field)
11 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pipelinewise-transform-field.svg)](https://pypi.org/project/pipelinewise-transform-field/)
12 | [![License: Apache2](https://img.shields.io/badge/License-Apache2-yellow.svg)](https://opensource.org/licenses/Apache-2.0)
13 | 
14 | Transformation component between [Singer](https://www.singer.io/) taps and targets.
15 | 
16 | This is a [PipelineWise](https://transferwise.github.io/pipelinewise) compatible component.
17 | 
18 | ## How to use it
19 | 
20 | The recommended method of running this component is to use it from [PipelineWise](https://transferwise.github.io/pipelinewise). When running it from PipelineWise you don't need to configure this component with JSON files, and most things are automated.
21 | Please check the related documentation at [Transformations](https://transferwise.github.io/pipelinewise/user_guide/transformations.html)
22 | 
23 | If you want to run this [Singer](https://singer.io) compatible component independently, please read further.
24 | 
25 | ## Install
26 | 
27 | First, make sure Python 3 is installed on your system or follow these
28 | installation instructions for [Mac](http://docs.python-guide.org/en/latest/starting/install3/osx/) or
29 | [Ubuntu](https://www.digitalocean.com/community/tutorials/how-to-install-python-3-and-set-up-a-local-programming-environment-on-ubuntu-16-04).
30 | 
31 | It's recommended to use a virtualenv:
32 | 
33 | ```bash
34 | python3 -m venv venv
35 | . venv/bin/activate
36 | pip install pipelinewise-transform-field
37 | ```
38 | or
39 | 
40 | ```bash
41 | python3 -m venv venv
42 | . venv/bin/activate
43 | pip install --upgrade pip setuptools
44 | pip install .
45 | ```
46 | 
47 | ### To validate transformations
48 | 
49 | `transform-field --validate --config [config.json] --catalog [catalog.json]`
50 | 
51 | ### To run
52 | 
53 | Put it between a tap and a target with simple Unix pipes:
54 | 
55 | `some-singer-tap | transform-field --config [config.json] | some-singer-target`
56 | 
57 | It reads incoming messages from STDIN and uses `config.json` to transform incoming RECORD messages.
58 | 
59 | **Note**: To avoid version conflicts, run the tap, the transform and the target in separate virtual environments.
60 | 
61 | ### Transformation types
62 | 
63 | The following are the transformation types supported by _pipelinewise-transform-field_:
64 | 
65 | * **SET-NULL**: Transforms any input to NULL
66 | * **HASH**: Transforms string input to hash
67 | * **HASH-SKIP-FIRST-n**: Transforms string input to hash, skipping the first n characters, e.g. HASH-SKIP-FIRST-2
68 | * **MASK-DATE**: Replaces the month and day parts of date columns with 1st of Jan
69 | * **MASK-NUMBER**: Transforms any numeric value to zero
70 | * **MASK-HIDDEN**: Transforms any string to 'hidden'
71 | * **MASK-STRING-SKIP-ENDS-n**: Transforms string input to a masked version, skipping the first and last n characters, e.g. MASK-STRING-SKIP-ENDS-3
72 | 
73 | _PS_: 1 <= n <= 9
74 | 
75 | ### Conditional transformations
76 | 
77 | It is possible to transform a record's property based on some given condition(s); the transformation will only take place when all conditions are met.
78 | 
79 | A condition is a combination of:
80 | * column [required]: the column to inspect
81 | * operation [required]: the comparison type to use; the supported ones are `equals` and `regex_match`
82 | * value [required]: the column value to look for in records
83 | 
84 | **An equality condition on a column**
85 | ```json
86 | {
87 |   "column": "<column name>",
88 |   "equals": <value>
89 | }
90 | ```
91 | 
92 | **A regex condition on a column**
93 | ```json
94 | {
95 |   "column": "<column name>",
96 |   "regex_match": "<regex pattern>"
97 | }
98 | ```
99 | 
100 | **A condition on a property within a JSON-type column**
101 | ```json
102 | {
103 |   "column": "<column name>",
104 |   "field_path": "<path to the property>",
105 |   "equals": <value>
106 | }
107 | ```
108 | 
109 | ### Configuration
110 | 
111 | You need to define which columns have to be transformed by which method, and under which conditions the transformation should be applied.
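
To make this concrete, below is a minimal Python sketch of how one configuration entry could be applied to a single `RECORD` message. It is an illustration of the documented semantics only, not the package's actual implementation; the stream and column names are made up, and details such as the hash algorithm and the masking character are assumptions:

```python
import hashlib
import re


def conditions_met(record, conditions):
    """Return True when every condition in a `when` list matches the record."""
    for cond in conditions:
        value = record.get(cond['column'])
        # Optional drill-down into a JSON-type column, e.g. "metadata/comment"
        for key in cond.get('field_path', '').split('/'):
            if key and isinstance(value, dict):
                value = value.get(key)
        if 'equals' in cond and value != cond['equals']:
            return False
        if 'regex_match' in cond and not (
                isinstance(value, str) and re.search(cond['regex_match'], value)):
            return False
    return True


def transform_value(value, trans_type):
    """Illustrative versions of the documented transformation types."""
    if trans_type == 'SET-NULL':
        return None
    if trans_type == 'HASH':
        return hashlib.sha256(str(value).encode()).hexdigest()
    if trans_type.startswith('HASH-SKIP-FIRST-'):
        n = int(trans_type[-1])  # 1 <= n <= 9
        return value[:n] + hashlib.sha256(value[n:].encode()).hexdigest()
    if trans_type == 'MASK-NUMBER':
        return 0
    if trans_type == 'MASK-HIDDEN':
        return 'hidden'
    if trans_type.startswith('MASK-STRING-SKIP-ENDS-'):
        n = int(trans_type[-1])
        if len(value) <= 2 * n:
            return '*' * len(value)
        return value[:n] + '*' * (len(value) - 2 * n) + value[-n:]
    return value  # MASK-DATE (date arithmetic) omitted for brevity


# One entry from config.json, applied to one record:
entry = {'tap_stream_name': 'users', 'field_id': 'salt', 'type': 'HASH',
         'when': [{'column': 'string_column_1', 'equals': 'Property'}]}
record = {'salt': 'abc123', 'string_column_1': 'Property'}

if conditions_met(record, entry.get('when', [])):
    record[entry['field_id']] = transform_value(record[entry['field_id']], entry['type'])
print(record['salt'])  # prints a SHA-256 hex digest instead of the raw value
```

In the real component this logic runs for every incoming `RECORD` message on STDIN, and the config is re-validated whenever a new `SCHEMA` message arrives (see the 2.1.0 CHANGELOG entry).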
112 | 
113 | #### Basic transformation
114 | A basic transformation, where a field is transformed in every record of a stream, can be achieved with:
115 | ```json
116 | {
117 |   "tap_stream_name": "<stream ID>",
118 |   "field_id": "<column to transform>",
119 |   "type": "<transformation type>"
120 | }
121 | ```
122 | 
123 | #### Transformation within JSON
124 | 
125 | To transform one or more properties within a JSON-type field, you can make use of the `field_paths` property:
126 | 
127 | ```json
128 | {
129 |   "tap_stream_name": "<stream ID>",
130 |   "field_id": "<column to transform>",
131 |   "field_paths": ["xpath to property 1", "xpath to property 2"],
132 |   "type": "<transformation type>"
133 | }
134 | ```
135 | 
136 | #### Conditional Transformation
137 | 
138 | To apply a transformation conditionally, you can make use of the `when` property, which can have one or many conditions:
139 | 
140 | ```json
141 | {
142 |   "tap_stream_name": "<stream ID>",
143 |   "field_id": "<column to transform>",
144 |   "type": "<transformation type>",
145 |   "when": [
146 |     {"column": "string_col_1", "equals": "some value"},
147 |     {"column": "string_col_2", "regex_match": ".*PII.*"},
148 |     {"column": "numeric_col_1", "equals": 33},
149 |     {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive"}
150 |   ]
151 | }
152 | ```
153 | 
154 | **Sample config**
155 | [config.json](./sample_config.json)
156 | 
157 | (Tip: PipelineWise generates this for you from a more readable YAML format.)
158 | 
159 | 
160 | ### To check code style:
161 | 
162 | 1. Install python dependencies in a virtual env
163 | ```
164 | python3 -m venv venv
165 | . venv/bin/activate
166 | pip install --upgrade pip setuptools
167 | pip install .[test]
168 | ```
169 | 
170 | 2. Run pylint
171 | ```shell
172 | pylint transform_field
173 | ```
174 | 
175 | ### To run tests:
176 | 
177 | 1. Install python dependencies in a virtual env and run unit and integration tests
178 | ```
179 | python3 -m venv venv
180 | . venv/bin/activate
181 | pip install --upgrade pip setuptools
182 | pip install .[test]
183 | ```
184 | 
185 | 2. Run tests:
186 | 
187 | * Unit tests
188 | ```
189 | pytest -v tests/unit
190 | ```
191 | 
192 | * Integration tests
193 | ```
194 | pytest -v tests/integration
195 | ```
196 | 
197 | * All tests
198 | ```
199 | pytest -v tests
200 | ```
201 | 
202 | 
203 | 
204 | ## License
205 | 
206 | Apache License Version 2.0
207 | 
208 | See [LICENSE](LICENSE) to see the full text.
209 | 210 | -------------------------------------------------------------------------------- /sample_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "transformations": [ 3 | { 4 | "field_id": "password_hash", 5 | "tap_stream_name": "stream-id-sent-by-the-tap", 6 | "type": "MASK-HIDDEN" 7 | }, 8 | { 9 | "field_id": "salt", 10 | "tap_stream_name": "stream-id-sent-by-the-tap", 11 | "type": "HASH" 12 | }, 13 | { 14 | "field_id": "value", 15 | "tap_stream_name": "stream-id-sent-by-the-tap", 16 | "type": "SET-NULL", 17 | "when": [ 18 | {"column": "string_column_1", "equals": "Property" }, 19 | {"column": "numeric_column", "equals": 200 }, 20 | {"column": "string_column_2", "regex_match": "sensitive.*PII" }, 21 | {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive" } 22 | ] 23 | }, 24 | { 25 | "field_id": "metadata", 26 | "tap_stream_name": "stream-id-sent-by-the-tap", 27 | "type": "MASK-HIDDEN", 28 | "field_paths": ["user/address", "user/zip_code"] 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /sample_logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=stderr 6 | 7 | [formatters] 8 | keys=child 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=stderr 13 | formatter=child 14 | propagate=0 15 | 16 | [handler_stderr] 17 | level=INFO 18 | class=StreamHandler 19 | formatter=child 20 | args=(sys.stderr,) 21 | 22 | [formatter_child] 23 | class=logging.Formatter 24 | format=time=%(asctime)s name=%(name)s level=%(levelname)s message=%(message)s 25 | datefmt=%Y-%m-%d %H:%M:%S 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | with open("README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | setup(name='pipelinewise-transform-field', 9 | version='2.3.0', 10 | description='Singer.io simple field transformer between taps and targets - PipelineWise compatible', 11 | long_description=long_description, 12 | long_description_content_type='text/markdown', 13 | author="Wise", 14 | url='https://github.com/transferwise/pipelinewise-transform-field', 15 | classifiers=[ 16 | 'License :: OSI Approved :: Apache Software License', 17 | 'Environment :: Console', 18 | 'Programming Language :: Python :: 3 :: Only', 19 | 'Programming Language :: Python :: 3.6', 20 | 'Programming Language :: Python :: 3.7', 21 | 'Programming Language :: Python :: 3.8' 22 | ], 23 | py_modules=['transform_field'], 24 | install_requires=[ 25 | 'pipelinewise-singer-python==1.*', 26 | 'dpath==2.0.*', 27 | ], 28 | extras_require={ 29 | 'test': [ 30 | 'pytest==6.2.*', 31 | 'pytest-cov==3.0.*', 32 | 'pylint==2.12.*', 33 | ] 34 | }, 35 | entry_points=''' 36 | [console_scripts] 37 | transform-field=transform_field:main 38 | ''', 39 | packages=['transform_field'] 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/__init__.py -------------------------------------------------------------------------------- /tests/integration/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/resources/catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "streams": [ 3 | { 4 | "metadata": [ 5 | { 6 | "breadcrumb": [], 7 | "metadata": { 8 | "replication-method": "FULL_TABLE", 9 | "selected": true, 10 | "selected-by-default": false, 11 | "table-key-properties": [ 12 | "column_1" 13 | ] 14 | } 15 | } 16 | ], 17 | "schema": { 18 | "properties": { 19 | "column_1": { 20 | "format": "date-time", 21 | "inclusion": "available", 22 | "type": [ 23 | "null", 24 | "string" 25 | ] 26 | }, 27 | "column_2": { 28 | "inclusion": "automatic", 29 | "maximum": 2147483647, 30 | "minimum": -2147483648, 31 | "type": [ 32 | "null", 33 | "integer" 34 | ] 35 | }, 36 | "column_3": { 37 | "inclusion": "automatic", 38 | "maximum": 2147483647, 39 | "minimum": -2147483648, 40 | "type": [ 41 | "null", 42 | "integer" 43 | ] 44 | }, 45 | "column_4": { 46 | "inclusion": "automatic", 47 | "maximum": 2147483647, 48 | "minimum": -2147483648, 49 | "type": [ 50 | "null", 51 | "integer" 52 | ] 53 | }, 54 | "column_5": { 55 | "format": "date-time", 56 | "inclusion": "available", 57 | "type": [ 58 | "null", 59 | "string" 60 | ] 61 | } 62 | }, 63 | "type": "object" 64 | }, 65 | "tap_stream_id": "dummy_stream" 66 | } 67 | ] 68 | } -------------------------------------------------------------------------------- /tests/integration/resources/invalid_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "transformations":[ 3 | { 4 | "tap_stream_name":"dummy_stream", 5 | "field_id":"column_1", 6 | "type":"SET-NULL" 7 | }, 8 | { 9 | "tap_stream_name":"dummy_stream", 10 | "field_id":"column_2", 11 | "type":"HASH" 12 | }, 13 | { 14 | "tap_stream_name": "dummy_stream", 15 | "field_id": "column_5", 16 | "type": "MASK-DATE" 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/integration/resources/invalid_messages.json: -------------------------------------------------------------------------------- 1 | {"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one"}} 2 | {"type": "SCHEMA", "stream": "tap_mysql_test-test_table_one", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}}, "type": "object"}, "key_properties": ["c_pk"]} 3 | THIS IS A TEST INPUT FROM A TAP WITH A LINE WITH INVALID JSON 4 | {"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1} 5 | -------------------------------------------------------------------------------- /tests/integration/resources/messages.json: -------------------------------------------------------------------------------- 1 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream"}} 2 | {"type": "SCHEMA", "stream": "dummy_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": 
{"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_4": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "type": ["null", "integer"]}, "column_7": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_8": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_9": {"inclusion": "available", "type": ["null", "integer"]}, "column_10": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_11": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_12": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_13": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_14": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]} 3 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1} 4 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "Dummy row 1", "column_4": "Dummy row 1", "column_5": "2019-12-21T12:12:45", "column_6": 1234, "column_7": "Dummy row 1", "column_8": "2019-12-21T12:12:45", "column_9": 100, "column_10": "column_11 is safe to keep", "column_11": "My name is John", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 5 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "Dummy row 2", "column_4": "Dummy row 2", "column_5": "2019-12-21T13:12:45", "column_6": 1234, "column_7": "Dummy row 2", "column_8": "2019-12-21T13:12:45", "column_9": 200, "column_10": "column_11 has sensitive data. 
Needs to transform to NULL", "column_11": "SUPER_SECRET_PASSWORD", "column_12": "abcd", "column_13": "nom", "column_14": "maskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 6 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "Dummy row 3", "column_4": "Dummy row 3", "column_5": "2019-12-21T14:12:45", "column_6": 1234, "column_7": "Dummy row 3", "column_8": "2019-12-21T14:12:45", "column_9": 300, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 7 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "Dummy row 4", "column_4": "Dummy row 4", "column_5": "2019-12-21T15:12:45", "column_6": 1234, "column_7": "Dummy row 4", "column_8": "2019-12-21T15:12:45", "column_9": 400, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 8 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "Dummy row 5", "column_4": "Dummy row 5", "column_5": "2019-12-21T16:12:45", "column_6": 1234, "column_7": "Dummy row 5", "column_8": "2019-12-21T16:12:45", "column_9": 500, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 9 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream", "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}} 10 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1} 11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}} 12 | -------------------------------------------------------------------------------- /tests/integration/resources/streams_with_changing_schema.json: -------------------------------------------------------------------------------- 1 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "integer"]}}}, "key_properties": []} 2 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 1}} 3 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 2}} 4 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 3}} 5 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "string"]}}}, "key_properties": []} 6 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": "ABC"}} -------------------------------------------------------------------------------- /tests/integration/resources/streams_with_object.json: -------------------------------------------------------------------------------- 1 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream"}} 2 | {"type": "SCHEMA", "stream": "my_cool_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"format": "date-time", 
"inclusion": "available", "type": ["null", "string"]}, "column_4": {"inclusion": "available", "type": ["null", "integer"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]} 3 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1} 4 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "2019-12-21T12:12:45", "column_4": 1234, "column_5": "2021-12-21T12:12:45", "column_6": {"id": 50, "key1": "A", "key2": {"key2_2": 41}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 5 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "2019-12-21T13:12:45", "column_4": 4, "column_5": "2021-12-21T13:12:45", "column_6": {"id": 51, "key1": "B", "key2": {"key2_1": "ds"}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 6 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "2019-12-21T14:12:45", "column_4": 15, "column_5": "2021-12-21T14:12:45", "column_6": {"id": 52, "key1": "C", "key2": {"key2_1": "xv43dgf", "key2_2": 4544}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 7 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 4, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "2019-12-21T15:12:45", "column_4": 1000, "column_5": "2021-12-21T15:12:45", "column_6": {"id": 53, "key1": "D", "key2": {"key2_1": "43xvf", "key2_2": true}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 8 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "2019-12-21T16:12:45", "column_4": -44, "column_5": "2021-12-21T16:12:45", "column_6": {"id": 54, "key1": "E", "key2": {"key2_1": "trter", "key2_3": false}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"} 9 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream", "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}} 10 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1} 11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}} 12 | -------------------------------------------------------------------------------- /tests/integration/resources/valid_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "transformations":[ 3 | { 4 | "tap_stream_name":"dummy_stream", 5 | "field_id":"column_1", 6 | "type":"SET-NULL" 7 | }, 8 | { 9 | "tap_stream_name":"dummy_stream", 10 | "field_id":"column_2", 11 | "type":"MASK-NUMBER" 12 | }, 13 | { 14 | "tap_stream_name": "dummy_stream", 15 | "field_id": "column_5", 16 | "type": "MASK-DATE" 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /tests/integration/test_integrations.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | import os 4 | import sys 5 | import json 6 | import tempfile 7 | 8 | from transform_field import TransformField, TransformFieldException, InvalidTransformationException 9 | 10 | 11 | class 
Base(unittest.TestCase): 12 | def setUp(self): 13 | self.maxDiff = None 14 | 15 | sys.stdout = self._stdout = tempfile.NamedTemporaryFile('w+', delete=True) 16 | sys.stderr.write(self._stdout.name + ' ') 17 | 18 | def tearDown(self): 19 | self._stdout.close() 20 | sys.stdout = sys.__stdout__ 21 | 22 | @property 23 | def stdout(self): 24 | self._stdout.seek(0) 25 | return self._stdout.read()[:-1] # Remove trailing \n 26 | 27 | def get_tap_input_messages(self, filename): 28 | lines = [] 29 | with open('{}/resources/{}'.format(os.path.dirname(__file__), filename)) as tap_stdout: 30 | for line in tap_stdout.readlines(): 31 | lines.append(line) 32 | 33 | return lines 34 | 35 | def singer_output_to_objects(self, output): 36 | messages = [] 37 | for message in output.splitlines(): 38 | messages.append(json.loads(message)) 39 | 40 | return messages 41 | 42 | 43 | class TestIntegration(Base): 44 | 45 | def test_invalid_json(self): 46 | """Receiving invalid JSONs should raise an exception""" 47 | tap_lines = self.get_tap_input_messages('invalid_messages.json') 48 | trans_config = {'transformations': []} 49 | 50 | transform_field = TransformField(trans_config) 51 | with self.assertRaises(TransformFieldException): 52 | transform_field.consume(tap_lines) 53 | 54 | def test_multiple_singer_json_messages(self): 55 | """Test a bunch of singer messages with different field transformation types""" 56 | tap_lines = self.get_tap_input_messages('messages.json') 57 | 58 | # Set transformations on some columns 59 | trans_config = {'transformations': [ 60 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_1', 'type': 'SET-NULL'}, 61 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'HASH'}, 62 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_3', 'type': 'HASH-SKIP-FIRST-2'}, 63 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_4', 'type': 'HASH-SKIP-FIRST-3'}, 64 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_5', 'type': 'MASK-DATE'}, 65 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_6', 'type': 'MASK-NUMBER'}, 66 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_11', 'type': 'SET-NULL', 67 | 'when': [ 68 | {'column': 'column_7', 'equals': "Dummy row 2"}, 69 | {'column': 'column_9', 'equals': 200}, 70 | {'column': 'column_10', 'regex_match': 'sensitive'}, 71 | ] 72 | }, 73 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_12', 'type': 'MASK-HIDDEN'}, 74 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_13', 'type': 'MASK-STRING-SKIP-ENDS-2'}, 75 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_14', 'type': 'MASK-STRING-SKIP-ENDS-3'} 76 | ]} 77 | 78 | transform_field = TransformField(trans_config) 79 | transform_field.consume(tap_lines) 80 | 81 | singer_output_messages = self.singer_output_to_objects(self.stdout) 82 | 83 | # First message is the STATE message 84 | self.assertEqual( 85 | singer_output_messages[0], 86 | { 87 | 'type': 'STATE', 88 | 'value': {'currently_syncing': 'dummy_stream'} 89 | } 90 | ) 91 | 92 | # Second message is the SCHEMA message 93 | self.assertEqual( 94 | singer_output_messages[1], 95 | { 96 | 'type': 'SCHEMA', 97 | 'stream': 'dummy_stream', 98 | 'schema': { 99 | 'properties': { 100 | 'c_pk': {'inclusion': 'automatic', 'minimum': -2147483648, 'maximum': 2147483647, 101 | 'type': ['null', 'integer']}, 102 | 'column_1': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 103 | 'column_2': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 104 |
'column_3': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 105 | 'column_4': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 106 | 'column_5': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']}, 107 | 'column_6': {'inclusion': 'available', 'type': ['null', 'integer']}, 108 | 'column_7': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 109 | 'column_8': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']}, 110 | 'column_9': {'inclusion': 'available', 'type': ['null', 'integer']}, 111 | 'column_10': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']}, 112 | 'column_11': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']}, 113 | 'column_12': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']}, 114 | 'column_13': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 115 | 'column_14': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']}, 116 | }, 117 | 'type': 'object' 118 | }, 119 | 'key_properties': ['c_pk'] 120 | } 121 | ) 122 | 123 | # Third message is a RECORD message with transformed values 124 | self.assertEqual( 125 | singer_output_messages[2], 126 | { 127 | 'type': 'RECORD', 128 | 'stream': 'dummy_stream', 129 | 'record': { 130 | 'c_pk': 1, 131 | 'column_1': None, # should be SET-NULL transformed 132 | 'column_2': 'c584d22683f3e523df9a7396e7939c0da16af89976b613adfe4bcd4c9c526f32', 133 | # Should be HASH transformed 134 | 'column_3': 'Ducd571661edac8d47669a60b964c7124b228b69862cd21d548794af41c139a8e3', 135 | # Should be HASH-SKIP-FIRST-2 transformed 136 | 'column_4': 'Dum1fe9627d907b0a37a31b270cc0f660a7388eb470a2558e839e0c1f601aedfaa7', 137 | # Should be HASH-SKIP-FIRST-3 transformed 138 | 'column_5': '2019-01-01T12:12:45', # Should be MASK-DATE transformed 139 | 'column_6': 0, # Should be MASK-NUMBER transformed 140 | 'column_7': 'Dummy row 1', # Should be the original value - Unknown transformation type 141 | 'column_8': '2019-12-21T12:12:45', # Should be the original date-time value 142 | 'column_9': 100, # Should be the original number value 143 | 144 | # Conditional transformation 145 | 'column_10': 'column_11 is safe to keep', 146 | 'column_11': 'My name is John', 147 | 148 | 'column_12': 'hidden', 149 | 150 | # Should be MASK-STRING-SKIP-ENDS-2 transformed 151 | 'column_13': 'do****me', 152 | # Should be MASK-STRING-SKIP-ENDS-3 transformed 153 | 'column_14': 'dom**kme', 154 | }, 155 | 'version': 1, 156 | 'time_extracted': '2019-01-31T15:51:50.215998Z' 157 | } 158 | ) 159 | 160 | # Fourth message is a RECORD message with transformed values 161 | self.assertEqual( 162 | singer_output_messages[3], 163 | { 164 | 'type': 'RECORD', 165 | 'stream': 'dummy_stream', 166 | 'record': { 167 | 'c_pk': 2, 168 | 'column_1': None, # should be SET-NULL transformed 169 | 'column_2': '12c7ca803f4ae4044b8c3a6aa7dbaf9fe73a25e12f2258dbf8a832961ac6abab', 170 | # Should be HASH transformed 171 | 'column_3': 'Du7c2717bbc7489d36cea73c8519c815ce962142a5b32db413abe0bce7f58d943f', 172 | # Should be HASH-SKIP-FIRST-2 transformed 173 | 'column_4': 'Dum5b2be872199a84657234144caec9106483a522edd36783c7a12439bcf3853c56', 174 | # Should be HASH-SKIP-FIRST-3 transformed 175 | 'column_5': '2019-01-01T13:12:45', # Should be MASK-DATE transformed 176 | 'column_6': 0, # Should be MASK-NUMBER transformed 177 | 'column_7': 'Dummy row 2', # Should be the original value - Unknown transformation type 178 | 'column_8':
'2019-12-21T13:12:45', # Should be the original date-time value 179 | 'column_9': 200, # Should be the original number value 180 | 181 | # Conditional transformation 182 | 'column_10': 'column_11 has sensitive data. Needs to transform to NULL', 183 | 'column_11': None, # Should be SET-NULL transformed 184 | 185 | 'column_12': 'hidden', 186 | 187 | # Should be MASK-STRING-SKIP-ENDS-2 transformed 188 | 'column_13': '***', 189 | # Should be MASK-STRING-SKIP-ENDS-3 transformed 190 | 'column_14': '******', 191 | }, 192 | 'version': 1, 193 | 'time_extracted': '2019-01-31T15:51:50.215998Z' 194 | } 195 | ) 196 | 197 | def test_messages_with_changing_schema(self): 198 | """Test a bunch of singer messages where a column in schema message 199 | changes its type""" 200 | tap_lines = self.get_tap_input_messages('streams_with_changing_schema.json') 201 | 202 | # Set transformations on some columns 203 | trans_config = {'transformations': [ 204 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'MASK-NUMBER'}, 205 | ]} 206 | 207 | transform_field = TransformField(trans_config) 208 | 209 | with self.assertRaises(InvalidTransformationException): 210 | transform_field.consume(tap_lines) 211 | 212 | def test_validate_flag_with_invalid_transformations(self): 213 | config = '{}/resources/invalid_config.json'.format(os.path.dirname(__file__)) 214 | catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__)) 215 | 216 | result = subprocess.run([ 217 | 'transform-field', 218 | '--validate', 219 | '--config', config, 220 | '--catalog', catalog, 221 | ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 222 | 223 | with self.assertRaises(subprocess.CalledProcessError): 224 | result.check_returncode() 225 | 226 | def test_validate_flag_with_valid_transformations(self): 227 | 228 | config = '{}/resources/valid_config.json'.format(os.path.dirname(__file__)) 229 | catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__)) 230 | 231 | result = subprocess.run([ 232 | 'transform-field', 233 | '--validate', 234 | '--config', config, 235 | '--catalog', catalog, 236 | ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) 237 | 238 | self.assertIsNone(result.check_returncode()) 239 | 240 | def test_multiple_singer_json_messages_with_transformation_on_json(self): 241 | """Test a bunch of singer messages with transformation on json""" 242 | tap_lines = self.get_tap_input_messages('streams_with_object.json') 243 | 244 | # Set transformations on some columns 245 | trans_config = {'transformations': [ 246 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_1', 'type': 'SET-NULL'}, 247 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_2', 'type': 'MASK-HIDDEN'}, 248 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_3', 'type': 'MASK-DATE', 249 | 'when': [ 250 | {'column': 'c_pk', 'equals': 2}, 251 | {'column': 'column_6', 'field_path': 'key1', 'equals': 'B'} 252 | ] 253 | }, 254 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_4', 'type': 'MASK-NUMBER', 255 | 'when': [ 256 | {'column': 'column_4', 'equals': -44}, 257 | ] 258 | }, 259 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_6', 'type': 'SET-NULL', 260 | 'field_paths': ['key2/key2_2']}, 261 | ]} 262 | 263 | transform_field = TransformField(trans_config) 264 | transform_field.consume(tap_lines) 265 | 266 | records = [msg['record'] for msg in self.singer_output_to_objects(self.stdout) if msg['type'] == 'RECORD'] 267 | 268 | 
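# The expected records below cover: unconditional SET-NULL and MASK-HIDDEN, conditional MASK-DATE (only the c_pk == 2 record matches both 'when' conditions), conditional MASK-NUMBER (only column_4 == -44 matches), and SET-NULL applied via field_paths to key2/key2_2 inside the column_6 JSON wherever that key exists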
self.assertListEqual(records, [ 269 | { 270 | 'c_pk': 1, 271 | 'column_1': None, 272 | 'column_2': 'hidden', 273 | 'column_3': '2019-12-21T12:12:45', 274 | 'column_4': 1234, 275 | 'column_5': '2021-12-21T12:12:45', 276 | 'column_6': {'id': 50, 'key1': 'A', 'key2': {'key2_2': None}}, 277 | }, 278 | { 279 | 'c_pk': 2, 280 | 'column_1': None, 281 | 'column_2': 'hidden', 282 | 'column_3': '2019-01-01T13:12:45', 283 | 'column_4': 4, 284 | 'column_5': '2021-12-21T13:12:45', 285 | 'column_6': {'id': 51, 'key1': 'B', 'key2': {'key2_1': 'ds'}}, 286 | }, 287 | { 288 | 'c_pk': 3, 289 | 'column_1': None, 290 | 'column_2': 'hidden', 291 | 'column_3': '2019-12-21T14:12:45', 292 | 'column_4': 15, 293 | 'column_5': '2021-12-21T14:12:45', 294 | 'column_6': {'id': 52, 'key1': 'C', 'key2': {'key2_1': 'xv43dgf', 'key2_2': None}}, 295 | }, 296 | { 297 | 'c_pk': 4, 298 | 'column_1': None, 299 | 'column_2': 'hidden', 300 | 'column_3': '2019-12-21T15:12:45', 301 | 'column_4': 1000, 302 | 'column_5': '2021-12-21T15:12:45', 303 | 'column_6': {'id': 53, 'key1': 'D', 'key2': {'key2_1': '43xvf', 'key2_2': None}}, 304 | }, 305 | { 306 | 'c_pk': 5, 307 | 'column_1': None, 308 | 'column_2': 'hidden', 309 | 'column_3': '2019-12-21T16:12:45', 310 | 'column_4': 0, 311 | 'column_5': '2021-12-21T16:12:45', 312 | 'column_6': {'id': 54, 'key1': 'E', 'key2': {'key2_1': 'trter', 'key2_3': False}}, 313 | }, 314 | ]) 315 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_init.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from singer import Catalog, Schema 5 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, NoStreamSchemaException, \ 6 | UnsupportedTransformationTypeException, InvalidTransformationException 7 | 8 | from transform_field import TransformField, TransMeta 9 | 10 | 11 | class TestTransformField(unittest.TestCase): 12 | """ 13 | Unit Tests for the TransformField class 14 | """ 15 | 16 | def setUp(self) -> None: 17 | self.config = { 18 | 'transformations': [ 19 | { 20 | "tap_stream_name": "stream_1", 21 | "field_id": "column_1", 22 | "type": "SET-NULL" 23 | }, 24 | { 25 | "tap_stream_name": "stream_1", 26 | "field_id": "column_2", 27 | "type": "HASH", 28 | "when": [] 29 | }, 30 | { 31 | "tap_stream_name": "stream_2", 32 | "field_id": "column_1", 33 | "type": "MASK-DATE" 34 | }, 35 | ] 36 | } 37 | 38 | def test_init(self): 39 | instance = TransformField(self.config) 40 | 41 | self.assertListEqual(instance.messages, []) 42 | self.assertEqual(instance.buffer_size_bytes, 0) 43 | self.assertIsNone(instance.state) 44 | self.assertIsNotNone(instance.time_last_batch_sent) 45 | self.assertDictEqual(instance.trans_config, self.config) 46 | self.assertDictEqual(instance.stream_meta, {}) 47 | self.assertDictEqual(instance.trans_meta, { 48 | 'stream_1': [ 49 | TransMeta('column_1', 'SET-NULL', None, None), 50 | TransMeta('column_2', 'HASH', [], None), 51 | ], 52 | 'stream_2': [TransMeta('column_1', 'MASK-DATE', None, None)], 53 | }) 54 | 55 | def test_validate_without_catalog_fails(self): 56 | with 
self.assertRaises(CatalogRequiredException): 57 | TransformField(self.config).validate(None) 58 | 59 | @patch('transform_field.utils.get_stream_schemas') 60 | def test_validate_with_missing_stream_fails(self, get_stream_schemas_mock): 61 | catalog = Catalog.from_dict({'streams': []}) 62 | 63 | get_stream_schemas_mock.return_value = { 64 | 'stream_2': {'something'} 65 | } 66 | with self.assertRaises(StreamNotFoundException): 67 | TransformField(self.config).validate(catalog) 68 | 69 | @patch('transform_field.utils.get_stream_schemas') 70 | def test_validate_with_empty_stream_schema_fails(self, get_stream_schemas_mock): 71 | catalog = Catalog.from_dict({'streams': []}) 72 | 73 | get_stream_schemas_mock.return_value = { 74 | 'stream_1': {}, 75 | 'stream_2': {'something'} 76 | } 77 | with self.assertRaises(NoStreamSchemaException): 78 | TransformField(self.config).validate(catalog) 79 | 80 | @patch('transform_field.utils.get_stream_schemas') 81 | def test_validate_with_unsupported_trans_type(self, get_stream_schemas_mock): 82 | config = { 83 | 'transformations': [ 84 | { 85 | "tap_stream_name": "stream_1", 86 | "field_id": "column_1", 87 | "type": "SET-RANDOM" 88 | }, 89 | ] 90 | } 91 | 92 | catalog = Catalog.from_dict({'streams': []}) 93 | 94 | get_stream_schemas_mock.return_value = { 95 | 'stream_1': Schema.from_dict({'properties': { 96 | 'column_1': { 97 | 'type': [ 98 | 'string' 99 | ] 100 | } 101 | }}) 102 | } 103 | with self.assertRaises(UnsupportedTransformationTypeException): 104 | TransformField(config).validate(catalog) 105 | 106 | @patch('transform_field.utils.get_stream_schemas') 107 | def test_validate_with_set_null_trans_type_success(self, get_stream_schemas_mock): 108 | config = { 109 | 'transformations': [ 110 | { 111 | "tap_stream_name": "stream_1", 112 | "field_id": "column_1", 113 | "type": "SET-NULL" 114 | }, 115 | ] 116 | } 117 | 118 | catalog = Catalog.from_dict({'streams': []}) 119 | 120 | get_stream_schemas_mock.return_value = { 121 | 'stream_1': Schema.from_dict({'properties': { 122 | 'column_1': { 123 | 'type': [ 124 | 'string' 125 | ] 126 | } 127 | }}) 128 | } 129 | TransformField(config).validate(catalog) 130 | 131 | @patch('transform_field.utils.get_stream_schemas') 132 | def test_validate_with_hash_fails_1(self, get_stream_schemas_mock): 133 | """ 134 | Testing validation of HASH transformation when field has no type 135 | """ 136 | config = { 137 | 'transformations': [ 138 | { 139 | "tap_stream_name": "stream_1", 140 | "field_id": "column_1", 141 | "type": "HASH" 142 | }, 143 | ] 144 | } 145 | 146 | catalog = Catalog.from_dict({'streams': []}) 147 | 148 | get_stream_schemas_mock.return_value = { 149 | 'stream_1': Schema.from_dict({'properties': { 150 | 'column_1': {} 151 | }}) 152 | } 153 | with self.assertRaises(InvalidTransformationException): 154 | TransformField(config).validate(catalog) 155 | 156 | @patch('transform_field.utils.get_stream_schemas') 157 | def test_validate_with_hash_fails_2(self, get_stream_schemas_mock): 158 | """ 159 | Testing validation of HASH transformation when field has non-string type 160 | """ 161 | config = { 162 | 'transformations': [ 163 | { 164 | "tap_stream_name": "stream_1", 165 | "field_id": "column_1", 166 | "type": "HASH" 167 | }, 168 | ] 169 | } 170 | 171 | catalog = Catalog.from_dict({'streams': []}) 172 | 173 | get_stream_schemas_mock.return_value = { 174 | 'stream_1': Schema.from_dict({'properties': { 175 | 'column_1': { 176 | 'type': [ 177 | 'null', 178 | 'integer' 179 | ] 180 | } 181 | }}) 182 | } 183 | with 
self.assertRaises(InvalidTransformationException): 184 | TransformField(config).validate(catalog) 185 | 186 | @patch('transform_field.utils.get_stream_schemas') 187 | def test_validate_with_hash_fails_3(self, get_stream_schemas_mock): 188 | """ 189 | Testing validation of HASH transformation when field has string type but formatted 190 | """ 191 | config = { 192 | 'transformations': [ 193 | { 194 | "tap_stream_name": "stream_1", 195 | "field_id": "column_1", 196 | "type": "HASH" 197 | }, 198 | ] 199 | } 200 | 201 | catalog = Catalog.from_dict({'streams': []}) 202 | 203 | get_stream_schemas_mock.return_value = { 204 | 'stream_1': Schema.from_dict({'properties': { 205 | 'column_1': { 206 | 'type': [ 207 | 'null', 208 | 'string' 209 | ], 210 | 'format': 'binary' 211 | } 212 | }}) 213 | } 214 | with self.assertRaises(InvalidTransformationException): 215 | TransformField(config).validate(catalog) 216 | 217 | @patch('transform_field.utils.get_stream_schemas') 218 | def test_validate_with_hash_success(self, get_stream_schemas_mock): 219 | """ 220 | Testing validation of HASH transformation when field has string type but no format 221 | """ 222 | config = { 223 | 'transformations': [ 224 | { 225 | "tap_stream_name": "stream_1", 226 | "field_id": "column_1", 227 | "type": "HASH" 228 | }, 229 | ] 230 | } 231 | 232 | catalog = Catalog.from_dict({'streams': []}) 233 | 234 | get_stream_schemas_mock.return_value = { 235 | 'stream_1': Schema.from_dict({'properties': { 236 | 'column_1': { 237 | 'type': [ 238 | 'null', 239 | 'string' 240 | ] 241 | } 242 | }}) 243 | } 244 | TransformField(config).validate(catalog) 245 | 246 | @patch('transform_field.utils.get_stream_schemas') 247 | def test_validate_with_hash_skip_first_fails_1(self, get_stream_schemas_mock): 248 | """ 249 | Testing validation of HASH-SKIP-FIRST transformation when field has no type 250 | """ 251 | config = { 252 | 'transformations': [ 253 | { 254 | "tap_stream_name": "stream_1", 255 | "field_id": "column_1", 256 | "type": "HASH-SKIP-FIRST-1" 257 | }, 258 | ] 259 | } 260 | 261 | catalog = Catalog.from_dict({'streams': []}) 262 | 263 | get_stream_schemas_mock.return_value = { 264 | 'stream_1': Schema.from_dict({'properties': { 265 | 'column_1': {} 266 | }}) 267 | } 268 | with self.assertRaises(InvalidTransformationException): 269 | TransformField(config).validate(catalog) 270 | 271 | @patch('transform_field.utils.get_stream_schemas') 272 | def test_validate_with_hash_skip_first_fails_2(self, get_stream_schemas_mock): 273 | """ 274 | Testing validation of HASH-SKIP-FIRST transformation when field has non-string type 275 | """ 276 | config = { 277 | 'transformations': [ 278 | { 279 | "tap_stream_name": "stream_1", 280 | "field_id": "column_1", 281 | "type": "HASH-SKIP-FIRST-1" 282 | }, 283 | ] 284 | } 285 | 286 | catalog = Catalog.from_dict({'streams': []}) 287 | 288 | get_stream_schemas_mock.return_value = { 289 | 'stream_1': Schema.from_dict({'properties': { 290 | 'column_1': { 291 | 'type': [ 292 | 'null', 293 | 'integer' 294 | ] 295 | } 296 | }}) 297 | } 298 | with self.assertRaises(InvalidTransformationException): 299 | TransformField(config).validate(catalog) 300 | 301 | @patch('transform_field.utils.get_stream_schemas') 302 | def test_validate_with_hash_skip_first_fails_3(self, get_stream_schemas_mock): 303 | """ 304 | Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type but formatted 305 | """ 306 | config = { 307 | 'transformations': [ 308 | { 309 | "tap_stream_name": "stream_1", 310 | "field_id": 
"column_1", 311 | "type": "HASH-SKIP-FIRST-1" 312 | }, 313 | ] 314 | } 315 | 316 | catalog = Catalog.from_dict({'streams': []}) 317 | 318 | get_stream_schemas_mock.return_value = { 319 | 'stream_1': Schema.from_dict({'properties': { 320 | 'column_1': { 321 | 'type': [ 322 | 'null', 323 | 'string' 324 | ], 325 | 'format': 'binary' 326 | } 327 | }}) 328 | } 329 | with self.assertRaises(InvalidTransformationException): 330 | TransformField(config).validate(catalog) 331 | 332 | @patch('transform_field.utils.get_stream_schemas') 333 | def test_validate_with_hash_skip_first_success(self, get_stream_schemas_mock): 334 | """ 335 | Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type but not formatted 336 | """ 337 | config = { 338 | 'transformations': [ 339 | { 340 | "tap_stream_name": "stream_1", 341 | "field_id": "column_1", 342 | "type": "HASH-SKIP-FIRST-1" 343 | }, 344 | ] 345 | } 346 | 347 | catalog = Catalog.from_dict({'streams': []}) 348 | 349 | get_stream_schemas_mock.return_value = { 350 | 'stream_1': Schema.from_dict({'properties': { 351 | 'column_1': { 352 | 'type': [ 353 | 'null', 354 | 'string' 355 | ] 356 | } 357 | }}) 358 | } 359 | TransformField(config).validate(catalog) 360 | 361 | @patch('transform_field.utils.get_stream_schemas') 362 | def test_validate_with_mask_hidden_fails_1(self, get_stream_schemas_mock): 363 | """ 364 | Testing validation of MASK-HIDDEN transformation when field has no type 365 | """ 366 | config = { 367 | 'transformations': [ 368 | { 369 | "tap_stream_name": "stream_1", 370 | "field_id": "column_1", 371 | "type": "MASK-HIDDEN" 372 | }, 373 | ] 374 | } 375 | 376 | catalog = Catalog.from_dict({'streams': []}) 377 | 378 | get_stream_schemas_mock.return_value = { 379 | 'stream_1': Schema.from_dict({'properties': { 380 | 'column_1': {} 381 | }}) 382 | } 383 | with self.assertRaises(InvalidTransformationException): 384 | TransformField(config).validate(catalog) 385 | 386 | @patch('transform_field.utils.get_stream_schemas') 387 | def test_validate_with_mask_hidden_fails_2(self, get_stream_schemas_mock): 388 | """ 389 | Testing validation of MASK-HIDDEN transformation when field has non-string type 390 | """ 391 | config = { 392 | 'transformations': [ 393 | { 394 | "tap_stream_name": "stream_1", 395 | "field_id": "column_1", 396 | "type": "MASK-HIDDEN" 397 | }, 398 | ] 399 | } 400 | 401 | catalog = Catalog.from_dict({'streams': []}) 402 | 403 | get_stream_schemas_mock.return_value = { 404 | 'stream_1': Schema.from_dict({'properties': { 405 | 'column_1': { 406 | 'type': [ 407 | 'null', 408 | 'integer' 409 | ] 410 | } 411 | }}) 412 | } 413 | with self.assertRaises(InvalidTransformationException): 414 | TransformField(config).validate(catalog) 415 | 416 | @patch('transform_field.utils.get_stream_schemas') 417 | def test_validate_with_mask_hidden_fails_3(self, get_stream_schemas_mock): 418 | """ 419 | Testing validation of MASK-HIDDEN transformation when field has string type but formatted 420 | """ 421 | config = { 422 | 'transformations': [ 423 | { 424 | "tap_stream_name": "stream_1", 425 | "field_id": "column_1", 426 | "type": "MASK-HIDDEN" 427 | }, 428 | ] 429 | } 430 | 431 | catalog = Catalog.from_dict({'streams': []}) 432 | 433 | get_stream_schemas_mock.return_value = { 434 | 'stream_1': Schema.from_dict({'properties': { 435 | 'column_1': { 436 | 'type': [ 437 | 'null', 438 | 'string' 439 | ], 440 | 'format': 'binary' 441 | } 442 | }}) 443 | } 444 | with self.assertRaises(InvalidTransformationException): 445 | 
TransformField(config).validate(catalog) 446 | 447 | @patch('transform_field.utils.get_stream_schemas') 448 | def test_validate_with_mask_hidden_success(self, get_stream_schemas_mock): 449 | """ 450 | Testing validation of MASK-HIDDEN transformation when field has string type but not formatted 451 | """ 452 | config = { 453 | 'transformations': [ 454 | { 455 | "tap_stream_name": "stream_1", 456 | "field_id": "column_1", 457 | "type": "MASK-HIDDEN" 458 | }, 459 | ] 460 | } 461 | 462 | catalog = Catalog.from_dict({'streams': []}) 463 | 464 | get_stream_schemas_mock.return_value = { 465 | 'stream_1': Schema.from_dict({'properties': { 466 | 'column_1': { 467 | 'type': [ 468 | 'null', 469 | 'string' 470 | ] 471 | } 472 | }}) 473 | } 474 | TransformField(config).validate(catalog) 475 | 476 | @patch('transform_field.utils.get_stream_schemas') 477 | def test_validate_with_mask_date_fails_1(self, get_stream_schemas_mock): 478 | """ 479 | Testing validation of MASK-DATE transformation when field has no type 480 | """ 481 | config = { 482 | 'transformations': [ 483 | { 484 | "tap_stream_name": "stream_1", 485 | "field_id": "column_1", 486 | "type": "MASK-DATE" 487 | }, 488 | ] 489 | } 490 | 491 | catalog = Catalog.from_dict({'streams': []}) 492 | 493 | get_stream_schemas_mock.return_value = { 494 | 'stream_1': Schema.from_dict({'properties': { 495 | 'column_1': {} 496 | }}) 497 | } 498 | with self.assertRaises(InvalidTransformationException): 499 | TransformField(config).validate(catalog) 500 | 501 | @patch('transform_field.utils.get_stream_schemas') 502 | def test_validate_with_mask_date_fails_2(self, get_stream_schemas_mock): 503 | """ 504 | Testing validation of MASK-DATE transformation when field has string type but no format 505 | """ 506 | config = { 507 | 'transformations': [ 508 | { 509 | "tap_stream_name": "stream_1", 510 | "field_id": "column_1", 511 | "type": "MASK-DATE" 512 | }, 513 | ] 514 | } 515 | 516 | catalog = Catalog.from_dict({'streams': []}) 517 | 518 | get_stream_schemas_mock.return_value = { 519 | 'stream_1': Schema.from_dict({'properties': { 520 | 'column_1': { 521 | 'type': [ 522 | 'null', 523 | 'string' 524 | ] 525 | } 526 | }}) 527 | } 528 | with self.assertRaises(InvalidTransformationException): 529 | TransformField(config).validate(catalog) 530 | 531 | @patch('transform_field.utils.get_stream_schemas') 532 | def test_validate_with_mask_date_fails_3(self, get_stream_schemas_mock): 533 | """ 534 | Testing validation of MASK-DATE transformation when field has non-string type 535 | """ 536 | config = { 537 | 'transformations': [ 538 | { 539 | "tap_stream_name": "stream_1", 540 | "field_id": "column_1", 541 | "type": "MASK-DATE" 542 | }, 543 | ] 544 | } 545 | 546 | catalog = Catalog.from_dict({'streams': []}) 547 | 548 | get_stream_schemas_mock.return_value = { 549 | 'stream_1': Schema.from_dict({'properties': { 550 | 'column_1': { 551 | 'type': [ 552 | 'null', 553 | 'integer' 554 | ] 555 | } 556 | }}) 557 | } 558 | with self.assertRaises(InvalidTransformationException): 559 | TransformField(config).validate(catalog) 560 | 561 | @patch('transform_field.utils.get_stream_schemas') 562 | def test_validate_with_mask_date_fails_4(self, get_stream_schemas_mock): 563 | """ 564 | Testing validation of MASK-DATE transformation when field has string type but not date formatted 565 | """ 566 | config = { 567 | 'transformations': [ 568 | { 569 | "tap_stream_name": "stream_1", 570 | "field_id": "column_1", 571 | "type": "MASK-DATE" 572 | }, 573 | ] 574 | } 575 | 576 | catalog = 
Catalog.from_dict({'streams': []}) 577 | 578 | get_stream_schemas_mock.return_value = { 579 | 'stream_1': Schema.from_dict({'properties': { 580 | 'column_1': { 581 | 'type': [ 582 | 'null', 583 | 'string' 584 | ], 585 | 'format': 'binary' 586 | } 587 | }}) 588 | } 589 | with self.assertRaises(InvalidTransformationException): 590 | TransformField(config).validate(catalog) 591 | 592 | @patch('transform_field.utils.get_stream_schemas') 593 | def test_validate_with_mask_date_success_1(self, get_stream_schemas_mock): 594 | """ 595 | Testing validation of MASK-DATE transformation when field has string type but is date formatted 596 | """ 597 | config = { 598 | 'transformations': [ 599 | { 600 | "tap_stream_name": "stream_1", 601 | "field_id": "column_1", 602 | "type": "MASK-DATE" 603 | }, 604 | ] 605 | } 606 | 607 | catalog = Catalog.from_dict({'streams': []}) 608 | 609 | get_stream_schemas_mock.return_value = { 610 | 'stream_1': Schema.from_dict({'properties': { 611 | 'column_1': { 612 | 'type': [ 613 | 'null', 614 | 'string' 615 | ], 616 | 'format': 'date' 617 | } 618 | }}) 619 | } 620 | TransformField(config).validate(catalog) 621 | 622 | @patch('transform_field.utils.get_stream_schemas') 623 | def test_validate_with_mask_date_success_2(self, get_stream_schemas_mock): 624 | """ 625 | Testing validation of MASK-DATE transformation when field has string type but is date-time formatted 626 | """ 627 | config = { 628 | 'transformations': [ 629 | { 630 | "tap_stream_name": "stream_1", 631 | "field_id": "column_1", 632 | "type": "MASK-DATE" 633 | }, 634 | ] 635 | } 636 | 637 | catalog = Catalog.from_dict({'streams': []}) 638 | 639 | get_stream_schemas_mock.return_value = { 640 | 'stream_1': Schema.from_dict({'properties': { 641 | 'column_1': { 642 | 'type': [ 643 | 'null', 644 | 'string' 645 | ], 646 | 'format': 'date-time' 647 | } 648 | }}) 649 | } 650 | TransformField(config).validate(catalog) 651 | 652 | @patch('transform_field.utils.get_stream_schemas') 653 | def test_validate_with_mask_number_fails_1(self, get_stream_schemas_mock): 654 | """ 655 | Testing validation of MASK-NUMBER transformation when field has no type 656 | """ 657 | config = { 658 | 'transformations': [ 659 | { 660 | "tap_stream_name": "stream_1", 661 | "field_id": "column_1", 662 | "type": "MASK-NUMBER" 663 | }, 664 | ] 665 | } 666 | 667 | catalog = Catalog.from_dict({'streams': []}) 668 | 669 | get_stream_schemas_mock.return_value = { 670 | 'stream_1': Schema.from_dict({'properties': { 671 | 'column_1': {} 672 | }}) 673 | } 674 | with self.assertRaises(InvalidTransformationException): 675 | TransformField(config).validate(catalog) 676 | 677 | @patch('transform_field.utils.get_stream_schemas') 678 | def test_validate_with_mask_number_fails_2(self, get_stream_schemas_mock): 679 | """ 680 | Testing validation of MASK-NUMBER transformation when field has neither integer nor number type 681 | """ 682 | config = { 683 | 'transformations': [ 684 | { 685 | "tap_stream_name": "stream_1", 686 | "field_id": "column_1", 687 | "type": "MASK-NUMBER" 688 | }, 689 | ] 690 | } 691 | 692 | catalog = Catalog.from_dict({'streams': []}) 693 | 694 | get_stream_schemas_mock.return_value = { 695 | 'stream_1': Schema.from_dict({'properties': { 696 | 'column_1': { 697 | 'type': [ 698 | 'null', 699 | 'string' 700 | ] 701 | } 702 | }}) 703 | } 704 | with self.assertRaises(InvalidTransformationException): 705 | TransformField(config).validate(catalog) 706 | 707 | @patch('transform_field.utils.get_stream_schemas') 708 | def
test_validate_with_mask_number_fails_3(self, get_stream_schemas_mock): 709 | """ 710 | Testing validation of MASK-NUMBER transformation when field has integer type but formatted 711 | """ 712 | config = { 713 | 'transformations': [ 714 | { 715 | "tap_stream_name": "stream_1", 716 | "field_id": "column_1", 717 | "type": "MASK-NUMBER" 718 | }, 719 | ] 720 | } 721 | 722 | catalog = Catalog.from_dict({'streams': []}) 723 | 724 | get_stream_schemas_mock.return_value = { 725 | 'stream_1': Schema.from_dict({'properties': { 726 | 'column_1': { 727 | 'type': [ 728 | 'null', 729 | 'integer' 730 | ], 731 | 'format': 'something random' 732 | } 733 | }}) 734 | } 735 | with self.assertRaises(InvalidTransformationException): 736 | TransformField(config).validate(catalog) 737 | 738 | @patch('transform_field.utils.get_stream_schemas') 739 | def test_validate_with_mask_number_fails_4(self, get_stream_schemas_mock): 740 | """ 741 | Testing validation of MASK-NUMBER transformation when field has number type but formatted 742 | """ 743 | config = { 744 | 'transformations': [ 745 | { 746 | "tap_stream_name": "stream_1", 747 | "field_id": "column_1", 748 | "type": "MASK-NUMBER" 749 | }, 750 | ] 751 | } 752 | 753 | catalog = Catalog.from_dict({'streams': []}) 754 | 755 | get_stream_schemas_mock.return_value = { 756 | 'stream_1': Schema.from_dict({'properties': { 757 | 'column_1': { 758 | 'type': [ 759 | 'null', 760 | 'number' 761 | ], 762 | 'format': 'binary' 763 | } 764 | }}) 765 | } 766 | with self.assertRaises(InvalidTransformationException): 767 | TransformField(config).validate(catalog) 768 | 769 | @patch('transform_field.utils.get_stream_schemas') 770 | def test_validate_with_mask_number_success_1(self, get_stream_schemas_mock): 771 | """ 772 | Testing validation of MASK-NUMBER transformation when field has integer type 773 | """ 774 | config = { 775 | 'transformations': [ 776 | { 777 | "tap_stream_name": "stream_1", 778 | "field_id": "column_1", 779 | "type": "MASK-NUMBER" 780 | }, 781 | ] 782 | } 783 | 784 | catalog = Catalog.from_dict({'streams': []}) 785 | 786 | get_stream_schemas_mock.return_value = { 787 | 'stream_1': Schema.from_dict({'properties': { 788 | 'column_1': { 789 | 'type': [ 790 | 'null', 791 | 'integer' 792 | ] 793 | } 794 | }}) 795 | } 796 | TransformField(config).validate(catalog) 797 | 798 | @patch('transform_field.utils.get_stream_schemas') 799 | def test_validate_with_mask_number_success_2(self, get_stream_schemas_mock): 800 | """ 801 | Testing validation of MASK-NUMBER transformation when field has number type 802 | """ 803 | config = { 804 | 'transformations': [ 805 | { 806 | "tap_stream_name": "stream_1", 807 | "field_id": "column_1", 808 | "type": "MASK-NUMBER" 809 | }, 810 | ] 811 | } 812 | 813 | catalog = Catalog.from_dict({'streams': []}) 814 | 815 | get_stream_schemas_mock.return_value = { 816 | 'stream_1': Schema.from_dict({'properties': { 817 | 'column_1': { 818 | 'type': [ 819 | 'null', 820 | 'number' 821 | ] 822 | } 823 | }}) 824 | } 825 | TransformField(config).validate(catalog) 826 | 827 | @patch('transform_field.utils.get_stream_schemas') 828 | def test_validate_with_mask_string_skip_ends_fails_1(self, get_stream_schemas_mock): 829 | """ 830 | Testing validation of MASK-STRING-SKIP-ENDS transformation when field has no type 831 | """ 832 | config = { 833 | 'transformations': [ 834 | { 835 | "tap_stream_name": "stream_1", 836 | "field_id": "column_1", 837 | "type": "MASK-STRING-SKIP-ENDS-1" 838 | }, 839 | ] 840 | } 841 | 842 | catalog = Catalog.from_dict({'streams': []})
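# An empty catalog is enough in these tests: utils.get_stream_schemas is patched, so validate() only ever sees the mocked schemas below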
843 | 844 | get_stream_schemas_mock.return_value = { 845 | 'stream_1': Schema.from_dict({'properties': { 846 | 'column_1': {} 847 | }}) 848 | } 849 | with self.assertRaises(InvalidTransformationException): 850 | TransformField(config).validate(catalog) 851 | 852 | @patch('transform_field.utils.get_stream_schemas') 853 | def test_validate_with_mask_string_skip_ends_fails_2(self, get_stream_schemas_mock): 854 | """ 855 | Testing validation of MASK-STRING-SKIP-ENDS transformation when field has non-string type 856 | """ 857 | config = { 858 | 'transformations': [ 859 | { 860 | "tap_stream_name": "stream_1", 861 | "field_id": "column_1", 862 | "type": "MASK-STRING-SKIP-ENDS-1" 863 | }, 864 | ] 865 | } 866 | 867 | catalog = Catalog.from_dict({'streams': []}) 868 | 869 | get_stream_schemas_mock.return_value = { 870 | 'stream_1': Schema.from_dict({'properties': { 871 | 'column_1': { 872 | 'type': [ 873 | 'null', 874 | 'integer' 875 | ] 876 | } 877 | }}) 878 | } 879 | with self.assertRaises(InvalidTransformationException): 880 | TransformField(config).validate(catalog) 881 | 882 | @patch('transform_field.utils.get_stream_schemas') 883 | def test_validate_with_mask_string_skip_ends_fails_3(self, get_stream_schemas_mock): 884 | """ 885 | Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type but formatted 886 | """ 887 | config = { 888 | 'transformations': [ 889 | { 890 | "tap_stream_name": "stream_1", 891 | "field_id": "column_1", 892 | "type": "MASK-STRING-SKIP-ENDS-1" 893 | }, 894 | ] 895 | } 896 | 897 | catalog = Catalog.from_dict({'streams': []}) 898 | 899 | get_stream_schemas_mock.return_value = { 900 | 'stream_1': Schema.from_dict({'properties': { 901 | 'column_1': { 902 | 'type': [ 903 | 'null', 904 | 'string' 905 | ], 906 | 'format': 'binary' 907 | } 908 | }}) 909 | } 910 | with self.assertRaises(InvalidTransformationException): 911 | TransformField(config).validate(catalog) 912 | 913 | @patch('transform_field.utils.get_stream_schemas') 914 | def test_validate_with_mask_string_skip_ends_success(self, get_stream_schemas_mock): 915 | """ 916 | Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type but not formatted 917 | """ 918 | config = { 919 | 'transformations': [ 920 | { 921 | "tap_stream_name": "stream_1", 922 | "field_id": "column_1", 923 | "type": "MASK-STRING-SKIP-ENDS-1" 924 | }, 925 | ] 926 | } 927 | 928 | catalog = Catalog.from_dict({'streams': []}) 929 | 930 | get_stream_schemas_mock.return_value = { 931 | 'stream_1': Schema.from_dict({'properties': { 932 | 'column_1': { 933 | 'type': [ 934 | 'null', 935 | 'string' 936 | ] 937 | } 938 | }}) 939 | } 940 | TransformField(config).validate(catalog) 941 | -------------------------------------------------------------------------------- /tests/unit/test_transform.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import hashlib 3 | 4 | from transform_field import transform 5 | 6 | 7 | class TestTransform(unittest.TestCase): 8 | """ 9 | Unit Tests for the transform module 10 | """ 11 | 12 | def setUp(self) -> None: 13 | self.config = {} 14 | 15 | def test_set_null(self): 16 | """TEST SET-NULL transformation""" 17 | self.assertEqual( 18 | transform.do_transform({"col_1": "John"}, "col_1", "SET-NULL"), 19 | None 20 | ) 21 | 22 | def test_hash(self): 23 | """Test HASH transformation""" 24 | self.assertEqual( 25 | transform.do_transform({"col_1": "John"}, "col_1", "HASH"), 26 | 
hashlib.sha256("John".encode('utf-8')).hexdigest() 27 | ) 28 | 29 | def test_mask_date(self): 30 | """Test MASK-DATE transformation""" 31 | self.assertEqual( 32 | transform.do_transform({"col_1": "2019-05-21"}, "col_1", "MASK-DATE"), 33 | "2019-01-01T00:00:00" 34 | ) 35 | 36 | # Mask date should keep the time elements 37 | self.assertEqual( 38 | transform.do_transform({"col_1": "2019-05-21T13:34:11"}, "col_1", "MASK-DATE"), 39 | "2019-01-01T13:34:11" 40 | ) 41 | 42 | # Mask date should keep the time elements, date is invalid 43 | self.assertEqual( 44 | transform.do_transform({"col_1": "2019-05-21T13:34:99"}, "col_1", "MASK-DATE"), 45 | "2019-05-21T13:34:99" 46 | ) 47 | 48 | def test_mask_number(self): 49 | """Test MASK-NUMBER transformation""" 50 | self.assertEqual( 51 | transform.do_transform({"col_1": "1234567890"}, "col_1", "MASK-NUMBER"), 52 | 0 53 | ) 54 | 55 | def test_mask_hidden(self): 56 | """Test MASK-HIDDEN transformation""" 57 | self.assertEqual( 58 | transform.do_transform({"col_1": "abakadabra123"}, "col_1", "MASK-HIDDEN"), 59 | 'hidden' 60 | ) 61 | 62 | def test_mask_string_skip_ends_case1(self): 63 | """Test MASK-STRING-SKIP-ENDS transformation with n=3""" 64 | self.assertEqual( 65 | transform.do_transform({"col_1": "do!maskme!"}, "col_1", "MASK-STRING-SKIP-ENDS-3"), 66 | 'do!****me!' 67 | ) 68 | 69 | def test_mask_string_skip_ends_case2(self): 70 | """Test MASK-STRING-SKIP-ENDS transformation with n=2""" 71 | self.assertEqual( 72 | transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-2"), 73 | 'no**sk' 74 | ) 75 | 76 | def test_mask_string_skip_ends_case3(self): 77 | """Test MASK-STRING-SKIP-ENDS transformation where string length equals to 2 * mask_length""" 78 | self.assertEqual( 79 | transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-3"), 80 | '******' 81 | ) 82 | 83 | def test_mask_string_skip_ends_case4(self): 84 | """Test MASK-STRING-SKIP-ENDS transformation where string length less than 2 * mask_length""" 85 | self.assertEqual( 86 | transform.do_transform({"col_1": "shortmask"}, "col_1", "MASK-STRING-SKIP-ENDS-5"), 87 | '*********' 88 | ) 89 | 90 | def test_unknown_transformation_type(self): 91 | """Test not existing transformation type""" 92 | # Should return the original value 93 | self.assertEqual( 94 | transform.do_transform({"col_1": "John"}, "col_1", "NOT-EXISTING-TRANSFORMATION-TYPE"), 95 | "John" 96 | ) 97 | 98 | def test_conditions(self): 99 | """Test conditional transformations""" 100 | 101 | # Matching condition: Should transform to NULL 102 | self.assertEqual( 103 | transform.do_transform( 104 | # Record: 105 | {"col_1": "random value", "col_2": "passwordHash", "col_3": "lkj"}, 106 | # Column to transform: 107 | "col_3", 108 | # Transform method: 109 | "SET-NULL", 110 | # Conditions when to transform: 111 | [ 112 | {'column': 'col_1', 'equals': "random value"}, 113 | {'column': 'col_2', 'equals': "passwordHash"}, 114 | ] 115 | ), 116 | 117 | # Expected output: 118 | None 119 | ) 120 | 121 | # Not matching condition: Should keep the original value 122 | self.assertEqual( 123 | transform.do_transform( 124 | # Record: 125 | {"col_1": "random value", "col_2": "id", "col_3": "123456789"}, 126 | # Column to transform: 127 | "col_3", 128 | # Transform method: 129 | "SET-NULL", 130 | # Conditions when to transform: 131 | [ 132 | {'column': 'col_1', 'equals': "random value"}, 133 | {'column': 'col_2', 'equals': "passwordHash"}, 134 | ] 135 | ), 136 | 137 | # Expected output: 138 | "123456789" 139 | ) 140 | 141 | def 
test_transform_field_in_json_col(self): 142 | """Test transformation of a field in a json column with no conditions""" 143 | 144 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}} 145 | 146 | return_value = transform.do_transform( 147 | # Record: 148 | { 149 | "col_1": "random value", 150 | "col_2": "passwordHash", 151 | "col_3": "lkj", 152 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}} 153 | }, 154 | # Column to transform: 155 | "col_4", 156 | # Transform method: 157 | "MASK-HIDDEN", 158 | # Conditions when to transform: 159 | None, 160 | ['info/last_name'] 161 | ) 162 | 163 | self.assertDictEqual(expected_value, return_value) 164 | 165 | def test_transform_field_in_json_col_with_conditions(self): 166 | """Test transformation of a field in a json column with conditions""" 167 | 168 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}} 169 | 170 | return_value = transform.do_transform( 171 | # Record: 172 | { 173 | "col_1": "random value", 174 | "col_2": "passwordHash", 175 | "col_3": "lkj", 176 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}} 177 | }, 178 | # Column to transform: 179 | "col_4", 180 | # Transform method: 181 | "MASK-HIDDEN", 182 | # Conditions when to transform: 183 | [ 184 | {'column': 'col_2', 'equals': "passwordHash"}, 185 | ], 186 | ['info/last_name'] 187 | ) 188 | 189 | self.assertDictEqual(expected_value, return_value) 190 | 191 | def test_transform_fields_in_json_col(self): 192 | """Test transformation of multiple fields in a json column with no conditions""" 193 | 194 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'hidden', 'age': 25}} 195 | 196 | return_value = transform.do_transform( 197 | # Record: 198 | { 199 | "col_1": "random value", 200 | "col_2": "passwordHash", 201 | "col_3": "lkj", 202 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'age': 25}} 203 | }, 204 | # Column to transform: 205 | "col_4", 206 | # Transform method: 207 | "MASK-HIDDEN", 208 | # Conditions when to transform: 209 | None, 210 | ['info/last_name', 'info/first_name'] 211 | ) 212 | 213 | self.assertDictEqual(expected_value, return_value) 214 | 215 | def test_transform_col_with_condition_on_json_field(self): 216 | """Test transformation of a column with condition on a field in a json""" 217 | 218 | record = { 219 | "col_1": "random value", 220 | "col_2": "passwordHash", 221 | "col_3": "323df43983dfs", 222 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}} 223 | } 224 | 225 | self.assertEqual( 226 | 'hidden', 227 | transform.do_transform( 228 | # Record: 229 | record, 230 | # Column to transform: 231 | "col_3", 232 | # Transform method: 233 | "MASK-HIDDEN", 234 | # Conditions when to transform: 235 | [ 236 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'}, 237 | ] 238 | ) 239 | ) 240 | 241 | def test_transform_field_in_json_col_with_condition_on_field(self): 242 | """Test transformation of a field in a json column with condition on a field in json, condition will be met""" 243 | 244 | record = { 245 | "col_1": "random value", 246 | "col_2": "passwordHash", 247 | "col_3": "lkj", 248 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}} 249 | } 250 | 251 | self.assertDictEqual( 252 | {'id': 1, 'info': {'first_name': 'John', 'last_name': None, 'phone': '6573930'}}, 253 | transform.do_transform( 254 | # Record: 255 | 
record, 256 | # Column to transform: 257 | "col_4", 258 | # Transform method: 259 | "SET-NULL", 260 | # Conditions when to transform: 261 | [ 262 | {'column': 'col_4', 'field_path': 'info/phone', 'equals': '6573930'}, 263 | ], 264 | ['info/last_name'] 265 | ) 266 | ) 267 | 268 | def test_transform_field_in_json_col_with_condition_on_field_2(self): 269 | """Test transformation of a field in a json column with condition on a field in json, 270 | the condition will not be met""" 271 | 272 | record = { 273 | "col_1": "random value", 274 | "col_2": "passwordHash", 275 | "col_3": "lkj", 276 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}} 277 | } 278 | 279 | # not transformed 280 | self.assertEqual( 281 | {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}, 282 | transform.do_transform( 283 | # Record: 284 | record, 285 | # Column to transform: 286 | "col_4", 287 | # Transform method: 288 | "SET-NULL", 289 | # Conditions when to transform: 290 | [ 291 | {'column': 'col_4', 'field_path': 'info/phone', 'regex_match': '.*6573955.*'}, 292 | ], 293 | ['info/last_name'] 294 | ) 295 | ) 296 | 297 | def test_transform_multiple_conditions_all_success(self): 298 | """Test conditional transformation, all the conditions will be met and transformation should happen""" 299 | 300 | record = { 301 | "col_1": "random value", 302 | "col_2": "passwordHash", 303 | "col_3": "lkj", 304 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}, 305 | 'col_5': '2021-11-30T16:40:07' 306 | } 307 | 308 | self.assertEqual( 309 | '2021-01-01T16:40:07', 310 | transform.do_transform( 311 | # Record: 312 | record, 313 | # Column to transform: 314 | "col_5", 315 | # Transform method: 316 | "MASK-DATE", 317 | # Conditions when to transform: 318 | [ 319 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'}, 320 | {'column': 'col_4', 'field_path': 'id', 'equals': 1}, 321 | {'column': 'col_3', 'regex_match': '.*lkj.*'}, 322 | ] 323 | ) 324 | ) 325 | 326 | def test_transform_multiple_conditions_one_fails(self): 327 | """Test conditional transformation, one of the conditions will not be met and transformation should not happen""" 328 | 329 | record = { 330 | "col_1": "random value", 331 | "col_2": "passwordHash", 332 | "col_3": "lkj", 333 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}, 334 | 'col_5': '2021-11-30T16:40:07' 335 | } 336 | 337 | # not transformed 338 | self.assertEqual( 339 | '2021-11-30T16:40:07', 340 | transform.do_transform( 341 | # Record: 342 | record, 343 | # Column to transform: 344 | "col_5", 345 | # Transform method: 346 | "MASK-DATE", 347 | # Conditions when to transform: 348 | [ 349 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'}, 350 | {'column': 'col_4', 'field_path': 'id', 'equals': 2}, 351 | {'column': 'col_3', 'regex_match': '.*lkj.*'}, 352 | ] 353 | ) 354 | ) 355 | 356 | -------------------------------------------------------------------------------- /tests/unit/test_utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import unittest 3 | 4 | from unittest.mock import patch 5 | from singer import Catalog 6 | 7 | from transform_field.utils import get_stream_schemas, parse_args 8 | 9 | 10 | class TestUtils(unittest.TestCase): 11 | """ 12 | Unit Tests for the utils 13 | """ 14 | 15 | def test_get_stream_schemas(self): 16 | catalog = 
Catalog.from_dict({ 17 | 'streams': [ 18 | { 19 | 'tap_stream_id': 'stream1', 20 | 'schema': { 21 | 'properties': { 22 | 'col_1': {} 23 | } 24 | }, 25 | 'metadata': [ 26 | { 27 | 'breadcrumb': [], 28 | 'metadata': { 29 | 'selected': True 30 | } 31 | } 32 | ] 33 | }, 34 | { 35 | 'tap_stream_id': 'stream2', 36 | 'schema': { 37 | 'properties': { 38 | 'col_2': {} 39 | } 40 | }, 41 | 'metadata': [ 42 | { 43 | 'breadcrumb': [], 44 | 'metadata': { 45 | 'selected': True 46 | } 47 | } 48 | ] 49 | }, 50 | { 51 | 'tap_stream_id': 'stream3', 52 | 'schema': { 53 | 'properties': { 54 | 'col_3': {} 55 | } 56 | }, 57 | 'metadata': [ 58 | { 59 | 'breadcrumb': [], 60 | 'metadata': { 61 | 'selected': False 62 | } 63 | } 64 | ] 65 | } 66 | ] 67 | }) 68 | 69 | output = get_stream_schemas(catalog) 70 | 71 | self.assertIn('stream1', output) 72 | self.assertIn('stream2', output) 73 | self.assertNotIn('stream3', output) 74 | 75 | self.assertEqual(len(output['stream1'].properties), 1) 76 | self.assertEqual(len(output['stream2'].properties), 1) 77 | 78 | @patch('transform_field.utils.Catalog.load') 79 | @patch('transform_field.utils.check_config') 80 | @patch('transform_field.utils.load_json') 81 | @patch('argparse.ArgumentParser.parse_args') 82 | def test_parse_args(self, parse_args_mock, load_json_mock, check_config_mock, catalog_load_mock): 83 | """ 84 | test args parsing 85 | """ 86 | check_config_mock.return_value = None 87 | load_json_mock.return_value = {} 88 | catalog_load_mock.return_value = {} 89 | 90 | parse_args_mock.return_value = argparse.Namespace(**{ 91 | 'config': './config.json', 92 | 'catalog': './properties.json', 93 | 'validate': False, 94 | }) 95 | 96 | args = parse_args({'transformations'}) 97 | 98 | load_json_mock.assert_called_once() 99 | catalog_load_mock.assert_called_once() 100 | check_config_mock.assert_called_once() 101 | 102 | self.assertEqual(args.config, {}) 103 | self.assertEqual(args.catalog, {}) 104 | self.assertEqual(args.validate, False) 105 | -------------------------------------------------------------------------------- /transform_field/__init__.py: -------------------------------------------------------------------------------- 1 | import io 2 | import sys 3 | import time 4 | import singer 5 | 6 | from typing import Union, Dict 7 | from enum import Enum, unique 8 | from collections import namedtuple 9 | from decimal import Decimal 10 | from jsonschema import FormatChecker, Draft7Validator 11 | from singer import Catalog, Schema 12 | 13 | from transform_field import transform 14 | from transform_field import utils 15 | from transform_field.timings import Timings 16 | 17 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, InvalidTransformationException, \ 18 | UnsupportedTransformationTypeException, NoStreamSchemaException 19 | 20 | 21 | LOGGER = singer.get_logger('transform_field') 22 | TIMINGS = Timings(LOGGER) 23 | DEFAULT_MAX_BATCH_BYTES = 4000000 24 | DEFAULT_MAX_BATCH_RECORDS = 20000 25 | DEFAULT_BATCH_DELAY_SECONDS = 300.0 26 | VALIDATE_RECORDS = False 27 | 28 | StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties']) 29 | TransMeta = namedtuple('TransMeta', ['field_id', 'type', 'when', 'field_paths']) 30 | 31 | REQUIRED_CONFIG_KEYS = [ 32 | "transformations" 33 | ] 34 | 35 | 36 | @unique 37 | class TransformationTypes(Enum): 38 | """ 39 | List of supported transformation types 40 | """ 41 | SET_NULL = 'SET-NULL' 42 | MASK_HIDDEN = 'MASK-HIDDEN' 43 | MASK_DATE = 'MASK-DATE' 44 | MASK_NUMBER = 
'MASK-NUMBER' 45 | HASH = 'HASH' 46 | HASH_SKIP_FIRST = 'HASH-SKIP-FIRST' 47 | MASK_STRING_SKIP_ENDS = 'MASK-STRING-SKIP-ENDS' 48 | 49 | 50 | def float_to_decimal(value): 51 | """Walk the given data structure and turn all instances of float into 52 | Decimal.""" 53 | if isinstance(value, float): 54 | return Decimal(str(value)) 55 | if isinstance(value, list): 56 | return [float_to_decimal(child) for child in value] 57 | if isinstance(value, dict): 58 | return {k: float_to_decimal(v) for k, v in value.items()} 59 | return value 60 | 61 | 62 | class TransformFieldException(Exception): 63 | """A known exception for which we don't need to bring a stack trace""" 64 | 65 | 66 | class TransformField: 67 | """ 68 | Main Transformer class 69 | """ 70 | 71 | def __init__(self, trans_config): 72 | self.trans_config = trans_config 73 | self.messages = [] 74 | self.buffer_size_bytes = 0 75 | self.state = None 76 | 77 | # Time that the last batch was sent 78 | self.time_last_batch_sent = time.time() 79 | 80 | # Mapping from stream name to {'schema': ..., 'key_names': ..., 'bookmark_names': ... } 81 | self.stream_meta = {} 82 | 83 | # Mapping from transformation stream to {'stream': [ 'field_id': ..., 'type': ... ] ... } 84 | self.trans_meta = {} 85 | 86 | for trans in trans_config["transformations"]: 87 | # Naming differences in stream ids: 88 | # 1. properties.json and transformation_json use 'tap_stream_id' 89 | # 2. taps send in the 'stream' key in singer messages 90 | stream = trans["tap_stream_name"] 91 | if stream not in self.trans_meta: 92 | self.trans_meta[stream] = [] 93 | 94 | self.trans_meta[stream].append(TransMeta( 95 | trans["field_id"], 96 | trans["type"], 97 | trans.get('when'), 98 | trans.get('field_paths') 99 | )) 100 | 101 | # pylint: disable=too-many-nested-blocks,too-many-branches 102 | # todo: simplify this method 103 | def flush(self): 104 | """Give batch to handlers to process""" 105 | 106 | if self.messages: 107 | stream = self.messages[0].stream 108 | stream_meta = self.stream_meta[stream] 109 | 110 | # Transform columns 111 | messages = self.messages 112 | schema = float_to_decimal(stream_meta.schema) 113 | key_properties = stream_meta.key_properties 114 | validator = Draft7Validator(schema, format_checker=FormatChecker()) 115 | trans_meta = [] 116 | if stream in self.trans_meta: 117 | trans_meta = self.trans_meta[stream] 118 | 119 | for i, message in enumerate(messages): 120 | if isinstance(message, singer.RecordMessage): 121 | 122 | # Do transformation on every column where it is required 123 | for trans in trans_meta: 124 | 125 | if trans.field_id in message.record: 126 | transformed = transform.do_transform( 127 | message.record, trans.field_id, trans.type, trans.when, trans.field_paths 128 | ) 129 | message.record[trans.field_id] = transformed 130 | 131 | if VALIDATE_RECORDS: 132 | # Validate the transformed columns 133 | data = float_to_decimal(message.record) 134 | try: 135 | validator.validate(data) 136 | if key_properties: 137 | for k in key_properties: 138 | if k not in data: 139 | raise TransformFieldException( 140 | f'Message {i} is missing key property {k}') 141 | 142 | except Exception as exc: 143 | if type(exc).__name__ == "InvalidOperation": 144 | raise TransformFieldException( 145 | f"Record does not pass schema validation. RECORD: {message.record}" 146 | "\n'multipleOf' validations that allow long precisions are not " 147 | "supported (i.e. with 15 digits or more). 
" 148 | f"Try removing 'multipleOf' methods from JSON schema.\n{exc}") from exc 149 | 150 | raise TransformFieldException( 151 | f"Record does not pass schema validation. RECORD: {message.record}\n{exc}") from exc 152 | 153 | # Write the transformed message 154 | singer.write_message(message) 155 | 156 | LOGGER.debug("Batch is valid with %s messages", len(messages)) 157 | 158 | # Update stats 159 | self.time_last_batch_sent = time.time() 160 | self.messages = [] 161 | self.buffer_size_bytes = 0 162 | 163 | if self.state: 164 | singer.write_message(singer.StateMessage(self.state)) 165 | self.state = None 166 | 167 | TIMINGS.log_timings() 168 | 169 | def handle_line(self, line): 170 | """Takes a raw line from stdin and transforms it""" 171 | try: 172 | message = singer.parse_message(line) 173 | 174 | if not message: 175 | raise TransformFieldException('Unknown message type') 176 | except Exception as exc: 177 | raise TransformFieldException(f'Failed to process incoming message: {line}\n{exc}') from exc 178 | 179 | # If we got a Schema, set the schema and key properties for this 180 | # stream. Flush the batch, if there is one, in case the schema is 181 | # different 182 | if isinstance(message, singer.SchemaMessage): 183 | self.flush() 184 | 185 | self.stream_meta[message.stream] = StreamMeta( 186 | message.schema, 187 | message.key_properties, 188 | message.bookmark_properties) 189 | 190 | # if schema message, do validation of transformations using the schema to detect any 191 | # incompatibilities between the transformation and column types 192 | self.__validate_stream_trans(message.stream, message.schema) 193 | 194 | # Write the transformed message 195 | singer.write_message(message) 196 | 197 | elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)): 198 | if self.messages and ( 199 | message.stream != self.messages[0].stream or 200 | message.version != self.messages[0].version): 201 | self.flush() 202 | self.messages.append(message) 203 | self.buffer_size_bytes += len(line) 204 | 205 | num_bytes = self.buffer_size_bytes 206 | num_messages = len(self.messages) 207 | num_seconds = time.time() - self.time_last_batch_sent 208 | 209 | enough_bytes = num_bytes >= DEFAULT_MAX_BATCH_BYTES 210 | enough_messages = num_messages >= DEFAULT_MAX_BATCH_RECORDS 211 | enough_time = num_seconds >= DEFAULT_BATCH_DELAY_SECONDS 212 | if enough_bytes or enough_messages or enough_time: 213 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', num_bytes, num_messages, num_seconds) 214 | self.flush() 215 | 216 | elif isinstance(message, singer.StateMessage): 217 | self.state = message.value 218 | 219 | def consume(self, reader): 220 | """Consume all the lines from the queue, flushing when done.""" 221 | for line in reader: 222 | self.handle_line(line) 223 | self.flush() 224 | 225 | def validate(self, catalog: Catalog): 226 | """ 227 | Validate the transformations by checking if each transformation type is compatible with the column type 228 | :param catalog: the catalog of streams with their json schema 229 | """ 230 | LOGGER.info('Starting validation of transformations...') 231 | 232 | if not catalog: 233 | raise CatalogRequiredException('Catalog missing! 
225 |     def validate(self, catalog: Catalog):
226 |         """
227 |         Validate the transformations by checking if each transformation type is compatible with the column type
228 |         :param catalog: the catalog of streams with their json schema
229 |         """
230 |         LOGGER.info('Starting validation of transformations...')
231 | 
232 |         if not catalog:
233 |             raise CatalogRequiredException('Catalog missing! Please provide a catalog to run validation.')
234 | 
235 |         # get the schema of each stream
236 |         schemas = utils.get_stream_schemas(catalog)
237 | 
238 |         for stream_id in self.trans_meta:
239 |             self.__validate_stream_trans(stream_id, schemas.get(stream_id))
240 | 
241 |     def __validate_stream_trans(self, stream_id: str, stream_schema: Union[Schema, Dict]):
242 |         """
243 |         Validation of each stream's transformations
244 |         :param stream_id: ID of the stream
245 |         :param stream_schema: schema of the stream
246 |         """
247 | 
248 |         if stream_id not in self.trans_meta:
249 |             return
250 | 
251 |         # check that we even have a schema for this transformation's stream
252 |         if stream_schema is None:
253 |             raise StreamNotFoundException(stream_id)
254 | 
255 |         # check that the stream's schema is not empty
256 |         if not stream_schema:
257 |             raise NoStreamSchemaException(stream_id)
258 | 
259 |         for transformation in self.trans_meta[stream_id]:
260 |             trans_type = transformation.type
261 |             field_id = transformation.field_id
262 | 
263 |             if isinstance(stream_schema, Schema):
264 |                 field_type = stream_schema.properties[field_id].type
265 |                 field_format = stream_schema.properties[field_id].format
266 |             else:
267 |                 field_type = stream_schema['properties'][field_id].get('type')
268 |                 field_format = stream_schema['properties'][field_id].get('format')
269 | 
270 |             # If the value we want to transform is a field inside a JSON property,
271 |             # then there is no need to enforce the rules below for now
272 |             if field_type and \
273 |                     ("object" in field_type or "array" in field_type) and \
274 |                     transformation.field_paths is not None:
275 |                 continue
276 | 
277 |             if trans_type in (TransformationTypes.HASH.value, TransformationTypes.MASK_HIDDEN.value) or \
278 |                     trans_type.startswith(TransformationTypes.HASH_SKIP_FIRST.value) or \
279 |                     trans_type.startswith(TransformationTypes.MASK_STRING_SKIP_ENDS.value):
280 |                 if not (field_type is not None and 'string' in field_type and not field_format):
281 |                     raise InvalidTransformationException(
282 |                         f'Cannot apply `{trans_type}` transformation type to a non-string field `'
283 |                         f'{field_id}` in stream `{stream_id}`')
284 | 
285 |             elif trans_type == TransformationTypes.MASK_DATE.value:
286 |                 if not (field_type is not None and 'string' in field_type and field_format in {'date-time', 'date'}):
287 |                     raise InvalidTransformationException(
288 |                         f'Cannot apply `{trans_type}` transformation type to a non-stringified date field'
289 |                         f' `{field_id}` in stream `{stream_id}`')
290 | 
291 |             elif trans_type == TransformationTypes.MASK_NUMBER.value:
292 |                 if not (field_type is not None and (
293 |                         'number' in field_type or 'integer' in field_type) and not field_format):
294 |                     raise InvalidTransformationException(
295 |                         f'Cannot apply `{trans_type}` transformation type to a non-numeric field '
296 |                         f'`{field_id}` in stream `{stream_id}`')
297 | 
298 |             elif trans_type == TransformationTypes.SET_NULL.value:
299 |                 LOGGER.info('Transformation type is %s, no need to do any validation.', trans_type)
300 | 
301 |             else:
302 |                 raise UnsupportedTransformationTypeException(trans_type)
303 | 
304 | 
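A sketch of the --validate path under the rules above, using an invented catalog: HASH is only valid on plain string fields, so validating it against an integer column raises InvalidTransformationException:

    from singer import Catalog

    from transform_field import TransformField
    from transform_field.errors import InvalidTransformationException

    catalog = Catalog.from_dict({'streams': [{
        'tap_stream_id': 'users',
        'schema': {'properties': {'user_id': {'type': ['null', 'integer']}}},
        'metadata': [{'breadcrumb': [], 'metadata': {'selected': True}}],
    }]})

    transformer = TransformField({'transformations': [
        {'tap_stream_name': 'users', 'field_id': 'user_id', 'type': 'HASH'},
    ]})

    try:
        transformer.validate(catalog)
    except InvalidTransformationException as exc:
        print(exc)  # Cannot apply `HASH` transformation type to a non-string field ...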
305 | def main_impl():
306 |     """
307 |     Main implementation
308 |     """
309 |     args = utils.parse_args(REQUIRED_CONFIG_KEYS)
310 |     trans_config = {'transformations': args.config['transformations']}
311 | 
312 |     instance = TransformField(trans_config)
313 | 
314 |     if args.validate:
315 |         instance.validate(args.catalog)
316 |     else:
317 |         reader = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
318 |         instance.consume(reader)
319 | 
320 |     LOGGER.info("Exiting normally")
321 | 
322 | 
323 | def main():
324 |     """Main entry point"""
325 |     try:
326 |         main_impl()
327 |     except TransformFieldException as exc:
328 |         for line in str(exc).splitlines():
329 |             LOGGER.critical(line)
330 |         sys.exit(1)
331 |     except Exception as exc:
332 |         LOGGER.critical(exc)
333 |         raise exc
334 | 
335 | 
336 | if __name__ == '__main__':
337 |     main()
338 | 
--------------------------------------------------------------------------------
/transform_field/errors.py:
--------------------------------------------------------------------------------
1 | class CatalogRequiredException(Exception):
2 |     """Raised when a catalog needs to be provided but has not been"""
3 | 
4 | 
5 | class StreamNotFoundException(Exception):
6 |     """Raised when the catalog doesn't have a given selected stream"""
7 | 
8 |     def __init__(self, stream):
9 |         message = f'Catalog doesn\'t have the selected stream `{stream}`!'
10 | 
11 |         super().__init__(message)
12 | 
13 | 
14 | class NoStreamSchemaException(Exception):
15 |     """Raised when a stream has an empty schema"""
16 | 
17 |     def __init__(self, stream):
18 |         message = f'Stream `{stream}` has an empty schema!'
19 | 
20 |         super().__init__(message)
21 | 
22 | 
23 | class InvalidTransformationException(Exception):
24 |     """Raised when the given transformation is invalid"""
25 | 
26 | 
27 | class UnsupportedTransformationTypeException(Exception):
28 |     """Raised when the given transformation type is not supported"""
29 | 
30 |     def __init__(self, trans_type):
31 |         message = f'Transformation `{trans_type}` is not supported!'
32 | 
33 |         super().__init__(message)
34 | 
--------------------------------------------------------------------------------
/transform_field/timings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import time
4 | 
5 | from contextlib import contextmanager
6 | 
7 | 
8 | class Timings:
9 |     """Gathers timing information for the three main steps of the Transformer."""
10 | 
11 |     def __init__(self, logger):
12 |         self.logger = logger
13 |         self.last_time = time.time()
14 |         self.timings = {
15 |             'validating': 0.0,
16 |             'transforming': 0.0,
17 |             None: 0.0
18 |         }
19 | 
20 |     @contextmanager
21 |     def mode(self, mode):
22 |         """We wrap the big steps of the Transformer in this context manager to accumulate
23 |         timing info."""
24 | 
25 |         start = time.time()
26 |         yield
27 |         end = time.time()
28 |         self.timings[None] += start - self.last_time
29 |         self.timings[mode] += end - start
30 |         self.last_time = end
31 | 
32 |     def log_timings(self):
33 |         """We call this with every flush to print out the accumulated timings"""
34 |         self.logger.debug('Timings: unspecified: %.3f; validating: %.3f; transforming: %.3f;',
35 |                           self.timings[None],
36 |                           self.timings['validating'],
37 |                           self.timings['transforming'])
38 | 
--------------------------------------------------------------------------------
/transform_field/transform.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import re
3 | 
4 | from typing import Dict, Any, Optional, List
5 | from dpath.util import get as get_xpath, set as set_xpath
6 | from singer import get_logger
7 | from dateutil import parser
8 | 
9 | LOGGER = get_logger('transform_field')
10 | 
11 | 
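The field_path handling in this module leans on dpath's '/'-separated paths. A quick illustration (values invented) of the get/set helpers imported above and the KeyError that the callers below catch:

    from dpath.util import get as get_xpath, set as set_xpath

    doc = {'address': {'city': 'Tallinn', 'zip': '10111'}}

    get_xpath(doc, 'address/city')            # -> 'Tallinn'
    set_xpath(doc, 'address/city', 'hidden')  # doc['address']['city'] is now 'hidden'
    get_xpath(doc, 'address/country')         # raises KeyError: path doesn't exist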
12 | def is_transform_required(record: Dict, when: Optional[List[Dict]]) -> bool:
13 |     """
14 |     Detects whether the transformation is required, based on
15 |     the defined conditions and the actual values in the record.
16 |     All conditions in 'when' need to be met for the transformation to be required.
17 |     """
18 |     if not when:
19 |         # Transformation is always required if no 'when' condition is defined
20 |         LOGGER.debug('No conditions, transformation is required')
21 |         return True
22 | 
23 |     transform_required = False
24 | 
25 |     # Check if the conditional transformation matches the criteria
26 |     # Evaluate every condition
27 |     for condition in when:
28 |         column_to_match = condition['column']
29 |         column_value = record.get(column_to_match, "")
30 | 
31 |         field_path_to_match = condition.get('field_path')
32 | 
33 |         # check if the given field exists in the column value
34 |         if field_path_to_match:
35 |             try:
36 |                 field_value = get_xpath(column_value, field_path_to_match)
37 |                 LOGGER.debug('field "%s" exists in the value of column "%s"', field_path_to_match, column_to_match)
38 | 
39 |             except KeyError:
40 |                 # A KeyError means the field doesn't exist, so we cannot evaluate the
41 |                 # equals/regex match condition; the condition isn't met and no
42 |                 # transformation is needed, so break out prematurely
43 |                 transform_required = False
44 | 
45 |                 LOGGER.debug('field "%s" doesn\'t exist in the value of column "%s", '
46 |                              'so transformation is not required.', field_path_to_match, column_to_match)
47 |                 break
48 | 
49 |         cond_equals = condition.get('equals')
50 |         cond_pattern = condition.get('regex_match')
51 | 
52 |         # Exact match condition
53 |         if cond_equals:
54 |             LOGGER.debug('Equals condition found, value is: %s', cond_equals)
55 |             if field_path_to_match:
56 |                 transform_required = __is_condition_met('equal', cond_equals, field_value)
57 |             else:
58 |                 transform_required = __is_condition_met('equal', cond_equals, column_value)
59 | 
60 |             # Condition isn't met, exit the loop
61 |             if not transform_required:
62 |                 LOGGER.debug('Equals condition didn\'t match, so transformation is not required.')
63 |                 break
64 | 
65 |         # Regex-based condition
66 |         elif cond_pattern:
67 |             LOGGER.debug('Regex condition found, pattern is: %s', cond_pattern)
68 | 
69 |             if field_path_to_match:
70 |                 transform_required = __is_condition_met('regex', cond_pattern, field_value)
71 |             else:
72 |                 transform_required = __is_condition_met('regex', cond_pattern, column_value)
73 | 
74 |             # Condition isn't met, exit the loop
75 |             if not transform_required:
76 |                 LOGGER.debug('Regex pattern didn\'t match, so transformation is not required.')
77 |                 break
78 | 
79 |     LOGGER.debug('Transformation required? %s', transform_required)
80 | 
81 |     return transform_required
82 | 
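A sketch of how 'when' conditions combine, using invented record values; every condition in the list must match for the function to return True:

    from transform_field.transform import is_transform_required

    record = {'country': 'GB', 'status': 'active'}

    # No conditions defined: always required
    is_transform_required(record, None)  # -> True

    # Single equals condition that matches
    is_transform_required(record, [{'column': 'country', 'equals': 'GB'}])  # -> True

    # Two conditions; the regex one fails, so the whole check returns False
    is_transform_required(record, [
        {'column': 'country', 'equals': 'GB'},
        {'column': 'status', 'regex_match': '^inactive$'},
    ])  # -> False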
83 | 
84 | def __is_condition_met(condition_type: str, condition_value: Any, value: Any) -> bool:
85 |     """
86 |     Checks if the given value meets the given condition
87 |     Args:
88 |         condition_type: condition type, can be "equal" or "regex"
89 |         condition_value: the value of the condition; the pattern in case of regex, and
90 |             a value to compare against in case of equal
91 |         value: the target value to run the condition against
92 | 
93 |     Returns: bool, True if the condition is met, False otherwise
94 |     """
95 | 
96 |     if condition_type == 'equal':
97 |         return value == condition_value
98 | 
99 |     if condition_type == 'regex':
100 |         matcher = re.compile(condition_value)
101 |         return bool(matcher.search(value))
102 | 
103 |     raise NotImplementedError(f'__is_condition_met is not implemented for condition type "{condition_type}"')
104 | 
105 | 
106 | def do_transform(record: Dict,
107 |                  field: str,
108 |                  trans_type: str,
109 |                  when: Optional[List[Dict]] = None,
110 |                  field_paths: Optional[List[str]] = None
111 |                  ) -> Any:
112 |     """Transform a value by a certain transformation type.
113 |     Optionally, conditional criteria can be set based on other
114 |     values of the record"""
115 | 
116 |     return_value = value = record.get(field)
117 | 
118 |     try:
119 |         # Do the transformation only if it is required
120 |         if is_transform_required(record, when):
121 | 
122 |             # Transform fields nested in the value dictionary
123 |             if isinstance(value, dict) and field_paths:
124 |                 for field_path in field_paths:
125 |                     try:
126 |                         field_val = get_xpath(value, field_path)
127 |                         set_xpath(value, field_path, _transform_value(field_val, trans_type))
128 |                     except KeyError:
129 |                         LOGGER.error('Field path %s does not exist', field_path)
130 | 
131 |                 return_value = value
132 | 
133 |             else:
134 |                 return_value = _transform_value(value, trans_type)
135 | 
136 |         # Return the original value if the transformation is not required
137 |         else:
138 |             return_value = value
139 | 
140 |         return return_value
141 | 
142 |     # Return the original value if it cannot be transformed
143 |     except Exception:
144 |         return return_value
145 | 
146 | 
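Illustrative outputs of do_transform for the supported types (record values invented); for flat fields it returns the transformed value for the caller to assign back, and only the nested field_paths case mutates the record in place:

    from transform_field.transform import do_transform

    rec = {'email': 'jane@example.com', 'dob': '1985-04-12', 'salary': 5200}

    do_transform(rec, 'salary', 'SET-NULL')                # -> None
    do_transform(rec, 'salary', 'MASK-NUMBER')             # -> 0
    do_transform(rec, 'dob', 'MASK-DATE')                  # -> '1985-01-01T00:00:00'
    do_transform(rec, 'email', 'MASK-HIDDEN')              # -> 'hidden'
    do_transform(rec, 'email', 'HASH')                     # -> sha256 hex digest of the address
    do_transform(rec, 'email', 'MASK-STRING-SKIP-ENDS-3')  # -> 'jan**********com'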
147 | def _transform_value(value: Any, trans_type: str) -> Any:
148 |     """
149 |     Applies the given transformation type to the given value
150 |     Args:
151 |         value: value to transform
152 |         trans_type: transformation type to apply
153 | 
154 |     Returns:
155 |         transformed value
156 |     """
157 |     # Transforms any input to NULL
158 |     if trans_type == "SET-NULL":
159 |         return_value = None
160 | 
161 |     # Transforms a string input to its hash
162 |     elif trans_type == "HASH":
163 |         return_value = hashlib.sha256(value.encode('utf-8')).hexdigest()
164 | 
165 |     # Transforms a string input to a hash, skipping the first n characters, e.g. HASH-SKIP-FIRST-2
166 |     elif 'HASH-SKIP-FIRST' in trans_type:
167 |         return_value = value[:int(trans_type[-1])] + \
168 |                        hashlib.sha256(value.encode('utf-8')[int(trans_type[-1]):]).hexdigest()
169 | 
170 |     # Transforms any date to the 1st of January of the same year
171 |     elif trans_type == "MASK-DATE":
172 |         return_value = parser.parse(value).replace(month=1, day=1).isoformat()
173 | 
174 |     # Transforms any number to zero
175 |     elif trans_type == "MASK-NUMBER":
176 |         return_value = 0
177 | 
178 |     # Transforms any value to "hidden"
179 |     elif trans_type == "MASK-HIDDEN":
180 |         return_value = 'hidden'
181 | 
182 |     # Transforms a string input to a masked version, skipping the first and last n characters,
183 |     # e.g. MASK-STRING-SKIP-ENDS-3
184 |     elif 'MASK-STRING-SKIP-ENDS' in trans_type:
185 |         skip_ends_n = int(trans_type[-1])
186 |         value_len = len(value)
187 |         return_value = '*' * value_len if value_len <= (2 * skip_ends_n) \
188 |             else f'{value[:skip_ends_n]}{"*" * (value_len - (2 * skip_ends_n))}{value[-skip_ends_n:]}'
189 | 
190 |     # Return the original value if the transformation type cannot be found
191 |     # todo: is this the right behavior?
192 |     else:
193 |         LOGGER.warning('Cannot find transformation type %s, returning same value', trans_type)
194 |         return_value = value
195 | 
196 |     return return_value
197 | 
--------------------------------------------------------------------------------
/transform_field/utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from typing import Dict
4 | from singer import Catalog, get_logger, Schema
5 | from singer.utils import check_config, load_json
6 | 
7 | 
8 | LOGGER = get_logger('transform_field')
9 | 
10 | 
11 | def parse_args(required_config_keys):
12 |     """
13 |     Parse standard command-line args.
14 | 
15 |     Parses the command-line arguments mentioned in the SPEC and the BEST_PRACTICES documents:
16 | 
17 |     -c,--config     Config file
18 |     --validate      Flag to validate the transformations
19 |     --catalog       Catalog file
20 | 
21 |     Returns the parsed args object from argparse. For each argument that
22 |     points to a JSON file (config, catalog), we automatically
23 |     load and parse the JSON file.
24 |     """
25 |     parser = argparse.ArgumentParser()
26 | 
27 |     parser.add_argument(
28 |         '-c', '--config',
29 |         help='Config file',
30 |         required=True)
31 | 
32 |     parser.add_argument(
33 |         '--validate',
34 |         help='Flag to trigger one-off validation of transformations in config file using the catalog',
35 |         default=False,
36 |         action='store_true'
37 |     )
38 | 
39 |     parser.add_argument(
40 |         '--catalog',
41 |         help='Catalog file')
42 | 
43 |     args = parser.parse_args()
44 | 
45 |     if args.config:
46 |         setattr(args, 'config_path', args.config)
47 |         args.config = load_json(args.config)
48 | 
49 |     if args.catalog:
50 |         setattr(args, 'catalog_path', args.catalog)
51 |         args.catalog = Catalog.load(args.catalog)
52 | 
53 |     check_config(args.config, required_config_keys)
54 | 
55 |     return args
56 | 
57 | 
58 | def get_stream_schemas(catalog: Catalog) -> Dict[str, Schema]:
59 |     """
60 |     Build a map of streams with their schemas
61 |     :param catalog: the catalog of streams with their schemas and metadata
62 |     :return: Dictionary mapping stream ID to its schema
63 |     """
64 |     return {
65 |         stream.tap_stream_id: stream.schema
66 |         for stream in catalog.streams if stream.is_selected()
67 |     }
68 | 
--------------------------------------------------------------------------------
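To close the loop, a hypothetical end-to-end invocation. parse_args() reads sys.argv, so one quick way to exercise the --validate path from Python is to patch argv; the file names are made up, and the 'transform-field' program name assumes the console entry point installed by setup.py:

    import sys

    from transform_field import main

    # Equivalent to: transform-field --config config.json --catalog properties.json --validate
    sys.argv = ['transform-field', '--config', 'config.json',
                '--catalog', 'properties.json', '--validate']
    main()  # loads the config and catalog, then validates the transformations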