├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── BUG_REPORT.md
│   │   ├── FEATURE_REQUEST.md
│   │   ├── QUESTION.md
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   └── workflows
│       ├── ci.yml
│       └── pythonpublish.yml
├── .gitignore
├── .pylintrc
├── CHANGELOG.md
├── LICENSE
├── README.md
├── sample_config.json
├── sample_logging.conf
├── setup.py
├── tests
│   ├── __init__.py
│   ├── integration
│   │   ├── __init__.py
│   │   ├── resources
│   │   │   ├── catalog.json
│   │   │   ├── invalid_config.json
│   │   │   ├── invalid_messages.json
│   │   │   ├── messages.json
│   │   │   ├── streams_with_changing_schema.json
│   │   │   ├── streams_with_object.json
│   │   │   └── valid_config.json
│   │   └── test_integrations.py
│   └── unit
│       ├── __init__.py
│       ├── test_init.py
│       ├── test_transform.py
│       └── test_utils.py
└── transform_field
    ├── __init__.py
    ├── errors.py
    ├── timings.py
    ├── transform.py
    └── utils.py
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @transferwise/analytics-platform
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/BUG_REPORT.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a bug report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Prepare the data as '...'
16 | 2. Run the command '....'
17 | 3. See error
18 |
19 | **Expected behavior**
20 | A clear and concise description of what you expected to happen.
21 |
22 | **Screenshots**
23 | If applicable, add screenshots to help explain your problem.
24 |
25 | **Your environment**
26 | - Version, e.g. branch/commit #/release/tag
27 |
28 | **Additional context**
29 | Add any other context about the problem here.
30 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/QUESTION.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Question
3 | about: Ask anything about this project
4 | title: ''
5 | labels: help wanted
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Your question**
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: PipelineWise Community Slack channel
4 | url: https://singer-io.slack.com/messages/pipelinewise
5 | about: Open discussion about PipelineWise
6 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Problem
2 |
3 | _Describe the problem your PR is trying to solve_
4 |
5 | ## Proposed changes
6 |
7 | _Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request.
8 | If it fixes a bug or resolves a feature request, be sure to link to that issue._
9 |
10 |
11 | ## Types of changes
12 |
13 | What types of changes does your code introduce to pipelinewise-transform-field?
14 | _Put an `x` in the boxes that apply_
15 |
16 | - [ ] Bugfix (non-breaking change which fixes an issue)
17 | - [ ] New feature (non-breaking change which adds functionality)
18 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
19 | - [ ] Documentation Update (if none of the other choices apply)
20 |
21 |
22 | ## Checklist
23 |
24 | - [ ] I have read the [CONTRIBUTING](https://github.com/transferwise/pipelinewise/blob/master/CONTRIBUTING.md) doc.
25 | - [ ] Description above provides context of the change
26 | - [ ] I have added tests that prove my fix is effective or that my feature works
27 | - [ ] Unit tests for changes (not needed for documentation changes)
28 | - [ ] CI checks pass with my changes
29 | - [ ] Bumping version in `setup.py` is an individual PR and not mixed with feature or bugfix PRs
30 | - [ ] Commit message/PR title starts with `[AP-NNNN]` (if applicable. AP-NNNN = JIRA ID)
31 | - [ ] Branch name starts with `AP-NNN` (if applicable. AP-NNN = JIRA ID)
32 | - [ ] Commits follow "[How to write a good git commit message](http://chris.beams.io/posts/git-commit/)"
33 | - [ ] Relevant documentation is updated including usage instructions
34 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # This is an automatically generated base configuration
2 | # For further configuration options and tuning:
3 | # https://docs.github.com/en/free-pro-team@latest/github/administering-a-repository/configuration-options-for-dependency-updates
4 |
5 | version: 2
6 | updates:
7 | - package-ecosystem: "pip"
8 | directory: "/"
9 | schedule:
10 | interval: "weekly"
11 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [master]
6 | pull_request:
7 | branches: [master]
8 |
9 | workflow_dispatch:
10 |
11 | concurrency:
12 | group: ci-${{ github.head_ref }}
13 | cancel-in-progress: true
14 |
15 | jobs:
16 | build:
17 |
18 | runs-on: ubuntu-latest
19 | strategy:
20 | fail-fast: true
21 | matrix:
22 | python-version: [3.6, 3.7, 3.8]
23 |
24 | steps:
25 | - name: Checking out repo
26 | uses: actions/checkout@v2
27 |
28 |       - name: Set up Python ${{ matrix.python-version }}
29 | uses: actions/setup-python@v2
30 | with:
31 | python-version: ${{ matrix.python-version }}
32 |
33 |       - name: set LOGGING_CONF_FILE env
34 |         # a plain `export` does not persist to later steps; write to $GITHUB_ENV instead
35 |         run: |
36 |           echo "LOGGING_CONF_FILE=$(pwd)/sample_logging.conf" >> "$GITHUB_ENV"
37 |
38 | - name: Install dependencies
39 | run: |
40 | pip install --upgrade pip setuptools
41 | pip install .[test]
42 |
43 | - name: Check if pylint is happy
44 | run: pylint transform_field
45 |
46 | - name: Run Unit Tests with min coverage
47 | run: pytest --cov=transform_field --cov-fail-under=65 -v tests/unit
48 |
49 | - name: Run Integration Tests with min coverage
50 | run: pytest --cov-fail-under=73 -v tests/integration
51 |
--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package to PyPi
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - name: Set up Python
13 | uses: actions/setup-python@v1
14 | with:
15 | python-version: '3.x'
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 | run: |
25 | python setup.py sdist bdist_wheel
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # IDE
2 | .vscode
3 | .idea/*
4 |
5 |
6 | # Python
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | .virtualenvs
11 | *.egg-info/
12 | *~
13 | dist/
14 |
15 | # Singer JSON files
16 | properties.json
17 | config.json
18 | state.json
19 |
20 | *.db
21 | .DS_Store
22 | venv
23 | env
24 | blog_old.md
25 | node_modules
26 | *.pyc
27 | tmp
28 |
29 | # Docs
30 | docs/_build/
31 | docs/_templates/
32 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | # Based on Apache 2.0 licensed code from https://github.com/ClusterHQ/flocker
2 |
3 | [MASTER]
4 |
5 | # Specify a configuration file.
6 | #rcfile=
7 |
8 | # Python code to execute, usually for sys.path manipulation such as
9 | # pygtk.require().
10 | # init-hook=
11 |
12 | # Add files or directories to the blacklist. They should be base names, not paths.
13 | ignore=
14 |
15 | # Pickle collected data for later comparisons.
16 | persistent=no
17 |
18 | # List of plugins (as comma separated values of python modules names) to load,
19 | # usually to register additional checkers.
20 | load-plugins=
21 |
22 | # Use multiple processes to speed up Pylint.
23 | # DO NOT CHANGE THIS VALUES >1 HIDE RESULTS!!!!!
24 | jobs=1
25 |
26 | # Allow loading of arbitrary C extensions. Extensions are imported into the
27 | # active Python interpreter and may run arbitrary code.
28 | unsafe-load-any-extension=no
29 |
30 | # A comma-separated list of package or module names from where C extensions may
31 | # be loaded. Extensions are loading into the active Python interpreter and may
32 | # run arbitrary code
33 | extension-pkg-whitelist=ujson
34 |
35 | # Allow optimization of some AST trees. This will activate a peephole AST
36 | # optimizer, which will apply various small optimizations. For instance, it can
37 | # be used to obtain the result of joining multiple strings with the addition
38 | # operator. Joining a lot of strings can lead to a maximum recursion error in
39 | # Pylint and this flag can prevent that. It has one side effect, the resulting
40 | # AST will be different than the one from reality.
41 | optimize-ast=no
42 |
43 |
44 | [MESSAGES CONTROL]
45 |
46 | # Only show warnings with the listed confidence levels. Leave empty to show
47 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
48 | confidence=
49 |
50 | # Enable the message, report, category or checker with the given id(s). You can
51 | # either give multiple identifier separated by comma (,) or put this option
52 | # multiple time. See also the "--disable" option for examples.
53 | disable=wrong-import-order,
54 | broad-except,
55 | missing-module-docstring,
56 | duplicate-code, # not useful until a major code refactoring
57 |
58 |
59 | enable=import-error,
60 | import-self,
61 | reimported,
62 | wildcard-import,
63 | misplaced-future,
64 | deprecated-module,
65 | unpacking-non-sequence,
66 | invalid-all-object,
67 | undefined-all-variable,
68 | used-before-assignment,
69 | cell-var-from-loop,
70 | global-variable-undefined,
71 | redefine-in-handler,
72 | unused-import,
73 | unused-wildcard-import,
74 | global-variable-not-assigned,
75 | undefined-loop-variable,
76 | global-statement,
77 | global-at-module-level,
78 | bad-open-mode,
79 | redundant-unittest-assert,
80 |        boolean-datetime,
81 | deprecated-method,
82 | anomalous-unicode-escape-in-string,
83 | anomalous-backslash-in-string,
84 | not-in-loop,
85 | continue-in-finally,
86 | abstract-class-instantiated,
87 | star-needs-assignment-target,
88 | duplicate-argument-name,
89 | return-in-init,
90 | too-many-star-expressions,
91 | nonlocal-and-global,
92 | return-outside-function,
93 | return-arg-in-generator,
94 | invalid-star-assignment-target,
95 | bad-reversed-sequence,
96 | nonexistent-operator,
97 | yield-outside-function,
98 | init-is-generator,
99 | nonlocal-without-binding,
100 | lost-exception,
101 | assert-on-tuple,
102 | dangerous-default-value,
103 | duplicate-key,
104 |        useless-else-on-loop,
105 | expression-not-assigned,
106 | confusing-with-statement,
107 | unnecessary-lambda,
108 | pointless-statement,
109 | pointless-string-statement,
110 | unnecessary-pass,
111 | unreachable,
112 | eval-used,
113 | exec-used,
114 | using-constant-test,
115 | bad-super-call,
116 | missing-super-argument,
117 | slots-on-old-class,
118 | super-on-old-class,
119 | property-on-old-class,
120 | not-an-iterable,
121 | not-a-mapping,
122 | format-needs-mapping,
123 | truncated-format-string,
124 | missing-format-string-key,
125 | mixed-format-string,
126 | too-few-format-args,
127 | bad-str-strip-call,
128 | too-many-format-args,
129 | bad-format-character,
130 | format-combined-specification,
131 | bad-format-string-key,
132 | bad-format-string,
133 | missing-format-attribute,
134 | missing-format-argument-key,
135 |        unused-format-string-argument,
136 | unused-format-string-key,
137 | invalid-format-index,
138 | bad-indentation,
139 | mixed-indentation,
140 | unnecessary-semicolon,
141 | lowercase-l-suffix,
142 | invalid-encoded-data,
143 | unpacking-in-except,
144 | import-star-module-level,
145 | long-suffix,
146 | old-octal-literal,
147 | old-ne-operator,
148 | backtick,
149 | old-raise-syntax,
150 | metaclass-assignment,
151 | next-method-called,
152 | dict-iter-method,
153 | dict-view-method,
154 | indexing-exception,
155 | raising-string,
156 | using-cmp-argument,
157 | cmp-method,
158 | coerce-method,
159 | delslice-method,
160 | getslice-method,
161 | hex-method,
162 | nonzero-method,
163 | t-method,
164 | setslice-method,
165 | old-division,
166 | logging-format-truncated,
167 | logging-too-few-args,
168 | logging-too-many-args,
169 | logging-unsupported-format,
170 | logging-format-interpolation,
171 | invalid-unary-operand-type,
172 | unsupported-binary-operation,
173 | not-callable,
174 | redundant-keyword-arg,
175 | assignment-from-no-return,
176 | assignment-from-none,
177 | not-context-manager,
178 | repeated-keyword,
179 | missing-kwoa,
180 | no-value-for-parameter,
181 | invalid-sequence-index,
182 | invalid-slice-index,
183 | unexpected-keyword-arg,
184 | unsupported-membership-test,
185 | unsubscriptable-object,
186 | access-member-before-definition,
187 | method-hidden,
188 | assigning-non-slot,
189 | duplicate-bases,
190 | inconsistent-mro,
191 | inherit-non-class,
192 | invalid-slots,
193 | invalid-slots-object,
194 | no-method-argument,
195 | no-self-argument,
196 | unexpected-special-method-signature,
197 | non-iterator-returned,
198 | arguments-differ,
199 | signature-differs,
200 | bad-staticmethod-argument,
201 | non-parent-init-called,
202 | bad-except-order,
203 | catching-non-exception,
204 | bad-exception-context,
205 | notimplemented-raised,
206 | raising-bad-type,
207 | raising-non-exception,
208 | misplaced-bare-raise,
209 | duplicate-except,
210 | nonstandard-exception,
211 | binary-op-exception,
212 | bare-except,
213 | not-async-context-manager,
214 | yield-inside-async-function
215 |
216 | # Needs investigation:
217 | # abstract-method (might be indicating a bug? probably not though)
218 | # protected-access (requires some refactoring)
219 | # attribute-defined-outside-init (requires some refactoring)
220 | # super-init-not-called (requires some cleanup)
221 |
222 | # Things we'd like to enable someday:
223 | # redefined-builtin (requires a bunch of work to clean up our code first)
224 | # redefined-outer-name (requires a bunch of work to clean up our code first)
225 | # undefined-variable (re-enable when pylint fixes https://github.com/PyCQA/pylint/issues/760)
226 | # no-name-in-module (giving us spurious warnings https://github.com/PyCQA/pylint/issues/73)
227 | # unused-argument (need to clean up or code a lot, e.g. prefix unused_?)
228 | # function-redefined (@overload causes lots of spurious warnings)
229 | # too-many-function-args (@overload causes spurious warnings... I think)
230 | # parameter-unpacking (needed for eventual Python 3 compat)
231 | # print-statement (needed for eventual Python 3 compat)
232 | # filter-builtin-not-iterating (Python 3)
233 | # map-builtin-not-iterating (Python 3)
234 | # range-builtin-not-iterating (Python 3)
235 | # zip-builtin-not-iterating (Python 3)
236 | # many others relevant to Python 3
237 | # unused-variable (a little work to cleanup, is all)
238 |
239 | # ...
240 | [REPORTS]
241 |
242 | # Set the output format. Available formats are text, parseable, colorized, msvs
243 | # (visual studio) and html. You can also give a reporter class, eg
244 | # mypackage.mymodule.MyReporterClass.
245 | output-format=parseable
246 |
247 | # Put messages in a separate file for each module / package specified on the
248 | # command line instead of printing them on stdout. Reports (if any) will be
249 | # written in a file name "pylint_global.[txt|html]".
250 | files-output=no
251 |
252 | # Tells whether to display a full report or only the messages
253 | reports=no
254 |
255 | # Python expression which should return a note less than 10 (10 is the highest
256 | # note). You have access to the variables errors warning, statement which
257 | # respectively contain the number of errors / warnings messages and the total
258 | # number of statements analyzed. This is used by the global evaluation report
259 | # (RP0004).
260 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
261 |
262 | # Template used to display messages. This is a python new-style format string
263 | # used to format the message information. See doc for all details
264 | #msg-template=
265 |
266 |
267 | [LOGGING]
268 |
269 | # Logging modules to check that the string format arguments are in logging
270 | # function parameter format
271 | logging-modules=logging
272 |
273 |
274 | [FORMAT]
275 |
276 | # Maximum number of characters on a single line.
277 | max-line-length=120
278 |
279 | # Regexp for a line that is allowed to be longer than the limit.
280 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
281 |
282 | # Allow the body of an if to be on the same line as the test if there is no
283 | # else.
284 | single-line-if-stmt=no
285 |
286 | # List of optional constructs for which whitespace checking is disabled. `dict-
287 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
288 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
289 | # `empty-line` allows space-only lines.
290 | no-space-check=trailing-comma,dict-separator
291 |
292 | # Maximum number of lines in a module
293 | max-module-lines=1000
294 |
295 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
296 | # tab).
297 | indent-string=' '
298 |
299 | # Number of spaces of indent required inside a hanging or continued line.
300 | indent-after-paren=4
301 |
302 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
303 | expected-line-ending-format=
304 |
305 |
306 | [TYPECHECK]
307 |
308 | # Tells whether missing members accessed in mixin class should be ignored. A
309 | # mixin class is detected if its name ends with "mixin" (case insensitive).
310 | ignore-mixin-members=yes
311 |
312 | # List of module names for which member attributes should not be checked
313 | # (useful for modules/projects where namespaces are manipulated during runtime
314 | # and thus existing member attributes cannot be deduced by static analysis. It
315 | # supports qualified module names, as well as Unix pattern matching.
316 | ignored-modules=
317 |
318 | # List of classes names for which member attributes should not be checked
319 | # (useful for classes with attributes dynamically set). This supports can work
320 | # with qualified names.
321 | ignored-classes=
322 |
323 | # List of members which are set dynamically and missed by pylint inference
324 | # system, and so shouldn't trigger E1101 when accessed. Python regular
325 | # expressions are accepted.
326 | generated-members=
327 |
328 |
329 | [VARIABLES]
330 |
331 | # Tells whether we should check for unused import in __init__ files.
332 | init-import=no
333 |
334 | # A regular expression matching the name of dummy variables (i.e. expectedly
335 | # not used).
336 | dummy-variables-rgx=_$|dummy
337 |
338 | # List of additional names supposed to be defined in builtins. Remember that
339 | # you should avoid to define new builtins when possible.
340 | additional-builtins=
341 |
342 | # List of strings which can identify a callback function by name. A callback
343 | # name must start or end with one of those strings.
344 | callbacks=cb_,_cb
345 |
346 |
347 | [SIMILARITIES]
348 |
349 | # Minimum lines number of a similarity.
350 | min-similarity-lines=4
351 |
352 | # Ignore comments when computing similarities.
353 | ignore-comments=yes
354 |
355 | # Ignore docstrings when computing similarities.
356 | ignore-docstrings=yes
357 |
358 | # Ignore imports when computing similarities.
359 | ignore-imports=no
360 |
361 |
362 | [SPELLING]
363 |
364 | # Spelling dictionary name. Available dictionaries: none. To make it working
365 | # install python-enchant package.
366 | spelling-dict=
367 |
368 | # List of comma separated words that should not be checked.
369 | spelling-ignore-words=
370 |
371 | # A path to a file that contains private dictionary; one word per line.
372 | spelling-private-dict-file=
373 |
374 | # Tells whether to store unknown words to indicated private dictionary in
375 | # --spelling-private-dict-file option instead of raising a message.
376 | spelling-store-unknown-words=no
377 |
378 |
379 | [MISCELLANEOUS]
380 |
381 | # List of note tags to take in consideration, separated by a comma.
382 | notes=FIXME,XXX
383 |
384 |
385 | [BASIC]
386 |
387 | # List of builtins function names that should not be used, separated by a comma
388 | bad-functions=map,filter,input
389 |
390 | # Good variable names which should always be accepted, separated by a comma
391 | good-names=i,j,k,ex,Run,_
392 |
393 | # Bad variable names which should always be refused, separated by a comma
394 | bad-names=foo,bar,baz,toto,tutu,tata
395 |
396 | # Colon-delimited sets of names that determine each other's naming style when
397 | # the name regexes allow several styles.
398 | name-group=
399 |
400 | # Include a hint for the correct naming format with invalid-name
401 | include-naming-hint=no
402 |
403 | # Regular expression matching correct function names
404 | function-rgx=[a-z_][a-z0-9_]{2,40}$
405 |
406 | # Naming hint for function names
407 | function-name-hint=[a-z_][a-z0-9_]{2,40}$
408 |
409 | # Regular expression matching correct variable names
410 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
411 |
412 | # Naming hint for variable names
413 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
414 |
415 | # Regular expression matching correct constant names
416 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
417 |
418 | # Naming hint for constant names
419 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
420 |
421 | # Regular expression matching correct attribute names
422 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
423 |
424 | # Naming hint for attribute names
425 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
426 |
427 | # Regular expression matching correct argument names
428 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
429 |
430 | # Naming hint for argument names
431 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
432 |
433 | # Regular expression matching correct class attribute names
434 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
435 |
436 | # Naming hint for class attribute names
437 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
438 |
439 | # Regular expression matching correct inline iteration names
440 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
441 |
442 | # Naming hint for inline iteration names
443 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
444 |
445 | # Regular expression matching correct class names
446 | class-rgx=[A-Z_][a-zA-Z0-9]+$
447 |
448 | # Naming hint for class names
449 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
450 |
451 | # Regular expression matching correct module names
452 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
453 |
454 | # Naming hint for module names
455 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
456 |
457 | # Regular expression matching correct method names
458 | method-rgx=[a-z_][a-z0-9_]{2,30}$
459 |
460 | # Naming hint for method names
461 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
462 |
463 | # Regular expression which should only match function or class names that do
464 | # not require a docstring.
465 | no-docstring-rgx=^_
466 |
467 | # Minimum line length for functions/classes that require docstrings, shorter
468 | # ones are exempt.
469 | docstring-min-length=-1
470 |
471 |
472 | [ELIF]
473 |
474 | # Maximum number of nested blocks for function / method body
475 | max-nested-blocks=5
476 |
477 |
478 | [IMPORTS]
479 |
480 | # Deprecated modules which should not be used, separated by a comma
481 | deprecated-modules=regsub,TERMIOS,Bastion,rexec
482 |
483 | # Create a graph of every (i.e. internal and external) dependencies in the
484 | # given file (report RP0402 must not be disabled)
485 | import-graph=
486 |
487 | # Create a graph of external dependencies in the given file (report RP0402 must
488 | # not be disabled)
489 | ext-import-graph=
490 |
491 | # Create a graph of internal dependencies in the given file (report RP0402 must
492 | # not be disabled)
493 | int-import-graph=
494 |
495 |
496 | [DESIGN]
497 |
498 | # Maximum number of arguments for function / method
499 | max-args=7
500 |
501 | # Argument names that match this expression will be ignored. Default to name
502 | # with leading underscore
503 | ignored-argument-names=_.*
504 |
505 | # Maximum number of locals for function / method body
506 | max-locals=15
507 |
508 | # Maximum number of return / yield for function / method body
509 | max-returns=6
510 |
511 | # Maximum number of branch for function / method body
512 | max-branches=12
513 |
514 | # Maximum number of statements in function / method body
515 | max-statements=50
516 |
517 | # Maximum number of parents for a class (see R0901).
518 | max-parents=7
519 |
520 | # Maximum number of attributes for a class (see R0902).
521 | max-attributes=7
522 |
523 | # Minimum number of public methods for a class (see R0903).
524 | min-public-methods=2
525 |
526 | # Maximum number of public methods for a class (see R0904).
527 | max-public-methods=20
528 |
529 | # Maximum number of boolean expressions in a if statement
530 | max-bool-expr=5
531 |
532 |
533 | [CLASSES]
534 |
535 | # List of method names used to declare (i.e. assign) instance attributes.
536 | defining-attr-methods=__init__,__new__,setUp
537 |
538 | # List of valid names for the first argument in a class method.
539 | valid-classmethod-first-arg=cls
540 |
541 | # List of valid names for the first argument in a metaclass class method.
542 | valid-metaclass-classmethod-first-arg=mcs
543 |
544 | # List of member names, which should be excluded from the protected access
545 | # warning.
546 | exclude-protected=_asdict,_fields,_replace,_source,_make
547 |
548 |
549 | [EXCEPTIONS]
550 |
551 | # Exceptions that will emit a warning when being caught. Defaults to
552 | # "Exception"
553 | overgeneral-exceptions=Exception
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 |
3 | ## 2.3.0 (2021-12-16)
4 | ### Added
5 | - Transformation of specific fields in object/array type properties in `RECORD` by using XPath syntax.
6 | - Conditions on specific fields in object/array type properties in `RECORD`.
7 |
8 | ## 2.2.0 (2021-09-17)
9 | ### Added
10 | - New transformation MASK-STRING-SKIP-ENDS-n. The transformation masks the string except start and end n-characters.
11 |
12 | ## 2.1.0 (2021-03-11)
13 | ### Added
14 | - `--validate` flag to do one-off validation of the transformation config using a given catalog file.
15 |
16 | ### Changed
17 | - Validation of the transformation during runtime whenever a new `SCHEMA` type message has been received.
18 |
19 |
20 | ## 2.0.0 (2020-03-17)
21 |
22 | ### Changed
23 | - Stop trimming transformed values
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2012 The Obvious Corporation and contributors.
2 |
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 |          http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 |
16 | ```
17 | -------------------------------------------------------------------------
18 | Apache License
19 | Version 2.0, January 2004
20 | http://www.apache.org/licenses/
21 |
22 |
23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
24 |
25 | 1. Definitions.
26 |
27 | "License" shall mean the terms and conditions for use, reproduction,
28 | and distribution as defined by Sections 1 through 9 of this document.
29 |
30 | "Licensor" shall mean the copyright owner or entity authorized by
31 | the copyright owner that is granting the License.
32 |
33 | "Legal Entity" shall mean the union of the acting entity and all
34 | other entities that control, are controlled by, or are under common
35 | control with that entity. For the purposes of this definition,
36 | "control" means (i) the power, direct or indirect, to cause the
37 | direction or management of such entity, whether by contract or
38 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
39 | outstanding shares, or (iii) beneficial ownership of such entity.
40 |
41 | "You" (or "Your") shall mean an individual or Legal Entity
42 | exercising permissions granted by this License.
43 |
44 | "Source" form shall mean the preferred form for making modifications,
45 | including but not limited to software source code, documentation
46 | source, and configuration files.
47 |
48 | "Object" form shall mean any form resulting from mechanical
49 | transformation or translation of a Source form, including but
50 | not limited to compiled object code, generated documentation,
51 | and conversions to other media types.
52 |
53 | "Work" shall mean the work of authorship, whether in Source or
54 | Object form, made available under the License, as indicated by a
55 | copyright notice that is included in or attached to the work
56 | (an example is provided in the Appendix below).
57 |
58 | "Derivative Works" shall mean any work, whether in Source or Object
59 | form, that is based on (or derived from) the Work and for which the
60 | editorial revisions, annotations, elaborations, or other modifications
61 | represent, as a whole, an original work of authorship. For the purposes
62 | of this License, Derivative Works shall not include works that remain
63 | separable from, or merely link (or bind by name) to the interfaces of,
64 | the Work and Derivative Works thereof.
65 |
66 | "Contribution" shall mean any work of authorship, including
67 | the original version of the Work and any modifications or additions
68 | to that Work or Derivative Works thereof, that is intentionally
69 | submitted to Licensor for inclusion in the Work by the copyright owner
70 | or by an individual or Legal Entity authorized to submit on behalf of
71 | the copyright owner. For the purposes of this definition, "submitted"
72 | means any form of electronic, verbal, or written communication sent
73 | to the Licensor or its representatives, including but not limited to
74 | communication on electronic mailing lists, source code control systems,
75 | and issue tracking systems that are managed by, or on behalf of, the
76 | Licensor for the purpose of discussing and improving the Work, but
77 | excluding communication that is conspicuously marked or otherwise
78 | designated in writing by the copyright owner as "Not a Contribution."
79 |
80 | "Contributor" shall mean Licensor and any individual or Legal Entity
81 | on behalf of whom a Contribution has been received by Licensor and
82 | subsequently incorporated within the Work.
83 |
84 | 2. Grant of Copyright License. Subject to the terms and conditions of
85 | this License, each Contributor hereby grants to You a perpetual,
86 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
87 | copyright license to reproduce, prepare Derivative Works of,
88 | publicly display, publicly perform, sublicense, and distribute the
89 | Work and such Derivative Works in Source or Object form.
90 |
91 | 3. Grant of Patent License. Subject to the terms and conditions of
92 | this License, each Contributor hereby grants to You a perpetual,
93 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
94 | (except as stated in this section) patent license to make, have made,
95 | use, offer to sell, sell, import, and otherwise transfer the Work,
96 | where such license applies only to those patent claims licensable
97 | by such Contributor that are necessarily infringed by their
98 | Contribution(s) alone or by combination of their Contribution(s)
99 | with the Work to which such Contribution(s) was submitted. If You
100 | institute patent litigation against any entity (including a
101 | cross-claim or counterclaim in a lawsuit) alleging that the Work
102 | or a Contribution incorporated within the Work constitutes direct
103 | or contributory patent infringement, then any patent licenses
104 | granted to You under this License for that Work shall terminate
105 | as of the date such litigation is filed.
106 |
107 | 4. Redistribution. You may reproduce and distribute copies of the
108 | Work or Derivative Works thereof in any medium, with or without
109 | modifications, and in Source or Object form, provided that You
110 | meet the following conditions:
111 |
112 | (a) You must give any other recipients of the Work or
113 | Derivative Works a copy of this License; and
114 |
115 | (b) You must cause any modified files to carry prominent notices
116 | stating that You changed the files; and
117 |
118 | (c) You must retain, in the Source form of any Derivative Works
119 | that You distribute, all copyright, patent, trademark, and
120 | attribution notices from the Source form of the Work,
121 | excluding those notices that do not pertain to any part of
122 | the Derivative Works; and
123 |
124 | (d) If the Work includes a "NOTICE" text file as part of its
125 | distribution, then any Derivative Works that You distribute must
126 | include a readable copy of the attribution notices contained
127 | within such NOTICE file, excluding those notices that do not
128 | pertain to any part of the Derivative Works, in at least one
129 | of the following places: within a NOTICE text file distributed
130 | as part of the Derivative Works; within the Source form or
131 | documentation, if provided along with the Derivative Works; or,
132 | within a display generated by the Derivative Works, if and
133 | wherever such third-party notices normally appear. The contents
134 | of the NOTICE file are for informational purposes only and
135 | do not modify the License. You may add Your own attribution
136 | notices within Derivative Works that You distribute, alongside
137 | or as an addendum to the NOTICE text from the Work, provided
138 | that such additional attribution notices cannot be construed
139 | as modifying the License.
140 |
141 | You may add Your own copyright statement to Your modifications and
142 | may provide additional or different license terms and conditions
143 | for use, reproduction, or distribution of Your modifications, or
144 | for any such Derivative Works as a whole, provided Your use,
145 | reproduction, and distribution of the Work otherwise complies with
146 | the conditions stated in this License.
147 |
148 | 5. Submission of Contributions. Unless You explicitly state otherwise,
149 | any Contribution intentionally submitted for inclusion in the Work
150 | by You to the Licensor shall be under the terms and conditions of
151 | this License, without any additional terms or conditions.
152 | Notwithstanding the above, nothing herein shall supersede or modify
153 | the terms of any separate license agreement you may have executed
154 | with Licensor regarding such Contributions.
155 |
156 | 6. Trademarks. This License does not grant permission to use the trade
157 | names, trademarks, service marks, or product names of the Licensor,
158 | except as required for reasonable and customary use in describing the
159 | origin of the Work and reproducing the content of the NOTICE file.
160 |
161 | 7. Disclaimer of Warranty. Unless required by applicable law or
162 | agreed to in writing, Licensor provides the Work (and each
163 | Contributor provides its Contributions) on an "AS IS" BASIS,
164 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
165 | implied, including, without limitation, any warranties or conditions
166 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
167 | PARTICULAR PURPOSE. You are solely responsible for determining the
168 | appropriateness of using or redistributing the Work and assume any
169 | risks associated with Your exercise of permissions under this License.
170 |
171 | 8. Limitation of Liability. In no event and under no legal theory,
172 | whether in tort (including negligence), contract, or otherwise,
173 | unless required by applicable law (such as deliberate and grossly
174 | negligent acts) or agreed to in writing, shall any Contributor be
175 | liable to You for damages, including any direct, indirect, special,
176 | incidental, or consequential damages of any character arising as a
177 | result of this License or out of the use or inability to use the
178 | Work (including but not limited to damages for loss of goodwill,
179 | work stoppage, computer failure or malfunction, or any and all
180 | other commercial damages or losses), even if such Contributor
181 | has been advised of the possibility of such damages.
182 |
183 | 9. Accepting Warranty or Additional Liability. While redistributing
184 | the Work or Derivative Works thereof, You may choose to offer,
185 | and charge a fee for, acceptance of support, warranty, indemnity,
186 | or other liability obligations and/or rights consistent with this
187 | License. However, in accepting such obligations, You may act only
188 | on Your own behalf and on Your sole responsibility, not on behalf
189 | of any other Contributor, and only if You agree to indemnify,
190 | defend, and hold each Contributor harmless for any liability
191 | incurred by, or claims asserted against, such Contributor by reason
192 | of your accepting any such warranty or additional liability.
193 |
194 | END OF TERMS AND CONDITIONS
195 | ```
196 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Notice
2 | To better serve Wise business and customer needs, the PipelineWise codebase needs to shrink.
3 | We have made the difficult decision that, going forward, many components of PipelineWise will be removed or incorporated into the main repo.
4 | The last version before this decision is [v0.64.1](https://github.com/transferwise/pipelinewise/tree/v0.64.1).
5 |
6 | We thank everyone in the open-source community who, over the past six years, has helped make PipelineWise a robust product for heterogeneous replication of many terabytes of data, daily.
7 |
8 | # pipelinewise-transform-field
9 |
10 | [![PyPI version](https://badge.fury.io/py/pipelinewise-transform-field.svg)](https://badge.fury.io/py/pipelinewise-transform-field)
11 | [![Python versions](https://img.shields.io/pypi/pyversions/pipelinewise-transform-field.svg)](https://pypi.org/project/pipelinewise-transform-field/)
12 | [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
13 |
14 | Transformation component between [Singer](https://www.singer.io/) taps and targets.
15 |
16 | This is a [PipelineWise](https://transferwise.github.io/pipelinewise) compatible component.
17 |
18 | ## How to use it
19 |
20 | The recommended method of running this component is to use it from [PipelineWise](https://transferwise.github.io/pipelinewise). When running it from PipelineWise you don't need to configure this component with JSON files, and most things are automated.
21 | Please check the related documentation at [Transformations](https://transferwise.github.io/pipelinewise/user_guide/transformations.html).
22 |
23 | If you want to run this [Singer](https://singer.io) compatible component independently please read further.
24 |
25 | ## Install
26 |
27 | First, make sure Python 3 is installed on your system or follow these
28 | installation instructions for [Mac](http://docs.python-guide.org/en/latest/starting/install3/osx/) or
29 | [Ubuntu](https://www.digitalocean.com/community/tutorials/how-to-install-python-3-and-set-up-a-local-programming-environment-on-ubuntu-16-04).
30 |
31 | It's recommended to use a virtualenv:
32 |
33 | ```bash
34 | python3 -m venv venv && . venv/bin/activate
35 | pip install pipelinewise-transform-field
36 | ```
37 |
38 | or
39 |
40 | ```bash
41 | python3 -m venv venv
42 | . venv/bin/activate
43 | pip install --upgrade pip setuptools
44 | pip install .
45 | ```
46 |
47 | ### To validate transformations
48 |
49 | `transform-field --validate --config [config.json] --catalog [catalog.json]`
50 |
51 | ### To run
52 |
53 | Put it between a tap and a target with simple unix pipes:
54 |
55 | `some-singer-tap | transform-field --config [config.json] | some-singer-target`
56 |
57 | It reads incoming messages from STDIN and uses `config.json` to transform incoming RECORD messages.
58 |
59 | **Note**: To avoid version conflicts run the tap, the transformer and the target in separate virtual environments.
60 |
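61 | For a quick local smoke test you can pipe hand-written Singer messages through it (hypothetical stream and column names; this assumes `config.json` defines a transformation for the `users` stream):
62 |
63 | ```bash
64 | cat <<'EOF' | transform-field --config config.json
65 | {"type": "SCHEMA", "stream": "users", "schema": {"properties": {"id": {"type": ["integer"]}, "email": {"type": ["null", "string"]}}}, "key_properties": ["id"]}
66 | {"type": "RECORD", "stream": "users", "record": {"id": 1, "email": "someone@example.com"}}
67 | EOF
68 | ```
69 |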
61 | ### Transformation types
62 |
63 | The following are the transformation types supported by _pipelinewise-transform-field_:
64 |
65 | * **SET-NULL**: Transforms any input to NULL
66 | * **HASH**: Transforms string input to hash
67 | * **HASH-SKIP-FIRST-n**: Transforms string input to hash skipping first n characters, e.g. HASH-SKIP-FIRST-2
68 | * **MASK-DATE**: Replaces the months and day parts of date columns to be always 1st of Jan
69 | * **MASK-NUMBER**: Transforms any numeric value to zero
70 | * **MASK-HIDDEN**: Transforms any string to 'hidden'
71 | * **MASK-STRING-SKIP-ENDS-n**: Transforms string input to masked version skipping first and last n characters, e.g. MASK-STRING-SKIP-ENDS-3
72 |
73 | _PS_: 1 <= n <= 9
74 |
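75 | As a rough illustration of the parameterised masking types, here is a sketch of the intended semantics (the function names are made up and this is not the component's actual implementation):
76 |
77 | ```python
78 | import hashlib
79 |
80 | def hash_skip_first(value: str, n: int) -> str:
81 |     # HASH-SKIP-FIRST-n: keep the first n characters, hash the rest
82 |     # (sha256 here is illustrative, not necessarily the hash used)
83 |     return value[:n] + hashlib.sha256(value[n:].encode()).hexdigest()
84 |
85 | def mask_string_skip_ends(value: str, n: int) -> str:
86 |     # MASK-STRING-SKIP-ENDS-n: keep the first and last n characters and
87 |     # mask the middle; short strings (len <= 2*n) are fully masked here
88 |     if len(value) <= 2 * n:
89 |         return '*' * len(value)
90 |     return value[:n] + '*' * (len(value) - 2 * n) + value[-n:]
91 | ```
92 |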
75 | ### Conditional transformations
76 |
77 | It is possible to transform a record's property based on some given condition(s); the transformation will only take place when all conditions are met.
78 |
79 | A condition is a combination of:
80 | * column [required]: the field in the record to inspect
81 | * operation [required]: the comparison type to use; the supported ones are `equals` and `regex_match`
82 | * value [required]: the column value to look for in records
83 |
84 | **An equality condition on a column**
85 | ```json
86 | {
87 | "column": "",
88 | "equals":
89 | }
90 | ```
91 |
92 | **A regex condition on a column**
93 | ```json
94 | {
95 | "column": "",
96 | "regex_match": ""
97 | }
98 | ```
99 |
100 | **A condition on a property within a JSON-type column**
101 | ```json
102 | {
103 | "column": "",
104 | "field_path": "",
105 | "equals":
106 | }
107 | ```
108 |
109 | ### Configuration
110 |
111 | You need to define which columns have to be transformed by which method and under which conditions the transformation needs to be applied.
112 |
113 | #### Basic transformation
114 | A basic transformation, where a field is transformed in every record of a stream, can be achieved with:
115 | ```json
116 | {
117 | "tap_stream_name": "",
118 | "field_id": "",
119 | "type": ""
120 | }
121 | ```
122 |
123 | #### Transformation within JSON
124 |
125 | In order to transform one or more properties within a JSON type field, you can make use of the `field_paths` property:
126 |
127 | ```json
128 | {
129 | "tap_stream_name": "",
130 | "field_id": "",
131 | "field_paths": ["xpath to property 1", "xpath to property 2"],
132 | "type": ""
133 | }
134 | ```
135 |
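136 | For instance (hypothetical column and path), applying `MASK-HIDDEN` with `"field_paths": ["user/address"]` to a JSON column would turn `{"user": {"address": "1 Main St", "name": "Jo"}}` into `{"user": {"address": "hidden", "name": "Jo"}}`, leaving the other properties untouched.
137 |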
136 | #### Conditional Transformation
137 |
138 | To apply transformation conditionally, you can make use of the property `when` which can have one or many conditions:
139 |
140 | ```json
141 | {
142 | "tap_stream_name": "",
143 | "field_id": "",
144 | "type": "",
145 | "when": [
146 | {"column": "string_col_1", "equals": "some value"},
147 | {"column": "string_col_2", "regex_match": ".*PII.*"},
148 | {"column": "numeric_col_1", "equals": 33},
149 | {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive"}
150 | ]
151 | }
152 | ```
153 |
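154 | With a config like the one above (hypothetical column names and values), a record is transformed only when *all* the listed conditions hold; if any single condition fails, the record passes through unchanged.
155 |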
154 | **Sample config**
155 | [config.json](./sample_config.json)
156 |
157 | (Tip: PipelineWise generates this for you from a more readable YAML format)
158 |
159 |
160 | ### To check code style:
161 |
162 | 1. Install python dependencies in a virtual env
163 | ```
164 | python3 -m venv venv
165 | . venv/bin/activate
166 | pip install --upgrade pip setuptools
167 | pip install .[test]
168 | ```
169 |
170 | 2. Run pylint
171 | ```shell
172 | pylint transform_field
173 | ```
174 |
175 | ### To run tests:
176 |
177 | 1. Install python dependencies in a virtual env and run unit and integration tests
178 | ```
179 | python3 -m venv venv
180 | . venv/bin/activate
181 | pip install --upgrade pip setuptools
182 | pip install .[test]
183 | ```
184 |
185 | 2. Run tests:
186 |
187 | * Unit tests
188 | ```
189 | pytest -v tests/unit
190 | ```
191 |
192 | * Integration tests
193 | ```
194 | pytest -v tests/integration
195 | ```
196 |
197 | * All tests
198 | ```
199 | pytest -v tests
200 | ```
201 |
202 |
203 |
204 | ## License
205 |
206 | Apache License Version 2.0
207 |
208 | See [LICENSE](LICENSE) to see the full text.
209 |
210 |
--------------------------------------------------------------------------------
/sample_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "transformations": [
3 | {
4 | "field_id": "password_hash",
5 | "tap_stream_name": "stream-id-sent-by-the-tap",
6 | "type": "MASK-HIDDEN"
7 | },
8 | {
9 | "field_id": "salt",
10 | "tap_stream_name": "stream-id-sent-by-the-tap",
11 | "type": "HASH"
12 | },
13 | {
14 | "field_id": "value",
15 | "tap_stream_name": "stream-id-sent-by-the-tap",
16 | "type": "SET-NULL",
17 | "when": [
18 | {"column": "string_column_1", "equals": "Property" },
19 | {"column": "numeric_column", "equals": 200 },
20 | {"column": "string_column_2", "regex_match": "sensitive.*PII" },
21 | {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive" }
22 | ]
23 | },
24 | {
25 | "field_id": "metadata",
26 | "tap_stream_name": "stream-id-sent-by-the-tap",
27 | "type": "MASK-HIDDEN",
28 | "field_paths": ["user/address", "user/zip_code"]
29 | }
30 | ]
31 | }
--------------------------------------------------------------------------------
/sample_logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root
3 |
4 | [handlers]
5 | keys=stderr
6 |
7 | [formatters]
8 | keys=child
9 |
10 | [logger_root]
11 | level=INFO
12 | handlers=stderr
13 | formatter=child
14 | propagate=0
15 |
16 | [handler_stderr]
17 | level=INFO
18 | class=StreamHandler
19 | formatter=child
20 | args=(sys.stderr,)
21 |
22 | [formatter_child]
23 | class=logging.Formatter
24 | format=time=%(asctime)s name=%(name)s level=%(levelname)s message=%(message)s
25 | datefmt=%Y-%m-%d %H:%M:%S
26 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup
4 |
5 | with open("README.md", "r") as fh:
6 | long_description = fh.read()
7 |
8 | setup(name='pipelinewise-transform-field',
9 | version='2.3.0',
10 | description='Singer.io simple field transformer between taps and targets - PipelineWise compatible',
11 | long_description=long_description,
12 | long_description_content_type='text/markdown',
13 | author="Wise",
14 | url='https://github.com/transferwise/pipelinewise-transform-field',
15 | classifiers=[
16 | 'License :: OSI Approved :: Apache Software License',
17 | 'Environment :: Console',
18 | 'Programming Language :: Python :: 3 :: Only',
19 | 'Programming Language :: Python :: 3.6',
20 | 'Programming Language :: Python :: 3.7',
21 | 'Programming Language :: Python :: 3.8'
22 | ],
23 | py_modules=['transform_field'],
24 | install_requires=[
25 | 'pipelinewise-singer-python==1.*',
26 | 'dpath==2.0.*',
27 | ],
28 | extras_require={
29 | 'test': [
30 | 'pytest==6.2.*',
31 | 'pytest-cov==3.0.*',
32 | 'pylint==2.12.*',
33 | ]
34 | },
35 | entry_points='''
36 | [console_scripts]
37 | transform-field=transform_field:main
38 | ''',
39 | packages=['transform_field']
40 | )
41 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/__init__.py
--------------------------------------------------------------------------------
/tests/integration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/integration/__init__.py
--------------------------------------------------------------------------------
/tests/integration/resources/catalog.json:
--------------------------------------------------------------------------------
1 | {
2 | "streams": [
3 | {
4 | "metadata": [
5 | {
6 | "breadcrumb": [],
7 | "metadata": {
8 | "replication-method": "FULL_TABLE",
9 | "selected": true,
10 | "selected-by-default": false,
11 | "table-key-properties": [
12 | "column_1"
13 | ]
14 | }
15 | }
16 | ],
17 | "schema": {
18 | "properties": {
19 | "column_1": {
20 | "format": "date-time",
21 | "inclusion": "available",
22 | "type": [
23 | "null",
24 | "string"
25 | ]
26 | },
27 | "column_2": {
28 | "inclusion": "automatic",
29 | "maximum": 2147483647,
30 | "minimum": -2147483648,
31 | "type": [
32 | "null",
33 | "integer"
34 | ]
35 | },
36 | "column_3": {
37 | "inclusion": "automatic",
38 | "maximum": 2147483647,
39 | "minimum": -2147483648,
40 | "type": [
41 | "null",
42 | "integer"
43 | ]
44 | },
45 | "column_4": {
46 | "inclusion": "automatic",
47 | "maximum": 2147483647,
48 | "minimum": -2147483648,
49 | "type": [
50 | "null",
51 | "integer"
52 | ]
53 | },
54 | "column_5": {
55 | "format": "date-time",
56 | "inclusion": "available",
57 | "type": [
58 | "null",
59 | "string"
60 | ]
61 | }
62 | },
63 | "type": "object"
64 | },
65 | "tap_stream_id": "dummy_stream"
66 | }
67 | ]
68 | }
--------------------------------------------------------------------------------
/tests/integration/resources/invalid_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "transformations":[
3 | {
4 | "tap_stream_name":"dummy_stream",
5 | "field_id":"column_1",
6 | "type":"SET-NULL"
7 | },
8 | {
9 | "tap_stream_name":"dummy_stream",
10 | "field_id":"column_2",
11 | "type":"HASH"
12 | },
13 | {
14 | "tap_stream_name": "dummy_stream",
15 | "field_id": "column_5",
16 | "type": "MASK-DATE"
17 | }
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
/tests/integration/resources/invalid_messages.json:
--------------------------------------------------------------------------------
1 | {"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one"}}
2 | {"type": "SCHEMA", "stream": "tap_mysql_test-test_table_one", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}}, "type": "object"}, "key_properties": ["c_pk"]}
3 | THIS IS A TEST INPUT FROM A TAP WITH A LINE WITH INVALID JSON
4 | {"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1}
5 |
--------------------------------------------------------------------------------
/tests/integration/resources/messages.json:
--------------------------------------------------------------------------------
1 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream"}}
2 | {"type": "SCHEMA", "stream": "dummy_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_4": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "type": ["null", "integer"]}, "column_7": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_8": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_9": {"inclusion": "available", "type": ["null", "integer"]}, "column_10": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_11": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_12": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_13": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_14": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
3 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1}
4 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "Dummy row 1", "column_4": "Dummy row 1", "column_5": "2019-12-21T12:12:45", "column_6": 1234, "column_7": "Dummy row 1", "column_8": "2019-12-21T12:12:45", "column_9": 100, "column_10": "column_11 is safe to keep", "column_11": "My name is John", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
5 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "Dummy row 2", "column_4": "Dummy row 2", "column_5": "2019-12-21T13:12:45", "column_6": 1234, "column_7": "Dummy row 2", "column_8": "2019-12-21T13:12:45", "column_9": 200, "column_10": "column_11 has sensitive data. Needs to transform to NULL", "column_11": "SUPER_SECRET_PASSWORD", "column_12": "abcd", "column_13": "nom", "column_14": "maskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
6 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "Dummy row 3", "column_4": "Dummy row 3", "column_5": "2019-12-21T14:12:45", "column_6": 1234, "column_7": "Dummy row 3", "column_8": "2019-12-21T14:12:45", "column_9": 300, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
7 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "Dummy row 4", "column_4": "Dummy row 4", "column_5": "2019-12-21T15:12:45", "column_6": 1234, "column_7": "Dummy row 4", "column_8": "2019-12-21T15:12:45", "column_9": 400, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
8 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "Dummy row 5", "column_4": "Dummy row 5", "column_5": "2019-12-21T16:12:45", "column_6": 1234, "column_7": "Dummy row 5", "column_8": "2019-12-21T16:12:45", "column_9": 500, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
9 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream", "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}}
10 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1}
11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}}
12 |
--------------------------------------------------------------------------------
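The expected HASH and HASH-SKIP-FIRST-n values asserted in test_integrations.py below follow from the fixture rows above. A minimal sketch of the assumed masking arithmetic (keep the first n characters in clear text, sha256 the rest), consistent with the 'Du...'/'Dum...' prefixes in the expected digests:

```python
import hashlib

def hash_skip_first(value: str, n: int) -> str:
    # Assumed behaviour of HASH-SKIP-FIRST-n: the first n characters are kept
    # in clear text and the remainder is replaced by its sha256 hex digest.
    return value[:n] + hashlib.sha256(value[n:].encode('utf-8')).hexdigest()

# HASH is the n=0 case, matching test_hash in tests/unit/test_transform.py:
assert hash_skip_first('John', 0) == hashlib.sha256('John'.encode('utf-8')).hexdigest()
```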
/tests/integration/resources/streams_with_changing_schema.json:
--------------------------------------------------------------------------------
1 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "integer"]}}}, "key_properties": []}
2 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 1}}
3 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 2}}
4 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 3}}
5 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "string"]}}}, "key_properties": []}
6 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": "ABC"}}
--------------------------------------------------------------------------------
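This fixture retypes column_2 from integer to string in the second SCHEMA message, so a MASK-NUMBER transformation on that column stops validating mid-stream. A minimal sketch of driving it, mirroring test_messages_with_changing_schema below (the path assumes the repo root as working directory):

```python
from transform_field import TransformField, InvalidTransformationException

trans_config = {'transformations': [
    {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'MASK-NUMBER'},
]}

with open('tests/integration/resources/streams_with_changing_schema.json') as tap_stdout:
    tap_lines = tap_stdout.readlines()

# The second SCHEMA message changes column_2 to a string type, so the
# MASK-NUMBER transformation no longer validates and consume() raises.
try:
    TransformField(trans_config).consume(tap_lines)
except InvalidTransformationException:
    print('MASK-NUMBER rejected after the schema change')
```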
/tests/integration/resources/streams_with_object.json:
--------------------------------------------------------------------------------
1 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream"}}
2 | {"type": "SCHEMA", "stream": "my_cool_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_4": {"inclusion": "available", "type": ["null", "integer"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
3 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1}
4 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "2019-12-21T12:12:45", "column_4": 1234, "column_5": "2021-12-21T12:12:45", "column_6": {"id": 50, "key1": "A", "key2": {"key2_2": 41}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
5 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "2019-12-21T13:12:45", "column_4": 4, "column_5": "2021-12-21T13:12:45", "column_6": {"id": 51, "key1": "B", "key2": {"key2_1": "ds"}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
6 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "2019-12-21T14:12:45", "column_4": 15, "column_5": "2021-12-21T14:12:45", "column_6": {"id": 52, "key1": "C", "key2": {"key2_1": "xv43dgf", "key2_2": 4544}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
7 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 4, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "2019-12-21T15:12:45", "column_4": 1000, "column_5": "2021-12-21T15:12:45", "column_6": {"id": 53, "key1": "D", "key2": {"key2_1": "43xvf", "key2_2": true}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
8 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "2019-12-21T16:12:45", "column_4": -44, "column_5": "2021-12-21T16:12:45", "column_6": {"id": 54, "key1": "E", "key2": {"key2_1": "trter", "key2_3": false}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
9 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream", "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}}
10 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1}
11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}}
12 |
--------------------------------------------------------------------------------
/tests/integration/resources/valid_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "transformations":[
3 | {
4 | "tap_stream_name":"dummy_stream",
5 | "field_id":"column_1",
6 | "type":"SET-NULL"
7 | },
8 | {
9 | "tap_stream_name":"dummy_stream",
10 | "field_id":"column_2",
11 | "type":"MASK-NUMBER"
12 | },
13 | {
14 | "tap_stream_name": "dummy_stream",
15 | "field_id": "column_5",
16 | "type": "MASK-DATE"
17 | }
18 | ]
19 | }
20 |
--------------------------------------------------------------------------------
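The config above is the shape the transformer consumes; a minimal sketch of wiring it up programmatically, as the integration tests below do (paths assume the repo root as working directory):

```python
import json
from transform_field import TransformField

# Load a transformations config like valid_config.json above.
with open('tests/integration/resources/valid_config.json') as config_file:
    trans_config = json.load(config_file)

transformer = TransformField(trans_config)

# consume() reads raw singer message lines and emits the transformed
# messages to stdout, exactly as the integration tests below exercise it.
with open('tests/integration/resources/messages.json') as tap_stdout:
    transformer.consume(tap_stdout.readlines())
```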
/tests/integration/test_integrations.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import unittest
3 | import os
4 | import sys
5 | import json
6 | import tempfile
7 |
8 | from transform_field import TransformField, TransformFieldException, InvalidTransformationException
9 |
10 |
11 | class Base(unittest.TestCase):
12 | def setUp(self):
13 | self.maxDiff = None
14 |
15 | sys.stdout = self._stdout = tempfile.NamedTemporaryFile('w+', delete=True)
16 | sys.stderr.write(self._stdout.name + ' ')
17 |
18 |     def tearDown(self):
19 | self._stdout.close()
20 | sys.stdout = sys.__stdout__
21 |
22 | @property
23 | def stdout(self):
24 | self._stdout.seek(0)
25 |         return self._stdout.read()[:-1]  # Remove trailing \n
26 |
27 | def get_tap_input_messages(self, filename):
28 | lines = []
29 | with open('{}/resources/{}'.format(os.path.dirname(__file__), filename)) as tap_stdout:
30 | for line in tap_stdout.readlines():
31 | lines.append(line)
32 |
33 | return lines
34 |
35 | def singer_output_to_objects(self, output):
36 | messages = []
37 | for message in output.splitlines():
38 | messages.append(json.loads(message))
39 |
40 | return messages
41 |
42 |
43 | class TestIntegration(Base):
44 |
45 | def test_invalid_json(self):
46 | """Receiving invalid JSONs should raise an exception"""
47 | tap_lines = self.get_tap_input_messages('invalid_messages.json')
48 | trans_config = {'transformations': []}
49 |
50 | transform_field = TransformField(trans_config)
51 | with self.assertRaises(TransformFieldException):
52 | transform_field.consume(tap_lines)
53 |
54 | def test_multiple_singer_json_messages(self):
55 | """Test a bunch of singer messages with different field transformation types"""
56 | tap_lines = self.get_tap_input_messages('messages.json')
57 |
58 | # Set transformations on some columns
59 | trans_config = {'transformations': [
60 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_1', 'type': 'SET-NULL'},
61 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'HASH'},
62 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_3', 'type': 'HASH-SKIP-FIRST-2'},
63 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_4', 'type': 'HASH-SKIP-FIRST-3'},
64 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_5', 'type': 'MASK-DATE'},
65 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_6', 'type': 'MASK-NUMBER'},
66 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_11', 'type': 'SET-NULL',
67 | 'when': [
68 | {'column': 'column_7', 'equals': "Dummy row 2"},
69 | {'column': 'column_9', 'equals': 200},
70 | {'column': 'column_10', 'regex_match': 'sensitive'},
71 | ]
72 | },
73 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_12', 'type': 'MASK-HIDDEN'},
74 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_13', 'type': 'MASK-STRING-SKIP-ENDS-2'},
75 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_14', 'type': 'MASK-STRING-SKIP-ENDS-3'}
76 | ]}
77 |
78 | transform_field = TransformField(trans_config)
79 | transform_field.consume(tap_lines)
80 |
81 | singer_output_messages = self.singer_output_to_objects(self.stdout)
82 |
83 | # First message is the STATE message
84 | self.assertEqual(
85 | singer_output_messages[0],
86 | {
87 | 'type': 'STATE',
88 | 'value': {'currently_syncing': 'dummy_stream'}
89 | }
90 | )
91 |
92 | # Second message is the SCHEMA message
93 | self.assertEqual(
94 | singer_output_messages[1],
95 | {
96 | 'type': 'SCHEMA',
97 | 'stream': 'dummy_stream',
98 | 'schema': {
99 | 'properties': {
100 | 'c_pk': {'inclusion': 'automatic', 'minimum': -2147483648, 'maximum': 2147483647,
101 | 'type': ['null', 'integer']},
102 | 'column_1': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
103 | 'column_2': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
104 | 'column_3': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
105 | 'column_4': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
106 | 'column_5': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']},
107 | 'column_6': {'inclusion': 'available', 'type': ['null', 'integer']},
108 | 'column_7': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
109 | 'column_8': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']},
110 | 'column_9': {'inclusion': 'available', 'type': ['null', 'integer']},
111 | 'column_10': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
112 | 'column_11': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
113 | 'column_12': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
114 | 'column_13': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
115 | 'column_14': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
116 | },
117 | 'type': 'object'
118 | },
119 | 'key_properties': ['c_pk']
120 | }
121 | )
122 |
123 | # Third message is a RECORD message with transformed values
124 | self.assertEqual(
125 | singer_output_messages[2],
126 | {
127 | 'type': 'RECORD',
128 | 'stream': 'dummy_stream',
129 | 'record': {
130 | 'c_pk': 1,
131 | 'column_1': None, # should be SET-NULL transformed
132 | 'column_2': 'c584d22683f3e523df9a7396e7939c0da16af89976b613adfe4bcd4c9c526f32',
133 | # Should be HASH transformed
134 | 'column_3': 'Ducd571661edac8d47669a60b964c7124b228b69862cd21d548794af41c139a8e3',
135 |                     # Should be HASH-SKIP-FIRST-2 transformed
136 | 'column_4': 'Dum1fe9627d907b0a37a31b270cc0f660a7388eb470a2558e839e0c1f601aedfaa7',
137 |                     # Should be HASH-SKIP-FIRST-3 transformed
138 | 'column_5': '2019-01-01T12:12:45', # Should be MASK-DATE transformed
139 | 'column_6': 0, # Should be MASK-NUMBER transformed
140 |                     'column_7': 'Dummy row 1', # Should be the original value - Unknown transformation type
141 | 'column_8': '2019-12-21T12:12:45', # Should be the original date-time value
142 | 'column_9': 100, # Should be the original number value
143 |
144 | # Conditional transformation
145 | 'column_10': 'column_11 is safe to keep',
146 | 'column_11': 'My name is John',
147 |
148 | 'column_12': 'hidden',
149 |
150 | # Should be MASK-STRING-SKIP-ENDS-2 transformed
151 | 'column_13': 'do****me',
152 | # Should be MASK-STRING-SKIP-ENDS-3 transformed
153 | 'column_14': 'dom**kme',
154 | },
155 | 'version': 1,
156 | 'time_extracted': '2019-01-31T15:51:50.215998Z'
157 | }
158 | )
159 |
160 |         # Fourth message is a RECORD message with transformed values
161 | self.assertEqual(
162 | singer_output_messages[3],
163 | {
164 | 'type': 'RECORD',
165 | 'stream': 'dummy_stream',
166 | 'record': {
167 | 'c_pk': 2,
168 | 'column_1': None, # should be SET-NULL transformed
169 | 'column_2': '12c7ca803f4ae4044b8c3a6aa7dbaf9fe73a25e12f2258dbf8a832961ac6abab',
170 |                     # Should be HASH transformed
171 | 'column_3': 'Du7c2717bbc7489d36cea73c8519c815ce962142a5b32db413abe0bce7f58d943f',
172 |                     # Should be HASH-SKIP-FIRST-2 transformed
173 | 'column_4': 'Dum5b2be872199a84657234144caec9106483a522edd36783c7a12439bcf3853c56',
174 |                     # Should be HASH-SKIP-FIRST-3 transformed
175 | 'column_5': '2019-01-01T13:12:45', # Should be MASK-DATE transformed
176 | 'column_6': 0, # Should be MASK-NUMBER transformed
177 |                     'column_7': 'Dummy row 2', # Should be the original value - Unknown transformation type
178 | 'column_8': '2019-12-21T13:12:45', # Should be the original date-time value
179 | 'column_9': 200, # Should be the original number value
180 |
181 | # Conditional transformation
182 | 'column_10': 'column_11 has sensitive data. Needs to transform to NULL',
183 | 'column_11': None, # Should be SET-NULL transformed
184 |
185 | 'column_12': 'hidden',
186 |
187 | # Should be MASK-STRING-SKIP-ENDS-2 transformed
188 | 'column_13': '***',
189 | # Should be MASK-STRING-SKIP-ENDS-3 transformed
190 | 'column_14': '******',
191 | },
192 | 'version': 1,
193 | 'time_extracted': '2019-01-31T15:51:50.215998Z'
194 | }
195 | )
196 |
197 | def test_messages_with_changing_schema(self):
198 | """Test a bunch of singer messages where a column in schema message
199 | changes its type"""
200 | tap_lines = self.get_tap_input_messages('streams_with_changing_schema.json')
201 |
202 | # Set transformations on some columns
203 | trans_config = {'transformations': [
204 | {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'MASK-NUMBER'},
205 | ]}
206 |
207 | transform_field = TransformField(trans_config)
208 |
209 | with self.assertRaises(InvalidTransformationException):
210 | transform_field.consume(tap_lines)
211 |
212 | def test_validate_flag_with_invalid_transformations(self):
213 | config = '{}/resources/invalid_config.json'.format(os.path.dirname(__file__))
214 | catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__))
215 |
216 | result = subprocess.run([
217 | 'transform-field',
218 | '--validate',
219 | '--config', config,
220 | '--catalog', catalog,
221 | ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
222 |
223 | with self.assertRaises(subprocess.CalledProcessError):
224 | result.check_returncode()
225 |
226 | def test_validate_flag_with_valid_transformations(self):
227 |
228 | config = '{}/resources/valid_config.json'.format(os.path.dirname(__file__))
229 | catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__))
230 |
231 | result = subprocess.run([
232 | 'transform-field',
233 | '--validate',
234 | '--config', config,
235 | '--catalog', catalog,
236 | ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
237 |
238 | self.assertIsNone(result.check_returncode())
239 |
240 | def test_multiple_singer_json_messages_with_transformation_on_json(self):
241 |         """Test a bunch of singer messages with transformations on JSON fields"""
242 | tap_lines = self.get_tap_input_messages('streams_with_object.json')
243 |
244 | # Set transformations on some columns
245 | trans_config = {'transformations': [
246 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_1', 'type': 'SET-NULL'},
247 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_2', 'type': 'MASK-HIDDEN'},
248 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_3', 'type': 'MASK-DATE',
249 | 'when': [
250 | {'column': 'c_pk', 'equals': 2},
251 | {'column': 'column_6', 'field_path': 'key1', 'equals': 'B'}
252 | ]
253 | },
254 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_4', 'type': 'MASK-NUMBER',
255 | 'when': [
256 | {'column': 'column_4', 'equals': -44},
257 | ]
258 | },
259 | {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_6', 'type': 'SET-NULL',
260 | 'field_paths': ['key2/key2_2']},
261 | ]}
262 |
263 | transform_field = TransformField(trans_config)
264 | transform_field.consume(tap_lines)
265 |
266 | records = [msg['record'] for msg in self.singer_output_to_objects(self.stdout) if msg['type'] == 'RECORD']
267 |
268 | self.assertListEqual(records, [
269 | {
270 | 'c_pk': 1,
271 | 'column_1': None,
272 | 'column_2': 'hidden',
273 | 'column_3': '2019-12-21T12:12:45',
274 | 'column_4': 1234,
275 | 'column_5': '2021-12-21T12:12:45',
276 | 'column_6': {'id': 50, 'key1': 'A', 'key2': {'key2_2': None}},
277 | },
278 | {
279 | 'c_pk': 2,
280 | 'column_1': None,
281 | 'column_2': 'hidden',
282 | 'column_3': '2019-01-01T13:12:45',
283 | 'column_4': 4,
284 | 'column_5': '2021-12-21T13:12:45',
285 | 'column_6': {'id': 51, 'key1': 'B', 'key2': {'key2_1': 'ds'}},
286 | },
287 | {
288 | 'c_pk': 3,
289 | 'column_1': None,
290 | 'column_2': 'hidden',
291 | 'column_3': '2019-12-21T14:12:45',
292 | 'column_4': 15,
293 | 'column_5': '2021-12-21T14:12:45',
294 | 'column_6': {'id': 52, 'key1': 'C', 'key2': {'key2_1': 'xv43dgf', 'key2_2': None}},
295 | },
296 | {
297 | 'c_pk': 4,
298 | 'column_1': None,
299 | 'column_2': 'hidden',
300 | 'column_3': '2019-12-21T15:12:45',
301 | 'column_4': 1000,
302 | 'column_5': '2021-12-21T15:12:45',
303 | 'column_6': {'id': 53, 'key1': 'D', 'key2': {'key2_1': '43xvf', 'key2_2': None}},
304 | },
305 | {
306 | 'c_pk': 5,
307 | 'column_1': None,
308 | 'column_2': 'hidden',
309 | 'column_3': '2019-12-21T16:12:45',
310 | 'column_4': 0,
311 | 'column_5': '2021-12-21T16:12:45',
312 | 'column_6': {'id': 54, 'key1': 'E', 'key2': {'key2_1': 'trter', 'key2_3': False}},
313 | },
314 | ])
315 |
--------------------------------------------------------------------------------
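The JSON-column test above relies on '/'-separated field paths; a condensed sketch of the same behaviour through do_transform, with the record value and expected result taken from streams_with_object.json and the test's assertions:

```python
from transform_field import transform

record = {'column_6': {'id': 50, 'key1': 'A', 'key2': {'key2_2': 41}}}

# 'field_paths' addresses nested keys inside a JSON column with '/':
# 'key2/key2_2' targets record['column_6']['key2']['key2_2'] only.
masked = transform.do_transform(record, 'column_6', 'SET-NULL', None, ['key2/key2_2'])
assert masked == {'id': 50, 'key1': 'A', 'key2': {'key2_2': None}}
```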
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/unit/__init__.py
--------------------------------------------------------------------------------
/tests/unit/test_init.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch
3 |
4 | from singer import Catalog, Schema
5 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, NoStreamSchemaException, \
6 | UnsupportedTransformationTypeException, InvalidTransformationException
7 |
8 | from transform_field import TransformField, TransMeta
9 |
10 |
11 | class TestTransformField(unittest.TestCase):
12 | """
13 | Unit Tests for the TransformField class
14 | """
15 |
16 | def setUp(self) -> None:
17 | self.config = {
18 | 'transformations': [
19 | {
20 | "tap_stream_name": "stream_1",
21 | "field_id": "column_1",
22 | "type": "SET-NULL"
23 | },
24 | {
25 | "tap_stream_name": "stream_1",
26 | "field_id": "column_2",
27 | "type": "HASH",
28 | "when": []
29 | },
30 | {
31 | "tap_stream_name": "stream_2",
32 | "field_id": "column_1",
33 | "type": "MASK-DATE"
34 | },
35 | ]
36 | }
37 |
38 | def test_init(self):
39 | instance = TransformField(self.config)
40 |
41 | self.assertListEqual(instance.messages, [])
42 | self.assertEqual(instance.buffer_size_bytes, 0)
43 | self.assertIsNone(instance.state)
44 | self.assertIsNotNone(instance.time_last_batch_sent)
45 | self.assertDictEqual(instance.trans_config, self.config)
46 | self.assertDictEqual(instance.stream_meta, {})
47 | self.assertDictEqual(instance.trans_meta, {
48 | 'stream_1': [
49 | TransMeta('column_1', 'SET-NULL', None, None),
50 | TransMeta('column_2', 'HASH', [], None),
51 | ],
52 | 'stream_2': [TransMeta('column_1', 'MASK-DATE', None, None)],
53 | })
54 |
55 | def test_validate_without_catalog_fails(self):
56 | with self.assertRaises(CatalogRequiredException):
57 | TransformField(self.config).validate(None)
58 |
59 | @patch('transform_field.utils.get_stream_schemas')
60 | def test_validate_with_missing_stream_fails(self, get_stream_schemas_mock):
61 | catalog = Catalog.from_dict({'streams': []})
62 |
63 | get_stream_schemas_mock.return_value = {
64 | 'stream_2': {'something'}
65 | }
66 | with self.assertRaises(StreamNotFoundException):
67 | TransformField(self.config).validate(catalog)
68 |
69 | @patch('transform_field.utils.get_stream_schemas')
70 | def test_validate_with_empty_stream_schema_fails(self, get_stream_schemas_mock):
71 | catalog = Catalog.from_dict({'streams': []})
72 |
73 | get_stream_schemas_mock.return_value = {
74 | 'stream_1': {},
75 | 'stream_2': {'something'}
76 | }
77 | with self.assertRaises(NoStreamSchemaException):
78 | TransformField(self.config).validate(catalog)
79 |
80 | @patch('transform_field.utils.get_stream_schemas')
81 | def test_validate_with_unsupported_trans_type(self, get_stream_schemas_mock):
82 | config = {
83 | 'transformations': [
84 | {
85 | "tap_stream_name": "stream_1",
86 | "field_id": "column_1",
87 | "type": "SET-RANDOM"
88 | },
89 | ]
90 | }
91 |
92 | catalog = Catalog.from_dict({'streams': []})
93 |
94 | get_stream_schemas_mock.return_value = {
95 | 'stream_1': Schema.from_dict({'properties': {
96 | 'column_1': {
97 | 'type': [
98 | 'string'
99 | ]
100 | }
101 | }})
102 | }
103 | with self.assertRaises(UnsupportedTransformationTypeException):
104 | TransformField(config).validate(catalog)
105 |
106 | @patch('transform_field.utils.get_stream_schemas')
107 | def test_validate_with_set_null_trans_type_success(self, get_stream_schemas_mock):
108 | config = {
109 | 'transformations': [
110 | {
111 | "tap_stream_name": "stream_1",
112 | "field_id": "column_1",
113 | "type": "SET-NULL"
114 | },
115 | ]
116 | }
117 |
118 | catalog = Catalog.from_dict({'streams': []})
119 |
120 | get_stream_schemas_mock.return_value = {
121 | 'stream_1': Schema.from_dict({'properties': {
122 | 'column_1': {
123 | 'type': [
124 | 'string'
125 | ]
126 | }
127 | }})
128 | }
129 | TransformField(config).validate(catalog)
130 |
131 | @patch('transform_field.utils.get_stream_schemas')
132 | def test_validate_with_hash_fails_1(self, get_stream_schemas_mock):
133 | """
134 | Testing validation of HASH transformation when field has no type
135 | """
136 | config = {
137 | 'transformations': [
138 | {
139 | "tap_stream_name": "stream_1",
140 | "field_id": "column_1",
141 | "type": "HASH"
142 | },
143 | ]
144 | }
145 |
146 | catalog = Catalog.from_dict({'streams': []})
147 |
148 | get_stream_schemas_mock.return_value = {
149 | 'stream_1': Schema.from_dict({'properties': {
150 | 'column_1': {}
151 | }})
152 | }
153 | with self.assertRaises(InvalidTransformationException):
154 | TransformField(config).validate(catalog)
155 |
156 | @patch('transform_field.utils.get_stream_schemas')
157 | def test_validate_with_hash_fails_2(self, get_stream_schemas_mock):
158 | """
159 | Testing validation of HASH transformation when field has non-string type
160 | """
161 | config = {
162 | 'transformations': [
163 | {
164 | "tap_stream_name": "stream_1",
165 | "field_id": "column_1",
166 | "type": "HASH"
167 | },
168 | ]
169 | }
170 |
171 | catalog = Catalog.from_dict({'streams': []})
172 |
173 | get_stream_schemas_mock.return_value = {
174 | 'stream_1': Schema.from_dict({'properties': {
175 | 'column_1': {
176 | 'type': [
177 | 'null',
178 | 'integer'
179 | ]
180 | }
181 | }})
182 | }
183 | with self.assertRaises(InvalidTransformationException):
184 | TransformField(config).validate(catalog)
185 |
186 | @patch('transform_field.utils.get_stream_schemas')
187 | def test_validate_with_hash_fails_3(self, get_stream_schemas_mock):
188 | """
189 |         Testing validation of HASH transformation when field has string type but with a format
190 | """
191 | config = {
192 | 'transformations': [
193 | {
194 | "tap_stream_name": "stream_1",
195 | "field_id": "column_1",
196 | "type": "HASH"
197 | },
198 | ]
199 | }
200 |
201 | catalog = Catalog.from_dict({'streams': []})
202 |
203 | get_stream_schemas_mock.return_value = {
204 | 'stream_1': Schema.from_dict({'properties': {
205 | 'column_1': {
206 | 'type': [
207 | 'null',
208 | 'string'
209 | ],
210 | 'format': 'binary'
211 | }
212 | }})
213 | }
214 | with self.assertRaises(InvalidTransformationException):
215 | TransformField(config).validate(catalog)
216 |
217 | @patch('transform_field.utils.get_stream_schemas')
218 | def test_validate_with_hash_success(self, get_stream_schemas_mock):
219 | """
220 | Testing validation of HASH transformation when field has string type but no format
221 | """
222 | config = {
223 | 'transformations': [
224 | {
225 | "tap_stream_name": "stream_1",
226 | "field_id": "column_1",
227 | "type": "HASH"
228 | },
229 | ]
230 | }
231 |
232 | catalog = Catalog.from_dict({'streams': []})
233 |
234 | get_stream_schemas_mock.return_value = {
235 | 'stream_1': Schema.from_dict({'properties': {
236 | 'column_1': {
237 | 'type': [
238 | 'null',
239 | 'string'
240 | ]
241 | }
242 | }})
243 | }
244 | TransformField(config).validate(catalog)
245 |
246 | @patch('transform_field.utils.get_stream_schemas')
247 | def test_validate_with_hash_skip_first_fails_1(self, get_stream_schemas_mock):
248 | """
249 | Testing validation of HASH-SKIP-FIRST transformation when field has no type
250 | """
251 | config = {
252 | 'transformations': [
253 | {
254 | "tap_stream_name": "stream_1",
255 | "field_id": "column_1",
256 | "type": "HASH-SKIP-FIRST-1"
257 | },
258 | ]
259 | }
260 |
261 | catalog = Catalog.from_dict({'streams': []})
262 |
263 | get_stream_schemas_mock.return_value = {
264 | 'stream_1': Schema.from_dict({'properties': {
265 | 'column_1': {}
266 | }})
267 | }
268 | with self.assertRaises(InvalidTransformationException):
269 | TransformField(config).validate(catalog)
270 |
271 | @patch('transform_field.utils.get_stream_schemas')
272 | def test_validate_with_hash_skip_first_fails_2(self, get_stream_schemas_mock):
273 | """
274 | Testing validation of HASH-SKIP-FIRST transformation when field has non-string type
275 | """
276 | config = {
277 | 'transformations': [
278 | {
279 | "tap_stream_name": "stream_1",
280 | "field_id": "column_1",
281 | "type": "HASH-SKIP-FIRST-1"
282 | },
283 | ]
284 | }
285 |
286 | catalog = Catalog.from_dict({'streams': []})
287 |
288 | get_stream_schemas_mock.return_value = {
289 | 'stream_1': Schema.from_dict({'properties': {
290 | 'column_1': {
291 | 'type': [
292 | 'null',
293 | 'integer'
294 | ]
295 | }
296 | }})
297 | }
298 | with self.assertRaises(InvalidTransformationException):
299 | TransformField(config).validate(catalog)
300 |
301 | @patch('transform_field.utils.get_stream_schemas')
302 | def test_validate_with_hash_skip_first_fails_3(self, get_stream_schemas_mock):
303 | """
304 |         Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type but with a format
305 | """
306 | config = {
307 | 'transformations': [
308 | {
309 | "tap_stream_name": "stream_1",
310 | "field_id": "column_1",
311 | "type": "HASH-SKIP-FIRST-1"
312 | },
313 | ]
314 | }
315 |
316 | catalog = Catalog.from_dict({'streams': []})
317 |
318 | get_stream_schemas_mock.return_value = {
319 | 'stream_1': Schema.from_dict({'properties': {
320 | 'column_1': {
321 | 'type': [
322 | 'null',
323 | 'string'
324 | ],
325 | 'format': 'binary'
326 | }
327 | }})
328 | }
329 | with self.assertRaises(InvalidTransformationException):
330 | TransformField(config).validate(catalog)
331 |
332 | @patch('transform_field.utils.get_stream_schemas')
333 | def test_validate_with_hash_skip_first_success(self, get_stream_schemas_mock):
334 | """
335 |         Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type and no format
336 | """
337 | config = {
338 | 'transformations': [
339 | {
340 | "tap_stream_name": "stream_1",
341 | "field_id": "column_1",
342 | "type": "HASH-SKIP-FIRST-1"
343 | },
344 | ]
345 | }
346 |
347 | catalog = Catalog.from_dict({'streams': []})
348 |
349 | get_stream_schemas_mock.return_value = {
350 | 'stream_1': Schema.from_dict({'properties': {
351 | 'column_1': {
352 | 'type': [
353 | 'null',
354 | 'string'
355 | ]
356 | }
357 | }})
358 | }
359 | TransformField(config).validate(catalog)
360 |
361 | @patch('transform_field.utils.get_stream_schemas')
362 | def test_validate_with_mask_hidden_fails_1(self, get_stream_schemas_mock):
363 | """
364 | Testing validation of MASK-HIDDEN transformation when field has no type
365 | """
366 | config = {
367 | 'transformations': [
368 | {
369 | "tap_stream_name": "stream_1",
370 | "field_id": "column_1",
371 | "type": "MASK-HIDDEN"
372 | },
373 | ]
374 | }
375 |
376 | catalog = Catalog.from_dict({'streams': []})
377 |
378 | get_stream_schemas_mock.return_value = {
379 | 'stream_1': Schema.from_dict({'properties': {
380 | 'column_1': {}
381 | }})
382 | }
383 | with self.assertRaises(InvalidTransformationException):
384 | TransformField(config).validate(catalog)
385 |
386 | @patch('transform_field.utils.get_stream_schemas')
387 | def test_validate_with_mask_hidden_fails_2(self, get_stream_schemas_mock):
388 | """
389 | Testing validation of MASK-HIDDEN transformation when field has non-string type
390 | """
391 | config = {
392 | 'transformations': [
393 | {
394 | "tap_stream_name": "stream_1",
395 | "field_id": "column_1",
396 | "type": "MASK-HIDDEN"
397 | },
398 | ]
399 | }
400 |
401 | catalog = Catalog.from_dict({'streams': []})
402 |
403 | get_stream_schemas_mock.return_value = {
404 | 'stream_1': Schema.from_dict({'properties': {
405 | 'column_1': {
406 | 'type': [
407 | 'null',
408 | 'integer'
409 | ]
410 | }
411 | }})
412 | }
413 | with self.assertRaises(InvalidTransformationException):
414 | TransformField(config).validate(catalog)
415 |
416 | @patch('transform_field.utils.get_stream_schemas')
417 | def test_validate_with_mask_hidden_fails_3(self, get_stream_schemas_mock):
418 | """
419 |         Testing validation of MASK-HIDDEN transformation when field has string type but with a format
420 | """
421 | config = {
422 | 'transformations': [
423 | {
424 | "tap_stream_name": "stream_1",
425 | "field_id": "column_1",
426 | "type": "MASK-HIDDEN"
427 | },
428 | ]
429 | }
430 |
431 | catalog = Catalog.from_dict({'streams': []})
432 |
433 | get_stream_schemas_mock.return_value = {
434 | 'stream_1': Schema.from_dict({'properties': {
435 | 'column_1': {
436 | 'type': [
437 | 'null',
438 | 'string'
439 | ],
440 | 'format': 'binary'
441 | }
442 | }})
443 | }
444 | with self.assertRaises(InvalidTransformationException):
445 | TransformField(config).validate(catalog)
446 |
447 | @patch('transform_field.utils.get_stream_schemas')
448 | def test_validate_with_mask_hidden_success(self, get_stream_schemas_mock):
449 | """
450 |         Testing validation of MASK-HIDDEN transformation when field has string type and no format
451 | """
452 | config = {
453 | 'transformations': [
454 | {
455 | "tap_stream_name": "stream_1",
456 | "field_id": "column_1",
457 | "type": "MASK-HIDDEN"
458 | },
459 | ]
460 | }
461 |
462 | catalog = Catalog.from_dict({'streams': []})
463 |
464 | get_stream_schemas_mock.return_value = {
465 | 'stream_1': Schema.from_dict({'properties': {
466 | 'column_1': {
467 | 'type': [
468 | 'null',
469 | 'string'
470 | ]
471 | }
472 | }})
473 | }
474 | TransformField(config).validate(catalog)
475 |
476 | @patch('transform_field.utils.get_stream_schemas')
477 | def test_validate_with_mask_date_fails_1(self, get_stream_schemas_mock):
478 | """
479 | Testing validation of MASK-DATE transformation when field has no type
480 | """
481 | config = {
482 | 'transformations': [
483 | {
484 | "tap_stream_name": "stream_1",
485 | "field_id": "column_1",
486 | "type": "MASK-DATE"
487 | },
488 | ]
489 | }
490 |
491 | catalog = Catalog.from_dict({'streams': []})
492 |
493 | get_stream_schemas_mock.return_value = {
494 | 'stream_1': Schema.from_dict({'properties': {
495 | 'column_1': {}
496 | }})
497 | }
498 | with self.assertRaises(InvalidTransformationException):
499 | TransformField(config).validate(catalog)
500 |
501 | @patch('transform_field.utils.get_stream_schemas')
502 | def test_validate_with_mask_date_fails_2(self, get_stream_schemas_mock):
503 | """
504 | Testing validation of MASK-DATE transformation when field has string type but no format
505 | """
506 | config = {
507 | 'transformations': [
508 | {
509 | "tap_stream_name": "stream_1",
510 | "field_id": "column_1",
511 | "type": "MASK-DATE"
512 | },
513 | ]
514 | }
515 |
516 | catalog = Catalog.from_dict({'streams': []})
517 |
518 | get_stream_schemas_mock.return_value = {
519 | 'stream_1': Schema.from_dict({'properties': {
520 | 'column_1': {
521 | 'type': [
522 | 'null',
523 | 'string'
524 | ]
525 | }
526 | }})
527 | }
528 | with self.assertRaises(InvalidTransformationException):
529 | TransformField(config).validate(catalog)
530 |
531 | @patch('transform_field.utils.get_stream_schemas')
532 | def test_validate_with_mask_date_fails_3(self, get_stream_schemas_mock):
533 | """
534 | Testing validation of MASK-DATE transformation when field has non-string type
535 | """
536 | config = {
537 | 'transformations': [
538 | {
539 | "tap_stream_name": "stream_1",
540 | "field_id": "column_1",
541 | "type": "MASK-DATE"
542 | },
543 | ]
544 | }
545 |
546 | catalog = Catalog.from_dict({'streams': []})
547 |
548 | get_stream_schemas_mock.return_value = {
549 | 'stream_1': Schema.from_dict({'properties': {
550 | 'column_1': {
551 | 'type': [
552 | 'null',
553 | 'integer'
554 | ]
555 | }
556 | }})
557 | }
558 | with self.assertRaises(InvalidTransformationException):
559 | TransformField(config).validate(catalog)
560 |
561 | @patch('transform_field.utils.get_stream_schemas')
562 | def test_validate_with_mask_date_fails_4(self, get_stream_schemas_mock):
563 | """
564 |         Testing validation of MASK-DATE transformation when field has string type but a non-date format
565 | """
566 | config = {
567 | 'transformations': [
568 | {
569 | "tap_stream_name": "stream_1",
570 | "field_id": "column_1",
571 | "type": "MASK-DATE"
572 | },
573 | ]
574 | }
575 |
576 | catalog = Catalog.from_dict({'streams': []})
577 |
578 | get_stream_schemas_mock.return_value = {
579 | 'stream_1': Schema.from_dict({'properties': {
580 | 'column_1': {
581 | 'type': [
582 | 'null',
583 | 'string'
584 | ],
585 | 'format': 'binary'
586 | }
587 | }})
588 | }
589 | with self.assertRaises(InvalidTransformationException):
590 | TransformField(config).validate(catalog)
591 |
592 | @patch('transform_field.utils.get_stream_schemas')
593 | def test_validate_with_mask_date_success_1(self, get_stream_schemas_mock):
594 | """
595 |         Testing validation of MASK-DATE transformation when field has string type and a date format
596 | """
597 | config = {
598 | 'transformations': [
599 | {
600 | "tap_stream_name": "stream_1",
601 | "field_id": "column_1",
602 | "type": "MASK-DATE"
603 | },
604 | ]
605 | }
606 |
607 | catalog = Catalog.from_dict({'streams': []})
608 |
609 | get_stream_schemas_mock.return_value = {
610 | 'stream_1': Schema.from_dict({'properties': {
611 | 'column_1': {
612 | 'type': [
613 | 'null',
614 | 'string'
615 | ],
616 | 'format': 'date'
617 | }
618 | }})
619 | }
620 | TransformField(config).validate(catalog)
621 |
622 | @patch('transform_field.utils.get_stream_schemas')
623 | def test_validate_with_mask_date_success_2(self, get_stream_schemas_mock):
624 | """
625 |         Testing validation of MASK-DATE transformation when field has string type and a date-time format
626 | """
627 | config = {
628 | 'transformations': [
629 | {
630 | "tap_stream_name": "stream_1",
631 | "field_id": "column_1",
632 | "type": "MASK-DATE"
633 | },
634 | ]
635 | }
636 |
637 | catalog = Catalog.from_dict({'streams': []})
638 |
639 | get_stream_schemas_mock.return_value = {
640 | 'stream_1': Schema.from_dict({'properties': {
641 | 'column_1': {
642 | 'type': [
643 | 'null',
644 | 'string'
645 | ],
646 | 'format': 'date-time'
647 | }
648 | }})
649 | }
650 | TransformField(config).validate(catalog)
651 |
652 | @patch('transform_field.utils.get_stream_schemas')
653 | def test_validate_with_mask_number_fails_1(self, get_stream_schemas_mock):
654 | """
655 | Testing validation of MASK-NUMBER transformation when field has no type
656 | """
657 | config = {
658 | 'transformations': [
659 | {
660 | "tap_stream_name": "stream_1",
661 | "field_id": "column_1",
662 | "type": "MASK-NUMBER"
663 | },
664 | ]
665 | }
666 |
667 | catalog = Catalog.from_dict({'streams': []})
668 |
669 | get_stream_schemas_mock.return_value = {
670 | 'stream_1': Schema.from_dict({'properties': {
671 | 'column_1': {}
672 | }})
673 | }
674 | with self.assertRaises(InvalidTransformationException):
675 | TransformField(config).validate(catalog)
676 |
677 | @patch('transform_field.utils.get_stream_schemas')
678 | def test_validate_with_mask_number_fails_2(self, get_stream_schemas_mock):
679 | """
680 |         Testing validation of MASK-NUMBER transformation when field has neither integer nor number type
681 | """
682 | config = {
683 | 'transformations': [
684 | {
685 | "tap_stream_name": "stream_1",
686 | "field_id": "column_1",
687 | "type": "MASK-NUMBER"
688 | },
689 | ]
690 | }
691 |
692 | catalog = Catalog.from_dict({'streams': []})
693 |
694 | get_stream_schemas_mock.return_value = {
695 | 'stream_1': Schema.from_dict({'properties': {
696 | 'column_1': {
697 | 'type': [
698 | 'null',
699 | 'string'
700 | ]
701 | }
702 | }})
703 | }
704 | with self.assertRaises(InvalidTransformationException):
705 | TransformField(config).validate(catalog)
706 |
707 | @patch('transform_field.utils.get_stream_schemas')
708 | def test_validate_with_mask_number_fails_3(self, get_stream_schemas_mock):
709 | """
710 |         Testing validation of MASK-NUMBER transformation when field has integer type but with a format
711 | """
712 | config = {
713 | 'transformations': [
714 | {
715 | "tap_stream_name": "stream_1",
716 | "field_id": "column_1",
717 | "type": "MASK-NUMBER"
718 | },
719 | ]
720 | }
721 |
722 | catalog = Catalog.from_dict({'streams': []})
723 |
724 | get_stream_schemas_mock.return_value = {
725 | 'stream_1': Schema.from_dict({'properties': {
726 | 'column_1': {
727 | 'type': [
728 | 'null',
729 | 'integer'
730 | ],
731 | 'format': 'something random'
732 | }
733 | }})
734 | }
735 | with self.assertRaises(InvalidTransformationException):
736 | TransformField(config).validate(catalog)
737 |
738 | @patch('transform_field.utils.get_stream_schemas')
739 | def test_validate_with_mask_number_fails_4(self, get_stream_schemas_mock):
740 | """
741 |         Testing validation of MASK-NUMBER transformation when field has number type but with a format
742 | """
743 | config = {
744 | 'transformations': [
745 | {
746 | "tap_stream_name": "stream_1",
747 | "field_id": "column_1",
748 |                     "type": "MASK-NUMBER"
749 | },
750 | ]
751 | }
752 |
753 | catalog = Catalog.from_dict({'streams': []})
754 |
755 | get_stream_schemas_mock.return_value = {
756 | 'stream_1': Schema.from_dict({'properties': {
757 | 'column_1': {
758 | 'type': [
759 | 'null',
760 | 'number'
761 | ],
762 | 'format': 'binary'
763 | }
764 | }})
765 | }
766 | with self.assertRaises(InvalidTransformationException):
767 | TransformField(config).validate(catalog)
768 |
769 | @patch('transform_field.utils.get_stream_schemas')
770 | def test_validate_with_mask_number_success_1(self, get_stream_schemas_mock):
771 | """
772 | Testing validation of MASK-NUMBER transformation when field has integer type
773 | """
774 | config = {
775 | 'transformations': [
776 | {
777 | "tap_stream_name": "stream_1",
778 | "field_id": "column_1",
779 | "type": "MASK-NUMBER"
780 | },
781 | ]
782 | }
783 |
784 | catalog = Catalog.from_dict({'streams': []})
785 |
786 | get_stream_schemas_mock.return_value = {
787 | 'stream_1': Schema.from_dict({'properties': {
788 | 'column_1': {
789 | 'type': [
790 | 'null',
791 | 'integer'
792 | ]
793 | }
794 | }})
795 | }
796 | TransformField(config).validate(catalog)
797 |
798 | @patch('transform_field.utils.get_stream_schemas')
799 | def test_validate_with_mask_number_success_2(self, get_stream_schemas_mock):
800 | """
801 | Testing validation of MASK-NUMBER transformation when field has number type
802 | """
803 | config = {
804 | 'transformations': [
805 | {
806 | "tap_stream_name": "stream_1",
807 | "field_id": "column_1",
808 | "type": "MASK-NUMBER"
809 | },
810 | ]
811 | }
812 |
813 | catalog = Catalog.from_dict({'streams': []})
814 |
815 | get_stream_schemas_mock.return_value = {
816 | 'stream_1': Schema.from_dict({'properties': {
817 | 'column_1': {
818 | 'type': [
819 | 'null',
820 | 'number'
821 | ]
822 | }
823 | }})
824 | }
825 | TransformField(config).validate(catalog)
826 |
827 | @patch('transform_field.utils.get_stream_schemas')
828 | def test_validate_with_mask_string_skip_ends_fails_1(self, get_stream_schemas_mock):
829 | """
830 | Testing validation of MASK-STRING-SKIP-ENDS transformation when field has no type
831 | """
832 | config = {
833 | 'transformations': [
834 | {
835 | "tap_stream_name": "stream_1",
836 | "field_id": "column_1",
837 | "type": "MASK-STRING-SKIP-ENDS-1"
838 | },
839 | ]
840 | }
841 |
842 | catalog = Catalog.from_dict({'streams': []})
843 |
844 | get_stream_schemas_mock.return_value = {
845 | 'stream_1': Schema.from_dict({'properties': {
846 | 'column_1': {}
847 | }})
848 | }
849 | with self.assertRaises(InvalidTransformationException):
850 | TransformField(config).validate(catalog)
851 |
852 | @patch('transform_field.utils.get_stream_schemas')
853 | def test_validate_with_mask_string_skip_ends_fails_2(self, get_stream_schemas_mock):
854 | """
855 | Testing validation of MASK-STRING-SKIP-ENDS transformation when field has non-string type
856 | """
857 | config = {
858 | 'transformations': [
859 | {
860 | "tap_stream_name": "stream_1",
861 | "field_id": "column_1",
862 | "type": "MASK-STRING-SKIP-ENDS-1"
863 | },
864 | ]
865 | }
866 |
867 | catalog = Catalog.from_dict({'streams': []})
868 |
869 | get_stream_schemas_mock.return_value = {
870 | 'stream_1': Schema.from_dict({'properties': {
871 | 'column_1': {
872 | 'type': [
873 | 'null',
874 | 'integer'
875 | ]
876 | }
877 | }})
878 | }
879 | with self.assertRaises(InvalidTransformationException):
880 | TransformField(config).validate(catalog)
881 |
882 | @patch('transform_field.utils.get_stream_schemas')
883 | def test_validate_with_mask_string_skip_ends_fails_3(self, get_stream_schemas_mock):
884 | """
885 |         Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type but with a format
886 | """
887 | config = {
888 | 'transformations': [
889 | {
890 | "tap_stream_name": "stream_1",
891 | "field_id": "column_1",
892 | "type": "MASK-STRING-SKIP-ENDS-1"
893 | },
894 | ]
895 | }
896 |
897 | catalog = Catalog.from_dict({'streams': []})
898 |
899 | get_stream_schemas_mock.return_value = {
900 | 'stream_1': Schema.from_dict({'properties': {
901 | 'column_1': {
902 | 'type': [
903 | 'null',
904 | 'string'
905 | ],
906 | 'format': 'binary'
907 | }
908 | }})
909 | }
910 | with self.assertRaises(InvalidTransformationException):
911 | TransformField(config).validate(catalog)
912 |
913 | @patch('transform_field.utils.get_stream_schemas')
914 | def test_validate_with_mask_string_skip_ends_success(self, get_stream_schemas_mock):
915 | """
916 |         Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type and no format
917 | """
918 | config = {
919 | 'transformations': [
920 | {
921 | "tap_stream_name": "stream_1",
922 | "field_id": "column_1",
923 | "type": "MASK-STRING-SKIP-ENDS-1"
924 | },
925 | ]
926 | }
927 |
928 | catalog = Catalog.from_dict({'streams': []})
929 |
930 | get_stream_schemas_mock.return_value = {
931 | 'stream_1': Schema.from_dict({'properties': {
932 | 'column_1': {
933 | 'type': [
934 | 'null',
935 | 'string'
936 | ]
937 | }
938 | }})
939 | }
940 | TransformField(config).validate(catalog)
941 |
--------------------------------------------------------------------------------
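The validation rules exercised above boil down to: each transformation type constrains the schema of its target column. A hedged sketch of the failure path with a real catalog; the catalog shape follows test_utils.py, and it is assumed here that the real get_stream_schemas honours the 'selected' metadata as that test shows:

```python
from singer import Catalog
from transform_field import TransformField
from transform_field.errors import InvalidTransformationException

# HASH requires an unformatted string column, so hashing an integer
# column should be rejected at validation time.
config = {'transformations': [
    {'tap_stream_name': 'stream_1', 'field_id': 'column_1', 'type': 'HASH'},
]}
catalog = Catalog.from_dict({'streams': [
    {'tap_stream_id': 'stream_1',
     'schema': {'properties': {'column_1': {'type': ['null', 'integer']}}},
     'metadata': [{'breadcrumb': [], 'metadata': {'selected': True}}]},
]})

try:
    TransformField(config).validate(catalog)
except InvalidTransformationException:
    print('HASH on an integer column is invalid')
```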
/tests/unit/test_transform.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import hashlib
3 |
4 | from transform_field import transform
5 |
6 |
7 | class TestTransform(unittest.TestCase):
8 | """
9 | Unit Tests for the transform module
10 | """
11 |
12 | def setUp(self) -> None:
13 | self.config = {}
14 |
15 | def test_set_null(self):
16 | """TEST SET-NULL transformation"""
17 | self.assertEqual(
18 | transform.do_transform({"col_1": "John"}, "col_1", "SET-NULL"),
19 | None
20 | )
21 |
22 | def test_hash(self):
23 | """Test HASH transformation"""
24 | self.assertEqual(
25 | transform.do_transform({"col_1": "John"}, "col_1", "HASH"),
26 | hashlib.sha256("John".encode('utf-8')).hexdigest()
27 | )
28 |
29 | def test_mask_date(self):
30 | """Test MASK-DATE transformation"""
31 | self.assertEqual(
32 | transform.do_transform({"col_1": "2019-05-21"}, "col_1", "MASK-DATE"),
33 | "2019-01-01T00:00:00"
34 | )
35 |
36 | # Mask date should keep the time elements
37 | self.assertEqual(
38 | transform.do_transform({"col_1": "2019-05-21T13:34:11"}, "col_1", "MASK-DATE"),
39 | "2019-01-01T13:34:11"
40 | )
41 |
42 |         # Invalid dates should be returned unchanged
43 | self.assertEqual(
44 | transform.do_transform({"col_1": "2019-05-21T13:34:99"}, "col_1", "MASK-DATE"),
45 | "2019-05-21T13:34:99"
46 | )
47 |
48 | def test_mask_number(self):
49 | """Test MASK-NUMBER transformation"""
50 | self.assertEqual(
51 | transform.do_transform({"col_1": "1234567890"}, "col_1", "MASK-NUMBER"),
52 | 0
53 | )
54 |
55 | def test_mask_hidden(self):
56 | """Test MASK-HIDDEN transformation"""
57 | self.assertEqual(
58 | transform.do_transform({"col_1": "abakadabra123"}, "col_1", "MASK-HIDDEN"),
59 | 'hidden'
60 | )
61 |
62 | def test_mask_string_skip_ends_case1(self):
63 | """Test MASK-STRING-SKIP-ENDS transformation with n=3"""
64 | self.assertEqual(
65 | transform.do_transform({"col_1": "do!maskme!"}, "col_1", "MASK-STRING-SKIP-ENDS-3"),
66 | 'do!****me!'
67 | )
68 |
69 | def test_mask_string_skip_ends_case2(self):
70 | """Test MASK-STRING-SKIP-ENDS transformation with n=2"""
71 | self.assertEqual(
72 | transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-2"),
73 | 'no**sk'
74 | )
75 |
76 | def test_mask_string_skip_ends_case3(self):
77 |         """Test MASK-STRING-SKIP-ENDS transformation where string length equals 2 * mask_length"""
78 | self.assertEqual(
79 | transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-3"),
80 | '******'
81 | )
82 |
83 | def test_mask_string_skip_ends_case4(self):
84 |         """Test MASK-STRING-SKIP-ENDS transformation where string length is less than 2 * mask_length"""
85 | self.assertEqual(
86 | transform.do_transform({"col_1": "shortmask"}, "col_1", "MASK-STRING-SKIP-ENDS-5"),
87 | '*********'
88 | )
89 |
90 | def test_unknown_transformation_type(self):
91 | """Test not existing transformation type"""
92 | # Should return the original value
93 | self.assertEqual(
94 | transform.do_transform({"col_1": "John"}, "col_1", "NOT-EXISTING-TRANSFORMATION-TYPE"),
95 | "John"
96 | )
97 |
98 | def test_conditions(self):
99 | """Test conditional transformations"""
100 |
101 | # Matching condition: Should transform to NULL
102 | self.assertEqual(
103 | transform.do_transform(
104 | # Record:
105 | {"col_1": "random value", "col_2": "passwordHash", "col_3": "lkj"},
106 | # Column to transform:
107 | "col_3",
108 | # Transform method:
109 | "SET-NULL",
110 | # Conditions when to transform:
111 | [
112 | {'column': 'col_1', 'equals': "random value"},
113 | {'column': 'col_2', 'equals': "passwordHash"},
114 | ]
115 | ),
116 |
117 | # Expected output:
118 | None
119 | )
120 |
121 | # Not matching condition: Should keep the original value
122 | self.assertEqual(
123 | transform.do_transform(
124 | # Record:
125 | {"col_1": "random value", "col_2": "id", "col_3": "123456789"},
126 | # Column to transform:
127 | "col_3",
128 | # Transform method:
129 | "SET-NULL",
130 | # Conditions when to transform:
131 | [
132 | {'column': 'col_1', 'equals': "random value"},
133 | {'column': 'col_2', 'equals': "passwordHash"},
134 | ]
135 | ),
136 |
137 | # Expected output:
138 | "123456789"
139 | )
140 |
141 | def test_transform_field_in_json_col(self):
142 | """Test transformation of a field in a json column with no conditions"""
143 |
144 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}}
145 |
146 | return_value = transform.do_transform(
147 | # Record:
148 | {
149 | "col_1": "random value",
150 | "col_2": "passwordHash",
151 | "col_3": "lkj",
152 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}}
153 | },
154 | # Column to transform:
155 | "col_4",
156 | # Transform method:
157 | "MASK-HIDDEN",
158 | # Conditions when to transform:
159 | None,
160 | ['info/last_name']
161 | )
162 |
163 | self.assertDictEqual(expected_value, return_value)
164 |
165 | def test_transform_field_in_json_col_with_conditions(self):
166 | """Test transformation of a field in a json column with conditions"""
167 |
168 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}}
169 |
170 | return_value = transform.do_transform(
171 | # Record:
172 | {
173 | "col_1": "random value",
174 | "col_2": "passwordHash",
175 | "col_3": "lkj",
176 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}}
177 | },
178 | # Column to transform:
179 | "col_4",
180 | # Transform method:
181 | "MASK-HIDDEN",
182 | # Conditions when to transform:
183 | [
184 | {'column': 'col_2', 'equals': "passwordHash"},
185 | ],
186 | ['info/last_name']
187 | )
188 |
189 | self.assertDictEqual(expected_value, return_value)
190 |
191 | def test_transform_fields_in_json_col(self):
192 | """Test transformation of multiple fields in a json column with no conditions"""
193 |
194 | expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'hidden', 'age': 25}}
195 |
196 | return_value = transform.do_transform(
197 | # Record:
198 | {
199 | "col_1": "random value",
200 | "col_2": "passwordHash",
201 | "col_3": "lkj",
202 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'age': 25}}
203 | },
204 | # Column to transform:
205 | "col_4",
206 | # Transform method:
207 | "MASK-HIDDEN",
208 | # Conditions when to transform:
209 | None,
210 | ['info/last_name', 'info/first_name']
211 | )
212 |
213 | self.assertDictEqual(expected_value, return_value)
214 |
215 | def test_transform_col_with_condition_on_json_field(self):
216 | """Test transformation of a column with condition on a field in a json"""
217 |
218 | record = {
219 | "col_1": "random value",
220 | "col_2": "passwordHash",
221 | "col_3": "323df43983dfs",
222 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
223 | }
224 |
225 | self.assertEqual(
226 | 'hidden',
227 | transform.do_transform(
228 | # Record:
229 | record,
230 | # Column to transform:
231 | "col_3",
232 | # Transform method:
233 | "MASK-HIDDEN",
234 | # Conditions when to transform:
235 | [
236 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
237 | ]
238 | )
239 | )
240 |
241 | def test_transform_field_in_json_col_with_condition_on_field(self):
242 | """Test transformation of a field in a json column with condition on a field in json, condition will be met"""
243 |
244 | record = {
245 | "col_1": "random value",
246 | "col_2": "passwordHash",
247 | "col_3": "lkj",
248 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
249 | }
250 |
251 | self.assertDictEqual(
252 | {'id': 1, 'info': {'first_name': 'John', 'last_name': None, 'phone': '6573930'}},
253 | transform.do_transform(
254 | # Record:
255 | record,
256 | # Column to transform:
257 | "col_4",
258 | # Transform method:
259 | "SET-NULL",
260 | # Conditions when to transform:
261 | [
262 | {'column': 'col_4', 'field_path': 'info/phone', 'equals': '6573930'},
263 | ],
264 | ['info/last_name']
265 | )
266 | )
267 |
268 | def test_transform_field_in_json_col_with_condition_on_field_2(self):
269 | """Test transformation of a field in a json column with condition on a field in json,
270 | the condition will not be met"""
271 |
272 | record = {
273 | "col_1": "random value",
274 | "col_2": "passwordHash",
275 | "col_3": "lkj",
276 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
277 | }
278 |
279 | # not transformed
280 | self.assertEqual(
281 | {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
282 | transform.do_transform(
283 | # Record:
284 | record,
285 | # Column to transform:
286 | "col_4",
287 | # Transform method:
288 | "SET-NULL",
289 | # Conditions when to transform:
290 | [
291 | {'column': 'col_4', 'field_path': 'info/phone', 'regex_match': '.*6573955.*'},
292 | ],
293 | ['info/last_name']
294 | )
295 | )
296 |
297 | def test_transform_multiple_conditions_all_success(self):
298 | """Test conditional transformation, all the conditions will be met and transformation should happen"""
299 |
300 | record = {
301 | "col_1": "random value",
302 | "col_2": "passwordHash",
303 | "col_3": "lkj",
304 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
305 | 'col_5': '2021-11-30T16:40:07'
306 | }
307 |
308 | self.assertEqual(
309 | '2021-01-01T16:40:07',
310 | transform.do_transform(
311 | # Record:
312 | record,
313 | # Column to transform:
314 | "col_5",
315 | # Transform method:
316 | "MASK-DATE",
317 | # Conditions when to transform:
318 | [
319 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
320 | {'column': 'col_4', 'field_path': 'id', 'equals': 1},
321 | {'column': 'col_3', 'regex_match': '.*lkj.*'},
322 | ]
323 | )
324 | )
325 |
326 | def test_transform_multiple_conditions_one_fails(self):
327 | """Test conditional transformation, one of the conditions will not be met and transformation should not happen"""
328 |
329 | record = {
330 | "col_1": "random value",
331 | "col_2": "passwordHash",
332 | "col_3": "lkj",
333 | 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
334 | 'col_5': '2021-11-30T16:40:07'
335 | }
336 |
337 | # not transformed
338 | self.assertEqual(
339 | '2021-11-30T16:40:07',
340 | transform.do_transform(
341 | # Record:
342 | record,
343 | # Column to transform:
344 | "col_5",
345 | # Transform method:
346 | "MASK-DATE",
347 | # Conditions when to transform:
348 | [
349 | {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
350 | {'column': 'col_4', 'field_path': 'id', 'equals': 2},
351 | {'column': 'col_3', 'regex_match': '.*lkj.*'},
352 | ]
353 | )
354 | )
355 |
356 |
--------------------------------------------------------------------------------
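A compact summary of the do_transform condition semantics tested above: every entry in the 'when' list must match before the transformation fires, with 'equals' for exact values, 'regex_match' for patterns, and an optional 'field_path' to look inside a JSON column. The record and field names below are hypothetical, chosen only for illustration:

```python
from transform_field import transform

record = {'user': 'jdoe', 'role': 'admin', 'secret': 'hunter2'}

# All conditions match -> transformed:
assert transform.do_transform(
    record, 'secret', 'SET-NULL',
    [{'column': 'role', 'equals': 'admin'},
     {'column': 'user', 'regex_match': '^jd.*'}],
) is None

# One condition fails -> original value kept:
assert transform.do_transform(
    record, 'secret', 'SET-NULL',
    [{'column': 'role', 'equals': 'viewer'}],
) == 'hunter2'
```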
/tests/unit/test_utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import unittest
3 |
4 | from unittest.mock import patch
5 | from singer import Catalog
6 |
7 | from transform_field.utils import get_stream_schemas, parse_args
8 |
9 |
10 | class TestUtils(unittest.TestCase):
11 | """
12 | Unit Tests for the utils
13 | """
14 |
15 | def test_get_stream_schemas(self):
16 | catalog = Catalog.from_dict({
17 | 'streams': [
18 | {
19 | 'tap_stream_id': 'stream1',
20 | 'schema': {
21 | 'properties': {
22 | 'col_1': {}
23 | }
24 | },
25 | 'metadata': [
26 | {
27 | 'breadcrumb': [],
28 | 'metadata': {
29 | 'selected': True
30 | }
31 | }
32 | ]
33 | },
34 | {
35 | 'tap_stream_id': 'stream2',
36 | 'schema': {
37 | 'properties': {
38 | 'col_2': {}
39 | }
40 | },
41 | 'metadata': [
42 | {
43 | 'breadcrumb': [],
44 | 'metadata': {
45 | 'selected': True
46 | }
47 | }
48 | ]
49 | },
50 | {
51 | 'tap_stream_id': 'stream3',
52 | 'schema': {
53 | 'properties': {
54 | 'col_3': {}
55 | }
56 | },
57 | 'metadata': [
58 | {
59 | 'breadcrumb': [],
60 | 'metadata': {
61 | 'selected': False
62 | }
63 | }
64 | ]
65 | }
66 | ]
67 | })
68 |
69 | output = get_stream_schemas(catalog)
70 |
71 | self.assertIn('stream1', output)
72 | self.assertIn('stream2', output)
73 | self.assertNotIn('stream3', output)
74 |
75 | self.assertEqual(len(output['stream1'].properties), 1)
76 | self.assertEqual(len(output['stream2'].properties), 1)
77 |
78 | @patch('transform_field.utils.Catalog.load')
79 | @patch('transform_field.utils.check_config')
80 | @patch('transform_field.utils.load_json')
81 | @patch('argparse.ArgumentParser.parse_args')
82 | def test_parse_args(self, parse_args_mock, load_json_mock, check_config_mock, catalog_load_mock):
83 | """
84 | test args parsing
85 | """
86 | check_config_mock.return_value = None
87 | load_json_mock.return_value = {}
88 | catalog_load_mock.return_value = {}
89 |
90 | parse_args_mock.return_value = argparse.Namespace(**{
91 | 'config': './config.json',
92 | 'catalog': './properties.json',
93 | 'validate': False,
94 | })
95 |
96 | args = parse_args({'transformations'})
97 |
98 | load_json_mock.assert_called_once()
99 | catalog_load_mock.assert_called_once()
100 | check_config_mock.assert_called_once()
101 |
102 | self.assertEqual(args.config, {})
103 | self.assertEqual(args.catalog, {})
104 | self.assertEqual(args.validate, False)
105 |
--------------------------------------------------------------------------------
/transform_field/__init__.py:
--------------------------------------------------------------------------------
1 | import io
2 | import sys
3 | import time
4 | import singer
5 |
6 | from typing import Union, Dict
7 | from enum import Enum, unique
8 | from collections import namedtuple
9 | from decimal import Decimal
10 | from jsonschema import FormatChecker, Draft7Validator
11 | from singer import Catalog, Schema
12 |
13 | from transform_field import transform
14 | from transform_field import utils
15 | from transform_field.timings import Timings
16 |
17 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, InvalidTransformationException, \
18 | UnsupportedTransformationTypeException, NoStreamSchemaException
19 |
20 |
21 | LOGGER = singer.get_logger('transform_field')
22 | TIMINGS = Timings(LOGGER)
23 | DEFAULT_MAX_BATCH_BYTES = 4000000
24 | DEFAULT_MAX_BATCH_RECORDS = 20000
25 | DEFAULT_BATCH_DELAY_SECONDS = 300.0
26 | VALIDATE_RECORDS = False
27 |
28 | StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties'])
29 | TransMeta = namedtuple('TransMeta', ['field_id', 'type', 'when', 'field_paths'])
30 |
31 | REQUIRED_CONFIG_KEYS = [
32 | "transformations"
33 | ]
34 |
35 |
36 | @unique
37 | class TransformationTypes(Enum):
38 | """
39 | List of supported transformation types
40 | """
41 | SET_NULL = 'SET-NULL'
42 | MASK_HIDDEN = 'MASK-HIDDEN'
43 | MASK_DATE = 'MASK-DATE'
44 | MASK_NUMBER = 'MASK-NUMBER'
45 | HASH = 'HASH'
46 | HASH_SKIP_FIRST = 'HASH-SKIP-FIRST'
47 | MASK_STRING_SKIP_ENDS = 'MASK-STRING-SKIP-ENDS'
48 |
49 |
50 | def float_to_decimal(value):
51 | """Walk the given data structure and turn all instances of float into
52 | Decimal."""
53 | if isinstance(value, float):
54 | return Decimal(str(value))
55 | if isinstance(value, list):
56 | return [float_to_decimal(child) for child in value]
57 | if isinstance(value, dict):
58 | return {k: float_to_decimal(v) for k, v in value.items()}
59 | return value
60 |
61 |
62 | class TransformFieldException(Exception):
63 | """A known exception for which we don't need to bring a stack trace"""
64 |
65 |
66 | class TransformField:
67 | """
68 | Main Transformer class
69 | """
70 |
71 | def __init__(self, trans_config):
72 | self.trans_config = trans_config
73 | self.messages = []
74 | self.buffer_size_bytes = 0
75 | self.state = None
76 |
77 | # Time that the last batch was sent
78 | self.time_last_batch_sent = time.time()
79 |
80 | # Mapping from stream name to StreamMeta(schema, key_properties, bookmark_properties)
81 | self.stream_meta = {}
82 |
83 | # Mapping from stream name to its list of TransMeta transformations
84 | self.trans_meta = {}
85 |
86 | for trans in trans_config["transformations"]:
87 | # Naming differences in stream ids:
88 | # 1. properties.json and transformation_json use 'tap_stream_id'
89 | # 2. taps send it in the 'stream' key of singer messages
90 | stream = trans["tap_stream_name"]
91 | if stream not in self.trans_meta:
92 | self.trans_meta[stream] = []
93 |
94 | self.trans_meta[stream].append(TransMeta(
95 | trans["field_id"],
96 | trans["type"],
97 | trans.get('when'),
98 | trans.get('field_paths')
99 | ))
100 |
101 | # pylint: disable=too-many-nested-blocks,too-many-branches
102 | # todo: simplify this method
103 | def flush(self):
104 | """Give batch to handlers to process"""
105 |
106 | if self.messages:
107 | stream = self.messages[0].stream
108 | stream_meta = self.stream_meta[stream]
109 |
110 | # Transform columns
111 | messages = self.messages
112 | schema = float_to_decimal(stream_meta.schema)
113 | key_properties = stream_meta.key_properties
114 | validator = Draft7Validator(schema, format_checker=FormatChecker())
115 | trans_meta = []
116 | if stream in self.trans_meta:
117 | trans_meta = self.trans_meta[stream]
118 |
119 | for i, message in enumerate(messages):
120 | if isinstance(message, singer.RecordMessage):
121 |
122 | # Do transformation on every column where it is required
123 | for trans in trans_meta:
124 |
125 | if trans.field_id in message.record:
126 | transformed = transform.do_transform(
127 | message.record, trans.field_id, trans.type, trans.when, trans.field_paths
128 | )
129 | message.record[trans.field_id] = transformed
130 |
131 | if VALIDATE_RECORDS:
132 | # Validate the transformed columns
133 | data = float_to_decimal(message.record)
134 | try:
135 | validator.validate(data)
136 | if key_properties:
137 | for k in key_properties:
138 | if k not in data:
139 | raise TransformFieldException(
140 | f'Message {i} is missing key property {k}')
141 |
142 | except Exception as exc:
143 | if type(exc).__name__ == "InvalidOperation":
144 | raise TransformFieldException(
145 | f"Record does not pass schema validation. RECORD: {message.record}"
146 | "\n'multipleOf' validations that allows long precisions are not "
147 | "supported (i.e. with 15 digits or more). "
148 | f"Try removing 'multipleOf' methods from JSON schema.\n{exc}") from exc
149 |
150 | raise TransformFieldException(
151 | f"Record does not pass schema validation. RECORD: {message.record}\n{exc}") from exc
152 |
153 | # Write the transformed message
154 | singer.write_message(message)
155 |
156 | LOGGER.debug("Batch is valid with %s messages", len(messages))
157 |
158 | # Update stats
159 | self.time_last_batch_sent = time.time()
160 | self.messages = []
161 | self.buffer_size_bytes = 0
162 |
163 | if self.state:
164 | singer.write_message(singer.StateMessage(self.state))
165 | self.state = None
166 |
167 | TIMINGS.log_timings()
168 |
169 | def handle_line(self, line):
170 | """Takes a raw line from stdin and transforms it"""
171 | try:
172 | message = singer.parse_message(line)
173 |
174 | if not message:
175 | raise TransformFieldException('Unknown message type')
176 | except Exception as exc:
177 | raise TransformFieldException(f'Failed to process incoming message: {line}\n{exc}') from exc
178 |
179 | # If we got a Schema, set the schema and key properties for this
180 | # stream. Flush the batch, if there is one, in case the schema is
181 | # different
182 | if isinstance(message, singer.SchemaMessage):
183 | self.flush()
184 |
185 | self.stream_meta[message.stream] = StreamMeta(
186 | message.schema,
187 | message.key_properties,
188 | message.bookmark_properties)
189 |
190 | # if schema message, do validation of transformations using the schema to detect any
191 | # incompatibilities between the transformation and column types
192 | self.__validate_stream_trans(message.stream, message.schema)
193 |
194 | # Write the transformed message
195 | singer.write_message(message)
196 |
197 | elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)):
198 | if self.messages and (
199 | message.stream != self.messages[0].stream or
200 | message.version != self.messages[0].version):
201 | self.flush()
202 | self.messages.append(message)
203 | self.buffer_size_bytes += len(line)
204 |
205 | num_bytes = self.buffer_size_bytes
206 | num_messages = len(self.messages)
207 | num_seconds = time.time() - self.time_last_batch_sent
208 |
209 | enough_bytes = num_bytes >= DEFAULT_MAX_BATCH_BYTES
210 | enough_messages = num_messages >= DEFAULT_MAX_BATCH_RECORDS
211 | enough_time = num_seconds >= DEFAULT_BATCH_DELAY_SECONDS
212 | if enough_bytes or enough_messages or enough_time:
213 | LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', num_bytes, num_messages, num_seconds)
214 | self.flush()
215 |
216 | elif isinstance(message, singer.StateMessage):
217 | self.state = message.value
218 |
219 | def consume(self, reader):
220 | """Consume all the lines from the queue, flushing when done."""
221 | for line in reader:
222 | self.handle_line(line)
223 | self.flush()
224 |
225 | def validate(self, catalog: Catalog):
226 | """
227 | Validate the transformations by checking if each transformation type is compatible with the column type
228 | :param catalog: the catalog of streams with their json schema
229 | """
230 | LOGGER.info('Starting validation of transformations...')
231 |
232 | if not catalog:
233 | raise CatalogRequiredException('Catalog missing! Please provide a catalog to run validation.')
234 |
235 | # get the schema of each stream
236 | schemas = utils.get_stream_schemas(catalog)
237 |
238 | for stream_id in self.trans_meta:
239 | self.__validate_stream_trans(stream_id, schemas.get(stream_id))
240 |
241 | def __validate_stream_trans(self, stream_id: str, stream_schema: Union[Schema, Dict]):
242 | """
243 | Validation of each stream's transformations
244 | :param stream_id: ID of the stream
245 | :param stream_schema: schema of the stream
246 | """
247 |
248 | if stream_id not in self.trans_meta:
249 | return
250 |
251 | # check if we even have a schema for this transformation's stream
252 | if stream_schema is None:
253 | raise StreamNotFoundException(stream_id)
254 |
255 | # check that the stream's schema is not empty
256 | if not stream_schema:
257 | raise NoStreamSchemaException(stream_id)
258 |
259 | for transformation in self.trans_meta[stream_id]:
260 | trans_type = transformation.type
261 | field_id = transformation.field_id
262 |
263 | if isinstance(stream_schema, Schema):
264 | field_type = stream_schema.properties[field_id].type
265 | field_format = stream_schema.properties[field_id].format
266 | else:
267 | field_type = stream_schema['properties'][field_id].get('type')
268 | field_format = stream_schema['properties'][field_id].get('format')
269 |
270 | # If the value we want to transform is a field in a JSON property
271 | # then no need to enforce rules below for now
272 | if field_type and \
273 | ("object" in field_type or "array" in field_type) and \
274 | transformation.field_paths is not None:
275 | continue
276 |
277 | if trans_type in (TransformationTypes.HASH.value, TransformationTypes.MASK_HIDDEN.value) or \
278 | trans_type.startswith(TransformationTypes.HASH_SKIP_FIRST.value) or \
279 | trans_type.startswith(TransformationTypes.MASK_STRING_SKIP_ENDS.value):
280 | if not (field_type is not None and 'string' in field_type and not field_format):
281 | raise InvalidTransformationException(
282 | f'Cannot apply `{trans_type}` transformation type to a non-string field `'
283 | f'{field_id}` in stream `{stream_id}`')
284 |
285 | elif trans_type == TransformationTypes.MASK_DATE.value:
286 | if not (field_type is not None and 'string' in field_type and field_format in {'date-time', 'date'}):
287 | raise InvalidTransformationException(
288 | f'Cannot apply `{trans_type}` transformation type to a non-stringified date field'
289 | f' `{field_id}` in stream `{stream_id}`')
290 |
291 | elif trans_type == TransformationTypes.MASK_NUMBER.value:
292 | if not (field_type is not None and (
293 | 'number' in field_type or 'integer' in field_type) and not field_format):
294 | raise InvalidTransformationException(
295 | f'Cannot apply `{trans_type}` transformation type to a non-numeric field '
296 | f'`{field_id}` in stream `{stream_id}`')
297 |
298 | elif trans_type == TransformationTypes.SET_NULL.value:
299 | LOGGER.info('Transformation type is %s, no need to do any validation.', trans_type)
300 |
301 | else:
302 | raise UnsupportedTransformationTypeException(trans_type)
303 |
304 |
305 | def main_impl():
306 | """
307 | Main implementation
308 | """
309 | args = utils.parse_args(REQUIRED_CONFIG_KEYS)
310 | trans_config = {'transformations': args.config['transformations']}
311 |
312 | instance = TransformField(trans_config)
313 |
314 | if args.validate:
315 | instance.validate(args.catalog)
316 | else:
317 | reader = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
318 | instance.consume(reader)
319 |
320 | LOGGER.info("Exiting normally")
321 |
322 |
323 | def main():
324 | """Main entry point"""
325 | try:
326 | main_impl()
327 | except TransformFieldException as exc:
328 | for line in str(exc).splitlines():
329 | LOGGER.critical(line)
330 | sys.exit(1)
331 | except Exception as exc:
332 | LOGGER.critical(exc)
333 | raise exc
334 |
335 |
336 | if __name__ == '__main__':
337 | main()
338 |
--------------------------------------------------------------------------------
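To see the whole pipeline in one place, here is a minimal sketch that drives `TransformField` by hand with one SCHEMA and one RECORD line instead of piping a tap into stdin. The `users` stream, `email` field and hashing rule are illustrative; it assumes the `singer` package this project imports (pipelinewise-singer-python) is installed:

import json

from transform_field import TransformField

tf = TransformField({'transformations': [
    {'tap_stream_name': 'users', 'field_id': 'email', 'type': 'HASH'},
]})

# SCHEMA first: stores stream metadata and validates the HASH rule against it
tf.handle_line(json.dumps({
    'type': 'SCHEMA', 'stream': 'users', 'key_properties': [],
    'schema': {'properties': {'email': {'type': ['string']}}},
}))

# RECORD lines are buffered until a size/count/time threshold triggers a flush
tf.handle_line(json.dumps({
    'type': 'RECORD', 'stream': 'users',
    'record': {'email': 'jane@example.com'},
}))

tf.flush()  # writes the record to stdout with `email` replaced by its SHA-256 hex digest
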
/transform_field/errors.py:
--------------------------------------------------------------------------------
1 | class CatalogRequiredException(Exception):
2 | """Raised when catalog needs to be provided but it has not been"""
3 |
4 |
5 | class StreamNotFoundException(Exception):
6 | """Raised when catalog doesn't have a given selected stream"""
7 |
8 | def __init__(self, stream):
9 | message = f'Catalog doesn\'t have the selected stream `{stream}`!'
10 |
11 | super().__init__(message)
12 |
13 |
14 | class NoStreamSchemaException(Exception):
15 | """Raised when stream has an empty schema"""
16 |
17 | def __init__(self, stream):
18 | message = f'Stream `{stream}` has an empty schema!'
19 |
20 | super().__init__(message)
21 |
22 |
23 | class InvalidTransformationException(Exception):
24 | """Raised when the given transformation is invalid"""
25 |
26 |
27 | class UnsupportedTransformationTypeException(Exception):
28 | """Raised when the given transformation type is not supported"""
29 |
30 | def __init__(self, trans_type):
31 | message = f'Transformation `{trans_type}` is not supported!'
32 |
33 | super().__init__(message)
34 |
--------------------------------------------------------------------------------
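A sketch of how these exceptions surface during `--validate`: the catalog below declares `created_at` as a plain string with no date format, so a `MASK-DATE` rule on it is rejected (stream and field names are illustrative):

from singer import Catalog

from transform_field import TransformField
from transform_field.errors import InvalidTransformationException

catalog = Catalog.from_dict({'streams': [{
    'tap_stream_id': 'users',
    'schema': {'properties': {'created_at': {'type': ['string']}}},  # no 'date-time' format
    'metadata': [{'breadcrumb': [], 'metadata': {'selected': True}}],
}]})

instance = TransformField({'transformations': [
    {'tap_stream_name': 'users', 'field_id': 'created_at', 'type': 'MASK-DATE'},
]})

try:
    instance.validate(catalog)
except InvalidTransformationException as exc:
    print(exc)  # MASK-DATE requires a string field with a 'date' or 'date-time' format
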
/transform_field/timings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import time
4 |
5 | from contextlib import contextmanager
6 |
7 |
8 | class Timings:
9 | """Gathers timing information for the three main steps of the Transformer."""
10 |
11 | def __init__(self, logger):
12 | self.logger = logger
13 | self.last_time = time.time()
14 | self.timings = {
15 | 'validating': 0.0,
16 | 'transforming': 0.0,
17 | None: 0.0
18 | }
19 |
20 | @contextmanager
21 | def mode(self, mode):
22 | """We wrap the big steps of the Tap in this context manager to accumulate
23 | timing info."""
24 |
25 | start = time.time()
26 | yield
27 | end = time.time()
28 | self.timings[None] += start - self.last_time
29 | self.timings[mode] += end - start
30 | self.last_time = end
31 |
32 | def log_timings(self):
33 | """We call this with every flush to print out the accumulated timings"""
34 | self.logger.debug('Timings: unspecified: %.3f; validating: %.3f; transforming: %.3f;',
35 | self.timings[None],
36 | self.timings['validating'],
37 | self.timings['transforming'])
38 |
--------------------------------------------------------------------------------
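A minimal sketch of the intended usage, assuming a standard logger configured at DEBUG level; time spent outside the two named modes is accumulated under the `unspecified` bucket:

import logging
import time

from transform_field.timings import Timings

logging.basicConfig(level=logging.DEBUG)
timings = Timings(logging.getLogger('transform_field'))

with timings.mode('transforming'):
    time.sleep(0.1)   # stands in for transform work

with timings.mode('validating'):
    time.sleep(0.05)  # stands in for schema validation

timings.log_timings()  # Timings: unspecified: ...; validating: 0.050; transforming: 0.100;
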
/transform_field/transform.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import re
3 |
4 | from typing import Dict, Any, Optional, List
5 | from dpath.util import get as get_xpath, set as set_xpath
6 | from singer import get_logger
7 | from dateutil import parser
8 |
9 | LOGGER = get_logger('transform_field')
10 |
11 |
12 | def is_transform_required(record: Dict, when: Optional[List[Dict]]) -> bool:
13 | """
14 | Detects if the transformation is required or not based on
15 | the defined conditions and the actual values in a record.
16 | All conditions in when need to be met for the transformation to be required.
17 | """
18 | if not when:
19 | # Transformation is always required if 'when' condition not defined
20 | LOGGER.debug('No conditions, transformation is required')
21 | return True
22 |
23 | transform_required = False
24 |
25 | # Check if conditional transformation matches criteria
26 | # Evaluate every condition
27 | for condition in when:
28 | column_to_match = condition['column']
29 | column_value = record.get(column_to_match, "")
30 |
31 | field_path_to_match = condition.get('field_path')
32 |
33 | # check if given field exists in the column value
34 | if field_path_to_match:
35 | try:
36 | field_value = get_xpath(column_value, field_path_to_match)
37 | LOGGER.debug('field "%s" exists in the value of column "%s"', field_path_to_match, column_to_match)
38 |
39 | except KeyError:
40 | # A KeyError means the field doesn't exist, so the equals/regex match
41 | # condition cannot be evaluated; the condition isn't met, no
42 | # transformation is needed, and we can break out early
43 | transform_required = False
44 |
45 | LOGGER.debug('field "%s" doesn\'t exist in the value of column "%s", '
46 | 'so transformation is not required.', field_path_to_match, column_to_match)
47 | break
48 |
49 | cond_equals = condition.get('equals')
50 | cond_pattern = condition.get('regex_match')
51 |
52 | # Exact condition
53 | if cond_equals:
54 | LOGGER.debug('Equals condition found, value is: %s', cond_equals)
55 | if field_path_to_match:
56 | transform_required = __is_condition_met('equal', cond_equals, field_value)
57 | else:
58 | transform_required = __is_condition_met('equal', cond_equals, column_value)
59 |
60 | # Condition isn't met, exit the loop
61 | if not transform_required:
62 | LOGGER.debug('Equals condition didn\'t match, so transformation is not required.')
63 | break
64 |
65 | # Regex based condition
66 | elif cond_pattern:
67 | LOGGER.debug('Regex condition found, pattern is: %s', cond_pattern)
68 |
69 | if field_path_to_match:
70 | transform_required = __is_condition_met('regex', cond_pattern, field_value)
71 | else:
72 | transform_required = __is_condition_met('regex', cond_pattern, column_value)
73 |
74 | # Condition isn't met, exit the loop
75 | if not transform_required:
76 | LOGGER.debug('Regex pattern didn\'t match, so transformation is not required.')
77 | break
78 |
79 | LOGGER.debug('Transformation required? %s', transform_required)
80 |
81 | return transform_required
82 |
83 |
84 | def __is_condition_met(condition_type: str, condition_value: Any, value: Any) -> bool:
85 | """
86 | Checks if given value meets the given condition
87 | Args:
88 | condition_type: condition type, could be "equal" or "regex"
89 | condition_value: the value of the condition, in case of regex it's the pattern, and
90 | a value to compare to in case of equal
91 | value: the target value to run the condition against
92 |
93 | Returns: bool, True if condition is met, False otherwise
94 | """
95 |
96 | if condition_type == 'equal':
97 | return value == condition_value
98 |
99 | if condition_type == 'regex':
100 | matcher = re.compile(condition_value)
101 | return bool(matcher.search(value))
102 |
103 | raise NotImplementedError(f'__is_condition_met is not implemented for condition type "{condition_type}"')
104 |
105 |
106 | def do_transform(record: Dict,
107 | field: str,
108 | trans_type: str,
109 | when: Optional[List[Dict]] = None,
110 | field_paths: Optional[List[str]] = None
111 | ) -> Any:
112 | """Transform a value by a certain transformation type.
113 | Optionally can set conditional criteria based on other
114 | values of the record"""
115 |
116 | return_value = value = record.get(field)
117 |
118 | try:
119 | # Do transformation only if required
120 | if is_transform_required(record, when):
121 |
122 | # transforming fields nested in value dictionary
123 | if isinstance(value, dict) and field_paths:
124 | for field_path in field_paths:
125 | try:
126 | field_val = get_xpath(value, field_path)
127 | set_xpath(value, field_path, _transform_value(field_val, trans_type))
128 | except KeyError:
129 | LOGGER.error('Field path %s does not exist', field_path)
130 |
131 | return_value = value
132 |
133 | else:
134 | return_value = _transform_value(value, trans_type)
135 |
136 | # Return the original value if transformation is not required
137 | else:
138 | return_value = value
139 |
140 | return return_value
141 |
142 | # Return the original value if the transformation fails
143 | except Exception:
144 | return return_value
145 |
146 |
147 | def _transform_value(value: Any, trans_type: str) -> Any:
148 | """
149 | Applies the given transformation type to the given value
150 | Args:
151 | value: value to transform
152 | trans_type: transformation type to apply
153 |
154 | Returns:
155 | transformed value
156 | """
157 | # Transforms any input to NULL
158 | if trans_type == "SET-NULL":
159 | return_value = None
160 |
161 | # Transforms string input to hash
162 | elif trans_type == "HASH":
163 | return_value = hashlib.sha256(value.encode('utf-8')).hexdigest()
164 |
165 | # Transforms string input to hash skipping first n characters, e.g. HASH-SKIP-FIRST-2
166 | elif 'HASH-SKIP-FIRST' in trans_type:
167 | return_value = value[:int(trans_type[-1])] + \
168 | hashlib.sha256(value.encode('utf-8')[int(trans_type[-1]):]).hexdigest()
169 |
170 | # Transforms any date to January 1 of the same year, keeping the time part
171 | elif trans_type == "MASK-DATE":
172 | return_value = parser.parse(value).replace(month=1, day=1).isoformat()
173 |
174 | # Transforms any number to zero
175 | elif trans_type == "MASK-NUMBER":
176 | return_value = 0
177 |
178 | # Transforms any value to "hidden"
179 | elif trans_type == "MASK-HIDDEN":
180 | return_value = 'hidden'
181 |
182 | # Transforms string input to masked version skipping first and last n characters
183 | # e.g. MASK-STRING-SKIP-ENDS-3
184 | elif 'MASK-STRING-SKIP-ENDS' in trans_type:
185 | skip_ends_n = int(trans_type[-1])
186 | value_len = len(value)
187 | return_value = '*' * value_len if value_len <= (2 * skip_ends_n) \
188 | else f'{value[:skip_ends_n]}{"*" * (value_len - (2 * skip_ends_n))}{value[-skip_ends_n:]}'
189 |
190 | # Return the original value if the transformation type is unknown
191 | # todo: is this the right behavior?
192 | else:
193 | LOGGER.warning('Cannot find transformation type %s, returning same value', trans_type)
194 | return_value = value
195 |
196 | return return_value
197 |
--------------------------------------------------------------------------------
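Two quick sketches of the `do_transform` behaviour above: masking a field nested inside a dict-valued column via `field_paths`, and the fixed-ends string mask (column names and values are illustrative):

from transform_field.transform import do_transform

# Nested transformation: only 'profile/phone' inside the dict-valued column is masked
record = {'user': {'profile': {'phone': '555-1234', 'name': 'Jane'}}}
print(do_transform(record, 'user', 'MASK-HIDDEN', field_paths=['profile/phone']))
# {'profile': {'phone': 'hidden', 'name': 'Jane'}}

# MASK-STRING-SKIP-ENDS-4 keeps 4 characters on each end and masks the middle
print(do_transform({'card': '4111111111111111'}, 'card', 'MASK-STRING-SKIP-ENDS-4'))
# 4111********1111
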
/transform_field/utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from typing import Dict
4 | from singer import Catalog, get_logger, Schema
5 | from singer.utils import check_config, load_json
6 |
7 |
8 | LOGGER = get_logger('transform_field')
9 |
10 |
11 | def parse_args(required_config_keys):
12 | """
13 | Parse standard command-line args.
14 |
15 | Parses the command-line arguments mentioned in the SPEC and the BEST_PRACTICES documents:
16 |
17 | -c,--config Config file
18 | --validate flag to validate the transformations
19 | --catalog Catalog file
20 |
21 | Returns the parsed args object from argparse. For each argument that
22 | points to a JSON file (config, catalog), we will automatically
23 | load and parse the JSON file.
24 | """
25 | parser = argparse.ArgumentParser()
26 |
27 | parser.add_argument(
28 | '-c', '--config',
29 | help='Config file',
30 | required=True)
31 |
32 | parser.add_argument(
33 | '--validate',
34 | help='Flag to trigger one-off validation of transformations in config file using the catalog',
35 | default=False,
36 | action='store_true'
37 | )
38 |
39 | parser.add_argument(
40 | '--catalog',
41 | help='Catalog file')
42 |
43 | args = parser.parse_args()
44 |
45 | if args.config:
46 | setattr(args, 'config_path', args.config)
47 | args.config = load_json(args.config)
48 |
49 | if args.catalog:
50 | setattr(args, 'catalog_path', args.catalog)
51 | args.catalog = Catalog.load(args.catalog)
52 |
53 | check_config(args.config, required_config_keys)
54 |
55 | return args
56 |
57 |
58 | def get_stream_schemas(catalog: Catalog) -> Dict[str, Schema]:
59 | """
60 | Build a map of streams with their schemas
61 | :param catalog:
62 | :return: Dictionary mapping stream ID to its schema
63 | """
64 | return {
65 | stream.tap_stream_id: stream.schema
66 | for stream in catalog.streams if stream.is_selected()
67 | }
68 |
--------------------------------------------------------------------------------
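Finally, a sketch of wiring the two helpers together, assuming `config.json` and `catalog.json` exist on disk in the formats shown by the sample files:

import sys

from transform_field.utils import parse_args, get_stream_schemas

# Equivalent to: transform-field --config config.json --catalog catalog.json --validate
sys.argv = ['transform-field', '--config', 'config.json',
            '--catalog', 'catalog.json', '--validate']

args = parse_args({'transformations'})      # loads and checks both files
schemas = get_stream_schemas(args.catalog)  # only streams marked 'selected'
print(list(schemas))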