├── .github
    ├── CODEOWNERS
    ├── ISSUE_TEMPLATE
    │   ├── BUG_REPORT.md
    │   ├── FEATURE_REQUEST.md
    │   ├── QUESTION.md
    │   └── config.yml
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── ci.yml
    │   └── pythonpublish.yml
├── .gitignore
├── .pylintrc
├── CHANGELOG.md
├── LICENSE
├── README.md
├── sample_config.json
├── sample_logging.conf
├── setup.py
├── tests
    ├── __init__.py
    ├── integration
    │   ├── __init__.py
    │   ├── resources
    │   │   ├── catalog.json
    │   │   ├── invalid_config.json
    │   │   ├── invalid_messages.json
    │   │   ├── messages.json
    │   │   ├── streams_with_changing_schema.json
    │   │   ├── streams_with_object.json
    │   │   └── valid_config.json
    │   └── test_integrations.py
    └── unit
    │   ├── __init__.py
    │   ├── test_init.py
    │   ├── test_transform.py
    │   └── test_utils.py
└── transform_field
    ├── __init__.py
    ├── errors.py
    ├── timings.py
    ├── transform.py
    └── utils.py


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @transferwise/analytics-platform
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/BUG_REPORT.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a bug report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ---
11 | name: Bug report
12 | about: Create a report to help us improve
13 | title: ''
14 | labels: ''
15 | assignees: ''
16 | 
17 | ---
18 | 
19 | **Describe the bug**
20 | A clear and concise description of what the bug is.
21 | 
22 | **To Reproduce**
23 | Steps to reproduce the behavior:
24 | 1. Prepare the data as '...'
25 | 2. Run the command '....'
26 | 4. See error
27 | 
28 | **Expected behavior**
29 | A clear and concise description of what you expected to happen.
30 | 
31 | **Screenshots**
32 | If applicable, add screenshots to help explain your problem.
33 | 
34 | **Your environment**
35 | - Version, e.g branch/commit #/release/tag
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/QUESTION.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Question
 3 | about: Ask anything about this project
 4 | title: ''
 5 | labels: help wanted
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Your question**
11 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 |   - name: PipelineWise Community Slack channel
4 |     url: https://singer-io.slack.com/messages/pipelinewise
5 |     about: Open discussion about PipelineWise
6 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## Problem
 2 | 
 3 | _Describe the problem your PR is trying to solve_
 4 | 
 5 | ## Proposed changes
 6 | 
 7 | _Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request.
 8 | If it fixes a bug or resolves a feature request, be sure to link to that issue._
 9 | 
10 | 
11 | ## Types of changes
12 | 
13 | What types of changes does your code introduce to pipelinewise-transform-field?
14 | _Put an `x` in the boxes that apply_
15 | 
16 | - [ ] Bugfix (non-breaking change which fixes an issue)
17 | - [ ] New feature (non-breaking change which adds functionality)
18 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
19 | - [ ] Documentation Update (if none of the other choices apply)
20 | 
21 | 
22 | ## Checklist
23 | 
24 | - [ ] I have read the [CONTRIBUTING](https://github.com/transferwise/pipelinewise/blob/master/CONTRIBUTING.md) doc.
25 | - [ ] Description above provides context of the change
26 | - [ ] I have added tests that prove my fix is effective or that my feature works
27 | - [ ] Unit tests for changes (not needed for documentation changes)
28 | - [ ] CI checks pass with my changes
29 | - [ ] Bumping version in `setup.py` is an individual PR and not mixed with feature or bugfix PRs
30 | - [ ] Commit message/PR title starts with `[AP-NNNN]` (if applicable. AP-NNNN = JIRA ID)
31 | - [ ] Branch name starts with `AP-NNN` (if applicable. AP-NNN = JIRA ID)
32 | - [ ] Commits follow "[How to write a good git commit message](http://chris.beams.io/posts/git-commit/)"
33 | - [ ] Relevant documentation is updated including usage instructions
34 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # This is an automatically generated base configuration
 2 | # For further configuration options and tuning:
 3 | # https://docs.github.com/en/free-pro-team@latest/github/administering-a-repository/configuration-options-for-dependency-updates
 4 | 
 5 | version: 2
 6 | updates:
 7 |   - package-ecosystem: "pip"
 8 |     directory: "/"
 9 |     schedule:
10 |       interval: "weekly"
11 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: [master]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | concurrency:
12 |   group: ci-${{ github.head_ref }}
13 |   cancel-in-progress: true
14 | 
15 | jobs:
16 |   build:
17 | 
18 |     runs-on: ubuntu-latest
19 |     strategy:
20 |       fail-fast: true
21 |       matrix:
22 |         python-version: [3.6, 3.7, 3.8]
23 | 
24 |     steps:
25 |       - name: Checking out repo
26 |         uses: actions/checkout@v2
27 | 
28 |       - name: Set up Python ${{ matrix.container[1] }}
29 |         uses: actions/setup-python@v2
30 |         with:
31 |           python-version: ${{ matrix.python-version }}
32 | 
33 |       - name: set LOGGING_CONF_FILE env
34 |         run: |
35 |           export LOGGING_CONF_FILE=$(pwd)/sample_logging.conf
36 |           echo $LOGGING_CONF_FILE
37 | 
38 |       - name: Install dependencies
39 |         run: |
40 |           pip install --upgrade pip setuptools
41 |           pip install .[test]
42 | 
43 |       - name: Check if pylint is happy
44 |         run: pylint transform_field
45 | 
46 |       - name: Run Unit Tests with min coverage
47 |         run: pytest --cov=transform_field --cov-fail-under=65 -v tests/unit
48 | 
49 |       - name: Run Integration Tests with min coverage
50 |         run: pytest --cov-fail-under=73 -v tests/integration
51 | 


--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package to PyPi
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v2
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.x'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |     - name: Build and publish
21 |       env:
22 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 |       run: |
25 |         python setup.py sdist bdist_wheel
26 |         twine upload dist/*
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # IDE
 2 | .vscode
 3 | .idea/*
 4 | 
 5 | 
 6 | # Python
 7 | __pycache__/
 8 | *.py[cod]
 9 | *$py.class
10 | .virtualenvs
11 | *.egg-info/
12 | *~
13 | dist/
14 | 
15 | # Singer JSON files
16 | properties.json
17 | config.json
18 | state.json
19 | 
20 | *.db
21 | .DS_Store
22 | venv
23 | env
24 | blog_old.md
25 | node_modules
26 | *.pyc
27 | tmp
28 | 
29 | # Docs
30 | docs/_build/
31 | docs/_templates/
32 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
  1 | # Based on Apache 2.0 licensed code from https://github.com/ClusterHQ/flocker
  2 | 
  3 | [MASTER]
  4 | 
  5 | # Specify a configuration file.
  6 | #rcfile=
  7 | 
  8 | # Python code to execute, usually for sys.path manipulation such as
  9 | # pygtk.require().
 10 | # init-hook=
 11 | 
 12 | # Add files or directories to the blacklist. They should be base names, not paths.
 13 | ignore=
 14 | 
 15 | # Pickle collected data for later comparisons.
 16 | persistent=no
 17 | 
 18 | # List of plugins (as comma separated values of python modules names) to load,
 19 | # usually to register additional checkers.
 20 | load-plugins=
 21 | 
 22 | # Use multiple processes to speed up Pylint.
 23 | # DO NOT CHANGE THIS VALUES >1 HIDE RESULTS!!!!!
 24 | jobs=1
 25 | 
 26 | # Allow loading of arbitrary C extensions. Extensions are imported into the
 27 | # active Python interpreter and may run arbitrary code.
 28 | unsafe-load-any-extension=no
 29 | 
 30 | # A comma-separated list of package or module names from where C extensions may
 31 | # be loaded. Extensions are loading into the active Python interpreter and may
 32 | # run arbitrary code
 33 | extension-pkg-whitelist=ujson
 34 | 
 35 | # Allow optimization of some AST trees. This will activate a peephole AST
 36 | # optimizer, which will apply various small optimizations. For instance, it can
 37 | # be used to obtain the result of joining multiple strings with the addition
 38 | # operator. Joining a lot of strings can lead to a maximum recursion error in
 39 | # Pylint and this flag can prevent that. It has one side effect, the resulting
 40 | # AST will be different than the one from reality.
 41 | optimize-ast=no
 42 | 
 43 | 
 44 | [MESSAGES CONTROL]
 45 | 
 46 | # Only show warnings with the listed confidence levels. Leave empty to show
 47 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
 48 | confidence=
 49 | 
 50 | # Enable the message, report, category or checker with the given id(s). You can
 51 | # either give multiple identifier separated by comma (,) or put this option
 52 | # multiple time. See also the "--disable" option for examples.
 53 | disable=wrong-import-order,
 54 |         broad-except,
 55 |         missing-module-docstring,
 56 |         duplicate-code,  # not useful until a major code refactoring
 57 | 
 58 | 
 59 | enable=import-error,
 60 |        import-self,
 61 |        reimported,
 62 |        wildcard-import,
 63 |        misplaced-future,
 64 |        deprecated-module,
 65 |        unpacking-non-sequence,
 66 |        invalid-all-object,
 67 |        undefined-all-variable,
 68 |        used-before-assignment,
 69 |        cell-var-from-loop,
 70 |        global-variable-undefined,
 71 |        redefine-in-handler,
 72 |        unused-import,
 73 |        unused-wildcard-import,
 74 |        global-variable-not-assigned,
 75 |        undefined-loop-variable,
 76 |        global-statement,
 77 |        global-at-module-level,
 78 |        bad-open-mode,
 79 |        redundant-unittest-assert,
 80 |        boolean-datetime
 81 |        deprecated-method,
 82 |        anomalous-unicode-escape-in-string,
 83 |        anomalous-backslash-in-string,
 84 |        not-in-loop,
 85 |        continue-in-finally,
 86 |        abstract-class-instantiated,
 87 |        star-needs-assignment-target,
 88 |        duplicate-argument-name,
 89 |        return-in-init,
 90 |        too-many-star-expressions,
 91 |        nonlocal-and-global,
 92 |        return-outside-function,
 93 |        return-arg-in-generator,
 94 |        invalid-star-assignment-target,
 95 |        bad-reversed-sequence,
 96 |        nonexistent-operator,
 97 |        yield-outside-function,
 98 |        init-is-generator,
 99 |        nonlocal-without-binding,
100 |        lost-exception,
101 |        assert-on-tuple,
102 |        dangerous-default-value,
103 |        duplicate-key,
104 |        useless-else-on-loop
105 |        expression-not-assigned,
106 |        confusing-with-statement,
107 |        unnecessary-lambda,
108 |        pointless-statement,
109 |        pointless-string-statement,
110 |        unnecessary-pass,
111 |        unreachable,
112 |        eval-used,
113 |        exec-used,
114 |        using-constant-test,
115 |        bad-super-call,
116 |        missing-super-argument,
117 |        slots-on-old-class,
118 |        super-on-old-class,
119 |        property-on-old-class,
120 |        not-an-iterable,
121 |        not-a-mapping,
122 |        format-needs-mapping,
123 |        truncated-format-string,
124 |        missing-format-string-key,
125 |        mixed-format-string,
126 |        too-few-format-args,
127 |        bad-str-strip-call,
128 |        too-many-format-args,
129 |        bad-format-character,
130 |        format-combined-specification,
131 |        bad-format-string-key,
132 |        bad-format-string,
133 |        missing-format-attribute,
134 |        missing-format-argument-key,
135 |        unused-format-string-argument
136 |        unused-format-string-key,
137 |        invalid-format-index,
138 |        bad-indentation,
139 |        mixed-indentation,
140 |        unnecessary-semicolon,
141 |        lowercase-l-suffix,
142 |        invalid-encoded-data,
143 |        unpacking-in-except,
144 |        import-star-module-level,
145 |        long-suffix,
146 |        old-octal-literal,
147 |        old-ne-operator,
148 |        backtick,
149 |        old-raise-syntax,
150 |        metaclass-assignment,
151 |        next-method-called,
152 |        dict-iter-method,
153 |        dict-view-method,
154 |        indexing-exception,
155 |        raising-string,
156 |        using-cmp-argument,
157 |        cmp-method,
158 |        coerce-method,
159 |        delslice-method,
160 |        getslice-method,
161 |        hex-method,
162 |        nonzero-method,
163 |        t-method,
164 |        setslice-method,
165 |        old-division,
166 |        logging-format-truncated,
167 |        logging-too-few-args,
168 |        logging-too-many-args,
169 |        logging-unsupported-format,
170 |        logging-format-interpolation,
171 |        invalid-unary-operand-type,
172 |        unsupported-binary-operation,
173 |        not-callable,
174 |        redundant-keyword-arg,
175 |        assignment-from-no-return,
176 |        assignment-from-none,
177 |        not-context-manager,
178 |        repeated-keyword,
179 |        missing-kwoa,
180 |        no-value-for-parameter,
181 |        invalid-sequence-index,
182 |        invalid-slice-index,
183 |        unexpected-keyword-arg,
184 |        unsupported-membership-test,
185 |        unsubscriptable-object,
186 |        access-member-before-definition,
187 |        method-hidden,
188 |        assigning-non-slot,
189 |        duplicate-bases,
190 |        inconsistent-mro,
191 |        inherit-non-class,
192 |        invalid-slots,
193 |        invalid-slots-object,
194 |        no-method-argument,
195 |        no-self-argument,
196 |        unexpected-special-method-signature,
197 |        non-iterator-returned,
198 |        arguments-differ,
199 |        signature-differs,
200 |        bad-staticmethod-argument,
201 |        non-parent-init-called,
202 |        bad-except-order,
203 |        catching-non-exception,
204 |        bad-exception-context,
205 |        notimplemented-raised,
206 |        raising-bad-type,
207 |        raising-non-exception,
208 |        misplaced-bare-raise,
209 |        duplicate-except,
210 |        nonstandard-exception,
211 |        binary-op-exception,
212 |        bare-except,
213 |        not-async-context-manager,
214 |        yield-inside-async-function
215 | 
216 | # Needs investigation:
217 | # abstract-method (might be indicating a bug? probably not though)
218 | # protected-access (requires some refactoring)
219 | # attribute-defined-outside-init (requires some refactoring)
220 | # super-init-not-called (requires some cleanup)
221 | 
222 | # Things we'd like to enable someday:
223 | # redefined-builtin (requires a bunch of work to clean up our code first)
224 | # redefined-outer-name (requires a bunch of work to clean up our code first)
225 | # undefined-variable (re-enable when pylint fixes https://github.com/PyCQA/pylint/issues/760)
226 | # no-name-in-module (giving us spurious warnings https://github.com/PyCQA/pylint/issues/73)
227 | # unused-argument (need to clean up or code a lot, e.g. prefix unused_?)
228 | # function-redefined (@overload causes lots of spurious warnings)
229 | # too-many-function-args (@overload causes spurious warnings... I think)
230 | # parameter-unpacking (needed for eventual Python 3 compat)
231 | # print-statement (needed for eventual Python 3 compat)
232 | # filter-builtin-not-iterating (Python 3)
233 | # map-builtin-not-iterating (Python 3)
234 | # range-builtin-not-iterating (Python 3)
235 | # zip-builtin-not-iterating (Python 3)
236 | # many others relevant to Python 3
237 | # unused-variable (a little work to cleanup, is all)
238 | 
239 | # ...
240 | [REPORTS]
241 | 
242 | # Set the output format. Available formats are text, parseable, colorized, msvs
243 | # (visual studio) and html. You can also give a reporter class, eg
244 | # mypackage.mymodule.MyReporterClass.
245 | output-format=parseable
246 | 
247 | # Put messages in a separate file for each module / package specified on the
248 | # command line instead of printing them on stdout. Reports (if any) will be
249 | # written in a file name "pylint_global.[txt|html]".
250 | files-output=no
251 | 
252 | # Tells whether to display a full report or only the messages
253 | reports=no
254 | 
255 | # Python expression which should return a note less than 10 (10 is the highest
256 | # note). You have access to the variables errors warning, statement which
257 | # respectively contain the number of errors / warnings messages and the total
258 | # number of statements analyzed. This is used by the global evaluation report
259 | # (RP0004).
260 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
261 | 
262 | # Template used to display messages. This is a python new-style format string
263 | # used to format the message information. See doc for all details
264 | #msg-template=
265 | 
266 | 
267 | [LOGGING]
268 | 
269 | # Logging modules to check that the string format arguments are in logging
270 | # function parameter format
271 | logging-modules=logging
272 | 
273 | 
274 | [FORMAT]
275 | 
276 | # Maximum number of characters on a single line.
277 | max-line-length=120
278 | 
279 | # Regexp for a line that is allowed to be longer than the limit.
280 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
281 | 
282 | # Allow the body of an if to be on the same line as the test if there is no
283 | # else.
284 | single-line-if-stmt=no
285 | 
286 | # List of optional constructs for which whitespace checking is disabled. `dict-
287 | # separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
288 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
289 | # `empty-line` allows space-only lines.
290 | no-space-check=trailing-comma,dict-separator
291 | 
292 | # Maximum number of lines in a module
293 | max-module-lines=1000
294 | 
295 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
296 | # tab).
297 | indent-string='    '
298 | 
299 | # Number of spaces of indent required inside a hanging  or continued line.
300 | indent-after-paren=4
301 | 
302 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
303 | expected-line-ending-format=
304 | 
305 | 
306 | [TYPECHECK]
307 | 
308 | # Tells whether missing members accessed in mixin class should be ignored. A
309 | # mixin class is detected if its name ends with "mixin" (case insensitive).
310 | ignore-mixin-members=yes
311 | 
312 | # List of module names for which member attributes should not be checked
313 | # (useful for modules/projects where namespaces are manipulated during runtime
314 | # and thus existing member attributes cannot be deduced by static analysis. It
315 | # supports qualified module names, as well as Unix pattern matching.
316 | ignored-modules=
317 | 
318 | # List of classes names for which member attributes should not be checked
319 | # (useful for classes with attributes dynamically set). This supports can work
320 | # with qualified names.
321 | ignored-classes=
322 | 
323 | # List of members which are set dynamically and missed by pylint inference
324 | # system, and so shouldn't trigger E1101 when accessed. Python regular
325 | # expressions are accepted.
326 | generated-members=
327 | 
328 | 
329 | [VARIABLES]
330 | 
331 | # Tells whether we should check for unused import in __init__ files.
332 | init-import=no
333 | 
334 | # A regular expression matching the name of dummy variables (i.e. expectedly
335 | # not used).
336 | dummy-variables-rgx=_$|dummy
337 | 
338 | # List of additional names supposed to be defined in builtins. Remember that
339 | # you should avoid to define new builtins when possible.
340 | additional-builtins=
341 | 
342 | # List of strings which can identify a callback function by name. A callback
343 | # name must start or end with one of those strings.
344 | callbacks=cb_,_cb
345 | 
346 | 
347 | [SIMILARITIES]
348 | 
349 | # Minimum lines number of a similarity.
350 | min-similarity-lines=4
351 | 
352 | # Ignore comments when computing similarities.
353 | ignore-comments=yes
354 | 
355 | # Ignore docstrings when computing similarities.
356 | ignore-docstrings=yes
357 | 
358 | # Ignore imports when computing similarities.
359 | ignore-imports=no
360 | 
361 | 
362 | [SPELLING]
363 | 
364 | # Spelling dictionary name. Available dictionaries: none. To make it working
365 | # install python-enchant package.
366 | spelling-dict=
367 | 
368 | # List of comma separated words that should not be checked.
369 | spelling-ignore-words=
370 | 
371 | # A path to a file that contains private dictionary; one word per line.
372 | spelling-private-dict-file=
373 | 
374 | # Tells whether to store unknown words to indicated private dictionary in
375 | # --spelling-private-dict-file option instead of raising a message.
376 | spelling-store-unknown-words=no
377 | 
378 | 
379 | [MISCELLANEOUS]
380 | 
381 | # List of note tags to take in consideration, separated by a comma.
382 | notes=FIXME,XXX
383 | 
384 | 
385 | [BASIC]
386 | 
387 | # List of builtins function names that should not be used, separated by a comma
388 | bad-functions=map,filter,input
389 | 
390 | # Good variable names which should always be accepted, separated by a comma
391 | good-names=i,j,k,ex,Run,_
392 | 
393 | # Bad variable names which should always be refused, separated by a comma
394 | bad-names=foo,bar,baz,toto,tutu,tata
395 | 
396 | # Colon-delimited sets of names that determine each other's naming style when
397 | # the name regexes allow several styles.
398 | name-group=
399 | 
400 | # Include a hint for the correct naming format with invalid-name
401 | include-naming-hint=no
402 | 
403 | # Regular expression matching correct function names
404 | function-rgx=[a-z_][a-z0-9_]{2,40}$
405 | 
406 | # Naming hint for function names
407 | function-name-hint=[a-z_][a-z0-9_]{2,40}$
408 | 
409 | # Regular expression matching correct variable names
410 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
411 | 
412 | # Naming hint for variable names
413 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
414 | 
415 | # Regular expression matching correct constant names
416 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
417 | 
418 | # Naming hint for constant names
419 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
420 | 
421 | # Regular expression matching correct attribute names
422 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
423 | 
424 | # Naming hint for attribute names
425 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
426 | 
427 | # Regular expression matching correct argument names
428 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
429 | 
430 | # Naming hint for argument names
431 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
432 | 
433 | # Regular expression matching correct class attribute names
434 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
435 | 
436 | # Naming hint for class attribute names
437 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
438 | 
439 | # Regular expression matching correct inline iteration names
440 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
441 | 
442 | # Naming hint for inline iteration names
443 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
444 | 
445 | # Regular expression matching correct class names
446 | class-rgx=[A-Z_][a-zA-Z0-9]+$
447 | 
448 | # Naming hint for class names
449 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
450 | 
451 | # Regular expression matching correct module names
452 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
453 | 
454 | # Naming hint for module names
455 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
456 | 
457 | # Regular expression matching correct method names
458 | method-rgx=[a-z_][a-z0-9_]{2,30}$
459 | 
460 | # Naming hint for method names
461 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
462 | 
463 | # Regular expression which should only match function or class names that do
464 | # not require a docstring.
465 | no-docstring-rgx=^_
466 | 
467 | # Minimum line length for functions/classes that require docstrings, shorter
468 | # ones are exempt.
469 | docstring-min-length=-1
470 | 
471 | 
472 | [ELIF]
473 | 
474 | # Maximum number of nested blocks for function / method body
475 | max-nested-blocks=5
476 | 
477 | 
478 | [IMPORTS]
479 | 
480 | # Deprecated modules which should not be used, separated by a comma
481 | deprecated-modules=regsub,TERMIOS,Bastion,rexec
482 | 
483 | # Create a graph of every (i.e. internal and external) dependencies in the
484 | # given file (report RP0402 must not be disabled)
485 | import-graph=
486 | 
487 | # Create a graph of external dependencies in the given file (report RP0402 must
488 | # not be disabled)
489 | ext-import-graph=
490 | 
491 | # Create a graph of internal dependencies in the given file (report RP0402 must
492 | # not be disabled)
493 | int-import-graph=
494 | 
495 | 
496 | [DESIGN]
497 | 
498 | # Maximum number of arguments for function / method
499 | max-args=7
500 | 
501 | # Argument names that match this expression will be ignored. Default to name
502 | # with leading underscore
503 | ignored-argument-names=_.*
504 | 
505 | # Maximum number of locals for function / method body
506 | max-locals=15
507 | 
508 | # Maximum number of return / yield for function / method body
509 | max-returns=6
510 | 
511 | # Maximum number of branch for function / method body
512 | max-branches=12
513 | 
514 | # Maximum number of statements in function / method body
515 | max-statements=50
516 | 
517 | # Maximum number of parents for a class (see R0901).
518 | max-parents=7
519 | 
520 | # Maximum number of attributes for a class (see R0902).
521 | max-attributes=7
522 | 
523 | # Minimum number of public methods for a class (see R0903).
524 | min-public-methods=2
525 | 
526 | # Maximum number of public methods for a class (see R0904).
527 | max-public-methods=20
528 | 
529 | # Maximum number of boolean expressions in a if statement
530 | max-bool-expr=5
531 | 
532 | 
533 | [CLASSES]
534 | 
535 | # List of method names used to declare (i.e. assign) instance attributes.
536 | defining-attr-methods=__init__,__new__,setUp
537 | 
538 | # List of valid names for the first argument in a class method.
539 | valid-classmethod-first-arg=cls
540 | 
541 | # List of valid names for the first argument in a metaclass class method.
542 | valid-metaclass-classmethod-first-arg=mcs
543 | 
544 | # List of member names, which should be excluded from the protected access
545 | # warning.
546 | exclude-protected=_asdict,_fields,_replace,_source,_make
547 | 
548 | 
549 | [EXCEPTIONS]
550 | 
551 | # Exceptions that will emit a warning when being caught. Defaults to
552 | # "Exception"
553 | overgeneral-exceptions=Exception


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # CHANGELOG
 2 | 
 3 | ## 2.3.0 (2021-12-16)
 4 | ### Added
 5 | - Transformation of specific fields in object/array type properties in `RECORD` by using XPath syntax.
 6 | - Conditions on specific fields in object/array type properties in `RECORD`.
 7 | 
 8 | ## 2.2.0 (2021-09-17)
 9 | ### Added
10 | - New transformation MASK-STRING-SKIP-ENDS-n. The transformation masks the string except start and end n-characters.
11 | 
12 | ## 2.1.0 (2021-03-11)
13 | ### Addedd
14 | - `--validate` flag to do one-off validatation of the transformation config using a given catalog file.
15 | 
16 | ### Changed
17 | - Validation of the transformation during runtime whenever a new `SCHEMA` type message has been received.
18 | 
19 | 
20 | ## 2.0.0 (2020-03-17)
21 | 
22 | ### Changed
23 | - Stop trimming transformed values
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Copyright 2012 The Obvious Corporation and contributors.
  2 | <http://obvious.com/>
  3 | 
  4 | Licensed under the Apache License, Version 2.0 (the "License");
  5 | you may not use this file except in compliance with the License.
  6 | You may obtain a copy of the License at
  7 | 
  8 | <http://www.apache.org/licenses/LICENSE-2.0>
  9 | 
 10 | Unless required by applicable law or agreed to in writing, software
 11 | distributed under the License is distributed on an "AS IS" BASIS,
 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | See the License for the specific language governing permissions and
 14 | limitations under the License.
 15 | 
 16 | ```
 17 | -------------------------------------------------------------------------
 18 |                               Apache License
 19 |                         Version 2.0, January 2004
 20 |                      http://www.apache.org/licenses/
 21 | 
 22 | 
 23 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 24 | 
 25 | 1. Definitions.
 26 | 
 27 |    "License" shall mean the terms and conditions for use, reproduction,
 28 |    and distribution as defined by Sections 1 through 9 of this document.
 29 | 
 30 |    "Licensor" shall mean the copyright owner or entity authorized by
 31 |    the copyright owner that is granting the License.
 32 | 
 33 |    "Legal Entity" shall mean the union of the acting entity and all
 34 |    other entities that control, are controlled by, or are under common
 35 |    control with that entity. For the purposes of this definition,
 36 |    "control" means (i) the power, direct or indirect, to cause the
 37 |    direction or management of such entity, whether by contract or
 38 |    otherwise, or (ii) ownership of fifty percent (50%) or more of the
 39 |    outstanding shares, or (iii) beneficial ownership of such entity.
 40 | 
 41 |    "You" (or "Your") shall mean an individual or Legal Entity
 42 |    exercising permissions granted by this License.
 43 | 
 44 |    "Source" form shall mean the preferred form for making modifications,
 45 |    including but not limited to software source code, documentation
 46 |    source, and configuration files.
 47 | 
 48 |    "Object" form shall mean any form resulting from mechanical
 49 |    transformation or translation of a Source form, including but
 50 |    not limited to compiled object code, generated documentation,
 51 |    and conversions to other media types.
 52 | 
 53 |    "Work" shall mean the work of authorship, whether in Source or
 54 |    Object form, made available under the License, as indicated by a
 55 |    copyright notice that is included in or attached to the work
 56 |    (an example is provided in the Appendix below).
 57 | 
 58 |    "Derivative Works" shall mean any work, whether in Source or Object
 59 |    form, that is based on (or derived from) the Work and for which the
 60 |    editorial revisions, annotations, elaborations, or other modifications
 61 |    represent, as a whole, an original work of authorship. For the purposes
 62 |    of this License, Derivative Works shall not include works that remain
 63 |    separable from, or merely link (or bind by name) to the interfaces of,
 64 |    the Work and Derivative Works thereof.
 65 | 
 66 |    "Contribution" shall mean any work of authorship, including
 67 |    the original version of the Work and any modifications or additions
 68 |    to that Work or Derivative Works thereof, that is intentionally
 69 |    submitted to Licensor for inclusion in the Work by the copyright owner
 70 |    or by an individual or Legal Entity authorized to submit on behalf of
 71 |    the copyright owner. For the purposes of this definition, "submitted"
 72 |    means any form of electronic, verbal, or written communication sent
 73 |    to the Licensor or its representatives, including but not limited to
 74 |    communication on electronic mailing lists, source code control systems,
 75 |    and issue tracking systems that are managed by, or on behalf of, the
 76 |    Licensor for the purpose of discussing and improving the Work, but
 77 |    excluding communication that is conspicuously marked or otherwise
 78 |    designated in writing by the copyright owner as "Not a Contribution."
 79 | 
 80 |    "Contributor" shall mean Licensor and any individual or Legal Entity
 81 |    on behalf of whom a Contribution has been received by Licensor and
 82 |    subsequently incorporated within the Work.
 83 | 
 84 | 2. Grant of Copyright License. Subject to the terms and conditions of
 85 |    this License, each Contributor hereby grants to You a perpetual,
 86 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 87 |    copyright license to reproduce, prepare Derivative Works of,
 88 |    publicly display, publicly perform, sublicense, and distribute the
 89 |    Work and such Derivative Works in Source or Object form.
 90 | 
 91 | 3. Grant of Patent License. Subject to the terms and conditions of
 92 |    this License, each Contributor hereby grants to You a perpetual,
 93 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 94 |    (except as stated in this section) patent license to make, have made,
 95 |    use, offer to sell, sell, import, and otherwise transfer the Work,
 96 |    where such license applies only to those patent claims licensable
 97 |    by such Contributor that are necessarily infringed by their
 98 |    Contribution(s) alone or by combination of their Contribution(s)
 99 |    with the Work to which such Contribution(s) was submitted. If You
100 |    institute patent litigation against any entity (including a
101 |    cross-claim or counterclaim in a lawsuit) alleging that the Work
102 |    or a Contribution incorporated within the Work constitutes direct
103 |    or contributory patent infringement, then any patent licenses
104 |    granted to You under this License for that Work shall terminate
105 |    as of the date such litigation is filed.
106 | 
107 | 4. Redistribution. You may reproduce and distribute copies of the
108 |    Work or Derivative Works thereof in any medium, with or without
109 |    modifications, and in Source or Object form, provided that You
110 |    meet the following conditions:
111 | 
112 |    (a) You must give any other recipients of the Work or
113 |        Derivative Works a copy of this License; and
114 | 
115 |    (b) You must cause any modified files to carry prominent notices
116 |        stating that You changed the files; and
117 | 
118 |    (c) You must retain, in the Source form of any Derivative Works
119 |        that You distribute, all copyright, patent, trademark, and
120 |        attribution notices from the Source form of the Work,
121 |        excluding those notices that do not pertain to any part of
122 |        the Derivative Works; and
123 | 
124 |    (d) If the Work includes a "NOTICE" text file as part of its
125 |        distribution, then any Derivative Works that You distribute must
126 |        include a readable copy of the attribution notices contained
127 |        within such NOTICE file, excluding those notices that do not
128 |        pertain to any part of the Derivative Works, in at least one
129 |        of the following places: within a NOTICE text file distributed
130 |        as part of the Derivative Works; within the Source form or
131 |        documentation, if provided along with the Derivative Works; or,
132 |        within a display generated by the Derivative Works, if and
133 |        wherever such third-party notices normally appear. The contents
134 |        of the NOTICE file are for informational purposes only and
135 |        do not modify the License. You may add Your own attribution
136 |        notices within Derivative Works that You distribute, alongside
137 |        or as an addendum to the NOTICE text from the Work, provided
138 |        that such additional attribution notices cannot be construed
139 |        as modifying the License.
140 | 
141 |    You may add Your own copyright statement to Your modifications and
142 |    may provide additional or different license terms and conditions
143 |    for use, reproduction, or distribution of Your modifications, or
144 |    for any such Derivative Works as a whole, provided Your use,
145 |    reproduction, and distribution of the Work otherwise complies with
146 |    the conditions stated in this License.
147 | 
148 | 5. Submission of Contributions. Unless You explicitly state otherwise,
149 |    any Contribution intentionally submitted for inclusion in the Work
150 |    by You to the Licensor shall be under the terms and conditions of
151 |    this License, without any additional terms or conditions.
152 |    Notwithstanding the above, nothing herein shall supersede or modify
153 |    the terms of any separate license agreement you may have executed
154 |    with Licensor regarding such Contributions.
155 | 
156 | 6. Trademarks. This License does not grant permission to use the trade
157 |    names, trademarks, service marks, or product names of the Licensor,
158 |    except as required for reasonable and customary use in describing the
159 |    origin of the Work and reproducing the content of the NOTICE file.
160 | 
161 | 7. Disclaimer of Warranty. Unless required by applicable law or
162 |    agreed to in writing, Licensor provides the Work (and each
163 |    Contributor provides its Contributions) on an "AS IS" BASIS,
164 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
165 |    implied, including, without limitation, any warranties or conditions
166 |    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
167 |    PARTICULAR PURPOSE. You are solely responsible for determining the
168 |    appropriateness of using or redistributing the Work and assume any
169 |    risks associated with Your exercise of permissions under this License.
170 | 
171 | 8. Limitation of Liability. In no event and under no legal theory,
172 |    whether in tort (including negligence), contract, or otherwise,
173 |    unless required by applicable law (such as deliberate and grossly
174 |    negligent acts) or agreed to in writing, shall any Contributor be
175 |    liable to You for damages, including any direct, indirect, special,
176 |    incidental, or consequential damages of any character arising as a
177 |    result of this License or out of the use or inability to use the
178 |    Work (including but not limited to damages for loss of goodwill,
179 |    work stoppage, computer failure or malfunction, or any and all
180 |    other commercial damages or losses), even if such Contributor
181 |    has been advised of the possibility of such damages.
182 | 
183 | 9. Accepting Warranty or Additional Liability. While redistributing
184 |    the Work or Derivative Works thereof, You may choose to offer,
185 |    and charge a fee for, acceptance of support, warranty, indemnity,
186 |    or other liability obligations and/or rights consistent with this
187 |    License. However, in accepting such obligations, You may act only
188 |    on Your own behalf and on Your sole responsibility, not on behalf
189 |    of any other Contributor, and only if You agree to indemnify,
190 |    defend, and hold each Contributor harmless for any liability
191 |    incurred by, or claims asserted against, such Contributor by reason
192 |    of your accepting any such warranty or additional liability.
193 | 
194 | END OF TERMS AND CONDITIONS
195 | ```
196 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Notice
  2 | To better serve Wise business and customer needs, the PipelineWise codebase needs to shrink.
  3 | We have made the difficult decision that, going forward many components of PipelineWise will be removed or incorporated in the main repo.
  4 | The last version before this decision is [v0.64.1](https://github.com/transferwise/pipelinewise/tree/v0.64.1)
  5 | 
  6 | We thank all in the open-source community, that over the past 6 years, have helped to make PipelineWise a robust product for heterogeneous replication of many many Terabytes, daily
  7 | 
  8 | # pipelinewise-transform-field
  9 | 
 10 | [![PyPI version](https://badge.fury.io/py/pipelinewise-transform-field.svg)](https://badge.fury.io/py/pipelinewise-transform-field)
 11 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pipelinewise-transform-field.svg)](https://pypi.org/project/pipelinewise-transform-field/)
 12 | [![License: Apache2](https://img.shields.io/badge/License-Apache2-yellow.svg)](https://opensource.org/licenses/Apache-2.0)
 13 | 
 14 | Transformation component between [Singer](https://www.singer.io/) taps and targets.
 15 | 
 16 | This is a [PipelineWise](https://transferwise.github.io/pipelinewise) compatible component.
 17 | 
 18 | ## How to use it
 19 | 
 20 | The recommended method of running this component is to use it from [PipelineWise](https://transferwise.github.io/pipelinewise). When running it from PipelineWise you don't need to configure this tap with JSON files, and most of things are automated.
 21 | Please check the related documentation at [Transformations](https://transferwise.github.io/pipelinewise/user_guide/transformations.html)
 22 | 
 23 | If you want to run this [Singer](https://singer.io) compatible component independently please read further.
 24 | 
 25 | ## Install
 26 | 
 27 | First, make sure Python 3 is installed on your system or follow these
 28 | installation instructions for [Mac](http://docs.python-guide.org/en/latest/starting/install3/osx/) or
 29 | [Ubuntu](https://www.digitalocean.com/community/tutorials/how-to-install-python-3-and-set-up-a-local-programming-environment-on-ubuntu-16-04).
 30 | 
 31 | It's recommended to use a virtualenv:
 32 | 
 33 | ```bash
 34 |   python3 -m venv venv
 35 |   pip install pipelinewise-transform-field
 36 | ```
 37 | 
 38 | or
 39 | 
 40 | ```bash
 41 |   python3 -m venv venv
 42 |   . venv/bin/activate
 43 |   pip install --upgrade pip setuptools
 44 |   pip install .
 45 | ```
 46 | 
 47 | ### To validate transformations
 48 | 
 49 | `transform-field --validate --config [config.json] --catalog [catalog.json]`
 50 | 
 51 | ### To run
 52 | 
 53 | Put it between a tap and a target with simple unix pipes:
 54 | 
 55 | `some-singer-tap | transform-field --config [config.json] | some-singer-target`
 56 | 
 57 | It's reading incoming messages from STDIN and using `config.json` to transform incoming RECORD messages.
 58 | 
 59 | **Note**: To avoid version conflicts run `tap`, `transform` and `targets` in separate virtual environments.
 60 | 
 61 | ### Transformation types
 62 | 
 63 | The following are the transformation types supported by _pipelinewise-transform-field_:
 64 | 
 65 | * **SET-NULL**: Transforms any input to NULL
 66 | * **HASH**: Transforms string input to hash
 67 | * **HASH-SKIP-FIRST-n**: Transforms string input to hash skipping first n characters, e.g. HASH-SKIP-FIRST-2
 68 | * **MASK-DATE**: Replaces the months and day parts of date columns to be always 1st of Jan
 69 | * **MASK-NUMBER**: Transforms any numeric value to zero
 70 | * **MASK-HIDDEN**: Transforms any string to 'hidden'
 71 | * **MASK-STRING-SKIP-ENDS-n**: Transforms string input to masked version skipping first and last n characters, e.g. MASK-STRING-SKIP-ENDS-3
 72 | 
 73 | _PS_: 1 =< n =< 9
 74 | 
 75 | ### Conditional transformations
 76 | 
 77 | It is possible to transform a record's property based on some given condition(s), the transformation will only take place when all conditions are met.
 78 | 
 79 | A condition is a combination of:
 80 | * column [required]: the field to look up to
 81 | * operation [required]: the comparison type to use, the supported ones are `equals` and `regex_match`.
 82 | * value [required]: the column value to look for in records.
 83 | 
 84 | **An equality condition on a column**
 85 | ```json
 86 | {
 87 |   "column": "<some column name>",
 88 |   "equals": <some important value>
 89 | }
 90 | ```
 91 | 
 92 | **A regex condition on a column**
 93 | ```json
 94 | {
 95 |   "column": "<some column name>",
 96 |   "regex_match": "<some regex pattern>"
 97 | }
 98 | ```
 99 | 
100 | **A condition on a property within a JSON-type column**
101 | ```json
102 | {
103 |   "column": "<some column name>",
104 |   "field_path": "<xpath to property within 'column' object>",
105 |   "equals": <some important value>
106 | }
107 | ```
108 | 
109 | ### Configuration
110 | 
111 | You need to define which columns have to be transformed by which method and in which condition the transformation needs to be applied.
112 | 
113 | #### Basic transformation
114 | A basic transformation is where a field in all a stream records will be transformed can be achieved with:
115 | ```json
116 | {
117 |   "tap_stream_name": "<stream ID>",
118 |   "field_id": "<Name of the field to transform in the record>",
119 |   "type": "<Transformation type>"
120 | }
121 | ```
122 | 
123 | #### Transformation within JSON
124 | 
125 | In order to transform property(ies) within a JSON type field, you can make use of `field_paths` property:
126 | 
127 | ```json
128 | {
129 |   "tap_stream_name": "<stream ID>",
130 |   "field_id": "<Name of the field to transform in the record>",
131 |   "field_paths": ["xpath to property 1", "xpath to property 2"],
132 |   "type": "<Transformation type>"
133 | }
134 | ```
135 | 
136 | #### Conditional Transformation
137 | 
138 | To apply transformation conditionally, you can make use of the property `when` which can have one or many conditions:
139 | 
140 | ```json
141 | {
142 |   "tap_stream_name": "<stream ID>",
143 |   "field_id": "<Name of the field to transform in the record>",
144 |   "type": "<Transformation type>",
145 |   "when": [
146 |     {"column": "string_col_1", "equals": "some value"},
147 |     {"column": "string_col_2", "regex_match": ".*PII.*"},
148 |     {"column": "numeric_col_1", "equals": 33},
149 |     {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive"}
150 |   ]
151 | }
152 | ```
153 | 
154 | **Sample config**
155 | [config.json](./sample_config.json)
156 | 
157 | (Tip: PipelineWise generating this for you from a more readable YAML format)
158 | 
159 | 
160 | ### To check code style:
161 | 
162 | 1. Install python dependencies in a virtual env
163 | ```
164 |   python3 -m venv venv
165 |   . venv/bin/activate
166 |   pip install --upgrade pip setuptools
167 |   pip install .[test]
168 | ```
169 | 
170 | 2. Run pylint
171 | ```shell
172 | pylint transform_field
173 | ```
174 | 
175 | ### To run tests:
176 | 
177 | 1. Install python dependencies in a virtual env and run unit and integration tests
178 | ```
179 |   python3 -m venv venv
180 |   . venv/bin/activate
181 |   pip install --upgrade pip setuptools
182 |   pip install .[test]
183 | ```
184 | 
185 | 2. Run tests:
186 | 
187 | * Unit tests
188 | ```
189 |   pytest -v tests/unit
190 | ```
191 | 
192 | * Integration tests
193 | ```
194 |   pytest -v tests/integration
195 | ```
196 | 
197 | * All tests
198 | ```
199 |   pytest -v tests
200 | ```
201 | 
202 | 
203 | 
204 | ## License
205 | 
206 | Apache License Version 2.0
207 | 
208 | See [LICENSE](LICENSE) to see the full text.
209 | 
210 | 


--------------------------------------------------------------------------------
/sample_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "transformations": [
 3 |     {
 4 |       "field_id": "password_hash",
 5 |       "tap_stream_name": "stream-id-sent-by-the-tap",
 6 |       "type": "MASK-HIDDEN"
 7 |     },
 8 |     {
 9 |       "field_id": "salt",
10 |       "tap_stream_name": "stream-id-sent-by-the-tap",
11 |       "type": "HASH"
12 |     },
13 |     {
14 |       "field_id": "value",
15 |       "tap_stream_name": "stream-id-sent-by-the-tap",
16 |       "type": "SET-NULL",
17 |       "when": [
18 |         {"column": "string_column_1", "equals": "Property" },
19 |         {"column": "numeric_column", "equals": 200 },
20 |         {"column": "string_column_2", "regex_match": "sensitive.*PII" },
21 |         {"column": "json_column", "field_path": "metadata/comment", "regex_match": "sensitive" }
22 |       ]
23 |     },
24 |     {
25 |       "field_id": "metadata",
26 |       "tap_stream_name": "stream-id-sent-by-the-tap",
27 |       "type": "MASK-HIDDEN",
28 |       "field_paths": ["user/address", "user/zip_code"]
29 |     }
30 |   ]
31 | }


--------------------------------------------------------------------------------
/sample_logging.conf:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root
 3 | 
 4 | [handlers]
 5 | keys=stderr
 6 | 
 7 | [formatters]
 8 | keys=child
 9 | 
10 | [logger_root]
11 | level=INFO
12 | handlers=stderr
13 | formatter=child
14 | propagate=0
15 | 
16 | [handler_stderr]
17 | level=INFO
18 | class=StreamHandler
19 | formatter=child
20 | args=(sys.stderr,)
21 | 
22 | [formatter_child]
23 | class=logging.Formatter
24 | format=time=%(asctime)s name=%(name)s level=%(levelname)s message=%(message)s
25 | datefmt=%Y-%m-%d %H:%M:%S
26 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | with open("README.md", "r") as fh:
 6 |     long_description = fh.read()
 7 | 
 8 | setup(name='pipelinewise-transform-field',
 9 |       version='2.3.0',
10 |       description='Singer.io simple field transformer between taps and targets - PipelineWise compatible',
11 |       long_description=long_description,
12 |       long_description_content_type='text/markdown',
13 |       author="Wise",
14 |       url='https://github.com/transferwise/pipelinewise-transform-field',
15 |       classifiers=[
16 |           'License :: OSI Approved :: Apache Software License',
17 |           'Environment :: Console',
18 |           'Programming Language :: Python :: 3 :: Only',
19 |           'Programming Language :: Python :: 3.6',
20 |           'Programming Language :: Python :: 3.7',
21 |           'Programming Language :: Python :: 3.8'
22 |       ],
23 |       py_modules=['transform_field'],
24 |       install_requires=[
25 |           'pipelinewise-singer-python==1.*',
26 |           'dpath==2.0.*',
27 |       ],
28 |       extras_require={
29 |           'test': [
30 |               'pytest==6.2.*',
31 |               'pytest-cov==3.0.*',
32 |               'pylint==2.12.*',
33 |           ]
34 |       },
35 |       entry_points='''
36 |           [console_scripts]
37 |           transform-field=transform_field:main
38 |       ''',
39 |       packages=['transform_field']
40 |       )
41 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/__init__.py


--------------------------------------------------------------------------------
/tests/integration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/integration/__init__.py


--------------------------------------------------------------------------------
/tests/integration/resources/catalog.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "streams": [
 3 |     {
 4 |       "metadata": [
 5 |         {
 6 |           "breadcrumb": [],
 7 |           "metadata": {
 8 |             "replication-method": "FULL_TABLE",
 9 |             "selected": true,
10 |             "selected-by-default": false,
11 |             "table-key-properties": [
12 |               "column_1"
13 |             ]
14 |           }
15 |         }
16 |       ],
17 |       "schema": {
18 |         "properties": {
19 |           "column_1": {
20 |             "format": "date-time",
21 |             "inclusion": "available",
22 |             "type": [
23 |               "null",
24 |               "string"
25 |             ]
26 |           },
27 |           "column_2": {
28 |             "inclusion": "automatic",
29 |             "maximum": 2147483647,
30 |             "minimum": -2147483648,
31 |             "type": [
32 |               "null",
33 |               "integer"
34 |             ]
35 |           },
36 |           "column_3": {
37 |             "inclusion": "automatic",
38 |             "maximum": 2147483647,
39 |             "minimum": -2147483648,
40 |             "type": [
41 |               "null",
42 |               "integer"
43 |             ]
44 |           },
45 |           "column_4": {
46 |             "inclusion": "automatic",
47 |             "maximum": 2147483647,
48 |             "minimum": -2147483648,
49 |             "type": [
50 |               "null",
51 |               "integer"
52 |             ]
53 |           },
54 |           "column_5": {
55 |             "format": "date-time",
56 |             "inclusion": "available",
57 |             "type": [
58 |               "null",
59 |               "string"
60 |             ]
61 |           }
62 |         },
63 |         "type": "object"
64 |       },
65 |       "tap_stream_id": "dummy_stream"
66 |     }
67 |   ]
68 | }


--------------------------------------------------------------------------------
/tests/integration/resources/invalid_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "transformations":[
 3 |     {
 4 |       "tap_stream_name":"dummy_stream",
 5 |       "field_id":"column_1",
 6 |       "type":"SET-NULL"
 7 |     },
 8 |     {
 9 |       "tap_stream_name":"dummy_stream",
10 |       "field_id":"column_2",
11 |       "type":"HASH"
12 |     },
13 |     {
14 |       "tap_stream_name": "dummy_stream",
15 |       "field_id": "column_5",
16 |       "type": "MASK-DATE"
17 |     }
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/integration/resources/invalid_messages.json:
--------------------------------------------------------------------------------
1 | {"type": "STATE", "value": {"currently_syncing": "tap_mysql_test-test_table_one"}}
2 | {"type": "SCHEMA", "stream": "tap_mysql_test-test_table_one", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "c_varchar": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "c_int": {"inclusion": "available", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}}, "type": "object"}, "key_properties": ["c_pk"]}
3 | THIS IS A TEST INPUT FROM A TAP WITH A LINE WITH INVALID JSON
4 | {"type": "ACTIVATE_VERSION", "stream": "tap_mysql_test-test_table_one", "version": 1}
5 | 


--------------------------------------------------------------------------------
/tests/integration/resources/messages.json:
--------------------------------------------------------------------------------
 1 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream"}}
 2 | {"type": "SCHEMA", "stream": "dummy_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_4": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "type": ["null", "integer"]}, "column_7": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_8": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_9": {"inclusion": "available", "type": ["null", "integer"]}, "column_10": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_11": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_12": {"inclusion": "available", "maxLength": 64, "type": ["null", "string"]}, "column_13": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_14": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
 3 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1}
 4 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "Dummy row 1", "column_4": "Dummy row 1", "column_5": "2019-12-21T12:12:45", "column_6": 1234, "column_7": "Dummy row 1", "column_8": "2019-12-21T12:12:45", "column_9": 100, "column_10": "column_11 is safe to keep", "column_11": "My name is John", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 5 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "Dummy row 2", "column_4": "Dummy row 2", "column_5": "2019-12-21T13:12:45", "column_6": 1234, "column_7": "Dummy row 2", "column_8": "2019-12-21T13:12:45", "column_9": 200, "column_10": "column_11 has sensitive data. Needs to transform to NULL", "column_11": "SUPER_SECRET_PASSWORD", "column_12": "abcd", "column_13": "nom", "column_14": "maskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 6 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "Dummy row 3", "column_4": "Dummy row 3", "column_5": "2019-12-21T14:12:45", "column_6": 1234, "column_7": "Dummy row 3", "column_8": "2019-12-21T14:12:45", "column_9": 300, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 7 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 3, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "Dummy row 4", "column_4": "Dummy row 4", "column_5": "2019-12-21T15:12:45", "column_6": 1234, "column_7": "Dummy row 4", "column_8": "2019-12-21T15:12:45", "column_9": 400, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 8 | {"type": "RECORD", "stream": "dummy_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "Dummy row 5", "column_4": "Dummy row 5", "column_5": "2019-12-21T16:12:45", "column_6": 1234, "column_7": "Dummy row 5", "column_8": "2019-12-21T16:12:45", "column_9": 500, "column_10": "Dummy row 1", "column_11": "Dummy row 1", "column_12": "abcd", "column_13": "domaskme", "column_14": "domaskme"}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 9 | {"type": "STATE", "value": {"currently_syncing": "dummy_stream", "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}}
10 | {"type": "ACTIVATE_VERSION", "stream": "dummy_stream", "version": 1}
11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"dummy_stream": {"initial_full_table_complete": true}}}}
12 | 


--------------------------------------------------------------------------------
/tests/integration/resources/streams_with_changing_schema.json:
--------------------------------------------------------------------------------
1 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "integer"]}}}, "key_properties": []}
2 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 1}}
3 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 2}}
4 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": 3}}
5 | {"type": "SCHEMA", "stream":"dummy_stream", "schema": {"properties": {"column_2": {"type": ["null", "string"]}}}, "key_properties": []}
6 | {"type": "RECORD", "stream":"dummy_stream", "record": {"column_2": "ABC"}}


--------------------------------------------------------------------------------
/tests/integration/resources/streams_with_object.json:
--------------------------------------------------------------------------------
 1 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream"}}
 2 | {"type": "SCHEMA", "stream": "my_cool_stream", "schema": {"properties": {"c_pk": {"inclusion": "automatic", "minimum": -2147483648, "maximum": 2147483647, "type": ["null", "integer"]}, "column_1": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_2": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}, "column_3": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_4": {"inclusion": "available", "type": ["null", "integer"]}, "column_5": {"format": "date-time", "inclusion": "available", "type": ["null", "string"]}, "column_6": {"inclusion": "available", "maxLength": 16, "type": ["null", "string"]}}, "type": "object"}, "key_properties": ["c_pk"]}
 3 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1}
 4 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 1, "column_1": "Dummy row 1", "column_2": "Dummy row 1", "column_3": "2019-12-21T12:12:45", "column_4": 1234, "column_5": "2021-12-21T12:12:45", "column_6": {"id":  50, "key1": "A", "key2":  {"key2_2": 41}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 5 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 2, "column_1": "Dummy row 2", "column_2": "Dummy row 2", "column_3": "2019-12-21T13:12:45", "column_4": 4, "column_5": "2021-12-21T13:12:45", "column_6": {"id":  51, "key1": "B", "key2":  {"key2_1":  "ds"}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 6 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 3, "column_1": "Dummy row 3", "column_2": "Dummy row 3", "column_3": "2019-12-21T14:12:45", "column_4": 15, "column_5": "2021-12-21T14:12:45", "column_6": {"id":  52, "key1": "C", "key2":  {"key2_1":  "xv43dgf", "key2_2": 4544}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 7 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 4, "column_1": "Dummy row 4", "column_2": "Dummy row 4", "column_3": "2019-12-21T15:12:45", "column_4": 1000, "column_5": "2021-12-21T15:12:45", "column_6": {"id":  53, "key1": "D", "key2":  {"key2_1":  "43xvf", "key2_2": true}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 8 | {"type": "RECORD", "stream": "my_cool_stream", "record": {"c_pk": 5, "column_1": "Dummy row 5", "column_2": "Dummy row 5", "column_3": "2019-12-21T16:12:45", "column_4": -44, "column_5": "2021-12-21T16:12:45", "column_6": {"id":  54, "key1": "E", "key2":  {"key2_1":  "trter", "key2_3": false}}}, "version": 1, "time_extracted": "2019-01-31T15:51:50.215998Z"}
 9 | {"type": "STATE", "value": {"currently_syncing": "my_cool_stream", "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}}
10 | {"type": "ACTIVATE_VERSION", "stream": "my_cool_stream", "version": 1}
11 | {"type": "STATE", "value": {"currently_syncing": null, "bookmarks": {"my_cool_stream": {"initial_full_table_complete": true}}}}
12 | 


--------------------------------------------------------------------------------
/tests/integration/resources/valid_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "transformations":[
 3 |     {
 4 |       "tap_stream_name":"dummy_stream",
 5 |       "field_id":"column_1",
 6 |       "type":"SET-NULL"
 7 |     },
 8 |     {
 9 |       "tap_stream_name":"dummy_stream",
10 |       "field_id":"column_2",
11 |       "type":"MASK-NUMBER"
12 |     },
13 |     {
14 |       "tap_stream_name": "dummy_stream",
15 |       "field_id": "column_5",
16 |       "type": "MASK-DATE"
17 |     }
18 |   ]
19 | }
20 | 


--------------------------------------------------------------------------------
/tests/integration/test_integrations.py:
--------------------------------------------------------------------------------
  1 | import subprocess
  2 | import unittest
  3 | import os
  4 | import sys
  5 | import json
  6 | import tempfile
  7 | 
  8 | from transform_field import TransformField, TransformFieldException, InvalidTransformationException
  9 | 
 10 | 
 11 | class Base(unittest.TestCase):
 12 |     def setUp(self):
 13 |         self.maxDiff = None
 14 | 
 15 |         sys.stdout = self._stdout = tempfile.NamedTemporaryFile('w+', delete=True)
 16 |         sys.stderr.write(self._stdout.name + ' ')
 17 | 
 18 |     def teardown(self):
 19 |         self._stdout.close()
 20 |         sys.stdout = sys.__stdout__
 21 | 
 22 |     @property
 23 |     def stdout(self):
 24 |         self._stdout.seek(0)
 25 |         return self._stdout.read()[:-1]  # Remove trailing \n:w
 26 | 
 27 |     def get_tap_input_messages(self, filename):
 28 |         lines = []
 29 |         with open('{}/resources/{}'.format(os.path.dirname(__file__), filename)) as tap_stdout:
 30 |             for line in tap_stdout.readlines():
 31 |                 lines.append(line)
 32 | 
 33 |         return lines
 34 | 
 35 |     def singer_output_to_objects(self, output):
 36 |         messages = []
 37 |         for message in output.splitlines():
 38 |             messages.append(json.loads(message))
 39 | 
 40 |         return messages
 41 | 
 42 | 
 43 | class TestIntegration(Base):
 44 | 
 45 |     def test_invalid_json(self):
 46 |         """Receiving invalid JSONs should raise an exception"""
 47 |         tap_lines = self.get_tap_input_messages('invalid_messages.json')
 48 |         trans_config = {'transformations': []}
 49 | 
 50 |         transform_field = TransformField(trans_config)
 51 |         with self.assertRaises(TransformFieldException):
 52 |             transform_field.consume(tap_lines)
 53 | 
 54 |     def test_multiple_singer_json_messages(self):
 55 |         """Test a bunch of singer messages with different field transformation types"""
 56 |         tap_lines = self.get_tap_input_messages('messages.json')
 57 | 
 58 |         # Set transformations on some columns
 59 |         trans_config = {'transformations': [
 60 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_1', 'type': 'SET-NULL'},
 61 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'HASH'},
 62 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_3', 'type': 'HASH-SKIP-FIRST-2'},
 63 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_4', 'type': 'HASH-SKIP-FIRST-3'},
 64 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_5', 'type': 'MASK-DATE'},
 65 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_6', 'type': 'MASK-NUMBER'},
 66 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_11', 'type': 'SET-NULL',
 67 |              'when': [
 68 |                  {'column': 'column_7', 'equals': "Dummy row 2"},
 69 |                  {'column': 'column_9', 'equals': 200},
 70 |                  {'column': 'column_10', 'regex_match': 'sensitive'},
 71 |              ]
 72 |              },
 73 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_12', 'type': 'MASK-HIDDEN'},
 74 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_13', 'type': 'MASK-STRING-SKIP-ENDS-2'},
 75 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_14', 'type': 'MASK-STRING-SKIP-ENDS-3'}
 76 |         ]}
 77 | 
 78 |         transform_field = TransformField(trans_config)
 79 |         transform_field.consume(tap_lines)
 80 | 
 81 |         singer_output_messages = self.singer_output_to_objects(self.stdout)
 82 | 
 83 |         # First message is the STATE message
 84 |         self.assertEqual(
 85 |             singer_output_messages[0],
 86 |             {
 87 |                 'type': 'STATE',
 88 |                 'value': {'currently_syncing': 'dummy_stream'}
 89 |             }
 90 |         )
 91 | 
 92 |         # Second message is the SCHEMA message
 93 |         self.assertEqual(
 94 |             singer_output_messages[1],
 95 |             {
 96 |                 'type': 'SCHEMA',
 97 |                 'stream': 'dummy_stream',
 98 |                 'schema': {
 99 |                     'properties': {
100 |                         'c_pk': {'inclusion': 'automatic', 'minimum': -2147483648, 'maximum': 2147483647,
101 |                                  'type': ['null', 'integer']},
102 |                         'column_1': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
103 |                         'column_2': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
104 |                         'column_3': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
105 |                         'column_4': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
106 |                         'column_5': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']},
107 |                         'column_6': {'inclusion': 'available', 'type': ['null', 'integer']},
108 |                         'column_7': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
109 |                         'column_8': {'inclusion': 'available', 'format': 'date-time', 'type': ['null', 'string']},
110 |                         'column_9': {'inclusion': 'available', 'type': ['null', 'integer']},
111 |                         'column_10': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
112 |                         'column_11': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
113 |                         'column_12': {'inclusion': 'available', 'maxLength': 64, 'type': ['null', 'string']},
114 |                         'column_13': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
115 |                         'column_14': {'inclusion': 'available', 'maxLength': 16, 'type': ['null', 'string']},
116 |                     },
117 |                     'type': 'object'
118 |                 },
119 |                 'key_properties': ['c_pk']
120 |             }
121 |         )
122 | 
123 |         # Third message is a RECORD message with transformed values 
124 |         self.assertEqual(
125 |             singer_output_messages[2],
126 |             {
127 |                 'type': 'RECORD',
128 |                 'stream': 'dummy_stream',
129 |                 'record': {
130 |                     'c_pk': 1,
131 |                     'column_1': None,  # should be SET-NULL transformed
132 |                     'column_2': 'c584d22683f3e523df9a7396e7939c0da16af89976b613adfe4bcd4c9c526f32',
133 |                     # Should be HASH transformed
134 |                     'column_3': 'Ducd571661edac8d47669a60b964c7124b228b69862cd21d548794af41c139a8e3',
135 |                     # Should be HASH-SKIP-2 tranformed
136 |                     'column_4': 'Dum1fe9627d907b0a37a31b270cc0f660a7388eb470a2558e839e0c1f601aedfaa7',
137 |                     # Should be HASH-SKIP-3 tranformed
138 |                     'column_5': '2019-01-01T12:12:45',  # Should be MASK-DATE transformed
139 |                     'column_6': 0,  # Should be MASK-NUMBER transformed
140 |                     'column_7': 'Dummy row 1',  # Should be the originl value - Unknown transformation type
141 |                     'column_8': '2019-12-21T12:12:45',  # Should be the original date-time value
142 |                     'column_9': 100,  # Should be the original number value
143 | 
144 |                     # Conditional transformation
145 |                     'column_10': 'column_11 is safe to keep',
146 |                     'column_11': 'My name is John',
147 | 
148 |                     'column_12': 'hidden',
149 | 
150 |                     # Should be MASK-STRING-SKIP-ENDS-2 transformed
151 |                     'column_13': 'do****me',
152 |                     # Should be MASK-STRING-SKIP-ENDS-3 transformed
153 |                     'column_14': 'dom**kme',
154 |                 },
155 |                 'version': 1,
156 |                 'time_extracted': '2019-01-31T15:51:50.215998Z'
157 |             }
158 |         )
159 | 
160 |         # Third message is a RECORD message with transformed values 
161 |         self.assertEqual(
162 |             singer_output_messages[3],
163 |             {
164 |                 'type': 'RECORD',
165 |                 'stream': 'dummy_stream',
166 |                 'record': {
167 |                     'c_pk': 2,
168 |                     'column_1': None,  # should be SET-NULL transformed
169 |                     'column_2': '12c7ca803f4ae4044b8c3a6aa7dbaf9fe73a25e12f2258dbf8a832961ac6abab',
170 |                     # Should be HASH tranformed
171 |                     'column_3': 'Du7c2717bbc7489d36cea73c8519c815ce962142a5b32db413abe0bce7f58d943f',
172 |                     # Should be HASH-SKIP-3 tranformed
173 |                     'column_4': 'Dum5b2be872199a84657234144caec9106483a522edd36783c7a12439bcf3853c56',
174 |                     # Should be HASH-SKIP-3 tranformed
175 |                     'column_5': '2019-01-01T13:12:45',  # Should be MASK-DATE transformed
176 |                     'column_6': 0,  # Should be MASK-NUMBER transformed
177 |                     'column_7': 'Dummy row 2',  # Should be the origian value - Unknown transformation type
178 |                     'column_8': '2019-12-21T13:12:45',  # Should be the original date-time value
179 |                     'column_9': 200,  # Should be the original number value
180 | 
181 |                     # Conditional transformation
182 |                     'column_10': 'column_11 has sensitive data. Needs to transform to NULL',
183 |                     'column_11': None,  # Should be SET-NULL transformed
184 | 
185 |                     'column_12': 'hidden',
186 | 
187 |                     # Should be MASK-STRING-SKIP-ENDS-2 transformed
188 |                     'column_13': '***',
189 |                     # Should be MASK-STRING-SKIP-ENDS-3 transformed
190 |                     'column_14': '******',
191 |                 },
192 |                 'version': 1,
193 |                 'time_extracted': '2019-01-31T15:51:50.215998Z'
194 |             }
195 |         )
196 | 
197 |     def test_messages_with_changing_schema(self):
198 |         """Test a bunch of singer messages where a column in schema message
199 |         changes its type"""
200 |         tap_lines = self.get_tap_input_messages('streams_with_changing_schema.json')
201 | 
202 |         # Set transformations on some columns
203 |         trans_config = {'transformations': [
204 |             {'tap_stream_name': 'dummy_stream', 'field_id': 'column_2', 'type': 'MASK-NUMBER'},
205 |         ]}
206 | 
207 |         transform_field = TransformField(trans_config)
208 | 
209 |         with self.assertRaises(InvalidTransformationException):
210 |             transform_field.consume(tap_lines)
211 | 
212 |     def test_validate_flag_with_invalid_transformations(self):
213 |         config = '{}/resources/invalid_config.json'.format(os.path.dirname(__file__))
214 |         catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__))
215 | 
216 |         result = subprocess.run([
217 |             'transform-field',
218 |             '--validate',
219 |             '--config', config,
220 |             '--catalog', catalog,
221 |         ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
222 | 
223 |         with self.assertRaises(subprocess.CalledProcessError):
224 |             result.check_returncode()
225 | 
226 |     def test_validate_flag_with_valid_transformations(self):
227 | 
228 |         config = '{}/resources/valid_config.json'.format(os.path.dirname(__file__))
229 |         catalog = '{}/resources/catalog.json'.format(os.path.dirname(__file__))
230 | 
231 |         result = subprocess.run([
232 |             'transform-field',
233 |             '--validate',
234 |             '--config', config,
235 |             '--catalog', catalog,
236 |         ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
237 | 
238 |         self.assertIsNone(result.check_returncode())
239 | 
240 |     def test_multiple_singer_json_messages_with_transformation_on_json(self):
241 |         """Test a bunch of singer messages with transformation on json"""
242 |         tap_lines = self.get_tap_input_messages('streams_with_object.json')
243 | 
244 |         # Set transformations on some columns
245 |         trans_config = {'transformations': [
246 |             {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_1', 'type': 'SET-NULL'},
247 |             {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_2', 'type': 'MASK-HIDDEN'},
248 |             {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_3', 'type': 'MASK-DATE',
249 |              'when': [
250 |                  {'column': 'c_pk', 'equals': 2},
251 |                  {'column': 'column_6', 'field_path': 'key1', 'equals': 'B'}
252 |              ]
253 |              },
254 |             {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_4', 'type': 'MASK-NUMBER',
255 |                 'when': [
256 |                     {'column': 'column_4', 'equals': -44},
257 |                 ]
258 |              },
259 |             {'tap_stream_name': 'my_cool_stream', 'field_id': 'column_6', 'type': 'SET-NULL',
260 |              'field_paths': ['key2/key2_2']},
261 |         ]}
262 | 
263 |         transform_field = TransformField(trans_config)
264 |         transform_field.consume(tap_lines)
265 | 
266 |         records = [msg['record'] for msg in self.singer_output_to_objects(self.stdout) if msg['type'] == 'RECORD']
267 | 
268 |         self.assertListEqual(records, [
269 |             {
270 |                 'c_pk': 1,
271 |                 'column_1': None,
272 |                 'column_2': 'hidden',
273 |                 'column_3': '2019-12-21T12:12:45',
274 |                 'column_4': 1234,
275 |                 'column_5': '2021-12-21T12:12:45',
276 |                 'column_6': {'id': 50, 'key1': 'A', 'key2': {'key2_2': None}},
277 |             },
278 |             {
279 |                 'c_pk': 2,
280 |                 'column_1': None,
281 |                 'column_2': 'hidden',
282 |                 'column_3': '2019-01-01T13:12:45',
283 |                 'column_4': 4,
284 |                 'column_5': '2021-12-21T13:12:45',
285 |                 'column_6': {'id': 51, 'key1': 'B', 'key2': {'key2_1': 'ds'}},
286 |             },
287 |             {
288 |                 'c_pk': 3,
289 |                 'column_1': None,
290 |                 'column_2': 'hidden',
291 |                 'column_3': '2019-12-21T14:12:45',
292 |                 'column_4': 15,
293 |                 'column_5': '2021-12-21T14:12:45',
294 |                 'column_6': {'id': 52, 'key1': 'C', 'key2': {'key2_1': 'xv43dgf', 'key2_2': None}},
295 |             },
296 |             {
297 |                 'c_pk': 4,
298 |                 'column_1': None,
299 |                 'column_2': 'hidden',
300 |                 'column_3': '2019-12-21T15:12:45',
301 |                 'column_4': 1000,
302 |                 'column_5': '2021-12-21T15:12:45',
303 |                 'column_6': {'id': 53, 'key1': 'D', 'key2': {'key2_1': '43xvf', 'key2_2': None}},
304 |             },
305 |             {
306 |                 'c_pk': 5,
307 |                 'column_1': None,
308 |                 'column_2': 'hidden',
309 |                 'column_3': '2019-12-21T16:12:45',
310 |                 'column_4': 0,
311 |                 'column_5': '2021-12-21T16:12:45',
312 |                 'column_6': {'id': 54, 'key1': 'E', 'key2': {'key2_1': 'trter', 'key2_3': False}},
313 |             },
314 |         ])
315 | 


--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/transferwise/pipelinewise-transform-field/af22b03ece9144948702236d9f2f5aaed845ac16/tests/unit/__init__.py


--------------------------------------------------------------------------------
/tests/unit/test_init.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | from unittest.mock import patch
  3 | 
  4 | from singer import Catalog, Schema
  5 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, NoStreamSchemaException, \
  6 |     UnsupportedTransformationTypeException, InvalidTransformationException
  7 | 
  8 | from transform_field import TransformField, TransMeta
  9 | 
 10 | 
 11 | class TestTransformField(unittest.TestCase):
 12 |     """
 13 |     Unit Tests for the TransformField class
 14 |     """
 15 | 
 16 |     def setUp(self) -> None:
 17 |         self.config = {
 18 |             'transformations': [
 19 |                 {
 20 |                     "tap_stream_name": "stream_1",
 21 |                     "field_id": "column_1",
 22 |                     "type": "SET-NULL"
 23 |                 },
 24 |                 {
 25 |                     "tap_stream_name": "stream_1",
 26 |                     "field_id": "column_2",
 27 |                     "type": "HASH",
 28 |                     "when": []
 29 |                 },
 30 |                 {
 31 |                     "tap_stream_name": "stream_2",
 32 |                     "field_id": "column_1",
 33 |                     "type": "MASK-DATE"
 34 |                 },
 35 |             ]
 36 |         }
 37 | 
 38 |     def test_init(self):
 39 |         instance = TransformField(self.config)
 40 | 
 41 |         self.assertListEqual(instance.messages, [])
 42 |         self.assertEqual(instance.buffer_size_bytes, 0)
 43 |         self.assertIsNone(instance.state)
 44 |         self.assertIsNotNone(instance.time_last_batch_sent)
 45 |         self.assertDictEqual(instance.trans_config, self.config)
 46 |         self.assertDictEqual(instance.stream_meta, {})
 47 |         self.assertDictEqual(instance.trans_meta, {
 48 |             'stream_1': [
 49 |                 TransMeta('column_1', 'SET-NULL', None, None),
 50 |                 TransMeta('column_2', 'HASH', [], None),
 51 |             ],
 52 |             'stream_2': [TransMeta('column_1', 'MASK-DATE', None, None)],
 53 |         })
 54 | 
 55 |     def test_validate_without_catalog_fails(self):
 56 |         with self.assertRaises(CatalogRequiredException):
 57 |             TransformField(self.config).validate(None)
 58 | 
 59 |     @patch('transform_field.utils.get_stream_schemas')
 60 |     def test_validate_with_missing_stream_fails(self, get_stream_schemas_mock):
 61 |         catalog = Catalog.from_dict({'streams': []})
 62 | 
 63 |         get_stream_schemas_mock.return_value = {
 64 |             'stream_2': {'something'}
 65 |         }
 66 |         with self.assertRaises(StreamNotFoundException):
 67 |             TransformField(self.config).validate(catalog)
 68 | 
 69 |     @patch('transform_field.utils.get_stream_schemas')
 70 |     def test_validate_with_empty_stream_schema_fails(self, get_stream_schemas_mock):
 71 |         catalog = Catalog.from_dict({'streams': []})
 72 | 
 73 |         get_stream_schemas_mock.return_value = {
 74 |             'stream_1': {},
 75 |             'stream_2': {'something'}
 76 |         }
 77 |         with self.assertRaises(NoStreamSchemaException):
 78 |             TransformField(self.config).validate(catalog)
 79 | 
 80 |     @patch('transform_field.utils.get_stream_schemas')
 81 |     def test_validate_with_unsupported_trans_type(self, get_stream_schemas_mock):
 82 |         config = {
 83 |             'transformations': [
 84 |                 {
 85 |                     "tap_stream_name": "stream_1",
 86 |                     "field_id": "column_1",
 87 |                     "type": "SET-RANDOM"
 88 |                 },
 89 |             ]
 90 |         }
 91 | 
 92 |         catalog = Catalog.from_dict({'streams': []})
 93 | 
 94 |         get_stream_schemas_mock.return_value = {
 95 |             'stream_1': Schema.from_dict({'properties': {
 96 |                 'column_1': {
 97 |                     'type': [
 98 |                         'string'
 99 |                     ]
100 |                 }
101 |             }})
102 |         }
103 |         with self.assertRaises(UnsupportedTransformationTypeException):
104 |             TransformField(config).validate(catalog)
105 | 
106 |     @patch('transform_field.utils.get_stream_schemas')
107 |     def test_validate_with_set_null_trans_type_success(self, get_stream_schemas_mock):
108 |         config = {
109 |             'transformations': [
110 |                 {
111 |                     "tap_stream_name": "stream_1",
112 |                     "field_id": "column_1",
113 |                     "type": "SET-NULL"
114 |                 },
115 |             ]
116 |         }
117 | 
118 |         catalog = Catalog.from_dict({'streams': []})
119 | 
120 |         get_stream_schemas_mock.return_value = {
121 |             'stream_1': Schema.from_dict({'properties': {
122 |                 'column_1': {
123 |                     'type': [
124 |                         'string'
125 |                     ]
126 |                 }
127 |             }})
128 |         }
129 |         TransformField(config).validate(catalog)
130 | 
131 |     @patch('transform_field.utils.get_stream_schemas')
132 |     def test_validate_with_hash_fails_1(self, get_stream_schemas_mock):
133 |         """
134 |         Testing validation of HASH transformation when field has no type
135 |         """
136 |         config = {
137 |             'transformations': [
138 |                 {
139 |                     "tap_stream_name": "stream_1",
140 |                     "field_id": "column_1",
141 |                     "type": "HASH"
142 |                 },
143 |             ]
144 |         }
145 | 
146 |         catalog = Catalog.from_dict({'streams': []})
147 | 
148 |         get_stream_schemas_mock.return_value = {
149 |             'stream_1': Schema.from_dict({'properties': {
150 |                 'column_1': {}
151 |             }})
152 |         }
153 |         with self.assertRaises(InvalidTransformationException):
154 |             TransformField(config).validate(catalog)
155 | 
156 |     @patch('transform_field.utils.get_stream_schemas')
157 |     def test_validate_with_hash_fails_2(self, get_stream_schemas_mock):
158 |         """
159 |         Testing validation of HASH transformation when field has non-string type
160 |         """
161 |         config = {
162 |             'transformations': [
163 |                 {
164 |                     "tap_stream_name": "stream_1",
165 |                     "field_id": "column_1",
166 |                     "type": "HASH"
167 |                 },
168 |             ]
169 |         }
170 | 
171 |         catalog = Catalog.from_dict({'streams': []})
172 | 
173 |         get_stream_schemas_mock.return_value = {
174 |             'stream_1': Schema.from_dict({'properties': {
175 |                 'column_1': {
176 |                     'type': [
177 |                         'null',
178 |                         'integer'
179 |                     ]
180 |                 }
181 |             }})
182 |         }
183 |         with self.assertRaises(InvalidTransformationException):
184 |             TransformField(config).validate(catalog)
185 | 
186 |     @patch('transform_field.utils.get_stream_schemas')
187 |     def test_validate_with_hash_fails_3(self, get_stream_schemas_mock):
188 |         """
189 |         Testing validation of HASH transformation when field has string type but formatted
190 |         """
191 |         config = {
192 |             'transformations': [
193 |                 {
194 |                     "tap_stream_name": "stream_1",
195 |                     "field_id": "column_1",
196 |                     "type": "HASH"
197 |                 },
198 |             ]
199 |         }
200 | 
201 |         catalog = Catalog.from_dict({'streams': []})
202 | 
203 |         get_stream_schemas_mock.return_value = {
204 |             'stream_1': Schema.from_dict({'properties': {
205 |                 'column_1': {
206 |                     'type': [
207 |                         'null',
208 |                         'string'
209 |                     ],
210 |                     'format': 'binary'
211 |                 }
212 |             }})
213 |         }
214 |         with self.assertRaises(InvalidTransformationException):
215 |             TransformField(config).validate(catalog)
216 | 
217 |     @patch('transform_field.utils.get_stream_schemas')
218 |     def test_validate_with_hash_success(self, get_stream_schemas_mock):
219 |         """
220 |         Testing validation of HASH transformation when field has string type but no format
221 |         """
222 |         config = {
223 |             'transformations': [
224 |                 {
225 |                     "tap_stream_name": "stream_1",
226 |                     "field_id": "column_1",
227 |                     "type": "HASH"
228 |                 },
229 |             ]
230 |         }
231 | 
232 |         catalog = Catalog.from_dict({'streams': []})
233 | 
234 |         get_stream_schemas_mock.return_value = {
235 |             'stream_1': Schema.from_dict({'properties': {
236 |                 'column_1': {
237 |                     'type': [
238 |                         'null',
239 |                         'string'
240 |                     ]
241 |                 }
242 |             }})
243 |         }
244 |         TransformField(config).validate(catalog)
245 | 
246 |     @patch('transform_field.utils.get_stream_schemas')
247 |     def test_validate_with_hash_skip_first_fails_1(self, get_stream_schemas_mock):
248 |         """
249 |         Testing validation of HASH-SKIP-FIRST transformation when field has no type
250 |         """
251 |         config = {
252 |             'transformations': [
253 |                 {
254 |                     "tap_stream_name": "stream_1",
255 |                     "field_id": "column_1",
256 |                     "type": "HASH-SKIP-FIRST-1"
257 |                 },
258 |             ]
259 |         }
260 | 
261 |         catalog = Catalog.from_dict({'streams': []})
262 | 
263 |         get_stream_schemas_mock.return_value = {
264 |             'stream_1': Schema.from_dict({'properties': {
265 |                 'column_1': {}
266 |             }})
267 |         }
268 |         with self.assertRaises(InvalidTransformationException):
269 |             TransformField(config).validate(catalog)
270 | 
271 |     @patch('transform_field.utils.get_stream_schemas')
272 |     def test_validate_with_hash_skip_first_fails_2(self, get_stream_schemas_mock):
273 |         """
274 |         Testing validation of HASH-SKIP-FIRST transformation when field has non-string type
275 |         """
276 |         config = {
277 |             'transformations': [
278 |                 {
279 |                     "tap_stream_name": "stream_1",
280 |                     "field_id": "column_1",
281 |                     "type": "HASH-SKIP-FIRST-1"
282 |                 },
283 |             ]
284 |         }
285 | 
286 |         catalog = Catalog.from_dict({'streams': []})
287 | 
288 |         get_stream_schemas_mock.return_value = {
289 |             'stream_1': Schema.from_dict({'properties': {
290 |                 'column_1': {
291 |                     'type': [
292 |                         'null',
293 |                         'integer'
294 |                     ]
295 |                 }
296 |             }})
297 |         }
298 |         with self.assertRaises(InvalidTransformationException):
299 |             TransformField(config).validate(catalog)
300 | 
301 |     @patch('transform_field.utils.get_stream_schemas')
302 |     def test_validate_with_hash_skip_first_fails_3(self, get_stream_schemas_mock):
303 |         """
304 |         Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type but formatted
305 |         """
306 |         config = {
307 |             'transformations': [
308 |                 {
309 |                     "tap_stream_name": "stream_1",
310 |                     "field_id": "column_1",
311 |                     "type": "HASH-SKIP-FIRST-1"
312 |                 },
313 |             ]
314 |         }
315 | 
316 |         catalog = Catalog.from_dict({'streams': []})
317 | 
318 |         get_stream_schemas_mock.return_value = {
319 |             'stream_1': Schema.from_dict({'properties': {
320 |                 'column_1': {
321 |                     'type': [
322 |                         'null',
323 |                         'string'
324 |                     ],
325 |                     'format': 'binary'
326 |                 }
327 |             }})
328 |         }
329 |         with self.assertRaises(InvalidTransformationException):
330 |             TransformField(config).validate(catalog)
331 | 
332 |     @patch('transform_field.utils.get_stream_schemas')
333 |     def test_validate_with_hash_skip_first_success(self, get_stream_schemas_mock):
334 |         """
335 |         Testing validation of HASH-SKIP-FIRST-1 transformation when field has string type but not formatted
336 |         """
337 |         config = {
338 |             'transformations': [
339 |                 {
340 |                     "tap_stream_name": "stream_1",
341 |                     "field_id": "column_1",
342 |                     "type": "HASH-SKIP-FIRST-1"
343 |                 },
344 |             ]
345 |         }
346 | 
347 |         catalog = Catalog.from_dict({'streams': []})
348 | 
349 |         get_stream_schemas_mock.return_value = {
350 |             'stream_1': Schema.from_dict({'properties': {
351 |                 'column_1': {
352 |                     'type': [
353 |                         'null',
354 |                         'string'
355 |                     ]
356 |                 }
357 |             }})
358 |         }
359 |         TransformField(config).validate(catalog)
360 | 
361 |     @patch('transform_field.utils.get_stream_schemas')
362 |     def test_validate_with_mask_hidden_fails_1(self, get_stream_schemas_mock):
363 |         """
364 |         Testing validation of MASK-HIDDEN transformation when field has no type
365 |         """
366 |         config = {
367 |             'transformations': [
368 |                 {
369 |                     "tap_stream_name": "stream_1",
370 |                     "field_id": "column_1",
371 |                     "type": "MASK-HIDDEN"
372 |                 },
373 |             ]
374 |         }
375 | 
376 |         catalog = Catalog.from_dict({'streams': []})
377 | 
378 |         get_stream_schemas_mock.return_value = {
379 |             'stream_1': Schema.from_dict({'properties': {
380 |                 'column_1': {}
381 |             }})
382 |         }
383 |         with self.assertRaises(InvalidTransformationException):
384 |             TransformField(config).validate(catalog)
385 | 
386 |     @patch('transform_field.utils.get_stream_schemas')
387 |     def test_validate_with_mask_hidden_fails_2(self, get_stream_schemas_mock):
388 |         """
389 |         Testing validation of MASK-HIDDEN transformation when field has non-string type
390 |         """
391 |         config = {
392 |             'transformations': [
393 |                 {
394 |                     "tap_stream_name": "stream_1",
395 |                     "field_id": "column_1",
396 |                     "type": "MASK-HIDDEN"
397 |                 },
398 |             ]
399 |         }
400 | 
401 |         catalog = Catalog.from_dict({'streams': []})
402 | 
403 |         get_stream_schemas_mock.return_value = {
404 |             'stream_1': Schema.from_dict({'properties': {
405 |                 'column_1': {
406 |                     'type': [
407 |                         'null',
408 |                         'integer'
409 |                     ]
410 |                 }
411 |             }})
412 |         }
413 |         with self.assertRaises(InvalidTransformationException):
414 |             TransformField(config).validate(catalog)
415 | 
416 |     @patch('transform_field.utils.get_stream_schemas')
417 |     def test_validate_with_mask_hidden_fails_3(self, get_stream_schemas_mock):
418 |         """
419 |         Testing validation of MASK-HIDDEN transformation when field has string type but formatted
420 |         """
421 |         config = {
422 |             'transformations': [
423 |                 {
424 |                     "tap_stream_name": "stream_1",
425 |                     "field_id": "column_1",
426 |                     "type": "MASK-HIDDEN"
427 |                 },
428 |             ]
429 |         }
430 | 
431 |         catalog = Catalog.from_dict({'streams': []})
432 | 
433 |         get_stream_schemas_mock.return_value = {
434 |             'stream_1': Schema.from_dict({'properties': {
435 |                 'column_1': {
436 |                     'type': [
437 |                         'null',
438 |                         'string'
439 |                     ],
440 |                     'format': 'binary'
441 |                 }
442 |             }})
443 |         }
444 |         with self.assertRaises(InvalidTransformationException):
445 |             TransformField(config).validate(catalog)
446 | 
447 |     @patch('transform_field.utils.get_stream_schemas')
448 |     def test_validate_with_mask_hidden_success(self, get_stream_schemas_mock):
449 |         """
450 |         Testing validation of MASK-HIDDEN transformation when field has string type but not formatted
451 |         """
452 |         config = {
453 |             'transformations': [
454 |                 {
455 |                     "tap_stream_name": "stream_1",
456 |                     "field_id": "column_1",
457 |                     "type": "MASK-HIDDEN"
458 |                 },
459 |             ]
460 |         }
461 | 
462 |         catalog = Catalog.from_dict({'streams': []})
463 | 
464 |         get_stream_schemas_mock.return_value = {
465 |             'stream_1': Schema.from_dict({'properties': {
466 |                 'column_1': {
467 |                     'type': [
468 |                         'null',
469 |                         'string'
470 |                     ]
471 |                 }
472 |             }})
473 |         }
474 |         TransformField(config).validate(catalog)
475 | 
476 |     @patch('transform_field.utils.get_stream_schemas')
477 |     def test_validate_with_mask_date_fails_1(self, get_stream_schemas_mock):
478 |         """
479 |         Testing validation of MASK-DATE transformation when field has no type
480 |         """
481 |         config = {
482 |             'transformations': [
483 |                 {
484 |                     "tap_stream_name": "stream_1",
485 |                     "field_id": "column_1",
486 |                     "type": "MASK-DATE"
487 |                 },
488 |             ]
489 |         }
490 | 
491 |         catalog = Catalog.from_dict({'streams': []})
492 | 
493 |         get_stream_schemas_mock.return_value = {
494 |             'stream_1': Schema.from_dict({'properties': {
495 |                 'column_1': {}
496 |             }})
497 |         }
498 |         with self.assertRaises(InvalidTransformationException):
499 |             TransformField(config).validate(catalog)
500 | 
501 |     @patch('transform_field.utils.get_stream_schemas')
502 |     def test_validate_with_mask_date_fails_2(self, get_stream_schemas_mock):
503 |         """
504 |         Testing validation of MASK-DATE transformation when field has string type but no format
505 |         """
506 |         config = {
507 |             'transformations': [
508 |                 {
509 |                     "tap_stream_name": "stream_1",
510 |                     "field_id": "column_1",
511 |                     "type": "MASK-DATE"
512 |                 },
513 |             ]
514 |         }
515 | 
516 |         catalog = Catalog.from_dict({'streams': []})
517 | 
518 |         get_stream_schemas_mock.return_value = {
519 |             'stream_1': Schema.from_dict({'properties': {
520 |                 'column_1': {
521 |                     'type': [
522 |                         'null',
523 |                         'string'
524 |                     ]
525 |                 }
526 |             }})
527 |         }
528 |         with self.assertRaises(InvalidTransformationException):
529 |             TransformField(config).validate(catalog)
530 | 
531 |     @patch('transform_field.utils.get_stream_schemas')
532 |     def test_validate_with_mask_date_fails_3(self, get_stream_schemas_mock):
533 |         """
534 |         Testing validation of MASK-DATE transformation when field has non-string type
535 |         """
536 |         config = {
537 |             'transformations': [
538 |                 {
539 |                     "tap_stream_name": "stream_1",
540 |                     "field_id": "column_1",
541 |                     "type": "MASK-DATE"
542 |                 },
543 |             ]
544 |         }
545 | 
546 |         catalog = Catalog.from_dict({'streams': []})
547 | 
548 |         get_stream_schemas_mock.return_value = {
549 |             'stream_1': Schema.from_dict({'properties': {
550 |                 'column_1': {
551 |                     'type': [
552 |                         'null',
553 |                         'integer'
554 |                     ]
555 |                 }
556 |             }})
557 |         }
558 |         with self.assertRaises(InvalidTransformationException):
559 |             TransformField(config).validate(catalog)
560 | 
561 |     @patch('transform_field.utils.get_stream_schemas')
562 |     def test_validate_with_mask_date_fails_4(self, get_stream_schemas_mock):
563 |         """
564 |         Testing validation of MASK-DATE transformation when field has string type but not date formatted
565 |         """
566 |         config = {
567 |             'transformations': [
568 |                 {
569 |                     "tap_stream_name": "stream_1",
570 |                     "field_id": "column_1",
571 |                     "type": "MASK-DATE"
572 |                 },
573 |             ]
574 |         }
575 | 
576 |         catalog = Catalog.from_dict({'streams': []})
577 | 
578 |         get_stream_schemas_mock.return_value = {
579 |             'stream_1': Schema.from_dict({'properties': {
580 |                 'column_1': {
581 |                     'type': [
582 |                         'null',
583 |                         'string'
584 |                     ],
585 |                     'format': 'binary'
586 |                 }
587 |             }})
588 |         }
589 |         with self.assertRaises(InvalidTransformationException):
590 |             TransformField(config).validate(catalog)
591 | 
592 |     @patch('transform_field.utils.get_stream_schemas')
593 |     def test_validate_with_mask_date_success_1(self, get_stream_schemas_mock):
594 |         """
595 |         Testing validation of MASK-DATE transformation when field has string type but is date formatted
596 |         """
597 |         config = {
598 |             'transformations': [
599 |                 {
600 |                     "tap_stream_name": "stream_1",
601 |                     "field_id": "column_1",
602 |                     "type": "MASK-DATE"
603 |                 },
604 |             ]
605 |         }
606 | 
607 |         catalog = Catalog.from_dict({'streams': []})
608 | 
609 |         get_stream_schemas_mock.return_value = {
610 |             'stream_1': Schema.from_dict({'properties': {
611 |                 'column_1': {
612 |                     'type': [
613 |                         'null',
614 |                         'string'
615 |                     ],
616 |                     'format': 'date'
617 |                 }
618 |             }})
619 |         }
620 |         TransformField(config).validate(catalog)
621 | 
622 |     @patch('transform_field.utils.get_stream_schemas')
623 |     def test_validate_with_mask_date_success_2(self, get_stream_schemas_mock):
624 |         """
625 |         Testing validation of MASK-DATE transformation when field has string type but is date-time formatted
626 |         """
627 |         config = {
628 |             'transformations': [
629 |                 {
630 |                     "tap_stream_name": "stream_1",
631 |                     "field_id": "column_1",
632 |                     "type": "MASK-DATE"
633 |                 },
634 |             ]
635 |         }
636 | 
637 |         catalog = Catalog.from_dict({'streams': []})
638 | 
639 |         get_stream_schemas_mock.return_value = {
640 |             'stream_1': Schema.from_dict({'properties': {
641 |                 'column_1': {
642 |                     'type': [
643 |                         'null',
644 |                         'string'
645 |                     ],
646 |                     'format': 'date-time'
647 |                 }
648 |             }})
649 |         }
650 |         TransformField(config).validate(catalog)
651 | 
652 |     @patch('transform_field.utils.get_stream_schemas')
653 |     def test_validate_with_mask_number_fails_1(self, get_stream_schemas_mock):
654 |         """
655 |         Testing validation of MASK-NUMBER transformation when field has no type
656 |         """
657 |         config = {
658 |             'transformations': [
659 |                 {
660 |                     "tap_stream_name": "stream_1",
661 |                     "field_id": "column_1",
662 |                     "type": "MASK-NUMBER"
663 |                 },
664 |             ]
665 |         }
666 | 
667 |         catalog = Catalog.from_dict({'streams': []})
668 | 
669 |         get_stream_schemas_mock.return_value = {
670 |             'stream_1': Schema.from_dict({'properties': {
671 |                 'column_1': {}
672 |             }})
673 |         }
674 |         with self.assertRaises(InvalidTransformationException):
675 |             TransformField(config).validate(catalog)
676 | 
677 |     @patch('transform_field.utils.get_stream_schemas')
678 |     def test_validate_with_mask_number_fails_2(self, get_stream_schemas_mock):
679 |         """
680 |         Testing validation of MASK-NUMBER transformation when field not have integer nor number type
681 |         """
682 |         config = {
683 |             'transformations': [
684 |                 {
685 |                     "tap_stream_name": "stream_1",
686 |                     "field_id": "column_1",
687 |                     "type": "MASK-NUMBER"
688 |                 },
689 |             ]
690 |         }
691 | 
692 |         catalog = Catalog.from_dict({'streams': []})
693 | 
694 |         get_stream_schemas_mock.return_value = {
695 |             'stream_1': Schema.from_dict({'properties': {
696 |                 'column_1': {
697 |                     'type': [
698 |                         'null',
699 |                         'string'
700 |                     ]
701 |                 }
702 |             }})
703 |         }
704 |         with self.assertRaises(InvalidTransformationException):
705 |             TransformField(config).validate(catalog)
706 | 
707 |     @patch('transform_field.utils.get_stream_schemas')
708 |     def test_validate_with_mask_number_fails_3(self, get_stream_schemas_mock):
709 |         """
710 |         Testing validation of MASK-NUMBER transformation when field has integer type but formatted
711 |         """
712 |         config = {
713 |             'transformations': [
714 |                 {
715 |                     "tap_stream_name": "stream_1",
716 |                     "field_id": "column_1",
717 |                     "type": "MASK-NUMBER"
718 |                 },
719 |             ]
720 |         }
721 | 
722 |         catalog = Catalog.from_dict({'streams': []})
723 | 
724 |         get_stream_schemas_mock.return_value = {
725 |             'stream_1': Schema.from_dict({'properties': {
726 |                 'column_1': {
727 |                     'type': [
728 |                         'null',
729 |                         'integer'
730 |                     ],
731 |                     'format': 'something random'
732 |                 }
733 |             }})
734 |         }
735 |         with self.assertRaises(InvalidTransformationException):
736 |             TransformField(config).validate(catalog)
737 | 
738 |     @patch('transform_field.utils.get_stream_schemas')
739 |     def test_validate_with_mask_number_fails_4(self, get_stream_schemas_mock):
740 |         """
741 |         Testing validation of MASK-NUMBER transformation when field has number type but formatted
742 |         """
743 |         config = {
744 |             'transformations': [
745 |                 {
746 |                     "tap_stream_name": "stream_1",
747 |                     "field_id": "column_1",
748 |                     "type": "MASK-DATE"
749 |                 },
750 |             ]
751 |         }
752 | 
753 |         catalog = Catalog.from_dict({'streams': []})
754 | 
755 |         get_stream_schemas_mock.return_value = {
756 |             'stream_1': Schema.from_dict({'properties': {
757 |                 'column_1': {
758 |                     'type': [
759 |                         'null',
760 |                         'number'
761 |                     ],
762 |                     'format': 'binary'
763 |                 }
764 |             }})
765 |         }
766 |         with self.assertRaises(InvalidTransformationException):
767 |             TransformField(config).validate(catalog)
768 | 
769 |     @patch('transform_field.utils.get_stream_schemas')
770 |     def test_validate_with_mask_number_success_1(self, get_stream_schemas_mock):
771 |         """
772 |         Testing validation of MASK-NUMBER transformation when field has integer type
773 |         """
774 |         config = {
775 |             'transformations': [
776 |                 {
777 |                     "tap_stream_name": "stream_1",
778 |                     "field_id": "column_1",
779 |                     "type": "MASK-NUMBER"
780 |                 },
781 |             ]
782 |         }
783 | 
784 |         catalog = Catalog.from_dict({'streams': []})
785 | 
786 |         get_stream_schemas_mock.return_value = {
787 |             'stream_1': Schema.from_dict({'properties': {
788 |                 'column_1': {
789 |                     'type': [
790 |                         'null',
791 |                         'integer'
792 |                     ]
793 |                 }
794 |             }})
795 |         }
796 |         TransformField(config).validate(catalog)
797 | 
798 |     @patch('transform_field.utils.get_stream_schemas')
799 |     def test_validate_with_mask_number_success_2(self, get_stream_schemas_mock):
800 |         """
801 |         Testing validation of MASK-NUMBER transformation when field has number type
802 |         """
803 |         config = {
804 |             'transformations': [
805 |                 {
806 |                     "tap_stream_name": "stream_1",
807 |                     "field_id": "column_1",
808 |                     "type": "MASK-NUMBER"
809 |                 },
810 |             ]
811 |         }
812 | 
813 |         catalog = Catalog.from_dict({'streams': []})
814 | 
815 |         get_stream_schemas_mock.return_value = {
816 |             'stream_1': Schema.from_dict({'properties': {
817 |                 'column_1': {
818 |                     'type': [
819 |                         'null',
820 |                         'number'
821 |                     ]
822 |                 }
823 |             }})
824 |         }
825 |         TransformField(config).validate(catalog)
826 | 
827 |     @patch('transform_field.utils.get_stream_schemas')
828 |     def test_validate_with_mask_string_skip_ends_fails_1(self, get_stream_schemas_mock):
829 |         """
830 |         Testing validation of MASK-STRING-SKIP-ENDS transformation when field has no type
831 |         """
832 |         config = {
833 |             'transformations': [
834 |                 {
835 |                     "tap_stream_name": "stream_1",
836 |                     "field_id": "column_1",
837 |                     "type": "MASK-STRING-SKIP-ENDS-1"
838 |                 },
839 |             ]
840 |         }
841 | 
842 |         catalog = Catalog.from_dict({'streams': []})
843 | 
844 |         get_stream_schemas_mock.return_value = {
845 |             'stream_1': Schema.from_dict({'properties': {
846 |                 'column_1': {}
847 |             }})
848 |         }
849 |         with self.assertRaises(InvalidTransformationException):
850 |             TransformField(config).validate(catalog)
851 | 
852 |     @patch('transform_field.utils.get_stream_schemas')
853 |     def test_validate_with_mask_string_skip_ends_fails_2(self, get_stream_schemas_mock):
854 |         """
855 |         Testing validation of MASK-STRING-SKIP-ENDS transformation when field has non-string type
856 |         """
857 |         config = {
858 |             'transformations': [
859 |                 {
860 |                     "tap_stream_name": "stream_1",
861 |                     "field_id": "column_1",
862 |                     "type": "MASK-STRING-SKIP-ENDS-1"
863 |                 },
864 |             ]
865 |         }
866 | 
867 |         catalog = Catalog.from_dict({'streams': []})
868 | 
869 |         get_stream_schemas_mock.return_value = {
870 |             'stream_1': Schema.from_dict({'properties': {
871 |                 'column_1': {
872 |                     'type': [
873 |                         'null',
874 |                         'integer'
875 |                     ]
876 |                 }
877 |             }})
878 |         }
879 |         with self.assertRaises(InvalidTransformationException):
880 |             TransformField(config).validate(catalog)
881 | 
882 |     @patch('transform_field.utils.get_stream_schemas')
883 |     def test_validate_with_mask_string_skip_ends_fails_3(self, get_stream_schemas_mock):
884 |         """
885 |         Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type but formatted
886 |         """
887 |         config = {
888 |             'transformations': [
889 |                 {
890 |                     "tap_stream_name": "stream_1",
891 |                     "field_id": "column_1",
892 |                     "type": "MASK-STRING-SKIP-ENDS-1"
893 |                 },
894 |             ]
895 |         }
896 | 
897 |         catalog = Catalog.from_dict({'streams': []})
898 | 
899 |         get_stream_schemas_mock.return_value = {
900 |             'stream_1': Schema.from_dict({'properties': {
901 |                 'column_1': {
902 |                     'type': [
903 |                         'null',
904 |                         'string'
905 |                     ],
906 |                     'format': 'binary'
907 |                 }
908 |             }})
909 |         }
910 |         with self.assertRaises(InvalidTransformationException):
911 |             TransformField(config).validate(catalog)
912 | 
913 |     @patch('transform_field.utils.get_stream_schemas')
914 |     def test_validate_with_mask_string_skip_ends_success(self, get_stream_schemas_mock):
915 |         """
916 |         Testing validation of MASK-STRING-SKIP-ENDS-1 transformation when field has string type but not formatted
917 |         """
918 |         config = {
919 |             'transformations': [
920 |                 {
921 |                     "tap_stream_name": "stream_1",
922 |                     "field_id": "column_1",
923 |                     "type": "MASK-STRING-SKIP-ENDS-1"
924 |                 },
925 |             ]
926 |         }
927 | 
928 |         catalog = Catalog.from_dict({'streams': []})
929 | 
930 |         get_stream_schemas_mock.return_value = {
931 |             'stream_1': Schema.from_dict({'properties': {
932 |                 'column_1': {
933 |                     'type': [
934 |                         'null',
935 |                         'string'
936 |                     ]
937 |                 }
938 |             }})
939 |         }
940 |         TransformField(config).validate(catalog)
941 | 


--------------------------------------------------------------------------------
/tests/unit/test_transform.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import hashlib
  3 | 
  4 | from transform_field import transform
  5 | 
  6 | 
  7 | class TestTransform(unittest.TestCase):
  8 |     """
  9 |     Unit Tests for the transform module
 10 |     """
 11 | 
 12 |     def setUp(self) -> None:
 13 |         self.config = {}
 14 | 
 15 |     def test_set_null(self):
 16 |         """TEST SET-NULL transformation"""
 17 |         self.assertEqual(
 18 |             transform.do_transform({"col_1": "John"}, "col_1", "SET-NULL"),
 19 |             None
 20 |         )
 21 | 
 22 |     def test_hash(self):
 23 |         """Test HASH transformation"""
 24 |         self.assertEqual(
 25 |             transform.do_transform({"col_1": "John"}, "col_1", "HASH"),
 26 |             hashlib.sha256("John".encode('utf-8')).hexdigest()
 27 |         )
 28 | 
 29 |     def test_mask_date(self):
 30 |         """Test MASK-DATE transformation"""
 31 |         self.assertEqual(
 32 |             transform.do_transform({"col_1": "2019-05-21"}, "col_1", "MASK-DATE"),
 33 |             "2019-01-01T00:00:00"
 34 |         )
 35 | 
 36 |         # Mask date should keep the time elements
 37 |         self.assertEqual(
 38 |             transform.do_transform({"col_1": "2019-05-21T13:34:11"}, "col_1", "MASK-DATE"),
 39 |             "2019-01-01T13:34:11"
 40 |         )
 41 | 
 42 |         # Mask date should keep the time elements, date is invalid
 43 |         self.assertEqual(
 44 |             transform.do_transform({"col_1": "2019-05-21T13:34:99"}, "col_1", "MASK-DATE"),
 45 |             "2019-05-21T13:34:99"
 46 |         )
 47 | 
 48 |     def test_mask_number(self):
 49 |         """Test MASK-NUMBER transformation"""
 50 |         self.assertEqual(
 51 |             transform.do_transform({"col_1": "1234567890"}, "col_1", "MASK-NUMBER"),
 52 |             0
 53 |         )
 54 | 
 55 |     def test_mask_hidden(self):
 56 |         """Test MASK-HIDDEN transformation"""
 57 |         self.assertEqual(
 58 |             transform.do_transform({"col_1": "abakadabra123"}, "col_1", "MASK-HIDDEN"),
 59 |             'hidden'
 60 |         )
 61 | 
 62 |     def test_mask_string_skip_ends_case1(self):
 63 |         """Test MASK-STRING-SKIP-ENDS transformation with n=3"""
 64 |         self.assertEqual(
 65 |             transform.do_transform({"col_1": "do!maskme!"}, "col_1", "MASK-STRING-SKIP-ENDS-3"),
 66 |             'do!****me!'
 67 |         )
 68 | 
 69 |     def test_mask_string_skip_ends_case2(self):
 70 |         """Test MASK-STRING-SKIP-ENDS transformation with n=2"""
 71 |         self.assertEqual(
 72 |             transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-2"),
 73 |             'no**sk'
 74 |         )
 75 | 
 76 |     def test_mask_string_skip_ends_case3(self):
 77 |         """Test MASK-STRING-SKIP-ENDS transformation where string length equals to 2 * mask_length"""
 78 |         self.assertEqual(
 79 |             transform.do_transform({"col_1": "nomask"}, "col_1", "MASK-STRING-SKIP-ENDS-3"),
 80 |             '******'
 81 |         )
 82 | 
 83 |     def test_mask_string_skip_ends_case4(self):
 84 |         """Test MASK-STRING-SKIP-ENDS transformation where string length less than 2 * mask_length"""
 85 |         self.assertEqual(
 86 |             transform.do_transform({"col_1": "shortmask"}, "col_1", "MASK-STRING-SKIP-ENDS-5"),
 87 |             '*********'
 88 |         )
 89 | 
 90 |     def test_unknown_transformation_type(self):
 91 |         """Test not existing transformation type"""
 92 |         # Should return the original value
 93 |         self.assertEqual(
 94 |             transform.do_transform({"col_1": "John"}, "col_1", "NOT-EXISTING-TRANSFORMATION-TYPE"),
 95 |             "John"
 96 |         )
 97 | 
 98 |     def test_conditions(self):
 99 |         """Test conditional transformations"""
100 | 
101 |         # Matching condition: Should transform to NULL
102 |         self.assertEqual(
103 |             transform.do_transform(
104 |                 # Record:
105 |                 {"col_1": "random value", "col_2": "passwordHash", "col_3": "lkj"},
106 |                 # Column to transform:
107 |                 "col_3",
108 |                 # Transform method:
109 |                 "SET-NULL",
110 |                 # Conditions when to transform:
111 |                 [
112 |                     {'column': 'col_1', 'equals': "random value"},
113 |                     {'column': 'col_2', 'equals': "passwordHash"},
114 |                 ]
115 |             ),
116 | 
117 |             # Expected output:
118 |             None
119 |         )
120 | 
121 |         # Not matching condition: Should keep the original value
122 |         self.assertEqual(
123 |             transform.do_transform(
124 |                 # Record:
125 |                 {"col_1": "random value", "col_2": "id", "col_3": "123456789"},
126 |                 # Column to transform:
127 |                 "col_3",
128 |                 # Transform method:
129 |                 "SET-NULL",
130 |                 # Conditions when to transform:
131 |                 [
132 |                     {'column': 'col_1', 'equals': "random value"},
133 |                     {'column': 'col_2', 'equals': "passwordHash"},
134 |                 ]
135 |             ),
136 | 
137 |             # Expected output:
138 |             "123456789"
139 |         )
140 | 
141 |     def test_transform_field_in_json_col(self):
142 |         """Test transformation of a field in a json column with no conditions"""
143 | 
144 |         expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}}
145 | 
146 |         return_value = transform.do_transform(
147 |             # Record:
148 |             {
149 |                 "col_1": "random value",
150 |                 "col_2": "passwordHash",
151 |                 "col_3": "lkj",
152 |                 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}}
153 |             },
154 |             # Column to transform:
155 |             "col_4",
156 |             # Transform method:
157 |             "MASK-HIDDEN",
158 |             # Conditions when to transform:
159 |             None,
160 |             ['info/last_name']
161 |         )
162 | 
163 |         self.assertDictEqual(expected_value, return_value)
164 | 
165 |     def test_transform_field_in_json_col_with_conditions(self):
166 |         """Test transformation of a field in a json column with conditions"""
167 | 
168 |         expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'John'}}
169 | 
170 |         return_value = transform.do_transform(
171 |             # Record:
172 |             {
173 |                 "col_1": "random value",
174 |                 "col_2": "passwordHash",
175 |                 "col_3": "lkj",
176 |                 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John'}}
177 |             },
178 |             # Column to transform:
179 |             "col_4",
180 |             # Transform method:
181 |             "MASK-HIDDEN",
182 |             # Conditions when to transform:
183 |             [
184 |                 {'column': 'col_2', 'equals': "passwordHash"},
185 |             ],
186 |             ['info/last_name']
187 |         )
188 | 
189 |         self.assertDictEqual(expected_value, return_value)
190 | 
191 |     def test_transform_fields_in_json_col(self):
192 |         """Test transformation of multiple fields in a json column with no conditions"""
193 | 
194 |         expected_value = {'id': 1, 'info': {'last_name': 'hidden', 'first_name': 'hidden', 'age': 25}}
195 | 
196 |         return_value = transform.do_transform(
197 |             # Record:
198 |             {
199 |                 "col_1": "random value",
200 |                 "col_2": "passwordHash",
201 |                 "col_3": "lkj",
202 |                 'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'age': 25}}
203 |             },
204 |             # Column to transform:
205 |             "col_4",
206 |             # Transform method:
207 |             "MASK-HIDDEN",
208 |             # Conditions when to transform:
209 |             None,
210 |             ['info/last_name', 'info/first_name']
211 |         )
212 | 
213 |         self.assertDictEqual(expected_value, return_value)
214 | 
215 |     def test_transform_col_with_condition_on_json_field(self):
216 |         """Test transformation of a column with condition on a field in a json"""
217 | 
218 |         record = {
219 |             "col_1": "random value",
220 |             "col_2": "passwordHash",
221 |             "col_3": "323df43983dfs",
222 |             'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
223 |         }
224 | 
225 |         self.assertEqual(
226 |             'hidden',
227 |             transform.do_transform(
228 |                 # Record:
229 |                 record,
230 |                 # Column to transform:
231 |                 "col_3",
232 |                 # Transform method:
233 |                 "MASK-HIDDEN",
234 |                 # Conditions when to transform:
235 |                 [
236 |                     {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
237 |                 ]
238 |             )
239 |         )
240 | 
241 |     def test_transform_field_in_json_col_with_condition_on_field(self):
242 |         """Test transformation of a field in a json column with condition on a field in json, condition will be met"""
243 | 
244 |         record = {
245 |             "col_1": "random value",
246 |             "col_2": "passwordHash",
247 |             "col_3": "lkj",
248 |             'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
249 |         }
250 | 
251 |         self.assertDictEqual(
252 |             {'id': 1, 'info': {'first_name': 'John', 'last_name': None, 'phone': '6573930'}},
253 |             transform.do_transform(
254 |                 # Record:
255 |                 record,
256 |                 # Column to transform:
257 |                 "col_4",
258 |                 # Transform method:
259 |                 "SET-NULL",
260 |                 # Conditions when to transform:
261 |                 [
262 |                     {'column': 'col_4', 'field_path': 'info/phone', 'equals': '6573930'},
263 |                 ],
264 |                 ['info/last_name']
265 |             )
266 |         )
267 | 
268 |     def test_transform_field_in_json_col_with_condition_on_field_2(self):
269 |         """Test transformation of a field in a json column with condition on a field in json,
270 |         the condition will not be met"""
271 | 
272 |         record = {
273 |             "col_1": "random value",
274 |             "col_2": "passwordHash",
275 |             "col_3": "lkj",
276 |             'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}}
277 |         }
278 | 
279 |         # not transformed
280 |         self.assertEqual(
281 |             {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
282 |             transform.do_transform(
283 |                 # Record:
284 |                 record,
285 |                 # Column to transform:
286 |                 "col_4",
287 |                 # Transform method:
288 |                 "SET-NULL",
289 |                 # Conditions when to transform:
290 |                 [
291 |                     {'column': 'col_4', 'field_path': 'info/phone', 'regex_match': '.*6573955.*'},
292 |                 ],
293 |                 ['info/last_name']
294 |             )
295 |         )
296 | 
297 |     def test_transform_multiple_conditions_all_success(self):
298 |         """Test conditional transformation, all the conditions will be met and transformation should happen"""
299 | 
300 |         record = {
301 |             "col_1": "random value",
302 |             "col_2": "passwordHash",
303 |             "col_3": "lkj",
304 |             'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
305 |             'col_5': '2021-11-30T16:40:07'
306 |         }
307 | 
308 |         self.assertEqual(
309 |             '2021-01-01T16:40:07',
310 |             transform.do_transform(
311 |                 # Record:
312 |                 record,
313 |                 # Column to transform:
314 |                 "col_5",
315 |                 # Transform method:
316 |                 "MASK-DATE",
317 |                 # Conditions when to transform:
318 |                 [
319 |                     {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
320 |                     {'column': 'col_4', 'field_path': 'id', 'equals': 1},
321 |                     {'column': 'col_3', 'regex_match': '.*lkj.*'},
322 |                 ]
323 |             )
324 |         )
325 | 
326 |     def test_transform_multiple_conditions_one_fails(self):
327 |         """Test conditional transformation, one of the conditions will not be met and transformation should not happen"""
328 | 
329 |         record = {
330 |             "col_1": "random value",
331 |             "col_2": "passwordHash",
332 |             "col_3": "lkj",
333 |             'col_4': {'id': 1, 'info': {'last_name': 'Smith', 'first_name': 'John', 'phone': '6573930'}},
334 |             'col_5': '2021-11-30T16:40:07'
335 |         }
336 | 
337 |         # not transformed
338 |         self.assertEqual(
339 |             '2021-11-30T16:40:07',
340 |             transform.do_transform(
341 |                 # Record:
342 |                 record,
343 |                 # Column to transform:
344 |                 "col_5",
345 |                 # Transform method:
346 |                 "MASK-DATE",
347 |                 # Conditions when to transform:
348 |                 [
349 |                     {'column': 'col_4', 'field_path': 'info/last_name', 'equals': 'Smith'},
350 |                     {'column': 'col_4', 'field_path': 'id', 'equals': 2},
351 |                     {'column': 'col_3', 'regex_match': '.*lkj.*'},
352 |                 ]
353 |             )
354 |         )
355 | 
356 | 


--------------------------------------------------------------------------------
/tests/unit/test_utils.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import unittest
  3 | 
  4 | from unittest.mock import patch
  5 | from singer import Catalog
  6 | 
  7 | from transform_field.utils import get_stream_schemas, parse_args
  8 | 
  9 | 
 10 | class TestUtils(unittest.TestCase):
 11 |     """
 12 |     Unit Tests for the utils
 13 |     """
 14 | 
 15 |     def test_get_stream_schemas(self):
 16 |         catalog = Catalog.from_dict({
 17 |             'streams': [
 18 |                 {
 19 |                     'tap_stream_id': 'stream1',
 20 |                     'schema': {
 21 |                         'properties': {
 22 |                             'col_1': {}
 23 |                         }
 24 |                     },
 25 |                     'metadata': [
 26 |                         {
 27 |                             'breadcrumb': [],
 28 |                             'metadata': {
 29 |                                 'selected': True
 30 |                             }
 31 |                         }
 32 |                     ]
 33 |                 },
 34 |                 {
 35 |                     'tap_stream_id': 'stream2',
 36 |                     'schema': {
 37 |                         'properties': {
 38 |                             'col_2': {}
 39 |                         }
 40 |                     },
 41 |                     'metadata': [
 42 |                         {
 43 |                             'breadcrumb': [],
 44 |                             'metadata': {
 45 |                                 'selected': True
 46 |                             }
 47 |                         }
 48 |                     ]
 49 |                 },
 50 |                 {
 51 |                     'tap_stream_id': 'stream3',
 52 |                     'schema': {
 53 |                         'properties': {
 54 |                             'col_3': {}
 55 |                         }
 56 |                     },
 57 |                     'metadata': [
 58 |                         {
 59 |                             'breadcrumb': [],
 60 |                             'metadata': {
 61 |                                 'selected': False
 62 |                             }
 63 |                         }
 64 |                     ]
 65 |                 }
 66 |             ]
 67 |         })
 68 | 
 69 |         output = get_stream_schemas(catalog)
 70 | 
 71 |         self.assertIn('stream1', output)
 72 |         self.assertIn('stream2', output)
 73 |         self.assertNotIn('stream3', output)
 74 | 
 75 |         self.assertEqual(len(output['stream1'].properties), 1)
 76 |         self.assertEqual(len(output['stream2'].properties), 1)
 77 | 
 78 |     @patch('transform_field.utils.Catalog.load')
 79 |     @patch('transform_field.utils.check_config')
 80 |     @patch('transform_field.utils.load_json')
 81 |     @patch('argparse.ArgumentParser.parse_args')
 82 |     def test_parse_args(self, parse_args_mock, load_json_mock, check_config_mock, catalog_load_mock):
 83 |         """
 84 |         test args parsing
 85 |         """
 86 |         check_config_mock.return_value = None
 87 |         load_json_mock.return_value = {}
 88 |         catalog_load_mock.return_value = {}
 89 | 
 90 |         parse_args_mock.return_value = argparse.Namespace(**{
 91 |             'config': './config.json',
 92 |             'catalog': './properties.json',
 93 |             'validate': False,
 94 |         })
 95 | 
 96 |         args = parse_args({'transformations'})
 97 | 
 98 |         load_json_mock.assert_called_once()
 99 |         catalog_load_mock.assert_called_once()
100 |         check_config_mock.assert_called_once()
101 | 
102 |         self.assertEqual(args.config, {})
103 |         self.assertEqual(args.catalog, {})
104 |         self.assertEqual(args.validate, False)
105 | 


--------------------------------------------------------------------------------
/transform_field/__init__.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import sys
  3 | import time
  4 | import singer
  5 | 
  6 | from typing import Union, Dict
  7 | from enum import Enum, unique
  8 | from collections import namedtuple
  9 | from decimal import Decimal
 10 | from jsonschema import FormatChecker, Draft7Validator
 11 | from singer import Catalog, Schema
 12 | 
 13 | from transform_field import transform
 14 | from transform_field import utils
 15 | from transform_field.timings import Timings
 16 | 
 17 | from transform_field.errors import CatalogRequiredException, StreamNotFoundException, InvalidTransformationException, \
 18 |     UnsupportedTransformationTypeException, NoStreamSchemaException
 19 | 
 20 | 
 21 | LOGGER = singer.get_logger('transform_field')
 22 | TIMINGS = Timings(LOGGER)
 23 | DEFAULT_MAX_BATCH_BYTES = 4000000
 24 | DEFAULT_MAX_BATCH_RECORDS = 20000
 25 | DEFAULT_BATCH_DELAY_SECONDS = 300.0
 26 | VALIDATE_RECORDS = False
 27 | 
 28 | StreamMeta = namedtuple('StreamMeta', ['schema', 'key_properties', 'bookmark_properties'])
 29 | TransMeta = namedtuple('TransMeta', ['field_id', 'type', 'when', 'field_paths'])
 30 | 
 31 | REQUIRED_CONFIG_KEYS = [
 32 |     "transformations"
 33 | ]
 34 | 
 35 | 
 36 | @unique
 37 | class TransformationTypes(Enum):
 38 |     """
 39 |     List of supported transformation types
 40 |     """
 41 |     SET_NULL = 'SET-NULL'
 42 |     MASK_HIDDEN = 'MASK-HIDDEN'
 43 |     MASK_DATE = 'MASK-DATE'
 44 |     MASK_NUMBER = 'MASK-NUMBER'
 45 |     HASH = 'HASH'
 46 |     HASH_SKIP_FIRST = 'HASH-SKIP-FIRST'
 47 |     MASK_STRING_SKIP_ENDS = 'MASK-STRING-SKIP-ENDS'
 48 | 
 49 | 
 50 | def float_to_decimal(value):
 51 |     """Walk the given data structure and turn all instances of float into
 52 |     double."""
 53 |     if isinstance(value, float):
 54 |         return Decimal(str(value))
 55 |     if isinstance(value, list):
 56 |         return [float_to_decimal(child) for child in value]
 57 |     if isinstance(value, dict):
 58 |         return {k: float_to_decimal(v) for k, v in value.items()}
 59 |     return value
 60 | 
 61 | 
 62 | class TransformFieldException(Exception):
 63 |     """A known exception for which we don't need to bring a stack trace"""
 64 | 
 65 | 
 66 | class TransformField:
 67 |     """
 68 |     Main Transformer class
 69 |     """
 70 | 
 71 |     def __init__(self, trans_config):
 72 |         self.trans_config = trans_config
 73 |         self.messages = []
 74 |         self.buffer_size_bytes = 0
 75 |         self.state = None
 76 | 
 77 |         # Time that the last batch was sent
 78 |         self.time_last_batch_sent = time.time()
 79 | 
 80 |         # Mapping from stream name to {'schema': ..., 'key_names': ..., 'bookmark_names': ... }
 81 |         self.stream_meta = {}
 82 | 
 83 |         # Mapping from transformation stream to {'stream': [ 'field_id': ..., 'type': ... ] ... }
 84 |         self.trans_meta = {}
 85 | 
 86 |         for trans in trans_config["transformations"]:
 87 |             # Naming differences in stream ids:
 88 |             #  1. properties.json and transformation_json using 'tap_stream_id'
 89 |             #  2. taps send in the 'stream' key in singer messages
 90 |             stream = trans["tap_stream_name"]
 91 |             if stream not in self.trans_meta:
 92 |                 self.trans_meta[stream] = []
 93 | 
 94 |             self.trans_meta[stream].append(TransMeta(
 95 |                 trans["field_id"],
 96 |                 trans["type"],
 97 |                 trans.get('when'),
 98 |                 trans.get('field_paths')
 99 |             ))
100 | 
101 |     # pylint: disable=too-many-nested-blocks,too-many-branches
102 |     # todo: simplify this method
103 |     def flush(self):
104 |         """Give batch to handlers to process"""
105 | 
106 |         if self.messages:
107 |             stream = self.messages[0].stream
108 |             stream_meta = self.stream_meta[stream]
109 | 
110 |             # Transform columns
111 |             messages = self.messages
112 |             schema = float_to_decimal(stream_meta.schema)
113 |             key_properties = stream_meta.key_properties
114 |             validator = Draft7Validator(schema, format_checker=FormatChecker())
115 |             trans_meta = []
116 |             if stream in self.trans_meta:
117 |                 trans_meta = self.trans_meta[stream]
118 | 
119 |             for i, message in enumerate(messages):
120 |                 if isinstance(message, singer.RecordMessage):
121 | 
122 |                     # Do transformation on every column where it is required
123 |                     for trans in trans_meta:
124 | 
125 |                         if trans.field_id in message.record:
126 |                             transformed = transform.do_transform(
127 |                                 message.record, trans.field_id, trans.type, trans.when, trans.field_paths
128 |                             )
129 |                             message.record[trans.field_id] = transformed
130 | 
131 |                     if VALIDATE_RECORDS:
132 |                         # Validate the transformed columns
133 |                         data = float_to_decimal(message.record)
134 |                         try:
135 |                             validator.validate(data)
136 |                             if key_properties:
137 |                                 for k in key_properties:
138 |                                     if k not in data:
139 |                                         raise TransformFieldException(
140 |                                             f'Message {i} is missing key property {k}')
141 | 
142 |                         except Exception as exc:
143 |                             if type(exc).__name__ == "InvalidOperation":
144 |                                 raise TransformFieldException(
145 |                                     f"Record does not pass schema validation. RECORD: {message.record}"
146 |                                     "\n'multipleOf' validations that allows long precisions are not "
147 |                                     "supported (i.e. with 15 digits or more). "
148 |                                     f"Try removing 'multipleOf' methods from JSON schema.\n{exc}") from exc
149 | 
150 |                             raise TransformFieldException(
151 |                                 f"Record does not pass schema validation. RECORD: {message.record}\n{exc}") from exc
152 | 
153 |                     # Write the transformed message
154 |                     singer.write_message(message)
155 | 
156 |             LOGGER.debug("Batch is valid with %s messages", len(messages))
157 | 
158 |             # Update stats
159 |             self.time_last_batch_sent = time.time()
160 |             self.messages = []
161 |             self.buffer_size_bytes = 0
162 | 
163 |         if self.state:
164 |             singer.write_message(singer.StateMessage(self.state))
165 |             self.state = None
166 | 
167 |         TIMINGS.log_timings()
168 | 
169 |     def handle_line(self, line):
170 |         """Takes a raw line from stdin and transforms it"""
171 |         try:
172 |             message = singer.parse_message(line)
173 | 
174 |             if not message:
175 |                 raise TransformFieldException('Unknown message type')
176 |         except Exception as exc:
177 |             raise TransformFieldException(f'Failed to process incoming message: {line}\n{exc}') from exc
178 | 
179 |         # If we got a Schema, set the schema and key properties for this
180 |         # stream. Flush the batch, if there is one, in case the schema is
181 |         # different
182 |         if isinstance(message, singer.SchemaMessage):
183 |             self.flush()
184 | 
185 |             self.stream_meta[message.stream] = StreamMeta(
186 |                 message.schema,
187 |                 message.key_properties,
188 |                 message.bookmark_properties)
189 | 
190 |             # if schema message, do validation of transformations using the schema to detect any
191 |             # incompatibilities between the transformation and column types
192 |             self.__validate_stream_trans(message.stream, message.schema)
193 | 
194 |             # Write the transformed message
195 |             singer.write_message(message)
196 | 
197 |         elif isinstance(message, (singer.RecordMessage, singer.ActivateVersionMessage)):
198 |             if self.messages and (
199 |                     message.stream != self.messages[0].stream or
200 |                     message.version != self.messages[0].version):
201 |                 self.flush()
202 |             self.messages.append(message)
203 |             self.buffer_size_bytes += len(line)
204 | 
205 |             num_bytes = self.buffer_size_bytes
206 |             num_messages = len(self.messages)
207 |             num_seconds = time.time() - self.time_last_batch_sent
208 | 
209 |             enough_bytes = num_bytes >= DEFAULT_MAX_BATCH_BYTES
210 |             enough_messages = num_messages >= DEFAULT_MAX_BATCH_RECORDS
211 |             enough_time = num_seconds >= DEFAULT_BATCH_DELAY_SECONDS
212 |             if enough_bytes or enough_messages or enough_time:
213 |                 LOGGER.debug('Flushing %d bytes, %d messages, after %.2f seconds', num_bytes, num_messages, num_seconds)
214 |                 self.flush()
215 | 
216 |         elif isinstance(message, singer.StateMessage):
217 |             self.state = message.value
218 | 
219 |     def consume(self, reader):
220 |         """Consume all the lines from the queue, flushing when done."""
221 |         for line in reader:
222 |             self.handle_line(line)
223 |         self.flush()
224 | 
225 |     def validate(self, catalog: Catalog):
226 |         """
227 |         Validate the transformations by checking if each transformation type is compatible with the column type
228 |         :param catalog: the catalog of streams with their json schema
229 |         """
230 |         LOGGER.info('Starting validation of transformations...')
231 | 
232 |         if not catalog:
233 |             raise CatalogRequiredException('Catalog missing! please provide catalog to run validation.')
234 | 
235 |         # get the schema of each stream
236 |         schemas = utils.get_stream_schemas(catalog)
237 | 
238 |         for stream_id in self.trans_meta:
239 |             self.__validate_stream_trans(stream_id, schemas.get(stream_id))
240 | 
241 |     def __validate_stream_trans(self, stream_id: str, stream_schema: Union[Schema, Dict]):
242 |         """
243 |         Validation of each stream's transformations
244 |         :param stream_id: ID of the stream
245 |         :param stream_schema: schema of the streams
246 |         """
247 | 
248 |         if stream_id not in self.trans_meta:
249 |             return
250 | 
251 |         # check if we even have schema for stream of this transformation
252 |         if stream_schema is None:
253 |             raise StreamNotFoundException(stream_id)
254 | 
255 |         # check if we stream has not empty schema
256 |         if not stream_schema:
257 |             raise NoStreamSchemaException(stream_id)
258 | 
259 |         for transformation in self.trans_meta[stream_id]:
260 |             trans_type = transformation.type
261 |             field_id = transformation.field_id
262 | 
263 |             if isinstance(stream_schema, Schema):
264 |                 field_type = stream_schema.properties[field_id].type
265 |                 field_format = stream_schema.properties[field_id].format
266 |             else:
267 |                 field_type = stream_schema['properties'][field_id].get('type')
268 |                 field_format = stream_schema['properties'][field_id].get('format')
269 | 
270 |             # If the value we want to transform is a field in a JSON property
271 |             # then no need to enforce rules below for now
272 |             if field_type and \
273 |                     ("object" in field_type or "array" in field_type) and \
274 |                     transformation.field_paths is not None:
275 |                 continue
276 | 
277 |             if trans_type in (TransformationTypes.HASH.value, TransformationTypes.MASK_HIDDEN.value) or \
278 |                     trans_type.startswith(TransformationTypes.HASH_SKIP_FIRST.value) or \
279 |                     trans_type.startswith(TransformationTypes.MASK_STRING_SKIP_ENDS.value):
280 |                 if not (field_type is not None and 'string' in field_type and not field_format):
281 |                     raise InvalidTransformationException(
282 |                         f'Cannot apply `{trans_type}` transformation type to a non-string field `'
283 |                         f'{field_id}` in stream `{stream_id}`')
284 | 
285 |             elif trans_type == TransformationTypes.MASK_DATE.value:
286 |                 if not (field_type is not None and 'string' in field_type and field_format in {'date-time', 'date'}):
287 |                     raise InvalidTransformationException(
288 |                         f'Cannot apply `{trans_type}` transformation type to a non-stringified date field'
289 |                         f' `{field_id}` in stream `{stream_id}`')
290 | 
291 |             elif trans_type == TransformationTypes.MASK_NUMBER.value:
292 |                 if not (field_type is not None and (
293 |                         'number' in field_type or 'integer' in field_type) and not field_format):
294 |                     raise InvalidTransformationException(
295 |                         f'Cannot apply `{trans_type}` transformation type to a non-numeric field '
296 |                         f'`{field_id}` in stream `{stream_id}`')
297 | 
298 |             elif trans_type == TransformationTypes.SET_NULL.value:
299 |                 LOGGER.info('Transformation type is %s, no need to do any validation.', trans_type)
300 | 
301 |             else:
302 |                 raise UnsupportedTransformationTypeException(trans_type)
303 | 
304 | 
305 | def main_impl():
306 |     """
307 |     Main implementation
308 |     """
309 |     args = utils.parse_args(REQUIRED_CONFIG_KEYS)
310 |     trans_config = {'transformations': args.config['transformations']}
311 | 
312 |     instance = TransformField(trans_config)
313 | 
314 |     if args.validate:
315 |         instance.validate(args.catalog)
316 |     else:
317 |         reader = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
318 |         instance.consume(reader)
319 | 
320 |     LOGGER.info("Exiting normally")
321 | 
322 | 
323 | def main():
324 |     """Main entry point"""
325 |     try:
326 |         main_impl()
327 |     except TransformFieldException as exc:
328 |         for line in str(exc).splitlines():
329 |             LOGGER.critical(line)
330 |         sys.exit(1)
331 |     except Exception as exc:
332 |         LOGGER.critical(exc)
333 |         raise exc
334 | 
335 | 
336 | if __name__ == '__main__':
337 |     main()
338 | 


--------------------------------------------------------------------------------
/transform_field/errors.py:
--------------------------------------------------------------------------------
 1 | class CatalogRequiredException(Exception):
 2 |     """Raised when catalog needs to be provided but it has not been"""
 3 | 
 4 | 
 5 | class StreamNotFoundException(Exception):
 6 |     """Raised when catalog doesn't have a given selected stream"""
 7 | 
 8 |     def __init__(self, stream):
 9 |         message = f'Catalog doesn\'t have the selected stream `{stream}`!'
10 | 
11 |         super().__init__(message)
12 | 
13 | 
14 | class NoStreamSchemaException(Exception):
15 |     """Raised when stream has an empty schema"""
16 | 
17 |     def __init__(self, stream):
18 |         message = f'Stream `{stream}` has an empty schema!'
19 | 
20 |         super().__init__(message)
21 | 
22 | 
23 | class InvalidTransformationException(Exception):
24 |     """Raised when the given transformation is invalid"""
25 | 
26 | 
27 | class UnsupportedTransformationTypeException(Exception):
28 |     """Raised when the given transformation type is not supported"""
29 | 
30 |     def __init__(self, trans_type):
31 |         message = f'Transformation `{trans_type}` is not supported!'
32 | 
33 |         super().__init__(message)
34 | 


--------------------------------------------------------------------------------
/transform_field/timings.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import time
 4 | 
 5 | from contextlib import contextmanager
 6 | 
 7 | 
 8 | class Timings:
 9 |     """Gathers timing information for the three main steps of the Transformer."""
10 | 
11 |     def __init__(self, logger):
12 |         self.logger = logger
13 |         self.last_time = time.time()
14 |         self.timings = {
15 |             'validating': 0.0,
16 |             'transforming': 0.0,
17 |             None: 0.0
18 |         }
19 | 
20 |     @contextmanager
21 |     def mode(self, mode):
22 |         """We wrap the big steps of the Tap in this context manager to accumulate
23 |         timing info."""
24 | 
25 |         start = time.time()
26 |         yield
27 |         end = time.time()
28 |         self.timings[None] += start - self.last_time
29 |         self.timings[mode] += end - start
30 |         self.last_time = end
31 | 
32 |     def log_timings(self):
33 |         """We call this with every flush to print out the accumulated timings"""
34 |         self.logger.debug('Timings: unspecified: %.3f; validating: %.3f; transforming: %.3f;',
35 |                           self.timings[None],
36 |                           self.timings['validating'],
37 |                           self.timings['transforming'])
38 | 


--------------------------------------------------------------------------------
/transform_field/transform.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import re
  3 | 
  4 | from typing import Dict, Any, Optional, List
  5 | from dpath.util import get as get_xpath, set as set_xpath
  6 | from singer import get_logger
  7 | from dateutil import parser
  8 | 
  9 | LOGGER = get_logger('transform_field')
 10 | 
 11 | 
 12 | def is_transform_required(record: Dict, when: Optional[List[Dict]]) -> bool:
 13 |     """
 14 |         Detects if the transformation is required or not based on
 15 |         the defined conditions and the actual values in a record.
 16 |         All conditions in when need to be met for the transformation to be required.
 17 |     """
 18 |     if not when:
 19 |         # Transformation is always required if 'when' condition not defined
 20 |         LOGGER.debug('No conditions, transformations is required')
 21 |         return True
 22 | 
 23 |     transform_required = False
 24 | 
 25 |     # Check if conditional transformation matches criteria
 26 |     # Evaluate every condition
 27 |     for condition in when:
 28 |         column_to_match = condition['column']
 29 |         column_value = record.get(column_to_match, "")
 30 | 
 31 |         field_path_to_match = condition.get('field_path')
 32 | 
 33 |         # check if given field exists in the column value
 34 |         if field_path_to_match:
 35 |             try:
 36 |                 field_value = get_xpath(column_value, field_path_to_match)
 37 |                 LOGGER.debug('field "%s" exists in the value of column "%s"', field_path_to_match, column_to_match)
 38 | 
 39 |             except KeyError:
 40 |                 # KeyError exception means the field doesn't exist, hence we cannot proceed with the
 41 |                 # equals/regex match condition, thus the condition isn't met and don't need to do
 42 |                 # transformation so breaking prematurely
 43 |                 transform_required = False
 44 | 
 45 |                 LOGGER.debug('field "%s" doesn\'t exists in the value of column "%s", '
 46 |                              'so transformation is not required.', field_path_to_match, column_to_match)
 47 |                 break
 48 | 
 49 |         cond_equals = condition.get('equals')
 50 |         cond_pattern = condition.get('regex_match')
 51 | 
 52 |         # Exact condition
 53 |         if cond_equals:
 54 |             LOGGER.debug('Equals condition found, value is: %s', cond_equals)
 55 |             if field_path_to_match:
 56 |                 transform_required = __is_condition_met('equal', cond_equals, field_value)
 57 |             else:
 58 |                 transform_required = __is_condition_met('equal', cond_equals, column_value)
 59 | 
 60 |             # Condition isn't met, exit the loop
 61 |             if not transform_required:
 62 |                 LOGGER.debug('Equals condition didn\'t match, so transformation is not required.')
 63 |                 break
 64 | 
 65 |         # Regex based condition
 66 |         elif cond_pattern:
 67 |             LOGGER.debug('Regex condition found, pattern is: %s', cond_pattern)
 68 | 
 69 |             if field_path_to_match:
 70 |                 transform_required = __is_condition_met('regex', cond_pattern, field_value)
 71 |             else:
 72 |                 transform_required = __is_condition_met('regex', cond_pattern, column_value)
 73 | 
 74 |             # Condition isn't met, exit the loop
 75 |             if not transform_required:
 76 |                 LOGGER.debug('Regex pattern didn\'t match, so transformation is not required.')
 77 |                 break
 78 | 
 79 |     LOGGER.debug('Transformation required? %s', transform_required)
 80 | 
 81 |     return transform_required
 82 | 
 83 | 
 84 | def __is_condition_met(condition_type: str, condition_value: Any, value: Any) -> bool:
 85 |     """
 86 |     Checks if given value meets the given condition
 87 |     Args:
 88 |         condition_type: condition type, could be "equal" or "regex"
 89 |         condition_value: the value of the condition, in case of regex it's the pattern, and
 90 |                          a value to compare to in case of equal
 91 |         value: the target value to run the condition against
 92 | 
 93 |     Returns: bool, True of condition is met, False otherwise
 94 |     """
 95 | 
 96 |     if condition_type == 'equal':
 97 |         return value == condition_value
 98 | 
 99 |     if condition_type == 'regex':
100 |         matcher = re.compile(condition_value)
101 |         return bool(matcher.search(value))
102 | 
103 |     raise NotImplementedError(f'__is_condition_met is not implemented for condition type "{condition_type}"', )
104 | 
105 | 
106 | def do_transform(record: Dict,
107 |                  field: str,
108 |                  trans_type: str,
109 |                  when: Optional[List[Dict]] = None,
110 |                  field_paths: Optional[List[str]] = None
111 |                  ) -> Any:
112 |     """Transform a value by a certain transformation type.
113 |     Optionally can set conditional criteria based on other
114 |     values of the record"""
115 | 
116 |     return_value = value = record.get(field)
117 | 
118 |     try:
119 |         # Do transformation only if required
120 |         if is_transform_required(record, when):
121 | 
122 |             # transforming fields nested in value dictionary
123 |             if isinstance(value, dict) and field_paths:
124 |                 for field_path in field_paths:
125 |                     try:
126 |                         field_val = get_xpath(value, field_path)
127 |                         set_xpath(value, field_path, _transform_value(field_val, trans_type))
128 |                     except KeyError:
129 |                         LOGGER.error('Field path %s does not exist', field_path)
130 | 
131 |                 return_value = value
132 | 
133 |             else:
134 |                 return_value = _transform_value(value, trans_type)
135 | 
136 |         # Return the original value if transformation is not required
137 |         else:
138 |             return_value = value
139 | 
140 |         return return_value
141 | 
142 |     # Return the original value if cannot transform
143 |     except Exception:
144 |         return return_value
145 | 
146 | 
147 | def _transform_value(value: Any, trans_type: str) -> Any:
148 |     """
149 |     Applies the given transformation type to the given value
150 |     Args:
151 |         value: value to transform
152 |         trans_type: transformation type to apply
153 | 
154 |     Returns:
155 |         transformed value
156 |     """
157 |     # Transforms any input to NULL
158 |     if trans_type == "SET-NULL":
159 |         return_value = None
160 | 
161 |     # Transforms string input to hash
162 |     elif trans_type == "HASH":
163 |         return_value = hashlib.sha256(value.encode('utf-8')).hexdigest()
164 | 
165 |     # Transforms string input to hash skipping first n characters, e.g. HASH-SKIP-FIRST-2
166 |     elif 'HASH-SKIP-FIRST' in trans_type:
167 |         return_value = value[:int(trans_type[-1])] + \
168 |                        hashlib.sha256(value.encode('utf-8')[int(trans_type[-1]):]).hexdigest()
169 | 
170 |     # Transforms any date to stg
171 |     elif trans_type == "MASK-DATE":
172 |         return_value = parser.parse(value).replace(month=1, day=1).isoformat()
173 | 
174 |     # Transforms any number to zero
175 |     elif trans_type == "MASK-NUMBER":
176 |         return_value = 0
177 | 
178 |     # Transforms any value to "hidden"
179 |     elif trans_type == "MASK-HIDDEN":
180 |         return_value = 'hidden'
181 | 
182 |     # Transforms string input to masked version skipping first and last n characters
183 |     # e.g. MASK-STRING-SKIP-ENDS-3
184 |     elif 'MASK-STRING-SKIP-ENDS' in trans_type:
185 |         skip_ends_n = int(trans_type[-1])
186 |         value_len = len(value)
187 |         return_value = '*' * value_len if value_len <= (2 * skip_ends_n) \
188 |             else f'{value[:skip_ends_n]}{"*" * (value_len - (2 * skip_ends_n))}{value[-skip_ends_n:]}'
189 | 
190 |     # Return the original value if cannot find transformation type
191 |     # todo: is this the right behavior?
192 |     else:
193 |         LOGGER.warning('Cannot find transformation type %s, returning same value', trans_type)
194 |         return_value = value
195 | 
196 |     return return_value
197 | 


--------------------------------------------------------------------------------
/transform_field/utils.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from typing import Dict
 4 | from singer import Catalog, get_logger, Schema
 5 | from singer.utils import check_config, load_json
 6 | 
 7 | 
 8 | LOGGER = get_logger('transform_field')
 9 | 
10 | 
11 | def parse_args(required_config_keys):
12 |     """
13 |     Parse standard command-line args.
14 | 
15 |     Parses the command-line arguments mentioned in the SPEC and the BEST_PRACTICES documents:
16 | 
17 |     -c,--config     Config file
18 |     --validate     flag  to validate the transformations
19 |     --catalog       Catalog file
20 | 
21 |     Returns the parsed args object from argparse. For each argument that
22 |     point to JSON files (config, catalog), we will automatically
23 |     load and parse the JSON file.
24 |     """
25 |     parser = argparse.ArgumentParser()
26 | 
27 |     parser.add_argument(
28 |         '-c', '--config',
29 |         help='Config file',
30 |         required=True)
31 | 
32 |     parser.add_argument(
33 |         '--validate',
34 |         help='Flag to trigger one-off validation of transformations in config file using the catalog',
35 |         default=False,
36 |         action='store_true'
37 |     )
38 | 
39 |     parser.add_argument(
40 |         '--catalog',
41 |         help='Catalog file')
42 | 
43 |     args = parser.parse_args()
44 | 
45 |     if args.config:
46 |         setattr(args, 'config_path', args.config)
47 |         args.config = load_json(args.config)
48 | 
49 |     if args.catalog:
50 |         setattr(args, 'catalog_path', args.catalog)
51 |         args.catalog = Catalog.load(args.catalog)
52 | 
53 |     check_config(args.config, required_config_keys)
54 | 
55 |     return args
56 | 
57 | 
58 | def get_stream_schemas(catalog: Catalog) -> Dict[str, Schema]:
59 |     """
60 |     Build a map of streams with their schemas
61 |     :param catalog:
62 |     :return: Dictionary mapping stream ID to its schema
63 |     """
64 |     return {
65 |         stream.tap_stream_id: stream.schema
66 |         for stream in catalog.streams if stream.is_selected()
67 |     }
68 | 


--------------------------------------------------------------------------------