├── tests ├── __init__.py ├── test_postgres_drop_table_field_selection.py ├── unittests │ ├── test_unsupported_pk.py │ ├── test_clear_state_on_replication_change.py │ ├── utils.py │ └── test_full_table_interruption.py ├── db_utils.py ├── test_postgres_views_logical_replication.py ├── test_postgres_views_full_table.py ├── test_postgres_views_incremental_replication.py ├── test_postgres_logical_replication_multiple_tables.py ├── test_postgres_logical_replication_multiple_dbs.py ├── test_postgres_full_table_replication_arrays.py └── test_postgres_discovery.py ├── tap_postgres ├── sync_strategies │ ├── __init__.py │ ├── common.py │ ├── incremental.py │ ├── full_table.py │ └── logical_replication.py └── db.py ├── Makefile ├── .circleci ├── docker-entrypoint-initdb.d │ └── init-permissions.sh ├── postgresql.conf ├── Dockerfile ├── server.key ├── config.yml └── server.crt ├── README.md ├── .github └── pull_request_template.md ├── setup.py ├── CHANGELOG.md ├── .gitignore └── bin └── test-db /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | nosetests -v tests/unittests 3 | -------------------------------------------------------------------------------- /.circleci/docker-entrypoint-initdb.d/init-permissions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | { echo "host replication $POSTGRES_USER 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-postgres 2 | 3 | ## Set `rds.logical_replication = 1` in the parameter group (requires a reboot) 4 | 5 | This should also set `max_wal_senders` and `max_replication_slots` to values greater than 0. 6 | 7 | Singer tap for PostgreSQL supporting Full Table & Logical Replication 8 | using the wal2json decoder plugin. 
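The tap is configured with a standard Singer `config.json`. A minimal sketch of the connection settings read by `tap_postgres/db.py` (values are placeholders; `sslmode` and `filter_dbs` are optional):

```
{
  "host": "localhost",
  "port": "5432",
  "user": "postgres",
  "password": "secret",
  "dbname": "postgres"
}
```

A logical replication slot for the tap can then be created with: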
9 | 10 | ``` 11 | SELECT * FROM pg_create_logical_replication_slot('stitch', 'wal2json'); 12 | ``` 13 | 14 | --- 15 | 16 | Copyright © 2018 Stitch 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.circleci/postgresql.conf: -------------------------------------------------------------------------------- 1 | # LOGGING 2 | log_min_error_statement = fatal 3 | 4 | # CONNECTION 5 | listen_addresses = '*' 6 | 7 | # MODULES 8 | #shared_preload_libraries = 'decoderbufs' 9 | 10 | # REPLICATION 11 | wal_level = logical # minimal, archive, hot_standby, or logical (change requires restart) 12 | max_wal_senders = 5 # max number of walsender processes (change requires restart) 13 | #wal_keep_segments = 4 # in logfile segments, 16MB each; 0 disables 14 | #wal_sender_timeout = 60s # in milliseconds; 0 disables 15 | max_replication_slots = 5 # max number of replication slots (change requires restart) 16 | 17 | # SSL 18 | ssl = on 19 | ssl_cert_file = '/var/lib/postgresql/server.crt' 20 | ssl_key_file = '/var/lib/postgresql/server.key' 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='tap-postgres', 6 | version='0.2.1', 7 | description='Singer.io tap for extracting data from PostgreSQL', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | install_requires=[ 12 | 'singer-python==5.3.1', 13 | 'psycopg2==2.7.4', 14 | 'strict-rfc3339==0.7', 15 | ], 16 | extras_require={ 17 | 'dev': [ 18 | 'ipdb', 19 | 'pylint==2.6.0', 20 | 'nose==1.3.7', 21 | ] 22 | }, 23 | entry_points=''' 24 | [console_scripts] 25 | tap-postgres=tap_postgres:main 26 | ''', 27 | packages=['tap_postgres', 'tap_postgres.sync_strategies'] 28 | ) 29 | -------------------------------------------------------------------------------- /.circleci/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:9.6 2 | 3 | # Git SHA of v2.2 4 | ENV WAL2JSON_COMMIT_ID=9f9762315062888f7f7f4f0a115073a33ad1275e 5 | 6 | # Compile the plugins from sources and install 7 | RUN apt-get update && apt-get install -y postgresql-server-dev-9.6 gcc git make pkgconf \ 8 | && git clone https://github.com/eulerto/wal2json -b master --single-branch \ 9 | && (cd /wal2json && git checkout $WAL2JSON_COMMIT_ID && make && make install) \ 10 | && rm -rf wal2json 11 | 12 | # Copy the custom configuration which will be passed down to the server 13 | COPY postgresql.conf /usr/local/share/postgresql/postgresql.conf 14 | 15 | # Copy the script which will initialize the replication permissions 16 | COPY /docker-entrypoint-initdb.d 
/docker-entrypoint-initdb.d 17 | 18 | # Copy the self-signed cert for general SSL testing 19 | # Must be owned by postgres:postgres according to https://www.postgresql.org/docs/9.6/ssl-tcp.html 20 | # NOTE: ONLY TO BE USED FOR TESTING, this is a publicly published keypair 21 | COPY server.key server.crt /var/lib/postgresql/ 22 | RUN chown postgres:postgres /var/lib/postgresql/server.* 23 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.0 4 | * Add support to discover partitioned tables [101](https://github.com/singer-io/tap-postgres/pull/101) 5 | 6 | ## 0.1.0 7 | * Add support for `wal2json` message format v2 via config parameter [91](https://github.com/singer-io/tap-postgres/pull/91) 8 | 9 | ## 0.0.70 10 | * Look up ssl status in `pg_stat_ssl` and `pg_stat_activity` tables [#84](https://github.com/singer-io/tap-postgres/pull/84) 11 | 12 | ## 0.0.69 13 | * Add `sslmode` log message when opening connection [#82](https://github.com/singer-io/tap-postgres/pull/82) 14 | 15 | ## 0.0.68 16 | * Respect `ssl` config property (bug fix) [#80](https://github.com/singer-io/tap-postgres/pull/80) 17 | 18 | ## 0.0.67 19 | * Make `bytea[]` fields have `"inclusion" : "unsupported"` metadata [#76](https://github.com/singer-io/tap-postgres/pull/76) 20 | 21 | ## 0.0.66 22 | * Fix sorting for full_table sync by xmin to use integer sorting rather than string sorting [#73](https://github.com/singer-io/tap-postgres/pull/73) 23 | 24 | ## 0.0.65 25 | * Add support for `int8[]` (`bigint[]`) array types to log-based replication [#69](https://github.com/singer-io/tap-postgres/pull/69) 26 | 27 | ## 0.0.64 28 | * Pass string to `decimal.Decimal` when handling numeric data type [#67](https://github.com/singer-io/tap-postgres/pull/67) 29 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import simplejson as json 3 | import singer 4 | from singer import metadata 5 | import tap_postgres.db as post_db 6 | 7 | def should_sync_column(md_map, field_name): 8 | field_metadata = md_map.get(('properties', field_name), {}) 9 | return singer.should_sync_field(field_metadata.get('inclusion'), 10 | field_metadata.get('selected'), 11 | True) 12 | 13 | def write_schema_message(schema_message): 14 | sys.stdout.write(json.dumps(schema_message, use_decimal=True) + '\n') 15 | sys.stdout.flush() 16 | 17 | def send_schema_message(stream, bookmark_properties): 18 | s_md = metadata.to_map(stream['metadata']) 19 | if s_md.get((), {}).get('is-view'): 20 | key_properties = s_md.get((), {}).get('view-key-properties', []) 21 | else: 22 | key_properties = s_md.get((), {}).get('table-key-properties', []) 23 | 24 | schema_message = {'type' : 'SCHEMA', 25 | 'stream' : post_db.calculate_destination_stream_name(stream, s_md), 26 | 'schema' : stream['schema'], 27 | 'key_properties' : key_properties, 28 | 'bookmark_properties': bookmark_properties} 29 | 30 | write_schema_message(schema_message) 31 | -------------------------------------------------------------------------------- /.circleci/server.key: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDBZuuEK7B9mbWw 3 | 
Fny3uoAE+Pp26PfCPEEhOh0QpA6KPMQs06Dv6Tx94TcpNIeSnSxp3dGA3WVuOyev 4 | SsrGhyr6DadaElzlXkB3+fpj/i06UMiG5iqyO/w+KySz5q5hWyws9PNZlkC4nW7x 5 | V6l+KNMqS3HbFogpbreLDmrCVVvETE3t5Eq+NtmPms91eRIAAX/W02Mg5Hi5be+t 6 | I+zvTSBwbE28JHxZwqvMd7SZSax7dHiXhN+5cG/jyXFiVM5jw2LNqD+umejIdrN6 7 | iGYuXsm2nlf45+p8mLEDbjaqiWaPONUJ5/e4MoQTMq557xjV5WrNiWMm4MnPrdvl 8 | ffFhr4GpAgMBAAECggEABOJLekf8Kf/StcKrnZwpFXcQJCaX8yDAaE1mZIAwGc+V 9 | CKjDfKuAKpGgafr4nXw4nefLHZz5rcHyq5uQ6ViKfkwP+NdT3zr1F9KJPzMxAzL9 10 | DWMMmvmm0g8W2zAtISpDQFNjPdTsh3z6Sz/yeMwhIQVMt8Km55zzJ6DSk1vCeH+F 11 | gW555Hez0qL/GKLJX4pRU45getXnqt/oQOnMEpe2Ar21GJO8JYWNC954SWacE90S 12 | 7p4Y/Y4BmlCvaF0Kr8qxs2jDQmWKHYMuuxnMsKzz4u3f6wdRCEPt++Z6jMnHmjA6 13 | Mp7i8Zm5cWKehbS4hLRa/uA1JbFZXdHwJsaHeJIv3QKBgQD1cmGUYxp/GXSFjH2A 14 | 1c03FiTcifp0ui8AjFBNb5pB7aHcsLK2EZRsV8PpgvpzpaDd9iYSmxxA7Xq9gBaV 15 | daRikRVvf12FvLkiZmmpulG3DyzvpP+CEGur102+gNjQXyBrtpJ9hG8rC7PHTCoG 16 | ZIraVocQ6Ft/T8bRMEE59gctewKBgQDJt69dk6H0UYSIJkFT3CMikMamk3ObAGT3 17 | WquP+GJ+2NaIM/9aHnaNMkN4MnpN9B73VUME26k8D3nm6+smZNvK2uWeqU/MONye 18 | zTF8L7yNBsO47rWCAoiNyJgfzDXc4gHEnKP4CeU3cjebzEg1Vdb5xKDeF8XwcrUV 19 | bOUgvnc6KwKBgQC8UHfBi4/GuFcIJ9Qaxu7eNuUtN8erSzXIq97oqpmlv5aSZheX 20 | TUGdJnEvdciGditIYRSw7cTto8aqId4x6cKnxTy3APdWJoe8durWyBbt5nzJmMRY 21 | nBSgEV6arOysYm/TdI5MMxG/6wiR/kO4B+fowL58IGoi8ahO00EYIUU8hQKBgDac 22 | i1bLVGp/82Ck8sTQcZa3GYEZpI/PYIZzPsWAmrH65MIFSdnNK414kTmmeORH9mZB 23 | 6B4VllDTY854CrbfUX4vG0GEVz1UG67GoOIdTm/j5/NWdT+Yjf3M1Bqvv9loOtBP 24 | FDlf/HWxb4q3mMkPz17ZtC/MweMiOxJs4++kgUT3AoGBAJDNpcpbaANd8WDGnb+o 25 | xHgl7lO8c897HEyF7Ea9aI4d+NK/NThOJPANHSBovH9AulFipVlTQs6FTMNxI19d 26 | lGiFNwUbuVNDQucnPu5Goc0VFjI9Rwn9GcwH2vsJ9emxKlsl9VDoTl5HVgItYZK5 27 | VcTFh/izUO6ONHyrlkC7+6Pe 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Emacs 104 | .tramp_history 105 | 106 | env-vars.txt 107 | tap_oracle/__pycache__/ 108 | *~ 109 | config.json 110 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | slack: circleci/slack@3.4.2 4 | 5 | jobs: 6 | build: 7 | docker: 8 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 9 | - image: singerio/postgres:9.6-wal2json-2.2-ssl 10 | environment: 11 | POSTGRES_USER: postgres 12 | POSTGRES_PASSWORD: password 13 | command: [postgres, -c, config_file=/usr/local/share/postgresql/postgresql.conf] 14 | steps: 15 | - checkout 16 | - run: 17 | name: 'Setup virtual env' 18 | command: | 19 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh 20 | source dev_env.sh 21 | export LC_ALL=C 22 | apt-get -qq update 23 | apt-get -qq install bundler libpq-dev 24 | pyenv local 3.5.6 25 | python3 -m venv /usr/local/share/virtualenvs/tap-postgres 26 | source /usr/local/share/virtualenvs/tap-postgres/bin/activate 27 | pip install -U 'pip<19.2' 'setuptools<51.0.0' 28 | pip install .[dev] 29 | source dev_env.sh 30 | make test 31 | pylint tap_postgres -d missing-docstring,invalid-name,line-too-long,too-many-locals,too-few-public-methods,fixme,stop-iteration-return,duplicate-code,useless-import-alias,bare-except,raise-missing-from 32 | - run: 33 | when: always 34 | name: 'Integration Tests' 35 | command: | 36 | source dev_env.sh 37 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 38 | apt-get -qq update 39 | apt-get -qq install bundler libpq-dev 40 | pip install psycopg2==2.8.4 41 | run-test --tap=tap-postgres tests 42 | - slack/notify-on-failure: 43 | only_for_branches: master 44 | 45 | workflows: 46 | version: 2 47 | commit: &commit_jobs 48 | jobs: 49 | - build: 50 | context: 51 | - circleci-user 52 | - tap-tester-user 53 | build_daily: 54 | <<: *commit_jobs 55 | triggers: 56 | - schedule: 57 | cron: "0 1 * * *" 58 | filters: 59 | branches: 60 | only: 61 | - master 62 | -------------------------------------------------------------------------------- /bin/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | import subprocess 6 | import time 7 | from argparse 
import RawTextHelpFormatter 8 | 9 | full_image_name = "singerio/postgres:9.6-wal2json-2.2-ssl" 10 | 11 | def start_container(name): 12 | START_COMMAND = """ 13 | sudo docker run -e "POSTGRES_USER={0}" -e "POSTGRES_PASSWORD={1}" -p {2}:5432 --name {3} -d {4} \ 14 | postgres -c config_file=/usr/local/share/postgresql/postgresql.conf 15 | """.format(os.getenv('TAP_POSTGRES_USER'), 16 | os.getenv('TAP_POSTGRES_PASSWORD'), 17 | os.getenv('TAP_POSTGRES_PORT'), 18 | name, 19 | full_image_name) 20 | 21 | print("Starting Docker process {} using command: {}".format(name, START_COMMAND)) 22 | 23 | proc = subprocess.run(START_COMMAND, shell=True) 24 | if proc.returncode != 0: 25 | sys.exit("Exited with code: {}, the docker process failed to start.".format(proc.returncode)) 26 | print("Process started successfully.") 27 | 28 | def get_ip_addr(name): 29 | IP_ADDR_COMMAND = "docker inspect {} | jq -r .[].NetworkSettings.IPAddress" 30 | print("Retrieving IP addr of postgres container") 31 | ip_addr = subprocess.check_output(IP_ADDR_COMMAND.format(name), shell=True).decode('utf-8').rstrip() 32 | print(ip_addr) 33 | return ip_addr 34 | 35 | 36 | def stop_container(name): 37 | STOP_COMMAND = "sudo docker stop {0} && sudo docker rm {0}" 38 | 39 | print("Stopping Docker process {}".format(name)) 40 | proc = subprocess.run(STOP_COMMAND.format(name), shell=True) 41 | if proc.returncode != 0: 42 | sys.exit("Exited with code: {}, the docker process failed to stop.".format(proc.returncode)) 43 | print("Process stopped successfully") 44 | 45 | def connect_to_db(name): 46 | CONNECT_COMMAND = 'docker run -it --rm -e "PGPASSWORD={}" {} psql --host {} -U {}' 47 | 48 | ip_addr = get_ip_addr(name) 49 | 50 | print("Attempting to connect to running container using a postgres container via psql") 51 | connect_command_format = CONNECT_COMMAND.format(os.getenv('TAP_POSTGRES_PASSWORD'), 52 | full_image_name, 53 | ip_addr, 54 | os.getenv('TAP_POSTGRES_USER')) 55 | print(connect_command_format) 56 | # NB: Using call instead of run here because it is blocking 57 | # This returns only an exit code. 58 | returncode = subprocess.call(connect_command_format, 59 | shell=True) 60 | if returncode != 0: 61 | sys.exit("Exited with code: {}, could not connect.".format(returncode)) 62 | 63 | DESCRIPTION = """ 64 | Manage docker instance for tap-postgres testing. 
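Starts the singerio/postgres:9.6-wal2json-2.2-ssl image; in addition to the variables below, TAP_POSTGRES_PORT is read to choose the published port.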
65 | 66 | Uses environment variables: 67 | TAP_POSTGRES_USER 68 | TAP_POSTGRES_PASSWORD 69 | """ 70 | parser = argparse.ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 71 | parser.add_argument('action', choices=['start','stop', 'connect'], help='action to perform with the container') 72 | parser.add_argument('--name', help="name assigned to running docker process", default='postgres1') 73 | 74 | def main(): 75 | parsed_args = parser.parse_args() 76 | # Potential arguments to add: pull, changing docker cointainer, changing password 77 | if parsed_args.action == 'start': 78 | start_container(parsed_args.name) 79 | elif parsed_args.action == 'stop': 80 | stop_container(parsed_args.name) 81 | elif parsed_args.action == 'connect': 82 | connect_to_db(parsed_args.name) 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /tests/test_postgres_drop_table_field_selection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | test_schema_name = "public" 13 | test_table_name = "postgres_drop_table_test" 14 | 15 | def canonicalized_table_name(schema, table, cur): 16 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 17 | 18 | class PostgresDropTable(unittest.TestCase): 19 | 20 | @staticmethod 21 | def name(): 22 | return "tap_tester_postgres_drop_table_field_selection" 23 | 24 | @staticmethod 25 | def get_properties(): 26 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 27 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 28 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 29 | 'user' : os.getenv('TAP_POSTGRES_USER'), 30 | 'default_replication_method' : 'LOG_BASED', 31 | 'filter_dbs' : 'discovery0' 32 | } 33 | 34 | @staticmethod 35 | def get_credentials(): 36 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 37 | 38 | @staticmethod 39 | def get_type(): 40 | return "platform.postgres" 41 | 42 | @staticmethod 43 | def tap_name(): 44 | return "tap-postgres" 45 | 46 | @staticmethod 47 | def expected_check_streams(): 48 | return { 'discovery0-public-postgres_drop_table_test'} 49 | 50 | 51 | def setUp(self): 52 | db_utils.ensure_environment_variables_set() 53 | 54 | db_utils.ensure_db('discovery0') 55 | 56 | with db_utils.get_test_connection('discovery0') as conn: 57 | conn.autocommit = True 58 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 59 | old_table = cur.execute("""SELECT EXISTS ( 60 | SELECT 1 61 | FROM information_schema.tables 62 | WHERE table_schema = %s 63 | AND table_name = %s);""", 64 | [test_schema_name, test_table_name]) 65 | old_table = cur.fetchone()[0] 66 | if old_table: 67 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 68 | 69 | 70 | cur = conn.cursor() 71 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 72 | if cur.fetchone()[0] is None: 73 | cur.execute(""" CREATE EXTENSION hstore; """) 74 | 75 | #pylint: disable=line-too-long 76 | create_table_sql = 'CREATE TABLE {} (id SERIAL PRIMARY KEY)'.format(canonicalized_table_name(test_schema_name, test_table_name, cur)) 77 | 78 | 
cur.execute(create_table_sql) 79 | 80 | def test_run(self): 81 | conn_id = connections.ensure_connection(self) 82 | 83 | # Run discovery 84 | check_job_name = runner.run_check_mode(self, conn_id) 85 | 86 | # Verify check exit codes 87 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 88 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 89 | 90 | # There should not be any tables in this database 91 | with db_utils.get_test_connection('discovery0') as conn: 92 | cur = conn.cursor() 93 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 94 | 95 | # Run discovery again 96 | check_job_name = runner.run_check_mode(self, conn_id) 97 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 98 | 99 | # When discovery mode finds 0 tables, the tap returns an error 100 | self.assertEqual(exit_status['discovery_exit_status'], 1) 101 | -------------------------------------------------------------------------------- /.circleci/server.crt: -------------------------------------------------------------------------------- 1 | Certificate: 2 | Data: 3 | Version: 3 (0x2) 4 | Serial Number: 5 | 3a:7d:37:66:c9:08:92:63:75:dc:ea:bc:2e:73:3e:97:19:d8:da:95 6 | Signature Algorithm: sha256WithRSAEncryption 7 | Issuer: CN = localhost 8 | Validity 9 | Not Before: Dec 22 21:23:13 2020 GMT 10 | Not After : Dec 20 21:23:13 2030 GMT 11 | Subject: CN = localhost 12 | Subject Public Key Info: 13 | Public Key Algorithm: rsaEncryption 14 | RSA Public-Key: (2048 bit) 15 | Modulus: 16 | 00:c1:66:eb:84:2b:b0:7d:99:b5:b0:16:7c:b7:ba: 17 | 80:04:f8:fa:76:e8:f7:c2:3c:41:21:3a:1d:10:a4: 18 | 0e:8a:3c:c4:2c:d3:a0:ef:e9:3c:7d:e1:37:29:34: 19 | 87:92:9d:2c:69:dd:d1:80:dd:65:6e:3b:27:af:4a: 20 | ca:c6:87:2a:fa:0d:a7:5a:12:5c:e5:5e:40:77:f9: 21 | fa:63:fe:2d:3a:50:c8:86:e6:2a:b2:3b:fc:3e:2b: 22 | 24:b3:e6:ae:61:5b:2c:2c:f4:f3:59:96:40:b8:9d: 23 | 6e:f1:57:a9:7e:28:d3:2a:4b:71:db:16:88:29:6e: 24 | b7:8b:0e:6a:c2:55:5b:c4:4c:4d:ed:e4:4a:be:36: 25 | d9:8f:9a:cf:75:79:12:00:01:7f:d6:d3:63:20:e4: 26 | 78:b9:6d:ef:ad:23:ec:ef:4d:20:70:6c:4d:bc:24: 27 | 7c:59:c2:ab:cc:77:b4:99:49:ac:7b:74:78:97:84: 28 | df:b9:70:6f:e3:c9:71:62:54:ce:63:c3:62:cd:a8: 29 | 3f:ae:99:e8:c8:76:b3:7a:88:66:2e:5e:c9:b6:9e: 30 | 57:f8:e7:ea:7c:98:b1:03:6e:36:aa:89:66:8f:38: 31 | d5:09:e7:f7:b8:32:84:13:32:ae:79:ef:18:d5:e5: 32 | 6a:cd:89:63:26:e0:c9:cf:ad:db:e5:7d:f1:61:af: 33 | 81:a9 34 | Exponent: 65537 (0x10001) 35 | X509v3 extensions: 36 | X509v3 Subject Key Identifier: 37 | 09:9A:C9:F9:7C:C8:5D:EC:22:04:E1:B0:EB:84:05:30:AC:54:E2:79 38 | X509v3 Authority Key Identifier: 39 | keyid:09:9A:C9:F9:7C:C8:5D:EC:22:04:E1:B0:EB:84:05:30:AC:54:E2:79 40 | 41 | X509v3 Basic Constraints: critical 42 | CA:TRUE 43 | Signature Algorithm: sha256WithRSAEncryption 44 | 11:75:a1:9a:cc:48:86:3b:12:c6:c6:b5:fa:64:d3:d9:9f:d1: 45 | 3d:31:59:36:af:2c:42:4c:cb:4b:3e:d1:28:ee:9f:d8:f7:19: 46 | 90:ef:03:82:4c:8c:e6:d5:ef:44:2b:3f:1d:d7:dd:f8:1a:32: 47 | 71:c1:b5:09:15:54:0d:a5:f9:75:2b:53:77:9a:63:67:d8:a3: 48 | 52:c4:e2:5b:70:0f:e7:3d:73:b6:8a:b6:98:79:9f:42:ee:ee: 49 | f7:21:5c:1a:17:ef:d7:22:60:73:97:0d:78:1b:ef:f2:9a:9b: 50 | f4:17:3b:0b:2a:c2:9a:76:1c:fe:d5:ec:7f:9e:ef:f5:f5:50: 51 | f1:c6:0a:f5:ca:97:19:d4:fe:1e:9a:6b:9e:c1:9c:aa:5b:77: 52 | 83:f3:d3:d6:de:1a:4d:f8:2b:df:4a:ba:49:26:b2:15:a5:5d: 53 | e8:0a:7c:85:7e:41:4d:64:3d:a1:65:8f:41:fb:4d:df:7b:eb: 54 | 3d:16:f7:4a:05:b9:9b:81:6e:d4:e3:ca:be:95:08:6b:3c:2a: 55 | c9:4d:8c:68:ce:37:5b:4f:ab:e0:81:7b:9c:51:95:48:f2:41: 56 | 
4d:b0:97:14:72:c6:02:31:4b:ec:80:a3:9c:e0:09:98:9a:dc: 57 | d4:b3:f6:c9:2a:04:5e:8c:ec:0e:c0:40:96:24:e4:70:15:4e: 58 | c7:44:19:31 59 | -----BEGIN CERTIFICATE----- 60 | MIIDCTCCAfGgAwIBAgIUOn03ZskIkmN13Oq8LnM+lxnY2pUwDQYJKoZIhvcNAQEL 61 | BQAwFDESMBAGA1UEAwwJbG9jYWxob3N0MB4XDTIwMTIyMjIxMjMxM1oXDTMwMTIy 62 | MDIxMjMxM1owFDESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG9w0BAQEF 63 | AAOCAQ8AMIIBCgKCAQEAwWbrhCuwfZm1sBZ8t7qABPj6duj3wjxBITodEKQOijzE 64 | LNOg7+k8feE3KTSHkp0sad3RgN1lbjsnr0rKxocq+g2nWhJc5V5Ad/n6Y/4tOlDI 65 | huYqsjv8Pisks+auYVssLPTzWZZAuJ1u8VepfijTKktx2xaIKW63iw5qwlVbxExN 66 | 7eRKvjbZj5rPdXkSAAF/1tNjIOR4uW3vrSPs700gcGxNvCR8WcKrzHe0mUmse3R4 67 | l4TfuXBv48lxYlTOY8Nizag/rpnoyHazeohmLl7Jtp5X+OfqfJixA242qolmjzjV 68 | Cef3uDKEEzKuee8Y1eVqzYljJuDJz63b5X3xYa+BqQIDAQABo1MwUTAdBgNVHQ4E 69 | FgQUCZrJ+XzIXewiBOGw64QFMKxU4nkwHwYDVR0jBBgwFoAUCZrJ+XzIXewiBOGw 70 | 64QFMKxU4nkwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAEXWh 71 | msxIhjsSxsa1+mTT2Z/RPTFZNq8sQkzLSz7RKO6f2PcZkO8DgkyM5tXvRCs/Hdfd 72 | +BoyccG1CRVUDaX5dStTd5pjZ9ijUsTiW3AP5z1ztoq2mHmfQu7u9yFcGhfv1yJg 73 | c5cNeBvv8pqb9Bc7CyrCmnYc/tXsf57v9fVQ8cYK9cqXGdT+HpprnsGcqlt3g/PT 74 | 1t4aTfgr30q6SSayFaVd6Ap8hX5BTWQ9oWWPQftN33vrPRb3SgW5m4Fu1OPKvpUI 75 | azwqyU2MaM43W0+r4IF7nFGVSPJBTbCXFHLGAjFL7ICjnOAJmJrc1LP2ySoEXozs 76 | DsBAliTkcBVOx0QZMQ== 77 | -----END CERTIFICATE----- 78 | -------------------------------------------------------------------------------- /tests/unittests/test_unsupported_pk.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tap_postgres 3 | import psycopg2 4 | import psycopg2.extras 5 | import os 6 | import pdb 7 | import singer 8 | from singer import get_logger, metadata, write_bookmark 9 | from utils import ensure_db, get_test_connection, ensure_test_table, select_all_of_stream, set_replication_method_for_stream, insert_record, get_test_connection_config 10 | import decimal 11 | import math 12 | import pytz 13 | import strict_rfc3339 14 | import copy 15 | 16 | LOGGER = get_logger() 17 | 18 | def do_not_dump_catalog(catalog): 19 | pass 20 | 21 | tap_postgres.dump_catalog = do_not_dump_catalog 22 | 23 | class Unsupported(unittest.TestCase): 24 | maxDiff = None 25 | table_name = 'CHICKEN TIMES' 26 | 27 | def setUp(self): 28 | ensure_db() 29 | with get_test_connection() as conn: 30 | cur = conn.cursor() 31 | table_spec = {"columns": [{"name": "interval_col", "type": "INTERVAL"}, 32 | {"name": "bit_string_col", "type": "bit(5)"}, 33 | {"name": "bytea_col", "type": "bytea"}, 34 | {"name": "point_col", "type": "point"}, 35 | {"name": "line_col", "type": "line"}, 36 | {"name": "lseg_col", "type": "lseg"}, 37 | {"name": "box_col", "type": "box"}, 38 | {"name": "polygon_col", "type": "polygon"}, 39 | {"name": "circle_col", "type": "circle"}, 40 | {"name": "xml_col", "type": "xml"}, 41 | {"name": "composite_col", "type": "person_composite"}, 42 | {"name": "int_range_col", "type": "int4range"}, 43 | ], 44 | "name": Unsupported.table_name} 45 | with get_test_connection() as conn: 46 | cur = conn.cursor() 47 | cur.execute(""" DROP TYPE IF EXISTS person_composite CASCADE """) 48 | cur.execute(""" CREATE TYPE person_composite AS (age int, name text) """) 49 | 50 | ensure_test_table(table_spec) 51 | 52 | def test_catalog(self): 53 | conn_config = get_test_connection_config() 54 | streams = tap_postgres.do_discovery(conn_config) 55 | chicken_streams = [s for s in streams if s['tap_stream_id'] == "postgres-public-CHICKEN TIMES"] 56 | 57 | 
self.assertEqual(len(chicken_streams), 1) 58 | stream_dict = chicken_streams[0] 59 | stream_dict.get('metadata').sort(key=lambda md: md['breadcrumb']) 60 | 61 | self.assertEqual(metadata.to_map(stream_dict.get('metadata')), 62 | {(): {'is-view': False, 'table-key-properties': [], 'row-count': 0, 'schema-name': 'public', 'database-name': 'postgres'}, 63 | ('properties', 'bytea_col'): {'sql-datatype': 'bytea', 'selected-by-default': False, 'inclusion': 'unsupported'}, 64 | ('properties', 'bit_string_col'): {'sql-datatype': 'bit(5)', 'selected-by-default': False, 'inclusion': 'unsupported'}, 65 | ('properties', 'line_col'): {'sql-datatype': 'line', 'selected-by-default': False, 'inclusion': 'unsupported'}, 66 | ('properties', 'xml_col'): {'sql-datatype': 'xml', 'selected-by-default': False, 'inclusion': 'unsupported'}, 67 | ('properties', 'int_range_col'): {'sql-datatype': 'int4range', 'selected-by-default': False, 'inclusion': 'unsupported'}, 68 | ('properties', 'circle_col'): {'sql-datatype': 'circle', 'selected-by-default': False, 'inclusion': 'unsupported'}, 69 | ('properties', 'polygon_col'): {'sql-datatype': 'polygon', 'selected-by-default': False, 'inclusion': 'unsupported'}, 70 | ('properties', 'box_col'): {'sql-datatype': 'box', 'selected-by-default': False, 'inclusion': 'unsupported'}, 71 | ('properties', 'lseg_col'): {'sql-datatype': 'lseg', 'selected-by-default': False, 'inclusion': 'unsupported'}, 72 | ('properties', 'composite_col'): {'sql-datatype': 'person_composite', 'selected-by-default': False, 'inclusion': 'unsupported'}, 73 | ('properties', 'interval_col'): {'sql-datatype': 'interval', 'selected-by-default': False, 'inclusion': 'unsupported'}, 74 | ('properties', 'point_col'): {'sql-datatype': 'point', 'selected-by-default': False, 'inclusion': 'unsupported'}} 75 | ) 76 | 77 | 78 | if __name__== "__main__": 79 | test1 = Unsupported() 80 | test1.setUp() 81 | test1.test_catalog() 82 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/incremental.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | import psycopg2 4 | import psycopg2.extras 5 | import singer 6 | from singer import utils 7 | import singer.metrics as metrics 8 | import tap_postgres.db as post_db 9 | 10 | 11 | LOGGER = singer.get_logger() 12 | 13 | UPDATE_BOOKMARK_PERIOD = 1000 14 | 15 | def fetch_max_replication_key(conn_config, replication_key, schema_name, table_name): 16 | with post_db.open_connection(conn_config, False) as conn: 17 | with conn.cursor() as cur: 18 | max_key_sql = """SELECT max({}) 19 | FROM {}""".format(post_db.prepare_columns_sql(replication_key), 20 | post_db.fully_qualified_table_name(schema_name, table_name)) 21 | LOGGER.info("determine max replication key value: %s", max_key_sql) 22 | cur.execute(max_key_sql) 23 | max_key = cur.fetchone()[0] 24 | LOGGER.info("max replication key value: %s", max_key) 25 | return max_key 26 | 27 | def sync_table(conn_info, stream, state, desired_columns, md_map): 28 | time_extracted = utils.now() 29 | 30 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 31 | if stream_version is None: 32 | stream_version = int(time.time() * 1000) 33 | 34 | state = singer.write_bookmark(state, 35 | stream['tap_stream_id'], 36 | 'version', 37 | stream_version) 38 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 39 | 40 | schema_name = md_map.get(()).get('schema-name') 41 | 42 | 
escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 43 | 44 | activate_version_message = singer.ActivateVersionMessage( 45 | stream=post_db.calculate_destination_stream_name(stream, md_map), 46 | version=stream_version) 47 | 48 | 49 | singer.write_message(activate_version_message) 50 | 51 | replication_key = md_map.get((), {}).get('replication-key') 52 | replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'], 'replication_key_value') 53 | replication_key_sql_datatype = md_map.get(('properties', replication_key)).get('sql-datatype') 54 | 55 | hstore_available = post_db.hstore_available(conn_info) 56 | with metrics.record_counter(None) as counter: 57 | with post_db.open_connection(conn_info) as conn: 58 | 59 | # Client side character encoding defaults to the value in postgresql.conf under client_encoding. 60 | # The server / db can also have its own configred encoding. 61 | with conn.cursor() as cur: 62 | cur.execute("show server_encoding") 63 | LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0]) 64 | cur.execute("show client_encoding") 65 | LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0]) 66 | 67 | if hstore_available: 68 | LOGGER.info("hstore is available") 69 | psycopg2.extras.register_hstore(conn) 70 | else: 71 | LOGGER.info("hstore is UNavailable") 72 | 73 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 74 | cur.itersize = post_db.cursor_iter_size 75 | LOGGER.info("Beginning new incremental replication sync %s", stream_version) 76 | if replication_key_value: 77 | select_sql = """SELECT {} 78 | FROM {} 79 | WHERE {} >= '{}'::{} 80 | ORDER BY {} ASC""".format(','.join(escaped_columns), 81 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 82 | post_db.prepare_columns_sql(replication_key), replication_key_value, replication_key_sql_datatype, 83 | post_db.prepare_columns_sql(replication_key)) 84 | else: 85 | #if not replication_key_value 86 | select_sql = """SELECT {} 87 | FROM {} 88 | ORDER BY {} ASC""".format(','.join(escaped_columns), 89 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 90 | post_db.prepare_columns_sql(replication_key)) 91 | 92 | LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize) 93 | cur.execute(select_sql) 94 | 95 | rows_saved = 0 96 | 97 | for rec in cur: 98 | record_message = post_db.selected_row_to_singer_message(stream, rec, stream_version, desired_columns, time_extracted, md_map) 99 | singer.write_message(record_message) 100 | rows_saved = rows_saved + 1 101 | 102 | #Picking a replication_key with NULL values will result in it ALWAYS been synced which is not great 103 | #event worse would be allowing the NULL value to enter into the state 104 | if record_message.record[replication_key] is not None: 105 | state = singer.write_bookmark(state, 106 | stream['tap_stream_id'], 107 | 'replication_key_value', 108 | record_message.record[replication_key]) 109 | 110 | 111 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 112 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 113 | 114 | counter.increment() 115 | 116 | return state 117 | -------------------------------------------------------------------------------- /tests/db_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psycopg2 3 | from psycopg2.extensions import quote_ident 4 | 5 | # steal top_map method from singer-python so we can remove singer imports from 
tests 6 | def to_map(raw_metadata): 7 | return {tuple(md['breadcrumb']): md['metadata'] for md in raw_metadata} 8 | 9 | def ensure_environment_variables_set(): 10 | missing_envs = [x for x in [os.getenv('TAP_POSTGRES_HOST'), 11 | os.getenv('TAP_POSTGRES_USER'), 12 | os.getenv('TAP_POSTGRES_PASSWORD'), 13 | os.getenv('TAP_POSTGRES_PORT'), 14 | os.getenv('TAP_POSTGRES_DBNAME')] if x is None] 15 | if len(missing_envs) != 0: 16 | raise Exception("Missing environment variables: {}".format(missing_envs)) 17 | 18 | def ensure_db(dbname=os.getenv('TAP_POSTGRES_DBNAME')): 19 | # Create database dev if not exists 20 | with get_test_connection('postgres') as conn: 21 | conn.autocommit = True 22 | with conn.cursor() as cur: 23 | cur.execute("SELECT 1 FROM pg_database WHERE datname = '{}'".format(dbname)) 24 | exists = cur.fetchone() 25 | if not exists: 26 | print("Creating database {}".format(dbname)) 27 | cur.execute("CREATE DATABASE {}".format(dbname)) 28 | 29 | def get_test_connection(dbname=os.getenv('TAP_POSTGRES_DBNAME'), logical_replication=False): 30 | 31 | conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'".format(os.getenv('TAP_POSTGRES_HOST'), 32 | dbname, 33 | os.getenv('TAP_POSTGRES_USER'), 34 | os.getenv('TAP_POSTGRES_PASSWORD'), 35 | os.getenv('TAP_POSTGRES_PORT')) 36 | 37 | if logical_replication: 38 | return psycopg2.connect(conn_string, connection_factory=psycopg2.extras.LogicalReplicationConnection) 39 | else: 40 | return psycopg2.connect(conn_string) 41 | 42 | def canonicalized_table_name(conn_cursor, schema, table): 43 | return "{}.{}".format(quote_ident(schema, conn_cursor), quote_ident(table, conn_cursor)) 44 | 45 | def ensure_replication_slot(conn_cursor, db_name=os.getenv('TAP_POSTGRES_DBNAME'), slot_name='stitch'): 46 | conn_cursor.execute("""SELECT EXISTS ( 47 | SELECT 1 48 | FROM pg_replication_slots 49 | WHERE slot_name = '{}') """, slot_name) 50 | 51 | old_slot = conn_cursor.fetchone()[0] 52 | 53 | with get_test_connection(db_name, True) as conn2: 54 | with conn2.cursor() as conn_2_cursor: 55 | if old_slot: 56 | conn_2_cursor.drop_replication_slot(slot_name) 57 | conn_2_cursor.create_replication_slot(slot_name, output_plugin='wal2json') 58 | 59 | def ensure_fresh_table(conn, conn_cursor, schema_name, table_name): 60 | """ 61 | If a table of the specified name and schema already exists, it was left over 62 | from a previous test run. Drop this table. 
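    After the drop, this helper also ensures the hstore and citext extensions exist,
    recreates the ALIGNMENT enum type used by the tests, and returns a second cursor
    opened on the same connection.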
63 | """ 64 | ctable_name = canonicalized_table_name(conn_cursor, schema_name, table_name) 65 | 66 | old_table = conn_cursor.execute("""SELECT EXISTS ( 67 | SELECT 1 68 | FROM information_schema.tables 69 | WHERE table_schema = %s 70 | AND table_name = %s);""", 71 | [schema_name, table_name]) 72 | old_table = conn_cursor.fetchone()[0] 73 | if old_table: 74 | conn_cursor.execute("DROP TABLE {}".format(ctable_name)) 75 | 76 | 77 | conn_cursor2 = conn.cursor() 78 | conn_cursor2.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 79 | if conn_cursor2.fetchone()[0] is None: 80 | conn_cursor2.execute(""" CREATE EXTENSION hstore; """) 81 | conn_cursor2.execute(""" CREATE EXTENSION IF NOT EXISTS citext WITH SCHEMA public;""") 82 | conn_cursor2.execute(""" DROP TYPE IF EXISTS ALIGNMENT CASCADE """) 83 | conn_cursor2.execute(""" CREATE TYPE ALIGNMENT AS ENUM ('good', 'bad', 'ugly') """) 84 | 85 | return conn_cursor2 86 | 87 | 88 | def insert_record(conn_cursor, table_name, data): 89 | our_keys = list(data.keys()) 90 | our_keys.sort() 91 | our_values = [data.get(key) for key in our_keys] 92 | 93 | columns_sql = ", \n ".join(our_keys) 94 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 95 | 96 | insert_sql = """ INSERT INTO {} 97 | ( {} ) 98 | VALUES ( {} )""".format(quote_ident(table_name, conn_cursor), columns_sql, value_sql) 99 | conn_cursor.execute(insert_sql, our_values) 100 | 101 | 102 | def update_record(conn_cursor, ctable_name, primary_key, data): 103 | """ 104 | Update an existing record as specified using the following params. 105 | :param conn_cursor: A pyschopg2 connection object. 106 | :param ctable_name: The canonicalized talbe name. 107 | :param primary_key: The value of the primary key 108 | of the record you want to update. 109 | :param data: A dictionary of fields to values to 110 | update in the record. 
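    Note: values are interpolated directly into the generated UPDATE statement
    rather than passed as bind parameters, so this helper is only intended for
    simple test data.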
111 | """ 112 | fields_to_update = "" 113 | for field, value in data.items(): 114 | if ' ' in field: 115 | field = quote_ident(field, conn_cursor) 116 | fields_to_update += " {} = '{}',".format(field, value) 117 | 118 | update_sql = "UPDATE {} SET{} WHERE id = {}".format(ctable_name, 119 | fields_to_update[:-1], 120 | primary_key) 121 | conn_cursor.execute(update_sql) 122 | 123 | def delete_record(conn_cursor, ctable_name, primary_key): 124 | # print("delete row from source db") 125 | # with db_utils.get_test_connection('dev') as conn: 126 | # with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 127 | # cur.execute("DELETE FROM {} WHERE id = 3".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 128 | 129 | conn_cursor.execute("DELETE FROM {} WHERE id = {}".format(ctable_name, primary_key)) 130 | -------------------------------------------------------------------------------- /tests/unittests/test_clear_state_on_replication_change.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tap_postgres 3 | 4 | tap_stream_id = 'chicken_table' 5 | 6 | class TestClearState(unittest.TestCase): 7 | 8 | def test_incremental_happy(self): 9 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 'last_replication_method' : 'INCREMENTAL'}}} 10 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 11 | self.assertEqual(nascent_state, state) 12 | 13 | def test_incremental_changing_replication_keys(self): 14 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 'last_replication_method' : 'INCREMENTAL'}}} 15 | 16 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at_2', 'INCREMENTAL') 17 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {'last_replication_method' : 'INCREMENTAL'}}}) 18 | 19 | def test_incremental_changing_replication_key_interrupted(self): 20 | xmin = '3737373' 21 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'xmin' : xmin, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 22 | 'last_replication_method' : 'INCREMENTAL'}}} 23 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at_2', 'INCREMENTAL') 24 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { 'last_replication_method' : 'INCREMENTAL'}}}) 25 | 26 | def test_full_table_to_incremental(self): 27 | #interrupted full table -> incremental 28 | xmin = '3737373' 29 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'xmin' : xmin, "last_replication_method" : "FULL_TABLE"}}} 30 | 31 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 32 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 33 | 34 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "last_replication_method" : "FULL_TABLE"}}} 35 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 36 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 37 | 38 | 39 | def test_log_based_to_incremental(self): 40 | state = 
{'bookmarks' : {tap_stream_id : { 'version' : 1, 'lsn' : 34343434, "last_replication_method" : "LOG_BASED"}}} 41 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 42 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 43 | 44 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'lsn' : 34343434, 'xmin' : 34343, "last_replication_method" : "LOG_BASED"}}} 45 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 46 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 47 | 48 | #full table tests 49 | def test_full_table_happy(self): 50 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "FULL_TABLE"}}} 51 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 52 | self.assertEqual(nascent_state, state) 53 | 54 | def test_full_table_interrupted(self): 55 | xmin = 333333 56 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "FULL_TABLE", 'xmin' : xmin}}} 57 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 58 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE", 'version': 88, 'xmin' : xmin}}}) 59 | 60 | def test_incremental_to_full_table(self): 61 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "INCREMENTAL", 'replication_key' : 'updated_at', 'replication_key_value' : 'i will be removed'}}} 62 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 63 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE"}}}) 64 | 65 | def test_log_based_to_full_table(self): 66 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : 343434}}} 67 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 68 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE"}}}) 69 | 70 | 71 | #log based tests 72 | def test_log_based_happy(self): 73 | lsn = 43434343 74 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : lsn}}} 75 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 76 | self.assertEqual(nascent_state, state) 77 | 78 | lsn = 43434343 79 | xmin = 11111 80 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : lsn, 'xmin' : xmin}}} 81 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 82 | self.assertEqual(nascent_state, state) 83 | 84 | def test_incremental_to_log_based(self): 85 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "INCREMENTAL", 'replication_key' : 'updated_at', 'replication_key_value' : 'i will be removed'}}} 86 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 87 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "LOG_BASED"}}}) 88 | 89 | def 
test_full_table_to_log_based(self): 90 | state = {'bookmarks' : {tap_stream_id : { 'version' : 2222, "last_replication_method" : "FULL_TABLE", 'xmin' : 2}}} 91 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 92 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "LOG_BASED"}}}) 93 | 94 | 95 | 96 | if __name__== "__main__": 97 | test1 = TestClearState() 98 | test1.test_full_table_to_log_based() 99 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/full_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=missing-docstring,not-an-iterable,too-many-locals,too-many-arguments,invalid-name,too-many-return-statements,too-many-branches,len-as-condition,too-many-nested-blocks,wrong-import-order,duplicate-code 3 | 4 | import copy 5 | import time 6 | import psycopg2 7 | import psycopg2.extras 8 | import singer 9 | from singer import utils 10 | import singer.metrics as metrics 11 | import tap_postgres.db as post_db 12 | 13 | LOGGER = singer.get_logger() 14 | 15 | UPDATE_BOOKMARK_PERIOD = 1000 16 | 17 | def sync_view(conn_info, stream, state, desired_columns, md_map): 18 | time_extracted = utils.now() 19 | 20 | #before writing the table version to state, check if we had one to begin with 21 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 22 | nascent_stream_version = int(time.time() * 1000) 23 | 24 | state = singer.write_bookmark(state, 25 | stream['tap_stream_id'], 26 | 'version', 27 | nascent_stream_version) 28 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 29 | 30 | schema_name = md_map.get(()).get('schema-name') 31 | 32 | escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 33 | 34 | activate_version_message = singer.ActivateVersionMessage( 35 | stream=post_db.calculate_destination_stream_name(stream, md_map), 36 | version=nascent_stream_version) 37 | 38 | if first_run: 39 | singer.write_message(activate_version_message) 40 | 41 | with metrics.record_counter(None) as counter: 42 | with post_db.open_connection(conn_info) as conn: 43 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 44 | cur.itersize = post_db.cursor_iter_size 45 | select_sql = 'SELECT {} FROM {}'.format(','.join(escaped_columns), 46 | post_db.fully_qualified_table_name(schema_name, stream['table_name'])) 47 | 48 | LOGGER.info("select %s with itersize %s", select_sql, cur.itersize) 49 | cur.execute(select_sql) 50 | 51 | rows_saved = 0 52 | for rec in cur: 53 | record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map) 54 | singer.write_message(record_message) 55 | rows_saved = rows_saved + 1 56 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 57 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 58 | 59 | counter.increment() 60 | 61 | #always send the activate version whether first run or subsequent 62 | singer.write_message(activate_version_message) 63 | 64 | return state 65 | 66 | 67 | def sync_table(conn_info, stream, state, desired_columns, md_map): 68 | time_extracted = utils.now() 69 | 70 | #before writing the table version to state, check if we had one to begin with 71 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 72 | 73 | #pick a 
new table version IFF we do not have an xmin in our state 74 | #the presence of an xmin indicates that we were interrupted last time through 75 | if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None: 76 | nascent_stream_version = int(time.time() * 1000) 77 | else: 78 | nascent_stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 79 | 80 | state = singer.write_bookmark(state, 81 | stream['tap_stream_id'], 82 | 'version', 83 | nascent_stream_version) 84 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 85 | 86 | schema_name = md_map.get(()).get('schema-name') 87 | 88 | escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 89 | 90 | activate_version_message = singer.ActivateVersionMessage( 91 | stream=post_db.calculate_destination_stream_name(stream, md_map), 92 | version=nascent_stream_version) 93 | 94 | if first_run: 95 | singer.write_message(activate_version_message) 96 | 97 | hstore_available = post_db.hstore_available(conn_info) 98 | with metrics.record_counter(None) as counter: 99 | with post_db.open_connection(conn_info) as conn: 100 | 101 | # Client side character encoding defaults to the value in postgresql.conf under client_encoding. 102 | # The server / db can also have its own configred encoding. 103 | with conn.cursor() as cur: 104 | cur.execute("show server_encoding") 105 | LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0]) 106 | cur.execute("show client_encoding") 107 | LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0]) 108 | 109 | if hstore_available: 110 | LOGGER.info("hstore is available") 111 | psycopg2.extras.register_hstore(conn) 112 | else: 113 | LOGGER.info("hstore is UNavailable") 114 | 115 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 116 | cur.itersize = post_db.cursor_iter_size 117 | 118 | xmin = singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') 119 | if xmin: 120 | LOGGER.info("Resuming Full Table replication %s from xmin %s", nascent_stream_version, xmin) 121 | select_sql = """SELECT {}, xmin::text::bigint 122 | FROM {} where age(xmin::xid) <= age('{}'::xid) 123 | ORDER BY xmin::text::bigint ASC""".format(','.join(escaped_columns), 124 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 125 | xmin) 126 | else: 127 | LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version) 128 | select_sql = """SELECT {}, xmin::text::bigint 129 | FROM {} 130 | ORDER BY xmin::text::bigint ASC""".format(','.join(escaped_columns), 131 | post_db.fully_qualified_table_name(schema_name, stream['table_name'])) 132 | 133 | 134 | LOGGER.info("select %s with itersize %s", select_sql, cur.itersize) 135 | cur.execute(select_sql) 136 | 137 | rows_saved = 0 138 | for rec in cur: 139 | xmin = rec['xmin'] 140 | rec = rec[:-1] 141 | record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map) 142 | singer.write_message(record_message) 143 | state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', xmin) 144 | rows_saved = rows_saved + 1 145 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 146 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 147 | 148 | counter.increment() 149 | 150 | #once we have completed the full table replication, discard the xmin bookmark. 
151 | #the xmin bookmark only comes into play when a full table replication is interrupted 152 | state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None) 153 | 154 | #always send the activate version whether first run or subsequent 155 | singer.write_message(activate_version_message) 156 | 157 | return state 158 | -------------------------------------------------------------------------------- /tap_postgres/db.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import decimal 3 | import math 4 | import psycopg2 5 | import psycopg2.extras 6 | import singer 7 | LOGGER = singer.get_logger() 8 | 9 | cursor_iter_size = 20000 10 | include_schemas_in_destination_stream_name = False 11 | 12 | def get_ssl_status(conn_config): 13 | try: 14 | matching_rows = [] 15 | with open_connection(conn_config) as conn: 16 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 17 | select_sql = "SELECT datname,usename, ssl, client_addr FROM pg_stat_ssl JOIN pg_stat_activity ON pg_stat_ssl.pid = pg_stat_activity.pid" 18 | cur.execute(select_sql) 19 | for row in cur: 20 | if row[0] == conn_config['dbname'] and row[1] == conn_config['user']: 21 | matching_rows.append(row) 22 | if len(matching_rows) == 1: 23 | LOGGER.info('User %s connected with SSL = %s', conn_config['user'], matching_rows[0][2]) 24 | else: 25 | LOGGER.info('Failed to retrieve SSL status') 26 | except: 27 | LOGGER.info('Failed to retrieve SSL status') 28 | 29 | 30 | def calculate_destination_stream_name(stream, md_map): 31 | if include_schemas_in_destination_stream_name: 32 | return "{}_{}".format(md_map.get((), {}).get('schema-name'), stream['stream']) 33 | 34 | return stream['stream'] 35 | 36 | #from the postgres docs: 37 | #Quoted identifiers can contain any character, except the character with code zero. (To include a double #quote, write two double quotes.) 
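#For example, canonicalize_identifier('say "hi"') returns 'say ""hi""'; the helpers below then wrap the escaped name in double quotes.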
38 | def canonicalize_identifier(identifier): 39 | return identifier.replace('"', '""') 40 | 41 | def fully_qualified_column_name(schema, table, column): 42 | return '"{}"."{}"."{}"'.format(canonicalize_identifier(schema), canonicalize_identifier(table), canonicalize_identifier(column)) 43 | 44 | def fully_qualified_table_name(schema, table): 45 | return '"{}"."{}"'.format(canonicalize_identifier(schema), canonicalize_identifier(table)) 46 | 47 | def open_connection(conn_config, logical_replication=False): 48 | cfg = { 49 | 'host': conn_config['host'], 50 | 'dbname': conn_config['dbname'], 51 | 'user': conn_config['user'], 52 | 'password': conn_config['password'], 53 | 'port': conn_config['port'], 54 | 'connect_timeout': 30 55 | } 56 | 57 | if conn_config.get('sslmode'): 58 | cfg['sslmode'] = conn_config['sslmode'] 59 | 60 | if logical_replication: 61 | cfg['connection_factory'] = psycopg2.extras.LogicalReplicationConnection 62 | 63 | conn = psycopg2.connect(**cfg) 64 | 65 | return conn 66 | 67 | def prepare_columns_sql(c): 68 | column_name = """ "{}" """.format(canonicalize_identifier(c)) 69 | return column_name 70 | 71 | def filter_dbs_sql_clause(sql, filter_dbs): 72 | in_clause = " AND datname in (" + ",".join(["'{}'".format(b.strip(' ')) for b in filter_dbs.split(',')]) + ")" 73 | return sql + in_clause 74 | 75 | #pylint: disable=too-many-branches,too-many-nested-blocks 76 | def selected_value_to_singer_value_impl(elem, sql_datatype): 77 | sql_datatype = sql_datatype.replace('[]', '') 78 | if elem is None: 79 | cleaned_elem = elem 80 | elif sql_datatype == 'money': 81 | cleaned_elem = elem 82 | elif isinstance(elem, datetime.datetime): 83 | if sql_datatype == 'timestamp with time zone': 84 | cleaned_elem = elem.isoformat() 85 | else: #timestamp WITH OUT time zone 86 | cleaned_elem = elem.isoformat() + '+00:00' 87 | elif isinstance(elem, datetime.date): 88 | cleaned_elem = elem.isoformat() + 'T00:00:00+00:00' 89 | elif sql_datatype == 'bit': 90 | cleaned_elem = elem == '1' 91 | elif sql_datatype == 'boolean': 92 | cleaned_elem = elem 93 | elif isinstance(elem, int): 94 | cleaned_elem = elem 95 | elif isinstance(elem, datetime.time): 96 | cleaned_elem = str(elem) 97 | elif isinstance(elem, str): 98 | cleaned_elem = elem 99 | elif isinstance(elem, decimal.Decimal): 100 | #NB> We cast NaN's to NULL as wal2json does not support them and now we are at least consistent(ly wrong) 101 | if elem.is_nan(): 102 | cleaned_elem = None 103 | else: 104 | cleaned_elem = elem 105 | elif isinstance(elem, float): 106 | #NB> We cast NaN's, +Inf, -Inf to NULL as wal2json does not support them and now we are at least consistent(ly wrong) 107 | if math.isnan(elem): 108 | cleaned_elem = None 109 | elif math.isinf(elem): 110 | cleaned_elem = None 111 | else: 112 | cleaned_elem = elem 113 | elif isinstance(elem, dict): 114 | if sql_datatype == 'hstore': 115 | cleaned_elem = elem 116 | else: 117 | raise Exception("do not know how to marshall a dict if its not an hstore or json: {}".format(sql_datatype)) 118 | else: 119 | raise Exception("do not know how to marshall value of class( {} ) and sql_datatype ( {} )".format(elem.__class__, sql_datatype)) 120 | 121 | return cleaned_elem 122 | 123 | def selected_array_to_singer_value(elem, sql_datatype): 124 | if isinstance(elem, list): 125 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype), elem)) 126 | 127 | return selected_value_to_singer_value_impl(elem, sql_datatype) 128 | 129 | def selected_value_to_singer_value(elem, sql_datatype): 
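    #Entry point used by selected_row_to_singer_message: sql datatypes containing '[]' are converted
    #element-wise (including nested arrays); scalar values fall through to selected_value_to_singer_value_impl above.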
130 | #are we dealing with an array? 131 | if sql_datatype.find('[]') > 0: 132 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype), (elem or []))) 133 | 134 | return selected_value_to_singer_value_impl(elem, sql_datatype) 135 | 136 | #pylint: disable=too-many-arguments 137 | def selected_row_to_singer_message(stream, row, version, columns, time_extracted, md_map): 138 | row_to_persist = () 139 | for idx, elem in enumerate(row): 140 | sql_datatype = md_map.get(('properties', columns[idx]))['sql-datatype'] 141 | cleaned_elem = selected_value_to_singer_value(elem, sql_datatype) 142 | row_to_persist += (cleaned_elem,) 143 | 144 | rec = dict(zip(columns, row_to_persist)) 145 | 146 | return singer.RecordMessage( 147 | stream=calculate_destination_stream_name(stream, md_map), 148 | record=rec, 149 | version=version, 150 | time_extracted=time_extracted) 151 | 152 | def hstore_available(conn_info): 153 | with open_connection(conn_info) as conn: 154 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 155 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 156 | res = cur.fetchone() 157 | if res and res[0]: 158 | return True 159 | return False 160 | 161 | 162 | def compute_tap_stream_id(database_name, schema_name, table_name): 163 | return database_name + '-' + schema_name + '-' + table_name 164 | 165 | 166 | #NB> numeric/decimal columns in postgres without a specified scale && precision 167 | #default to 'up to 131072 digits before the decimal point; up to 16383 168 | #digits after the decimal point'. For practical reasons, we are capping this at 74/38 169 | # https://www.postgresql.org/docs/10/static/datatype-numeric.html#DATATYPE-NUMERIC-TABLE 170 | MAX_SCALE = 38 171 | MAX_PRECISION = 100 172 | 173 | def numeric_precision(c): 174 | if c.numeric_precision is None: 175 | return MAX_PRECISION 176 | 177 | if c.numeric_precision > MAX_PRECISION: 178 | LOGGER.warning('capping decimal precision to 100. THIS MAY CAUSE TRUNCATION') 179 | return MAX_PRECISION 180 | 181 | return c.numeric_precision 182 | 183 | def numeric_scale(c): 184 | if c.numeric_scale is None: 185 | return MAX_SCALE 186 | if c.numeric_scale > MAX_SCALE: 187 | LOGGER.warning('capping decimal scale to 38. 
THIS MAY CAUSE TRUNCATION') 188 | return MAX_SCALE 189 | 190 | return c.numeric_scale 191 | 192 | def numeric_multiple_of(scale): 193 | return 10 ** (0 - scale) 194 | 195 | def numeric_max(precision, scale): 196 | return 10 ** (precision - scale) 197 | 198 | def numeric_min(precision, scale): 199 | return -10 ** (precision - scale) 200 | -------------------------------------------------------------------------------- /tests/unittests/utils.py: -------------------------------------------------------------------------------- 1 | from singer import get_logger, metadata 2 | from nose.tools import nottest 3 | import psycopg2 4 | import singer 5 | import os 6 | import decimal 7 | import math 8 | import datetime 9 | import pdb 10 | from psycopg2.extensions import quote_ident 11 | 12 | LOGGER = get_logger() 13 | 14 | def get_test_connection_config(target_db='postgres'): 15 | missing_envs = [x for x in [os.getenv('TAP_POSTGRES_HOST'), 16 | os.getenv('TAP_POSTGRES_USER'), 17 | os.getenv('TAP_POSTGRES_PASSWORD'), 18 | os.getenv('TAP_POSTGRES_PORT')] if x == None] 19 | if len(missing_envs) != 0: 20 | #pylint: disable=line-too-long 21 | raise Exception("set TAP_POSTGRES_HOST, TAP_POSTGRES_USER, TAP_POSTGRES_PASSWORD, TAP_POSTGRES_PORT") 22 | 23 | conn_config = {} 24 | conn_config['host'] = os.environ.get('TAP_POSTGRES_HOST') 25 | conn_config['user'] = os.environ.get('TAP_POSTGRES_USER') 26 | conn_config['password'] = os.environ.get('TAP_POSTGRES_PASSWORD') 27 | conn_config['port'] = os.environ.get('TAP_POSTGRES_PORT') 28 | conn_config['dbname'] = target_db 29 | return conn_config 30 | 31 | def get_test_connection(target_db='postgres'): 32 | conn_config = get_test_connection_config(target_db) 33 | conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'".format(conn_config['host'], 34 | conn_config['dbname'], 35 | conn_config['user'], 36 | conn_config['password'], 37 | conn_config['port']) 38 | LOGGER.info("connecting to {}".format(conn_config['host'])) 39 | 40 | conn = psycopg2.connect(conn_string) 41 | conn.autocommit = True 42 | 43 | return conn 44 | 45 | def build_col_sql(col, cur): 46 | if col.get('quoted'): 47 | col_sql = "{} {}".format(quote_ident(col['name'], cur), col['type']) 48 | else: 49 | col_sql = "{} {}".format(col['name'], col['type']) 50 | 51 | return col_sql 52 | 53 | def build_table(table, cur): 54 | create_sql = "CREATE TABLE {}\n".format(quote_ident(table['name'], cur)) 55 | col_sql = map(lambda c: build_col_sql(c, cur), table['columns']) 56 | pks = [c['name'] for c in table['columns'] if c.get('primary_key')] 57 | if len(pks) != 0: 58 | pk_sql = ",\n CONSTRAINT {} PRIMARY KEY({})".format(quote_ident(table['name'] + "_pk", cur), " ,".join(pks)) 59 | else: 60 | pk_sql = "" 61 | 62 | sql = "{} ( {} {})".format(create_sql, ",\n".join(col_sql), pk_sql) 63 | 64 | return sql 65 | 66 | def ensure_db(dbname='postgres'): 67 | # Create database dev if not exists 68 | with get_test_connection() as conn: 69 | conn.autocommit = True 70 | with conn.cursor() as cur: 71 | cur.execute("SELECT 1 FROM pg_database WHERE datname = '{}'".format(dbname)) 72 | exists = cur.fetchone() 73 | if not exists: 74 | print("Creating database {}".format(dbname)) 75 | cur.execute("CREATE DATABASE {}".format(dbname)) 76 | 77 | @nottest 78 | def ensure_test_table(table_spec, target_db='postgres'): 79 | with get_test_connection(target_db) as conn: 80 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 81 | sql = """SELECT * 82 | FROM information_schema.tables 83 | WHERE table_schema = 
'public' 84 | AND table_name = %s""" 85 | 86 | cur.execute(sql, 87 | [table_spec['name']]) 88 | old_table = cur.fetchall() 89 | 90 | if len(old_table) != 0: 91 | cur.execute('DROP TABLE {} cascade'.format(quote_ident(table_spec['name'], cur))) 92 | 93 | sql = build_table(table_spec, cur) 94 | LOGGER.info("create table sql: %s", sql) 95 | cur.execute(sql) 96 | 97 | def unselect_column(our_stream, col): 98 | md = metadata.to_map(our_stream['metadata']) 99 | md.get(('properties', col))['selected'] = False 100 | our_stream['metadata'] = metadata.to_list(md) 101 | return our_stream 102 | 103 | def set_replication_method_for_stream(stream, method): 104 | new_md = metadata.to_map(stream['metadata']) 105 | old_md = new_md.get(()) 106 | old_md.update({'replication-method': method}) 107 | 108 | stream['metadata'] = metadata.to_list(new_md) 109 | return stream 110 | 111 | def select_all_of_stream(stream): 112 | new_md = metadata.to_map(stream['metadata']) 113 | 114 | old_md = new_md.get(()) 115 | old_md.update({'selected': True}) 116 | for col_name, col_schema in stream['schema']['properties'].items(): 117 | #explicitly select column if it is not automatic 118 | if new_md.get(('properties', col_name)).get('inclusion') != 'automatic' and new_md.get(('properties', col_name)).get('inclusion') != 'unsupported': 119 | old_md = new_md.get(('properties', col_name)) 120 | old_md.update({'selected' : True}) 121 | 122 | stream['metadata'] = metadata.to_list(new_md) 123 | return stream 124 | 125 | 126 | def crud_up_value(value): 127 | if isinstance(value, str): 128 | return value 129 | elif isinstance(value, int): 130 | return str(value) 131 | elif isinstance(value, float): 132 | if (value == float('+inf')): 133 | return "'+Inf'" 134 | elif (value == float('-inf')): 135 | return "'-Inf'" 136 | elif (math.isnan(value)): 137 | return "'NaN'" 138 | else: 139 | return "{:f}".format(value) 140 | elif isinstance(value, decimal.Decimal): 141 | return "{:f}".format(value) 142 | elif value is None: 143 | return 'NULL' 144 | elif isinstance(value, datetime.datetime) and value.tzinfo is None: 145 | return "TIMESTAMP '{}'".format(str(value)) 146 | elif isinstance(value, datetime.datetime): 147 | return "TIMESTAMP '{}'".format(str(value)) 148 | elif isinstance(value, datetime.date): 149 | return "Date '{}'".format(str(value)) 150 | else: 151 | raise Exception("crud_up_value does not yet support {}".format(value.__class__)) 152 | 153 | def insert_record(cursor, table_name, data): 154 | our_keys = list(data.keys()) 155 | our_keys.sort() 156 | our_values = list(map( lambda k: data.get(k), our_keys)) 157 | 158 | 159 | columns_sql = ", \n".join(map(lambda k: quote_ident(k, cursor), our_keys)) 160 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 161 | 162 | insert_sql = """ INSERT INTO {} 163 | ( {} ) 164 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 165 | LOGGER.info("INSERT: {}".format(insert_sql)) 166 | cursor.execute(insert_sql, list(map(crud_up_value, our_values))) 167 | 168 | 169 | def verify_crud_messages(that, caught_messages, pks): 170 | 171 | that.assertEqual(14, len(caught_messages)) 172 | that.assertTrue(isinstance(caught_messages[0], singer.SchemaMessage)) 173 | that.assertTrue(isinstance(caught_messages[1], singer.RecordMessage)) 174 | that.assertTrue(isinstance(caught_messages[2], singer.StateMessage)) 175 | that.assertTrue(isinstance(caught_messages[3], singer.RecordMessage)) 176 | that.assertTrue(isinstance(caught_messages[4], singer.StateMessage)) 177 | 
that.assertTrue(isinstance(caught_messages[5], singer.RecordMessage)) 178 | that.assertTrue(isinstance(caught_messages[6], singer.StateMessage)) 179 | that.assertTrue(isinstance(caught_messages[7], singer.RecordMessage)) 180 | that.assertTrue(isinstance(caught_messages[8], singer.StateMessage)) 181 | that.assertTrue(isinstance(caught_messages[9], singer.RecordMessage)) 182 | that.assertTrue(isinstance(caught_messages[10], singer.StateMessage)) 183 | that.assertTrue(isinstance(caught_messages[11], singer.RecordMessage)) 184 | that.assertTrue(isinstance(caught_messages[12], singer.StateMessage)) 185 | that.assertTrue(isinstance(caught_messages[13], singer.StateMessage)) 186 | 187 | #schema includes scn && _sdc_deleted_at because we selected logminer as our replication method 188 | that.assertEqual({"type" : ['integer']}, caught_messages[0].schema.get('properties').get('scn') ) 189 | that.assertEqual({"type" : ['null', 'string'], "format" : "date-time"}, caught_messages[0].schema.get('properties').get('_sdc_deleted_at') ) 190 | 191 | that.assertEqual(pks, caught_messages[0].key_properties) 192 | 193 | #verify first STATE message 194 | bookmarks_1 = caught_messages[2].value.get('bookmarks')['ROOT-CHICKEN'] 195 | that.assertIsNotNone(bookmarks_1) 196 | bookmarks_1_scn = bookmarks_1.get('scn') 197 | bookmarks_1_version = bookmarks_1.get('version') 198 | that.assertIsNotNone(bookmarks_1_scn) 199 | that.assertIsNotNone(bookmarks_1_version) 200 | 201 | #verify STATE message after UPDATE 202 | bookmarks_2 = caught_messages[6].value.get('bookmarks')['ROOT-CHICKEN'] 203 | that.assertIsNotNone(bookmarks_2) 204 | bookmarks_2_scn = bookmarks_2.get('scn') 205 | bookmarks_2_version = bookmarks_2.get('version') 206 | that.assertIsNotNone(bookmarks_2_scn) 207 | that.assertIsNotNone(bookmarks_2_version) 208 | that.assertGreater(bookmarks_2_scn, bookmarks_1_scn) 209 | that.assertEqual(bookmarks_2_version, bookmarks_1_version) 210 | -------------------------------------------------------------------------------- /tests/test_postgres_views_logical_replication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | expected_schemas = {'chicken_view': {'properties': 13 | {'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 14 | 'minimum': -9223372036854775808}, 15 | 'size': {'type': ['null', 'string']}, 16 | 'name': {'type': ['null', 'string']}, 17 | 'id': {'maximum': 2147483647, 'type': ['null', 'integer'], 18 | 'minimum': -2147483648}, 19 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}}, 21 | 'type': 'object'}} 22 | 23 | 24 | def canonicalized_table_name(schema, table, cur): 25 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 26 | 27 | def insert_record(cursor, table_name, data): 28 | our_keys = list(data.keys()) 29 | our_keys.sort() 30 | our_values = [data.get(key) for key in our_keys] 31 | 32 | columns_sql = ", \n ".join(our_keys) 33 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 34 | 35 | insert_sql = """ INSERT INTO {} 36 | ( {} ) 37 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 38 | cursor.execute(insert_sql, our_values) 39 | 40 | 41 | 
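#As an illustration (hypothetical values), insert_record(cur, 'chicken', {'name': 'fred', 'size': 'big'})
#builds a parameterized statement roughly equivalent to:
#  INSERT INTO "chicken" ( name, size ) VALUES ( %s, %s )
#and executes it with ['fred', 'big'] as bind parameters, leaving all value quoting to psycopg2.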
42 | test_schema_name = "public" 43 | test_table_name_1 = "postgres_views_full_table_replication_test" 44 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 45 | test_view = 'chicken_view' 46 | 47 | class PostgresViewsLogicalReplication(unittest.TestCase): 48 | def setUp(self): 49 | db_utils.ensure_environment_variables_set() 50 | 51 | db_utils.ensure_db() 52 | 53 | self.maxDiff = None 54 | 55 | with db_utils.get_test_connection() as conn: 56 | conn.autocommit = True 57 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 58 | for table in [test_table_name_1, test_table_name_2]: 59 | old_table = cur.execute("""SELECT EXISTS ( 60 | SELECT 1 61 | FROM information_schema.tables 62 | WHERE table_schema = %s 63 | AND table_name = %s)""", 64 | [test_schema_name, table]) 65 | old_table = cur.fetchone()[0] 66 | if old_table: 67 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 68 | 69 | 70 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 71 | cur.execute("""CREATE TABLE {} 72 | (id SERIAL PRIMARY KEY, 73 | name VARCHAR, 74 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 75 | 76 | cur.execute("""CREATE TABLE {} 77 | (fk_id bigint, 78 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 79 | 80 | cur.execute("""CREATE VIEW {} AS 81 | (SELECT * 82 | FROM {} 83 | join {} 84 | on {}.id = {}.fk_id 85 | )""".format(quote_ident(test_view, cur), 86 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 87 | canonicalized_table_name(test_schema_name, test_table_name_2, cur), 88 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 89 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 90 | 91 | self.rec_1 = { 'name' : 'fred', 'size' : 'big' } 92 | insert_record(cur, test_table_name_1, self.rec_1) 93 | 94 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 95 | fk_id = cur.fetchone()[0] 96 | 97 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 98 | insert_record(cur, test_table_name_2, self.rec_2) 99 | 100 | @staticmethod 101 | def expected_check_streams(): 102 | return { 'postgres-public-chicken_view'} 103 | 104 | @staticmethod 105 | def expected_sync_streams(): 106 | return { 'chicken_view' } 107 | 108 | @staticmethod 109 | def name(): 110 | return "tap_tester_postgres_views_logical_replication" 111 | 112 | @staticmethod 113 | def expected_pks(): 114 | return { 115 | 'chicken_view' : {'id'} 116 | } 117 | 118 | @staticmethod 119 | def tap_name(): 120 | return "tap-postgres" 121 | 122 | @staticmethod 123 | def get_type(): 124 | return "platform.postgres" 125 | 126 | @staticmethod 127 | def get_credentials(): 128 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 129 | 130 | @staticmethod 131 | def get_properties(): 132 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 133 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 134 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 135 | 'user' : os.getenv('TAP_POSTGRES_USER'), 136 | 'default_replication_method' : 'FULL_TABLE' 137 | } 138 | 139 | def test_run(self): 140 | conn_id = connections.ensure_connection(self) 141 | 142 | # run in check mode 143 | check_job_name = runner.run_check_mode(self, conn_id) 144 | 145 | # verify check exit codes 146 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 147 | menagerie.verify_check_exit_status(self, 
exit_status, check_job_name) 148 | 149 | # verify the tap discovered the right streams 150 | found_catalogs = [fc for fc 151 | in menagerie.get_catalogs(conn_id) 152 | if fc['tap_stream_id'] in self.expected_check_streams()] 153 | 154 | self.assertEqual(len(found_catalogs), 155 | 1, 156 | msg="unable to locate schemas for connection {}".format(conn_id)) 157 | 158 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 159 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 160 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 161 | 162 | # verify that persisted streams have the correct properties 163 | chicken_catalog = found_catalogs[0] 164 | 165 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 166 | print("discovered streams are correct") 167 | 168 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 169 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 170 | 171 | self.assertEqual( 172 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 173 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 174 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 175 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 176 | ('properties', 'size'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 177 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}}, 178 | db_utils.to_map(md)) 179 | 180 | 181 | # 'ID' selected as view-key-properties 182 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': None, "replication-method" : "LOG_BASED", 'view-key-properties': ["id"]}}] 183 | 184 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 185 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 186 | replication_md) 187 | 188 | # clear state 189 | menagerie.set_state(conn_id, {}) 190 | 191 | sync_job_name = runner.run_sync_mode(self, conn_id) 192 | 193 | # verify tap and target exit codes 194 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 195 | 196 | self.assertEqual(exit_status['tap_exit_status'], 1) 197 | # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 198 | 199 | record_count_by_stream = runner.examine_target_output_file(self, 200 | conn_id, 201 | self.expected_sync_streams(), 202 | self.expected_pks()) 203 | 204 | self.assertEqual(record_count_by_stream, {}) 205 | print("records are correct") 206 | 207 | # verify state and bookmarks 208 | state = menagerie.get_state(conn_id) 209 | self.assertEqual(state, {}, msg="expected state to be empty") 210 | -------------------------------------------------------------------------------- /tests/test_postgres_views_full_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | expected_schemas = {'chicken_view': 13 | {'properties': 
{'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 14 | 'minimum': -9223372036854775808}, 15 | 'size': {'type': ['null', 'string']}, 16 | 'name': {'type': ['null', 'string']}, 17 | 'id': {'maximum': 2147483647, 'type': ['null', 'integer'], 18 | 'minimum': -2147483648}, 19 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}}, 21 | 'type': 'object', 22 | 'definitions' : { 23 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 24 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 25 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 26 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 27 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 28 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}} 29 | }}} 30 | 31 | 32 | def canonicalized_table_name(schema, table, cur): 33 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 34 | 35 | def insert_record(cursor, table_name, data): 36 | our_keys = list(data.keys()) 37 | our_keys.sort() 38 | our_values = [data.get(key) for key in our_keys] 39 | 40 | columns_sql = ", \n ".join(our_keys) 41 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 42 | 43 | insert_sql = """ INSERT INTO {} 44 | ( {} ) 45 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 46 | cursor.execute(insert_sql, our_values) 47 | 48 | 49 | 50 | test_schema_name = "public" 51 | test_table_name_1 = "postgres_views_full_table_replication_test" 52 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 53 | test_view = 'chicken_view' 54 | 55 | class PostgresViewsFullTable(unittest.TestCase): 56 | def setUp(self): 57 | db_utils.ensure_environment_variables_set() 58 | 59 | db_utils.ensure_db() 60 | 61 | self.maxDiff = None 62 | 63 | with db_utils.get_test_connection() as conn: 64 | conn.autocommit = True 65 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 66 | for table in [test_table_name_1, test_table_name_2]: 67 | old_table = cur.execute("""SELECT EXISTS ( 68 | SELECT 1 69 | FROM information_schema.tables 70 | WHERE table_schema = %s 71 | AND table_name = %s)""", 72 | [test_schema_name, table]) 73 | old_table = cur.fetchone()[0] 74 | if old_table: 75 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 76 | 77 | 78 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 79 | cur.execute("""CREATE TABLE {} 80 | (id SERIAL PRIMARY KEY, 81 | name VARCHAR, 82 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 83 | 84 | cur.execute("""CREATE TABLE {} 85 | (fk_id bigint, 86 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 87 | 88 | cur.execute("""CREATE VIEW {} AS 89 | (SELECT * 90 | FROM {} 91 | join {} 92 | on {}.id = {}.fk_id 93 | )""".format(quote_ident(test_view, cur), 94 | canonicalized_table_name(test_schema_name, 
test_table_name_1, cur), 95 | canonicalized_table_name(test_schema_name, test_table_name_2, cur), 96 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 97 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 98 | 99 | self.rec_1 = { 'name' : 'fred', 'size' : 'big' } 100 | insert_record(cur, test_table_name_1, self.rec_1) 101 | 102 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 103 | fk_id = cur.fetchone()[0] 104 | 105 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 106 | insert_record(cur, test_table_name_2, self.rec_2) 107 | 108 | 109 | @staticmethod 110 | def expected_check_streams(): 111 | return { 'postgres-public-chicken_view'} 112 | 113 | @staticmethod 114 | def expected_sync_streams(): 115 | return { 'chicken_view' } 116 | 117 | @staticmethod 118 | def name(): 119 | return "tap_tester_postgres_views_full_table" 120 | 121 | @staticmethod 122 | def expected_pks(): 123 | return { 124 | 'chicken_view' : {'id'} 125 | } 126 | 127 | @staticmethod 128 | def tap_name(): 129 | return "tap-postgres" 130 | 131 | @staticmethod 132 | def get_type(): 133 | return "platform.postgres" 134 | 135 | @staticmethod 136 | def get_credentials(): 137 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 138 | 139 | @staticmethod 140 | def get_properties(): 141 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 142 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 143 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 144 | 'user' : os.getenv('TAP_POSTGRES_USER') 145 | } 146 | 147 | def test_run(self): 148 | conn_id = connections.ensure_connection(self) 149 | 150 | # run in check mode 151 | check_job_name = runner.run_check_mode(self, conn_id) 152 | 153 | # verify check exit codes 154 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 155 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 156 | 157 | # verify the tap discovered the right streams 158 | found_catalogs = [fc for fc 159 | in menagerie.get_catalogs(conn_id) 160 | if fc['tap_stream_id'] in self.expected_check_streams()] 161 | 162 | self.assertEqual(len(found_catalogs), 163 | 1, 164 | msg="unable to locate schemas for connection {}".format(conn_id)) 165 | 166 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 167 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 168 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 169 | 170 | # verify that persisted streams have the correct properties 171 | chicken_catalog = found_catalogs[0] 172 | 173 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 174 | print("discovered streams are correct") 175 | 176 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 177 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 178 | 179 | self.assertEqual( 180 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 181 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 182 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 183 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 184 | ('properties', 'size'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': 
True}, 185 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}}, 186 | db_utils.to_map(md)) 187 | 188 | 189 | # 'ID' selected as view-key-properties 190 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': None, "replication-method" : "FULL_TABLE", 'view-key-properties': ["id"]}}] 191 | 192 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 193 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 194 | replication_md) 195 | 196 | # clear state 197 | menagerie.set_state(conn_id, {}) 198 | 199 | sync_job_name = runner.run_sync_mode(self, conn_id) 200 | 201 | # verify tap and target exit codes 202 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 203 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 204 | 205 | record_count_by_stream = runner.examine_target_output_file(self, 206 | conn_id, 207 | self.expected_sync_streams(), 208 | self.expected_pks()) 209 | 210 | 211 | self.assertEqual(record_count_by_stream, { 'chicken_view': 1}) 212 | records_by_stream = runner.get_records_from_target_output() 213 | 214 | table_version = records_by_stream['chicken_view']['table_version'] 215 | self.assertEqual(records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') 216 | self.assertEqual(records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') 217 | self.assertEqual(records_by_stream['chicken_view']['messages'][2]['action'], 'activate_version') 218 | 219 | # verifications about individual records 220 | for stream, recs in records_by_stream.items(): 221 | # verify the persisted schema was correct 222 | self.assertEqual(recs['schema'], 223 | expected_schemas[stream], 224 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 225 | 226 | actual_chicken_record = records_by_stream['chicken_view']['messages'][1]['data'] 227 | 228 | expected_chicken_record = {'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'size' : 'big'} 229 | self.assertEqual(actual_chicken_record, 230 | expected_chicken_record, 231 | msg="Expected `various_types` upsert record data to be {}, but target output {}".format(expected_chicken_record, actual_chicken_record)) 232 | 233 | print("records are correct") 234 | 235 | # verify state and bookmarks 236 | state = menagerie.get_state(conn_id) 237 | 238 | chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] 239 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 240 | self.assertEqual(chicken_bookmark['version'], table_version, 241 | msg="expected bookmark for stream ROOT-CHICKEN to match version") 242 | -------------------------------------------------------------------------------- /tests/test_postgres_views_incremental_replication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import unittest 4 | 5 | import psycopg2.extras 6 | from psycopg2.extensions import quote_ident 7 | import tap_tester.connections as connections 8 | import tap_tester.menagerie as menagerie 9 | import tap_tester.runner as runner 10 | 11 | import db_utils # pylint: disable=import-error 12 | 13 | 14 | expected_schemas = {'chicken_view': 15 | {'properties': {'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 16 | 'minimum': -9223372036854775808}, 17 | 'size': {'type': ['null', 'string']}, 18 | 'name': {'type': ['null', 'string']}, 19 | 'id': 
{'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}, 21 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 22 | 'minimum': -2147483648}, 23 | 'updated_at': {'format': 'date-time', 24 | 'type': ['null', 'string']}}, 25 | 'type': 'object', 26 | 'definitions' : { 27 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 28 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 29 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 30 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 31 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 32 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}} 33 | }}} 34 | 35 | def canonicalized_table_name(schema, table, cur): 36 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 37 | 38 | def insert_record(cursor, table_name, data): 39 | our_keys = list(data.keys()) 40 | our_keys.sort() 41 | our_values = [data.get(key) for key in our_keys] 42 | 43 | 44 | columns_sql = ", \n ".join(our_keys) 45 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 46 | 47 | insert_sql = """ INSERT INTO {} 48 | ( {} ) 49 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 50 | cursor.execute(insert_sql, our_values) 51 | 52 | 53 | 54 | test_schema_name = "public" 55 | test_table_name_1 = "postgres_views_full_table_replication_test" 56 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 57 | test_view = 'chicken_view' 58 | 59 | class PostgresViewsIncrementalReplication(unittest.TestCase): 60 | def setUp(self): 61 | db_utils.ensure_environment_variables_set() 62 | 63 | db_utils.ensure_db() 64 | 65 | self.maxDiff = None 66 | 67 | with db_utils.get_test_connection() as conn: 68 | conn.autocommit = True 69 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 70 | for table in [test_table_name_1, test_table_name_2]: 71 | old_table = cur.execute("""SELECT EXISTS ( 72 | SELECT 1 73 | FROM information_schema.tables 74 | WHERE table_schema = %s 75 | AND table_name = %s)""", 76 | [test_schema_name, table]) 77 | old_table = cur.fetchone()[0] 78 | if old_table: 79 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 80 | 81 | 82 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 83 | cur.execute("""CREATE TABLE {} 84 | (id SERIAL PRIMARY KEY, 85 | updated_at TIMESTAMP WITH TIME ZONE, 86 | name VARCHAR, 87 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 88 | 89 | cur.execute("""CREATE TABLE {} 90 | (fk_id bigint, 91 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 92 | 93 | cur.execute("""CREATE VIEW {} AS 94 | (SELECT * 95 | FROM {} 96 | join {} 97 | on {}.id = {}.fk_id 98 | )""".format(quote_ident(test_view, cur), 99 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 100 | canonicalized_table_name(test_schema_name, 
test_table_name_2, cur), 101 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 102 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 103 | 104 | self.rec_1 = { 'name' : 'fred', 'size' : 'big', 'updated_at' : datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) } 105 | insert_record(cur, test_table_name_1, self.rec_1) 106 | 107 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 108 | fk_id = cur.fetchone()[0] 109 | 110 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 111 | insert_record(cur, test_table_name_2, self.rec_2) 112 | 113 | @staticmethod 114 | def expected_check_streams(): 115 | return { 'postgres-public-chicken_view'} 116 | 117 | @staticmethod 118 | def expected_sync_streams(): 119 | return { 'chicken_view' } 120 | 121 | @staticmethod 122 | def name(): 123 | return "tap_tester_postgres_views_incremental_replication" 124 | 125 | @staticmethod 126 | def expected_pks(): 127 | return { 128 | 'chicken_view' : {'id'} 129 | } 130 | 131 | @staticmethod 132 | def tap_name(): 133 | return "tap-postgres" 134 | 135 | @staticmethod 136 | def get_type(): 137 | return "platform.postgres" 138 | 139 | @staticmethod 140 | def get_credentials(): 141 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 142 | 143 | @staticmethod 144 | def get_properties(): 145 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 146 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 147 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 148 | 'user' : os.getenv('TAP_POSTGRES_USER'), 149 | 'default_replication_method' : 'FULL_TABLE' 150 | } 151 | 152 | def test_run(self): 153 | conn_id = connections.ensure_connection(self) 154 | 155 | # run in check mode 156 | check_job_name = runner.run_check_mode(self, conn_id) 157 | 158 | # verify check exit codes 159 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 160 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 161 | 162 | # verify the tap discovered the right streams 163 | found_catalogs = [fc for fc 164 | in menagerie.get_catalogs(conn_id) 165 | if fc['tap_stream_id'] in self.expected_check_streams()] 166 | 167 | self.assertEqual(len(found_catalogs), 168 | 1, 169 | msg="unable to locate schemas for connection {}".format(conn_id)) 170 | 171 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 172 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 173 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 174 | 175 | # verify that persisted streams have the correct properties 176 | chicken_catalog = found_catalogs[0] 177 | 178 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 179 | print("discovered streams are correct") 180 | 181 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 182 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 183 | 184 | self.assertEqual( 185 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 186 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 187 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 188 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 189 | ('properties', 'size'): {'inclusion': 'available', 
'sql-datatype': 'character varying', 'selected-by-default': True}, 190 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 191 | ('properties', 'updated_at'): {'selected-by-default': True, 'inclusion': 'available', 'sql-datatype': 'timestamp with time zone'}}, 192 | db_utils.to_map(md)) 193 | 194 | 195 | # 'ID' selected as view-key-properties, updated_at is replication_key 196 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': 'updated_at', "replication-method" : "INCREMENTAL", 'view-key-properties': ["id"]}}] 197 | 198 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 199 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 200 | replication_md) 201 | 202 | # clear state 203 | menagerie.set_state(conn_id, {}) 204 | 205 | sync_job_name = runner.run_sync_mode(self, conn_id) 206 | 207 | # verify tap and target exit codes 208 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 209 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 210 | 211 | record_count_by_stream = runner.examine_target_output_file(self, 212 | conn_id, 213 | self.expected_sync_streams(), 214 | self.expected_pks()) 215 | 216 | 217 | self.assertEqual(record_count_by_stream, { 'chicken_view': 1}) 218 | records_by_stream = runner.get_records_from_target_output() 219 | 220 | table_version = records_by_stream['chicken_view']['table_version'] 221 | self.assertEqual(2, len(records_by_stream['chicken_view']['messages'])) 222 | self.assertEqual(records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') 223 | self.assertEqual(records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') 224 | 225 | # verifications about individual records 226 | for stream, recs in records_by_stream.items(): 227 | # verify the persisted schema was correct 228 | self.assertEqual(recs['schema'], 229 | expected_schemas[stream], 230 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 231 | 232 | actual_chicken_record = records_by_stream['chicken_view']['messages'][1]['data'] 233 | 234 | expected_chicken_record = {'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'updated_at': '2111-01-01T12:12:12.222111+00:00', 'size' : 'big'} 235 | self.assertEqual(actual_chicken_record, 236 | expected_chicken_record, 237 | msg="Expected `various_types` upsert record data to be {}, but target output {}".format(expected_chicken_record, actual_chicken_record)) 238 | 239 | print("records are correct") 240 | 241 | # verify state and bookmarks 242 | state = menagerie.get_state(conn_id) 243 | 244 | chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] 245 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 246 | self.assertEqual(chicken_bookmark['version'], table_version, 247 | msg="expected bookmark for stream ROOT-CHICKEN to match version") 248 | self.assertEqual(chicken_bookmark['replication_key'], 'updated_at') 249 | self.assertEqual(chicken_bookmark['replication_key_value'],'2111-01-01T12:12:12.222111+00:00') 250 | print("bookmarks are correct") 251 | 252 | # TODO Verify expected fields have inclusion of 'automatic' 253 | -------------------------------------------------------------------------------- /tests/unittests/test_full_table_interruption.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import tap_postgres 4 | 
import tap_postgres.sync_strategies.full_table as full_table 5 | import tap_postgres.sync_strategies.common as pg_common 6 | import pdb 7 | import singer 8 | from singer import get_logger, metadata, write_bookmark 9 | 10 | from utils import ensure_db, get_test_connection, ensure_test_table, select_all_of_stream, set_replication_method_for_stream, insert_record, get_test_connection_config 11 | 12 | import decimal 13 | import math 14 | import pytz 15 | import strict_rfc3339 16 | import copy 17 | 18 | LOGGER = get_logger() 19 | 20 | CAUGHT_MESSAGES = [] 21 | COW_RECORD_COUNT = 0 22 | 23 | def singer_write_message_no_cow(message): 24 | global COW_RECORD_COUNT 25 | 26 | if isinstance(message, singer.RecordMessage) and message.stream == 'COW': 27 | COW_RECORD_COUNT = COW_RECORD_COUNT + 1 28 | if COW_RECORD_COUNT > 2: 29 | raise Exception("simulated exception") 30 | CAUGHT_MESSAGES.append(message) 31 | else: 32 | CAUGHT_MESSAGES.append(message) 33 | 34 | def singer_write_schema_ok(message): 35 | CAUGHT_MESSAGES.append(message) 36 | 37 | def singer_write_message_ok(message): 38 | CAUGHT_MESSAGES.append(message) 39 | 40 | def expected_record(fixture_row): 41 | expected_record = {} 42 | for k,v in fixture_row.items(): 43 | expected_record[k.replace('"', '')] = v 44 | 45 | return expected_record 46 | 47 | def do_not_dump_catalog(catalog): 48 | pass 49 | 50 | tap_postgres.dump_catalog = do_not_dump_catalog 51 | full_table.UPDATE_BOOKMARK_PERIOD = 1 52 | 53 | class LogicalInterruption(unittest.TestCase): 54 | maxDiff = None 55 | 56 | def setUp(self): 57 | ensure_db() 58 | table_spec_1 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 59 | {"name" : 'name', "type": "character varying"}, 60 | {"name" : 'colour', "type": "character varying"}], 61 | "name" : 'COW'} 62 | ensure_test_table(table_spec_1) 63 | global COW_RECORD_COUNT 64 | COW_RECORD_COUNT = 0 65 | global CAUGHT_MESSAGES 66 | CAUGHT_MESSAGES.clear() 67 | 68 | def test_catalog(self): 69 | singer.write_message = singer_write_message_no_cow 70 | pg_common.write_schema_message = singer_write_message_ok 71 | 72 | conn_config = get_test_connection_config() 73 | streams = tap_postgres.do_discovery(conn_config) 74 | cow_stream = [s for s in streams if s['table_name'] == 'COW'][0] 75 | self.assertIsNotNone(cow_stream) 76 | cow_stream = select_all_of_stream(cow_stream) 77 | cow_stream = set_replication_method_for_stream(cow_stream, 'LOG_BASED') 78 | 79 | with get_test_connection() as conn: 80 | conn.autocommit = True 81 | cur = conn.cursor() 82 | 83 | cow_rec = {'name' : 'betty', 'colour' : 'blue'} 84 | insert_record(cur, 'COW', cow_rec) 85 | 86 | cow_rec = {'name' : 'smelly', 'colour' : 'brow'} 87 | insert_record(cur, 'COW', cow_rec) 88 | 89 | cow_rec = {'name' : 'pooper', 'colour' : 'green'} 90 | insert_record(cur, 'COW', cow_rec) 91 | 92 | state = {} 93 | #the initial phase of cows logical replication will be a full table. 
94 | #it will sync the first record and then blow up on the 2nd record 95 | try: 96 | 97 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, state) 98 | except Exception as ex: 99 | blew_up_on_cow = True 100 | 101 | self.assertTrue(blew_up_on_cow) 102 | 103 | self.assertEqual(7, len(CAUGHT_MESSAGES)) 104 | 105 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 106 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 107 | self.assertIsNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('xmin')) 108 | self.assertIsNotNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn')) 109 | end_lsn = CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn') 110 | 111 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.ActivateVersionMessage)) 112 | new_version = CAUGHT_MESSAGES[2].version 113 | 114 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.RecordMessage)) 115 | self.assertEqual(CAUGHT_MESSAGES[3].record, {'colour': 'blue', 'id': 1, 'name': 'betty'}) 116 | self.assertEqual('COW', CAUGHT_MESSAGES[3].stream) 117 | 118 | 119 | 120 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.StateMessage)) 121 | #xmin is set while we are processing the full table replication 122 | self.assertIsNotNone(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-COW']['xmin']) 123 | self.assertEqual(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-COW']['lsn'], end_lsn) 124 | 125 | self.assertEqual(CAUGHT_MESSAGES[5].record['name'], 'smelly') 126 | self.assertEqual('COW', CAUGHT_MESSAGES[5].stream) 127 | 128 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.StateMessage)) 129 | last_xmin = CAUGHT_MESSAGES[6].value['bookmarks']['postgres-public-COW']['xmin'] 130 | old_state = CAUGHT_MESSAGES[6].value 131 | 132 | 133 | #run another do_sync, should get the remaining record which effectively finishes the initial full_table 134 | #replication portion of the logical replication 135 | singer.write_message = singer_write_message_ok 136 | global COW_RECORD_COUNT 137 | COW_RECORD_COUNT = 0 138 | CAUGHT_MESSAGES.clear() 139 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, old_state) 140 | 141 | self.assertEqual(8, len(CAUGHT_MESSAGES)) 142 | 143 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 144 | 145 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 146 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('xmin'), last_xmin) 147 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 148 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 149 | 150 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.RecordMessage)) 151 | self.assertEqual(CAUGHT_MESSAGES[2].record, {'colour': 'brow', 'id': 2, 'name': 'smelly'}) 152 | self.assertEqual('COW', CAUGHT_MESSAGES[2].stream) 153 | 154 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.StateMessage)) 155 | self.assertTrue(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('xmin'),last_xmin) 156 | self.assertEqual(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 157 | self.assertEqual(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 158 | 159 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.RecordMessage)) 160 | 
self.assertEqual(CAUGHT_MESSAGES[4].record['name'], 'pooper') 161 | self.assertEqual('COW', CAUGHT_MESSAGES[4].stream) 162 | 163 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.StateMessage)) 164 | self.assertTrue(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('xmin') > last_xmin) 165 | self.assertEqual(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 166 | self.assertEqual(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 167 | 168 | 169 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.ActivateVersionMessage)) 170 | self.assertEqual(CAUGHT_MESSAGES[6].version, new_version) 171 | 172 | self.assertTrue(isinstance(CAUGHT_MESSAGES[7], singer.StateMessage)) 173 | self.assertIsNone(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('xmin')) 174 | self.assertEqual(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 175 | self.assertEqual(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 176 | 177 | class FullTableInterruption(unittest.TestCase): 178 | maxDiff = None 179 | def setUp(self): 180 | table_spec_1 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 181 | {"name" : 'name', "type": "character varying"}, 182 | {"name" : 'colour', "type": "character varying"}], 183 | "name" : 'COW'} 184 | ensure_test_table(table_spec_1) 185 | 186 | table_spec_2 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 187 | {"name" : 'name', "type": "character varying"}, 188 | {"name" : 'colour', "type": "character varying"}], 189 | "name" : 'CHICKEN'} 190 | ensure_test_table(table_spec_2) 191 | 192 | global COW_RECORD_COUNT 193 | COW_RECORD_COUNT = 0 194 | global CAUGHT_MESSAGES 195 | CAUGHT_MESSAGES.clear() 196 | 197 | def test_catalog(self): 198 | singer.write_message = singer_write_message_no_cow 199 | pg_common.write_schema_message = singer_write_message_ok 200 | 201 | conn_config = get_test_connection_config() 202 | streams = tap_postgres.do_discovery(conn_config) 203 | cow_stream = [s for s in streams if s['table_name'] == 'COW'][0] 204 | self.assertIsNotNone(cow_stream) 205 | cow_stream = select_all_of_stream(cow_stream) 206 | cow_stream = set_replication_method_for_stream(cow_stream, 'FULL_TABLE') 207 | 208 | chicken_stream = [s for s in streams if s['table_name'] == 'CHICKEN'][0] 209 | self.assertIsNotNone(chicken_stream) 210 | chicken_stream = select_all_of_stream(chicken_stream) 211 | chicken_stream = set_replication_method_for_stream(chicken_stream, 'FULL_TABLE') 212 | with get_test_connection() as conn: 213 | conn.autocommit = True 214 | cur = conn.cursor() 215 | 216 | cow_rec = {'name' : 'betty', 'colour' : 'blue'} 217 | insert_record(cur, 'COW', cow_rec) 218 | cow_rec = {'name' : 'smelly', 'colour' : 'brow'} 219 | insert_record(cur, 'COW', cow_rec) 220 | 221 | cow_rec = {'name' : 'pooper', 'colour' : 'green'} 222 | insert_record(cur, 'COW', cow_rec) 223 | 224 | chicken_rec = {'name' : 'fred', 'colour' : 'red'} 225 | insert_record(cur, 'CHICKEN', chicken_rec) 226 | 227 | state = {} 228 | #this will sync the CHICKEN but then blow up on the COW 229 | try: 230 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, state) 231 | except Exception as ex: 232 | # LOGGER.exception(ex) 233 | blew_up_on_cow = True 234 | 235 | self.assertTrue(blew_up_on_cow) 236 | 237 | 238 | self.assertEqual(14, len(CAUGHT_MESSAGES)) 239 | 240 | 
self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 241 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 242 | self.assertIsNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-CHICKEN'].get('xmin')) 243 | 244 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.ActivateVersionMessage)) 245 | new_version = CAUGHT_MESSAGES[2].version 246 | 247 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.RecordMessage)) 248 | self.assertEqual('CHICKEN', CAUGHT_MESSAGES[3].stream) 249 | 250 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.StateMessage)) 251 | #xmin is set while we are processing the full table replication 252 | self.assertIsNotNone(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 253 | 254 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.ActivateVersionMessage)) 255 | self.assertEqual(CAUGHT_MESSAGES[5].version, new_version) 256 | 257 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.StateMessage)) 258 | self.assertEqual(None, singer.get_currently_syncing( CAUGHT_MESSAGES[6].value)) 259 | #xmin is cleared at the end of the full table replication 260 | self.assertIsNone(CAUGHT_MESSAGES[6].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 261 | 262 | 263 | #cow messages 264 | self.assertEqual(CAUGHT_MESSAGES[7]['type'], 'SCHEMA') 265 | 266 | self.assertEqual("COW", CAUGHT_MESSAGES[7]['stream']) 267 | self.assertTrue(isinstance(CAUGHT_MESSAGES[8], singer.StateMessage)) 268 | self.assertIsNone(CAUGHT_MESSAGES[8].value['bookmarks']['postgres-public-COW'].get('xmin')) 269 | self.assertEqual("postgres-public-COW", CAUGHT_MESSAGES[8].value['currently_syncing']) 270 | 271 | self.assertTrue(isinstance(CAUGHT_MESSAGES[9], singer.ActivateVersionMessage)) 272 | cow_version = CAUGHT_MESSAGES[9].version 273 | self.assertTrue(isinstance(CAUGHT_MESSAGES[10], singer.RecordMessage)) 274 | 275 | self.assertEqual(CAUGHT_MESSAGES[10].record['name'], 'betty') 276 | self.assertEqual('COW', CAUGHT_MESSAGES[10].stream) 277 | 278 | self.assertTrue(isinstance(CAUGHT_MESSAGES[11], singer.StateMessage)) 279 | #xmin is set while we are processing the full table replication 280 | self.assertIsNotNone(CAUGHT_MESSAGES[11].value['bookmarks']['postgres-public-COW']['xmin']) 281 | 282 | 283 | self.assertEqual(CAUGHT_MESSAGES[12].record['name'], 'smelly') 284 | self.assertEqual('COW', CAUGHT_MESSAGES[12].stream) 285 | old_state = CAUGHT_MESSAGES[13].value 286 | 287 | #run another do_sync 288 | singer.write_message = singer_write_message_ok 289 | CAUGHT_MESSAGES.clear() 290 | global COW_RECORD_COUNT 291 | COW_RECORD_COUNT = 0 292 | 293 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, old_state) 294 | 295 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 296 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 297 | 298 | # because we were interrupted, we do not switch versions 299 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW']['version'], cow_version) 300 | self.assertIsNotNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW']['xmin']) 301 | self.assertEqual("postgres-public-COW", singer.get_currently_syncing(CAUGHT_MESSAGES[1].value)) 302 | 303 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.RecordMessage)) 304 | self.assertEqual(CAUGHT_MESSAGES[2].record['name'], 'smelly') 305 | self.assertEqual('COW', CAUGHT_MESSAGES[2].stream) 306 | 307 | 308 | #after record: activate version, state with no xmin or currently syncing 309 | 
self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.StateMessage)) 310 | #we still have an xmin for COW because are not yet done with the COW table 311 | self.assertIsNotNone(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW']['xmin']) 312 | self.assertEqual(singer.get_currently_syncing( CAUGHT_MESSAGES[3].value), 'postgres-public-COW') 313 | 314 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.RecordMessage)) 315 | self.assertEqual(CAUGHT_MESSAGES[4].record['name'], 'pooper') 316 | self.assertEqual('COW', CAUGHT_MESSAGES[4].stream) 317 | 318 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.StateMessage)) 319 | self.assertIsNotNone(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW']['xmin']) 320 | self.assertEqual(singer.get_currently_syncing( CAUGHT_MESSAGES[5].value), 'postgres-public-COW') 321 | 322 | 323 | #xmin is cleared because we are finished the full table replication 324 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.ActivateVersionMessage)) 325 | self.assertEqual(CAUGHT_MESSAGES[6].version, cow_version) 326 | 327 | self.assertTrue(isinstance(CAUGHT_MESSAGES[7], singer.StateMessage)) 328 | self.assertIsNone(singer.get_currently_syncing( CAUGHT_MESSAGES[7].value)) 329 | self.assertIsNone(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 330 | self.assertIsNone(singer.get_currently_syncing( CAUGHT_MESSAGES[7].value)) 331 | 332 | 333 | if __name__== "__main__": 334 | test1 = LogicalInterruption() 335 | test1.setUp() 336 | test1.test_catalog() 337 | -------------------------------------------------------------------------------- /tests/test_postgres_logical_replication_multiple_tables.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | 13 | expected_schemas = {'postgres_logical_replication_test_cows': 14 | {'type': 'object', 15 | 'selected': True, 16 | 'properties': {'cow_name': {'selected': True, 'type': ['null', 'string'], 'inclusion': 'available'}, 17 | 'id': {'maximum': 2147483647, 'inclusion': 'automatic', 'type': ['integer'], 'minimum': -2147483648, 'selected': True}, 18 | 'cow_age': {'selected': True, 'type': ['null', 'integer'], 'inclusion': 'available'}}}, 19 | 20 | 'postgres_logical_replication_test_chickens': 21 | {'type': 'object', 22 | 'selected': True, 23 | 'properties': {'cow_name': {'selected': True, 'type': ['null', 'string'], 'inclusion': 'available'}, 24 | 'id': {'maximum': 2147483647, 'inclusion': 'automatic', 'type': ['integer'], 'minimum': -2147483648, 'selected': True}, 25 | 'cow_age': {'selected': True, 'type': ['null', 'integer'], 'inclusion': 'available'}}}} 26 | 27 | 28 | def insert_record(cursor, table_name, data): 29 | our_keys = list(data.keys()) 30 | our_keys.sort() 31 | our_values = [data.get(key) for key in our_keys] 32 | 33 | columns_sql = ", \n ".join(our_keys) 34 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 35 | 36 | insert_sql = """ INSERT INTO {} 37 | ( {} ) 38 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 39 | cursor.execute(insert_sql, our_values) 40 | 41 | test_schema_name = "public" 42 | test_table_name_cows = "postgres_logical_replication_test_cows" 43 | test_table_name_chickens = 
"postgres_logical_replication_test_chickens" 44 | 45 | def canonicalized_table_name(schema, table, cur): 46 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 47 | 48 | 49 | class PostgresLogicalRepMultipleTables(unittest.TestCase): 50 | def tearDown(self): 51 | with db_utils.get_test_connection('dev') as conn: 52 | conn.autocommit = True 53 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 54 | cur.execute(""" SELECT pg_drop_replication_slot('stitch') """) 55 | 56 | def setUp(self): 57 | db_utils.ensure_environment_variables_set() 58 | 59 | db_utils.ensure_db("dev") 60 | 61 | self.maxDiff = None 62 | 63 | with db_utils.get_test_connection('dev') as conn: 64 | conn.autocommit = True 65 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 66 | cur.execute(""" SELECT EXISTS (SELECT 1 67 | FROM pg_replication_slots 68 | WHERE slot_name = 'stitch') """) 69 | old_slot = cur.fetchone()[0] 70 | with db_utils.get_test_connection('dev', True) as conn2: 71 | with conn2.cursor() as cur2: 72 | if old_slot: 73 | cur2.drop_replication_slot("stitch") 74 | cur2.create_replication_slot('stitch', output_plugin='wal2json') 75 | 76 | for t in [test_table_name_cows, test_table_name_chickens]: 77 | old_table = cur.execute("""SELECT EXISTS ( 78 | SELECT 1 79 | FROM information_schema.tables 80 | WHERE table_schema = %s 81 | AND table_name = %s);""", 82 | [test_schema_name, t]) 83 | old_table = cur.fetchone()[0] 84 | 85 | if old_table: 86 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, t, cur))) 87 | 88 | 89 | cur = conn.cursor() 90 | create_table_sql = """ 91 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 92 | cow_age integer, 93 | cow_name varchar) 94 | """.format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur)) 95 | cur.execute(create_table_sql) 96 | 97 | create_table_sql = """ 98 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 99 | chicken_age integer, 100 | chicken_name varchar) 101 | """.format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur)) 102 | cur.execute(create_table_sql) 103 | 104 | #insert a cow 105 | self.cows_rec_1 = {'cow_name' : "anne_cow", 'cow_age' : 30} 106 | insert_record(cur, test_table_name_cows, self.cows_rec_1) 107 | 108 | #insert a chicken 109 | self.chickens_rec_1 = {'chicken_name' : "alfred_chicken", 'chicken_age' : 4} 110 | insert_record(cur, test_table_name_chickens, self.chickens_rec_1) 111 | 112 | @staticmethod 113 | def expected_check_streams(): 114 | return { 'dev-public-postgres_logical_replication_test_cows', 'dev-public-postgres_logical_replication_test_chickens'} 115 | 116 | @staticmethod 117 | def expected_sync_streams(): 118 | return { 'postgres_logical_replication_test_cows', 'postgres_logical_replication_test_chickens' } 119 | 120 | @staticmethod 121 | def expected_pks(): 122 | return { 123 | 'postgres_logical_replication_test_cows' : {'id'}, 124 | 'postgres_logical_replication_test_chickens' : {'id'} 125 | } 126 | 127 | @staticmethod 128 | def tap_name(): 129 | return "tap-postgres" 130 | 131 | @staticmethod 132 | def name(): 133 | return "tap_tester_postgres_logical_multiple_tables" 134 | 135 | @staticmethod 136 | def get_type(): 137 | return "platform.postgres" 138 | 139 | @staticmethod 140 | def get_credentials(): 141 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 142 | 143 | @staticmethod 144 | def get_properties(): 145 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 146 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 147 | 
'port' : os.getenv('TAP_POSTGRES_PORT'), 148 | 'user' : os.getenv('TAP_POSTGRES_USER'), 149 | 'default_replication_method' : 'LOG_BASED', 150 | 'logical_poll_total_seconds': '10' 151 | } 152 | 153 | 154 | def test_run(self): 155 | conn_id = connections.ensure_connection(self) 156 | 157 | # run in check mode 158 | check_job_name = runner.run_check_mode(self, conn_id) 159 | 160 | # verify check exit codes 161 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 162 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 163 | 164 | # verify the tap discovered the right streams 165 | found_catalogs = [fc for fc 166 | in menagerie.get_catalogs(conn_id) 167 | if fc['tap_stream_id'] in self.expected_check_streams()] 168 | 169 | 170 | self.assertGreaterEqual(len(found_catalogs), 171 | 2, 172 | msg="unable to locate schemas for connection {}".format(conn_id)) 173 | 174 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 175 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 176 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 177 | 178 | # verify that persisted streams have the correct properties 179 | 180 | test_catalog_cows = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] 181 | self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) 182 | 183 | 184 | test_catalog_chickens = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] 185 | self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) 186 | print("discovered streams are correct") 187 | 188 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 189 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_cows, 190 | menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), 191 | additional_md) 192 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_chickens, 193 | menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), 194 | additional_md) 195 | 196 | # clear state 197 | menagerie.set_state(conn_id, {}) 198 | 199 | sync_job_name = runner.run_sync_mode(self, conn_id) 200 | 201 | # verify tap and target exit codes 202 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 203 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 204 | 205 | record_count_by_stream = runner.examine_target_output_file(self, 206 | conn_id, 207 | self.expected_sync_streams(), 208 | self.expected_pks()) 209 | 210 | 211 | self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test_cows': 1, 'postgres_logical_replication_test_chickens': 1}) 212 | records_by_stream = runner.get_records_from_target_output() 213 | 214 | table_version_cows = records_by_stream['postgres_logical_replication_test_cows']['table_version'] 215 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][0]['action'], 'activate_version') 216 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][1]['action'], 'upsert') 217 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][2]['action'], 'activate_version') 218 | 219 | table_version_chickens = records_by_stream['postgres_logical_replication_test_chickens']['table_version'] 
220 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][0]['action'], 'activate_version') 221 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][1]['action'], 'upsert') 222 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][2]['action'], 'activate_version') 223 | 224 | # verify state and bookmarks 225 | state = menagerie.get_state(conn_id) 226 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 227 | 228 | bookmark_cows = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 229 | self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") 230 | lsn_cows_1 = bookmark_cows['lsn'] 231 | self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") 232 | 233 | bookmark_chickens = state['bookmarks']['dev-public-postgres_logical_replication_test_chickens'] 234 | self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") 235 | lsn_chickens_1 = bookmark_chickens['lsn'] 236 | self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") 237 | 238 | 239 | #---------------------------------------------------------------------- 240 | # invoke the sync job again after adding records 241 | #---------------------------------------------------------------------- 242 | print("inserting 2 more cows and 2 more chickens") 243 | 244 | with db_utils.get_test_connection('dev') as conn: 245 | conn.autocommit = True 246 | with conn.cursor() as cur: 247 | # insert another cow 248 | self.cows_rec_2 = {'cow_name' : "betty cow", 'cow_age' : 21} 249 | insert_record(cur, test_table_name_cows, self.cows_rec_2) 250 | # update that cow's expected values 251 | self.cows_rec_2['id'] = 2 252 | self.cows_rec_2['_sdc_deleted_at'] = None 253 | 254 | # insert another chicken 255 | self.chicken_rec_2 = {'chicken_name' : "burt chicken", 'chicken_age' : 14} 256 | insert_record(cur, test_table_name_chickens, self.chicken_rec_2) 257 | # update that cow's expected values 258 | self.chicken_rec_2['id'] = 2 259 | self.chicken_rec_2['_sdc_deleted_at'] = None 260 | 261 | # and repeat... 
262 | 263 | self.cows_rec_3 = {'cow_name' : "cindy cow", 'cow_age' : 10} 264 | insert_record(cur, test_table_name_cows, self.cows_rec_3) 265 | self.cows_rec_3['id'] = 3 266 | self.cows_rec_3['_sdc_deleted_at'] = None 267 | 268 | 269 | self.chicken_rec_3 = {'chicken_name' : "carl chicken", 'chicken_age' : 4} 270 | insert_record(cur, test_table_name_chickens, self.chicken_rec_3) 271 | self.chicken_rec_3['id'] = 3 272 | self.chicken_rec_3['_sdc_deleted_at'] = None 273 | 274 | 275 | sync_job_name = runner.run_sync_mode(self, conn_id) 276 | 277 | # verify tap and target exit codes 278 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 279 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 280 | 281 | record_count_by_stream = runner.examine_target_output_file(self, 282 | conn_id, 283 | self.expected_sync_streams(), 284 | self.expected_pks()) 285 | self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test_cows': 2, 'postgres_logical_replication_test_chickens': 2}) 286 | records_by_stream = runner.get_records_from_target_output() 287 | chicken_messages = records_by_stream["postgres_logical_replication_test_chickens"]['messages'] 288 | cow_messages = records_by_stream["postgres_logical_replication_test_cows"]['messages'] 289 | 290 | self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data']) 291 | self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data']) 292 | self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data']) 293 | self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data']) 294 | 295 | print("inserted records are correct") 296 | 297 | state = menagerie.get_state(conn_id) 298 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 299 | cows_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 300 | self.assertIsNotNone(cows_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 301 | lsn_cows_2 = cows_bookmark['lsn'] 302 | self.assertTrue(lsn_cows_2 >= lsn_cows_1) 303 | 304 | chickens_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_chickens'] 305 | self.assertIsNotNone(chickens_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 306 | lsn_chickens_2 = chickens_bookmark['lsn'] 307 | self.assertTrue(lsn_chickens_2 >= lsn_chickens_1) 308 | 309 | #table_version does NOT change 310 | self.assertEqual(chickens_bookmark['version'], table_version_chickens, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 311 | 312 | #table_version does NOT change 313 | self.assertEqual(cows_bookmark['version'], table_version_cows, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 314 | -------------------------------------------------------------------------------- /tests/test_postgres_logical_replication_multiple_dbs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | 13 | def insert_record(cursor, table_name, data): 14 | our_keys = list(data.keys()) 15 | our_keys.sort() 16 | our_values =
[data.get(key) for key in our_keys] 17 | 18 | columns_sql = ", \n ".join(our_keys) 19 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 20 | 21 | insert_sql = """ INSERT INTO {} 22 | ( {} ) 23 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 24 | cursor.execute(insert_sql, our_values) 25 | 26 | 27 | test_schema_name = "public" 28 | test_table_name_cows = "postgres_logical_replication_test_cows" 29 | test_table_name_chickens = "postgres_logical_replication_test_chickens" 30 | 31 | def canonicalized_table_name(schema, table, cur): 32 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 33 | 34 | 35 | class PostgresLogicalRepMultipleDBs(unittest.TestCase): 36 | def tearDown(self): 37 | with db_utils.get_test_connection('dev') as conn: 38 | conn.autocommit = True 39 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 40 | cur.execute(""" SELECT pg_drop_replication_slot('stitch_dev') """) 41 | 42 | with db_utils.get_test_connection('postgres') as conn: 43 | conn.autocommit = True 44 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 45 | cur.execute(""" SELECT pg_drop_replication_slot('stitch_postgres') """) 46 | 47 | def setUp(self): 48 | db_utils.ensure_environment_variables_set() 49 | 50 | db_utils.ensure_db('dev') 51 | db_utils.ensure_db('postgres') 52 | 53 | self.maxDiff = None 54 | 55 | with db_utils.get_test_connection('dev') as conn: 56 | conn.autocommit = True 57 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 58 | cur.execute(""" SELECT EXISTS (SELECT 1 59 | FROM pg_replication_slots 60 | WHERE slot_name = 'stitch_dev') """) 61 | old_slot = cur.fetchone()[0] 62 | with db_utils.get_test_connection('dev', True) as conn2: 63 | with conn2.cursor() as cur2: 64 | if old_slot: 65 | cur2.drop_replication_slot("stitch_dev") 66 | cur2.create_replication_slot('stitch_dev', output_plugin='wal2json') 67 | 68 | old_table = cur.execute("""SELECT EXISTS ( 69 | SELECT 1 70 | FROM information_schema.tables 71 | WHERE table_schema = %s 72 | AND table_name = %s);""", 73 | [test_schema_name, test_table_name_cows]) 74 | old_table = cur.fetchone()[0] 75 | 76 | if old_table: 77 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur))) 78 | 79 | #create dev_cows 80 | cur = conn.cursor() 81 | create_table_sql = """ 82 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 83 | cow_age integer, 84 | cow_name varchar) 85 | """.format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur)) 86 | cur.execute(create_table_sql) 87 | 88 | #insert a cow 89 | self.cows_rec_1 = {'cow_name' : "anne_cow", 'cow_age' : 30} 90 | insert_record(cur, test_table_name_cows, self.cows_rec_1) 91 | 92 | 93 | with db_utils.get_test_connection('postgres') as conn: 94 | conn.autocommit = True 95 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 96 | cur.execute(""" SELECT EXISTS (SELECT 1 97 | FROM pg_replication_slots 98 | WHERE slot_name = 'stitch_postgres') """) 99 | old_slot = cur.fetchone()[0] 100 | with db_utils.get_test_connection('postgres', True) as conn2: 101 | with conn2.cursor() as cur2: 102 | if old_slot: 103 | cur2.drop_replication_slot("stitch_postgres") 104 | cur2.create_replication_slot('stitch_postgres', output_plugin='wal2json') 105 | 106 | 107 | old_table = cur.execute("""SELECT EXISTS ( 108 | SELECT 1 109 | FROM information_schema.tables 110 | WHERE table_schema = %s 111 | AND table_name = %s);""", 112 | 
[test_schema_name, test_table_name_chickens]) 113 | old_table = cur.fetchone()[0] 114 | 115 | if old_table: 116 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur))) 117 | 118 | 119 | #create postgres_chickens 120 | create_table_sql = """ 121 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 122 | chicken_age integer, 123 | chicken_name varchar) 124 | """.format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur)) 125 | cur.execute(create_table_sql) 126 | 127 | 128 | #insert a chicken 129 | self.chickens_rec_1 = {'chicken_name' : "alfred_chicken", 'chicken_age' : 4} 130 | insert_record(cur, test_table_name_chickens, self.chickens_rec_1) 131 | 132 | @staticmethod 133 | def expected_check_streams(): 134 | return { 'dev-public-postgres_logical_replication_test_cows', 'postgres-public-postgres_logical_replication_test_chickens'} 135 | 136 | @staticmethod 137 | def expected_sync_streams(): 138 | return { 'public_postgres_logical_replication_test_cows', 'public_postgres_logical_replication_test_chickens' } 139 | 140 | @staticmethod 141 | def expected_pks(): 142 | return { 143 | 'public_postgres_logical_replication_test_cows' : {'id'}, 144 | 'public_postgres_logical_replication_test_chickens' : {'id'} 145 | } 146 | 147 | @staticmethod 148 | def tap_name(): 149 | return "tap-postgres" 150 | 151 | @staticmethod 152 | def name(): 153 | return "tap_tester_postgres_logical_multiple_dbs" 154 | 155 | @staticmethod 156 | def get_type(): 157 | return "platform.postgres" 158 | 159 | @staticmethod 160 | def get_credentials(): 161 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 162 | 163 | @staticmethod 164 | def get_properties(): 165 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 166 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 167 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 168 | 'user' : os.getenv('TAP_POSTGRES_USER'), 169 | 'default_replication_method' : 'LOG_BASED', 170 | 'include_schemas_in_destination_stream_name' : 'true', 171 | 'debug_lsn': 'true', 172 | 'logical_poll_total_seconds': '10' 173 | } 174 | 175 | 176 | def test_run(self): 177 | conn_id = connections.ensure_connection(self) 178 | 179 | # run in check mode 180 | check_job_name = runner.run_check_mode(self, conn_id) 181 | 182 | # verify check exit codes 183 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 184 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 185 | 186 | # verify the tap discovered the right streams 187 | found_catalogs = [fc for fc 188 | in menagerie.get_catalogs(conn_id) 189 | if fc['tap_stream_id'] in self.expected_check_streams()] 190 | 191 | self.assertGreaterEqual(len(found_catalogs), 192 | 2, 193 | msg="unable to locate schemas for connection {}".format(conn_id)) 194 | 195 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 196 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 197 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 198 | 199 | # verify that persisted streams have the correct properties 200 | test_catalog_cows = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] 201 | self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) 202 | 203 | 204 | test_catalog_chickens = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] 205 | 
self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) 206 | print("discovered streams are correct") 207 | 208 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 209 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_cows, 210 | menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), 211 | additional_md) 212 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_chickens, 213 | menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), 214 | additional_md) 215 | 216 | # clear state 217 | menagerie.set_state(conn_id, {}) 218 | 219 | #run sync job 220 | sync_job_name = runner.run_sync_mode(self, conn_id) 221 | 222 | # verify tap and target exit codes 223 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 224 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 225 | 226 | record_count_by_stream = runner.examine_target_output_file(self, 227 | conn_id, 228 | self.expected_sync_streams(), 229 | self.expected_pks()) 230 | 231 | 232 | self.assertEqual(record_count_by_stream, { 'public_postgres_logical_replication_test_cows': 1, 'public_postgres_logical_replication_test_chickens': 1}) 233 | records_by_stream = runner.get_records_from_target_output() 234 | 235 | table_version_cows = records_by_stream['public_postgres_logical_replication_test_cows']['table_version'] 236 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][0]['action'], 'activate_version') 237 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][1]['action'], 'upsert') 238 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][2]['action'], 'activate_version') 239 | 240 | table_version_chickens = records_by_stream['public_postgres_logical_replication_test_chickens']['table_version'] 241 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][0]['action'], 'activate_version') 242 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][1]['action'], 'upsert') 243 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][2]['action'], 'activate_version') 244 | 245 | # verify state and bookmarks 246 | state = menagerie.get_state(conn_id) 247 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 248 | 249 | bookmark_cows = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 250 | self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") 251 | lsn_cows_1 = bookmark_cows['lsn'] 252 | self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") 253 | 254 | bookmark_chickens = state['bookmarks']['postgres-public-postgres_logical_replication_test_chickens'] 255 | self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") 256 | lsn_chickens_1 = bookmark_chickens['lsn'] 257 | self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") 258 | 259 | 260 | #---------------------------------------------------------------------- 261 | # invoke the sync job again after adding records 262 | 
#---------------------------------------------------------------------- 263 | print("inserting 1 more cow and 1 more chicken") 264 | 265 | with db_utils.get_test_connection('dev') as conn: 266 | conn.autocommit = True 267 | with conn.cursor() as cur: 268 | #insert another cow 269 | self.cows_rec_2 = {'cow_name' : "betty cow", 'cow_age' : 21} 270 | insert_record(cur, test_table_name_cows, self.cows_rec_2) 271 | 272 | with db_utils.get_test_connection('postgres') as conn: 273 | conn.autocommit = True 274 | with conn.cursor() as cur: 275 | #insert another chicken 276 | self.chicken_rec_2 = {'chicken_name' : "burt chicken", 'chicken_age' : 14} 277 | insert_record(cur, test_table_name_chickens, self.chicken_rec_2) 278 | 279 | sync_job_name = runner.run_sync_mode(self, conn_id) 280 | 281 | # verify tap and target exit codes 282 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 283 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 284 | 285 | record_count_by_stream = runner.examine_target_output_file(self, 286 | conn_id, 287 | self.expected_sync_streams(), 288 | self.expected_pks()) 289 | self.assertEqual(record_count_by_stream, { 'public_postgres_logical_replication_test_cows': 1, 'public_postgres_logical_replication_test_chickens': 1}) 290 | 291 | upserts = [] 292 | for u in runner.get_upserts_from_target_output(): 293 | self.assertIsNotNone(u.get('_sdc_lsn')) 294 | del u['_sdc_lsn'] 295 | upserts.append(u) 296 | 297 | self.assertEqual([{'_sdc_deleted_at': None, 'cow_age': 21, 'id': 2, 'cow_name': 'betty cow'}, 298 | {'chicken_name': 'burt chicken', '_sdc_deleted_at': None, 'chicken_age': 14, 'id': 2}], 299 | upserts) 300 | 301 | print("inserted records are correct") 302 | 303 | state = menagerie.get_state(conn_id) 304 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 305 | cows_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 306 | self.assertIsNotNone(cows_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 307 | lsn_cows_2 = cows_bookmark['lsn'] 308 | self.assertTrue(lsn_cows_2 >= lsn_cows_1) 309 | 310 | chickens_bookmark = state['bookmarks']['postgres-public-postgres_logical_replication_test_chickens'] 311 | self.assertIsNotNone(chickens_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 312 | lsn_chickens_2 = chickens_bookmark['lsn'] 313 | self.assertTrue(lsn_chickens_2 >= lsn_chickens_1) 314 | 315 | #table_version does NOT change 316 | self.assertEqual(chickens_bookmark['version'], table_version_chickens, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 317 | 318 | #table_version does NOT change 319 | self.assertEqual(cows_bookmark['version'], table_version_cows, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 320 | -------------------------------------------------------------------------------- /tests/test_postgres_full_table_replication_arrays.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import decimal 3 | import json 4 | import os 5 | import unittest 6 | import uuid 7 | 8 | import pytz 9 | import psycopg2.extras 10 | from psycopg2.extensions import quote_ident 11 | import tap_tester.connections as connections 12 | import tap_tester.menagerie as menagerie 13 | import tap_tester.runner as runner 14 | 15
| import db_utils # pylint: disable=import-error 16 | 17 | 18 | 19 | test_schema_name = "public" 20 | test_table_name = "postgres_full_table_replication_array_test" 21 | 22 | 23 | MAX_SCALE = 38 24 | MAX_PRECISION = 100 25 | expected_schemas = {test_table_name: 26 | {'definitions' : { 27 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 28 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 29 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 30 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 31 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 32 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}}, 33 | "sdc_recursive_decimal_12_2_array": {"exclusiveMaximum": True, 34 | "exclusiveMinimum": True, 35 | "type": ['null', "number", "array"], 36 | "items": { 37 | "$ref": "#/definitions/sdc_recursive_decimal_12_2_array" 38 | }, 39 | "minimum": -10000000000, 40 | "multipleOf": decimal.Decimal('0.01'), 41 | "maximum": 10000000000}}, 42 | 'type': 'object', 43 | 'properties': {'id': {'maximum': 2147483647, 'type': ['integer'], 'minimum': -2147483648}, 44 | 'our_bit_array': {'items': { '$ref' : '#/definitions/sdc_recursive_boolean_array'}, 'type': ['null', 'array']}, 45 | 'our_boolean_array': {'items': { '$ref' : '#/definitions/sdc_recursive_boolean_array'}, 'type': ['null', 'array']}, 46 | 'our_cidr_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 47 | 'our_citext_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 48 | 'our_date_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_timestamp_array'}, 'type': ['null', 'array']}, 49 | 'our_decimal_array' : {'type': ['null', 'array'], 'items': {'$ref' : '#/definitions/sdc_recursive_decimal_12_2_array'}}, 50 | 'our_double_array': {'items': { '$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 51 | 'our_enum_array': {'type': ['null', 'array'], 'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}} , 52 | 'our_float_array': {'items': { '$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 53 | 'our_hstore_array': {'items': { '$ref' : '#/definitions/sdc_recursive_object_array'}, 'type': ['null', 'array']}, 54 | 'our_inet_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 55 | 'our_int_array': {'items': { '$ref' : '#/definitions/sdc_recursive_integer_array'}, 'type': ['null', 'array']}, 56 | 'our_json_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 57 | 'our_jsonb_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 58 | 'our_mac_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 59 | 'our_money_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 60 | 'our_real_array': {'items': { 
'$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 61 | 'our_smallint_array': {'items': { '$ref' : '#/definitions/sdc_recursive_integer_array'}, 'type': ['null', 'array']}, 62 | 'our_string_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 63 | 'our_text_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 64 | 'our_time_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 65 | 'our_ts_tz_array': {'items': { '$ref' : '#/definitions/sdc_recursive_timestamp_array'}, 'type': ['null', 'array']}, 66 | 'our_uuid_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}} 67 | }} 68 | 69 | 70 | def insert_record(cursor, table_name, data): 71 | our_keys = list(data.keys()) 72 | our_keys.sort() 73 | our_values = [data.get(key) for key in our_keys] 74 | 75 | columns_sql = ", \n ".join(our_keys) 76 | value_sql_array = [] 77 | for k in our_keys: 78 | if k == 'our_json_array': 79 | value_sql_array.append("%s::json[]") 80 | elif k == 'our_jsonb_array': 81 | value_sql_array.append("%s::jsonb[]") 82 | else: 83 | value_sql_array.append("%s") 84 | 85 | value_sql = ",".join(value_sql_array) 86 | 87 | insert_sql = """ INSERT INTO {} 88 | ( {} ) 89 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 90 | cursor.execute(insert_sql, our_values) 91 | 92 | def canonicalized_table_name(schema, table, cur): 93 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 94 | 95 | 96 | class PostgresFullTableRepArrays(unittest.TestCase): 97 | def tearDown(self): 98 | with db_utils.get_test_connection('dev') as conn: 99 | conn.autocommit = True 100 | 101 | def setUp(self): 102 | db_utils.ensure_environment_variables_set() 103 | 104 | db_utils.ensure_db() 105 | 106 | self.maxDiff = None 107 | 108 | with db_utils.get_test_connection('dev') as conn: 109 | conn.autocommit = True 110 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 111 | old_table = cur.execute("""SELECT EXISTS ( 112 | SELECT 1 113 | FROM information_schema.tables 114 | WHERE table_schema = %s 115 | AND table_name = %s);""", 116 | [test_schema_name, test_table_name]) 117 | old_table = cur.fetchone()[0] 118 | 119 | if old_table: 120 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 121 | 122 | 123 | cur = conn.cursor() 124 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 125 | if cur.fetchone()[0] is None: 126 | cur.execute(""" CREATE EXTENSION hstore; """) 127 | 128 | cur.execute(""" CREATE EXTENSION IF NOT EXISTS citext WITH SCHEMA public;""") 129 | cur.execute(""" DROP TYPE IF EXISTS ALIGNMENT CASCADE """) 130 | cur.execute(""" CREATE TYPE ALIGNMENT AS ENUM ('good', 'bad', 'ugly') """) 131 | 132 | 133 | create_table_sql = """ 134 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 135 | our_bit_array BIT(1)[], 136 | our_boolean_array BOOLEAN[], 137 | our_cidr_array CIDR[], 138 | our_citext_array CITEXT[], 139 | our_date_array DATE[], 140 | our_decimal_array NUMERIC(12,2)[], 141 | our_double_array DOUBLE PRECISION[], 142 | our_enum_array ALIGNMENT[], 143 | our_float_array FLOAT[], 144 | our_hstore_array HSTORE[], 145 | our_inet_array INET[], 146 | our_int_array INTEGER[][], 147 | our_json_array JSON[], 148 | our_jsonb_array JSONB[], 149 | our_mac_array MACADDR[], 150 | 
our_money_array MONEY[], 151 | our_real_array REAL[], 152 | our_smallint_array SMALLINT[], 153 | our_string_array VARCHAR[], 154 | our_text_array TEXT[], 155 | our_time_array TIME[], 156 | our_ts_tz_array TIMESTAMP WITH TIME ZONE[], 157 | our_uuid_array UUID[]) 158 | """.format(canonicalized_table_name(test_schema_name, test_table_name, cur)) 159 | 160 | cur.execute(create_table_sql) 161 | 162 | @staticmethod 163 | def expected_check_streams(): 164 | return { 'dev-public-postgres_full_table_replication_array_test'} 165 | 166 | @staticmethod 167 | def expected_sync_streams(): 168 | return { test_table_name } 169 | 170 | @staticmethod 171 | def expected_pks(): 172 | return { 173 | test_table_name : {'id'} 174 | } 175 | 176 | @staticmethod 177 | def tap_name(): 178 | return "tap-postgres" 179 | 180 | @staticmethod 181 | def name(): 182 | return "tap_tester_postgres_full_table_replication_arrays" 183 | 184 | @staticmethod 185 | def get_type(): 186 | return "platform.postgres" 187 | 188 | @staticmethod 189 | def get_credentials(): 190 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 191 | 192 | @staticmethod 193 | def get_properties(): 194 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 195 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 196 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 197 | 'user' : os.getenv('TAP_POSTGRES_USER'), 198 | 'default_replication_method' : 'LOG_BASED' 199 | } 200 | 201 | 202 | def test_run(self): 203 | conn_id = connections.ensure_connection(self) 204 | 205 | # run in check mode 206 | check_job_name = runner.run_check_mode(self, conn_id) 207 | 208 | # verify check exit codes 209 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 210 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 211 | 212 | # verify the tap discovered the right streams 213 | found_catalogs = [fc for fc 214 | in menagerie.get_catalogs(conn_id) 215 | if fc['tap_stream_id'] in self.expected_check_streams()] 216 | 217 | 218 | self.assertGreaterEqual(len(found_catalogs), 219 | 1, 220 | msg="unable to locate schemas for connection {}".format(conn_id)) 221 | 222 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 223 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 224 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 225 | 226 | # verify that persisted streams have the correct properties 227 | test_catalog = found_catalogs[0] 228 | 229 | self.assertEqual(test_table_name, test_catalog['stream_name']) 230 | 231 | print("discovered streams are correct") 232 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 233 | _ = connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog, 234 | menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), 235 | additional_md) 236 | 237 | # clear state 238 | menagerie.set_state(conn_id, {}) 239 | 240 | print("inserting a record") 241 | our_ts_tz = None 242 | our_date = None 243 | our_uuid = str(uuid.uuid1()) 244 | with db_utils.get_test_connection('dev') as conn: 245 | conn.autocommit = True 246 | with conn.cursor() as cur: 247 | #insert fixture data 2 248 | 249 | #insert fixture data 1 250 | our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184) 251 | nyc_tz = pytz.timezone('America/New_York') 252 | our_ts_tz = nyc_tz.localize(our_ts) 253 | our_date = datetime.date(1998, 3, 4) 254 | 255 | self.rec_1 = { 256 | 'our_bit_array' : '{{0,1,1}}', 257 | 
'our_boolean_array' : '{true}', 258 | 'our_cidr_array' : '{{192.168.100.128/25}}', 259 | 'our_citext_array' : '{{maGICKal 2}}', 260 | 'our_date_array' : '{{{}}}'.format(our_date), 261 | 'our_decimal_array' : '{{{}}}'.format(decimal.Decimal('1234567890.01')), 262 | 'our_double_array' : '{{1.232323}}', 263 | 'our_enum_array' : '{{bad}}', 264 | 'our_float_array' : '{{5.23}}', 265 | 'our_hstore_array' : """{{"size=>small","name=>betty"}}""", 266 | 'our_inet_array' : '{{192.168.100.128/24}}', 267 | 'our_int_array' : '{{1,2,3},{4,5,6}}', 268 | 'our_json_array' : [psycopg2.extras.Json({'secret' : 55})], 269 | 'our_jsonb_array' : [psycopg2.extras.Json({'secret' : 69})], 270 | 'our_mac_array' : '{{08:00:2b:01:02:03}}', 271 | 'our_money_array' : '{{$412.1234}}', 272 | 'our_real_array' : '{{76.33}}', 273 | 'our_smallint_array' : '{{10,20,30},{40,50,60}}', 274 | 'our_string_array' : '{{one string, two strings}}', 275 | 'our_text_array' : '{{three string, four}}', 276 | 'our_time_array' : '{{03:04:05}}', 277 | 'our_ts_tz_array' : '{{{}}}'.format(our_ts_tz), 278 | 'our_uuid_array' : '{{{}}}'.format(our_uuid)} 279 | 280 | 281 | insert_record(cur, test_table_name, self.rec_1) 282 | 283 | 284 | sync_job_name = runner.run_sync_mode(self, conn_id) 285 | 286 | # verify tap and target exit codes 287 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 288 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 289 | 290 | record_count_by_stream = runner.examine_target_output_file(self, 291 | conn_id, 292 | self.expected_sync_streams(), 293 | self.expected_pks()) 294 | self.assertEqual(record_count_by_stream, { test_table_name: 1 }) 295 | records_by_stream = runner.get_records_from_target_output() 296 | self.assertTrue(len(records_by_stream) > 0) 297 | 298 | for stream, recs in records_by_stream.items(): 299 | # verify the persisted schema was correct 300 | self.assertEqual(recs['schema'], 301 | expected_schemas[stream], 302 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 303 | 304 | self.assertEqual(3, len(records_by_stream[test_table_name]['messages'])) 305 | self.assertEqual(records_by_stream[test_table_name]['messages'][0]['action'], 306 | 'activate_version') 307 | self.assertEqual(records_by_stream[test_table_name]['messages'][1]['action'], 308 | 'upsert') 309 | self.assertEqual(records_by_stream[test_table_name]['messages'][2]['action'], 310 | 'activate_version') 311 | actual_record_1 = records_by_stream[test_table_name]['messages'][1]['data'] 312 | 313 | expected_inserted_record = {'id': 1, 314 | 'our_bit_array' : [[False, True, True]], 315 | 'our_boolean_array' : [True], 316 | 'our_cidr_array' : [['192.168.100.128/25']], 317 | 'our_citext_array' : [['maGICKal 2']], 318 | 'our_date_array' : ['1998-03-04T00:00:00+00:00'], 319 | 'our_decimal_array' : [decimal.Decimal('1234567890.01')], 320 | 'our_double_array' : [[decimal.Decimal('1.232323')]], 321 | 'our_enum_array' : [['bad']], 322 | 'our_float_array' : [[decimal.Decimal('5.23')]], 323 | 'our_hstore_array' : [[{'size' : 'small' }, {'name' : 'betty'} ]], 324 | 'our_inet_array' : [['192.168.100.128/24']], 325 | 'our_int_array' : [[1,2,3],[4,5,6]], 326 | 'our_json_array' : [json.dumps({'secret' : 55})], 327 | 'our_jsonb_array' : [json.dumps({'secret' : 69})], 328 | 'our_mac_array' : [['08:00:2b:01:02:03']], 329 | 'our_money_array' : [['$412.12']], 330 | 'our_real_array' : [[decimal.Decimal('76.33')]], 331 | 'our_smallint_array' : [[10,20,30],[40,50,60]], 332 | 'our_string_array' : [['one 
string', 'two strings']], 333 | 'our_text_array' : [['three string', 'four']], 334 | 'our_time_array' : [['03:04:05']], 335 | 'our_ts_tz_array' : ['1997-02-02T07:02:02.722184+00:00'], 336 | 'our_uuid_array' : ['{}'.format(our_uuid)] 337 | 338 | } 339 | 340 | self.assertEqual(set(actual_record_1.keys()), set(expected_inserted_record.keys()), 341 | msg="keys for expected_record_1 are wrong: {}".format(set(actual_record_1.keys()).symmetric_difference(set(expected_inserted_record.keys())))) 342 | 343 | for k in actual_record_1.keys(): 344 | self.assertEqual(actual_record_1[k], expected_inserted_record[k], msg="{} != {} for key {}".format(actual_record_1[k], expected_inserted_record[k], k)) 345 | 346 | print("inserted record is correct") 347 | 348 | # verify state and bookmarks 349 | state = menagerie.get_state(conn_id) 350 | 351 | bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_array_test'] 352 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 353 | 354 | self.assertIsNone(bookmark.get('lsn'), 355 | msg="expected bookmark for stream to have NO lsn because we are using full-table replication") 356 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/logical_replication.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=missing-docstring,not-an-iterable,too-many-locals,too-many-arguments,invalid-name,too-many-return-statements,too-many-branches,len-as-condition,too-many-nested-blocks,wrong-import-order,duplicate-code, anomalous-backslash-in-string, too-many-statements, singleton-comparison, consider-using-in 3 | 4 | import singer 5 | import datetime 6 | import decimal 7 | from singer import utils, get_bookmark 8 | import singer.metadata as metadata 9 | import tap_postgres.db as post_db 10 | import tap_postgres.sync_strategies.common as sync_common 11 | from dateutil.parser import parse 12 | import psycopg2 13 | from psycopg2 import sql 14 | import copy 15 | from select import select 16 | from functools import reduce 17 | import json 18 | import re 19 | 20 | LOGGER = singer.get_logger() 21 | 22 | UPDATE_BOOKMARK_PERIOD = 1000 23 | 24 | def get_pg_version(cur): 25 | cur.execute("SELECT version()") 26 | res = cur.fetchone()[0] 27 | version_match = re.match('PostgreSQL (\d+)', res) 28 | if not version_match: 29 | raise Exception('unable to determine PostgreSQL version from {}'.format(res)) 30 | 31 | version = int(version_match.group(1)) 32 | LOGGER.info("Detected PostgreSQL version: %s", version) 33 | return version 34 | 35 | def fetch_current_lsn(conn_config): 36 | with post_db.open_connection(conn_config, False) as conn: 37 | with conn.cursor() as cur: 38 | version = get_pg_version(cur) 39 | if version == 9: 40 | cur.execute("SELECT pg_current_xlog_location()") 41 | elif version > 9: 42 | cur.execute("SELECT pg_current_wal_lsn()") 43 | else: 44 | raise Exception('unable to fetch current lsn for PostgreSQL version {}'.format(version)) 45 | 46 | current_lsn = cur.fetchone()[0] 47 | file, index = current_lsn.split('/') 48 | return (int(file, 16) << 32) + int(index, 16) # a textual LSN like '16/B374D848' is two hex halves: high 32 bits before the slash, low 32 bits after 49 | 50 | def add_automatic_properties(stream, conn_config): 51 | stream['schema']['properties']['_sdc_deleted_at'] = {'type' : ['null', 'string'], 'format' :'date-time'} 52 | if conn_config.get('debug_lsn'): 53 | LOGGER.info('debug_lsn is ON') 54 | stream['schema']['properties']['_sdc_lsn'] = {'type' : ['null',
'string']} 55 | else: 56 | LOGGER.info('debug_lsn is OFF') 57 | 58 | return stream 59 | 60 | def get_stream_version(tap_stream_id, state): 61 | stream_version = singer.get_bookmark(state, tap_stream_id, 'version') 62 | 63 | if stream_version is None: 64 | raise Exception("version not found for log miner {}".format(tap_stream_id)) 65 | 66 | return stream_version 67 | 68 | def tuples_to_map(accum, t): 69 | accum[t[0]] = t[1] 70 | return accum 71 | 72 | def create_hstore_elem_query(elem): 73 | return sql.SQL("SELECT hstore_to_array({})").format(sql.Literal(elem)) 74 | 75 | def create_hstore_elem(conn_info, elem): 76 | with post_db.open_connection(conn_info) as conn: 77 | with conn.cursor() as cur: 78 | query = create_hstore_elem_query(elem) 79 | cur.execute(query) 80 | res = cur.fetchone()[0] 81 | hstore_elem = reduce(tuples_to_map, [res[i:i + 2] for i in range(0, len(res), 2)], {}) 82 | return hstore_elem 83 | 84 | def create_array_elem(elem, sql_datatype, conn_info): 85 | if elem is None: 86 | return None 87 | 88 | with post_db.open_connection(conn_info) as conn: 89 | with conn.cursor() as cur: 90 | if sql_datatype == 'bit[]': 91 | cast_datatype = 'boolean[]' 92 | elif sql_datatype == 'boolean[]': 93 | cast_datatype = 'boolean[]' 94 | elif sql_datatype == 'character varying[]': 95 | cast_datatype = 'character varying[]' 96 | elif sql_datatype == 'cidr[]': 97 | cast_datatype = 'cidr[]' 98 | elif sql_datatype == 'citext[]': 99 | cast_datatype = 'text[]' 100 | elif sql_datatype == 'date[]': 101 | cast_datatype = 'text[]' 102 | elif sql_datatype == 'double precision[]': 103 | cast_datatype = 'double precision[]' 104 | elif sql_datatype == 'hstore[]': 105 | cast_datatype = 'text[]' 106 | elif sql_datatype == 'integer[]': 107 | cast_datatype = 'integer[]' 108 | elif sql_datatype == 'bigint[]': 109 | cast_datatype = 'bigint[]' 110 | elif sql_datatype == 'inet[]': 111 | cast_datatype = 'inet[]' 112 | elif sql_datatype == 'json[]': 113 | cast_datatype = 'text[]' 114 | elif sql_datatype == 'jsonb[]': 115 | cast_datatype = 'text[]' 116 | elif sql_datatype == 'macaddr[]': 117 | cast_datatype = 'macaddr[]' 118 | elif sql_datatype == 'money[]': 119 | cast_datatype = 'text[]' 120 | elif sql_datatype == 'numeric[]': 121 | cast_datatype = 'text[]' 122 | elif sql_datatype == 'real[]': 123 | cast_datatype = 'real[]' 124 | elif sql_datatype == 'smallint[]': 125 | cast_datatype = 'smallint[]' 126 | elif sql_datatype == 'text[]': 127 | cast_datatype = 'text[]' 128 | elif sql_datatype in ('time without time zone[]', 'time with time zone[]'): 129 | cast_datatype = 'text[]' 130 | elif sql_datatype in ('timestamp with time zone[]', 'timestamp without time zone[]'): 131 | cast_datatype = 'text[]' 132 | elif sql_datatype == 'uuid[]': 133 | cast_datatype = 'text[]' 134 | 135 | else: 136 | #custom datatypes like enums 137 | cast_datatype = 'text[]' 138 | 139 | sql_stmt = """SELECT $stitch_quote${}$stitch_quote$::{}""".format(elem, cast_datatype) 140 | cur.execute(sql_stmt) 141 | res = cur.fetchone()[0] 142 | return res 143 | 144 | #pylint: disable=too-many-branches,too-many-nested-blocks 145 | def selected_value_to_singer_value_impl(elem, og_sql_datatype, conn_info): 146 | sql_datatype = og_sql_datatype.replace('[]', '') 147 | 148 | if elem is None: 149 | return elem 150 | if sql_datatype == 'timestamp without time zone': 151 | return parse(elem).isoformat() + '+00:00' 152 | if sql_datatype == 'timestamp with time zone': 153 | if isinstance(elem, datetime.datetime): 154 | return elem.isoformat() 155 | 156 | return 
parse(elem).isoformat() 157 | if sql_datatype == 'date': 158 | if isinstance(elem, datetime.date): 159 | #logical replication gives us dates as strings UNLESS they come from an array 160 | return elem.isoformat() + 'T00:00:00+00:00' 161 | return parse(elem).isoformat() + "+00:00" 162 | if sql_datatype == 'time with time zone': 163 | return parse(elem).isoformat().split('T')[1] 164 | if sql_datatype == 'bit': 165 | #for arrays, elem will == True 166 | #for ordinary bits, elem will == '1' 167 | return elem == '1' or elem == True 168 | if sql_datatype == 'boolean': 169 | return elem 170 | if sql_datatype == 'hstore': 171 | return create_hstore_elem(conn_info, elem) 172 | if 'numeric' in sql_datatype: 173 | return decimal.Decimal(str(elem)) 174 | if isinstance(elem, int): 175 | return elem 176 | if isinstance(elem, float): 177 | return elem 178 | if isinstance(elem, str): 179 | return elem 180 | 181 | raise Exception("do not know how to marshal value of type {}".format(elem.__class__)) 182 | 183 | def selected_array_to_singer_value(elem, sql_datatype, conn_info): 184 | if isinstance(elem, list): 185 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype, conn_info), elem)) 186 | 187 | return selected_value_to_singer_value_impl(elem, sql_datatype, conn_info) 188 | 189 | def selected_value_to_singer_value(elem, sql_datatype, conn_info): 190 | #are we dealing with an array? 191 | if sql_datatype.find('[]') > 0: 192 | cleaned_elem = create_array_elem(elem, sql_datatype, conn_info) 193 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype, conn_info), (cleaned_elem or []))) 194 | 195 | return selected_value_to_singer_value_impl(elem, sql_datatype, conn_info) 196 | 197 | def row_to_singer_message(stream, row, version, columns, time_extracted, md_map, conn_info): 198 | row_to_persist = () 199 | md_map[('properties', '_sdc_deleted_at')] = {'sql-datatype' : 'timestamp with time zone'} 200 | md_map[('properties', '_sdc_lsn')] = {'sql-datatype' : "character varying"} 201 | 202 | for idx, elem in enumerate(row): 203 | sql_datatype = md_map.get(('properties', columns[idx])).get('sql-datatype') 204 | 205 | if not sql_datatype: 206 | LOGGER.info("No sql-datatype found for stream %s: %s", stream, columns[idx]) 207 | raise Exception("Unable to find sql-datatype for stream {}".format(stream)) 208 | 209 | cleaned_elem = selected_value_to_singer_value(elem, sql_datatype, conn_info) 210 | row_to_persist += (cleaned_elem,) 211 | 212 | rec = dict(zip(columns, row_to_persist)) 213 | 214 | return singer.RecordMessage( 215 | stream=post_db.calculate_destination_stream_name(stream, md_map), 216 | record=rec, 217 | version=version, 218 | time_extracted=time_extracted) 219 | 220 | def consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn): 221 | ## Action Types: 222 | # I = Insert 223 | # U = Update 224 | # D = Delete 225 | # B = Begin Transaction 226 | # C = Commit Transaction 227 | # M = Message 228 | # T = Truncate 229 | action = payload['action'] 230 | if action not in ['U', 'I', 'D']: 231 | LOGGER.debug("Skipping message of type %s", action) 232 | yield None 233 | else: 234 | tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], payload['schema'], payload['table']) 235 | if streams_lookup.get(tap_stream_id) is None: 236 | yield None 237 | else: 238 | target_stream = streams_lookup[tap_stream_id] 239 | stream_version = get_stream_version(target_stream['tap_stream_id'], state) 240 | stream_md_map =
metadata.to_map(target_stream['metadata']) 241 | 242 | desired_columns = [col for col in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(stream_md_map, col)] 243 | 244 | col_names = [] 245 | col_vals = [] 246 | if payload['action'] in ['I', 'U']: 247 | for column in payload['columns']: 248 | if column['name'] in set(desired_columns): 249 | col_names.append(column['name']) 250 | col_vals.append(column['value']) 251 | 252 | col_names = col_names + ['_sdc_deleted_at'] 253 | col_vals = col_vals + [None] 254 | 255 | if conn_info.get('debug_lsn'): 256 | col_names = col_names + ['_sdc_lsn'] 257 | col_vals = col_vals + [str(lsn)] 258 | 259 | elif payload['action'] == 'D': 260 | for column in payload['identity']: 261 | if column['name'] in set(desired_columns): 262 | col_names.append(column['name']) 263 | col_vals.append(column['value']) 264 | 265 | col_names = col_names + ['_sdc_deleted_at'] 266 | col_vals = col_vals + [singer.utils.strftime(singer.utils.strptime_to_utc(payload['timestamp']))] 267 | 268 | if conn_info.get('debug_lsn'): 269 | col_vals = col_vals + [str(lsn)] 270 | col_names = col_names + ['_sdc_lsn'] 271 | 272 | # Yield 1 record to match the API of V1 273 | yield row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 274 | 275 | state = singer.write_bookmark(state, 276 | target_stream['tap_stream_id'], 277 | 'lsn', 278 | lsn) 279 | 280 | # message-format v1 281 | def consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn): 282 | for c in payload['change']: 283 | tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], c['schema'], c['table']) 284 | if streams_lookup.get(tap_stream_id) is None: 285 | continue 286 | 287 | target_stream = streams_lookup[tap_stream_id] 288 | stream_version = get_stream_version(target_stream['tap_stream_id'], state) 289 | stream_md_map = metadata.to_map(target_stream['metadata']) 290 | 291 | 292 | desired_columns = [c for c in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(stream_md_map, c)] 293 | 294 | if c['kind'] == 'insert': 295 | col_names = [] 296 | col_vals = [] 297 | for idx, col in enumerate(c['columnnames']): 298 | if col in set(desired_columns): 299 | col_names.append(col) 300 | col_vals.append(c['columnvalues'][idx]) 301 | 302 | col_names = col_names + ['_sdc_deleted_at'] 303 | col_vals = col_vals + [None] 304 | if conn_info.get('debug_lsn'): 305 | col_names = col_names + ['_sdc_lsn'] 306 | col_vals = col_vals + [str(lsn)] 307 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 308 | 309 | elif c['kind'] == 'update': 310 | col_names = [] 311 | col_vals = [] 312 | for idx, col in enumerate(c['columnnames']): 313 | if col in set(desired_columns): 314 | col_names.append(col) 315 | col_vals.append(c['columnvalues'][idx]) 316 | 317 | col_names = col_names + ['_sdc_deleted_at'] 318 | col_vals = col_vals + [None] 319 | 320 | if conn_info.get('debug_lsn'): 321 | col_vals = col_vals + [str(lsn)] 322 | col_names = col_names + ['_sdc_lsn'] 323 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 324 | 325 | elif c['kind'] == 'delete': 326 | col_names = [] 327 | col_vals = [] 328 | for idx, col in enumerate(c['oldkeys']['keynames']): 329 | if col in set(desired_columns): 330 | col_names.append(col) 331 | 
col_vals.append(c['oldkeys']['keyvalues'][idx]) 332 | 333 | 334 | col_names = col_names + ['_sdc_deleted_at'] 335 | col_vals = col_vals + [singer.utils.strftime(time_extracted)] 336 | if conn_info.get('debug_lsn'): 337 | col_vals = col_vals + [str(lsn)] 338 | col_names = col_names + ['_sdc_lsn'] 339 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 340 | 341 | else: 342 | raise Exception("unrecognized replication operation: {}".format(c['kind'])) 343 | 344 | 345 | yield record_message 346 | state = singer.write_bookmark(state, 347 | target_stream['tap_stream_id'], 348 | 'lsn', 349 | lsn) 350 | 351 | 352 | def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn, message_format="1"): 353 | payload = json.loads(msg.payload) 354 | lsn = msg.data_start 355 | 356 | streams_lookup = {s['tap_stream_id']: s for s in streams} 357 | 358 | if message_format == "1": 359 | records = consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn) 360 | elif message_format == "2": 361 | records = consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn) 362 | else: 363 | raise Exception("Unknown wal2json message format version: {}".format(message_format)) 364 | 365 | for record_message in records: 366 | if record_message: 367 | singer.write_message(record_message) 368 | # Pulled out of refactor so we send a keep-alive per-record 369 | LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive") 370 | msg.cursor.send_feedback() 371 | 372 | LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start) 373 | if msg.data_start > end_lsn: 374 | raise Exception("incorrectly attempting to flush an lsn({}) > end_lsn({})".format(msg.data_start, end_lsn)) 375 | 376 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 377 | 378 | 379 | return state 380 | 381 | def locate_replication_slot(conn_info): 382 | with post_db.open_connection(conn_info, False) as conn: 383 | with conn.cursor() as cur: 384 | db_specific_slot = "stitch_{}".format(conn_info['dbname']) 385 | cur.execute("SELECT * FROM pg_replication_slots WHERE slot_name = %s AND plugin = %s", (db_specific_slot, 'wal2json')) 386 | if len(cur.fetchall()) == 1: 387 | LOGGER.info("using pg_replication_slot %s", db_specific_slot) 388 | return db_specific_slot 389 | 390 | 391 | cur.execute("SELECT * FROM pg_replication_slots WHERE slot_name = 'stitch' AND plugin = 'wal2json'") 392 | if len(cur.fetchall()) == 1: 393 | LOGGER.info("using pg_replication_slot 'stitch'") 394 | return 'stitch' 395 | 396 | raise Exception("Unable to find replication slot (stitch || {}) with wal2json".format(db_specific_slot)) 397 | 398 | 399 | def sync_tables(conn_info, logical_streams, state, end_lsn): 400 | start_lsn = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams]) 401 | time_extracted = utils.now() 402 | slot = locate_replication_slot(conn_info) 403 | last_lsn_processed = None 404 | poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30 #we are willing to poll for a total of 30 minutes without finding a record 405 | keep_alive_time = 10.0 406 | begin_ts = datetime.datetime.now() 407 | 408 | for s in logical_streams: 409 | sync_common.send_schema_message(s, ['lsn']) 410 | 411 | with post_db.open_connection(conn_info, True) as conn: 412 | with conn.cursor() as cur: 413 | LOGGER.info("Starting Logical Replication for %s(%s): %s -> %s.
poll_total_seconds: %s", list(map(lambda s: s['tap_stream_id'], logical_streams)), slot, start_lsn, end_lsn, poll_total_seconds) 414 | 415 | replication_params = {"slot_name": slot, 416 | "decode": True, 417 | "start_lsn": start_lsn} 418 | message_format = conn_info.get("wal2json_message_format") or "1" 419 | if message_format == "2": 420 | LOGGER.info("Using wal2json format-version 2") 421 | replication_params["options"] = {"format-version": 2, "include-timestamp": True} 422 | 423 | try: 424 | cur.start_replication(**replication_params) 425 | except psycopg2.ProgrammingError: 426 | raise Exception("unable to start replication with logical replication slot {}".format(slot)) 427 | 428 | rows_saved = 0 429 | while True: 430 | poll_duration = (datetime.datetime.now() - begin_ts).total_seconds() 431 | if poll_duration > poll_total_seconds: 432 | LOGGER.info("breaking after %s seconds of polling with no data", poll_duration) 433 | break 434 | 435 | msg = cur.read_message() 436 | if msg: 437 | begin_ts = datetime.datetime.now() 438 | if msg.data_start > end_lsn: 439 | LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn) 440 | break 441 | 442 | state = consume_message(logical_streams, state, msg, time_extracted, 443 | conn_info, end_lsn, message_format=message_format) 444 | #msg has been consumed. it has been processed 445 | last_lsn_processed = msg.data_start 446 | rows_saved = rows_saved + 1 447 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 448 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 449 | else: 450 | now = datetime.datetime.now() 451 | timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds() 452 | try: 453 | sel = select([cur], [], [], max(0, timeout)) 454 | if not any(sel): 455 | LOGGER.info("no data for %s seconds. sending feedback to server with NO flush_lsn. 
just a keep-alive", timeout) 456 | cur.send_feedback() 457 | 458 | except InterruptedError: 459 | pass # recalculate timeout and continue 460 | 461 | if last_lsn_processed: 462 | for s in logical_streams: 463 | LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s", s['tap_stream_id'], last_lsn_processed) 464 | state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', last_lsn_processed) 465 | 466 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 467 | return state 468 | -------------------------------------------------------------------------------- /tests/test_postgres_discovery.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import unittest 4 | import decimal 5 | import uuid 6 | import json 7 | 8 | from psycopg2.extensions import quote_ident 9 | import psycopg2.extras 10 | import pytz 11 | import tap_tester.connections as connections 12 | import tap_tester.menagerie as menagerie 13 | import tap_tester.runner as runner 14 | 15 | import db_utils # pylint: disable=import-error 16 | 17 | 18 | test_schema_name = "public" 19 | test_table_name = "postgres_discovery_test" 20 | test_db = "discovery1" 21 | 22 | 23 | class PostgresDiscovery(unittest.TestCase): 24 | AUTOMATIC_FIELDS = "automatic" 25 | REPLICATION_KEYS = "valid-replication-keys" 26 | PRIMARY_KEYS = "table-key-properties" 27 | FOREIGN_KEYS = "table-foreign-key-properties" 28 | REPLICATION_METHOD = "forced-replication-method" 29 | API_LIMIT = "max-row-limit" 30 | INCREMENTAL = "INCREMENTAL" 31 | FULL_TABLE = "FULL_TABLE" 32 | LOG_BASED = "LOG_BASED" 33 | 34 | UNSUPPORTED_TYPES = { 35 | "BIGSERIAL", 36 | "BIT VARYING", 37 | "BOX", 38 | "BYTEA", 39 | "CIRCLE", 40 | "INTERVAL", 41 | "LINE", 42 | "LSEG", 43 | "PATH", 44 | "PG_LSN", 45 | "POINT", 46 | "POLYGON", 47 | "SERIAL", 48 | "SMALLSERIAL", 49 | "TSQUERY", 50 | "TSVECTOR", 51 | "TXID_SNAPSHOT", 52 | "XML", 53 | } 54 | default_replication_method = "" 55 | 56 | def tearDown(self): 57 | pass 58 | # with db_utils.get_test_connection(test_db) as conn: 59 | # conn.autocommit = True 60 | # with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 61 | # cur.execute(""" SELECT pg_drop_replication_slot('stitch') """) 62 | 63 | def setUp(self): 64 | db_utils.ensure_environment_variables_set() 65 | 66 | db_utils.ensure_db(test_db) 67 | self.maxDiff = None 68 | 69 | with db_utils.get_test_connection(test_db) as conn: 70 | conn.autocommit = True 71 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 72 | 73 | # db_utils.ensure_replication_slot(cur, test_db) 74 | 75 | canonicalized_table_name = db_utils.canonicalized_table_name(cur, test_schema_name, test_table_name) 76 | 77 | create_table_sql = """ 78 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 79 | our_varchar VARCHAR, 80 | our_varchar_10 VARCHAR(10), 81 | our_text TEXT, 82 | our_text_2 TEXT, 83 | our_integer INTEGER, 84 | our_smallint SMALLINT, 85 | our_bigint BIGINT, 86 | our_decimal NUMERIC(12,2), 87 | "OUR TS" TIMESTAMP WITHOUT TIME ZONE, 88 | "OUR TS TZ" TIMESTAMP WITH TIME ZONE, 89 | "OUR TIME" TIME WITHOUT TIME ZONE, 90 | "OUR TIME TZ" TIME WITH TIME ZONE, 91 | "OUR DATE" DATE, 92 | our_double DOUBLE PRECISION, 93 | our_real REAL, 94 | our_boolean BOOLEAN, 95 | our_bit BIT(1), 96 | our_json JSON, 97 | our_jsonb JSONB, 98 | our_uuid UUID, 99 | our_store HSTORE, 100 | our_citext CITEXT, 101 | our_cidr cidr, 102 | our_inet inet, 103 | our_mac macaddr, 104 | our_alignment_enum ALIGNMENT, 105 | 
our_money money, 106 | invalid_bigserial BIGSERIAL, 107 | invalid_bit_varying BIT VARYING, 108 | invalid_box BOX, 109 | invalid_bytea BYTEA, 110 | invalid_circle CIRCLE, 111 | invalid_interval INTERVAL, 112 | invalid_line LINE, 113 | invalid_lseg LSEG, 114 | invalid_path PATH, 115 | invalid_pg_lsn PG_LSN, 116 | invalid_point POINT, 117 | invalid_polygon POLYGON, 118 | invalid_serial SERIAL, 119 | invalid_smallserial SMALLSERIAL, 120 | invalid_tsquery TSQUERY, 121 | invalid_tsvector TSVECTOR, 122 | invalid_txid_snapshot TXID_SNAPSHOT, 123 | invalid_xml XML) 124 | """.format(canonicalized_table_name) 125 | 126 | cur = db_utils.ensure_fresh_table(conn, cur, test_schema_name, test_table_name) 127 | cur.execute(create_table_sql) 128 | 129 | #insert fixture data 1 130 | our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184) 131 | nyc_tz = pytz.timezone('America/New_York') 132 | our_ts_tz = nyc_tz.localize(our_ts) 133 | our_time = datetime.time(12,11,10) 134 | our_time_tz = our_time.isoformat() + "-04:00" 135 | our_date = datetime.date(1998, 3, 4) 136 | my_uuid = str(uuid.uuid1()) 137 | 138 | self.recs = [] 139 | for _ in range(500): 140 | our_ts = datetime.datetime(1987, 3, 3, 3, 3, 3, 733184) 141 | nyc_tz = pytz.timezone('America/New_York') 142 | our_ts_tz = nyc_tz.localize(our_ts) 143 | our_time = datetime.time(10,9,8) 144 | our_time_tz = our_time.isoformat() + "-04:00" 145 | our_date = datetime.date(1964, 7, 1) 146 | my_uuid = str(uuid.uuid1()) 147 | 148 | record = {'our_varchar' : "our_varchar 4", 149 | 'our_varchar_10' : "varchar_10", 150 | 'our_text' : "some text 2", 151 | 'our_text_2' : "NOT SELECTED", 152 | 'our_integer' : 44101, 153 | 'our_smallint' : 2, 154 | 'our_bigint' : 1000001, 155 | 'our_decimal' : decimal.Decimal('9876543210.02'), 156 | quote_ident('OUR TS', cur) : our_ts, 157 | quote_ident('OUR TS TZ', cur) : our_ts_tz, 158 | quote_ident('OUR TIME', cur) : our_time, 159 | quote_ident('OUR TIME TZ', cur) : our_time_tz, 160 | quote_ident('OUR DATE', cur) : our_date, 161 | 'our_double' : 1.1, 162 | 'our_real' : 1.2, 163 | 'our_boolean' : True, 164 | 'our_bit' : '1', 165 | 'our_json' : json.dumps({'nymn' : 77}), 166 | 'our_jsonb' : json.dumps({'burgers' : 'good++'}), 167 | 'our_uuid' : my_uuid, 168 | 'our_store' : 'dances=>"floor",name=>"betty"', 169 | 'our_citext': 'maGICKal 2', 170 | 'our_cidr' : '192.168.101.128/25', 171 | 'our_inet': '192.168.101.128/24', 172 | 'our_mac' : '08:00:2b:01:02:04', 173 | } 174 | 175 | db_utils.insert_record(cur, test_table_name, record) 176 | self.recs.append(record) 177 | 178 | cur.execute("""ANALYZE {}""".format(canonicalized_table_name)) 179 | 180 | @staticmethod 181 | def expected_check_streams(): 182 | return { 'postgres_discovery_test'} 183 | 184 | def expected_check_stream_ids(self): 185 | """A set of expected table names in format""" 186 | check_streams = self.expected_check_streams() 187 | return {"{}-{}-{}".format(test_db, test_schema_name, stream) for stream in check_streams} 188 | 189 | @staticmethod 190 | def expected_primary_keys(): 191 | return { 192 | 'postgres_discovery_test' : {'id'} 193 | } 194 | 195 | @staticmethod 196 | def expected_unsupported_fields(): 197 | return { 198 | 'invalid_bigserial', 199 | 'invalid_bit_varying', 200 | 'invalid_box', 201 | 'invalid_bytea', 202 | 'invalid_circle', 203 | 'invalid_interval', 204 | 'invalid_line', 205 | 'invalid_lseg', 206 | 'invalid_path', 207 | 'invalid_pg_lsn', 208 | 'invalid_point', 209 | 'invalid_polygon', 210 | 'invalid_serial', 211 | 'invalid_smallserial', 212 | 'invalid_tsquery', 
213 |             'invalid_tsvector',
214 |             'invalid_txid_snapshot',
215 |             'invalid_xml',
216 |         }
217 |     @staticmethod
218 |     def expected_schema_types():
219 |         return {
220 |             'id': 'integer', # 'serial primary key',
221 |             'our_varchar': 'character varying', # 'varchar'
222 |             'our_varchar_10': 'character varying', # 'varchar(10)',
223 |             'our_text': 'text',
224 |             'our_text_2': 'text',
225 |             'our_integer': 'integer',
226 |             'our_smallint': 'smallint',
227 |             'our_bigint': 'bigint',
228 |             'our_decimal': 'numeric',
229 |             'OUR TS': 'timestamp without time zone',
230 |             'OUR TS TZ': 'timestamp with time zone',
231 |             'OUR TIME': 'time without time zone',
232 |             'OUR TIME TZ': 'time with time zone',
233 |             'OUR DATE': 'date',
234 |             'our_double': 'double precision',
235 |             'our_real': 'real',
236 |             'our_boolean': 'boolean',
237 |             'our_bit': 'bit',
238 |             'our_json': 'json',
239 |             'our_jsonb': 'jsonb',
240 |             'our_uuid': 'uuid',
241 |             'our_store': 'hstore',
242 |             'our_citext': 'citext',
243 |             'our_cidr': 'cidr',
244 |             'our_inet': 'inet',
245 |             'our_mac': 'macaddr',
246 |             'our_alignment_enum': 'alignment',
247 |             'our_money': 'money',
248 |             'invalid_bigserial': 'bigint',
249 |             'invalid_bit_varying': 'bit varying',
250 |             'invalid_box': 'box',
251 |             'invalid_bytea': 'bytea',
252 |             'invalid_circle': 'circle',
253 |             'invalid_interval': 'interval',
254 |             'invalid_line': 'line',
255 |             'invalid_lseg': 'lseg',
256 |             'invalid_path': 'path',
257 |             'invalid_pg_lsn': 'pg_lsn',
258 |             'invalid_point': 'point',
259 |             'invalid_polygon': 'polygon',
260 |             'invalid_serial': 'integer',
261 |             'invalid_smallserial': 'smallint',
262 |             'invalid_tsquery': 'tsquery',
263 |             'invalid_tsvector': 'tsvector',
264 |             'invalid_txid_snapshot': 'txid_snapshot',
265 |             'invalid_xml': 'xml',
266 |         }
267 | 
268 |     @staticmethod
269 |     def tap_name():
270 |         return "tap-postgres"
271 | 
272 |     @staticmethod
273 |     def name():
274 |         return "tap_tester_postgres_discovery"
275 | 
276 |     @staticmethod
277 |     def get_type():
278 |         return "platform.postgres"
279 | 
280 |     @staticmethod
281 |     def get_credentials():
282 |         return {'password': os.getenv('TAP_POSTGRES_PASSWORD')}
283 | 
284 |     def get_properties(self, original_properties=True):
285 |         return_value = {
286 |             'host' : os.getenv('TAP_POSTGRES_HOST'),
287 |             'dbname' : os.getenv('TAP_POSTGRES_DBNAME'),
288 |             'port' : os.getenv('TAP_POSTGRES_PORT'),
289 |             'user' : os.getenv('TAP_POSTGRES_USER'),
290 |             'default_replication_method' : self.FULL_TABLE,
291 |             'filter_dbs' : 'discovery1'
292 |         }
293 |         if not original_properties:
294 |             if self.default_replication_method == self.LOG_BASED:
295 |                 return_value['wal2json_message_format'] = '1'
296 | 
297 |             return_value['default_replication_method'] = self.default_replication_method
298 | 
299 |         return return_value
300 | 
301 |     def test_run(self):
302 |         """Parametrized discovery test running against each replication method."""
303 | 
304 |         self.default_replication_method = self.FULL_TABLE
305 |         full_table_conn_id = connections.ensure_connection(self, original_properties=False)
306 |         self.discovery_test(full_table_conn_id)
307 | 
308 |         self.default_replication_method = self.INCREMENTAL
309 |         incremental_conn_id = connections.ensure_connection(self, original_properties=False)
310 |         self.discovery_test(incremental_conn_id)
311 | 
312 |         # NB | We are able to generate a connection and run discovery with a default replication
313 |         #      method of logical replication WITHOUT selecting a replication slot. This is not
314 |         #      ideal behavior. This BUG should not be carried over into hp-postgres, but will not
315 |         #      be fixed for this tap.
316 |         self.default_replication_method = self.LOG_BASED
317 |         log_based_conn_id = connections.ensure_connection(self, original_properties=False)
318 |         self.discovery_test(log_based_conn_id)
319 | 
320 |     def discovery_test(self, conn_id):
321 |         """
322 |         Basic Discovery Test for a database tap.
323 | 
324 |         Test Description:
325 |           Ensure discovery completes with clean exit codes and generates a catalog of the expected form.
326 | 
327 |         Test Cases:
328 |         - Verify discovery generated the expected catalogs by name.
329 |         - Verify that the tap_stream_id is in the expected <database>-<schema>-<table> format for each stream.
330 |         - Verify the catalog is found for a given stream.
331 |         - Verify there is only 1 top level breadcrumb in metadata for a given stream.
332 |         - Verify replication key(s) match expectations for a given stream.
333 |         - Verify primary key(s) match expectations for a given stream.
334 |         - Verify the replication method matches our expectations for a given stream.
335 |         - Verify that only primary keys are given the inclusion of automatic in metadata
336 |           for a given stream.
337 |         - Verify expected unsupported fields are given the inclusion of unsupported in
338 |           metadata for a given stream.
339 |         - Verify that all fields for a given stream which are not unsupported or automatic
340 |           have inclusion of available.
341 |         - Verify row-count metadata matches expectations for a given stream.
342 |         - Verify selected metadata is None for all streams.
343 |         - Verify is-view metadata is False for a given stream.
344 |         - Verify no forced-replication-method is present in metadata for a given stream.
345 |         - Verify schema and db match expectations for a given stream.
346 |         - Verify schema types match expectations for a given stream.
347 |         """
348 |         ##########################################################################
349 |         ### TODO
350 |         ### [] Generate multiple tables (streams) and maybe dbs too?
351 |         ### [] Investigate potential bug, see DOCS_BUG_1
352 |         ##########################################################################
353 | 
354 |         # run discovery (check mode)
355 |         check_job_name = runner.run_check_mode(self, conn_id)
356 | 
357 |         # Verify check exit codes
358 |         exit_status = menagerie.get_exit_status(conn_id, check_job_name)
359 |         menagerie.verify_check_exit_status(self, exit_status, check_job_name)
360 | 
361 |         # Verify discovery generated a catalog
362 |         found_catalogs = menagerie.get_catalogs(conn_id)
363 |         self.assertGreater(len(found_catalogs), 0)
364 | 
365 |         # Verify discovery generated the expected catalogs by name
366 |         found_catalog_names = {catalog['stream_name'] for catalog in found_catalogs}
367 |         self.assertSetEqual(self.expected_check_streams(), found_catalog_names)
368 | 
369 |         # Verify that the tap_stream_id is in the expected <database>-<schema>-<table> format for each stream
370 |         found_catalog_stream_ids = {catalog['tap_stream_id'] for catalog in found_catalogs}
371 |         self.assertSetEqual(self.expected_check_stream_ids(), found_catalog_stream_ids)
372 | 
373 |         # Test by stream
374 |         for stream in self.expected_check_streams():
375 |             with self.subTest(stream=stream):
376 | 
377 |                 # Verify the catalog is found for a given stream
378 |                 catalog = next(iter([catalog for catalog in found_catalogs
379 |                                      if catalog["stream_name"] == stream]))
380 |                 self.assertTrue(isinstance(catalog, dict))
381 | 
382 |                 # collecting expected values
383 |                 expected_primary_keys = self.expected_primary_keys()[stream]
384 |                 expected_replication_keys = set()
385 |                 expected_unsupported_fields = self.expected_unsupported_fields()
386 |                 expected_fields_to_datatypes = self.expected_schema_types()
387 |                 expected_row_count = len(self.recs)
388 | 
389 |                 # collecting actual values...
390 |                 schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
391 |                 stream_metadata = schema_and_metadata["metadata"]
392 |                 top_level_metadata = [item for item in stream_metadata if item.get("breadcrumb") == []]
393 |                 stream_properties = top_level_metadata[0]['metadata']
394 |                 actual_primary_keys = set(stream_properties.get(self.PRIMARY_KEYS, []))
395 |                 actual_replication_keys = set(stream_properties.get(self.REPLICATION_KEYS, []))
396 |                 actual_replication_method = stream_properties.get(self.REPLICATION_METHOD)
397 |                 actual_automatic_fields = set(
398 |                     item.get("breadcrumb", ["properties", None])[1] for item in stream_metadata
399 |                     if item.get("metadata").get("inclusion") == "automatic"
400 |                 )
401 |                 actual_unsupported_fields = set(
402 |                     item.get("breadcrumb", ["properties", None])[1] for item in stream_metadata
403 |                     if item.get("metadata").get("inclusion") == "unsupported"
404 |                 )
405 |                 actual_fields_to_datatypes = {
406 |                     item['breadcrumb'][1]: item['metadata'].get('sql-datatype')
407 |                     for item in stream_metadata if item['breadcrumb'] != []
408 |                 }
409 | 
410 |                 # Verify there is only 1 top level breadcrumb in metadata
411 |                 self.assertEqual(1, len(top_level_metadata))
412 | 
413 |                 # Verify replication key(s) match expectations
414 |                 self.assertSetEqual(
415 |                     expected_replication_keys, actual_replication_keys
416 |                 )
417 | 
418 |                 # NB | We expect primary keys and replication keys to have inclusion automatic for
419 |                 #      key-based incremental replication. But that is only true for primary keys here.
420 |                 #      This BUG should not be carried over into hp-postgres, but will not be fixed for this tap.
421 | 422 | # Verify primary key(s) match expectations 423 | self.assertSetEqual( 424 | expected_primary_keys, actual_primary_keys, 425 | ) 426 | 427 | # Verify the replication method matches our expectations 428 | self.assertIsNone(actual_replication_method) 429 | 430 | # Verify that only primary keys 431 | # are given the inclusion of automatic in metadata. 432 | self.assertSetEqual(expected_primary_keys, actual_automatic_fields) 433 | 434 | 435 | # DOCS_BUG_1 ? | The following types were converted and selected, but docs say unsupported. 436 | # Still need to investigate how the tap handles values of these datatypes 437 | # during sync. 438 | KNOWN_MISSING = { 439 | 'invalid_bigserial', # BIGSERIAL -> bigint 440 | 'invalid_serial', # SERIAL -> integer 441 | 'invalid_smallserial', # SMALLSERIAL -> smallint 442 | } 443 | # Verify expected unsupported fields 444 | # are given the inclusion of unsupported in metadata. 445 | self.assertSetEqual(expected_unsupported_fields, actual_unsupported_fields | KNOWN_MISSING) 446 | 447 | 448 | # Verify that all other fields have inclusion of available 449 | # This assumes there are no unsupported fields for SaaS sources 450 | self.assertTrue( 451 | all({item.get("metadata").get("inclusion") == "available" 452 | for item in stream_metadata 453 | if item.get("breadcrumb", []) != [] 454 | and item.get("breadcrumb", ["properties", None])[1] 455 | not in actual_automatic_fields 456 | and item.get("breadcrumb", ["properties", None])[1] 457 | not in actual_unsupported_fields}), 458 | msg="Not all non key properties are set to available in metadata") 459 | 460 | # Verify row-count metadata matches expectations 461 | self.assertEqual(expected_row_count, stream_properties['row-count']) 462 | 463 | # Verify selected metadata is None for all streams 464 | self.assertNotIn('selected', stream_properties.keys()) 465 | 466 | # Verify is-view metadata is False 467 | self.assertFalse(stream_properties['is-view']) 468 | 469 | # Verify no forced-replication-method is present in metadata 470 | self.assertNotIn(self.REPLICATION_METHOD, stream_properties.keys()) 471 | 472 | # Verify schema and db match expectations 473 | self.assertEqual(test_schema_name, stream_properties['schema-name']) 474 | self.assertEqual(test_db, stream_properties['database-name']) 475 | 476 | # Verify schema types match expectations 477 | self.assertDictEqual(expected_fields_to_datatypes, actual_fields_to_datatypes) 478 | --------------------------------------------------------------------------------
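
A note on the two wal2json message shapes handled by `consume_message_format_1` and `consume_message_format_2` in `tap_postgres/sync_strategies/logical_replication.py`: format-version 1 delivers one JSON document per transaction with all changes in a `change` array, while format-version 2 (requested via the `format-version: 2` and `include-timestamp` options passed to `start_replication` in `sync_tables`) delivers one JSON document per change keyed by a single-letter `action`. The payloads below are an illustrative sketch inferred from the consumer code above, not captured server output; the exact keys depend on the wal2json version and the options used, and the table/column names are borrowed from the discovery test fixture purely for illustration.

```
# Illustrative wal2json payloads (shapes inferred from the consumer functions
# above; they may differ slightly depending on wal2json version and options).

# format-version 1: one message per transaction, all changes in a 'change' array.
format_1_payload = {
    "change": [
        {
            "kind": "insert",                      # 'insert' | 'update' | 'delete'
            "schema": "public",
            "table": "postgres_discovery_test",
            "columnnames": ["id", "our_varchar"],
            "columnvalues": [1, "our_varchar 4"],
            # deletes carry the old key columns instead of column values:
            # "oldkeys": {"keynames": ["id"], "keyvalues": [1]},
        }
    ]
}

# format-version 2: one message per change, keyed by a single-letter 'action'.
format_2_payload = {
    "action": "U",                                 # 'I' | 'U' | 'D' (begin/commit markers may also appear)
    "schema": "public",
    "table": "postgres_discovery_test",
    "timestamp": "2018-01-01 00:00:00.000000+00",  # present when include-timestamp is set
    "columns": [{"name": "id", "value": 1},
                {"name": "our_varchar", "value": "our_varchar 4"}],
    # deletes expose the replica identity columns instead of 'columns':
    # "identity": [{"name": "id", "value": 1}],
}
```

Note how the format 2 consumer reads `columns` for inserts/updates and `identity` for deletes, mirroring the `columnnames`/`oldkeys` split handled by the format 1 consumer.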