├── tests ├── __init__.py ├── test_postgres_drop_table_field_selection.py ├── unittests │ ├── test_unsupported_pk.py │ ├── test_clear_state_on_replication_change.py │ ├── utils.py │ └── test_full_table_interruption.py ├── db_utils.py ├── test_postgres_views_logical_replication.py ├── test_postgres_views_full_table.py ├── test_postgres_views_incremental_replication.py ├── test_postgres_logical_replication_multiple_tables.py ├── test_postgres_logical_replication_multiple_dbs.py ├── test_postgres_full_table_replication_arrays.py └── test_postgres_discovery.py ├── tap_postgres ├── sync_strategies │ ├── __init__.py │ ├── common.py │ ├── incremental.py │ ├── full_table.py │ └── logical_replication.py └── db.py ├── Makefile ├── .circleci ├── docker-entrypoint-initdb.d │ └── init-permissions.sh ├── postgresql.conf ├── Dockerfile ├── server.key ├── config.yml └── server.crt ├── README.md ├── .github └── pull_request_template.md ├── setup.py ├── CHANGELOG.md ├── .gitignore └── bin └── test-db /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | nosetests -v tests/unittests 3 | -------------------------------------------------------------------------------- /.circleci/docker-entrypoint-initdb.d/init-permissions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | { echo "host replication $POSTGRES_USER 0.0.0.0/0 trust"; } >> "$PGDATA/pg_hba.conf" 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-postgres 2 | 3 | ## Set `rds.logical_replication = 1` in the parameter group (requires a reboot) 4 | 5 | This should also set `max_wal_senders` and `max_replication_slots` to values greater than 0. 6 | 7 | Singer tap for PostgreSQL supporting Full Table & Logical Replication 8 | using the wal2json decoder plugin. 
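The tap is configured with a standard Singer `config.json`. A minimal sketch of the connection settings read by `tap_postgres/db.py` (values are placeholders; `sslmode` and `filter_dbs` are optional):

```
{
  "host": "localhost",
  "port": "5432",
  "user": "postgres",
  "password": "secret",
  "dbname": "postgres"
}
```

A logical replication slot for the tap can then be created with: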
9 | 10 | ``` 11 | SELECT * FROM pg_create_logical_replication_slot('stitch', 'wal2json'); 12 | ``` 13 | 14 | --- 15 | 16 | Copyright © 2018 Stitch 17 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description of change 2 | (write a short description here or paste a link to JIRA) 3 | 4 | # QA steps 5 | - [ ] automated tests passing 6 | - [ ] manual qa steps passing (list below) 7 | 8 | # Risks 9 | 10 | # Rollback steps 11 | - revert this branch 12 | 13 | #### AI generated code 14 | https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code 15 | - [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool 16 | -------------------------------------------------------------------------------- /.circleci/postgresql.conf: -------------------------------------------------------------------------------- 1 | # LOGGING 2 | log_min_error_statement = fatal 3 | 4 | # CONNECTION 5 | listen_addresses = '*' 6 | 7 | # MODULES 8 | #shared_preload_libraries = 'decoderbufs' 9 | 10 | # REPLICATION 11 | wal_level = logical # minimal, archive, hot_standby, or logical (change requires restart) 12 | max_wal_senders = 5 # max number of walsender processes (change requires restart) 13 | #wal_keep_segments = 4 # in logfile segments, 16MB each; 0 disables 14 | #wal_sender_timeout = 60s # in milliseconds; 0 disables 15 | max_replication_slots = 5 # max number of replication slots (change requires restart) 16 | 17 | # SSL 18 | ssl = on 19 | ssl_cert_file = '/var/lib/postgresql/server.crt' 20 | ssl_key_file = '/var/lib/postgresql/server.key' 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup 4 | 5 | setup(name='tap-postgres', 6 | version='0.2.1', 7 | description='Singer.io tap for extracting data from PostgreSQL', 8 | author='Stitch', 9 | url='https://singer.io', 10 | classifiers=['Programming Language :: Python :: 3 :: Only'], 11 | install_requires=[ 12 | 'singer-python==5.3.1', 13 | 'psycopg2==2.7.4', 14 | 'strict-rfc3339==0.7', 15 | ], 16 | extras_require={ 17 | 'dev': [ 18 | 'ipdb', 19 | 'pylint==2.6.0', 20 | 'nose==1.3.7', 21 | ] 22 | }, 23 | entry_points=''' 24 | [console_scripts] 25 | tap-postgres=tap_postgres:main 26 | ''', 27 | packages=['tap_postgres', 'tap_postgres.sync_strategies'] 28 | ) 29 | -------------------------------------------------------------------------------- /.circleci/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:9.6 2 | 3 | # Git SHA of v2.2 4 | ENV WAL2JSON_COMMIT_ID=9f9762315062888f7f7f4f0a115073a33ad1275e 5 | 6 | # Compile the plugins from sources and install 7 | RUN apt-get update && apt-get install -y postgresql-server-dev-9.6 gcc git make pkgconf \ 8 | && git clone https://github.com/eulerto/wal2json -b master --single-branch \ 9 | && (cd /wal2json && git checkout $WAL2JSON_COMMIT_ID && make && make install) \ 10 | && rm -rf wal2json 11 | 12 | # Copy the custom configuration which will be passed down to the server 13 | COPY postgresql.conf /usr/local/share/postgresql/postgresql.conf 14 | 15 | # Copy the script which will initialize the replication permissions 16 | COPY /docker-entrypoint-initdb.d 
/docker-entrypoint-initdb.d 17 | 18 | # Copy the self-signed cert for general SSL testing 19 | # Must be owned by postgres:postgres according to https://www.postgresql.org/docs/9.6/ssl-tcp.html 20 | # NOTE: ONLY TO BE USED FOR TESTING, this is a publicly published keypair 21 | COPY server.key server.crt /var/lib/postgresql/ 22 | RUN chown postgres:postgres /var/lib/postgresql/server.* 23 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.0 4 | * Add support to discover partitioned tables [101](https://github.com/singer-io/tap-postgres/pull/101) 5 | 6 | ## 0.1.0 7 | * Add support for `wal2json` message format v2 via config parameter [91](https://github.com/singer-io/tap-postgres/pull/91) 8 | 9 | ## 0.0.70 10 | * Look up ssl status in `pg_stat_ssl` and `pg_stat_activity` tables [#84](https://github.com/singer-io/tap-postgres/pull/84) 11 | 12 | ## 0.0.69 13 | * Add `sslmode` log message when opening connection [#82](https://github.com/singer-io/tap-postgres/pull/82) 14 | 15 | ## 0.0.68 16 | * Respect `ssl` config property (bug fix) [#80](https://github.com/singer-io/tap-postgres/pull/80) 17 | 18 | ## 0.0.67 19 | * Make `bytea[]` fields have `"inclusion" : "unsupported"` metadata [#76](https://github.com/singer-io/tap-postgres/pull/76) 20 | 21 | ## 0.0.66 22 | * Fix sorting for full_table sync by xmin to use integer sorting rather than string sorting [#73](https://github.com/singer-io/tap-postgres/pull/73) 23 | 24 | ## 0.0.65 25 | * Add support for `int8[]` (`bigint[]`) array types to log-based replication [#69](https://github.com/singer-io/tap-postgres/pull/69) 26 | 27 | ## 0.0.64 28 | * Pass string to `decimal.Decimal` when handling numeric data type [#67](https://github.com/singer-io/tap-postgres/pull/67) 29 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import simplejson as json 3 | import singer 4 | from singer import metadata 5 | import tap_postgres.db as post_db 6 | 7 | def should_sync_column(md_map, field_name): 8 | field_metadata = md_map.get(('properties', field_name), {}) 9 | return singer.should_sync_field(field_metadata.get('inclusion'), 10 | field_metadata.get('selected'), 11 | True) 12 | 13 | def write_schema_message(schema_message): 14 | sys.stdout.write(json.dumps(schema_message, use_decimal=True) + '\n') 15 | sys.stdout.flush() 16 | 17 | def send_schema_message(stream, bookmark_properties): 18 | s_md = metadata.to_map(stream['metadata']) 19 | if s_md.get((), {}).get('is-view'): 20 | key_properties = s_md.get((), {}).get('view-key-properties', []) 21 | else: 22 | key_properties = s_md.get((), {}).get('table-key-properties', []) 23 | 24 | schema_message = {'type' : 'SCHEMA', 25 | 'stream' : post_db.calculate_destination_stream_name(stream, s_md), 26 | 'schema' : stream['schema'], 27 | 'key_properties' : key_properties, 28 | 'bookmark_properties': bookmark_properties} 29 | 30 | write_schema_message(schema_message) 31 | -------------------------------------------------------------------------------- /.circleci/server.key: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDBZuuEK7B9mbWw 3 | 
Fny3uoAE+Pp26PfCPEEhOh0QpA6KPMQs06Dv6Tx94TcpNIeSnSxp3dGA3WVuOyev 4 | SsrGhyr6DadaElzlXkB3+fpj/i06UMiG5iqyO/w+KySz5q5hWyws9PNZlkC4nW7x 5 | V6l+KNMqS3HbFogpbreLDmrCVVvETE3t5Eq+NtmPms91eRIAAX/W02Mg5Hi5be+t 6 | I+zvTSBwbE28JHxZwqvMd7SZSax7dHiXhN+5cG/jyXFiVM5jw2LNqD+umejIdrN6 7 | iGYuXsm2nlf45+p8mLEDbjaqiWaPONUJ5/e4MoQTMq557xjV5WrNiWMm4MnPrdvl 8 | ffFhr4GpAgMBAAECggEABOJLekf8Kf/StcKrnZwpFXcQJCaX8yDAaE1mZIAwGc+V 9 | CKjDfKuAKpGgafr4nXw4nefLHZz5rcHyq5uQ6ViKfkwP+NdT3zr1F9KJPzMxAzL9 10 | DWMMmvmm0g8W2zAtISpDQFNjPdTsh3z6Sz/yeMwhIQVMt8Km55zzJ6DSk1vCeH+F 11 | gW555Hez0qL/GKLJX4pRU45getXnqt/oQOnMEpe2Ar21GJO8JYWNC954SWacE90S 12 | 7p4Y/Y4BmlCvaF0Kr8qxs2jDQmWKHYMuuxnMsKzz4u3f6wdRCEPt++Z6jMnHmjA6 13 | Mp7i8Zm5cWKehbS4hLRa/uA1JbFZXdHwJsaHeJIv3QKBgQD1cmGUYxp/GXSFjH2A 14 | 1c03FiTcifp0ui8AjFBNb5pB7aHcsLK2EZRsV8PpgvpzpaDd9iYSmxxA7Xq9gBaV 15 | daRikRVvf12FvLkiZmmpulG3DyzvpP+CEGur102+gNjQXyBrtpJ9hG8rC7PHTCoG 16 | ZIraVocQ6Ft/T8bRMEE59gctewKBgQDJt69dk6H0UYSIJkFT3CMikMamk3ObAGT3 17 | WquP+GJ+2NaIM/9aHnaNMkN4MnpN9B73VUME26k8D3nm6+smZNvK2uWeqU/MONye 18 | zTF8L7yNBsO47rWCAoiNyJgfzDXc4gHEnKP4CeU3cjebzEg1Vdb5xKDeF8XwcrUV 19 | bOUgvnc6KwKBgQC8UHfBi4/GuFcIJ9Qaxu7eNuUtN8erSzXIq97oqpmlv5aSZheX 20 | TUGdJnEvdciGditIYRSw7cTto8aqId4x6cKnxTy3APdWJoe8durWyBbt5nzJmMRY 21 | nBSgEV6arOysYm/TdI5MMxG/6wiR/kO4B+fowL58IGoi8ahO00EYIUU8hQKBgDac 22 | i1bLVGp/82Ck8sTQcZa3GYEZpI/PYIZzPsWAmrH65MIFSdnNK414kTmmeORH9mZB 23 | 6B4VllDTY854CrbfUX4vG0GEVz1UG67GoOIdTm/j5/NWdT+Yjf3M1Bqvv9loOtBP 24 | FDlf/HWxb4q3mMkPz17ZtC/MweMiOxJs4++kgUT3AoGBAJDNpcpbaANd8WDGnb+o 25 | xHgl7lO8c897HEyF7Ea9aI4d+NK/NThOJPANHSBovH9AulFipVlTQs6FTMNxI19d 26 | lGiFNwUbuVNDQucnPu5Goc0VFjI9Rwn9GcwH2vsJ9emxKlsl9VDoTl5HVgItYZK5 27 | VcTFh/izUO6ONHyrlkC7+6Pe 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Emacs 104 | .tramp_history 105 | 106 | env-vars.txt 107 | tap_oracle/__pycache__/ 108 | *~ 109 | config.json 110 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | slack: circleci/slack@3.4.2 4 | 5 | jobs: 6 | build: 7 | docker: 8 | - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester-18.04 9 | - image: singerio/postgres:9.6-wal2json-2.2-ssl 10 | environment: 11 | POSTGRES_USER: postgres 12 | POSTGRES_PASSWORD: password 13 | command: [postgres, -c, config_file=/usr/local/share/postgresql/postgresql.conf] 14 | steps: 15 | - checkout 16 | - run: 17 | name: 'Setup virtual env' 18 | command: | 19 | aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh 20 | source dev_env.sh 21 | export LC_ALL=C 22 | apt-get -qq update 23 | apt-get -qq install bundler libpq-dev 24 | pyenv local 3.5.6 25 | python3 -m venv /usr/local/share/virtualenvs/tap-postgres 26 | source /usr/local/share/virtualenvs/tap-postgres/bin/activate 27 | pip install -U 'pip<19.2' 'setuptools<51.0.0' 28 | pip install .[dev] 29 | source dev_env.sh 30 | make test 31 | pylint tap_postgres -d missing-docstring,invalid-name,line-too-long,too-many-locals,too-few-public-methods,fixme,stop-iteration-return,duplicate-code,useless-import-alias,bare-except,raise-missing-from 32 | - run: 33 | when: always 34 | name: 'Integration Tests' 35 | command: | 36 | source dev_env.sh 37 | source /usr/local/share/virtualenvs/tap-tester/bin/activate 38 | apt-get -qq update 39 | apt-get -qq install bundler libpq-dev 40 | pip install psycopg2==2.8.4 41 | run-test --tap=tap-postgres tests 42 | - slack/notify-on-failure: 43 | only_for_branches: master 44 | 45 | workflows: 46 | version: 2 47 | commit: &commit_jobs 48 | jobs: 49 | - build: 50 | context: 51 | - circleci-user 52 | - tap-tester-user 53 | build_daily: 54 | <<: *commit_jobs 55 | triggers: 56 | - schedule: 57 | cron: "0 1 * * *" 58 | filters: 59 | branches: 60 | only: 61 | - master 62 | -------------------------------------------------------------------------------- /bin/test-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | import subprocess 6 | import time 7 | from argparse 
import RawTextHelpFormatter 8 | 9 | full_image_name = "singerio/postgres:9.6-wal2json-2.2-ssl" 10 | 11 | def start_container(name): 12 | START_COMMAND = """ 13 | sudo docker run -e "POSTGRES_USER={0}" -e "POSTGRES_PASSWORD={1}" -p {2}:5432 --name {3} -d {4} \ 14 | postgres -c config_file=/usr/local/share/postgresql/postgresql.conf 15 | """.format(os.getenv('TAP_POSTGRES_USER'), 16 | os.getenv('TAP_POSTGRES_PASSWORD'), 17 | os.getenv('TAP_POSTGRES_PORT'), 18 | name, 19 | full_image_name) 20 | 21 | print("Starting Docker process {} using command: {}".format(name, START_COMMAND)) 22 | 23 | proc = subprocess.run(START_COMMAND, shell=True) 24 | if proc.returncode != 0: 25 | sys.exit("Exited with code: {}, the docker process failed to start.".format(proc.returncode)) 26 | print("Process started successfully.") 27 | 28 | def get_ip_addr(name): 29 | IP_ADDR_COMMAND = "docker inspect {} | jq -r .[].NetworkSettings.IPAddress" 30 | print("Retrieving IP addr of postgres container") 31 | ip_addr = subprocess.check_output(IP_ADDR_COMMAND.format(name), shell=True).decode('utf-8').rstrip() 32 | print(ip_addr) 33 | return ip_addr 34 | 35 | 36 | def stop_container(name): 37 | STOP_COMMAND = "sudo docker stop {0} && sudo docker rm {0}" 38 | 39 | print("Stopping Docker process {}".format(name)) 40 | proc = subprocess.run(STOP_COMMAND.format(name), shell=True) 41 | if proc.returncode != 0: 42 | sys.exit("Exited with code: {}, the docker process failed to stop.".format(proc.returncode)) 43 | print("Process stopped successfully") 44 | 45 | def connect_to_db(name): 46 | CONNECT_COMMAND = 'docker run -it --rm -e "PGPASSWORD={}" {} psql --host {} -U {}' 47 | 48 | ip_addr = get_ip_addr(name) 49 | 50 | print("Attempting to connect to running container using a postgres container via psql") 51 | connect_command_format = CONNECT_COMMAND.format(os.getenv('TAP_POSTGRES_PASSWORD'), 52 | full_image_name, 53 | ip_addr, 54 | os.getenv('TAP_POSTGRES_USER')) 55 | print(connect_command_format) 56 | # NB: Using call instead of run here because it is blocking 57 | # This returns only an exit code. 58 | returncode = subprocess.call(connect_command_format, 59 | shell=True) 60 | if returncode != 0: 61 | sys.exit("Exited with code: {}, could not connect.".format(returncode)) 62 | 63 | DESCRIPTION = """ 64 | Manage docker instance for tap-postgres testing. 
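Starts the singerio/postgres:9.6-wal2json-2.2-ssl image; in addition to the variables below, TAP_POSTGRES_PORT is read to choose the published port.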
65 | 66 | Uses environment variables: 67 | TAP_POSTGRES_USER 68 | TAP_POSTGRES_PASSWORD 69 | """ 70 | parser = argparse.ArgumentParser(description=DESCRIPTION, formatter_class=RawTextHelpFormatter) 71 | parser.add_argument('action', choices=['start','stop', 'connect'], help='action to perform with the container') 72 | parser.add_argument('--name', help="name assigned to running docker process", default='postgres1') 73 | 74 | def main(): 75 | parsed_args = parser.parse_args() 76 | # Potential arguments to add: pull, changing docker cointainer, changing password 77 | if parsed_args.action == 'start': 78 | start_container(parsed_args.name) 79 | elif parsed_args.action == 'stop': 80 | stop_container(parsed_args.name) 81 | elif parsed_args.action == 'connect': 82 | connect_to_db(parsed_args.name) 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /tests/test_postgres_drop_table_field_selection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | test_schema_name = "public" 13 | test_table_name = "postgres_drop_table_test" 14 | 15 | def canonicalized_table_name(schema, table, cur): 16 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 17 | 18 | class PostgresDropTable(unittest.TestCase): 19 | 20 | @staticmethod 21 | def name(): 22 | return "tap_tester_postgres_drop_table_field_selection" 23 | 24 | @staticmethod 25 | def get_properties(): 26 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 27 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 28 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 29 | 'user' : os.getenv('TAP_POSTGRES_USER'), 30 | 'default_replication_method' : 'LOG_BASED', 31 | 'filter_dbs' : 'discovery0' 32 | } 33 | 34 | @staticmethod 35 | def get_credentials(): 36 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 37 | 38 | @staticmethod 39 | def get_type(): 40 | return "platform.postgres" 41 | 42 | @staticmethod 43 | def tap_name(): 44 | return "tap-postgres" 45 | 46 | @staticmethod 47 | def expected_check_streams(): 48 | return { 'discovery0-public-postgres_drop_table_test'} 49 | 50 | 51 | def setUp(self): 52 | db_utils.ensure_environment_variables_set() 53 | 54 | db_utils.ensure_db('discovery0') 55 | 56 | with db_utils.get_test_connection('discovery0') as conn: 57 | conn.autocommit = True 58 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 59 | old_table = cur.execute("""SELECT EXISTS ( 60 | SELECT 1 61 | FROM information_schema.tables 62 | WHERE table_schema = %s 63 | AND table_name = %s);""", 64 | [test_schema_name, test_table_name]) 65 | old_table = cur.fetchone()[0] 66 | if old_table: 67 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 68 | 69 | 70 | cur = conn.cursor() 71 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 72 | if cur.fetchone()[0] is None: 73 | cur.execute(""" CREATE EXTENSION hstore; """) 74 | 75 | #pylint: disable=line-too-long 76 | create_table_sql = 'CREATE TABLE {} (id SERIAL PRIMARY KEY)'.format(canonicalized_table_name(test_schema_name, test_table_name, cur)) 77 | 78 | 
cur.execute(create_table_sql) 79 | 80 | def test_run(self): 81 | conn_id = connections.ensure_connection(self) 82 | 83 | # Run discovery 84 | check_job_name = runner.run_check_mode(self, conn_id) 85 | 86 | # Verify check exit codes 87 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 88 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 89 | 90 | # There should not be any tables in this database 91 | with db_utils.get_test_connection('discovery0') as conn: 92 | cur = conn.cursor() 93 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 94 | 95 | # Run discovery again 96 | check_job_name = runner.run_check_mode(self, conn_id) 97 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 98 | 99 | # When discovery mode finds 0 tables, the tap returns an error 100 | self.assertEqual(exit_status['discovery_exit_status'], 1) 101 | -------------------------------------------------------------------------------- /.circleci/server.crt: -------------------------------------------------------------------------------- 1 | Certificate: 2 | Data: 3 | Version: 3 (0x2) 4 | Serial Number: 5 | 3a:7d:37:66:c9:08:92:63:75:dc:ea:bc:2e:73:3e:97:19:d8:da:95 6 | Signature Algorithm: sha256WithRSAEncryption 7 | Issuer: CN = localhost 8 | Validity 9 | Not Before: Dec 22 21:23:13 2020 GMT 10 | Not After : Dec 20 21:23:13 2030 GMT 11 | Subject: CN = localhost 12 | Subject Public Key Info: 13 | Public Key Algorithm: rsaEncryption 14 | RSA Public-Key: (2048 bit) 15 | Modulus: 16 | 00:c1:66:eb:84:2b:b0:7d:99:b5:b0:16:7c:b7:ba: 17 | 80:04:f8:fa:76:e8:f7:c2:3c:41:21:3a:1d:10:a4: 18 | 0e:8a:3c:c4:2c:d3:a0:ef:e9:3c:7d:e1:37:29:34: 19 | 87:92:9d:2c:69:dd:d1:80:dd:65:6e:3b:27:af:4a: 20 | ca:c6:87:2a:fa:0d:a7:5a:12:5c:e5:5e:40:77:f9: 21 | fa:63:fe:2d:3a:50:c8:86:e6:2a:b2:3b:fc:3e:2b: 22 | 24:b3:e6:ae:61:5b:2c:2c:f4:f3:59:96:40:b8:9d: 23 | 6e:f1:57:a9:7e:28:d3:2a:4b:71:db:16:88:29:6e: 24 | b7:8b:0e:6a:c2:55:5b:c4:4c:4d:ed:e4:4a:be:36: 25 | d9:8f:9a:cf:75:79:12:00:01:7f:d6:d3:63:20:e4: 26 | 78:b9:6d:ef:ad:23:ec:ef:4d:20:70:6c:4d:bc:24: 27 | 7c:59:c2:ab:cc:77:b4:99:49:ac:7b:74:78:97:84: 28 | df:b9:70:6f:e3:c9:71:62:54:ce:63:c3:62:cd:a8: 29 | 3f:ae:99:e8:c8:76:b3:7a:88:66:2e:5e:c9:b6:9e: 30 | 57:f8:e7:ea:7c:98:b1:03:6e:36:aa:89:66:8f:38: 31 | d5:09:e7:f7:b8:32:84:13:32:ae:79:ef:18:d5:e5: 32 | 6a:cd:89:63:26:e0:c9:cf:ad:db:e5:7d:f1:61:af: 33 | 81:a9 34 | Exponent: 65537 (0x10001) 35 | X509v3 extensions: 36 | X509v3 Subject Key Identifier: 37 | 09:9A:C9:F9:7C:C8:5D:EC:22:04:E1:B0:EB:84:05:30:AC:54:E2:79 38 | X509v3 Authority Key Identifier: 39 | keyid:09:9A:C9:F9:7C:C8:5D:EC:22:04:E1:B0:EB:84:05:30:AC:54:E2:79 40 | 41 | X509v3 Basic Constraints: critical 42 | CA:TRUE 43 | Signature Algorithm: sha256WithRSAEncryption 44 | 11:75:a1:9a:cc:48:86:3b:12:c6:c6:b5:fa:64:d3:d9:9f:d1: 45 | 3d:31:59:36:af:2c:42:4c:cb:4b:3e:d1:28:ee:9f:d8:f7:19: 46 | 90:ef:03:82:4c:8c:e6:d5:ef:44:2b:3f:1d:d7:dd:f8:1a:32: 47 | 71:c1:b5:09:15:54:0d:a5:f9:75:2b:53:77:9a:63:67:d8:a3: 48 | 52:c4:e2:5b:70:0f:e7:3d:73:b6:8a:b6:98:79:9f:42:ee:ee: 49 | f7:21:5c:1a:17:ef:d7:22:60:73:97:0d:78:1b:ef:f2:9a:9b: 50 | f4:17:3b:0b:2a:c2:9a:76:1c:fe:d5:ec:7f:9e:ef:f5:f5:50: 51 | f1:c6:0a:f5:ca:97:19:d4:fe:1e:9a:6b:9e:c1:9c:aa:5b:77: 52 | 83:f3:d3:d6:de:1a:4d:f8:2b:df:4a:ba:49:26:b2:15:a5:5d: 53 | e8:0a:7c:85:7e:41:4d:64:3d:a1:65:8f:41:fb:4d:df:7b:eb: 54 | 3d:16:f7:4a:05:b9:9b:81:6e:d4:e3:ca:be:95:08:6b:3c:2a: 55 | c9:4d:8c:68:ce:37:5b:4f:ab:e0:81:7b:9c:51:95:48:f2:41: 56 | 
4d:b0:97:14:72:c6:02:31:4b:ec:80:a3:9c:e0:09:98:9a:dc: 57 | d4:b3:f6:c9:2a:04:5e:8c:ec:0e:c0:40:96:24:e4:70:15:4e: 58 | c7:44:19:31 59 | -----BEGIN CERTIFICATE----- 60 | MIIDCTCCAfGgAwIBAgIUOn03ZskIkmN13Oq8LnM+lxnY2pUwDQYJKoZIhvcNAQEL 61 | BQAwFDESMBAGA1UEAwwJbG9jYWxob3N0MB4XDTIwMTIyMjIxMjMxM1oXDTMwMTIy 62 | MDIxMjMxM1owFDESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG9w0BAQEF 63 | AAOCAQ8AMIIBCgKCAQEAwWbrhCuwfZm1sBZ8t7qABPj6duj3wjxBITodEKQOijzE 64 | LNOg7+k8feE3KTSHkp0sad3RgN1lbjsnr0rKxocq+g2nWhJc5V5Ad/n6Y/4tOlDI 65 | huYqsjv8Pisks+auYVssLPTzWZZAuJ1u8VepfijTKktx2xaIKW63iw5qwlVbxExN 66 | 7eRKvjbZj5rPdXkSAAF/1tNjIOR4uW3vrSPs700gcGxNvCR8WcKrzHe0mUmse3R4 67 | l4TfuXBv48lxYlTOY8Nizag/rpnoyHazeohmLl7Jtp5X+OfqfJixA242qolmjzjV 68 | Cef3uDKEEzKuee8Y1eVqzYljJuDJz63b5X3xYa+BqQIDAQABo1MwUTAdBgNVHQ4E 69 | FgQUCZrJ+XzIXewiBOGw64QFMKxU4nkwHwYDVR0jBBgwFoAUCZrJ+XzIXewiBOGw 70 | 64QFMKxU4nkwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAEXWh 71 | msxIhjsSxsa1+mTT2Z/RPTFZNq8sQkzLSz7RKO6f2PcZkO8DgkyM5tXvRCs/Hdfd 72 | +BoyccG1CRVUDaX5dStTd5pjZ9ijUsTiW3AP5z1ztoq2mHmfQu7u9yFcGhfv1yJg 73 | c5cNeBvv8pqb9Bc7CyrCmnYc/tXsf57v9fVQ8cYK9cqXGdT+HpprnsGcqlt3g/PT 74 | 1t4aTfgr30q6SSayFaVd6Ap8hX5BTWQ9oWWPQftN33vrPRb3SgW5m4Fu1OPKvpUI 75 | azwqyU2MaM43W0+r4IF7nFGVSPJBTbCXFHLGAjFL7ICjnOAJmJrc1LP2ySoEXozs 76 | DsBAliTkcBVOx0QZMQ== 77 | -----END CERTIFICATE----- 78 | -------------------------------------------------------------------------------- /tests/unittests/test_unsupported_pk.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tap_postgres 3 | import psycopg2 4 | import psycopg2.extras 5 | import os 6 | import pdb 7 | import singer 8 | from singer import get_logger, metadata, write_bookmark 9 | from utils import ensure_db, get_test_connection, ensure_test_table, select_all_of_stream, set_replication_method_for_stream, insert_record, get_test_connection_config 10 | import decimal 11 | import math 12 | import pytz 13 | import strict_rfc3339 14 | import copy 15 | 16 | LOGGER = get_logger() 17 | 18 | def do_not_dump_catalog(catalog): 19 | pass 20 | 21 | tap_postgres.dump_catalog = do_not_dump_catalog 22 | 23 | class Unsupported(unittest.TestCase): 24 | maxDiff = None 25 | table_name = 'CHICKEN TIMES' 26 | 27 | def setUp(self): 28 | ensure_db() 29 | with get_test_connection() as conn: 30 | cur = conn.cursor() 31 | table_spec = {"columns": [{"name": "interval_col", "type": "INTERVAL"}, 32 | {"name": "bit_string_col", "type": "bit(5)"}, 33 | {"name": "bytea_col", "type": "bytea"}, 34 | {"name": "point_col", "type": "point"}, 35 | {"name": "line_col", "type": "line"}, 36 | {"name": "lseg_col", "type": "lseg"}, 37 | {"name": "box_col", "type": "box"}, 38 | {"name": "polygon_col", "type": "polygon"}, 39 | {"name": "circle_col", "type": "circle"}, 40 | {"name": "xml_col", "type": "xml"}, 41 | {"name": "composite_col", "type": "person_composite"}, 42 | {"name": "int_range_col", "type": "int4range"}, 43 | ], 44 | "name": Unsupported.table_name} 45 | with get_test_connection() as conn: 46 | cur = conn.cursor() 47 | cur.execute(""" DROP TYPE IF EXISTS person_composite CASCADE """) 48 | cur.execute(""" CREATE TYPE person_composite AS (age int, name text) """) 49 | 50 | ensure_test_table(table_spec) 51 | 52 | def test_catalog(self): 53 | conn_config = get_test_connection_config() 54 | streams = tap_postgres.do_discovery(conn_config) 55 | chicken_streams = [s for s in streams if s['tap_stream_id'] == "postgres-public-CHICKEN TIMES"] 56 | 57 | 
self.assertEqual(len(chicken_streams), 1) 58 | stream_dict = chicken_streams[0] 59 | stream_dict.get('metadata').sort(key=lambda md: md['breadcrumb']) 60 | 61 | self.assertEqual(metadata.to_map(stream_dict.get('metadata')), 62 | {(): {'is-view': False, 'table-key-properties': [], 'row-count': 0, 'schema-name': 'public', 'database-name': 'postgres'}, 63 | ('properties', 'bytea_col'): {'sql-datatype': 'bytea', 'selected-by-default': False, 'inclusion': 'unsupported'}, 64 | ('properties', 'bit_string_col'): {'sql-datatype': 'bit(5)', 'selected-by-default': False, 'inclusion': 'unsupported'}, 65 | ('properties', 'line_col'): {'sql-datatype': 'line', 'selected-by-default': False, 'inclusion': 'unsupported'}, 66 | ('properties', 'xml_col'): {'sql-datatype': 'xml', 'selected-by-default': False, 'inclusion': 'unsupported'}, 67 | ('properties', 'int_range_col'): {'sql-datatype': 'int4range', 'selected-by-default': False, 'inclusion': 'unsupported'}, 68 | ('properties', 'circle_col'): {'sql-datatype': 'circle', 'selected-by-default': False, 'inclusion': 'unsupported'}, 69 | ('properties', 'polygon_col'): {'sql-datatype': 'polygon', 'selected-by-default': False, 'inclusion': 'unsupported'}, 70 | ('properties', 'box_col'): {'sql-datatype': 'box', 'selected-by-default': False, 'inclusion': 'unsupported'}, 71 | ('properties', 'lseg_col'): {'sql-datatype': 'lseg', 'selected-by-default': False, 'inclusion': 'unsupported'}, 72 | ('properties', 'composite_col'): {'sql-datatype': 'person_composite', 'selected-by-default': False, 'inclusion': 'unsupported'}, 73 | ('properties', 'interval_col'): {'sql-datatype': 'interval', 'selected-by-default': False, 'inclusion': 'unsupported'}, 74 | ('properties', 'point_col'): {'sql-datatype': 'point', 'selected-by-default': False, 'inclusion': 'unsupported'}} 75 | ) 76 | 77 | 78 | if __name__== "__main__": 79 | test1 = Unsupported() 80 | test1.setUp() 81 | test1.test_catalog() 82 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/incremental.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | import psycopg2 4 | import psycopg2.extras 5 | import singer 6 | from singer import utils 7 | import singer.metrics as metrics 8 | import tap_postgres.db as post_db 9 | 10 | 11 | LOGGER = singer.get_logger() 12 | 13 | UPDATE_BOOKMARK_PERIOD = 1000 14 | 15 | def fetch_max_replication_key(conn_config, replication_key, schema_name, table_name): 16 | with post_db.open_connection(conn_config, False) as conn: 17 | with conn.cursor() as cur: 18 | max_key_sql = """SELECT max({}) 19 | FROM {}""".format(post_db.prepare_columns_sql(replication_key), 20 | post_db.fully_qualified_table_name(schema_name, table_name)) 21 | LOGGER.info("determine max replication key value: %s", max_key_sql) 22 | cur.execute(max_key_sql) 23 | max_key = cur.fetchone()[0] 24 | LOGGER.info("max replication key value: %s", max_key) 25 | return max_key 26 | 27 | def sync_table(conn_info, stream, state, desired_columns, md_map): 28 | time_extracted = utils.now() 29 | 30 | stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 31 | if stream_version is None: 32 | stream_version = int(time.time() * 1000) 33 | 34 | state = singer.write_bookmark(state, 35 | stream['tap_stream_id'], 36 | 'version', 37 | stream_version) 38 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 39 | 40 | schema_name = md_map.get(()).get('schema-name') 41 | 42 | 
escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 43 | 44 | activate_version_message = singer.ActivateVersionMessage( 45 | stream=post_db.calculate_destination_stream_name(stream, md_map), 46 | version=stream_version) 47 | 48 | 49 | singer.write_message(activate_version_message) 50 | 51 | replication_key = md_map.get((), {}).get('replication-key') 52 | replication_key_value = singer.get_bookmark(state, stream['tap_stream_id'], 'replication_key_value') 53 | replication_key_sql_datatype = md_map.get(('properties', replication_key)).get('sql-datatype') 54 | 55 | hstore_available = post_db.hstore_available(conn_info) 56 | with metrics.record_counter(None) as counter: 57 | with post_db.open_connection(conn_info) as conn: 58 | 59 | # Client side character encoding defaults to the value in postgresql.conf under client_encoding. 60 | # The server / db can also have its own configred encoding. 61 | with conn.cursor() as cur: 62 | cur.execute("show server_encoding") 63 | LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0]) 64 | cur.execute("show client_encoding") 65 | LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0]) 66 | 67 | if hstore_available: 68 | LOGGER.info("hstore is available") 69 | psycopg2.extras.register_hstore(conn) 70 | else: 71 | LOGGER.info("hstore is UNavailable") 72 | 73 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 74 | cur.itersize = post_db.cursor_iter_size 75 | LOGGER.info("Beginning new incremental replication sync %s", stream_version) 76 | if replication_key_value: 77 | select_sql = """SELECT {} 78 | FROM {} 79 | WHERE {} >= '{}'::{} 80 | ORDER BY {} ASC""".format(','.join(escaped_columns), 81 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 82 | post_db.prepare_columns_sql(replication_key), replication_key_value, replication_key_sql_datatype, 83 | post_db.prepare_columns_sql(replication_key)) 84 | else: 85 | #if not replication_key_value 86 | select_sql = """SELECT {} 87 | FROM {} 88 | ORDER BY {} ASC""".format(','.join(escaped_columns), 89 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 90 | post_db.prepare_columns_sql(replication_key)) 91 | 92 | LOGGER.info("select statement: %s with itersize %s", select_sql, cur.itersize) 93 | cur.execute(select_sql) 94 | 95 | rows_saved = 0 96 | 97 | for rec in cur: 98 | record_message = post_db.selected_row_to_singer_message(stream, rec, stream_version, desired_columns, time_extracted, md_map) 99 | singer.write_message(record_message) 100 | rows_saved = rows_saved + 1 101 | 102 | #Picking a replication_key with NULL values will result in it ALWAYS been synced which is not great 103 | #event worse would be allowing the NULL value to enter into the state 104 | if record_message.record[replication_key] is not None: 105 | state = singer.write_bookmark(state, 106 | stream['tap_stream_id'], 107 | 'replication_key_value', 108 | record_message.record[replication_key]) 109 | 110 | 111 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 112 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 113 | 114 | counter.increment() 115 | 116 | return state 117 | -------------------------------------------------------------------------------- /tests/db_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psycopg2 3 | from psycopg2.extensions import quote_ident 4 | 5 | # steal top_map method from singer-python so we can remove singer imports from 
tests 6 | def to_map(raw_metadata): 7 | return {tuple(md['breadcrumb']): md['metadata'] for md in raw_metadata} 8 | 9 | def ensure_environment_variables_set(): 10 | missing_envs = [x for x in [os.getenv('TAP_POSTGRES_HOST'), 11 | os.getenv('TAP_POSTGRES_USER'), 12 | os.getenv('TAP_POSTGRES_PASSWORD'), 13 | os.getenv('TAP_POSTGRES_PORT'), 14 | os.getenv('TAP_POSTGRES_DBNAME')] if x is None] 15 | if len(missing_envs) != 0: 16 | raise Exception("Missing environment variables: {}".format(missing_envs)) 17 | 18 | def ensure_db(dbname=os.getenv('TAP_POSTGRES_DBNAME')): 19 | # Create database dev if not exists 20 | with get_test_connection('postgres') as conn: 21 | conn.autocommit = True 22 | with conn.cursor() as cur: 23 | cur.execute("SELECT 1 FROM pg_database WHERE datname = '{}'".format(dbname)) 24 | exists = cur.fetchone() 25 | if not exists: 26 | print("Creating database {}".format(dbname)) 27 | cur.execute("CREATE DATABASE {}".format(dbname)) 28 | 29 | def get_test_connection(dbname=os.getenv('TAP_POSTGRES_DBNAME'), logical_replication=False): 30 | 31 | conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'".format(os.getenv('TAP_POSTGRES_HOST'), 32 | dbname, 33 | os.getenv('TAP_POSTGRES_USER'), 34 | os.getenv('TAP_POSTGRES_PASSWORD'), 35 | os.getenv('TAP_POSTGRES_PORT')) 36 | 37 | if logical_replication: 38 | return psycopg2.connect(conn_string, connection_factory=psycopg2.extras.LogicalReplicationConnection) 39 | else: 40 | return psycopg2.connect(conn_string) 41 | 42 | def canonicalized_table_name(conn_cursor, schema, table): 43 | return "{}.{}".format(quote_ident(schema, conn_cursor), quote_ident(table, conn_cursor)) 44 | 45 | def ensure_replication_slot(conn_cursor, db_name=os.getenv('TAP_POSTGRES_DBNAME'), slot_name='stitch'): 46 | conn_cursor.execute("""SELECT EXISTS ( 47 | SELECT 1 48 | FROM pg_replication_slots 49 | WHERE slot_name = '{}') """, slot_name) 50 | 51 | old_slot = conn_cursor.fetchone()[0] 52 | 53 | with get_test_connection(db_name, True) as conn2: 54 | with conn2.cursor() as conn_2_cursor: 55 | if old_slot: 56 | conn_2_cursor.drop_replication_slot(slot_name) 57 | conn_2_cursor.create_replication_slot(slot_name, output_plugin='wal2json') 58 | 59 | def ensure_fresh_table(conn, conn_cursor, schema_name, table_name): 60 | """ 61 | If a table of the specified name and schema already exists, it was left over 62 | from a previous test run. Drop this table. 
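    After the drop, this helper also ensures the hstore and citext extensions exist,
    recreates the ALIGNMENT enum type used by the tests, and returns a second cursor
    opened on the same connection.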
63 | """ 64 | ctable_name = canonicalized_table_name(conn_cursor, schema_name, table_name) 65 | 66 | old_table = conn_cursor.execute("""SELECT EXISTS ( 67 | SELECT 1 68 | FROM information_schema.tables 69 | WHERE table_schema = %s 70 | AND table_name = %s);""", 71 | [schema_name, table_name]) 72 | old_table = conn_cursor.fetchone()[0] 73 | if old_table: 74 | conn_cursor.execute("DROP TABLE {}".format(ctable_name)) 75 | 76 | 77 | conn_cursor2 = conn.cursor() 78 | conn_cursor2.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 79 | if conn_cursor2.fetchone()[0] is None: 80 | conn_cursor2.execute(""" CREATE EXTENSION hstore; """) 81 | conn_cursor2.execute(""" CREATE EXTENSION IF NOT EXISTS citext WITH SCHEMA public;""") 82 | conn_cursor2.execute(""" DROP TYPE IF EXISTS ALIGNMENT CASCADE """) 83 | conn_cursor2.execute(""" CREATE TYPE ALIGNMENT AS ENUM ('good', 'bad', 'ugly') """) 84 | 85 | return conn_cursor2 86 | 87 | 88 | def insert_record(conn_cursor, table_name, data): 89 | our_keys = list(data.keys()) 90 | our_keys.sort() 91 | our_values = [data.get(key) for key in our_keys] 92 | 93 | columns_sql = ", \n ".join(our_keys) 94 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 95 | 96 | insert_sql = """ INSERT INTO {} 97 | ( {} ) 98 | VALUES ( {} )""".format(quote_ident(table_name, conn_cursor), columns_sql, value_sql) 99 | conn_cursor.execute(insert_sql, our_values) 100 | 101 | 102 | def update_record(conn_cursor, ctable_name, primary_key, data): 103 | """ 104 | Update an existing record as specified using the following params. 105 | :param conn_cursor: A pyschopg2 connection object. 106 | :param ctable_name: The canonicalized talbe name. 107 | :param primary_key: The value of the primary key 108 | of the record you want to update. 109 | :param data: A dictionary of fields to values to 110 | update in the record. 
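    Note: values are interpolated directly into the generated UPDATE statement
    rather than passed as bind parameters, so this helper is only intended for
    simple test data.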
111 | """ 112 | fields_to_update = "" 113 | for field, value in data.items(): 114 | if ' ' in field: 115 | field = quote_ident(field, conn_cursor) 116 | fields_to_update += " {} = '{}',".format(field, value) 117 | 118 | update_sql = "UPDATE {} SET{} WHERE id = {}".format(ctable_name, 119 | fields_to_update[:-1], 120 | primary_key) 121 | conn_cursor.execute(update_sql) 122 | 123 | def delete_record(conn_cursor, ctable_name, primary_key): 124 | # print("delete row from source db") 125 | # with db_utils.get_test_connection('dev') as conn: 126 | # with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 127 | # cur.execute("DELETE FROM {} WHERE id = 3".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 128 | 129 | conn_cursor.execute("DELETE FROM {} WHERE id = {}".format(ctable_name, primary_key)) 130 | -------------------------------------------------------------------------------- /tests/unittests/test_clear_state_on_replication_change.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tap_postgres 3 | 4 | tap_stream_id = 'chicken_table' 5 | 6 | class TestClearState(unittest.TestCase): 7 | 8 | def test_incremental_happy(self): 9 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 'last_replication_method' : 'INCREMENTAL'}}} 10 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 11 | self.assertEqual(nascent_state, state) 12 | 13 | def test_incremental_changing_replication_keys(self): 14 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 'last_replication_method' : 'INCREMENTAL'}}} 15 | 16 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at_2', 'INCREMENTAL') 17 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {'last_replication_method' : 'INCREMENTAL'}}}) 18 | 19 | def test_incremental_changing_replication_key_interrupted(self): 20 | xmin = '3737373' 21 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'xmin' : xmin, "replication_key" : 'updated_at', 'replication_key_value' : '2017-01-01T00:00:03+00:00', 22 | 'last_replication_method' : 'INCREMENTAL'}}} 23 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at_2', 'INCREMENTAL') 24 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { 'last_replication_method' : 'INCREMENTAL'}}}) 25 | 26 | def test_full_table_to_incremental(self): 27 | #interrupted full table -> incremental 28 | xmin = '3737373' 29 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'xmin' : xmin, "last_replication_method" : "FULL_TABLE"}}} 30 | 31 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 32 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 33 | 34 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, "last_replication_method" : "FULL_TABLE"}}} 35 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 36 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 37 | 38 | 39 | def test_log_based_to_incremental(self): 40 | state = 
{'bookmarks' : {tap_stream_id : { 'version' : 1, 'lsn' : 34343434, "last_replication_method" : "LOG_BASED"}}} 41 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 42 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 43 | 44 | state = {'bookmarks' : {tap_stream_id : { 'version' : 1, 'lsn' : 34343434, 'xmin' : 34343, "last_replication_method" : "LOG_BASED"}}} 45 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, 'updated_at', 'INCREMENTAL') 46 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : {"last_replication_method" : "INCREMENTAL"}}}) 47 | 48 | #full table tests 49 | def test_full_table_happy(self): 50 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "FULL_TABLE"}}} 51 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 52 | self.assertEqual(nascent_state, state) 53 | 54 | def test_full_table_interrupted(self): 55 | xmin = 333333 56 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "FULL_TABLE", 'xmin' : xmin}}} 57 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 58 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE", 'version': 88, 'xmin' : xmin}}}) 59 | 60 | def test_incremental_to_full_table(self): 61 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "INCREMENTAL", 'replication_key' : 'updated_at', 'replication_key_value' : 'i will be removed'}}} 62 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 63 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE"}}}) 64 | 65 | def test_log_based_to_full_table(self): 66 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : 343434}}} 67 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'FULL_TABLE') 68 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "FULL_TABLE"}}}) 69 | 70 | 71 | #log based tests 72 | def test_log_based_happy(self): 73 | lsn = 43434343 74 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : lsn}}} 75 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 76 | self.assertEqual(nascent_state, state) 77 | 78 | lsn = 43434343 79 | xmin = 11111 80 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "LOG_BASED", 'lsn' : lsn, 'xmin' : xmin}}} 81 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 82 | self.assertEqual(nascent_state, state) 83 | 84 | def test_incremental_to_log_based(self): 85 | state = {'bookmarks' : {tap_stream_id : { 'version' : 88, "last_replication_method" : "INCREMENTAL", 'replication_key' : 'updated_at', 'replication_key_value' : 'i will be removed'}}} 86 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 87 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "LOG_BASED"}}}) 88 | 89 | def 
test_full_table_to_log_based(self): 90 | state = {'bookmarks' : {tap_stream_id : { 'version' : 2222, "last_replication_method" : "FULL_TABLE", 'xmin' : 2}}} 91 | nascent_state = tap_postgres.clear_state_on_replication_change(state, tap_stream_id, None, 'LOG_BASED') 92 | self.assertEqual(nascent_state, {'bookmarks' : {tap_stream_id : { "last_replication_method" : "LOG_BASED"}}}) 93 | 94 | 95 | 96 | if __name__== "__main__": 97 | test1 = TestClearState() 98 | test1.test_full_table_to_log_based() 99 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/full_table.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=missing-docstring,not-an-iterable,too-many-locals,too-many-arguments,invalid-name,too-many-return-statements,too-many-branches,len-as-condition,too-many-nested-blocks,wrong-import-order,duplicate-code 3 | 4 | import copy 5 | import time 6 | import psycopg2 7 | import psycopg2.extras 8 | import singer 9 | from singer import utils 10 | import singer.metrics as metrics 11 | import tap_postgres.db as post_db 12 | 13 | LOGGER = singer.get_logger() 14 | 15 | UPDATE_BOOKMARK_PERIOD = 1000 16 | 17 | def sync_view(conn_info, stream, state, desired_columns, md_map): 18 | time_extracted = utils.now() 19 | 20 | #before writing the table version to state, check if we had one to begin with 21 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 22 | nascent_stream_version = int(time.time() * 1000) 23 | 24 | state = singer.write_bookmark(state, 25 | stream['tap_stream_id'], 26 | 'version', 27 | nascent_stream_version) 28 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 29 | 30 | schema_name = md_map.get(()).get('schema-name') 31 | 32 | escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 33 | 34 | activate_version_message = singer.ActivateVersionMessage( 35 | stream=post_db.calculate_destination_stream_name(stream, md_map), 36 | version=nascent_stream_version) 37 | 38 | if first_run: 39 | singer.write_message(activate_version_message) 40 | 41 | with metrics.record_counter(None) as counter: 42 | with post_db.open_connection(conn_info) as conn: 43 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 44 | cur.itersize = post_db.cursor_iter_size 45 | select_sql = 'SELECT {} FROM {}'.format(','.join(escaped_columns), 46 | post_db.fully_qualified_table_name(schema_name, stream['table_name'])) 47 | 48 | LOGGER.info("select %s with itersize %s", select_sql, cur.itersize) 49 | cur.execute(select_sql) 50 | 51 | rows_saved = 0 52 | for rec in cur: 53 | record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map) 54 | singer.write_message(record_message) 55 | rows_saved = rows_saved + 1 56 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 57 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 58 | 59 | counter.increment() 60 | 61 | #always send the activate version whether first run or subsequent 62 | singer.write_message(activate_version_message) 63 | 64 | return state 65 | 66 | 67 | def sync_table(conn_info, stream, state, desired_columns, md_map): 68 | time_extracted = utils.now() 69 | 70 | #before writing the table version to state, check if we had one to begin with 71 | first_run = singer.get_bookmark(state, stream['tap_stream_id'], 'version') is None 72 | 73 | #pick a 
new table version IFF we do not have an xmin in our state 74 | #the presence of an xmin indicates that we were interrupted last time through 75 | if singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') is None: 76 | nascent_stream_version = int(time.time() * 1000) 77 | else: 78 | nascent_stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version') 79 | 80 | state = singer.write_bookmark(state, 81 | stream['tap_stream_id'], 82 | 'version', 83 | nascent_stream_version) 84 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 85 | 86 | schema_name = md_map.get(()).get('schema-name') 87 | 88 | escaped_columns = map(post_db.prepare_columns_sql, desired_columns) 89 | 90 | activate_version_message = singer.ActivateVersionMessage( 91 | stream=post_db.calculate_destination_stream_name(stream, md_map), 92 | version=nascent_stream_version) 93 | 94 | if first_run: 95 | singer.write_message(activate_version_message) 96 | 97 | hstore_available = post_db.hstore_available(conn_info) 98 | with metrics.record_counter(None) as counter: 99 | with post_db.open_connection(conn_info) as conn: 100 | 101 | # Client side character encoding defaults to the value in postgresql.conf under client_encoding. 102 | # The server / db can also have its own configred encoding. 103 | with conn.cursor() as cur: 104 | cur.execute("show server_encoding") 105 | LOGGER.info("Current Server Encoding: %s", cur.fetchone()[0]) 106 | cur.execute("show client_encoding") 107 | LOGGER.info("Current Client Encoding: %s", cur.fetchone()[0]) 108 | 109 | if hstore_available: 110 | LOGGER.info("hstore is available") 111 | psycopg2.extras.register_hstore(conn) 112 | else: 113 | LOGGER.info("hstore is UNavailable") 114 | 115 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 116 | cur.itersize = post_db.cursor_iter_size 117 | 118 | xmin = singer.get_bookmark(state, stream['tap_stream_id'], 'xmin') 119 | if xmin: 120 | LOGGER.info("Resuming Full Table replication %s from xmin %s", nascent_stream_version, xmin) 121 | select_sql = """SELECT {}, xmin::text::bigint 122 | FROM {} where age(xmin::xid) <= age('{}'::xid) 123 | ORDER BY xmin::text::bigint ASC""".format(','.join(escaped_columns), 124 | post_db.fully_qualified_table_name(schema_name, stream['table_name']), 125 | xmin) 126 | else: 127 | LOGGER.info("Beginning new Full Table replication %s", nascent_stream_version) 128 | select_sql = """SELECT {}, xmin::text::bigint 129 | FROM {} 130 | ORDER BY xmin::text::bigint ASC""".format(','.join(escaped_columns), 131 | post_db.fully_qualified_table_name(schema_name, stream['table_name'])) 132 | 133 | 134 | LOGGER.info("select %s with itersize %s", select_sql, cur.itersize) 135 | cur.execute(select_sql) 136 | 137 | rows_saved = 0 138 | for rec in cur: 139 | xmin = rec['xmin'] 140 | rec = rec[:-1] 141 | record_message = post_db.selected_row_to_singer_message(stream, rec, nascent_stream_version, desired_columns, time_extracted, md_map) 142 | singer.write_message(record_message) 143 | state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', xmin) 144 | rows_saved = rows_saved + 1 145 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 146 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 147 | 148 | counter.increment() 149 | 150 | #once we have completed the full table replication, discard the xmin bookmark. 
151 | #the xmin bookmark only comes into play when a full table replication is interrupted 152 | state = singer.write_bookmark(state, stream['tap_stream_id'], 'xmin', None) 153 | 154 | #always send the activate version whether first run or subsequent 155 | singer.write_message(activate_version_message) 156 | 157 | return state 158 | -------------------------------------------------------------------------------- /tap_postgres/db.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import decimal 3 | import math 4 | import psycopg2 5 | import psycopg2.extras 6 | import singer 7 | LOGGER = singer.get_logger() 8 | 9 | cursor_iter_size = 20000 10 | include_schemas_in_destination_stream_name = False 11 | 12 | def get_ssl_status(conn_config): 13 | try: 14 | matching_rows = [] 15 | with open_connection(conn_config) as conn: 16 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 17 | select_sql = "SELECT datname,usename, ssl, client_addr FROM pg_stat_ssl JOIN pg_stat_activity ON pg_stat_ssl.pid = pg_stat_activity.pid" 18 | cur.execute(select_sql) 19 | for row in cur: 20 | if row[0] == conn_config['dbname'] and row[1] == conn_config['user']: 21 | matching_rows.append(row) 22 | if len(matching_rows) == 1: 23 | LOGGER.info('User %s connected with SSL = %s', conn_config['user'], matching_rows[0][2]) 24 | else: 25 | LOGGER.info('Failed to retrieve SSL status') 26 | except: 27 | LOGGER.info('Failed to retrieve SSL status') 28 | 29 | 30 | def calculate_destination_stream_name(stream, md_map): 31 | if include_schemas_in_destination_stream_name: 32 | return "{}_{}".format(md_map.get((), {}).get('schema-name'), stream['stream']) 33 | 34 | return stream['stream'] 35 | 36 | #from the postgres docs: 37 | #Quoted identifiers can contain any character, except the character with code zero. (To include a double #quote, write two double quotes.) 
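#For example, canonicalize_identifier('say "hi"') returns 'say ""hi""'; the helpers below then wrap the escaped name in double quotes.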
38 | def canonicalize_identifier(identifier): 39 | return identifier.replace('"', '""') 40 | 41 | def fully_qualified_column_name(schema, table, column): 42 | return '"{}"."{}"."{}"'.format(canonicalize_identifier(schema), canonicalize_identifier(table), canonicalize_identifier(column)) 43 | 44 | def fully_qualified_table_name(schema, table): 45 | return '"{}"."{}"'.format(canonicalize_identifier(schema), canonicalize_identifier(table)) 46 | 47 | def open_connection(conn_config, logical_replication=False): 48 | cfg = { 49 | 'host': conn_config['host'], 50 | 'dbname': conn_config['dbname'], 51 | 'user': conn_config['user'], 52 | 'password': conn_config['password'], 53 | 'port': conn_config['port'], 54 | 'connect_timeout': 30 55 | } 56 | 57 | if conn_config.get('sslmode'): 58 | cfg['sslmode'] = conn_config['sslmode'] 59 | 60 | if logical_replication: 61 | cfg['connection_factory'] = psycopg2.extras.LogicalReplicationConnection 62 | 63 | conn = psycopg2.connect(**cfg) 64 | 65 | return conn 66 | 67 | def prepare_columns_sql(c): 68 | column_name = """ "{}" """.format(canonicalize_identifier(c)) 69 | return column_name 70 | 71 | def filter_dbs_sql_clause(sql, filter_dbs): 72 | in_clause = " AND datname in (" + ",".join(["'{}'".format(b.strip(' ')) for b in filter_dbs.split(',')]) + ")" 73 | return sql + in_clause 74 | 75 | #pylint: disable=too-many-branches,too-many-nested-blocks 76 | def selected_value_to_singer_value_impl(elem, sql_datatype): 77 | sql_datatype = sql_datatype.replace('[]', '') 78 | if elem is None: 79 | cleaned_elem = elem 80 | elif sql_datatype == 'money': 81 | cleaned_elem = elem 82 | elif isinstance(elem, datetime.datetime): 83 | if sql_datatype == 'timestamp with time zone': 84 | cleaned_elem = elem.isoformat() 85 | else: #timestamp WITH OUT time zone 86 | cleaned_elem = elem.isoformat() + '+00:00' 87 | elif isinstance(elem, datetime.date): 88 | cleaned_elem = elem.isoformat() + 'T00:00:00+00:00' 89 | elif sql_datatype == 'bit': 90 | cleaned_elem = elem == '1' 91 | elif sql_datatype == 'boolean': 92 | cleaned_elem = elem 93 | elif isinstance(elem, int): 94 | cleaned_elem = elem 95 | elif isinstance(elem, datetime.time): 96 | cleaned_elem = str(elem) 97 | elif isinstance(elem, str): 98 | cleaned_elem = elem 99 | elif isinstance(elem, decimal.Decimal): 100 | #NB> We cast NaN's to NULL as wal2json does not support them and now we are at least consistent(ly wrong) 101 | if elem.is_nan(): 102 | cleaned_elem = None 103 | else: 104 | cleaned_elem = elem 105 | elif isinstance(elem, float): 106 | #NB> We cast NaN's, +Inf, -Inf to NULL as wal2json does not support them and now we are at least consistent(ly wrong) 107 | if math.isnan(elem): 108 | cleaned_elem = None 109 | elif math.isinf(elem): 110 | cleaned_elem = None 111 | else: 112 | cleaned_elem = elem 113 | elif isinstance(elem, dict): 114 | if sql_datatype == 'hstore': 115 | cleaned_elem = elem 116 | else: 117 | raise Exception("do not know how to marshall a dict if its not an hstore or json: {}".format(sql_datatype)) 118 | else: 119 | raise Exception("do not know how to marshall value of class( {} ) and sql_datatype ( {} )".format(elem.__class__, sql_datatype)) 120 | 121 | return cleaned_elem 122 | 123 | def selected_array_to_singer_value(elem, sql_datatype): 124 | if isinstance(elem, list): 125 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype), elem)) 126 | 127 | return selected_value_to_singer_value_impl(elem, sql_datatype) 128 | 129 | def selected_value_to_singer_value(elem, sql_datatype): 
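    #Entry point used by selected_row_to_singer_message: sql datatypes containing '[]' are converted
    #element-wise (including nested arrays); scalar values fall through to selected_value_to_singer_value_impl above.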
130 | #are we dealing with an array? 131 | if sql_datatype.find('[]') > 0: 132 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype), (elem or []))) 133 | 134 | return selected_value_to_singer_value_impl(elem, sql_datatype) 135 | 136 | #pylint: disable=too-many-arguments 137 | def selected_row_to_singer_message(stream, row, version, columns, time_extracted, md_map): 138 | row_to_persist = () 139 | for idx, elem in enumerate(row): 140 | sql_datatype = md_map.get(('properties', columns[idx]))['sql-datatype'] 141 | cleaned_elem = selected_value_to_singer_value(elem, sql_datatype) 142 | row_to_persist += (cleaned_elem,) 143 | 144 | rec = dict(zip(columns, row_to_persist)) 145 | 146 | return singer.RecordMessage( 147 | stream=calculate_destination_stream_name(stream, md_map), 148 | record=rec, 149 | version=version, 150 | time_extracted=time_extracted) 151 | 152 | def hstore_available(conn_info): 153 | with open_connection(conn_info) as conn: 154 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='stitch_cursor') as cur: 155 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 156 | res = cur.fetchone() 157 | if res and res[0]: 158 | return True 159 | return False 160 | 161 | 162 | def compute_tap_stream_id(database_name, schema_name, table_name): 163 | return database_name + '-' + schema_name + '-' + table_name 164 | 165 | 166 | #NB> numeric/decimal columns in postgres without a specified scale && precision 167 | #default to 'up to 131072 digits before the decimal point; up to 16383 168 | #digits after the decimal point'. For practical reasons, we are capping this at 74/38 169 | # https://www.postgresql.org/docs/10/static/datatype-numeric.html#DATATYPE-NUMERIC-TABLE 170 | MAX_SCALE = 38 171 | MAX_PRECISION = 100 172 | 173 | def numeric_precision(c): 174 | if c.numeric_precision is None: 175 | return MAX_PRECISION 176 | 177 | if c.numeric_precision > MAX_PRECISION: 178 | LOGGER.warning('capping decimal precision to 100. THIS MAY CAUSE TRUNCATION') 179 | return MAX_PRECISION 180 | 181 | return c.numeric_precision 182 | 183 | def numeric_scale(c): 184 | if c.numeric_scale is None: 185 | return MAX_SCALE 186 | if c.numeric_scale > MAX_SCALE: 187 | LOGGER.warning('capping decimal scale to 38. 
THIS MAY CAUSE TRUNCATION') 188 | return MAX_SCALE 189 | 190 | return c.numeric_scale 191 | 192 | def numeric_multiple_of(scale): 193 | return 10 ** (0 - scale) 194 | 195 | def numeric_max(precision, scale): 196 | return 10 ** (precision - scale) 197 | 198 | def numeric_min(precision, scale): 199 | return -10 ** (precision - scale) 200 | -------------------------------------------------------------------------------- /tests/unittests/utils.py: -------------------------------------------------------------------------------- 1 | from singer import get_logger, metadata 2 | from nose.tools import nottest 3 | import psycopg2 4 | import singer 5 | import os 6 | import decimal 7 | import math 8 | import datetime 9 | import pdb 10 | from psycopg2.extensions import quote_ident 11 | 12 | LOGGER = get_logger() 13 | 14 | def get_test_connection_config(target_db='postgres'): 15 | missing_envs = [x for x in [os.getenv('TAP_POSTGRES_HOST'), 16 | os.getenv('TAP_POSTGRES_USER'), 17 | os.getenv('TAP_POSTGRES_PASSWORD'), 18 | os.getenv('TAP_POSTGRES_PORT')] if x == None] 19 | if len(missing_envs) != 0: 20 | #pylint: disable=line-too-long 21 | raise Exception("set TAP_POSTGRES_HOST, TAP_POSTGRES_USER, TAP_POSTGRES_PASSWORD, TAP_POSTGRES_PORT") 22 | 23 | conn_config = {} 24 | conn_config['host'] = os.environ.get('TAP_POSTGRES_HOST') 25 | conn_config['user'] = os.environ.get('TAP_POSTGRES_USER') 26 | conn_config['password'] = os.environ.get('TAP_POSTGRES_PASSWORD') 27 | conn_config['port'] = os.environ.get('TAP_POSTGRES_PORT') 28 | conn_config['dbname'] = target_db 29 | return conn_config 30 | 31 | def get_test_connection(target_db='postgres'): 32 | conn_config = get_test_connection_config(target_db) 33 | conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'".format(conn_config['host'], 34 | conn_config['dbname'], 35 | conn_config['user'], 36 | conn_config['password'], 37 | conn_config['port']) 38 | LOGGER.info("connecting to {}".format(conn_config['host'])) 39 | 40 | conn = psycopg2.connect(conn_string) 41 | conn.autocommit = True 42 | 43 | return conn 44 | 45 | def build_col_sql(col, cur): 46 | if col.get('quoted'): 47 | col_sql = "{} {}".format(quote_ident(col['name'], cur), col['type']) 48 | else: 49 | col_sql = "{} {}".format(col['name'], col['type']) 50 | 51 | return col_sql 52 | 53 | def build_table(table, cur): 54 | create_sql = "CREATE TABLE {}\n".format(quote_ident(table['name'], cur)) 55 | col_sql = map(lambda c: build_col_sql(c, cur), table['columns']) 56 | pks = [c['name'] for c in table['columns'] if c.get('primary_key')] 57 | if len(pks) != 0: 58 | pk_sql = ",\n CONSTRAINT {} PRIMARY KEY({})".format(quote_ident(table['name'] + "_pk", cur), " ,".join(pks)) 59 | else: 60 | pk_sql = "" 61 | 62 | sql = "{} ( {} {})".format(create_sql, ",\n".join(col_sql), pk_sql) 63 | 64 | return sql 65 | 66 | def ensure_db(dbname='postgres'): 67 | # Create database dev if not exists 68 | with get_test_connection() as conn: 69 | conn.autocommit = True 70 | with conn.cursor() as cur: 71 | cur.execute("SELECT 1 FROM pg_database WHERE datname = '{}'".format(dbname)) 72 | exists = cur.fetchone() 73 | if not exists: 74 | print("Creating database {}".format(dbname)) 75 | cur.execute("CREATE DATABASE {}".format(dbname)) 76 | 77 | @nottest 78 | def ensure_test_table(table_spec, target_db='postgres'): 79 | with get_test_connection(target_db) as conn: 80 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 81 | sql = """SELECT * 82 | FROM information_schema.tables 83 | WHERE table_schema = 
'public' 84 | AND table_name = %s""" 85 | 86 | cur.execute(sql, 87 | [table_spec['name']]) 88 | old_table = cur.fetchall() 89 | 90 | if len(old_table) != 0: 91 | cur.execute('DROP TABLE {} cascade'.format(quote_ident(table_spec['name'], cur))) 92 | 93 | sql = build_table(table_spec, cur) 94 | LOGGER.info("create table sql: %s", sql) 95 | cur.execute(sql) 96 | 97 | def unselect_column(our_stream, col): 98 | md = metadata.to_map(our_stream['metadata']) 99 | md.get(('properties', col))['selected'] = False 100 | our_stream['metadata'] = metadata.to_list(md) 101 | return our_stream 102 | 103 | def set_replication_method_for_stream(stream, method): 104 | new_md = metadata.to_map(stream['metadata']) 105 | old_md = new_md.get(()) 106 | old_md.update({'replication-method': method}) 107 | 108 | stream['metadata'] = metadata.to_list(new_md) 109 | return stream 110 | 111 | def select_all_of_stream(stream): 112 | new_md = metadata.to_map(stream['metadata']) 113 | 114 | old_md = new_md.get(()) 115 | old_md.update({'selected': True}) 116 | for col_name, col_schema in stream['schema']['properties'].items(): 117 | #explicitly select column if it is not automatic 118 | if new_md.get(('properties', col_name)).get('inclusion') != 'automatic' and new_md.get(('properties', col_name)).get('inclusion') != 'unsupported': 119 | old_md = new_md.get(('properties', col_name)) 120 | old_md.update({'selected' : True}) 121 | 122 | stream['metadata'] = metadata.to_list(new_md) 123 | return stream 124 | 125 | 126 | def crud_up_value(value): 127 | if isinstance(value, str): 128 | return value 129 | elif isinstance(value, int): 130 | return str(value) 131 | elif isinstance(value, float): 132 | if (value == float('+inf')): 133 | return "'+Inf'" 134 | elif (value == float('-inf')): 135 | return "'-Inf'" 136 | elif (math.isnan(value)): 137 | return "'NaN'" 138 | else: 139 | return "{:f}".format(value) 140 | elif isinstance(value, decimal.Decimal): 141 | return "{:f}".format(value) 142 | elif value is None: 143 | return 'NULL' 144 | elif isinstance(value, datetime.datetime) and value.tzinfo is None: 145 | return "TIMESTAMP '{}'".format(str(value)) 146 | elif isinstance(value, datetime.datetime): 147 | return "TIMESTAMP '{}'".format(str(value)) 148 | elif isinstance(value, datetime.date): 149 | return "Date '{}'".format(str(value)) 150 | else: 151 | raise Exception("crud_up_value does not yet support {}".format(value.__class__)) 152 | 153 | def insert_record(cursor, table_name, data): 154 | our_keys = list(data.keys()) 155 | our_keys.sort() 156 | our_values = list(map( lambda k: data.get(k), our_keys)) 157 | 158 | 159 | columns_sql = ", \n".join(map(lambda k: quote_ident(k, cursor), our_keys)) 160 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 161 | 162 | insert_sql = """ INSERT INTO {} 163 | ( {} ) 164 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 165 | LOGGER.info("INSERT: {}".format(insert_sql)) 166 | cursor.execute(insert_sql, list(map(crud_up_value, our_values))) 167 | 168 | 169 | def verify_crud_messages(that, caught_messages, pks): 170 | 171 | that.assertEqual(14, len(caught_messages)) 172 | that.assertTrue(isinstance(caught_messages[0], singer.SchemaMessage)) 173 | that.assertTrue(isinstance(caught_messages[1], singer.RecordMessage)) 174 | that.assertTrue(isinstance(caught_messages[2], singer.StateMessage)) 175 | that.assertTrue(isinstance(caught_messages[3], singer.RecordMessage)) 176 | that.assertTrue(isinstance(caught_messages[4], singer.StateMessage)) 177 | 
that.assertTrue(isinstance(caught_messages[5], singer.RecordMessage)) 178 | that.assertTrue(isinstance(caught_messages[6], singer.StateMessage)) 179 | that.assertTrue(isinstance(caught_messages[7], singer.RecordMessage)) 180 | that.assertTrue(isinstance(caught_messages[8], singer.StateMessage)) 181 | that.assertTrue(isinstance(caught_messages[9], singer.RecordMessage)) 182 | that.assertTrue(isinstance(caught_messages[10], singer.StateMessage)) 183 | that.assertTrue(isinstance(caught_messages[11], singer.RecordMessage)) 184 | that.assertTrue(isinstance(caught_messages[12], singer.StateMessage)) 185 | that.assertTrue(isinstance(caught_messages[13], singer.StateMessage)) 186 | 187 | #schema includes scn && _sdc_deleted_at because we selected logminer as our replication method 188 | that.assertEqual({"type" : ['integer']}, caught_messages[0].schema.get('properties').get('scn') ) 189 | that.assertEqual({"type" : ['null', 'string'], "format" : "date-time"}, caught_messages[0].schema.get('properties').get('_sdc_deleted_at') ) 190 | 191 | that.assertEqual(pks, caught_messages[0].key_properties) 192 | 193 | #verify first STATE message 194 | bookmarks_1 = caught_messages[2].value.get('bookmarks')['ROOT-CHICKEN'] 195 | that.assertIsNotNone(bookmarks_1) 196 | bookmarks_1_scn = bookmarks_1.get('scn') 197 | bookmarks_1_version = bookmarks_1.get('version') 198 | that.assertIsNotNone(bookmarks_1_scn) 199 | that.assertIsNotNone(bookmarks_1_version) 200 | 201 | #verify STATE message after UPDATE 202 | bookmarks_2 = caught_messages[6].value.get('bookmarks')['ROOT-CHICKEN'] 203 | that.assertIsNotNone(bookmarks_2) 204 | bookmarks_2_scn = bookmarks_2.get('scn') 205 | bookmarks_2_version = bookmarks_2.get('version') 206 | that.assertIsNotNone(bookmarks_2_scn) 207 | that.assertIsNotNone(bookmarks_2_version) 208 | that.assertGreater(bookmarks_2_scn, bookmarks_1_scn) 209 | that.assertEqual(bookmarks_2_version, bookmarks_1_version) 210 | -------------------------------------------------------------------------------- /tests/test_postgres_views_logical_replication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | expected_schemas = {'chicken_view': {'properties': 13 | {'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 14 | 'minimum': -9223372036854775808}, 15 | 'size': {'type': ['null', 'string']}, 16 | 'name': {'type': ['null', 'string']}, 17 | 'id': {'maximum': 2147483647, 'type': ['null', 'integer'], 18 | 'minimum': -2147483648}, 19 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}}, 21 | 'type': 'object'}} 22 | 23 | 24 | def canonicalized_table_name(schema, table, cur): 25 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 26 | 27 | def insert_record(cursor, table_name, data): 28 | our_keys = list(data.keys()) 29 | our_keys.sort() 30 | our_values = [data.get(key) for key in our_keys] 31 | 32 | columns_sql = ", \n ".join(our_keys) 33 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 34 | 35 | insert_sql = """ INSERT INTO {} 36 | ( {} ) 37 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 38 | cursor.execute(insert_sql, our_values) 39 | 40 | 41 | 
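#As an illustration (hypothetical values), insert_record(cur, 'chicken', {'name': 'fred', 'size': 'big'})
#builds a parameterized statement roughly equivalent to:
#  INSERT INTO "chicken" ( name, size ) VALUES ( %s, %s )
#and executes it with ['fred', 'big'] as bind parameters, leaving all value quoting to psycopg2.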
42 | test_schema_name = "public" 43 | test_table_name_1 = "postgres_views_full_table_replication_test" 44 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 45 | test_view = 'chicken_view' 46 | 47 | class PostgresViewsLogicalReplication(unittest.TestCase): 48 | def setUp(self): 49 | db_utils.ensure_environment_variables_set() 50 | 51 | db_utils.ensure_db() 52 | 53 | self.maxDiff = None 54 | 55 | with db_utils.get_test_connection() as conn: 56 | conn.autocommit = True 57 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 58 | for table in [test_table_name_1, test_table_name_2]: 59 | old_table = cur.execute("""SELECT EXISTS ( 60 | SELECT 1 61 | FROM information_schema.tables 62 | WHERE table_schema = %s 63 | AND table_name = %s)""", 64 | [test_schema_name, table]) 65 | old_table = cur.fetchone()[0] 66 | if old_table: 67 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 68 | 69 | 70 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 71 | cur.execute("""CREATE TABLE {} 72 | (id SERIAL PRIMARY KEY, 73 | name VARCHAR, 74 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 75 | 76 | cur.execute("""CREATE TABLE {} 77 | (fk_id bigint, 78 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 79 | 80 | cur.execute("""CREATE VIEW {} AS 81 | (SELECT * 82 | FROM {} 83 | join {} 84 | on {}.id = {}.fk_id 85 | )""".format(quote_ident(test_view, cur), 86 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 87 | canonicalized_table_name(test_schema_name, test_table_name_2, cur), 88 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 89 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 90 | 91 | self.rec_1 = { 'name' : 'fred', 'size' : 'big' } 92 | insert_record(cur, test_table_name_1, self.rec_1) 93 | 94 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 95 | fk_id = cur.fetchone()[0] 96 | 97 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 98 | insert_record(cur, test_table_name_2, self.rec_2) 99 | 100 | @staticmethod 101 | def expected_check_streams(): 102 | return { 'postgres-public-chicken_view'} 103 | 104 | @staticmethod 105 | def expected_sync_streams(): 106 | return { 'chicken_view' } 107 | 108 | @staticmethod 109 | def name(): 110 | return "tap_tester_postgres_views_logical_replication" 111 | 112 | @staticmethod 113 | def expected_pks(): 114 | return { 115 | 'chicken_view' : {'id'} 116 | } 117 | 118 | @staticmethod 119 | def tap_name(): 120 | return "tap-postgres" 121 | 122 | @staticmethod 123 | def get_type(): 124 | return "platform.postgres" 125 | 126 | @staticmethod 127 | def get_credentials(): 128 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 129 | 130 | @staticmethod 131 | def get_properties(): 132 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 133 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 134 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 135 | 'user' : os.getenv('TAP_POSTGRES_USER'), 136 | 'default_replication_method' : 'FULL_TABLE' 137 | } 138 | 139 | def test_run(self): 140 | conn_id = connections.ensure_connection(self) 141 | 142 | # run in check mode 143 | check_job_name = runner.run_check_mode(self, conn_id) 144 | 145 | # verify check exit codes 146 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 147 | menagerie.verify_check_exit_status(self, 
exit_status, check_job_name) 148 | 149 | # verify the tap discovered the right streams 150 | found_catalogs = [fc for fc 151 | in menagerie.get_catalogs(conn_id) 152 | if fc['tap_stream_id'] in self.expected_check_streams()] 153 | 154 | self.assertEqual(len(found_catalogs), 155 | 1, 156 | msg="unable to locate schemas for connection {}".format(conn_id)) 157 | 158 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 159 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 160 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 161 | 162 | # verify that persisted streams have the correct properties 163 | chicken_catalog = found_catalogs[0] 164 | 165 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 166 | print("discovered streams are correct") 167 | 168 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 169 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 170 | 171 | self.assertEqual( 172 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 173 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 174 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 175 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 176 | ('properties', 'size'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 177 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}}, 178 | db_utils.to_map(md)) 179 | 180 | 181 | # 'ID' selected as view-key-properties 182 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': None, "replication-method" : "LOG_BASED", 'view-key-properties': ["id"]}}] 183 | 184 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 185 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 186 | replication_md) 187 | 188 | # clear state 189 | menagerie.set_state(conn_id, {}) 190 | 191 | sync_job_name = runner.run_sync_mode(self, conn_id) 192 | 193 | # verify tap and target exit codes 194 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 195 | 196 | self.assertEqual(exit_status['tap_exit_status'], 1) 197 | # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 198 | 199 | record_count_by_stream = runner.examine_target_output_file(self, 200 | conn_id, 201 | self.expected_sync_streams(), 202 | self.expected_pks()) 203 | 204 | self.assertEqual(record_count_by_stream, {}) 205 | print("records are correct") 206 | 207 | # verify state and bookmarks 208 | state = menagerie.get_state(conn_id) 209 | self.assertEqual(state, {}, msg="expected state to be empty") 210 | -------------------------------------------------------------------------------- /tests/test_postgres_views_full_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | expected_schemas = {'chicken_view': 13 | {'properties': 
{'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 14 | 'minimum': -9223372036854775808}, 15 | 'size': {'type': ['null', 'string']}, 16 | 'name': {'type': ['null', 'string']}, 17 | 'id': {'maximum': 2147483647, 'type': ['null', 'integer'], 18 | 'minimum': -2147483648}, 19 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}}, 21 | 'type': 'object', 22 | 'definitions' : { 23 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 24 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 25 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 26 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 27 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 28 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}} 29 | }}} 30 | 31 | 32 | def canonicalized_table_name(schema, table, cur): 33 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 34 | 35 | def insert_record(cursor, table_name, data): 36 | our_keys = list(data.keys()) 37 | our_keys.sort() 38 | our_values = [data.get(key) for key in our_keys] 39 | 40 | columns_sql = ", \n ".join(our_keys) 41 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 42 | 43 | insert_sql = """ INSERT INTO {} 44 | ( {} ) 45 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 46 | cursor.execute(insert_sql, our_values) 47 | 48 | 49 | 50 | test_schema_name = "public" 51 | test_table_name_1 = "postgres_views_full_table_replication_test" 52 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 53 | test_view = 'chicken_view' 54 | 55 | class PostgresViewsFullTable(unittest.TestCase): 56 | def setUp(self): 57 | db_utils.ensure_environment_variables_set() 58 | 59 | db_utils.ensure_db() 60 | 61 | self.maxDiff = None 62 | 63 | with db_utils.get_test_connection() as conn: 64 | conn.autocommit = True 65 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 66 | for table in [test_table_name_1, test_table_name_2]: 67 | old_table = cur.execute("""SELECT EXISTS ( 68 | SELECT 1 69 | FROM information_schema.tables 70 | WHERE table_schema = %s 71 | AND table_name = %s)""", 72 | [test_schema_name, table]) 73 | old_table = cur.fetchone()[0] 74 | if old_table: 75 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 76 | 77 | 78 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 79 | cur.execute("""CREATE TABLE {} 80 | (id SERIAL PRIMARY KEY, 81 | name VARCHAR, 82 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 83 | 84 | cur.execute("""CREATE TABLE {} 85 | (fk_id bigint, 86 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 87 | 88 | cur.execute("""CREATE VIEW {} AS 89 | (SELECT * 90 | FROM {} 91 | join {} 92 | on {}.id = {}.fk_id 93 | )""".format(quote_ident(test_view, cur), 94 | canonicalized_table_name(test_schema_name, 
test_table_name_1, cur), 95 | canonicalized_table_name(test_schema_name, test_table_name_2, cur), 96 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 97 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 98 | 99 | self.rec_1 = { 'name' : 'fred', 'size' : 'big' } 100 | insert_record(cur, test_table_name_1, self.rec_1) 101 | 102 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 103 | fk_id = cur.fetchone()[0] 104 | 105 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 106 | insert_record(cur, test_table_name_2, self.rec_2) 107 | 108 | 109 | @staticmethod 110 | def expected_check_streams(): 111 | return { 'postgres-public-chicken_view'} 112 | 113 | @staticmethod 114 | def expected_sync_streams(): 115 | return { 'chicken_view' } 116 | 117 | @staticmethod 118 | def name(): 119 | return "tap_tester_postgres_views_full_table" 120 | 121 | @staticmethod 122 | def expected_pks(): 123 | return { 124 | 'chicken_view' : {'id'} 125 | } 126 | 127 | @staticmethod 128 | def tap_name(): 129 | return "tap-postgres" 130 | 131 | @staticmethod 132 | def get_type(): 133 | return "platform.postgres" 134 | 135 | @staticmethod 136 | def get_credentials(): 137 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 138 | 139 | @staticmethod 140 | def get_properties(): 141 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 142 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 143 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 144 | 'user' : os.getenv('TAP_POSTGRES_USER') 145 | } 146 | 147 | def test_run(self): 148 | conn_id = connections.ensure_connection(self) 149 | 150 | # run in check mode 151 | check_job_name = runner.run_check_mode(self, conn_id) 152 | 153 | # verify check exit codes 154 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 155 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 156 | 157 | # verify the tap discovered the right streams 158 | found_catalogs = [fc for fc 159 | in menagerie.get_catalogs(conn_id) 160 | if fc['tap_stream_id'] in self.expected_check_streams()] 161 | 162 | self.assertEqual(len(found_catalogs), 163 | 1, 164 | msg="unable to locate schemas for connection {}".format(conn_id)) 165 | 166 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 167 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 168 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 169 | 170 | # verify that persisted streams have the correct properties 171 | chicken_catalog = found_catalogs[0] 172 | 173 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 174 | print("discovered streams are correct") 175 | 176 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 177 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 178 | 179 | self.assertEqual( 180 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 181 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 182 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 183 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 184 | ('properties', 'size'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': 
True}, 185 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}}, 186 | db_utils.to_map(md)) 187 | 188 | 189 | # 'ID' selected as view-key-properties 190 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': None, "replication-method" : "FULL_TABLE", 'view-key-properties': ["id"]}}] 191 | 192 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 193 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 194 | replication_md) 195 | 196 | # clear state 197 | menagerie.set_state(conn_id, {}) 198 | 199 | sync_job_name = runner.run_sync_mode(self, conn_id) 200 | 201 | # verify tap and target exit codes 202 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 203 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 204 | 205 | record_count_by_stream = runner.examine_target_output_file(self, 206 | conn_id, 207 | self.expected_sync_streams(), 208 | self.expected_pks()) 209 | 210 | 211 | self.assertEqual(record_count_by_stream, { 'chicken_view': 1}) 212 | records_by_stream = runner.get_records_from_target_output() 213 | 214 | table_version = records_by_stream['chicken_view']['table_version'] 215 | self.assertEqual(records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') 216 | self.assertEqual(records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') 217 | self.assertEqual(records_by_stream['chicken_view']['messages'][2]['action'], 'activate_version') 218 | 219 | # verifications about individual records 220 | for stream, recs in records_by_stream.items(): 221 | # verify the persisted schema was correct 222 | self.assertEqual(recs['schema'], 223 | expected_schemas[stream], 224 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 225 | 226 | actual_chicken_record = records_by_stream['chicken_view']['messages'][1]['data'] 227 | 228 | expected_chicken_record = {'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'size' : 'big'} 229 | self.assertEqual(actual_chicken_record, 230 | expected_chicken_record, 231 | msg="Expected `various_types` upsert record data to be {}, but target output {}".format(expected_chicken_record, actual_chicken_record)) 232 | 233 | print("records are correct") 234 | 235 | # verify state and bookmarks 236 | state = menagerie.get_state(conn_id) 237 | 238 | chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] 239 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 240 | self.assertEqual(chicken_bookmark['version'], table_version, 241 | msg="expected bookmark for stream ROOT-CHICKEN to match version") 242 | -------------------------------------------------------------------------------- /tests/test_postgres_views_incremental_replication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import unittest 4 | 5 | import psycopg2.extras 6 | from psycopg2.extensions import quote_ident 7 | import tap_tester.connections as connections 8 | import tap_tester.menagerie as menagerie 9 | import tap_tester.runner as runner 10 | 11 | import db_utils # pylint: disable=import-error 12 | 13 | 14 | expected_schemas = {'chicken_view': 15 | {'properties': {'fk_id': {'maximum': 9223372036854775807, 'type': ['null', 'integer'], 16 | 'minimum': -9223372036854775808}, 17 | 'size': {'type': ['null', 'string']}, 18 | 'name': {'type': ['null', 'string']}, 19 | 'id': 
{'maximum': 2147483647, 'type': ['null', 'integer'], 20 | 'minimum': -2147483648}, 21 | 'age': {'maximum': 2147483647, 'type': ['null', 'integer'], 22 | 'minimum': -2147483648}, 23 | 'updated_at': {'format': 'date-time', 24 | 'type': ['null', 'string']}}, 25 | 'type': 'object', 26 | 'definitions' : { 27 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 28 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 29 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 30 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 31 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 32 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}} 33 | }}} 34 | 35 | def canonicalized_table_name(schema, table, cur): 36 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 37 | 38 | def insert_record(cursor, table_name, data): 39 | our_keys = list(data.keys()) 40 | our_keys.sort() 41 | our_values = [data.get(key) for key in our_keys] 42 | 43 | 44 | columns_sql = ", \n ".join(our_keys) 45 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 46 | 47 | insert_sql = """ INSERT INTO {} 48 | ( {} ) 49 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 50 | cursor.execute(insert_sql, our_values) 51 | 52 | 53 | 54 | test_schema_name = "public" 55 | test_table_name_1 = "postgres_views_full_table_replication_test" 56 | test_table_name_2 = "postgres_views_full_table_replication_test_2" 57 | test_view = 'chicken_view' 58 | 59 | class PostgresViewsIncrementalReplication(unittest.TestCase): 60 | def setUp(self): 61 | db_utils.ensure_environment_variables_set() 62 | 63 | db_utils.ensure_db() 64 | 65 | self.maxDiff = None 66 | 67 | with db_utils.get_test_connection() as conn: 68 | conn.autocommit = True 69 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 70 | for table in [test_table_name_1, test_table_name_2]: 71 | old_table = cur.execute("""SELECT EXISTS ( 72 | SELECT 1 73 | FROM information_schema.tables 74 | WHERE table_schema = %s 75 | AND table_name = %s)""", 76 | [test_schema_name, table]) 77 | old_table = cur.fetchone()[0] 78 | if old_table: 79 | cur.execute("DROP TABLE {} CASCADE".format(canonicalized_table_name(test_schema_name, table, cur))) 80 | 81 | 82 | cur.execute("""DROP VIEW IF EXISTS {} """.format(quote_ident(test_view, cur))) 83 | cur.execute("""CREATE TABLE {} 84 | (id SERIAL PRIMARY KEY, 85 | updated_at TIMESTAMP WITH TIME ZONE, 86 | name VARCHAR, 87 | size VARCHAR) """.format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 88 | 89 | cur.execute("""CREATE TABLE {} 90 | (fk_id bigint, 91 | age integer) """.format(canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 92 | 93 | cur.execute("""CREATE VIEW {} AS 94 | (SELECT * 95 | FROM {} 96 | join {} 97 | on {}.id = {}.fk_id 98 | )""".format(quote_ident(test_view, cur), 99 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 100 | canonicalized_table_name(test_schema_name, 
test_table_name_2, cur), 101 | canonicalized_table_name(test_schema_name, test_table_name_1, cur), 102 | canonicalized_table_name(test_schema_name, test_table_name_2, cur))) 103 | 104 | self.rec_1 = { 'name' : 'fred', 'size' : 'big', 'updated_at' : datetime.datetime(2111, 1, 1, 12, 12, 12, 222111) } 105 | insert_record(cur, test_table_name_1, self.rec_1) 106 | 107 | cur.execute("SELECT id FROM {}".format(canonicalized_table_name(test_schema_name, test_table_name_1, cur))) 108 | fk_id = cur.fetchone()[0] 109 | 110 | self.rec_2 = { 'fk_id' : fk_id, 'age' : 99 } 111 | insert_record(cur, test_table_name_2, self.rec_2) 112 | 113 | @staticmethod 114 | def expected_check_streams(): 115 | return { 'postgres-public-chicken_view'} 116 | 117 | @staticmethod 118 | def expected_sync_streams(): 119 | return { 'chicken_view' } 120 | 121 | @staticmethod 122 | def name(): 123 | return "tap_tester_postgres_views_incremental_replication" 124 | 125 | @staticmethod 126 | def expected_pks(): 127 | return { 128 | 'chicken_view' : {'id'} 129 | } 130 | 131 | @staticmethod 132 | def tap_name(): 133 | return "tap-postgres" 134 | 135 | @staticmethod 136 | def get_type(): 137 | return "platform.postgres" 138 | 139 | @staticmethod 140 | def get_credentials(): 141 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 142 | 143 | @staticmethod 144 | def get_properties(): 145 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 146 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 147 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 148 | 'user' : os.getenv('TAP_POSTGRES_USER'), 149 | 'default_replication_method' : 'FULL_TABLE' 150 | } 151 | 152 | def test_run(self): 153 | conn_id = connections.ensure_connection(self) 154 | 155 | # run in check mode 156 | check_job_name = runner.run_check_mode(self, conn_id) 157 | 158 | # verify check exit codes 159 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 160 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 161 | 162 | # verify the tap discovered the right streams 163 | found_catalogs = [fc for fc 164 | in menagerie.get_catalogs(conn_id) 165 | if fc['tap_stream_id'] in self.expected_check_streams()] 166 | 167 | self.assertEqual(len(found_catalogs), 168 | 1, 169 | msg="unable to locate schemas for connection {}".format(conn_id)) 170 | 171 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 172 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 173 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 174 | 175 | # verify that persisted streams have the correct properties 176 | chicken_catalog = found_catalogs[0] 177 | 178 | self.assertEqual('chicken_view', chicken_catalog['stream_name']) 179 | print("discovered streams are correct") 180 | 181 | print('checking discoverd metadata for ROOT-CHICKEN_VIEW') 182 | md = menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id'])['metadata'] 183 | 184 | self.assertEqual( 185 | {(): {'database-name': 'postgres', 'is-view': True, 'row-count': 0, 'schema-name': 'public', 'table-key-properties': []}, 186 | ('properties', 'fk_id'): {'inclusion': 'available', 'sql-datatype': 'bigint', 'selected-by-default': True}, 187 | ('properties', 'name'): {'inclusion': 'available', 'sql-datatype': 'character varying', 'selected-by-default': True}, 188 | ('properties', 'age'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 189 | ('properties', 'size'): {'inclusion': 'available', 
'sql-datatype': 'character varying', 'selected-by-default': True}, 190 | ('properties', 'id'): {'inclusion': 'available', 'sql-datatype': 'integer', 'selected-by-default': True}, 191 | ('properties', 'updated_at'): {'selected-by-default': True, 'inclusion': 'available', 'sql-datatype': 'timestamp with time zone'}}, 192 | db_utils.to_map(md)) 193 | 194 | 195 | # 'ID' selected as view-key-properties, updated_at is replication_key 196 | replication_md = [{"breadcrumb": [], "metadata": {'replication-key': 'updated_at', "replication-method" : "INCREMENTAL", 'view-key-properties': ["id"]}}] 197 | 198 | connections.select_catalog_and_fields_via_metadata(conn_id, chicken_catalog, 199 | menagerie.get_annotated_schema(conn_id, chicken_catalog['stream_id']), 200 | replication_md) 201 | 202 | # clear state 203 | menagerie.set_state(conn_id, {}) 204 | 205 | sync_job_name = runner.run_sync_mode(self, conn_id) 206 | 207 | # verify tap and target exit codes 208 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 209 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 210 | 211 | record_count_by_stream = runner.examine_target_output_file(self, 212 | conn_id, 213 | self.expected_sync_streams(), 214 | self.expected_pks()) 215 | 216 | 217 | self.assertEqual(record_count_by_stream, { 'chicken_view': 1}) 218 | records_by_stream = runner.get_records_from_target_output() 219 | 220 | table_version = records_by_stream['chicken_view']['table_version'] 221 | self.assertEqual(2, len(records_by_stream['chicken_view']['messages'])) 222 | self.assertEqual(records_by_stream['chicken_view']['messages'][0]['action'], 'activate_version') 223 | self.assertEqual(records_by_stream['chicken_view']['messages'][1]['action'], 'upsert') 224 | 225 | # verifications about individual records 226 | for stream, recs in records_by_stream.items(): 227 | # verify the persisted schema was correct 228 | self.assertEqual(recs['schema'], 229 | expected_schemas[stream], 230 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 231 | 232 | actual_chicken_record = records_by_stream['chicken_view']['messages'][1]['data'] 233 | 234 | expected_chicken_record = {'id': 1, 'fk_id': 1, 'name': 'fred', 'age': 99, 'updated_at': '2111-01-01T12:12:12.222111+00:00', 'size' : 'big'} 235 | self.assertEqual(actual_chicken_record, 236 | expected_chicken_record, 237 | msg="Expected `various_types` upsert record data to be {}, but target output {}".format(expected_chicken_record, actual_chicken_record)) 238 | 239 | print("records are correct") 240 | 241 | # verify state and bookmarks 242 | state = menagerie.get_state(conn_id) 243 | 244 | chicken_bookmark = state['bookmarks']['postgres-public-chicken_view'] 245 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 246 | self.assertEqual(chicken_bookmark['version'], table_version, 247 | msg="expected bookmark for stream ROOT-CHICKEN to match version") 248 | self.assertEqual(chicken_bookmark['replication_key'], 'updated_at') 249 | self.assertEqual(chicken_bookmark['replication_key_value'],'2111-01-01T12:12:12.222111+00:00') 250 | print("bookmarks are correct") 251 | 252 | # TODO Verify expected fields have inclusion of 'automatic' 253 | -------------------------------------------------------------------------------- /tests/unittests/test_full_table_interruption.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import tap_postgres 4 | 
import tap_postgres.sync_strategies.full_table as full_table 5 | import tap_postgres.sync_strategies.common as pg_common 6 | import pdb 7 | import singer 8 | from singer import get_logger, metadata, write_bookmark 9 | 10 | from utils import ensure_db, get_test_connection, ensure_test_table, select_all_of_stream, set_replication_method_for_stream, insert_record, get_test_connection_config 11 | 12 | import decimal 13 | import math 14 | import pytz 15 | import strict_rfc3339 16 | import copy 17 | 18 | LOGGER = get_logger() 19 | 20 | CAUGHT_MESSAGES = [] 21 | COW_RECORD_COUNT = 0 22 | 23 | def singer_write_message_no_cow(message): 24 | global COW_RECORD_COUNT 25 | 26 | if isinstance(message, singer.RecordMessage) and message.stream == 'COW': 27 | COW_RECORD_COUNT = COW_RECORD_COUNT + 1 28 | if COW_RECORD_COUNT > 2: 29 | raise Exception("simulated exception") 30 | CAUGHT_MESSAGES.append(message) 31 | else: 32 | CAUGHT_MESSAGES.append(message) 33 | 34 | def singer_write_schema_ok(message): 35 | CAUGHT_MESSAGES.append(message) 36 | 37 | def singer_write_message_ok(message): 38 | CAUGHT_MESSAGES.append(message) 39 | 40 | def expected_record(fixture_row): 41 | expected_record = {} 42 | for k,v in fixture_row.items(): 43 | expected_record[k.replace('"', '')] = v 44 | 45 | return expected_record 46 | 47 | def do_not_dump_catalog(catalog): 48 | pass 49 | 50 | tap_postgres.dump_catalog = do_not_dump_catalog 51 | full_table.UPDATE_BOOKMARK_PERIOD = 1 52 | 53 | class LogicalInterruption(unittest.TestCase): 54 | maxDiff = None 55 | 56 | def setUp(self): 57 | ensure_db() 58 | table_spec_1 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 59 | {"name" : 'name', "type": "character varying"}, 60 | {"name" : 'colour', "type": "character varying"}], 61 | "name" : 'COW'} 62 | ensure_test_table(table_spec_1) 63 | global COW_RECORD_COUNT 64 | COW_RECORD_COUNT = 0 65 | global CAUGHT_MESSAGES 66 | CAUGHT_MESSAGES.clear() 67 | 68 | def test_catalog(self): 69 | singer.write_message = singer_write_message_no_cow 70 | pg_common.write_schema_message = singer_write_message_ok 71 | 72 | conn_config = get_test_connection_config() 73 | streams = tap_postgres.do_discovery(conn_config) 74 | cow_stream = [s for s in streams if s['table_name'] == 'COW'][0] 75 | self.assertIsNotNone(cow_stream) 76 | cow_stream = select_all_of_stream(cow_stream) 77 | cow_stream = set_replication_method_for_stream(cow_stream, 'LOG_BASED') 78 | 79 | with get_test_connection() as conn: 80 | conn.autocommit = True 81 | cur = conn.cursor() 82 | 83 | cow_rec = {'name' : 'betty', 'colour' : 'blue'} 84 | insert_record(cur, 'COW', cow_rec) 85 | 86 | cow_rec = {'name' : 'smelly', 'colour' : 'brow'} 87 | insert_record(cur, 'COW', cow_rec) 88 | 89 | cow_rec = {'name' : 'pooper', 'colour' : 'green'} 90 | insert_record(cur, 'COW', cow_rec) 91 | 92 | state = {} 93 | #the initial phase of cows logical replication will be a full table. 
94 | #it will sync the first record and then blow up on the 2nd record 95 | try: 96 | 97 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, state) 98 | except Exception as ex: 99 | blew_up_on_cow = True 100 | 101 | self.assertTrue(blew_up_on_cow) 102 | 103 | self.assertEqual(7, len(CAUGHT_MESSAGES)) 104 | 105 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 106 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 107 | self.assertIsNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('xmin')) 108 | self.assertIsNotNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn')) 109 | end_lsn = CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn') 110 | 111 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.ActivateVersionMessage)) 112 | new_version = CAUGHT_MESSAGES[2].version 113 | 114 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.RecordMessage)) 115 | self.assertEqual(CAUGHT_MESSAGES[3].record, {'colour': 'blue', 'id': 1, 'name': 'betty'}) 116 | self.assertEqual('COW', CAUGHT_MESSAGES[3].stream) 117 | 118 | 119 | 120 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.StateMessage)) 121 | #xmin is set while we are processing the full table replication 122 | self.assertIsNotNone(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-COW']['xmin']) 123 | self.assertEqual(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-COW']['lsn'], end_lsn) 124 | 125 | self.assertEqual(CAUGHT_MESSAGES[5].record['name'], 'smelly') 126 | self.assertEqual('COW', CAUGHT_MESSAGES[5].stream) 127 | 128 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.StateMessage)) 129 | last_xmin = CAUGHT_MESSAGES[6].value['bookmarks']['postgres-public-COW']['xmin'] 130 | old_state = CAUGHT_MESSAGES[6].value 131 | 132 | 133 | #run another do_sync, should get the remaining record which effectively finishes the initial full_table 134 | #replication portion of the logical replication 135 | singer.write_message = singer_write_message_ok 136 | global COW_RECORD_COUNT 137 | COW_RECORD_COUNT = 0 138 | CAUGHT_MESSAGES.clear() 139 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, old_state) 140 | 141 | self.assertEqual(8, len(CAUGHT_MESSAGES)) 142 | 143 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 144 | 145 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 146 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('xmin'), last_xmin) 147 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 148 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 149 | 150 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.RecordMessage)) 151 | self.assertEqual(CAUGHT_MESSAGES[2].record, {'colour': 'brow', 'id': 2, 'name': 'smelly'}) 152 | self.assertEqual('COW', CAUGHT_MESSAGES[2].stream) 153 | 154 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.StateMessage)) 155 | self.assertTrue(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('xmin'),last_xmin) 156 | self.assertEqual(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 157 | self.assertEqual(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 158 | 159 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.RecordMessage)) 160 | 
self.assertEqual(CAUGHT_MESSAGES[4].record['name'], 'pooper') 161 | self.assertEqual('COW', CAUGHT_MESSAGES[4].stream) 162 | 163 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.StateMessage)) 164 | self.assertTrue(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('xmin') > last_xmin) 165 | self.assertEqual(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 166 | self.assertEqual(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 167 | 168 | 169 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.ActivateVersionMessage)) 170 | self.assertEqual(CAUGHT_MESSAGES[6].version, new_version) 171 | 172 | self.assertTrue(isinstance(CAUGHT_MESSAGES[7], singer.StateMessage)) 173 | self.assertIsNone(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('xmin')) 174 | self.assertEqual(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('lsn'), end_lsn) 175 | self.assertEqual(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-COW'].get('version'), new_version) 176 | 177 | class FullTableInterruption(unittest.TestCase): 178 | maxDiff = None 179 | def setUp(self): 180 | table_spec_1 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 181 | {"name" : 'name', "type": "character varying"}, 182 | {"name" : 'colour', "type": "character varying"}], 183 | "name" : 'COW'} 184 | ensure_test_table(table_spec_1) 185 | 186 | table_spec_2 = {"columns": [{"name": "id", "type" : "serial", "primary_key" : True}, 187 | {"name" : 'name', "type": "character varying"}, 188 | {"name" : 'colour', "type": "character varying"}], 189 | "name" : 'CHICKEN'} 190 | ensure_test_table(table_spec_2) 191 | 192 | global COW_RECORD_COUNT 193 | COW_RECORD_COUNT = 0 194 | global CAUGHT_MESSAGES 195 | CAUGHT_MESSAGES.clear() 196 | 197 | def test_catalog(self): 198 | singer.write_message = singer_write_message_no_cow 199 | pg_common.write_schema_message = singer_write_message_ok 200 | 201 | conn_config = get_test_connection_config() 202 | streams = tap_postgres.do_discovery(conn_config) 203 | cow_stream = [s for s in streams if s['table_name'] == 'COW'][0] 204 | self.assertIsNotNone(cow_stream) 205 | cow_stream = select_all_of_stream(cow_stream) 206 | cow_stream = set_replication_method_for_stream(cow_stream, 'FULL_TABLE') 207 | 208 | chicken_stream = [s for s in streams if s['table_name'] == 'CHICKEN'][0] 209 | self.assertIsNotNone(chicken_stream) 210 | chicken_stream = select_all_of_stream(chicken_stream) 211 | chicken_stream = set_replication_method_for_stream(chicken_stream, 'FULL_TABLE') 212 | with get_test_connection() as conn: 213 | conn.autocommit = True 214 | cur = conn.cursor() 215 | 216 | cow_rec = {'name' : 'betty', 'colour' : 'blue'} 217 | insert_record(cur, 'COW', cow_rec) 218 | cow_rec = {'name' : 'smelly', 'colour' : 'brow'} 219 | insert_record(cur, 'COW', cow_rec) 220 | 221 | cow_rec = {'name' : 'pooper', 'colour' : 'green'} 222 | insert_record(cur, 'COW', cow_rec) 223 | 224 | chicken_rec = {'name' : 'fred', 'colour' : 'red'} 225 | insert_record(cur, 'CHICKEN', chicken_rec) 226 | 227 | state = {} 228 | #this will sync the CHICKEN but then blow up on the COW 229 | try: 230 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, state) 231 | except Exception as ex: 232 | # LOGGER.exception(ex) 233 | blew_up_on_cow = True 234 | 235 | self.assertTrue(blew_up_on_cow) 236 | 237 | 238 | self.assertEqual(14, len(CAUGHT_MESSAGES)) 239 | 240 | 
self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 241 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 242 | self.assertIsNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-CHICKEN'].get('xmin')) 243 | 244 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.ActivateVersionMessage)) 245 | new_version = CAUGHT_MESSAGES[2].version 246 | 247 | self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.RecordMessage)) 248 | self.assertEqual('CHICKEN', CAUGHT_MESSAGES[3].stream) 249 | 250 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.StateMessage)) 251 | #xmin is set while we are processing the full table replication 252 | self.assertIsNotNone(CAUGHT_MESSAGES[4].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 253 | 254 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.ActivateVersionMessage)) 255 | self.assertEqual(CAUGHT_MESSAGES[5].version, new_version) 256 | 257 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.StateMessage)) 258 | self.assertEqual(None, singer.get_currently_syncing( CAUGHT_MESSAGES[6].value)) 259 | #xmin is cleared at the end of the full table replication 260 | self.assertIsNone(CAUGHT_MESSAGES[6].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 261 | 262 | 263 | #cow messages 264 | self.assertEqual(CAUGHT_MESSAGES[7]['type'], 'SCHEMA') 265 | 266 | self.assertEqual("COW", CAUGHT_MESSAGES[7]['stream']) 267 | self.assertTrue(isinstance(CAUGHT_MESSAGES[8], singer.StateMessage)) 268 | self.assertIsNone(CAUGHT_MESSAGES[8].value['bookmarks']['postgres-public-COW'].get('xmin')) 269 | self.assertEqual("postgres-public-COW", CAUGHT_MESSAGES[8].value['currently_syncing']) 270 | 271 | self.assertTrue(isinstance(CAUGHT_MESSAGES[9], singer.ActivateVersionMessage)) 272 | cow_version = CAUGHT_MESSAGES[9].version 273 | self.assertTrue(isinstance(CAUGHT_MESSAGES[10], singer.RecordMessage)) 274 | 275 | self.assertEqual(CAUGHT_MESSAGES[10].record['name'], 'betty') 276 | self.assertEqual('COW', CAUGHT_MESSAGES[10].stream) 277 | 278 | self.assertTrue(isinstance(CAUGHT_MESSAGES[11], singer.StateMessage)) 279 | #xmin is set while we are processing the full table replication 280 | self.assertIsNotNone(CAUGHT_MESSAGES[11].value['bookmarks']['postgres-public-COW']['xmin']) 281 | 282 | 283 | self.assertEqual(CAUGHT_MESSAGES[12].record['name'], 'smelly') 284 | self.assertEqual('COW', CAUGHT_MESSAGES[12].stream) 285 | old_state = CAUGHT_MESSAGES[13].value 286 | 287 | #run another do_sync 288 | singer.write_message = singer_write_message_ok 289 | CAUGHT_MESSAGES.clear() 290 | global COW_RECORD_COUNT 291 | COW_RECORD_COUNT = 0 292 | 293 | tap_postgres.do_sync(get_test_connection_config(), {'streams' : streams}, None, old_state) 294 | 295 | self.assertEqual(CAUGHT_MESSAGES[0]['type'], 'SCHEMA') 296 | self.assertTrue(isinstance(CAUGHT_MESSAGES[1], singer.StateMessage)) 297 | 298 | # because we were interrupted, we do not switch versions 299 | self.assertEqual(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW']['version'], cow_version) 300 | self.assertIsNotNone(CAUGHT_MESSAGES[1].value['bookmarks']['postgres-public-COW']['xmin']) 301 | self.assertEqual("postgres-public-COW", singer.get_currently_syncing(CAUGHT_MESSAGES[1].value)) 302 | 303 | self.assertTrue(isinstance(CAUGHT_MESSAGES[2], singer.RecordMessage)) 304 | self.assertEqual(CAUGHT_MESSAGES[2].record['name'], 'smelly') 305 | self.assertEqual('COW', CAUGHT_MESSAGES[2].stream) 306 | 307 | 308 | #after record: activate version, state with no xmin or currently syncing 309 | 
self.assertTrue(isinstance(CAUGHT_MESSAGES[3], singer.StateMessage)) 310 | #we still have an xmin for COW because are not yet done with the COW table 311 | self.assertIsNotNone(CAUGHT_MESSAGES[3].value['bookmarks']['postgres-public-COW']['xmin']) 312 | self.assertEqual(singer.get_currently_syncing( CAUGHT_MESSAGES[3].value), 'postgres-public-COW') 313 | 314 | self.assertTrue(isinstance(CAUGHT_MESSAGES[4], singer.RecordMessage)) 315 | self.assertEqual(CAUGHT_MESSAGES[4].record['name'], 'pooper') 316 | self.assertEqual('COW', CAUGHT_MESSAGES[4].stream) 317 | 318 | self.assertTrue(isinstance(CAUGHT_MESSAGES[5], singer.StateMessage)) 319 | self.assertIsNotNone(CAUGHT_MESSAGES[5].value['bookmarks']['postgres-public-COW']['xmin']) 320 | self.assertEqual(singer.get_currently_syncing( CAUGHT_MESSAGES[5].value), 'postgres-public-COW') 321 | 322 | 323 | #xmin is cleared because we are finished the full table replication 324 | self.assertTrue(isinstance(CAUGHT_MESSAGES[6], singer.ActivateVersionMessage)) 325 | self.assertEqual(CAUGHT_MESSAGES[6].version, cow_version) 326 | 327 | self.assertTrue(isinstance(CAUGHT_MESSAGES[7], singer.StateMessage)) 328 | self.assertIsNone(singer.get_currently_syncing( CAUGHT_MESSAGES[7].value)) 329 | self.assertIsNone(CAUGHT_MESSAGES[7].value['bookmarks']['postgres-public-CHICKEN']['xmin']) 330 | self.assertIsNone(singer.get_currently_syncing( CAUGHT_MESSAGES[7].value)) 331 | 332 | 333 | if __name__== "__main__": 334 | test1 = LogicalInterruption() 335 | test1.setUp() 336 | test1.test_catalog() 337 | -------------------------------------------------------------------------------- /tests/test_postgres_logical_replication_multiple_tables.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | 13 | expected_schemas = {'postgres_logical_replication_test_cows': 14 | {'type': 'object', 15 | 'selected': True, 16 | 'properties': {'cow_name': {'selected': True, 'type': ['null', 'string'], 'inclusion': 'available'}, 17 | 'id': {'maximum': 2147483647, 'inclusion': 'automatic', 'type': ['integer'], 'minimum': -2147483648, 'selected': True}, 18 | 'cow_age': {'selected': True, 'type': ['null', 'integer'], 'inclusion': 'available'}}}, 19 | 20 | 'postgres_logical_replication_test_chickens': 21 | {'type': 'object', 22 | 'selected': True, 23 | 'properties': {'cow_name': {'selected': True, 'type': ['null', 'string'], 'inclusion': 'available'}, 24 | 'id': {'maximum': 2147483647, 'inclusion': 'automatic', 'type': ['integer'], 'minimum': -2147483648, 'selected': True}, 25 | 'cow_age': {'selected': True, 'type': ['null', 'integer'], 'inclusion': 'available'}}}} 26 | 27 | 28 | def insert_record(cursor, table_name, data): 29 | our_keys = list(data.keys()) 30 | our_keys.sort() 31 | our_values = [data.get(key) for key in our_keys] 32 | 33 | columns_sql = ", \n ".join(our_keys) 34 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 35 | 36 | insert_sql = """ INSERT INTO {} 37 | ( {} ) 38 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 39 | cursor.execute(insert_sql, our_values) 40 | 41 | test_schema_name = "public" 42 | test_table_name_cows = "postgres_logical_replication_test_cows" 43 | test_table_name_chickens = 
"postgres_logical_replication_test_chickens" 44 | 45 | def canonicalized_table_name(schema, table, cur): 46 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 47 | 48 | 49 | class PostgresLogicalRepMultipleTables(unittest.TestCase): 50 | def tearDown(self): 51 | with db_utils.get_test_connection('dev') as conn: 52 | conn.autocommit = True 53 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 54 | cur.execute(""" SELECT pg_drop_replication_slot('stitch') """) 55 | 56 | def setUp(self): 57 | db_utils.ensure_environment_variables_set() 58 | 59 | db_utils.ensure_db("dev") 60 | 61 | self.maxDiff = None 62 | 63 | with db_utils.get_test_connection('dev') as conn: 64 | conn.autocommit = True 65 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 66 | cur.execute(""" SELECT EXISTS (SELECT 1 67 | FROM pg_replication_slots 68 | WHERE slot_name = 'stitch') """) 69 | old_slot = cur.fetchone()[0] 70 | with db_utils.get_test_connection('dev', True) as conn2: 71 | with conn2.cursor() as cur2: 72 | if old_slot: 73 | cur2.drop_replication_slot("stitch") 74 | cur2.create_replication_slot('stitch', output_plugin='wal2json') 75 | 76 | for t in [test_table_name_cows, test_table_name_chickens]: 77 | old_table = cur.execute("""SELECT EXISTS ( 78 | SELECT 1 79 | FROM information_schema.tables 80 | WHERE table_schema = %s 81 | AND table_name = %s);""", 82 | [test_schema_name, t]) 83 | old_table = cur.fetchone()[0] 84 | 85 | if old_table: 86 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, t, cur))) 87 | 88 | 89 | cur = conn.cursor() 90 | create_table_sql = """ 91 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 92 | cow_age integer, 93 | cow_name varchar) 94 | """.format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur)) 95 | cur.execute(create_table_sql) 96 | 97 | create_table_sql = """ 98 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 99 | chicken_age integer, 100 | chicken_name varchar) 101 | """.format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur)) 102 | cur.execute(create_table_sql) 103 | 104 | #insert a cow 105 | self.cows_rec_1 = {'cow_name' : "anne_cow", 'cow_age' : 30} 106 | insert_record(cur, test_table_name_cows, self.cows_rec_1) 107 | 108 | #insert a chicken 109 | self.chickens_rec_1 = {'chicken_name' : "alfred_chicken", 'chicken_age' : 4} 110 | insert_record(cur, test_table_name_chickens, self.chickens_rec_1) 111 | 112 | @staticmethod 113 | def expected_check_streams(): 114 | return { 'dev-public-postgres_logical_replication_test_cows', 'dev-public-postgres_logical_replication_test_chickens'} 115 | 116 | @staticmethod 117 | def expected_sync_streams(): 118 | return { 'postgres_logical_replication_test_cows', 'postgres_logical_replication_test_chickens' } 119 | 120 | @staticmethod 121 | def expected_pks(): 122 | return { 123 | 'postgres_logical_replication_test_cows' : {'id'}, 124 | 'postgres_logical_replication_test_chickens' : {'id'} 125 | } 126 | 127 | @staticmethod 128 | def tap_name(): 129 | return "tap-postgres" 130 | 131 | @staticmethod 132 | def name(): 133 | return "tap_tester_postgres_logical_multiple_tables" 134 | 135 | @staticmethod 136 | def get_type(): 137 | return "platform.postgres" 138 | 139 | @staticmethod 140 | def get_credentials(): 141 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 142 | 143 | @staticmethod 144 | def get_properties(): 145 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 146 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 147 | 
'port' : os.getenv('TAP_POSTGRES_PORT'), 148 | 'user' : os.getenv('TAP_POSTGRES_USER'), 149 | 'default_replication_method' : 'LOG_BASED', 150 | 'logical_poll_total_seconds': '10' 151 | } 152 | 153 | 154 | def test_run(self): 155 | conn_id = connections.ensure_connection(self) 156 | 157 | # run in check mode 158 | check_job_name = runner.run_check_mode(self, conn_id) 159 | 160 | # verify check exit codes 161 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 162 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 163 | 164 | # verify the tap discovered the right streams 165 | found_catalogs = [fc for fc 166 | in menagerie.get_catalogs(conn_id) 167 | if fc['tap_stream_id'] in self.expected_check_streams()] 168 | 169 | 170 | self.assertGreaterEqual(len(found_catalogs), 171 | 2, 172 | msg="unable to locate schemas for connection {}".format(conn_id)) 173 | 174 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 175 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 176 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 177 | 178 | # verify that persisted streams have the correct properties 179 | 180 | test_catalog_cows = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] 181 | self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) 182 | 183 | 184 | test_catalog_chickens = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] 185 | self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) 186 | print("discovered streams are correct") 187 | 188 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 189 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_cows, 190 | menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), 191 | additional_md) 192 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_chickens, 193 | menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), 194 | additional_md) 195 | 196 | # clear state 197 | menagerie.set_state(conn_id, {}) 198 | 199 | sync_job_name = runner.run_sync_mode(self, conn_id) 200 | 201 | # verify tap and target exit codes 202 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 203 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 204 | 205 | record_count_by_stream = runner.examine_target_output_file(self, 206 | conn_id, 207 | self.expected_sync_streams(), 208 | self.expected_pks()) 209 | 210 | 211 | self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test_cows': 1, 'postgres_logical_replication_test_chickens': 1}) 212 | records_by_stream = runner.get_records_from_target_output() 213 | 214 | table_version_cows = records_by_stream['postgres_logical_replication_test_cows']['table_version'] 215 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][0]['action'], 'activate_version') 216 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][1]['action'], 'upsert') 217 | self.assertEqual(records_by_stream['postgres_logical_replication_test_cows']['messages'][2]['action'], 'activate_version') 218 | 219 | table_version_chickens = records_by_stream['postgres_logical_replication_test_chickens']['table_version'] 
220 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][0]['action'], 'activate_version') 221 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][1]['action'], 'upsert') 222 | self.assertEqual(records_by_stream['postgres_logical_replication_test_chickens']['messages'][2]['action'], 'activate_version') 223 | 224 | # verify state and bookmarks 225 | state = menagerie.get_state(conn_id) 226 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 227 | 228 | bookmark_cows = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 229 | self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") 230 | lsn_cows_1 = bookmark_cows['lsn'] 231 | self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") 232 | 233 | bookmark_chickens = state['bookmarks']['dev-public-postgres_logical_replication_test_chickens'] 234 | self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") 235 | lsn_chickens_1 = bookmark_chickens['lsn'] 236 | self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") 237 | 238 | 239 | #---------------------------------------------------------------------- 240 | # invoke the sync job again after adding records 241 | #---------------------------------------------------------------------- 242 | print("inserting 2 more cows and 2 more chickens") 243 | 244 | with db_utils.get_test_connection('dev') as conn: 245 | conn.autocommit = True 246 | with conn.cursor() as cur: 247 | # insert another cow 248 | self.cows_rec_2 = {'cow_name' : "betty cow", 'cow_age' : 21} 249 | insert_record(cur, test_table_name_cows, self.cows_rec_2) 250 | # update that cow's expected values 251 | self.cows_rec_2['id'] = 2 252 | self.cows_rec_2['_sdc_deleted_at'] = None 253 | 254 | # insert another chicken 255 | self.chicken_rec_2 = {'chicken_name' : "burt chicken", 'chicken_age' : 14} 256 | insert_record(cur, test_table_name_chickens, self.chicken_rec_2) 257 | # update that cow's expected values 258 | self.chicken_rec_2['id'] = 2 259 | self.chicken_rec_2['_sdc_deleted_at'] = None 260 | 261 | # and repeat... 
262 | 263 | self.cows_rec_3 = {'cow_name' : "cindy cow", 'cow_age' : 10} 264 | insert_record(cur, test_table_name_cows, self.cows_rec_3) 265 | self.cows_rec_3['id'] = 3 266 | self.cows_rec_3['_sdc_deleted_at'] = None 267 | 268 | 269 | self.chicken_rec_3 = {'chicken_name' : "carl chicken", 'chicken_age' : 4} 270 | insert_record(cur, test_table_name_chickens, self.chicken_rec_3) 271 | self.chicken_rec_3['id'] = 3 272 | self.chicken_rec_3['_sdc_deleted_at'] = None 273 | 274 | 275 | sync_job_name = runner.run_sync_mode(self, conn_id) 276 | 277 | # verify tap and target exit codes 278 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 279 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 280 | 281 | record_count_by_stream = runner.examine_target_output_file(self, 282 | conn_id, 283 | self.expected_sync_streams(), 284 | self.expected_pks()) 285 | self.assertEqual(record_count_by_stream, { 'postgres_logical_replication_test_cows': 2, 'postgres_logical_replication_test_chickens': 2}) 286 | records_by_stream = runner.get_records_from_target_output() 287 | chicken_messages = records_by_stream["postgres_logical_replication_test_chickens"]['messages'] 288 | cow_messages = records_by_stream["postgres_logical_replication_test_cows"]['messages'] 289 | 290 | self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data']) 291 | self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data']) 292 | self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data']) 293 | self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data']) 294 | 295 | print("inserted records are correct") 296 | 297 | state = menagerie.get_state(conn_id) 298 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 299 | cows_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 300 | self.assertIsNotNone(cows_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 301 | lsn_cows_2 = cows_bookmark['lsn'] 302 | self.assertTrue(lsn_cows_2 >= lsn_cows_1) 303 | 304 | chickens_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_chickens'] 305 | self.assertIsNotNone(chickens_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 306 | lsn_chickens_2 = chickens_bookmark['lsn'] 307 | self.assertTrue(lsn_chickens_2 >= lsn_chickens_1) 308 | 309 | #table_version does NOT change 310 | self.assertEqual(chickens_bookmark['version'], table_version_chickens, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 311 | 312 | #table_version does NOT change 313 | self.assertEqual(cows_bookmark['version'], table_version_cows, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 314 | -------------------------------------------------------------------------------- /tests/test_postgres_logical_replication_multiple_dbs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import psycopg2.extras 5 | from psycopg2.extensions import quote_ident 6 | import tap_tester.connections as connections 7 | import tap_tester.menagerie as menagerie 8 | import tap_tester.runner as runner 9 | 10 | import db_utils # pylint: disable=import-error 11 | 12 | 13 | def insert_record(cursor, table_name, data): 14 | our_keys = list(data.keys()) 15 | our_keys.sort() 16 | our_values =
[data.get(key) for key in our_keys] 17 | 18 | columns_sql = ", \n ".join(our_keys) 19 | value_sql = ",".join(["%s" for i in range(len(our_keys))]) 20 | 21 | insert_sql = """ INSERT INTO {} 22 | ( {} ) 23 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 24 | cursor.execute(insert_sql, our_values) 25 | 26 | 27 | test_schema_name = "public" 28 | test_table_name_cows = "postgres_logical_replication_test_cows" 29 | test_table_name_chickens = "postgres_logical_replication_test_chickens" 30 | 31 | def canonicalized_table_name(schema, table, cur): 32 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 33 | 34 | 35 | class PostgresLogicalRepMultipleDBs(unittest.TestCase): 36 | def tearDown(self): 37 | with db_utils.get_test_connection('dev') as conn: 38 | conn.autocommit = True 39 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 40 | cur.execute(""" SELECT pg_drop_replication_slot('stitch_dev') """) 41 | 42 | with db_utils.get_test_connection('postgres') as conn: 43 | conn.autocommit = True 44 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 45 | cur.execute(""" SELECT pg_drop_replication_slot('stitch_postgres') """) 46 | 47 | def setUp(self): 48 | db_utils.ensure_environment_variables_set() 49 | 50 | db_utils.ensure_db('dev') 51 | db_utils.ensure_db('postgres') 52 | 53 | self.maxDiff = None 54 | 55 | with db_utils.get_test_connection('dev') as conn: 56 | conn.autocommit = True 57 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 58 | cur.execute(""" SELECT EXISTS (SELECT 1 59 | FROM pg_replication_slots 60 | WHERE slot_name = 'stitch_dev') """) 61 | old_slot = cur.fetchone()[0] 62 | with db_utils.get_test_connection('dev', True) as conn2: 63 | with conn2.cursor() as cur2: 64 | if old_slot: 65 | cur2.drop_replication_slot("stitch_dev") 66 | cur2.create_replication_slot('stitch_dev', output_plugin='wal2json') 67 | 68 | old_table = cur.execute("""SELECT EXISTS ( 69 | SELECT 1 70 | FROM information_schema.tables 71 | WHERE table_schema = %s 72 | AND table_name = %s);""", 73 | [test_schema_name, test_table_name_cows]) 74 | old_table = cur.fetchone()[0] 75 | 76 | if old_table: 77 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur))) 78 | 79 | #create dev_cows 80 | cur = conn.cursor() 81 | create_table_sql = """ 82 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 83 | cow_age integer, 84 | cow_name varchar) 85 | """.format(canonicalized_table_name(test_schema_name, test_table_name_cows, cur)) 86 | cur.execute(create_table_sql) 87 | 88 | #insert a cow 89 | self.cows_rec_1 = {'cow_name' : "anne_cow", 'cow_age' : 30} 90 | insert_record(cur, test_table_name_cows, self.cows_rec_1) 91 | 92 | 93 | with db_utils.get_test_connection('postgres') as conn: 94 | conn.autocommit = True 95 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 96 | cur.execute(""" SELECT EXISTS (SELECT 1 97 | FROM pg_replication_slots 98 | WHERE slot_name = 'stitch_postgres') """) 99 | old_slot = cur.fetchone()[0] 100 | with db_utils.get_test_connection('postgres', True) as conn2: 101 | with conn2.cursor() as cur2: 102 | if old_slot: 103 | cur2.drop_replication_slot("stitch_postgres") 104 | cur2.create_replication_slot('stitch_postgres', output_plugin='wal2json') 105 | 106 | 107 | old_table = cur.execute("""SELECT EXISTS ( 108 | SELECT 1 109 | FROM information_schema.tables 110 | WHERE table_schema = %s 111 | AND table_name = %s);""", 112 | 
[test_schema_name, test_table_name_chickens]) 113 | old_table = cur.fetchone()[0] 114 | 115 | if old_table: 116 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur))) 117 | 118 | 119 | #create postgres_chickens 120 | create_table_sql = """ 121 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 122 | chicken_age integer, 123 | chicken_name varchar) 124 | """.format(canonicalized_table_name(test_schema_name, test_table_name_chickens, cur)) 125 | cur.execute(create_table_sql) 126 | 127 | 128 | #insert a chicken 129 | self.chickens_rec_1 = {'chicken_name' : "alfred_chicken", 'chicken_age' : 4} 130 | insert_record(cur, test_table_name_chickens, self.chickens_rec_1) 131 | 132 | @staticmethod 133 | def expected_check_streams(): 134 | return { 'dev-public-postgres_logical_replication_test_cows', 'postgres-public-postgres_logical_replication_test_chickens'} 135 | 136 | @staticmethod 137 | def expected_sync_streams(): 138 | return { 'public_postgres_logical_replication_test_cows', 'public_postgres_logical_replication_test_chickens' } 139 | 140 | @staticmethod 141 | def expected_pks(): 142 | return { 143 | 'public_postgres_logical_replication_test_cows' : {'id'}, 144 | 'public_postgres_logical_replication_test_chickens' : {'id'} 145 | } 146 | 147 | @staticmethod 148 | def tap_name(): 149 | return "tap-postgres" 150 | 151 | @staticmethod 152 | def name(): 153 | return "tap_tester_postgres_logical_multiple_dbs" 154 | 155 | @staticmethod 156 | def get_type(): 157 | return "platform.postgres" 158 | 159 | @staticmethod 160 | def get_credentials(): 161 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 162 | 163 | @staticmethod 164 | def get_properties(): 165 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 166 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 167 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 168 | 'user' : os.getenv('TAP_POSTGRES_USER'), 169 | 'default_replication_method' : 'LOG_BASED', 170 | 'include_schemas_in_destination_stream_name' : 'true', 171 | 'debug_lsn': 'true', 172 | 'logical_poll_total_seconds': '10' 173 | } 174 | 175 | 176 | def test_run(self): 177 | conn_id = connections.ensure_connection(self) 178 | 179 | # run in check mode 180 | check_job_name = runner.run_check_mode(self, conn_id) 181 | 182 | # verify check exit codes 183 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 184 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 185 | 186 | # verify the tap discovered the right streams 187 | found_catalogs = [fc for fc 188 | in menagerie.get_catalogs(conn_id) 189 | if fc['tap_stream_id'] in self.expected_check_streams()] 190 | 191 | self.assertGreaterEqual(len(found_catalogs), 192 | 2, 193 | msg="unable to locate schemas for connection {}".format(conn_id)) 194 | 195 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 196 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 197 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 198 | 199 | # verify that persisted streams have the correct properties 200 | test_catalog_cows = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_cows', found_catalogs))[0] 201 | self.assertEqual('postgres_logical_replication_test_cows', test_catalog_cows['stream_name']) 202 | 203 | 204 | test_catalog_chickens = list(filter( lambda c: c['stream_name'] == 'postgres_logical_replication_test_chickens', found_catalogs))[0] 205 | 
self.assertEqual('postgres_logical_replication_test_chickens', test_catalog_chickens['stream_name']) 206 | print("discovered streams are correct") 207 | 208 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'LOG_BASED'}}] 209 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_cows, 210 | menagerie.get_annotated_schema(conn_id, test_catalog_cows['stream_id']), 211 | additional_md) 212 | connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog_chickens, 213 | menagerie.get_annotated_schema(conn_id, test_catalog_chickens['stream_id']), 214 | additional_md) 215 | 216 | # clear state 217 | menagerie.set_state(conn_id, {}) 218 | 219 | #run sync job 220 | sync_job_name = runner.run_sync_mode(self, conn_id) 221 | 222 | # verify tap and target exit codes 223 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 224 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 225 | 226 | record_count_by_stream = runner.examine_target_output_file(self, 227 | conn_id, 228 | self.expected_sync_streams(), 229 | self.expected_pks()) 230 | 231 | 232 | self.assertEqual(record_count_by_stream, { 'public_postgres_logical_replication_test_cows': 1, 'public_postgres_logical_replication_test_chickens': 1}) 233 | records_by_stream = runner.get_records_from_target_output() 234 | 235 | table_version_cows = records_by_stream['public_postgres_logical_replication_test_cows']['table_version'] 236 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][0]['action'], 'activate_version') 237 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][1]['action'], 'upsert') 238 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_cows']['messages'][2]['action'], 'activate_version') 239 | 240 | table_version_chickens = records_by_stream['public_postgres_logical_replication_test_chickens']['table_version'] 241 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][0]['action'], 'activate_version') 242 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][1]['action'], 'upsert') 243 | self.assertEqual(records_by_stream['public_postgres_logical_replication_test_chickens']['messages'][2]['action'], 'activate_version') 244 | 245 | # verify state and bookmarks 246 | state = menagerie.get_state(conn_id) 247 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 248 | 249 | bookmark_cows = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 250 | self.assertIsNotNone(bookmark_cows['lsn'], msg="expected bookmark for stream to have an lsn") 251 | lsn_cows_1 = bookmark_cows['lsn'] 252 | self.assertEqual(bookmark_cows['version'], table_version_cows, msg="expected bookmark for stream to match version") 253 | 254 | bookmark_chickens = state['bookmarks']['postgres-public-postgres_logical_replication_test_chickens'] 255 | self.assertIsNotNone(bookmark_chickens['lsn'], msg="expected bookmark for stream to have an lsn") 256 | lsn_chickens_1 = bookmark_chickens['lsn'] 257 | self.assertEqual(bookmark_chickens['version'], table_version_chickens, msg="expected bookmark for stream to match version") 258 | 259 | 260 | #---------------------------------------------------------------------- 261 | # invoke the sync job again after adding records 262 | 
#---------------------------------------------------------------------- 263 | print("inserting 1 more cow and 1 more chicken") 264 | 265 | with db_utils.get_test_connection('dev') as conn: 266 | conn.autocommit = True 267 | with conn.cursor() as cur: 268 | #insert another cow 269 | self.cows_rec_2 = {'cow_name' : "betty cow", 'cow_age' : 21} 270 | insert_record(cur, test_table_name_cows, self.cows_rec_2) 271 | 272 | with db_utils.get_test_connection('postgres') as conn: 273 | conn.autocommit = True 274 | with conn.cursor() as cur: 275 | #insert another chicken 276 | self.chicken_rec_2 = {'chicken_name' : "burt chicken", 'chicken_age' : 14} 277 | insert_record(cur, test_table_name_chickens, self.chicken_rec_2) 278 | 279 | sync_job_name = runner.run_sync_mode(self, conn_id) 280 | 281 | # verify tap and target exit codes 282 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 283 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 284 | 285 | record_count_by_stream = runner.examine_target_output_file(self, 286 | conn_id, 287 | self.expected_sync_streams(), 288 | self.expected_pks()) 289 | self.assertEqual(record_count_by_stream, { 'public_postgres_logical_replication_test_cows': 1, 'public_postgres_logical_replication_test_chickens': 1}) 290 | 291 | upserts = [] 292 | for u in runner.get_upserts_from_target_output(): 293 | self.assertIsNotNone(u.get('_sdc_lsn')) 294 | del u['_sdc_lsn'] 295 | upserts.append(u) 296 | 297 | self.assertEqual([{'_sdc_deleted_at': None, 'cow_age': 21, 'id': 2, 'cow_name': 'betty cow'}, 298 | {'chicken_name': 'burt chicken', '_sdc_deleted_at': None, 'chicken_age': 14, 'id': 2}], 299 | upserts) 300 | 301 | print("inserted records are correct") 302 | 303 | state = menagerie.get_state(conn_id) 304 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 305 | cows_bookmark = state['bookmarks']['dev-public-postgres_logical_replication_test_cows'] 306 | self.assertIsNotNone(cows_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 307 | lsn_cows_2 = cows_bookmark['lsn'] 308 | self.assertTrue(lsn_cows_2 >= lsn_cows_1) 309 | 310 | chickens_bookmark = state['bookmarks']['postgres-public-postgres_logical_replication_test_chickens'] 311 | self.assertIsNotNone(chickens_bookmark['lsn'], msg="expected bookmark for stream public-postgres_logical_replication_test to have an lsn") 312 | lsn_chickens_2 = chickens_bookmark['lsn'] 313 | self.assertTrue(lsn_chickens_2 >= lsn_chickens_1) 314 | 315 | #table_version does NOT change 316 | self.assertEqual(chickens_bookmark['version'], table_version_chickens, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 317 | 318 | #table_version does NOT change 319 | self.assertEqual(cows_bookmark['version'], table_version_cows, msg="expected bookmark for stream public-postgres_logical_replication_test to match version") 320 | -------------------------------------------------------------------------------- /tests/test_postgres_full_table_replication_arrays.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import decimal 3 | import json 4 | import os 5 | import unittest 6 | import uuid 7 | 8 | import pytz 9 | import psycopg2.extras 10 | from psycopg2.extensions import quote_ident 11 | import tap_tester.connections as connections 12 | import tap_tester.menagerie as menagerie 13 | import tap_tester.runner as runner 14 | 15
| import db_utils # pylint: disable=import-error 16 | 17 | 18 | 19 | test_schema_name = "public" 20 | test_table_name = "postgres_full_table_replication_array_test" 21 | 22 | 23 | MAX_SCALE = 38 24 | MAX_PRECISION = 100 25 | expected_schemas = {test_table_name: 26 | {'definitions' : { 27 | 'sdc_recursive_integer_array' : { 'type' : ['null', 'integer', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_integer_array'}}, 28 | 'sdc_recursive_number_array' : { 'type' : ['null', 'number', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_number_array'}}, 29 | 'sdc_recursive_string_array' : { 'type' : ['null', 'string', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_string_array'}}, 30 | 'sdc_recursive_boolean_array' : { 'type' : ['null', 'boolean', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_boolean_array'}}, 31 | 'sdc_recursive_timestamp_array' : { 'type' : ['null', 'string', 'array'], 'format' : 'date-time', 'items' : { '$ref': '#/definitions/sdc_recursive_timestamp_array'}}, 32 | 'sdc_recursive_object_array' : { 'type' : ['null','object', 'array'], 'items' : { '$ref': '#/definitions/sdc_recursive_object_array'}}, 33 | "sdc_recursive_decimal_12_2_array": {"exclusiveMaximum": True, 34 | "exclusiveMinimum": True, 35 | "type": ['null', "number", "array"], 36 | "items": { 37 | "$ref": "#/definitions/sdc_recursive_decimal_12_2_array" 38 | }, 39 | "minimum": -10000000000, 40 | "multipleOf": decimal.Decimal('0.01'), 41 | "maximum": 10000000000}}, 42 | 'type': 'object', 43 | 'properties': {'id': {'maximum': 2147483647, 'type': ['integer'], 'minimum': -2147483648}, 44 | 'our_bit_array': {'items': { '$ref' : '#/definitions/sdc_recursive_boolean_array'}, 'type': ['null', 'array']}, 45 | 'our_boolean_array': {'items': { '$ref' : '#/definitions/sdc_recursive_boolean_array'}, 'type': ['null', 'array']}, 46 | 'our_cidr_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 47 | 'our_citext_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 48 | 'our_date_array': {'items':{ '$ref' : '#/definitions/sdc_recursive_timestamp_array'}, 'type': ['null', 'array']}, 49 | 'our_decimal_array' : {'type': ['null', 'array'], 'items': {'$ref' : '#/definitions/sdc_recursive_decimal_12_2_array'}}, 50 | 'our_double_array': {'items': { '$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 51 | 'our_enum_array': {'type': ['null', 'array'], 'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}} , 52 | 'our_float_array': {'items': { '$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 53 | 'our_hstore_array': {'items': { '$ref' : '#/definitions/sdc_recursive_object_array'}, 'type': ['null', 'array']}, 54 | 'our_inet_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 55 | 'our_int_array': {'items': { '$ref' : '#/definitions/sdc_recursive_integer_array'}, 'type': ['null', 'array']}, 56 | 'our_json_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 57 | 'our_jsonb_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 58 | 'our_mac_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 59 | 'our_money_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 60 | 'our_real_array': {'items': { 
'$ref' : '#/definitions/sdc_recursive_number_array'}, 'type': ['null', 'array']}, 61 | 'our_smallint_array': {'items': { '$ref' : '#/definitions/sdc_recursive_integer_array'}, 'type': ['null', 'array']}, 62 | 'our_string_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 63 | 'our_text_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 64 | 'our_time_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}, 65 | 'our_ts_tz_array': {'items': { '$ref' : '#/definitions/sdc_recursive_timestamp_array'}, 'type': ['null', 'array']}, 66 | 'our_uuid_array': {'items': { '$ref' : '#/definitions/sdc_recursive_string_array'}, 'type': ['null', 'array']}} 67 | }} 68 | 69 | 70 | def insert_record(cursor, table_name, data): 71 | our_keys = list(data.keys()) 72 | our_keys.sort() 73 | our_values = [data.get(key) for key in our_keys] 74 | 75 | columns_sql = ", \n ".join(our_keys) 76 | value_sql_array = [] 77 | for k in our_keys: 78 | if k == 'our_json_array': 79 | value_sql_array.append("%s::json[]") 80 | elif k == 'our_jsonb_array': 81 | value_sql_array.append("%s::jsonb[]") 82 | else: 83 | value_sql_array.append("%s") 84 | 85 | value_sql = ",".join(value_sql_array) 86 | 87 | insert_sql = """ INSERT INTO {} 88 | ( {} ) 89 | VALUES ( {} )""".format(quote_ident(table_name, cursor), columns_sql, value_sql) 90 | cursor.execute(insert_sql, our_values) 91 | 92 | def canonicalized_table_name(schema, table, cur): 93 | return "{}.{}".format(quote_ident(schema, cur), quote_ident(table, cur)) 94 | 95 | 96 | class PostgresFullTableRepArrays(unittest.TestCase): 97 | def tearDown(self): 98 | with db_utils.get_test_connection('dev') as conn: 99 | conn.autocommit = True 100 | 101 | def setUp(self): 102 | db_utils.ensure_environment_variables_set() 103 | 104 | db_utils.ensure_db() 105 | 106 | self.maxDiff = None 107 | 108 | with db_utils.get_test_connection('dev') as conn: 109 | conn.autocommit = True 110 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 111 | old_table = cur.execute("""SELECT EXISTS ( 112 | SELECT 1 113 | FROM information_schema.tables 114 | WHERE table_schema = %s 115 | AND table_name = %s);""", 116 | [test_schema_name, test_table_name]) 117 | old_table = cur.fetchone()[0] 118 | 119 | if old_table: 120 | cur.execute("DROP TABLE {}".format(canonicalized_table_name(test_schema_name, test_table_name, cur))) 121 | 122 | 123 | cur = conn.cursor() 124 | cur.execute(""" SELECT installed_version FROM pg_available_extensions WHERE name = 'hstore' """) 125 | if cur.fetchone()[0] is None: 126 | cur.execute(""" CREATE EXTENSION hstore; """) 127 | 128 | cur.execute(""" CREATE EXTENSION IF NOT EXISTS citext WITH SCHEMA public;""") 129 | cur.execute(""" DROP TYPE IF EXISTS ALIGNMENT CASCADE """) 130 | cur.execute(""" CREATE TYPE ALIGNMENT AS ENUM ('good', 'bad', 'ugly') """) 131 | 132 | 133 | create_table_sql = """ 134 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 135 | our_bit_array BIT(1)[], 136 | our_boolean_array BOOLEAN[], 137 | our_cidr_array CIDR[], 138 | our_citext_array CITEXT[], 139 | our_date_array DATE[], 140 | our_decimal_array NUMERIC(12,2)[], 141 | our_double_array DOUBLE PRECISION[], 142 | our_enum_array ALIGNMENT[], 143 | our_float_array FLOAT[], 144 | our_hstore_array HSTORE[], 145 | our_inet_array INET[], 146 | our_int_array INTEGER[][], 147 | our_json_array JSON[], 148 | our_jsonb_array JSONB[], 149 | our_mac_array MACADDR[], 150 | 
our_money_array MONEY[], 151 | our_real_array REAL[], 152 | our_smallint_array SMALLINT[], 153 | our_string_array VARCHAR[], 154 | our_text_array TEXT[], 155 | our_time_array TIME[], 156 | our_ts_tz_array TIMESTAMP WITH TIME ZONE[], 157 | our_uuid_array UUID[]) 158 | """.format(canonicalized_table_name(test_schema_name, test_table_name, cur)) 159 | 160 | cur.execute(create_table_sql) 161 | 162 | @staticmethod 163 | def expected_check_streams(): 164 | return { 'dev-public-postgres_full_table_replication_array_test'} 165 | 166 | @staticmethod 167 | def expected_sync_streams(): 168 | return { test_table_name } 169 | 170 | @staticmethod 171 | def expected_pks(): 172 | return { 173 | test_table_name : {'id'} 174 | } 175 | 176 | @staticmethod 177 | def tap_name(): 178 | return "tap-postgres" 179 | 180 | @staticmethod 181 | def name(): 182 | return "tap_tester_postgres_full_table_replication_arrays" 183 | 184 | @staticmethod 185 | def get_type(): 186 | return "platform.postgres" 187 | 188 | @staticmethod 189 | def get_credentials(): 190 | return {'password': os.getenv('TAP_POSTGRES_PASSWORD')} 191 | 192 | @staticmethod 193 | def get_properties(): 194 | return {'host' : os.getenv('TAP_POSTGRES_HOST'), 195 | 'dbname' : os.getenv('TAP_POSTGRES_DBNAME'), 196 | 'port' : os.getenv('TAP_POSTGRES_PORT'), 197 | 'user' : os.getenv('TAP_POSTGRES_USER'), 198 | 'default_replication_method' : 'LOG_BASED' 199 | } 200 | 201 | 202 | def test_run(self): 203 | conn_id = connections.ensure_connection(self) 204 | 205 | # run in check mode 206 | check_job_name = runner.run_check_mode(self, conn_id) 207 | 208 | # verify check exit codes 209 | exit_status = menagerie.get_exit_status(conn_id, check_job_name) 210 | menagerie.verify_check_exit_status(self, exit_status, check_job_name) 211 | 212 | # verify the tap discovered the right streams 213 | found_catalogs = [fc for fc 214 | in menagerie.get_catalogs(conn_id) 215 | if fc['tap_stream_id'] in self.expected_check_streams()] 216 | 217 | 218 | self.assertGreaterEqual(len(found_catalogs), 219 | 1, 220 | msg="unable to locate schemas for connection {}".format(conn_id)) 221 | 222 | found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs)) 223 | diff = self.expected_check_streams().symmetric_difference(found_catalog_names) 224 | self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff)) 225 | 226 | # verify that persisted streams have the correct properties 227 | test_catalog = found_catalogs[0] 228 | 229 | self.assertEqual(test_table_name, test_catalog['stream_name']) 230 | 231 | print("discovered streams are correct") 232 | additional_md = [{ "breadcrumb" : [], "metadata" : {'replication-method' : 'FULL_TABLE'}}] 233 | _ = connections.select_catalog_and_fields_via_metadata(conn_id, test_catalog, 234 | menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']), 235 | additional_md) 236 | 237 | # clear state 238 | menagerie.set_state(conn_id, {}) 239 | 240 | print("inserting a record") 241 | our_ts_tz = None 242 | our_date = None 243 | our_uuid = str(uuid.uuid1()) 244 | with db_utils.get_test_connection('dev') as conn: 245 | conn.autocommit = True 246 | with conn.cursor() as cur: 247 | #insert fixture data 2 248 | 249 | #insert fixture data 1 250 | our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184) 251 | nyc_tz = pytz.timezone('America/New_York') 252 | our_ts_tz = nyc_tz.localize(our_ts) 253 | our_date = datetime.date(1998, 3, 4) 254 | 255 | self.rec_1 = { 256 | 'our_bit_array' : '{{0,1,1}}', 257 | 
'our_boolean_array' : '{true}', 258 | 'our_cidr_array' : '{{192.168.100.128/25}}', 259 | 'our_citext_array' : '{{maGICKal 2}}', 260 | 'our_date_array' : '{{{}}}'.format(our_date), 261 | 'our_decimal_array' : '{{{}}}'.format(decimal.Decimal('1234567890.01')), 262 | 'our_double_array' : '{{1.232323}}', 263 | 'our_enum_array' : '{{bad}}', 264 | 'our_float_array' : '{{5.23}}', 265 | 'our_hstore_array' : """{{"size=>small","name=>betty"}}""", 266 | 'our_inet_array' : '{{192.168.100.128/24}}', 267 | 'our_int_array' : '{{1,2,3},{4,5,6}}', 268 | 'our_json_array' : [psycopg2.extras.Json({'secret' : 55})], 269 | 'our_jsonb_array' : [psycopg2.extras.Json({'secret' : 69})], 270 | 'our_mac_array' : '{{08:00:2b:01:02:03}}', 271 | 'our_money_array' : '{{$412.1234}}', 272 | 'our_real_array' : '{{76.33}}', 273 | 'our_smallint_array' : '{{10,20,30},{40,50,60}}', 274 | 'our_string_array' : '{{one string, two strings}}', 275 | 'our_text_array' : '{{three string, four}}', 276 | 'our_time_array' : '{{03:04:05}}', 277 | 'our_ts_tz_array' : '{{{}}}'.format(our_ts_tz), 278 | 'our_uuid_array' : '{{{}}}'.format(our_uuid)} 279 | 280 | 281 | insert_record(cur, test_table_name, self.rec_1) 282 | 283 | 284 | sync_job_name = runner.run_sync_mode(self, conn_id) 285 | 286 | # verify tap and target exit codes 287 | exit_status = menagerie.get_exit_status(conn_id, sync_job_name) 288 | menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) 289 | 290 | record_count_by_stream = runner.examine_target_output_file(self, 291 | conn_id, 292 | self.expected_sync_streams(), 293 | self.expected_pks()) 294 | self.assertEqual(record_count_by_stream, { test_table_name: 1 }) 295 | records_by_stream = runner.get_records_from_target_output() 296 | self.assertTrue(len(records_by_stream) > 0) 297 | 298 | for stream, recs in records_by_stream.items(): 299 | # verify the persisted schema was correct 300 | self.assertEqual(recs['schema'], 301 | expected_schemas[stream], 302 | msg="Persisted schema did not match expected schema for stream `{}`.".format(stream)) 303 | 304 | self.assertEqual(3, len(records_by_stream[test_table_name]['messages'])) 305 | self.assertEqual(records_by_stream[test_table_name]['messages'][0]['action'], 306 | 'activate_version') 307 | self.assertEqual(records_by_stream[test_table_name]['messages'][1]['action'], 308 | 'upsert') 309 | self.assertEqual(records_by_stream[test_table_name]['messages'][2]['action'], 310 | 'activate_version') 311 | actual_record_1 = records_by_stream[test_table_name]['messages'][1]['data'] 312 | 313 | expected_inserted_record = {'id': 1, 314 | 'our_bit_array' : [[False, True, True]], 315 | 'our_boolean_array' : [True], 316 | 'our_cidr_array' : [['192.168.100.128/25']], 317 | 'our_citext_array' : [['maGICKal 2']], 318 | 'our_date_array' : ['1998-03-04T00:00:00+00:00'], 319 | 'our_decimal_array' : [decimal.Decimal('1234567890.01')], 320 | 'our_double_array' : [[decimal.Decimal('1.232323')]], 321 | 'our_enum_array' : [['bad']], 322 | 'our_float_array' : [[decimal.Decimal('5.23')]], 323 | 'our_hstore_array' : [[{'size' : 'small' }, {'name' : 'betty'} ]], 324 | 'our_inet_array' : [['192.168.100.128/24']], 325 | 'our_int_array' : [[1,2,3],[4,5,6]], 326 | 'our_json_array' : [json.dumps({'secret' : 55})], 327 | 'our_jsonb_array' : [json.dumps({'secret' : 69})], 328 | 'our_mac_array' : [['08:00:2b:01:02:03']], 329 | 'our_money_array' : [['$412.12']], 330 | 'our_real_array' : [[decimal.Decimal('76.33')]], 331 | 'our_smallint_array' : [[10,20,30],[40,50,60]], 332 | 'our_string_array' : [['one 
string', 'two strings']], 333 | 'our_text_array' : [['three string', 'four']], 334 | 'our_time_array' : [['03:04:05']], 335 | 'our_ts_tz_array' : ['1997-02-02T07:02:02.722184+00:00'], 336 | 'our_uuid_array' : ['{}'.format(our_uuid)] 337 | 338 | } 339 | 340 | self.assertEqual(set(actual_record_1.keys()), set(expected_inserted_record.keys()), 341 | msg="keys for expected_record_1 are wrong: {}".format(set(actual_record_1.keys()).symmetric_difference(set(expected_inserted_record.keys())))) 342 | 343 | for k in actual_record_1.keys(): 344 | self.assertEqual(actual_record_1[k], expected_inserted_record[k], msg="{} != {} for key {}".format(actual_record_1[k], expected_inserted_record[k], k)) 345 | 346 | print("inserted record is correct") 347 | 348 | # verify state and bookmarks 349 | state = menagerie.get_state(conn_id) 350 | 351 | bookmark = state['bookmarks']['dev-public-postgres_full_table_replication_array_test'] 352 | self.assertIsNone(state['currently_syncing'], msg="expected state's currently_syncing to be None") 353 | 354 | self.assertIsNone(bookmark.get('lsn'), 355 | msg="expected bookmark for stream to have NO lsn because we are using full-table replication") 356 | -------------------------------------------------------------------------------- /tap_postgres/sync_strategies/logical_replication.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # pylint: disable=missing-docstring,not-an-iterable,too-many-locals,too-many-arguments,invalid-name,too-many-return-statements,too-many-branches,len-as-condition,too-many-nested-blocks,wrong-import-order,duplicate-code, anomalous-backslash-in-string, too-many-statements, singleton-comparison, consider-using-in 3 | 4 | import singer 5 | import datetime 6 | import decimal 7 | from singer import utils, get_bookmark 8 | import singer.metadata as metadata 9 | import tap_postgres.db as post_db 10 | import tap_postgres.sync_strategies.common as sync_common 11 | from dateutil.parser import parse 12 | import psycopg2 13 | from psycopg2 import sql 14 | import copy 15 | from select import select 16 | from functools import reduce 17 | import json 18 | import re 19 | 20 | LOGGER = singer.get_logger() 21 | 22 | UPDATE_BOOKMARK_PERIOD = 1000 23 | 24 | def get_pg_version(cur): 25 | cur.execute("SELECT version()") 26 | res = cur.fetchone()[0] 27 | version_match = re.match('PostgreSQL (\d+)', res) 28 | if not version_match: 29 | raise Exception('unable to determine PostgreSQL version from {}'.format(res)) 30 | 31 | version = int(version_match.group(1)) 32 | LOGGER.info("Detected PostgreSQL version: %s", version) 33 | return version 34 | 35 | def fetch_current_lsn(conn_config): 36 | with post_db.open_connection(conn_config, False) as conn: 37 | with conn.cursor() as cur: 38 | version = get_pg_version(cur) 39 | if version == 9: 40 | cur.execute("SELECT pg_current_xlog_location()") 41 | elif version > 9: 42 | cur.execute("SELECT pg_current_wal_lsn()") 43 | else: 44 | raise Exception('unable to fetch current lsn for PostgreSQL version {}'.format(version)) 45 | 46 | current_lsn = cur.fetchone()[0] 47 | file, index = current_lsn.split('/') 48 | return (int(file, 16) << 32) + int(index, 16) # a textual LSN like '16/B374D848' is two hex halves: high 32 bits before the slash, low 32 bits after 49 | 50 | def add_automatic_properties(stream, conn_config): 51 | stream['schema']['properties']['_sdc_deleted_at'] = {'type' : ['null', 'string'], 'format' :'date-time'} 52 | if conn_config.get('debug_lsn'): 53 | LOGGER.info('debug_lsn is ON') 54 | stream['schema']['properties']['_sdc_lsn'] = {'type' : ['null',
'string']} 55 | else: 56 | LOGGER.info('debug_lsn is OFF') 57 | 58 | return stream 59 | 60 | def get_stream_version(tap_stream_id, state): 61 | stream_version = singer.get_bookmark(state, tap_stream_id, 'version') 62 | 63 | if stream_version is None: 64 | raise Exception("version not found for log miner {}".format(tap_stream_id)) 65 | 66 | return stream_version 67 | 68 | def tuples_to_map(accum, t): 69 | accum[t[0]] = t[1] 70 | return accum 71 | 72 | def create_hstore_elem_query(elem): 73 | return sql.SQL("SELECT hstore_to_array({})").format(sql.Literal(elem)) 74 | 75 | def create_hstore_elem(conn_info, elem): 76 | with post_db.open_connection(conn_info) as conn: 77 | with conn.cursor() as cur: 78 | query = create_hstore_elem_query(elem) 79 | cur.execute(query) 80 | res = cur.fetchone()[0] 81 | hstore_elem = reduce(tuples_to_map, [res[i:i + 2] for i in range(0, len(res), 2)], {}) 82 | return hstore_elem 83 | 84 | def create_array_elem(elem, sql_datatype, conn_info): 85 | if elem is None: 86 | return None 87 | 88 | with post_db.open_connection(conn_info) as conn: 89 | with conn.cursor() as cur: 90 | if sql_datatype == 'bit[]': 91 | cast_datatype = 'boolean[]' 92 | elif sql_datatype == 'boolean[]': 93 | cast_datatype = 'boolean[]' 94 | elif sql_datatype == 'character varying[]': 95 | cast_datatype = 'character varying[]' 96 | elif sql_datatype == 'cidr[]': 97 | cast_datatype = 'cidr[]' 98 | elif sql_datatype == 'citext[]': 99 | cast_datatype = 'text[]' 100 | elif sql_datatype == 'date[]': 101 | cast_datatype = 'text[]' 102 | elif sql_datatype == 'double precision[]': 103 | cast_datatype = 'double precision[]' 104 | elif sql_datatype == 'hstore[]': 105 | cast_datatype = 'text[]' 106 | elif sql_datatype == 'integer[]': 107 | cast_datatype = 'integer[]' 108 | elif sql_datatype == 'bigint[]': 109 | cast_datatype = 'bigint[]' 110 | elif sql_datatype == 'inet[]': 111 | cast_datatype = 'inet[]' 112 | elif sql_datatype == 'json[]': 113 | cast_datatype = 'text[]' 114 | elif sql_datatype == 'jsonb[]': 115 | cast_datatype = 'text[]' 116 | elif sql_datatype == 'macaddr[]': 117 | cast_datatype = 'macaddr[]' 118 | elif sql_datatype == 'money[]': 119 | cast_datatype = 'text[]' 120 | elif sql_datatype == 'numeric[]': 121 | cast_datatype = 'text[]' 122 | elif sql_datatype == 'real[]': 123 | cast_datatype = 'real[]' 124 | elif sql_datatype == 'smallint[]': 125 | cast_datatype = 'smallint[]' 126 | elif sql_datatype == 'text[]': 127 | cast_datatype = 'text[]' 128 | elif sql_datatype in ('time without time zone[]', 'time with time zone[]'): 129 | cast_datatype = 'text[]' 130 | elif sql_datatype in ('timestamp with time zone[]', 'timestamp without time zone[]'): 131 | cast_datatype = 'text[]' 132 | elif sql_datatype == 'uuid[]': 133 | cast_datatype = 'text[]' 134 | 135 | else: 136 | #custom datatypes like enums 137 | cast_datatype = 'text[]' 138 | 139 | sql_stmt = """SELECT $stitch_quote${}$stitch_quote$::{}""".format(elem, cast_datatype) 140 | cur.execute(sql_stmt) 141 | res = cur.fetchone()[0] 142 | return res 143 | 144 | #pylint: disable=too-many-branches,too-many-nested-blocks 145 | def selected_value_to_singer_value_impl(elem, og_sql_datatype, conn_info): 146 | sql_datatype = og_sql_datatype.replace('[]', '') 147 | 148 | if elem is None: 149 | return elem 150 | if sql_datatype == 'timestamp without time zone': 151 | return parse(elem).isoformat() + '+00:00' 152 | if sql_datatype == 'timestamp with time zone': 153 | if isinstance(elem, datetime.datetime): 154 | return elem.isoformat() 155 | 156 | return 
parse(elem).isoformat() 157 | if sql_datatype == 'date': 158 | if isinstance(elem, datetime.date): 159 | #logical replication gives us dates as strings UNLESS they come from an array 160 | return elem.isoformat() + 'T00:00:00+00:00' 161 | return parse(elem).isoformat() + "+00:00" 162 | if sql_datatype == 'time with time zone': 163 | return parse(elem).isoformat().split('T')[1] 164 | if sql_datatype == 'bit': 165 | #for arrays, elem will == True 166 | #for ordinary bits, elem will == '1' 167 | return elem == '1' or elem == True 168 | if sql_datatype == 'boolean': 169 | return elem 170 | if sql_datatype == 'hstore': 171 | return create_hstore_elem(conn_info, elem) 172 | if 'numeric' in sql_datatype: 173 | return decimal.Decimal(str(elem)) 174 | if isinstance(elem, int): 175 | return elem 176 | if isinstance(elem, float): 177 | return elem 178 | if isinstance(elem, str): 179 | return elem 180 | 181 | raise Exception("do not know how to marshal value of type {}".format(elem.__class__)) 182 | 183 | def selected_array_to_singer_value(elem, sql_datatype, conn_info): 184 | if isinstance(elem, list): 185 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype, conn_info), elem)) 186 | 187 | return selected_value_to_singer_value_impl(elem, sql_datatype, conn_info) 188 | 189 | def selected_value_to_singer_value(elem, sql_datatype, conn_info): 190 | #are we dealing with an array? 191 | if sql_datatype.find('[]') > 0: 192 | cleaned_elem = create_array_elem(elem, sql_datatype, conn_info) 193 | return list(map(lambda elem: selected_array_to_singer_value(elem, sql_datatype, conn_info), (cleaned_elem or []))) 194 | 195 | return selected_value_to_singer_value_impl(elem, sql_datatype, conn_info) 196 | 197 | def row_to_singer_message(stream, row, version, columns, time_extracted, md_map, conn_info): 198 | row_to_persist = () 199 | md_map[('properties', '_sdc_deleted_at')] = {'sql-datatype' : 'timestamp with time zone'} 200 | md_map[('properties', '_sdc_lsn')] = {'sql-datatype' : "character varying"} 201 | 202 | for idx, elem in enumerate(row): 203 | sql_datatype = md_map.get(('properties', columns[idx])).get('sql-datatype') 204 | 205 | if not sql_datatype: 206 | LOGGER.info("No sql-datatype found for stream %s: %s", stream, columns[idx]) 207 | raise Exception("Unable to find sql-datatype for stream {}".format(stream)) 208 | 209 | cleaned_elem = selected_value_to_singer_value(elem, sql_datatype, conn_info) 210 | row_to_persist += (cleaned_elem,) 211 | 212 | rec = dict(zip(columns, row_to_persist)) 213 | 214 | return singer.RecordMessage( 215 | stream=post_db.calculate_destination_stream_name(stream, md_map), 216 | record=rec, 217 | version=version, 218 | time_extracted=time_extracted) 219 | 220 | def consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn): 221 | ## Action Types: 222 | # I = Insert 223 | # U = Update 224 | # D = Delete 225 | # B = Begin Transaction 226 | # C = Commit Transaction 227 | # M = Message 228 | # T = Truncate 229 | action = payload['action'] 230 | if action not in ['U', 'I', 'D']: 231 | LOGGER.debug("Skipping message of type %s", action) 232 | yield None 233 | else: 234 | tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], payload['schema'], payload['table']) 235 | if streams_lookup.get(tap_stream_id) is None: 236 | yield None 237 | else: 238 | target_stream = streams_lookup[tap_stream_id] 239 | stream_version = get_stream_version(target_stream['tap_stream_id'], state) 240 | stream_md_map =
metadata.to_map(target_stream['metadata']) 241 | 242 | desired_columns = [col for col in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(stream_md_map, col)] 243 | 244 | col_names = [] 245 | col_vals = [] 246 | if payload['action'] in ['I', 'U']: 247 | for column in payload['columns']: 248 | if column['name'] in set(desired_columns): 249 | col_names.append(column['name']) 250 | col_vals.append(column['value']) 251 | 252 | col_names = col_names + ['_sdc_deleted_at'] 253 | col_vals = col_vals + [None] 254 | 255 | if conn_info.get('debug_lsn'): 256 | col_names = col_names + ['_sdc_lsn'] 257 | col_vals = col_vals + [str(lsn)] 258 | 259 | elif payload['action'] == 'D': 260 | for column in payload['identity']: 261 | if column['name'] in set(desired_columns): 262 | col_names.append(column['name']) 263 | col_vals.append(column['value']) 264 | 265 | col_names = col_names + ['_sdc_deleted_at'] 266 | col_vals = col_vals + [singer.utils.strftime(singer.utils.strptime_to_utc(payload['timestamp']))] 267 | 268 | if conn_info.get('debug_lsn'): 269 | col_vals = col_vals + [str(lsn)] 270 | col_names = col_names + ['_sdc_lsn'] 271 | 272 | # Yield 1 record to match the API of V1 273 | yield row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 274 | 275 | state = singer.write_bookmark(state, 276 | target_stream['tap_stream_id'], 277 | 'lsn', 278 | lsn) 279 | 280 | # message-format v1 281 | def consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn): 282 | for c in payload['change']: 283 | tap_stream_id = post_db.compute_tap_stream_id(conn_info['dbname'], c['schema'], c['table']) 284 | if streams_lookup.get(tap_stream_id) is None: 285 | continue 286 | 287 | target_stream = streams_lookup[tap_stream_id] 288 | stream_version = get_stream_version(target_stream['tap_stream_id'], state) 289 | stream_md_map = metadata.to_map(target_stream['metadata']) 290 | 291 | 292 | desired_columns = [c for c in target_stream['schema']['properties'].keys() if sync_common.should_sync_column(stream_md_map, c)] 293 | 294 | if c['kind'] == 'insert': 295 | col_names = [] 296 | col_vals = [] 297 | for idx, col in enumerate(c['columnnames']): 298 | if col in set(desired_columns): 299 | col_names.append(col) 300 | col_vals.append(c['columnvalues'][idx]) 301 | 302 | col_names = col_names + ['_sdc_deleted_at'] 303 | col_vals = col_vals + [None] 304 | if conn_info.get('debug_lsn'): 305 | col_names = col_names + ['_sdc_lsn'] 306 | col_vals = col_vals + [str(lsn)] 307 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 308 | 309 | elif c['kind'] == 'update': 310 | col_names = [] 311 | col_vals = [] 312 | for idx, col in enumerate(c['columnnames']): 313 | if col in set(desired_columns): 314 | col_names.append(col) 315 | col_vals.append(c['columnvalues'][idx]) 316 | 317 | col_names = col_names + ['_sdc_deleted_at'] 318 | col_vals = col_vals + [None] 319 | 320 | if conn_info.get('debug_lsn'): 321 | col_vals = col_vals + [str(lsn)] 322 | col_names = col_names + ['_sdc_lsn'] 323 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 324 | 325 | elif c['kind'] == 'delete': 326 | col_names = [] 327 | col_vals = [] 328 | for idx, col in enumerate(c['oldkeys']['keynames']): 329 | if col in set(desired_columns): 330 | col_names.append(col) 331 | 
col_vals.append(c['oldkeys']['keyvalues'][idx]) 332 | 333 | 334 | col_names = col_names + ['_sdc_deleted_at'] 335 | col_vals = col_vals + [singer.utils.strftime(time_extracted)] 336 | if conn_info.get('debug_lsn'): 337 | col_vals = col_vals + [str(lsn)] 338 | col_names = col_names + ['_sdc_lsn'] 339 | record_message = row_to_singer_message(target_stream, col_vals, stream_version, col_names, time_extracted, stream_md_map, conn_info) 340 | 341 | else: 342 | raise Exception("unrecognized replication operation: {}".format(c['kind'])) 343 | 344 | 345 | yield record_message 346 | state = singer.write_bookmark(state, 347 | target_stream['tap_stream_id'], 348 | 'lsn', 349 | lsn) 350 | 351 | 352 | def consume_message(streams, state, msg, time_extracted, conn_info, end_lsn, message_format="1"): 353 | payload = json.loads(msg.payload) 354 | lsn = msg.data_start 355 | 356 | streams_lookup = {s['tap_stream_id']: s for s in streams} 357 | 358 | if message_format == "1": 359 | records = consume_message_format_1(payload, conn_info, streams_lookup, state, time_extracted, lsn) 360 | elif message_format == "2": 361 | records = consume_message_format_2(payload, conn_info, streams_lookup, state, time_extracted, lsn) 362 | else: 363 | raise Exception("Unknown wal2json message format version: {}".format(message_format)) 364 | 365 | for record_message in records: 366 | if record_message: 367 | singer.write_message(record_message) 368 | # Pulled out of refactor so we send a keep-alive per-record 369 | LOGGER.debug("sending feedback to server with NO flush_lsn. just a keep-alive") 370 | msg.cursor.send_feedback() 371 | 372 | LOGGER.debug("sending feedback to server. flush_lsn = %s", msg.data_start) 373 | if msg.data_start > end_lsn: 374 | raise Exception("incorrectly attempting to flush an lsn({}) > end_lsn({})".format(msg.data_start, end_lsn)) 375 | 376 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 377 | 378 | 379 | return state 380 | 381 | def locate_replication_slot(conn_info): 382 | with post_db.open_connection(conn_info, False) as conn: 383 | with conn.cursor() as cur: 384 | db_specific_slot = "stitch_{}".format(conn_info['dbname']) 385 | cur.execute("SELECT * FROM pg_replication_slots WHERE slot_name = %s AND plugin = %s", (db_specific_slot, 'wal2json')) 386 | if len(cur.fetchall()) == 1: 387 | LOGGER.info("using pg_replication_slot %s", db_specific_slot) 388 | return db_specific_slot 389 | 390 | 391 | cur.execute("SELECT * FROM pg_replication_slots WHERE slot_name = 'stitch' AND plugin = 'wal2json'") 392 | if len(cur.fetchall()) == 1: 393 | LOGGER.info("using pg_replication_slot 'stitch'") 394 | return 'stitch' 395 | 396 | raise Exception("Unable to find replication slot (stitch || {}) with wal2json".format(db_specific_slot)) 397 | 398 | 399 | def sync_tables(conn_info, logical_streams, state, end_lsn): 400 | start_lsn = min([get_bookmark(state, s['tap_stream_id'], 'lsn') for s in logical_streams]) 401 | time_extracted = utils.now() 402 | slot = locate_replication_slot(conn_info) 403 | last_lsn_processed = None 404 | poll_total_seconds = conn_info['logical_poll_total_seconds'] or 60 * 30 #we are willing to poll for a total of 30 minutes without finding a record 405 | keep_alive_time = 10.0 406 | begin_ts = datetime.datetime.now() 407 | 408 | for s in logical_streams: 409 | sync_common.send_schema_message(s, ['lsn']) 410 | 411 | with post_db.open_connection(conn_info, True) as conn: 412 | with conn.cursor() as cur: 413 | LOGGER.info("Starting Logical Replication for %s(%s): %s -> %s.
poll_total_seconds: %s", list(map(lambda s: s['tap_stream_id'], logical_streams)), slot, start_lsn, end_lsn, poll_total_seconds) 414 | 415 | replication_params = {"slot_name": slot, 416 | "decode": True, 417 | "start_lsn": start_lsn} 418 | message_format = conn_info.get("wal2json_message_format") or "1" 419 | if message_format == "2": 420 | LOGGER.info("Using wal2json format-version 2") 421 | replication_params["options"] = {"format-version": 2, "include-timestamp": True} 422 | 423 | try: 424 | cur.start_replication(**replication_params) 425 | except psycopg2.ProgrammingError: 426 | raise Exception("unable to start replication with logical replication slot {}".format(slot)) 427 | 428 | rows_saved = 0 429 | while True: 430 | poll_duration = (datetime.datetime.now() - begin_ts).total_seconds() 431 | if poll_duration > poll_total_seconds: 432 | LOGGER.info("breaking after %s seconds of polling with no data", poll_duration) 433 | break 434 | 435 | msg = cur.read_message() 436 | if msg: 437 | begin_ts = datetime.datetime.now() 438 | if msg.data_start > end_lsn: 439 | LOGGER.info("gone past end_lsn %s for run. breaking", end_lsn) 440 | break 441 | 442 | state = consume_message(logical_streams, state, msg, time_extracted, 443 | conn_info, end_lsn, message_format=message_format) 444 | #msg has been consumed. it has been processed 445 | last_lsn_processed = msg.data_start 446 | rows_saved = rows_saved + 1 447 | if rows_saved % UPDATE_BOOKMARK_PERIOD == 0: 448 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 449 | else: 450 | now = datetime.datetime.now() 451 | timeout = keep_alive_time - (now - cur.io_timestamp).total_seconds() 452 | try: 453 | sel = select([cur], [], [], max(0, timeout)) 454 | if not any(sel): 455 | LOGGER.info("no data for %s seconds. sending feedback to server with NO flush_lsn. 
just a keep-alive", timeout) 456 | cur.send_feedback() 457 | 458 | except InterruptedError: 459 | pass # recalculate timeout and continue 460 | 461 | if last_lsn_processed: 462 | for s in logical_streams: 463 | LOGGER.info("updating bookmark for stream %s to last_lsn_processed %s", s['tap_stream_id'], last_lsn_processed) 464 | state = singer.write_bookmark(state, s['tap_stream_id'], 'lsn', last_lsn_processed) 465 | 466 | singer.write_message(singer.StateMessage(value=copy.deepcopy(state))) 467 | return state 468 | -------------------------------------------------------------------------------- /tests/test_postgres_discovery.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import unittest 4 | import decimal 5 | import uuid 6 | import json 7 | 8 | from psycopg2.extensions import quote_ident 9 | import psycopg2.extras 10 | import pytz 11 | import tap_tester.connections as connections 12 | import tap_tester.menagerie as menagerie 13 | import tap_tester.runner as runner 14 | 15 | import db_utils # pylint: disable=import-error 16 | 17 | 18 | test_schema_name = "public" 19 | test_table_name = "postgres_discovery_test" 20 | test_db = "discovery1" 21 | 22 | 23 | class PostgresDiscovery(unittest.TestCase): 24 | AUTOMATIC_FIELDS = "automatic" 25 | REPLICATION_KEYS = "valid-replication-keys" 26 | PRIMARY_KEYS = "table-key-properties" 27 | FOREIGN_KEYS = "table-foreign-key-properties" 28 | REPLICATION_METHOD = "forced-replication-method" 29 | API_LIMIT = "max-row-limit" 30 | INCREMENTAL = "INCREMENTAL" 31 | FULL_TABLE = "FULL_TABLE" 32 | LOG_BASED = "LOG_BASED" 33 | 34 | UNSUPPORTED_TYPES = { 35 | "BIGSERIAL", 36 | "BIT VARYING", 37 | "BOX", 38 | "BYTEA", 39 | "CIRCLE", 40 | "INTERVAL", 41 | "LINE", 42 | "LSEG", 43 | "PATH", 44 | "PG_LSN", 45 | "POINT", 46 | "POLYGON", 47 | "SERIAL", 48 | "SMALLSERIAL", 49 | "TSQUERY", 50 | "TSVECTOR", 51 | "TXID_SNAPSHOT", 52 | "XML", 53 | } 54 | default_replication_method = "" 55 | 56 | def tearDown(self): 57 | pass 58 | # with db_utils.get_test_connection(test_db) as conn: 59 | # conn.autocommit = True 60 | # with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 61 | # cur.execute(""" SELECT pg_drop_replication_slot('stitch') """) 62 | 63 | def setUp(self): 64 | db_utils.ensure_environment_variables_set() 65 | 66 | db_utils.ensure_db(test_db) 67 | self.maxDiff = None 68 | 69 | with db_utils.get_test_connection(test_db) as conn: 70 | conn.autocommit = True 71 | with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: 72 | 73 | # db_utils.ensure_replication_slot(cur, test_db) 74 | 75 | canonicalized_table_name = db_utils.canonicalized_table_name(cur, test_schema_name, test_table_name) 76 | 77 | create_table_sql = """ 78 | CREATE TABLE {} (id SERIAL PRIMARY KEY, 79 | our_varchar VARCHAR, 80 | our_varchar_10 VARCHAR(10), 81 | our_text TEXT, 82 | our_text_2 TEXT, 83 | our_integer INTEGER, 84 | our_smallint SMALLINT, 85 | our_bigint BIGINT, 86 | our_decimal NUMERIC(12,2), 87 | "OUR TS" TIMESTAMP WITHOUT TIME ZONE, 88 | "OUR TS TZ" TIMESTAMP WITH TIME ZONE, 89 | "OUR TIME" TIME WITHOUT TIME ZONE, 90 | "OUR TIME TZ" TIME WITH TIME ZONE, 91 | "OUR DATE" DATE, 92 | our_double DOUBLE PRECISION, 93 | our_real REAL, 94 | our_boolean BOOLEAN, 95 | our_bit BIT(1), 96 | our_json JSON, 97 | our_jsonb JSONB, 98 | our_uuid UUID, 99 | our_store HSTORE, 100 | our_citext CITEXT, 101 | our_cidr cidr, 102 | our_inet inet, 103 | our_mac macaddr, 104 | our_alignment_enum ALIGNMENT, 105 | 
our_money money, 106 | invalid_bigserial BIGSERIAL, 107 | invalid_bit_varying BIT VARYING, 108 | invalid_box BOX, 109 | invalid_bytea BYTEA, 110 | invalid_circle CIRCLE, 111 | invalid_interval INTERVAL, 112 | invalid_line LINE, 113 | invalid_lseg LSEG, 114 | invalid_path PATH, 115 | invalid_pg_lsn PG_LSN, 116 | invalid_point POINT, 117 | invalid_polygon POLYGON, 118 | invalid_serial SERIAL, 119 | invalid_smallserial SMALLSERIAL, 120 | invalid_tsquery TSQUERY, 121 | invalid_tsvector TSVECTOR, 122 | invalid_txid_snapshot TXID_SNAPSHOT, 123 | invalid_xml XML) 124 | """.format(canonicalized_table_name) 125 | 126 | cur = db_utils.ensure_fresh_table(conn, cur, test_schema_name, test_table_name) 127 | cur.execute(create_table_sql) 128 | 129 | #insert fixture data 1 130 | our_ts = datetime.datetime(1997, 2, 2, 2, 2, 2, 722184) 131 | nyc_tz = pytz.timezone('America/New_York') 132 | our_ts_tz = nyc_tz.localize(our_ts) 133 | our_time = datetime.time(12,11,10) 134 | our_time_tz = our_time.isoformat() + "-04:00" 135 | our_date = datetime.date(1998, 3, 4) 136 | my_uuid = str(uuid.uuid1()) 137 | 138 | self.recs = [] 139 | for _ in range(500): 140 | our_ts = datetime.datetime(1987, 3, 3, 3, 3, 3, 733184) 141 | nyc_tz = pytz.timezone('America/New_York') 142 | our_ts_tz = nyc_tz.localize(our_ts) 143 | our_time = datetime.time(10,9,8) 144 | our_time_tz = our_time.isoformat() + "-04:00" 145 | our_date = datetime.date(1964, 7, 1) 146 | my_uuid = str(uuid.uuid1()) 147 | 148 | record = {'our_varchar' : "our_varchar 4", 149 | 'our_varchar_10' : "varchar_10", 150 | 'our_text' : "some text 2", 151 | 'our_text_2' : "NOT SELECTED", 152 | 'our_integer' : 44101, 153 | 'our_smallint' : 2, 154 | 'our_bigint' : 1000001, 155 | 'our_decimal' : decimal.Decimal('9876543210.02'), 156 | quote_ident('OUR TS', cur) : our_ts, 157 | quote_ident('OUR TS TZ', cur) : our_ts_tz, 158 | quote_ident('OUR TIME', cur) : our_time, 159 | quote_ident('OUR TIME TZ', cur) : our_time_tz, 160 | quote_ident('OUR DATE', cur) : our_date, 161 | 'our_double' : 1.1, 162 | 'our_real' : 1.2, 163 | 'our_boolean' : True, 164 | 'our_bit' : '1', 165 | 'our_json' : json.dumps({'nymn' : 77}), 166 | 'our_jsonb' : json.dumps({'burgers' : 'good++'}), 167 | 'our_uuid' : my_uuid, 168 | 'our_store' : 'dances=>"floor",name=>"betty"', 169 | 'our_citext': 'maGICKal 2', 170 | 'our_cidr' : '192.168.101.128/25', 171 | 'our_inet': '192.168.101.128/24', 172 | 'our_mac' : '08:00:2b:01:02:04', 173 | } 174 | 175 | db_utils.insert_record(cur, test_table_name, record) 176 | self.recs.append(record) 177 | 178 | cur.execute("""ANALYZE {}""".format(canonicalized_table_name)) 179 | 180 | @staticmethod 181 | def expected_check_streams(): 182 | return { 'postgres_discovery_test'} 183 | 184 | def expected_check_stream_ids(self): 185 | """A set of expected table names in format""" 186 | check_streams = self.expected_check_streams() 187 | return {"{}-{}-{}".format(test_db, test_schema_name, stream) for stream in check_streams} 188 | 189 | @staticmethod 190 | def expected_primary_keys(): 191 | return { 192 | 'postgres_discovery_test' : {'id'} 193 | } 194 | 195 | @staticmethod 196 | def expected_unsupported_fields(): 197 | return { 198 | 'invalid_bigserial', 199 | 'invalid_bit_varying', 200 | 'invalid_box', 201 | 'invalid_bytea', 202 | 'invalid_circle', 203 | 'invalid_interval', 204 | 'invalid_line', 205 | 'invalid_lseg', 206 | 'invalid_path', 207 | 'invalid_pg_lsn', 208 | 'invalid_point', 209 | 'invalid_polygon', 210 | 'invalid_serial', 211 | 'invalid_smallserial', 212 | 'invalid_tsquery', 
213 |             'invalid_tsvector',
214 |             'invalid_txid_snapshot',
215 |             'invalid_xml',
216 |         }
217 |     @staticmethod
218 |     def expected_schema_types():
219 |         return {
220 |             'id': 'integer', # 'serial primary key',
221 |             'our_varchar': 'character varying', # 'varchar'
222 |             'our_varchar_10': 'character varying', # 'varchar(10)',
223 |             'our_text': 'text',
224 |             'our_text_2': 'text',
225 |             'our_integer': 'integer',
226 |             'our_smallint': 'smallint',
227 |             'our_bigint': 'bigint',
228 |             'our_decimal': 'numeric',
229 |             'OUR TS': 'timestamp without time zone',
230 |             'OUR TS TZ': 'timestamp with time zone',
231 |             'OUR TIME': 'time without time zone',
232 |             'OUR TIME TZ': 'time with time zone',
233 |             'OUR DATE': 'date',
234 |             'our_double': 'double precision',
235 |             'our_real': 'real',
236 |             'our_boolean': 'boolean',
237 |             'our_bit': 'bit',
238 |             'our_json': 'json',
239 |             'our_jsonb': 'jsonb',
240 |             'our_uuid': 'uuid',
241 |             'our_store': 'hstore',
242 |             'our_citext': 'citext',
243 |             'our_cidr': 'cidr',
244 |             'our_inet': 'inet',
245 |             'our_mac': 'macaddr',
246 |             'our_alignment_enum': 'alignment',
247 |             'our_money': 'money',
248 |             'invalid_bigserial': 'bigint',
249 |             'invalid_bit_varying': 'bit varying',
250 |             'invalid_box': 'box',
251 |             'invalid_bytea': 'bytea',
252 |             'invalid_circle': 'circle',
253 |             'invalid_interval': 'interval',
254 |             'invalid_line': 'line',
255 |             'invalid_lseg': 'lseg',
256 |             'invalid_path': 'path',
257 |             'invalid_pg_lsn': 'pg_lsn',
258 |             'invalid_point': 'point',
259 |             'invalid_polygon': 'polygon',
260 |             'invalid_serial': 'integer',
261 |             'invalid_smallserial': 'smallint',
262 |             'invalid_tsquery': 'tsquery',
263 |             'invalid_tsvector': 'tsvector',
264 |             'invalid_txid_snapshot': 'txid_snapshot',
265 |             'invalid_xml': 'xml',
266 |         }
267 | 
268 |     @staticmethod
269 |     def tap_name():
270 |         return "tap-postgres"
271 | 
272 |     @staticmethod
273 |     def name():
274 |         return "tap_tester_postgres_discovery"
275 | 
276 |     @staticmethod
277 |     def get_type():
278 |         return "platform.postgres"
279 | 
280 |     @staticmethod
281 |     def get_credentials():
282 |         return {'password': os.getenv('TAP_POSTGRES_PASSWORD')}
283 | 
284 |     def get_properties(self, original_properties=True):
285 |         return_value = {
286 |             'host' : os.getenv('TAP_POSTGRES_HOST'),
287 |             'dbname' : os.getenv('TAP_POSTGRES_DBNAME'),
288 |             'port' : os.getenv('TAP_POSTGRES_PORT'),
289 |             'user' : os.getenv('TAP_POSTGRES_USER'),
290 |             'default_replication_method' : self.FULL_TABLE,
291 |             'filter_dbs' : 'discovery1'
292 |         }
293 |         if not original_properties:
294 |             if self.default_replication_method == self.LOG_BASED:
295 |                 return_value['wal2json_message_format'] = '1'
296 | 
297 |             return_value['default_replication_method'] = self.default_replication_method
298 | 
299 |         return return_value
300 | 
301 |     def test_run(self):
302 |         """Parametrized discovery test running against each replication method."""
303 | 
304 |         self.default_replication_method = self.FULL_TABLE
305 |         full_table_conn_id = connections.ensure_connection(self, original_properties=False)
306 |         self.discovery_test(full_table_conn_id)
307 | 
308 |         self.default_replication_method = self.INCREMENTAL
309 |         incremental_conn_id = connections.ensure_connection(self, original_properties=False)
310 |         self.discovery_test(incremental_conn_id)
311 | 
312 |         # NB | We are able to generate a connection and run discovery with a default replication
313 |         #      method of logical replication WITHOUT selecting a replication slot. This is not
314 |         #      ideal behavior. This BUG should not be carried over into hp-postgres, but will not
315 |         #      be fixed for this tap.
316 |         self.default_replication_method = self.LOG_BASED
317 |         log_based_conn_id = connections.ensure_connection(self, original_properties=False)
318 |         self.discovery_test(log_based_conn_id)
319 | 
320 |     def discovery_test(self, conn_id):
321 |         """
322 |         Basic Discovery Test for a database tap.
323 | 
324 |         Test Description:
325 |           Ensure discovery completes with clean exit codes and generates a catalog of the expected form.
326 | 
327 |         Test Cases:
328 |         - Verify discovery generated the expected catalogs by name.
329 |         - Verify that the tap_stream_id is in the expected <database>-<schema>-<table> format for each stream.
330 |         - Verify the catalog is found for a given stream.
331 |         - Verify there is only 1 top level breadcrumb in metadata for a given stream.
332 |         - Verify replication key(s) match expectations for a given stream.
333 |         - Verify primary key(s) match expectations for a given stream.
334 |         - Verify the replication method matches our expectations for a given stream.
335 |         - Verify that only primary keys are given the inclusion of automatic in metadata
336 |           for a given stream.
337 |         - Verify expected unsupported fields are given the inclusion of unsupported in
338 |           metadata for a given stream.
339 |         - Verify that all fields for a given stream which are not unsupported or automatic
340 |           have inclusion of available.
341 |         - Verify row-count metadata matches expectations for a given stream.
342 |         - Verify selected metadata is None for all streams.
343 |         - Verify is-view metadata is False for a given stream.
344 |         - Verify no forced-replication-method is present in metadata for a given stream.
345 |         - Verify schema and db match expectations for a given stream.
346 |         - Verify schema types match expectations for a given stream.
347 |         """
348 |         ##########################################################################
349 |         ### TODO
350 |         ### [] Generate multiple tables (streams) and maybe dbs too?
351 |         ### [] Investigate potential bug, see DOCS_BUG_1
352 |         ##########################################################################
353 | 
354 |         # run discovery (check mode)
355 |         check_job_name = runner.run_check_mode(self, conn_id)
356 | 
357 |         # Verify check exit codes
358 |         exit_status = menagerie.get_exit_status(conn_id, check_job_name)
359 |         menagerie.verify_check_exit_status(self, exit_status, check_job_name)
360 | 
361 |         # Verify discovery generated a catalog
362 |         found_catalogs = menagerie.get_catalogs(conn_id)
363 |         self.assertGreater(len(found_catalogs), 0)
364 | 
365 |         # Verify discovery generated the expected catalogs by name
366 |         found_catalog_names = {catalog['stream_name'] for catalog in found_catalogs}
367 |         self.assertSetEqual(self.expected_check_streams(), found_catalog_names)
368 | 
369 |         # Verify that the tap_stream_id is in the expected <database>-<schema>-<table> format for each stream
370 |         found_catalog_stream_ids = {catalog['tap_stream_id'] for catalog in found_catalogs}
371 |         self.assertSetEqual(self.expected_check_stream_ids(), found_catalog_stream_ids)
372 | 
373 |         # Test by stream
374 |         for stream in self.expected_check_streams():
375 |             with self.subTest(stream=stream):
376 | 
377 |                 # Verify the catalog is found for a given stream
378 |                 catalog = next(iter([catalog for catalog in found_catalogs
379 |                                      if catalog["stream_name"] == stream]))
380 |                 self.assertTrue(isinstance(catalog, dict))
381 | 
382 |                 # collecting expected values
383 |                 expected_primary_keys = self.expected_primary_keys()[stream]
384 |                 expected_replication_keys = set()
385 |                 expected_unsupported_fields = self.expected_unsupported_fields()
386 |                 expected_fields_to_datatypes = self.expected_schema_types()
387 |                 expected_row_count = len(self.recs)
388 | 
389 |                 # collecting actual values...
390 |                 schema_and_metadata = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
391 |                 stream_metadata = schema_and_metadata["metadata"]
392 |                 top_level_metadata = [item for item in stream_metadata if item.get("breadcrumb") == []]
393 |                 stream_properties = top_level_metadata[0]['metadata']
394 |                 actual_primary_keys = set(stream_properties.get(self.PRIMARY_KEYS, []))
395 |                 actual_replication_keys = set(stream_properties.get(self.REPLICATION_KEYS, []))
396 |                 actual_replication_method = stream_properties.get(self.REPLICATION_METHOD)
397 |                 actual_automatic_fields = set(
398 |                     item.get("breadcrumb", ["properties", None])[1] for item in stream_metadata
399 |                     if item.get("metadata").get("inclusion") == "automatic"
400 |                 )
401 |                 actual_unsupported_fields = set(
402 |                     item.get("breadcrumb", ["properties", None])[1] for item in stream_metadata
403 |                     if item.get("metadata").get("inclusion") == "unsupported"
404 |                 )
405 |                 actual_fields_to_datatypes = {
406 |                     item['breadcrumb'][1]: item['metadata'].get('sql-datatype')
407 |                     for item in stream_metadata if item['breadcrumb'] != []
408 |                 }
409 | 
410 |                 # Verify there is only 1 top level breadcrumb in metadata
411 |                 self.assertEqual(1, len(top_level_metadata))
412 | 
413 |                 # Verify replication key(s) match expectations
414 |                 self.assertSetEqual(
415 |                     expected_replication_keys, actual_replication_keys
416 |                 )
417 | 
418 |                 # NB | We expect primary keys and replication keys to have inclusion automatic for
419 |                 #      key-based incremental replication. But that is only true for primary keys here.
420 |                 #      This BUG should not be carried over into hp-postgres, but will not be fixed for this tap.
421 | 422 | # Verify primary key(s) match expectations 423 | self.assertSetEqual( 424 | expected_primary_keys, actual_primary_keys, 425 | ) 426 | 427 | # Verify the replication method matches our expectations 428 | self.assertIsNone(actual_replication_method) 429 | 430 | # Verify that only primary keys 431 | # are given the inclusion of automatic in metadata. 432 | self.assertSetEqual(expected_primary_keys, actual_automatic_fields) 433 | 434 | 435 | # DOCS_BUG_1 ? | The following types were converted and selected, but docs say unsupported. 436 | # Still need to investigate how the tap handles values of these datatypes 437 | # during sync. 438 | KNOWN_MISSING = { 439 | 'invalid_bigserial', # BIGSERIAL -> bigint 440 | 'invalid_serial', # SERIAL -> integer 441 | 'invalid_smallserial', # SMALLSERIAL -> smallint 442 | } 443 | # Verify expected unsupported fields 444 | # are given the inclusion of unsupported in metadata. 445 | self.assertSetEqual(expected_unsupported_fields, actual_unsupported_fields | KNOWN_MISSING) 446 | 447 | 448 | # Verify that all other fields have inclusion of available 449 | # This assumes there are no unsupported fields for SaaS sources 450 | self.assertTrue( 451 | all({item.get("metadata").get("inclusion") == "available" 452 | for item in stream_metadata 453 | if item.get("breadcrumb", []) != [] 454 | and item.get("breadcrumb", ["properties", None])[1] 455 | not in actual_automatic_fields 456 | and item.get("breadcrumb", ["properties", None])[1] 457 | not in actual_unsupported_fields}), 458 | msg="Not all non key properties are set to available in metadata") 459 | 460 | # Verify row-count metadata matches expectations 461 | self.assertEqual(expected_row_count, stream_properties['row-count']) 462 | 463 | # Verify selected metadata is None for all streams 464 | self.assertNotIn('selected', stream_properties.keys()) 465 | 466 | # Verify is-view metadata is False 467 | self.assertFalse(stream_properties['is-view']) 468 | 469 | # Verify no forced-replication-method is present in metadata 470 | self.assertNotIn(self.REPLICATION_METHOD, stream_properties.keys()) 471 | 472 | # Verify schema and db match expectations 473 | self.assertEqual(test_schema_name, stream_properties['schema-name']) 474 | self.assertEqual(test_db, stream_properties['database-name']) 475 | 476 | # Verify schema types match expectations 477 | self.assertDictEqual(expected_fields_to_datatypes, actual_fields_to_datatypes) 478 | --------------------------------------------------------------------------------
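
A note on the two wal2json message shapes handled by `consume_message_format_1` and `consume_message_format_2` in `tap_postgres/sync_strategies/logical_replication.py`: format-version 1 delivers one JSON document per transaction with all changes in a `change` array, while format-version 2 (requested via the `format-version: 2` and `include-timestamp` options passed to `start_replication` in `sync_tables`) delivers one JSON document per change keyed by a single-letter `action`. The payloads below are an illustrative sketch inferred from the consumer code above, not captured server output; the exact keys depend on the wal2json version and the options used, and the table/column names are borrowed from the discovery test fixture purely for illustration.

```
# Illustrative wal2json payloads (shapes inferred from the consumer functions
# above; they may differ slightly depending on wal2json version and options).

# format-version 1: one message per transaction, all changes in a 'change' array.
format_1_payload = {
    "change": [
        {
            "kind": "insert",                      # 'insert' | 'update' | 'delete'
            "schema": "public",
            "table": "postgres_discovery_test",
            "columnnames": ["id", "our_varchar"],
            "columnvalues": [1, "our_varchar 4"],
            # deletes carry the old key columns instead of column values:
            # "oldkeys": {"keynames": ["id"], "keyvalues": [1]},
        }
    ]
}

# format-version 2: one message per change, keyed by a single-letter 'action'.
format_2_payload = {
    "action": "U",                                 # 'I' | 'U' | 'D' (begin/commit markers may also appear)
    "schema": "public",
    "table": "postgres_discovery_test",
    "timestamp": "2018-01-01 00:00:00.000000+00",  # present when include-timestamp is set
    "columns": [{"name": "id", "value": 1},
                {"name": "our_varchar", "value": "our_varchar 4"}],
    # deletes expose the replica identity columns instead of 'columns':
    # "identity": [{"name": "id", "value": 1}],
}
```

Note how the format 2 consumer reads `columns` for inserts/updates and `identity` for deletes, mirroring the `columnnames`/`oldkeys` split handled by the format 1 consumer.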