├── requirements.txt ├── redshift ├── requirements.txt ├── .gitignore ├── example.env ├── my.cnf ├── Dockerfile ├── .dockerignore ├── docker-compose.yml ├── README.md ├── main.py ├── utils.py └── mysql2redshift.py ├── bigquery ├── requirements.txt ├── .gitignore ├── example.env ├── my.cnf ├── Dockerfile ├── .dockerignore ├── docker-compose.yml ├── README.md ├── mysql2bigquery.py ├── main.py └── utils.py ├── snowflake ├── requirements.txt ├── .gitignore ├── my.cnf ├── example.env ├── Dockerfile ├── .dockerignore ├── docker-compose.yml ├── main.py ├── mysql2snowsql.py ├── utils.py └── README.md ├── .gitignore ├── my.cnf ├── Dockerfile ├── .dockerignore ├── .github └── main.workflow ├── docker-compose.yml ├── .circleci └── config.yml ├── main.py ├── README.md └── utils.py /requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-replication 2 | PyMySQL -------------------------------------------------------------------------------- /redshift/requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-replication 2 | PyMySQL 3 | psycopg2-binary -------------------------------------------------------------------------------- /bigquery/requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-replication 2 | PyMySQL 3 | google-cloud-bigquery -------------------------------------------------------------------------------- /snowflake/requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-replication 2 | PyMySQL 3 | snowflake-connector-python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | lib/ 7 | .env -------------------------------------------------------------------------------- /redshift/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | lib/ 7 | .env -------------------------------------------------------------------------------- /snowflake/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | lib/ 7 | .env -------------------------------------------------------------------------------- /bigquery/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | lib/ 7 | .env 8 | *.json -------------------------------------------------------------------------------- /bigquery/example.env: -------------------------------------------------------------------------------- 1 | MYSQL_HOST=mysql 2 | MYSQL_PORT=3306 3 | MYSQL_DATABASE=testdb 4 | MYSQL_USER=root 5 | MYSQL_PASSWORD=example 6 | PROJECT_ID= 7 | GOOGLE_APPLICATION_CREDENTIALS=/service_account.json -------------------------------------------------------------------------------- /my.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | server-id = 1 3 | log_bin = /var/log/mysql/mysql-bin.log 4 | expire_logs_days = 10 5 | max_binlog_size = 100M 6 | binlog-format = row #Very 
important if you want to receive write, update and delete row events 7 | -------------------------------------------------------------------------------- /redshift/example.env: -------------------------------------------------------------------------------- 1 | MYSQL_HOST=mysql 2 | MYSQL_PORT=3306 3 | MYSQL_DATABASE=testdb 4 | MYSQL_USER=root 5 | MYSQL_PASSWORD=example 6 | REDSHIFT_HOST= 7 | REDSHIFT_PORT= 8 | REDSHIFT_DATABASE= 9 | REDSHIFT_USER= 10 | REDSHIFT_PASSWORD= -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | COPY ./requirements.txt /app/requirements.txt 4 | 5 | RUN pip install --upgrade --no-cache-dir -r /app/requirements.txt 6 | 7 | WORKDIR /app 8 | COPY . /app/ 9 | 10 | ENTRYPOINT python /app/main.py 11 | -------------------------------------------------------------------------------- /bigquery/my.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | server-id = 1 3 | log_bin = /var/log/mysql/mysql-bin.log 4 | expire_logs_days = 10 5 | max_binlog_size = 100M 6 | binlog-format = row #Very important if you want to receive write, update and delete row events 7 | -------------------------------------------------------------------------------- /redshift/my.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | server-id = 1 3 | log_bin = /var/log/mysql/mysql-bin.log 4 | expire_logs_days = 10 5 | max_binlog_size = 100M 6 | binlog-format = row #Very important if you want to receive write, update and delete row events 7 | -------------------------------------------------------------------------------- /snowflake/my.cnf: -------------------------------------------------------------------------------- 1 | [mysqld] 2 | server-id = 1 3 | log_bin = /var/log/mysql/mysql-bin.log 4 | expire_logs_days = 10 5 | max_binlog_size = 100M 6 | binlog-format = row #Very important if you want to receive write, update and delete row events 7 | -------------------------------------------------------------------------------- /snowflake/example.env: -------------------------------------------------------------------------------- 1 | MYSQL_HOST=mysql 2 | MYSQL_PORT=3306 3 | MYSQL_DATABASE=testdb 4 | MYSQL_USER=root 5 | MYSQL_PASSWORD=example 6 | SNOWFLAKE_ACCOUNT= 7 | SNOWFLAKE_USER= 8 | SNOWFLAKE_PASSWORD= 9 | SNOWFLAKE_WAREHOUSE= 10 | SNOWFLAKE_DATABASE= -------------------------------------------------------------------------------- /bigquery/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | COPY ./requirements.txt /app/requirements.txt 4 | 5 | RUN pip install --upgrade --no-cache-dir -r /app/requirements.txt 6 | 7 | WORKDIR /app 8 | COPY . /app/ 9 | 10 | ENTRYPOINT python /app/main.py 11 | -------------------------------------------------------------------------------- /redshift/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | COPY ./requirements.txt /app/requirements.txt 4 | 5 | RUN pip install --upgrade --no-cache-dir -r /app/requirements.txt 6 | 7 | WORKDIR /app 8 | COPY . 
/app/ 9 | 10 | ENTRYPOINT python /app/main.py 11 | -------------------------------------------------------------------------------- /snowflake/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | 3 | COPY ./requirements.txt /app/requirements.txt 4 | 5 | RUN pip install --upgrade --no-cache-dir -r /app/requirements.txt 6 | 7 | WORKDIR /app 8 | COPY . /app/ 9 | 10 | ENTRYPOINT python /app/main.py 11 | -------------------------------------------------------------------------------- /redshift/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | LICENSE 4 | VERSION 5 | README.md 6 | Changelog.md 7 | Makefile 8 | docker-compose.yml 9 | docs 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | lib/ 17 | .env -------------------------------------------------------------------------------- /snowflake/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | LICENSE 4 | VERSION 5 | README.md 6 | Changelog.md 7 | Makefile 8 | docker-compose.yml 9 | docs 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | lib/ 17 | .env -------------------------------------------------------------------------------- /bigquery/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | LICENSE 4 | VERSION 5 | README.md 6 | Changelog.md 7 | Makefile 8 | docker-compose.yml 9 | docs 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | lib/ 17 | .env 18 | *.json -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | LICENSE 4 | VERSION 5 | README.md 6 | Changelog.md 7 | Makefile 8 | docker-compose.yml 9 | docs 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | lib/ 17 | .env 18 | 19 | snowflake/ 20 | redshift/ 21 | bigquery/ -------------------------------------------------------------------------------- /.github/main.workflow: -------------------------------------------------------------------------------- 1 | workflow "New workflow" { 2 | resolves = ["GitHub Action for Slack"] 3 | on = "issue_comment" 4 | } 5 | 6 | action "GitHub Action for Slack" { 7 | uses = "Ilshidur/action-slack@4ab30779c772cac48ffe705d27a5a194e3d5ed78" 8 | secrets = ["SLACK_WEBHOOK"] 9 | args = "LGTM!" 
10 | } 11 | -------------------------------------------------------------------------------- /redshift/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | python: 4 | build: 5 | context: ./ 6 | dockerfile: Dockerfile 7 | restart: always 8 | depends_on: 9 | - mysql 10 | volumes: 11 | - ./:/app 12 | env_file: 13 | - ./.env 14 | environment: 15 | - PYTHONUNBUFFERED=0 16 | mysql: 17 | image: mysql:5.6 18 | restart: always 19 | environment: 20 | MYSQL_ROOT_PASSWORD: example 21 | ports: 22 | - "3306:3306" 23 | volumes: 24 | - ./my.cnf:/etc/mysql/conf.d/myx.cnf 25 | -------------------------------------------------------------------------------- /snowflake/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | python: 4 | build: 5 | context: ./ 6 | dockerfile: Dockerfile 7 | restart: always 8 | depends_on: 9 | - mysql 10 | volumes: 11 | - ./:/app 12 | env_file: 13 | - ./.env 14 | environment: 15 | - PYTHONUNBUFFERED=0 16 | mysql: 17 | image: mysql:5.6 18 | restart: always 19 | environment: 20 | MYSQL_ROOT_PASSWORD: example 21 | ports: 22 | - "3306:3306" 23 | volumes: 24 | - ./my.cnf:/etc/mysql/conf.d/myx.cnf 25 | -------------------------------------------------------------------------------- /bigquery/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | python: 4 | build: 5 | context: ./ 6 | dockerfile: Dockerfile 7 | restart: always 8 | depends_on: 9 | - mysql 10 | volumes: 11 | - ./:/app 12 | - ./Exoot-28bad1c01d20.json:/service_account.json 13 | env_file: 14 | - ./.env 15 | environment: 16 | - PYTHONUNBUFFERED=0 17 | mysql: 18 | image: mysql:5.6 19 | restart: always 20 | environment: 21 | MYSQL_ROOT_PASSWORD: example 22 | ports: 23 | - "3306:3306" 24 | volumes: 25 | - ./my.cnf:/etc/mysql/conf.d/myx.cnf 26 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | python: 4 | build: 5 | context: ./ 6 | dockerfile: Dockerfile 7 | restart: always 8 | depends_on: 9 | - mysql 10 | volumes: 11 | - ./:/app 12 | environment: 13 | - MYSQL_HOST=mysql 14 | - MYSQL_PORT=3306 15 | - MYSQL_DATABASE=testdb 16 | - MYSQL_USER=root 17 | - MYSQL_PASSWORD=example 18 | - PYTHONUNBUFFERED=0 19 | mysql: 20 | image: mysql:5.6 21 | restart: always 22 | environment: 23 | MYSQL_ROOT_PASSWORD: example 24 | ports: 25 | - "3306:3306" 26 | volumes: 27 | - ./my.cnf:/etc/mysql/conf.d/myx.cnf 28 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | machine: true 5 | steps: 6 | - checkout 7 | # login to docker hub 8 | - run: docker login -u $DOCKER_USER -p $DOCKER_PASS 9 | 10 | # build the application image 11 | - run: docker build -t servicerocket/mysql2snowflake:latest snowflake/ 12 | 13 | # deploy the image 14 | - run: docker push servicerocket/mysql2snowflake:latest 15 | 16 | # build the application image 17 | - run: docker build -t servicerocket/mysql2redshift:latest redshift/ 18 | 19 | # deploy the image 20 | - run: docker push servicerocket/mysql2redshift:latest 21 | 22 | # build the application image 23 | - run: docker build -t 
servicerocket/mysql2bigquery:latest bigquery/ 24 | 25 | # deploy the image 26 | - run: docker push servicerocket/mysql2bigquery:latest 27 | 28 | # build the application image 29 | - run: docker build -t servicerocket/mysql2stdout:latest . 30 | 31 | # deploy the image 32 | - run: docker push servicerocket/mysql2stdout:latest -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pymysqlreplication import BinLogStreamReader 4 | from pymysqlreplication.row_event import ( 5 | DeleteRowsEvent, 6 | UpdateRowsEvent, 7 | WriteRowsEvent, 8 | ) 9 | 10 | from utils import concat_sql_from_binlog_event 11 | import pymysql 12 | import os 13 | import sys 14 | import logging 15 | 16 | # Logging 17 | logging.basicConfig( 18 | stream=sys.stdout, 19 | level=logging.INFO, 20 | format="%(levelname)s %(message)s") 21 | 22 | def main(mysqlConfigs): 23 | conn = pymysql.connect(**mysqlConfigs) 24 | cursor = conn.cursor() 25 | stream = BinLogStreamReader( 26 | connection_settings = mysqlConfigs, 27 | server_id=100, 28 | blocking=True, 29 | resume_stream=True, 30 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]) 31 | 32 | for binlogevent in stream: 33 | e_start_pos, last_pos = stream.log_pos, stream.log_pos 34 | for row in binlogevent.rows: 35 | event = {"schema": binlogevent.schema, 36 | "table": binlogevent.table, 37 | "type": type(binlogevent).__name__, 38 | "row": row 39 | } 40 | #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN': 41 | # e_start_pos = last_pos 42 | print("/*", json.dumps(event), "*/") 43 | print(concat_sql_from_binlog_event(cursor=cursor, binlog_event=binlogevent, row=row, e_start_pos=e_start_pos)) 44 | print() 45 | 46 | 47 | if __name__ == "__main__": 48 | mysqlConfigs = { 49 | "host": os.getenv('MYSQL_HOST'), 50 | "port": int(os.getenv('MYSQL_PORT')), 51 | "user": os.getenv('MYSQL_USER'), 52 | "passwd": os.getenv('MYSQL_PASSWORD'), 53 | 'db': os.getenv('MYSQL_DATABASE'), 54 | } 55 | main(mysqlConfigs) -------------------------------------------------------------------------------- /bigquery/README.md: -------------------------------------------------------------------------------- 1 | # Streaming mysql binlog replication to BigQuery 2 | [![](https://images.microbadger.com/badges/image/servicerocket/mysql2bigquery.svg)](https://hub.docker.com/r/servicerocket/mysql2bigquery/) 3 | 4 | Generate a [JSON service account](https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python) credentials file and copy it to `bigquery/` directory and mount the file from `docker-compose.yml`. 5 | 6 | Clone this repo and update your GCP `PROJECT_ID` and `GOOGLE_APPLICATION_CREDENTIALS` location in `example.env`, then 7 | ```bash 8 | cd bigquery/ 9 | mv example.env .env 10 | docker-compose up --build 11 | ``` 12 | 13 | > Note: If you get `Can't connect to MySQL server on 'mysql' ([Errno 111] Connection refused)` error on the first run, try running it again. 
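The error in the note above happens because the `python` container can start before MySQL has finished initializing (`depends_on` only orders startup, it does not wait for readiness). A hedged sketch of one way to avoid the crash, assuming a small retry helper added at the top of `main.py`; the helper name and retry counts are illustrative, not part of this repo:

```python
# Hypothetical wait-for-MySQL helper: retry until the server accepts
# connections instead of failing on the first "Connection refused".
import os
import time

import pymysql


def wait_for_mysql(configs, attempts=30, delay=2):
    """Poll MySQL until a connection succeeds or the attempts run out."""
    for attempt in range(1, attempts + 1):
        try:
            pymysql.connect(**configs).close()
            return
        except pymysql.err.OperationalError as exc:
            print("MySQL not ready (attempt %d/%d): %s" % (attempt, attempts, exc))
            time.sleep(delay)
    raise RuntimeError("MySQL never became reachable")


if __name__ == "__main__":
    wait_for_mysql({
        "host": os.getenv("MYSQL_HOST"),
        "port": int(os.getenv("MYSQL_PORT")),
        "user": os.getenv("MYSQL_USER"),
        "passwd": os.getenv("MYSQL_PASSWORD"),
        "db": os.getenv("MYSQL_DATABASE"),
    })
```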
14 | 15 | In another terminal run 16 | ```bash 17 | docker-compose exec mysql mysql -u root -pexample -e "DROP DATABASE IF EXISTS testdb; CREATE DATABASE testdb; USE testdb; CREATE TABLE testtbl (id int, name varchar(255)); INSERT INTO testtbl VALUES (1, 'hello'), (2, 'hola'), (3, 'zdravstvuy'), (1, 'bonjour'); UPDATE testtbl SET name = 'yolo' WHERE id = 1; UPDATE testtbl SET name = 'world' WHERE id = 3; DELETE FROM testtbl WHERE id = 1; SELECT * FROM testtbl;" 18 | ``` 19 | 20 | Which will output the following to the terminal 21 | ``` 22 | +------+-------+ 23 | | id | name | 24 | +------+-------+ 25 | | 2 | hola | 26 | | 3 | world | 27 | +------+-------+ 28 | ``` 29 | 30 | `docker-compose` daemon should output something like this 31 | ```sql 32 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hello', 1); 33 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hola', 2); 34 | python_1 | INSERT INTO testtbl(name, id) VALUES ('zdravstvuy', 3); 35 | python_1 | INSERT INTO testtbl(name, id) VALUES ('bonjour', 1); 36 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='hello' AND id=1; 37 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='bonjour' AND id=1; 38 | python_1 | UPDATE testtbl SET name='world', id=3 WHERE name='zdravstvuy' AND id=3; 39 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1; 40 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1; 41 | ``` 42 | Executing these queries one by one is not optimal. Ideally, we should batch them together. 43 | 44 | # References 45 | - https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/usage.html 46 | - https://cloud.google.com/bigquery/streaming-data-into-bigquery -------------------------------------------------------------------------------- /redshift/README.md: -------------------------------------------------------------------------------- 1 | # Streaming mysql binlog replication to Redshift 2 | [![](https://images.microbadger.com/badges/image/servicerocket/mysql2redshift.svg)](https://hub.docker.com/r/servicerocket/mysql2redshift/) 3 | 4 | Run this query on Redshift to create a new table 5 | ```sql 6 | DROP TABLE IF EXISTS testtbl; 7 | CREATE TABLE testtbl(id integer, name varchar(255)); 8 | ``` 9 | 10 | Clone this repo and update your Redshift credentials in `example.env`, then 11 | ```bash 12 | cd redshift/ 13 | mv example.env .env 14 | docker-compose up --build 15 | ``` 16 | 17 | > Note: If you get `Can't connect to MySQL server on 'mysql' ([Errno 111] Connection refused)` error on the first run, try running it again. 
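Since the Redshift credentials come from `.env`, it can save a debugging round trip to confirm they work before starting the stream. A minimal sketch using `psycopg2` (already in `requirements.txt`); the query and script are illustrative, not part of this repo:

```python
# Hypothetical pre-flight check: confirm the Redshift credentials from .env
# are valid before starting the binlog stream.
import os

import psycopg2

conn = psycopg2.connect(
    host=os.getenv("REDSHIFT_HOST"),
    port=int(os.getenv("REDSHIFT_PORT")),
    user=os.getenv("REDSHIFT_USER"),
    password=os.getenv("REDSHIFT_PASSWORD"),
    dbname=os.getenv("REDSHIFT_DATABASE"),
)
with conn.cursor() as cur:
    cur.execute("SELECT current_database(), current_user")
    print(cur.fetchone())
conn.close()
```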
18 | 19 | In another terminal run 20 | ```bash 21 | docker-compose exec mysql mysql -u root -pexample -e "DROP DATABASE IF EXISTS testdb; CREATE DATABASE testdb; USE testdb; CREATE TABLE testtbl (id int, name varchar(255)); INSERT INTO testtbl VALUES (1, 'hello'), (2, 'hola'), (3, 'zdravstvuy'), (1, 'bonjour'); UPDATE testtbl SET name = 'yolo' WHERE id = 1; UPDATE testtbl SET name = 'world' WHERE id = 3; DELETE FROM testtbl WHERE id = 1; SELECT * FROM testtbl;" 22 | ``` 23 | 24 | Which will output the following to the terminal 25 | ``` 26 | +------+-------+ 27 | | id | name | 28 | +------+-------+ 29 | | 2 | hola | 30 | | 3 | world | 31 | +------+-------+ 32 | ``` 33 | 34 | `docker-compose` daemon should output something like this 35 | ```sql 36 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hello', 1); 37 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hola', 2); 38 | python_1 | INSERT INTO testtbl(name, id) VALUES ('zdravstvuy', 3); 39 | python_1 | INSERT INTO testtbl(name, id) VALUES ('bonjour', 1); 40 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='hello' AND id=1; 41 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='bonjour' AND id=1; 42 | python_1 | UPDATE testtbl SET name='world', id=3 WHERE name='zdravstvuy' AND id=3; 43 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1; 44 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1; 45 | ``` 46 | Executing these queries one by one is not optimal. Ideally, we should batch them together. 47 | ![Redshift History showing slow queries](https://i.imgur.com/r4vVhHL.png) 48 | 49 | # References 50 | - https://docs.aws.amazon.com/redshift/latest/dg/r_INSERT_30.html 51 | - https://docs.aws.amazon.com/redshift/latest/dg/t_Updating_tables_with_DML_commands.html 52 | - https://docs.aws.amazon.com/redshift/latest/dg/c_redshift-and-postgres-sql.html 53 | - https://www.blendo.co/blog/access-your-data-in-amazon-redshift-and-postgresql-with-python-and-r/ -------------------------------------------------------------------------------- /redshift/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pymysqlreplication import BinLogStreamReader 4 | from pymysqlreplication.row_event import ( 5 | DeleteRowsEvent, 6 | UpdateRowsEvent, 7 | WriteRowsEvent, 8 | ) 9 | 10 | from utils import concat_sql_from_binlog_event 11 | import pymysql 12 | import os 13 | import sys 14 | import logging 15 | import psycopg2 16 | 17 | 18 | # Logging 19 | logging.basicConfig( 20 | #filename='/tmp/snowflake_python_connector.log', 21 | stream=sys.stdout, 22 | level=logging.INFO, 23 | format="%(levelname)s %(message)s") 24 | 25 | def main(mysqlConfigs, redshiftConfigs): 26 | 27 | rs = psycopg2.connect(**redshiftConfigs) 28 | 29 | conn = pymysql.connect(**mysqlConfigs) 30 | 31 | 32 | 33 | rs.cursor().execute(""" 34 | DROP TABLE IF EXISTS testtbl; 35 | CREATE TABLE testtbl(id integer, name varchar(255)); 36 | """) 37 | 38 | stream = BinLogStreamReader( 39 | connection_settings = mysqlConfigs, 40 | server_id=100, 41 | blocking=True, 42 | resume_stream=True, 43 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]) 44 | 45 | cursor = conn.cursor() 46 | for binlogevent in stream: 47 | e_start_pos, last_pos = stream.log_pos, stream.log_pos 48 | #print([a for a in dir(binlogevent) if not a.startswith('__')]) 49 | for row in binlogevent.rows: 50 | event = {"schema": binlogevent.schema, 51 | "table": binlogevent.table, 52 | "type": type(binlogevent).__name__, 53 | 
"row": row 54 | } 55 | 56 | #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN': 57 | # e_start_pos = last_pos 58 | #print(json.dumps(event)) 59 | binlog2sql = concat_sql_from_binlog_event(cursor=cursor, binlog_event=binlogevent, row=row, e_start_pos=e_start_pos).replace('`', "") 60 | print(binlog2sql) 61 | 62 | try: 63 | rs.cursor().execute(binlog2sql) 64 | except psycopg2.Error as e: 65 | print(e) 66 | 67 | # cur = rs.cursor() 68 | # cur.execute("SELECT * FROM testtbl;") 69 | 70 | # for row in cur.fetchall(): 71 | # print(row) 72 | 73 | 74 | if __name__ == "__main__": 75 | redshiftConfigs = { 76 | "host": os.getenv('REDSHIFT_HOST'), 77 | "port": int(os.getenv('REDSHIFT_PORT')), 78 | "user": os.getenv('REDSHIFT_USER'), 79 | "password": os.getenv('REDSHIFT_PASSWORD'), 80 | 'dbname': os.getenv('REDSHIFT_DATABASE'), 81 | } 82 | mysqlConfigs = { 83 | "host": os.getenv('MYSQL_HOST'), 84 | "port": int(os.getenv('MYSQL_PORT')), 85 | "user": os.getenv('MYSQL_USER'), 86 | "passwd": os.getenv('MYSQL_PASSWORD'), 87 | 'db': os.getenv('MYSQL_DATABASE'), 88 | } 89 | main(mysqlConfigs, redshiftConfigs) -------------------------------------------------------------------------------- /snowflake/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pymysqlreplication import BinLogStreamReader 4 | from pymysqlreplication.row_event import ( 5 | DeleteRowsEvent, 6 | UpdateRowsEvent, 7 | WriteRowsEvent, 8 | ) 9 | 10 | from utils import concat_sql_from_binlog_event 11 | import pymysql 12 | import os 13 | import sys 14 | import logging 15 | import snowflake.connector 16 | 17 | 18 | # Logging 19 | logging.basicConfig( 20 | #filename='/tmp/snowflake_python_connector.log', 21 | stream=sys.stdout, 22 | level=logging.INFO, 23 | format="%(levelname)s %(message)s") 24 | 25 | def main(snowflakeConfig, mysqlConfigs): 26 | # Connecting to Snowflake 27 | sf = snowflake.connector.connect(**snowflakeConfig) 28 | 29 | conn = pymysql.connect(**mysqlConfigs) 30 | # Usually a schema is a collection of tables and a Database is a collection of schemas. 
31 | # https://stackoverflow.com/a/19257781 32 | 33 | 34 | stream = BinLogStreamReader( 35 | connection_settings = mysqlConfigs, 36 | server_id=100, 37 | blocking=True, 38 | resume_stream=True, 39 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]) 40 | 41 | cursor = conn.cursor() 42 | for binlogevent in stream: 43 | e_start_pos, last_pos = stream.log_pos, stream.log_pos 44 | #print([a for a in dir(binlogevent) if not a.startswith('__')]) 45 | for row in binlogevent.rows: 46 | event = {"schema": binlogevent.schema, 47 | "table": binlogevent.table, 48 | "type": type(binlogevent).__name__, 49 | "row": row 50 | } 51 | 52 | #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN': 53 | # e_start_pos = last_pos 54 | #print(json.dumps(event)) 55 | binlog2sql = concat_sql_from_binlog_event(cursor=cursor, binlog_event=binlogevent, row=row, e_start_pos=e_start_pos).replace('`', '') 56 | print(binlog2sql) 57 | 58 | try: 59 | sf.cursor().execute(binlog2sql) 60 | except snowflake.connector.errors.ProgrammingError as e: 61 | # default error message 62 | print(e) 63 | # customer error message 64 | print('Error {0} ({1}): {2} ({3})'.format(e.errno, e.sqlstate, e.msg, e.sfqid)) 65 | 66 | 67 | if __name__ == "__main__": 68 | # Setting your account and login information 69 | snowflakeConfig = { 70 | 'account': os.getenv('SNOWFLAKE_ACCOUNT'), 71 | 'user': os.getenv('SNOWFLAKE_USER'), 72 | 'password': os.getenv('SNOWFLAKE_PASSWORD'), 73 | 'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'), 74 | 'database': os.getenv('SNOWFLAKE_DATABASE'), 75 | 'schema': 'PUBLIC' 76 | } 77 | mysqlConfigs = { 78 | "host": os.getenv('MYSQL_HOST'), 79 | "port": int(os.getenv('MYSQL_PORT')), 80 | "user": os.getenv('MYSQL_USER'), 81 | "passwd": os.getenv('MYSQL_PASSWORD'), 82 | 'db': os.getenv('MYSQL_DATABASE'), 83 | } 84 | main(snowflakeConfig, mysqlConfigs) -------------------------------------------------------------------------------- /bigquery/mysql2bigquery.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import json 4 | 5 | 6 | class ColumnInfo(object): 7 | data_types = { 8 | # 'string' 9 | 'char':'string', 10 | 'character':'string', 11 | 'varchar':'string', 12 | 'tinytext':'string', 13 | 'text':'string', 14 | 'mediumtext':'string', 15 | 'longtext':'string', 16 | # 'integer' 17 | 'tinyint':'integer', 18 | 'smallint':'integer', 19 | 'mediumint':'integer', 20 | 'integer':'integer', 21 | 'int':'integer', 22 | 'bigint':'integer', 23 | # 'float' 24 | 'float':'float', 25 | 'double':'float', 26 | 'real':'float', 27 | 'decimal':'float', 28 | 'fixed':'float', 29 | 'dec':'float', 30 | 'numeric':'float', 31 | # 'timestamp' 32 | 'date':'timestamp', 33 | 'datetime':'timestamp', 34 | 'timestamp':'timestamp', 35 | 'time':'timestamp', 36 | # 'boolean' 37 | 'bit':'boolean', 38 | 'bool':'boolean', 39 | 'boolean':'boolean', 40 | } 41 | 42 | def __init__(self, name, typ, nullable): 43 | self.name = name.lower() 44 | self.typ = self.convert_type(typ) 45 | self.is_nullable = self.nullable(nullable) 46 | 47 | def schema(self): 48 | return { "name": self.name, 49 | "type": self.typ.replace('timestamp', 'integer'), 50 | "mode": self.is_nullable } 51 | 52 | def convert_type(self, original): 53 | return self.data_types.get(original.split('(')[0], None) 54 | 55 | def nullable(self, is_nullable): 56 | return 'nullable' if is_nullable in ['YES','yes'] else 'required' 57 | 58 | def query(self): 59 | if self.typ == 'timestamp': 60 | return 
'1000000*unix_timestamp(%(name)s) as %(name)s' % ({'name': self.name}) 61 | return self.name 62 | 63 | 64 | class SchemaParser(object): 65 | def __init__(self, dest): 66 | self.dest = dest 67 | 68 | def parse_schema(self, line): 69 | elements = line.strip().split('\t') 70 | info = ColumnInfo(*elements[0:3]) 71 | return info.schema() 72 | 73 | def parse_query(self, line): 74 | elements = line.strip().split('\t') 75 | info = ColumnInfo(*elements[0:3]) 76 | return info.query() 77 | 78 | def schema(self, fp): 79 | with fp: 80 | return json.dumps([self.parse_schema(line) for line in fp]) 81 | 82 | def query(self, fp): 83 | with fp: 84 | return ','.join([self.parse_query(line) for line in fp]) 85 | 86 | def run(self, fp): 87 | if self.dest == 'schema': 88 | return self.schema(fp) 89 | return self.query(fp) 90 | 91 | 92 | if __name__ == '__main__': 93 | dest = sys.argv[1] 94 | fp = sys.stdin 95 | parser = SchemaParser(dest) 96 | print(parser.run(fp)) 97 | -------------------------------------------------------------------------------- /snowflake/mysql2snowsql.py: -------------------------------------------------------------------------------- 1 | #Ported to Python 3 from https://support.snowflake.net/s/question/0D50Z00006uSiEBSA0/homebrew-mysql-to-snowflake-ddl-converter 2 | import re 3 | import os 4 | import pymysql 5 | import snowflake.connector 6 | 7 | def apply_regex_sub(regex, expression, sub_string): 8 | p = re.compile(regex, re.MULTILINE) 9 | return re.sub(p, sub_string, expression) 10 | 11 | def mysql_to_snowflake(mysql_ddl): 12 | result = apply_regex_sub(r'`', mysql_ddl, "") # Remove ` 13 | result = apply_regex_sub(r'((--(.+)?)|(/\*(.+)))\n?', result, "") # Remove all comments fields 14 | result = apply_regex_sub(r'(DROP(.)+)\n', result, "") # Remove DROP Table reference 15 | result = apply_regex_sub(r'\sDEFAULT(.+,)', result, ",") # Remove DEFAULT 16 | result = apply_regex_sub(r'\s((NOT\sNULL)|NULL)', result, "") # Remove NULL 17 | result = apply_regex_sub(r"((enum|varchar|nvarchar|char)\(['0-9a-zA-Z,]+\))(.)+", result, "STRING,") # STRING data types 18 | result = apply_regex_sub(r'(tiny|big)?int\([0-9a-zA-Z,]+\)(\s(unsigned))?', result, "NUMBER") # NUMBER data types 19 | result = apply_regex_sub(r'datetime', result, "TIMESTAMP_LTZ") # TIMESTAMP_LTZ data types 20 | result = apply_regex_sub(r'\s\s(((PRIMARY)|(UNIQUE))\s)?KEY(.+)\n', result, "") # Strip KEYS 21 | result = apply_regex_sub(r'AUTO_INCREMENT',result,"") #Strips AUTO_INCREMENT 22 | result = apply_regex_sub(r'\s\s(CONSTRAINT\s)(.+)\n', result, "") # Strip CONSTRAINTS 23 | result = apply_regex_sub(r',?\n\)(.+)', result, "\n);") # Clean closing bracket 24 | result = apply_regex_sub(r'^(?:[\t ]*(?:\r?\n|\r))+', result, "") # Discard blank lines 25 | result = apply_regex_sub(r'bit\([0-9a-zA-Z,]+\)', result, "BOOLEAN") # BOOLEAN data types 26 | 27 | r = re.compile(r'(\s)(longblob|blob|longtext|text)(\n|\,)', re.MULTILINE) 28 | result = re.sub(r, r"\1STRING\3", result) 29 | 30 | return result 31 | 32 | def main(mysqlConfigs): 33 | conn = pymysql.connect(**mysqlConfigs) 34 | cur = conn.cursor() 35 | cur.execute("SHOW TABLES") 36 | sf = snowflake.connector.connect(**snowflakeConfig) 37 | 38 | for (table_name,) in cur.fetchall(): 39 | print("/* TABLE:", table_name, "*/") 40 | 41 | cur.execute("SHOW CREATE TABLE %s" % table_name) # WARNING: can be dangerous as this won't properly escape table_name 42 | 43 | sql = cur.fetchone()[1] 44 | #print(sql) 45 | 46 | if sql.find('CREATE ALGORITHM') != -1: 47 | print("/* Skipping", table_name,"*/") 48 | 
continue 49 | 50 | snowsql = mysql_to_snowflake(sql).replace("CREATE TABLE", "CREATE OR REPLACE TABLE") 51 | print(snowsql) 52 | #sf.cursor().execute(snowsql) 53 | 54 | if __name__ == "__main__": 55 | snowflakeConfig = { 56 | 'account': os.getenv('SNOWFLAKE_ACCOUNT'), 57 | 'user': os.getenv('SNOWFLAKE_USER'), 58 | 'password': os.getenv('SNOWFLAKE_PASSWORD'), 59 | 'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE'), 60 | 'database': os.getenv('SNOWFLAKE_DATABASE'), 61 | 'schema': 'PUBLIC' 62 | } 63 | mysqlConfigs = { 64 | "host": os.getenv('MYSQL_HOST'), 65 | "port": int(os.getenv('MYSQL_PORT')), 66 | "user": os.getenv('MYSQL_USER'), 67 | "passwd": os.getenv('MYSQL_PASSWORD'), 68 | 'db': os.getenv('MYSQL_DATABASE'), 69 | } 70 | main(mysqlConfigs) 71 | -------------------------------------------------------------------------------- /bigquery/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pymysqlreplication import BinLogStreamReader 4 | from pymysqlreplication.row_event import ( 5 | DeleteRowsEvent, 6 | UpdateRowsEvent, 7 | WriteRowsEvent, 8 | ) 9 | 10 | from utils import concat_sql_from_binlog_event 11 | import pymysql 12 | import os 13 | import sys 14 | import logging 15 | from google.cloud import bigquery 16 | 17 | # Logging 18 | logging.basicConfig( 19 | #filename='/tmp/snowflake_python_connector.log', 20 | stream=sys.stdout, 21 | level=logging.INFO, 22 | format="%(levelname)s %(message)s") 23 | 24 | def main(mysqlConfigs, redshiftConfigs): 25 | SCHEMA = [ 26 | bigquery.SchemaField('id', 'INTEGER', mode='REQUIRED'), 27 | bigquery.SchemaField('name', 'STRING', mode='REQUIRED'), 28 | ] 29 | 30 | project_id = os.getenv('PROJECT_ID') 31 | dataset_id = 'testdb' 32 | table_id = 'testtbl' 33 | 34 | client = bigquery.Client() 35 | 36 | 37 | client.delete_dataset(client.dataset(dataset_id), delete_contents=True) 38 | 39 | dataset = bigquery.Dataset(client.dataset(dataset_id)) 40 | dataset = client.create_dataset(dataset) 41 | dataset.location = 'US' 42 | 43 | table = bigquery.Table(dataset.table(table_id), schema=SCHEMA) 44 | table = client.create_table(table) 45 | 46 | query = "SELECT * FROM `{}`.`{}`.`{}` limit 100".format(project_id, dataset_id, table_id) 47 | 48 | 49 | client = bigquery.Client() 50 | table_ref = client.dataset(dataset_id).table(table_id) 51 | ''' 52 | table = client.get_table(table_ref) 53 | rows_to_insert = [ 54 | ('Phred Phlyntstone', 32), 55 | ('Wylma Phlyntstone', 1), 56 | ] 57 | errors = client.insert_rows(table, rows_to_insert) 58 | print(errors) 59 | assert errors == [] 60 | ''' 61 | 62 | query_job = client.query(query, location='US') 63 | for row in query_job: 64 | print(row) 65 | conn = pymysql.connect(**mysqlConfigs) 66 | 67 | 68 | stream = BinLogStreamReader( 69 | connection_settings = mysqlConfigs, 70 | server_id=100, 71 | blocking=True, 72 | resume_stream=True, 73 | only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]) 74 | 75 | cursor = conn.cursor() 76 | for binlogevent in stream: 77 | e_start_pos, last_pos = stream.log_pos, stream.log_pos 78 | #print([a for a in dir(binlogevent) if not a.startswith('__')]) 79 | for row in binlogevent.rows: 80 | event = {"schema": binlogevent.schema, 81 | "table": binlogevent.table, 82 | "type": type(binlogevent).__name__, 83 | "row": row 84 | } 85 | 86 | #if isinstance(binlog_event, QueryEvent) and binlog_event.query == 'BEGIN': 87 | # e_start_pos = last_pos 88 | #print(json.dumps(event)) 89 | binlog2sql = concat_sql_from_binlog_event(cursor=cursor, 
binlog_event=binlogevent, row=row, e_start_pos=e_start_pos).replace('`', "").replace('testtbl', '`testdb.testtbl`') 90 | print(binlog2sql) 91 | 92 | query_job = client.query(binlog2sql, location='US') 93 | result = query_job.result() 94 | #print("Total rows affected: ", query_job.num_dml_affected_rows) 95 | 96 | #query_job = client.query(query, location='US') 97 | #for row in query_job: 98 | # print(row) 99 | 100 | if __name__ == "__main__": 101 | mysqlConfigs = { 102 | "host": os.getenv('MYSQL_HOST'), 103 | "port": int(os.getenv('MYSQL_PORT')), 104 | "user": os.getenv('MYSQL_USER'), 105 | "passwd": os.getenv('MYSQL_PASSWORD'), 106 | 'db': os.getenv('MYSQL_DATABASE'), 107 | } 108 | bigqueryConfigs = { 109 | 'dataset': 'testdb' 110 | } 111 | main(mysqlConfigs, bigqueryConfigs) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streaming mysql binlog replication to Snowflake/Redshift/BigQuery 2 | [![CircleCI](https://circleci.com/gh/trainingrocket/mysql-binlog-replication.svg?style=svg)](https://circleci.com/gh/trainingrocket/mysql-binlog-replication) 3 | 4 | ```bash 5 | docker-compose up --build 6 | ``` 7 | 8 | > Note: If you get `Can't connect to MySQL server on 'mysql' ([Errno 111] Connection refused)` error on the first run, try running it again. 9 | 10 | We can actually merge all these images into a single image, but I personally prefer it for simplicity at the expense of code duplication. 11 | 12 | In another terminal login into the mysql instance 13 | ```bash 14 | docker-compose exec mysql mysql -u root -pexample 15 | ``` 16 | And execute the following 17 | ```sql 18 | DROP DATABASE IF EXISTS testdb; 19 | CREATE DATABASE testdb; USE testdb; 20 | CREATE TABLE testtbl (id int, name varchar(255)); 21 | INSERT INTO testtbl VALUES (1, 'hello'), (2, 'hola'), (3, 'zdravstvuy'), (1, 'bonjour'); 22 | DELETE FROM testtbl WHERE id = 1; 23 | SELECT * FROM testtbl; 24 | ``` 25 | 26 | Or you can just 27 | ```bash 28 | docker-compose exec mysql mysql -u root -pexample -e "DROP DATABASE IF EXISTS testdb; CREATE DATABASE testdb; USE testdb; CREATE TABLE testtbl (id int, name varchar(255)); INSERT INTO testtbl VALUES (1, 'hello'), (2, 'hola'), (3, 'zdravstvuy'), (1, 'bonjour'); DELETE FROM testtbl WHERE id = 1; SELECT * FROM testtbl;" 29 | ``` 30 | 31 | Which will output the following to the terminal 32 | ``` 33 | +------+------------+ 34 | | id | name | 35 | +------+------------+ 36 | | 2 | hola | 37 | | 3 | zdravstvuy | 38 | +------+------------+ 39 | ``` 40 | 41 | `docker-compose` daemon should output something like this 42 | ```sql 43 | python_1 | {"type": "WriteRowsEvent", "row": {"values": {"name": "hello", "id": 1}}, "table": "testtbl", "schema": "testdb"} 44 | python_1 | INSERT INTO `testdb`.`testtbl`(`name`, `id`) VALUES ('hello', 1); 45 | python_1 | {"type": "WriteRowsEvent", "row": {"values": {"name": "hola", "id": 2}}, "table": "testtbl", "schema": "testdb"} 46 | python_1 | INSERT INTO `testdb`.`testtbl`(`name`, `id`) VALUES ('hola', 2); 47 | python_1 | {"type": "WriteRowsEvent", "row": {"values": {"name": "zdravstvuy", "id": 3}}, "table": "testtbl", "schema": "testdb"} 48 | python_1 | INSERT INTO `testdb`.`testtbl`(`name`, `id`) VALUES ('zdravstvuy', 3); 49 | python_1 | {"type": "WriteRowsEvent", "row": {"values": {"name": "bonjour", "id": 1}}, "table": "testtbl", "schema": "testdb"} 50 | python_1 | INSERT INTO `testdb`.`testtbl`(`name`, `id`) VALUES ('bonjour', 
1); 51 | python_1 | {"type": "DeleteRowsEvent", "row": {"values": {"name": "hello", "id": 1}}, "table": "testtbl", "schema": "testdb"} 52 | python_1 | DELETE FROM `testdb`.`testtbl` WHERE `name`='hello' AND `id`=1 LIMIT 1; 53 | python_1 | {"type": "DeleteRowsEvent", "row": {"values": {"name": "bonjour", "id": 1}}, "table": "testtbl", "schema": "testdb"} 54 | python_1 | DELETE FROM `testdb`.`testtbl` WHERE `name`='bonjour' AND `id`=1 LIMIT 1; 55 | ``` 56 | 57 | # Change Data Capture from RDS instance 58 | 59 | [Update RDS parameter group](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_LogAccess.Concepts.MySQL.html#USER_LogAccess.MySQL.BinaryFormat) to make sure `log_bin` is enabled and set `binlog_format` to `ROW`. 60 | Update the environment variables with RDS credentials, `MYSQL_HOST` should be something like `testdb.xxx.us-west-2.rds.amazonaws.com`. 61 | 62 | ``` 63 | mysql> show global variables like 'log_bin'; show global variables like 'binlog_format'; 64 | +---------------+-------+ 65 | | Variable_name | Value | 66 | +---------------+-------+ 67 | | log_bin | ON | 68 | +---------------+-------+ 69 | 1 row in set (0.64 sec) 70 | 71 | +---------------+-------+ 72 | | Variable_name | Value | 73 | +---------------+-------+ 74 | | binlog_format | ROW | 75 | +---------------+-------+ 76 | 1 row in set (0.61 sec) 77 | ``` 78 | 79 | # References 80 | - https://www.alooma.com/blog/mysql-to-amazon-redshift-replication 81 | - https://aws.amazon.com/blogs/database/streaming-changes-in-a-database-with-amazon-kinesis/ 82 | - https://github.com/danfengcao/binlog2sql 83 | - https://www.thegeekstuff.com/2017/08/mysqlbinlog-examples/ -------------------------------------------------------------------------------- /bigquery/utils.py: -------------------------------------------------------------------------------- 1 | # Ripped off from https://github.com/danfengcao/binlog2sql 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import datetime 7 | 8 | if sys.version > '3': 9 | PY3PLUS = True 10 | else: 11 | PY3PLUS = False 12 | 13 | from pymysqlreplication.row_event import ( 14 | WriteRowsEvent, 15 | UpdateRowsEvent, 16 | DeleteRowsEvent, 17 | ) 18 | from pymysqlreplication.event import QueryEvent 19 | 20 | 21 | def compare_items(items): 22 | # caution: if v is NULL, may need to process 23 | (k, v) = items 24 | if v is None: 25 | return '`%s` IS %%s' % k 26 | else: 27 | return '`%s`=%%s' % k 28 | 29 | 30 | def fix_object(value): 31 | """Fixes python objects so that they can be properly inserted into SQL queries""" 32 | if isinstance(value, set): 33 | value = ','.join(value) 34 | if PY3PLUS and isinstance(value, bytes): 35 | return value.decode('utf-8') 36 | elif not PY3PLUS and isinstance(value, unicode): 37 | return value.encode('utf-8') 38 | else: 39 | return value 40 | 41 | def concat_sql_from_binlog_event(cursor, binlog_event, row=None, e_start_pos=None, flashback=False, no_pk=False): 42 | if flashback and no_pk: 43 | raise ValueError('only one of flashback or no_pk can be True') 44 | if not (isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) 45 | or isinstance(binlog_event, DeleteRowsEvent) or isinstance(binlog_event, QueryEvent)): 46 | raise ValueError('binlog_event must be WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent or QueryEvent') 47 | 48 | sql = '' 49 | if isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) \ 50 | or isinstance(binlog_event, DeleteRowsEvent): 51 | pattern = 
generate_sql_pattern(binlog_event, row=row, flashback=flashback, no_pk=no_pk) 52 | sql = cursor.mogrify(pattern['template'], pattern['values']) 53 | time = datetime.datetime.fromtimestamp(binlog_event.timestamp) 54 | #sql += ' #start %s end %s time %s' % (e_start_pos, binlog_event.packet.log_pos, time) 55 | elif flashback is False and isinstance(binlog_event, QueryEvent) and binlog_event.query != 'BEGIN' \ 56 | and binlog_event.query != 'COMMIT': 57 | if binlog_event.schema: 58 | sql = 'USE {0};\n'.format(binlog_event.schema) 59 | sql += '{0};'.format(fix_object(binlog_event.query)) 60 | 61 | return sql 62 | 63 | #removed schema: `schemaname`.`tablename` to just `tablename` 64 | #remove limt 1 from UPDATE and DELETE 65 | def generate_sql_pattern(binlog_event, row=None, flashback=False, no_pk=False): 66 | template = '' 67 | values = [] 68 | if flashback is True: 69 | if isinstance(binlog_event, WriteRowsEvent): 70 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 71 | binlog_event.table, 72 | ' AND '.join(map(compare_items, row['values'].items())) 73 | ) 74 | values = map(fix_object, row['values'].values()) 75 | elif isinstance(binlog_event, DeleteRowsEvent): 76 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 77 | binlog_event.table, 78 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 79 | ', '.join(['%s'] * len(row['values'])) 80 | ) 81 | values = map(fix_object, row['values'].values()) 82 | elif isinstance(binlog_event, UpdateRowsEvent): 83 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 84 | binlog_event.table, 85 | ', '.join(['`%s`=%%s' % x for x in row['before_values'].keys()]), 86 | ' AND '.join(map(compare_items, row['after_values'].items()))) 87 | values = map(fix_object, list(row['before_values'].values())+list(row['after_values'].values())) 88 | else: 89 | if isinstance(binlog_event, WriteRowsEvent): 90 | if no_pk: 91 | # print binlog_event.__dict__ 92 | # tableInfo = (binlog_event.table_map)[binlog_event.table_id] 93 | # if tableInfo.primary_key: 94 | # row['values'].pop(tableInfo.primary_key) 95 | if binlog_event.primary_key: 96 | row['values'].pop(binlog_event.primary_key) 97 | 98 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 99 | binlog_event.table, 100 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 101 | ', '.join(['%s'] * len(row['values'])) 102 | ) 103 | values = map(fix_object, row['values'].values()) 104 | elif isinstance(binlog_event, DeleteRowsEvent): 105 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 106 | binlog_event.table, ' AND '.join(map(compare_items, row['values'].items()))) 107 | values = map(fix_object, row['values'].values()) 108 | elif isinstance(binlog_event, UpdateRowsEvent): 109 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 110 | binlog_event.table, 111 | ', '.join(['`%s`=%%s' % k for k in row['after_values'].keys()]), 112 | ' AND '.join(map(compare_items, row['before_values'].items())) 113 | ) 114 | values = map(fix_object, list(row['after_values'].values())+list(row['before_values'].values())) 115 | 116 | return {'template': template, 'values': list(values)} 117 | -------------------------------------------------------------------------------- /redshift/utils.py: -------------------------------------------------------------------------------- 1 | # Ripped off from https://github.com/danfengcao/binlog2sql 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import datetime 7 | 8 | if sys.version > '3': 9 | PY3PLUS = True 10 | else: 11 | PY3PLUS = False 12 | 13 | from 
pymysqlreplication.row_event import ( 14 | WriteRowsEvent, 15 | UpdateRowsEvent, 16 | DeleteRowsEvent, 17 | ) 18 | from pymysqlreplication.event import QueryEvent 19 | 20 | 21 | def compare_items(items): 22 | # caution: if v is NULL, may need to process 23 | (k, v) = items 24 | if v is None: 25 | return '`%s` IS %%s' % k 26 | else: 27 | return '`%s`=%%s' % k 28 | 29 | 30 | def fix_object(value): 31 | """Fixes python objects so that they can be properly inserted into SQL queries""" 32 | if isinstance(value, set): 33 | value = ','.join(value) 34 | if PY3PLUS and isinstance(value, bytes): 35 | return value.decode('utf-8') 36 | elif not PY3PLUS and isinstance(value, unicode): 37 | return value.encode('utf-8') 38 | else: 39 | return value 40 | 41 | def concat_sql_from_binlog_event(cursor, binlog_event, row=None, e_start_pos=None, flashback=False, no_pk=False): 42 | if flashback and no_pk: 43 | raise ValueError('only one of flashback or no_pk can be True') 44 | if not (isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) 45 | or isinstance(binlog_event, DeleteRowsEvent) or isinstance(binlog_event, QueryEvent)): 46 | raise ValueError('binlog_event must be WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent or QueryEvent') 47 | 48 | sql = '' 49 | if isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) \ 50 | or isinstance(binlog_event, DeleteRowsEvent): 51 | pattern = generate_sql_pattern(binlog_event, row=row, flashback=flashback, no_pk=no_pk) 52 | sql = cursor.mogrify(pattern['template'], pattern['values']) 53 | time = datetime.datetime.fromtimestamp(binlog_event.timestamp) 54 | #sql += ' #start %s end %s time %s' % (e_start_pos, binlog_event.packet.log_pos, time) 55 | elif flashback is False and isinstance(binlog_event, QueryEvent) and binlog_event.query != 'BEGIN' \ 56 | and binlog_event.query != 'COMMIT': 57 | if binlog_event.schema: 58 | sql = 'USE {0};\n'.format(binlog_event.schema) 59 | sql += '{0};'.format(fix_object(binlog_event.query)) 60 | 61 | return sql 62 | 63 | #removed schema: `schemaname`.`tablename` to just `tablename` 64 | #remove limt 1 from UPDATE and DELETE 65 | def generate_sql_pattern(binlog_event, row=None, flashback=False, no_pk=False): 66 | template = '' 67 | values = [] 68 | if flashback is True: 69 | if isinstance(binlog_event, WriteRowsEvent): 70 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 71 | binlog_event.table, 72 | ' AND '.join(map(compare_items, row['values'].items())) 73 | ) 74 | values = map(fix_object, row['values'].values()) 75 | elif isinstance(binlog_event, DeleteRowsEvent): 76 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 77 | binlog_event.table, 78 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 79 | ', '.join(['%s'] * len(row['values'])) 80 | ) 81 | values = map(fix_object, row['values'].values()) 82 | elif isinstance(binlog_event, UpdateRowsEvent): 83 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 84 | binlog_event.table, 85 | ', '.join(['`%s`=%%s' % x for x in row['before_values'].keys()]), 86 | ' AND '.join(map(compare_items, row['after_values'].items()))) 87 | values = map(fix_object, list(row['before_values'].values())+list(row['after_values'].values())) 88 | else: 89 | if isinstance(binlog_event, WriteRowsEvent): 90 | if no_pk: 91 | # print binlog_event.__dict__ 92 | # tableInfo = (binlog_event.table_map)[binlog_event.table_id] 93 | # if tableInfo.primary_key: 94 | # row['values'].pop(tableInfo.primary_key) 95 | if 
binlog_event.primary_key: 96 | row['values'].pop(binlog_event.primary_key) 97 | 98 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 99 | binlog_event.table, 100 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 101 | ', '.join(['%s'] * len(row['values'])) 102 | ) 103 | values = map(fix_object, row['values'].values()) 104 | elif isinstance(binlog_event, DeleteRowsEvent): 105 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 106 | binlog_event.table, ' AND '.join(map(compare_items, row['values'].items()))) 107 | values = map(fix_object, row['values'].values()) 108 | elif isinstance(binlog_event, UpdateRowsEvent): 109 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 110 | binlog_event.table, 111 | ', '.join(['`%s`=%%s' % k for k in row['after_values'].keys()]), 112 | ' AND '.join(map(compare_items, row['before_values'].items())) 113 | ) 114 | values = map(fix_object, list(row['after_values'].values())+list(row['before_values'].values())) 115 | 116 | return {'template': template, 'values': list(values)} 117 | -------------------------------------------------------------------------------- /snowflake/utils.py: -------------------------------------------------------------------------------- 1 | # Ripped off from https://github.com/danfengcao/binlog2sql 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import datetime 7 | 8 | if sys.version > '3': 9 | PY3PLUS = True 10 | else: 11 | PY3PLUS = False 12 | 13 | from pymysqlreplication.row_event import ( 14 | WriteRowsEvent, 15 | UpdateRowsEvent, 16 | DeleteRowsEvent, 17 | ) 18 | from pymysqlreplication.event import QueryEvent 19 | 20 | 21 | def compare_items(items): 22 | # caution: if v is NULL, may need to process 23 | (k, v) = items 24 | if v is None: 25 | return '`%s` IS %%s' % k 26 | else: 27 | return '`%s`=%%s' % k 28 | 29 | 30 | def fix_object(value): 31 | """Fixes python objects so that they can be properly inserted into SQL queries""" 32 | if isinstance(value, set): 33 | value = ','.join(value) 34 | if PY3PLUS and isinstance(value, bytes): 35 | return value.decode('utf-8') 36 | elif not PY3PLUS and isinstance(value, unicode): 37 | return value.encode('utf-8') 38 | else: 39 | return value 40 | 41 | def concat_sql_from_binlog_event(cursor, binlog_event, row=None, e_start_pos=None, flashback=False, no_pk=False): 42 | if flashback and no_pk: 43 | raise ValueError('only one of flashback or no_pk can be True') 44 | if not (isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) 45 | or isinstance(binlog_event, DeleteRowsEvent) or isinstance(binlog_event, QueryEvent)): 46 | raise ValueError('binlog_event must be WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent or QueryEvent') 47 | 48 | sql = '' 49 | if isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) \ 50 | or isinstance(binlog_event, DeleteRowsEvent): 51 | pattern = generate_sql_pattern(binlog_event, row=row, flashback=flashback, no_pk=no_pk) 52 | sql = cursor.mogrify(pattern['template'], pattern['values']) 53 | time = datetime.datetime.fromtimestamp(binlog_event.timestamp) 54 | #sql += ' #start %s end %s time %s' % (e_start_pos, binlog_event.packet.log_pos, time) 55 | elif flashback is False and isinstance(binlog_event, QueryEvent) and binlog_event.query != 'BEGIN' \ 56 | and binlog_event.query != 'COMMIT': 57 | if binlog_event.schema: 58 | sql = 'USE {0};\n'.format(binlog_event.schema) 59 | sql += '{0};'.format(fix_object(binlog_event.query)) 60 | 61 | return sql 62 | 63 | #removed schema: 
`schemaname`.`tablename` to just `tablename` 64 | #remove limt 1 from UPDATE and DELETE 65 | def generate_sql_pattern(binlog_event, row=None, flashback=False, no_pk=False): 66 | template = '' 67 | values = [] 68 | if flashback is True: 69 | if isinstance(binlog_event, WriteRowsEvent): 70 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 71 | binlog_event.table, 72 | ' AND '.join(map(compare_items, row['values'].items())) 73 | ) 74 | values = map(fix_object, row['values'].values()) 75 | elif isinstance(binlog_event, DeleteRowsEvent): 76 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 77 | binlog_event.table, 78 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 79 | ', '.join(['%s'] * len(row['values'])) 80 | ) 81 | values = map(fix_object, row['values'].values()) 82 | elif isinstance(binlog_event, UpdateRowsEvent): 83 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 84 | binlog_event.table, 85 | ', '.join(['`%s`=%%s' % x for x in row['before_values'].keys()]), 86 | ' AND '.join(map(compare_items, row['after_values'].items()))) 87 | values = map(fix_object, list(row['before_values'].values())+list(row['after_values'].values())) 88 | else: 89 | if isinstance(binlog_event, WriteRowsEvent): 90 | if no_pk: 91 | # print binlog_event.__dict__ 92 | # tableInfo = (binlog_event.table_map)[binlog_event.table_id] 93 | # if tableInfo.primary_key: 94 | # row['values'].pop(tableInfo.primary_key) 95 | if binlog_event.primary_key: 96 | row['values'].pop(binlog_event.primary_key) 97 | 98 | template = 'INSERT INTO `{0}`({1}) VALUES ({2});'.format( 99 | binlog_event.table, 100 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 101 | ', '.join(['%s'] * len(row['values'])) 102 | ) 103 | values = map(fix_object, row['values'].values()) 104 | elif isinstance(binlog_event, DeleteRowsEvent): 105 | template = 'DELETE FROM `{0}` WHERE {1};'.format( 106 | binlog_event.table, ' AND '.join(map(compare_items, row['values'].items()))) 107 | values = map(fix_object, row['values'].values()) 108 | elif isinstance(binlog_event, UpdateRowsEvent): 109 | template = 'UPDATE `{0}` SET {1} WHERE {2};'.format( 110 | binlog_event.table, 111 | ', '.join(['`%s`=%%s' % k for k in row['after_values'].keys()]), 112 | ' AND '.join(map(compare_items, row['before_values'].items())) 113 | ) 114 | values = map(fix_object, list(row['after_values'].values())+list(row['before_values'].values())) 115 | 116 | return {'template': template, 'values': list(values)} 117 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Ripped off from https://github.com/danfengcao/binlog2sql 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import datetime 7 | 8 | if sys.version > '3': 9 | PY3PLUS = True 10 | else: 11 | PY3PLUS = False 12 | 13 | from pymysqlreplication.row_event import ( 14 | WriteRowsEvent, 15 | UpdateRowsEvent, 16 | DeleteRowsEvent, 17 | ) 18 | from pymysqlreplication.event import QueryEvent 19 | 20 | 21 | def compare_items(items): 22 | # caution: if v is NULL, may need to process 23 | (k, v) = items 24 | if v is None: 25 | return '`%s` IS %%s' % k 26 | else: 27 | return '`%s`=%%s' % k 28 | 29 | 30 | def fix_object(value): 31 | """Fixes python objects so that they can be properly inserted into SQL queries""" 32 | if isinstance(value, set): 33 | value = ','.join(value) 34 | if PY3PLUS and isinstance(value, bytes): 35 | return value.decode('utf-8') 36 | elif not 
PY3PLUS and isinstance(value, unicode): 37 | return value.encode('utf-8') 38 | else: 39 | return value 40 | 41 | def concat_sql_from_binlog_event(cursor, binlog_event, row=None, e_start_pos=None, flashback=False, no_pk=False): 42 | if flashback and no_pk: 43 | raise ValueError('only one of flashback or no_pk can be True') 44 | if not (isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) 45 | or isinstance(binlog_event, DeleteRowsEvent) or isinstance(binlog_event, QueryEvent)): 46 | raise ValueError('binlog_event must be WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent or QueryEvent') 47 | 48 | sql = '' 49 | if isinstance(binlog_event, WriteRowsEvent) or isinstance(binlog_event, UpdateRowsEvent) \ 50 | or isinstance(binlog_event, DeleteRowsEvent): 51 | pattern = generate_sql_pattern(binlog_event, row=row, flashback=flashback, no_pk=no_pk) 52 | sql = cursor.mogrify(pattern['template'], pattern['values']) 53 | time = datetime.datetime.fromtimestamp(binlog_event.timestamp) 54 | #sql += ' #start %s end %s time %s' % (e_start_pos, binlog_event.packet.log_pos, time) 55 | elif flashback is False and isinstance(binlog_event, QueryEvent) and binlog_event.query != 'BEGIN' \ 56 | and binlog_event.query != 'COMMIT': 57 | if binlog_event.schema: 58 | sql = 'USE {0};\n'.format(binlog_event.schema) 59 | sql += '{0};'.format(fix_object(binlog_event.query)) 60 | 61 | return sql 62 | 63 | 64 | def generate_sql_pattern(binlog_event, row=None, flashback=False, no_pk=False): 65 | template = '' 66 | values = [] 67 | if flashback is True: 68 | if isinstance(binlog_event, WriteRowsEvent): 69 | template = 'DELETE FROM `{0}`.`{1}` WHERE {2} LIMIT 1;'.format( 70 | binlog_event.schema, binlog_event.table, 71 | ' AND '.join(map(compare_items, row['values'].items())) 72 | ) 73 | values = map(fix_object, row['values'].values()) 74 | elif isinstance(binlog_event, DeleteRowsEvent): 75 | template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});'.format( 76 | binlog_event.schema, binlog_event.table, 77 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 78 | ', '.join(['%s'] * len(row['values'])) 79 | ) 80 | values = map(fix_object, row['values'].values()) 81 | elif isinstance(binlog_event, UpdateRowsEvent): 82 | template = 'UPDATE `{0}`.`{1}` SET {2} WHERE {3} LIMIT 1;'.format( 83 | binlog_event.schema, binlog_event.table, 84 | ', '.join(['`%s`=%%s' % x for x in row['before_values'].keys()]), 85 | ' AND '.join(map(compare_items, row['after_values'].items()))) 86 | values = map(fix_object, list(row['before_values'].values())+list(row['after_values'].values())) 87 | else: 88 | if isinstance(binlog_event, WriteRowsEvent): 89 | if no_pk: 90 | # print binlog_event.__dict__ 91 | # tableInfo = (binlog_event.table_map)[binlog_event.table_id] 92 | # if tableInfo.primary_key: 93 | # row['values'].pop(tableInfo.primary_key) 94 | if binlog_event.primary_key: 95 | row['values'].pop(binlog_event.primary_key) 96 | 97 | template = 'INSERT INTO `{0}`.`{1}`({2}) VALUES ({3});'.format( 98 | binlog_event.schema, binlog_event.table, 99 | ', '.join(map(lambda key: '`%s`' % key, row['values'].keys())), 100 | ', '.join(['%s'] * len(row['values'])) 101 | ) 102 | values = map(fix_object, row['values'].values()) 103 | elif isinstance(binlog_event, DeleteRowsEvent): 104 | template = 'DELETE FROM `{0}`.`{1}` WHERE {2} LIMIT 1;'.format( 105 | binlog_event.schema, binlog_event.table, ' AND '.join(map(compare_items, row['values'].items()))) 106 | values = map(fix_object, row['values'].values()) 107 | elif 
107 |         elif isinstance(binlog_event, UpdateRowsEvent):
108 |             template = 'UPDATE `{0}`.`{1}` SET {2} WHERE {3} LIMIT 1;'.format(
109 |                 binlog_event.schema, binlog_event.table,
110 |                 ', '.join(['`%s`=%%s' % k for k in row['after_values'].keys()]),
111 |                 ' AND '.join(map(compare_items, row['before_values'].items()))
112 |             )
113 |             values = map(fix_object, list(row['after_values'].values())+list(row['before_values'].values()))
114 | 
115 |     return {'template': template, 'values': list(values)}
116 | 
--------------------------------------------------------------------------------
/snowflake/README.md:
--------------------------------------------------------------------------------
1 | # Streaming MySQL binlog replication to Snowflake
2 | [![](https://images.microbadger.com/badges/image/servicerocket/mysql2snowflake.svg)](https://hub.docker.com/r/servicerocket/mysql2snowflake/)
3 | 
4 | In Snowflake's web interface, run this query to create a database and table:
5 | ```sql
6 | CREATE OR REPLACE DATABASE testdb;
7 | USE DATABASE testdb;
8 | CREATE OR REPLACE TABLE testtbl(id integer, name string);
9 | ```
10 | 
11 | You can use the following Python script to generate a Snowflake-compatible schema:
12 | ```bash
13 | docker-compose exec python python mysql2snowsql.py
14 | ```
15 | 
16 | Clone this repo and update your Snowflake credentials in `example.env`, then:
17 | ```bash
18 | cd snowflake/
19 | mv example.env .env
20 | docker-compose up --build
21 | ```
22 | 
23 | > Note: If you get a `Can't connect to MySQL server on 'mysql' ([Errno 111] Connection refused)` error on the first run, try running it again.
24 | 
25 | In another terminal run:
26 | ```bash
27 | docker-compose exec mysql mysql -u root -pexample -e "DROP DATABASE IF EXISTS testdb; CREATE DATABASE testdb; USE testdb; CREATE TABLE testtbl (id int, name varchar(255)); INSERT INTO testtbl VALUES (1, 'hello'), (2, 'hola'), (3, 'zdravstvuy'), (1, 'bonjour'); UPDATE testtbl SET name = 'yolo' WHERE id = 1; UPDATE testtbl SET name = 'world' WHERE id = 3; DELETE FROM testtbl WHERE id = 1; SELECT * FROM testtbl;"
28 | ```
29 | 
30 | This will output the following to the terminal:
31 | ```
32 | +------+-------+
33 | | id   | name  |
34 | +------+-------+
35 | |    2 | hola  |
36 | |    3 | world |
37 | +------+-------+
38 | ```
39 | 
40 | The `docker-compose` logs should show something like this:
41 | ```sql
42 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hello', 1);
43 | python_1 | INSERT INTO testtbl(name, id) VALUES ('hola', 2);
44 | python_1 | INSERT INTO testtbl(name, id) VALUES ('zdravstvuy', 3);
45 | python_1 | INSERT INTO testtbl(name, id) VALUES ('bonjour', 1);
46 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='hello' AND id=1;
47 | python_1 | UPDATE testtbl SET name='yolo', id=1 WHERE name='bonjour' AND id=1;
48 | python_1 | UPDATE testtbl SET name='world', id=3 WHERE name='zdravstvuy' AND id=3;
49 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1;
50 | python_1 | DELETE FROM testtbl WHERE name='yolo' AND id=1;
51 | ```
52 | Executing these queries one by one is really slow; ideally we should batch them together (a minimal batching sketch follows).
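One way to cut down the per-statement round trips is to buffer the generated statements and send them to Snowflake in groups. The snippet below is only a sketch of that idea, not code from this repo: it assumes a `snowflake.connector` connection (whose `execute_string` method runs several semicolon-separated statements in one call), and the `SqlBatcher` name and batch size are illustrative.

```python
# Hypothetical batching helper -- a sketch, not part of this repo.
import snowflake.connector


class SqlBatcher:
    """Buffer generated SQL statements and flush them in groups."""

    def __init__(self, connection, batch_size=100):
        self.connection = connection  # snowflake.connector connection
        self.batch_size = batch_size  # illustrative default
        self.buffer = []

    def add(self, statement):
        self.buffer.append(statement)
        if len(self.buffer) >= self.batch_size:
            self.flush()

    def flush(self):
        if not self.buffer:
            return
        # execute_string() runs multiple semicolon-separated statements in one call
        self.connection.execute_string('\n'.join(self.buffer))
        self.buffer = []


# Usage sketch: feed it the statements produced from the binlog stream.
# conn = snowflake.connector.connect(account='...', user='...', password='...')
# batcher = SqlBatcher(conn)
# for sql in generated_statements:
#     batcher.add(sql)
# batcher.flush()
```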
53 | ![Snowflake History showing slow queries](https://i.imgur.com/iVXQ3Nx.png)
54 | 
55 | # References
56 | - https://www.alooma.com/blog/mysql-to-amazon-redshift-replication
57 | - https://aws.amazon.com/blogs/database/streaming-changes-in-a-database-with-amazon-kinesis/
58 | - https://github.com/danfengcao/binlog2sql
59 | - https://www.thegeekstuff.com/2017/08/mysqlbinlog-examples/
60 | 
61 | 
62 | 
63 | # Dumping MySQL database as CSV
64 | [Dumping tab files](https://dev.mysql.com/doc/refman/8.0/en/mysqldump-delimited-text.html) to an arbitrary directory can result in a `The MySQL server is running with the --secure-file-priv option so it cannot execute this statement` error. Instead, use the directory that has been configured in MySQL. [source](https://stackoverflow.com/questions/32737478/how-should-i-tackle-secure-file-priv-in-mysql)
65 | ```bash
66 | docker-compose exec mysql mysql -u root -pexample -e 'SHOW VARIABLES LIKE "secure_file_priv";'
67 | ```
68 | ```
69 | +------------------+-----------------------+
70 | | Variable_name    | Value                 |
71 | +------------------+-----------------------+
72 | | secure_file_priv | /var/lib/mysql-files/ |
73 | +------------------+-----------------------+
74 | ```
75 | 
76 | ```bash
77 | docker-compose exec mysql bash
78 | mysqldump -u root -pexample -T /var/lib/mysql-files/ --fields-terminated-by ',' --fields-enclosed-by '"' --fields-escaped-by '\' --no-create-info testdb
79 | ```
80 | 
81 | > Note: mysqldump uses \N to represent NULL. However, a NULL value is typically represented by two successive delimiters (e.g. `,,`) to indicate that the field contains no data. [source](https://docs.snowflake.net/manuals/user-guide/data-unload-considerations.html#empty-strings-and-null-values)
82 | 
83 | ## Preprocessing CSV
84 | 
85 | ```bash
86 | cd /var/lib/mysql-files/
87 | # Remove sql files
88 | find . -type f -name "*.sql" -print0 | xargs -0 rm
89 | 
90 | # Fix boolean types. HACKY! I don't know how else to do this
91 | find . -type f -name "*.txt" -print0 | xargs -0 sed -i 's/\"\x01\"/\"1\"/g'
92 | find . -type f -name "*.txt" -print0 | xargs -0 sed -i 's/\"\x00\"/\"0\"/g'
93 | 
94 | # Fix invalid dates to epoch time
95 | find . -type f -name "*.txt" -print0 | xargs -0 sed -i 's/\"0000-00-00 00:00:00\"/\"1970-01-01 00:00:00\"/g'
96 | ```
97 | 
98 | # Generating Snowflake schema
99 | 
100 | ```bash
101 | docker-compose exec python python mysql2snowsql.py
102 | ```
103 | 
104 | ```sql
105 | /* TABLE: testtbl */
106 | CREATE OR REPLACE TABLE testtbl (
107 | id NUMBER ,
108 | name STRING,
109 | )
110 | ```
111 | 
112 | # Loading CSV into Snowflake
113 | 
114 | Copy data to S3:
115 | ```bash
116 | aws s3 cp --recursive /var/lib/mysql-files/ s3://bucket-name/directory-name
117 | ```
118 | 
119 | Create [AWS IAM User Credentials](https://docs.snowflake.net/manuals/user-guide/data-load-s3-config.html#option-1-configuring-aws-iam-user-credentials) and use them to create a `Stage` (think of it as a data source).
120 | 
121 | 
122 | ```sql
123 | CREATE STAGE "TESTDB"."PUBLIC".S3STAGE
124 |   URL = 's3://bucket-name'
125 |   CREDENTIALS = (AWS_KEY_ID = 'abcde' AWS_SECRET_KEY = 'xxxxx');
126 | ```
127 | 
128 | For convenience, create a `File Format` to specify how Snowflake should parse the CSV.
129 | ```sql
130 | CREATE FILE FORMAT "TESTDB"."PUBLIC".MYSQLDUMPCSV
131 |   TYPE = 'CSV'
132 |   COMPRESSION = 'NONE'
133 |   FIELD_DELIMITER = ','
134 |   RECORD_DELIMITER = '\n'
135 |   SKIP_HEADER = 0
136 |   FIELD_OPTIONALLY_ENCLOSED_BY = '\042'
137 |   TRIM_SPACE = FALSE
138 |   ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE
139 |   ESCAPE = '\134'
140 |   ESCAPE_UNENCLOSED_FIELD = 'NONE'
141 |   DATE_FORMAT = 'AUTO'
142 |   TIMESTAMP_FORMAT = 'AUTO'
143 |   NULL_IF = ('\\N');
144 | ```
145 | 
146 | Finally, load data from S3 into Snowflake
147 | 
148 | ```sql
149 | COPY INTO "TESTDB"."PUBLIC"."TESTTBL"
150 |   FROM '@"TESTDB"."PUBLIC"."S3STAGE"/directory-name/testtbl.txt'
151 |   FILE_FORMAT = '"TESTDB"."PUBLIC"."MYSQLDUMPCSV"'
152 |   ON_ERROR = 'ABORT_STATEMENT'
153 |   PURGE = FALSE;
154 | ```
155 | 
156 | # Reference
157 | - https://docs.snowflake.net/manuals/user-guide/data-load-s3.html
158 | - https://docs.snowflake.net/manuals/user-guide/data-load-s3-create-stage.html
159 | - https://docs.snowflake.net/manuals/user-guide/data-load-s3-copy.html
--------------------------------------------------------------------------------
/redshift/mysql2redshift.py:
--------------------------------------------------------------------------------
1 | # Ripped off from https://github.com/lanyrd/mysql-postgresql-converter/blob/master/db_converter.py
2 | 
3 | """
4 | Fixes a MySQL dump made with the right format so it can be directly
5 | imported to a new PostgreSQL database.
6 | 
7 | Dump using:
8 | mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename
9 | """
10 | 
11 | import re
12 | import sys
13 | import os
14 | import time
15 | import subprocess
16 | 
17 | 
18 | def parse(input_filename, output_filename):
19 |     "Feed it a file, and it'll output a fixed one"
20 | 
21 |     # State storage
22 |     if input_filename == "-":
23 |         num_lines = -1
24 |     else:
25 |         num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0])
26 |     tables = {}
27 |     current_table = None
28 |     creation_lines = []
29 |     enum_types = []
30 |     foreign_key_lines = []
31 |     fulltext_key_lines = []
32 |     sequence_lines = []
33 |     cast_lines = []
34 |     num_inserts = 0
35 |     started = time.time()
36 | 
37 |     # Open output file and write header. Logging file handle will be stdout
38 |     # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU.
39 |     if output_filename == "-":
40 |         output = sys.stdout
41 |         logging = open(os.devnull, "w")
42 |     else:
43 |         output = open(output_filename, "w")
44 |         logging = sys.stdout
45 | 
46 |     if input_filename == "-":
47 |         input_fh = sys.stdin
48 |     else:
49 |         input_fh = open(input_filename)
50 | 
51 | 
52 |     output.write("-- Converted by db_converter\n")
53 |     output.write("START TRANSACTION;\n")
54 |     #output.write("SET standard_conforming_strings=off;\n")
55 |     #output.write("SET escape_string_warning=off;\n")
56 |     #output.write("SET CONSTRAINTS ALL DEFERRED;\n\n")
57 | 
58 |     for i, line in enumerate(input_fh):
59 |         time_taken = time.time() - started
60 |         percentage_done = (i+1) / float(num_lines)
61 |         secs_left = (time_taken / percentage_done) - time_taken
62 |         logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % (
63 |             i + 1,
64 |             num_lines,
65 |             ((i+1)/float(num_lines))*100,
66 |             len(tables),
67 |             num_inserts,
68 |             secs_left // 60,
69 |             secs_left % 60,
70 |         ))
71 |         logging.flush()
72 |         line = line.strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\")
73 | 
74 |         line = line.replace("`", '"').replace('b\'0\'', '0').replace('b\'1\'', '1').replace('0000-00-00 00:00:00', '1970-01-01 00:00:00')
75 | 
76 |         # Ignore comment lines
77 |         if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line:
78 |             continue
79 | 
80 |         # Outside of anything handling
81 |         if current_table is None:
82 |             # Start of a table creation statement?
83 |             if line.startswith("CREATE TABLE"):
84 |                 current_table = line.split('"')[1]
85 |                 tables[current_table] = {"columns": []}
86 |                 creation_lines = []
87 |             # Inserting data into a table?
88 |             elif line.startswith("INSERT INTO"):
89 |                 output.write(line.replace("'0000-00-00 00:00:00'", "NULL") + "\n")
90 |                 num_inserts += 1
91 |             # ???
92 |             else:
93 |                 print("\n ! Unknown line in main body: %s" % line)
94 | 
95 |         # Inside-create-statement handling
96 |         else:
97 |             # Is it a column?
98 |             if line.startswith('"'):
99 |                 useless, name, definition = line.strip(",").split('"',2)
100 |                 try:
101 |                     type, extra = definition.strip().split(" ", 1)
102 | 
103 |                     # This must be a tricky enum
104 |                     if ')' in extra:
105 |                         type, extra = definition.strip().split(")")
106 | 
107 |                 except ValueError:
108 |                     type = definition.strip()
109 |                     extra = ""
110 |                 extra = re.sub(r"CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", ""))
111 |                 extra = re.sub(r"COLLATE [\w\d]+\s*", "", extra.replace("unsigned", ""))
112 | 
113 |                 # See if it needs type conversion
114 |                 final_type = None
115 |                 set_sequence = None
116 |                 if type.startswith("tinyint("):
117 |                     type = "int4"
118 |                     set_sequence = True
119 |                     final_type = "boolean"
120 |                 elif type.startswith("int("):
121 |                     type = "integer"
122 |                     set_sequence = True
123 |                 elif type.startswith("bigint("):
124 |                     type = "bigint"
125 |                     set_sequence = True
126 |                 elif type == "longtext":
127 |                     type = "text"
128 |                 elif type == "mediumtext":
129 |                     type = "text"
130 |                 elif type == "tinytext":
131 |                     type = "text"
132 |                 elif type.startswith("varchar("):
133 |                     size = int(type.split("(")[1].rstrip(")"))
134 |                     type = "varchar(%s)" % (size * 2)
135 |                 elif type.startswith("smallint("):
136 |                     type = "int2"
137 |                     set_sequence = True
138 |                 elif type == "datetime":
139 |                     type = "timestamp with time zone"
140 |                 elif type == "double":
141 |                     type = "double precision"
142 |                 elif type.endswith("blob"):
143 |                     type = "text"
144 |                 elif type.startswith("bit(1)"):
145 |                     type = "boolean"
146 |                 elif type.startswith("enum(") or type.startswith("set("):
147 | 
148 |                     types_str = type.split("(")[1].rstrip(")").rstrip('"')
149 |                     types_arr = [type_str.strip('\'') for type_str in types_str.split(",")]
150 | 
151 |                     # Considered using values to make a name, but it's dodgy
152 |                     # enum_name = '_'.join(types_arr)
153 |                     enum_name = "{0}_{1}".format(current_table, name)
154 | 
155 |                     if enum_name not in enum_types:
156 |                         output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str))
157 |                         enum_types.append(enum_name)
158 | 
159 |                     type = enum_name
160 | 
161 |                 if final_type:
162 |                     cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT, ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, name, final_type, name, final_type))
163 |                 # ID fields need sequences [if they are integers?]
164 |                 if name == "id" and set_sequence is True:
165 |                     sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table))
166 |                     sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table))
167 |                     sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table))
168 |                 # Record it
169 |                 creation_lines.append('"%s" %s %s' % (name, type, extra))
170 |                 tables[current_table]['columns'].append((name, type, extra))
171 |             # Is it a constraint or something?
172 |             elif line.startswith("PRIMARY KEY"):
173 |                 creation_lines.append(line.rstrip(","))
174 |             elif line.startswith("CONSTRAINT"):
175 |                 foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(",")))
176 |                 #foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(",")))
177 |             elif line.startswith("UNIQUE KEY"):
178 |                 creation_lines.append("UNIQUE (%s)" % line.split("(")[1].split(")")[0])
179 |             elif line.startswith("FULLTEXT KEY"):
180 | 
181 |                 fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') )
182 |                 #fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys))
183 | 
184 |             elif line.startswith("KEY"):
185 |                 pass
186 |             # Is it the end of the table?
187 |             elif line == ");":
188 |                 output.write("DROP TABLE IF EXISTS \"%s\" CASCADE;\n" % current_table)
189 |                 output.write("CREATE TABLE \"%s\" (\n" % current_table)
190 |                 for i, line in enumerate(creation_lines):
191 |                     output.write("    %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else ""))
192 |                 output.write(');\n\n')
193 |                 current_table = None
194 |             # ???
195 |             else:
196 |                 print("\n ! Unknown line inside table creation: %s" % line)
197 | 
198 | 
199 |     # Finish file
200 |     output.write("\n-- Post-data save --\n")
201 |     output.write("COMMIT;\n")
202 |     output.write("START TRANSACTION;\n")
203 | 
204 |     # Write typecasts out
205 |     #output.write("\n-- Typecasts --\n")
206 |     #for line in cast_lines:
207 |     #    output.write("%s;\n" % line)
208 | 
209 |     # We need this because Redshift uses foreign key constraints as planning hints for query execution [source](https://docs.aws.amazon.com/redshift/latest/dg/t_Defining_constraints.html)
210 |     # Write FK constraints out
211 |     output.write("\n-- Foreign keys --\n")
212 |     for line in foreign_key_lines:
213 |         for i in [" ON DELETE NO ACTION", " ON UPDATE NO ACTION", " ON DELETE CASCADE"]:
214 |             line = line.replace(i, "")
215 |         output.write("%s;\n" % line)
216 | 
217 |     # Write sequences out
218 |     #output.write("\n-- Sequences --\n")
219 |     #for line in sequence_lines:
220 |     #    output.write("%s;\n" % line)
221 | 
222 |     # Write full-text index keys out
223 |     output.write("\n-- Full Text keys --\n")
224 |     for line in fulltext_key_lines:
225 |         output.write("%s;\n" % line)
226 | 
227 |     # Finish file
228 |     output.write("\n")
229 |     output.write("COMMIT;\n")
230 |     print()
231 | 
232 | 
233 | if __name__ == "__main__":
234 |     parse(sys.argv[1], sys.argv[2])
--------------------------------------------------------------------------------
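To try the converter end to end, dump a database with the `mysqldump` flags shown in the module docstring above and feed the dump file to `parse`. The call below is a minimal usage sketch, assuming the dump was saved as `testdb.mysql`; the file names are illustrative, not part of the repo.

```python
# Usage sketch for redshift/mysql2redshift.py (file names are hypothetical).
# Equivalent CLI: python mysql2redshift.py testdb.mysql testdb.psql
from mysql2redshift import parse

# Reads testdb.mysql and writes PostgreSQL/Redshift-friendly SQL to testdb.psql,
# printing progress to stdout along the way.
parse("testdb.mysql", "testdb.psql")
```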