├── clickhouse_mysql ├── pool │ ├── __init__.py │ ├── pool.py │ └── bbpool.py ├── converter │ ├── __init__.py │ ├── csvreadconverter.py │ ├── converter.py │ ├── csvwriteconverter.py │ └── chwriteconverter.py ├── dbclient │ ├── __init__.py │ ├── chclient.py │ └── mysqlclient.py ├── event │ ├── __init__.py │ └── event.py ├── reader │ ├── __init__.py │ ├── reader.py │ ├── csvreader.py │ └── mysqlreader.py ├── writer │ ├── __init__.py │ ├── poolwriter.py │ ├── writer.py │ ├── processwriter.py │ ├── chcsvwriter.py │ ├── chwriter.py │ └── csvwriter.py ├── __init__.py ├── util.py ├── pumper.py ├── observable.py ├── objectbuilder.py ├── daemon.py ├── main.py ├── tablemigrator.py ├── tableprocessor.py ├── tablesqlbuilder.py ├── config.py └── clioptions.py ├── TODO.txt ├── clickhouse-mysql.conf ├── package_distr_source.sh ├── package_distr_wheels.sh ├── README.md ├── package_publish.sh ├── dev_run_plain.sh ├── clickhouse_mysql_converter └── chwritedataconverter.py ├── dev_run_config_file.sh ├── package_distr_deb.sh ├── .gitignore ├── clickhouse_mysql_examples ├── airline_ontime_data_mysql_to_ch_migration.sh ├── datatypes.sh ├── airline_ontime_mysql_data_import.sh ├── airline_ontime_data_mysql_to_ch_reader.sh ├── airline_ontime_data_download.sh ├── airline_ontime_schema_ch.sql └── airline_ontime_schema_mysql.sql ├── package_clear_old.sh ├── pack ├── build.sh └── clickhouse-mysql.spec ├── package_distr_rpm.sh ├── LICENSE ├── dev_run_cli_options.sh ├── docs └── usage-references.md ├── CHANGELOG.md ├── dev_run_cli_options_local_table_migration.sh ├── clickhouse_mysql.etc └── clickhouse-mysql.conf ├── setup.py └── clickhouse_mysql.init.d ├── clickhouse-mysql.service └── clickhouse-mysql /clickhouse_mysql/pool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clickhouse_mysql/converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clickhouse_mysql/dbclient/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clickhouse_mysql/event/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clickhouse_mysql/reader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | Row mapper/processor for each row 2 | 3 | -------------------------------------------------------------------------------- /clickhouse-mysql.conf: -------------------------------------------------------------------------------- 1 | clickhouse_mysql.etc/clickhouse-mysql.conf -------------------------------------------------------------------------------- /package_distr_source.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./package_clear_old.sh 4 | 5 | python3 
setup.py sdist 6 | -------------------------------------------------------------------------------- /package_distr_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./package_clear_old.sh 4 | 5 | python3 setup.py bdist_wheel 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clickhouse-mysql-data-reader 2 | 3 | - [Manual](docs/manual.md) 4 | - [Usage examples](docs/usage-references.md) 5 | -------------------------------------------------------------------------------- /package_publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "###########################" 4 | echo "### Publish from dist/* ###" 5 | echo "###########################" 6 | 7 | echo "Going to publish:" 8 | for FILE in $(ls dist/*); do 9 | echo " $FILE" 10 | done 11 | 12 | twine upload dist/* 13 | -------------------------------------------------------------------------------- /clickhouse_mysql/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from .main import Main 5 | 6 | 7 | def main(): 8 | """Entry point for the application script""" 9 | main = Main() 10 | main.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | main() 15 | -------------------------------------------------------------------------------- /dev_run_plain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ugly stub to suppress unsufficient sockets 4 | #sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 5 | 6 | # run data reader with specified Python version 7 | 8 | PYTHON="python3" 9 | 10 | CH_MYSQL="-m clickhouse_mysql.main" 11 | 12 | if [ ! -d "clickhouse_mysql" ]; then 13 | # no clickhouse_mysql dir available - step out of examples dir 14 | cd .. 15 | fi 16 | 17 | $PYTHON $CH_MYSQL ${*:1} 18 | -------------------------------------------------------------------------------- /clickhouse_mysql_converter/chwritedataconverter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from chwriteconverter import CHWriteConverter 5 | import datetime 6 | 7 | class CHWriteDataConverter(CHWriteConverter): 8 | def column(self, column, value): 9 | 10 | if column == 'day': 11 | _datetime = datetime.datetime.strptime(value, '%Y-%m-%d') 12 | _date = _datetime.date() 13 | return _date 14 | 15 | return super().column(column, value) 16 | -------------------------------------------------------------------------------- /dev_run_config_file.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ugly stub to suppress unsufficient sockets 4 | #sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 5 | 6 | # run data reader with specified Python version 7 | 8 | PYTHON="python3" 9 | 10 | CH_MYSQL="-m clickhouse_mysql.main" 11 | 12 | if [ ! -d "clickhouse_mysql" ]; then 13 | # no clickhouse_mysql dir available - step out of examples dir 14 | cd .. 
15 | fi 16 | 17 | $PYTHON $CH_MYSQL --config-file=clickhouse_mysql.etc/clickhouse-mysql.conf ${*:1} 18 | -------------------------------------------------------------------------------- /package_distr_deb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./package_clear_old.sh 4 | 5 | echo "##########################" 6 | echo "### Build deb packages ###" 7 | echo "##########################" 8 | 9 | python3 setup.py --command-packages=stdeb.command bdist_deb 10 | 11 | echo "" 12 | echo "" 13 | echo "" 14 | echo "############################" 15 | echo "### Results - .deb files ###" 16 | echo "############################" 17 | ls -la ./deb_dist/*.deb 18 | 19 | # pypi stdeb 20 | # apt install python3-all python3-stdeb 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | sdist 8 | deb_dist 9 | build 10 | eggs 11 | parts 12 | bin 13 | var 14 | develop-eggs 15 | .installed.cfg 16 | 17 | # Installer logs 18 | pip-log.txt 19 | 20 | # Unit test / coverage reports 21 | .coverage 22 | .tox 23 | 24 | # Translations 25 | *.mo 26 | 27 | #Mr Developer 28 | .mr.developer.cfg 29 | 30 | # Doc 31 | _build 32 | 33 | # Text Editor Backupfile 34 | *~ 35 | 36 | # Intellij IDE 37 | .idea 38 | *.xml 39 | *.iml 40 | 41 | # Nose 42 | .noseids 43 | 44 | # Pyenv 45 | .python-version 46 | -------------------------------------------------------------------------------- /clickhouse_mysql/converter/csvreadconverter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from clickhouse_mysql.converter.converter import Converter 5 | import ast 6 | 7 | 8 | class CSVReadConverter(Converter): 9 | 10 | def row(self, row): 11 | if row is None: 12 | return None 13 | 14 | for column in row: 15 | if row[column] == '': 16 | row[column] = None 17 | # else: 18 | # try: 19 | # event.row[column] = ast.literal_eval(event.row[column]) 20 | # except: 21 | # pass 22 | return row 23 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_migration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # migrate data from MySQL to ClickHouse 3 | 4 | # put csv files in this dir 5 | CSV_FILES_DIR="/var/lib/mysql-files" 6 | 7 | # dump CSV files into CSV_FILES_DIR 8 | sudo mysqldump \ 9 | -u root \ 10 | --tz-utc \ 11 | --quick \ 12 | --fields-terminated-by=, \ 13 | --fields-optionally-enclosed-by=\" \ 14 | --fields-escaped-by=\\ \ 15 | --tab="$CSV_FILES_DIR"/ \ 16 | airline ontime 17 | 18 | # replay CSV files from CSV_FILES_DIR 19 | sudo cat "$CSV_FILES_DIR"/ontime.txt | clickhouse-client --query="INSERT INTO airline.ontime FORMAT CSV" 20 | -------------------------------------------------------------------------------- /package_clear_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of items (files and folders) to be deleted. 
4 | # These items are package-related 5 | ITEMS_TO_DEL=" 6 | build 7 | dist 8 | clickhouse_mysql.egg-info 9 | deb_dist 10 | " 11 | 12 | echo "########################################" 13 | echo "### Clear all build and release data ###" 14 | echo "########################################" 15 | 16 | echo "About to delete:" 17 | DEL="" 18 | for ITEM in ${ITEMS_TO_DEL}; do 19 | echo " ${ITEM}" 20 | DEL="${DEL} ${ITEM}" 21 | done 22 | 23 | if [[ -z "${DEL}" ]]; then 24 | echo "No items to delete" 25 | else 26 | echo "rm -rf ${DEL}" 27 | rm -rf ${DEL} 28 | fi 29 | -------------------------------------------------------------------------------- /pack/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | SOURCE_ROOT_DIR="$( cd "$( dirname $( dirname "${BASH_SOURCE[0]}" ) )" && pwd )" 6 | 7 | RPMBUILD_DIR="$SOURCE_ROOT_DIR/build/bdist.linux-x86_64/rpm/" 8 | 9 | TMP_DIR="$RPMBUILD_DIR/TMP" 10 | 11 | RPMMACROS=$(echo '%_topdir '"$RPMBUILD_DIR"' 12 | %_tmppath '"$TMP_DIR") 13 | echo "$RPMMACROS" > ~/.rpmmacros 14 | 15 | SPEC_FILE="$SOURCE_ROOT_DIR/pack/clickhouse-mysql.spec" 16 | 17 | VERSION=$(cat "$SOURCE_ROOT_DIR/setup.py" | grep 'version=' | grep -o "'.*'" | grep -o "[^']" | tr -d '\n') 18 | echo "VERSION=$VERSION" 19 | 20 | function mkdirs() 21 | { 22 | mkdir -p "$RPMBUILD_DIR"/{BUILD,BUILDROOT,RPMS,SOURCES,SPECS,SRPMS} 23 | mkdir -p "$TMP_DIR" 24 | } 25 | 26 | mkdirs 27 | 28 | rpmbuild -ba $SPEC_FILE 29 | 30 | -------------------------------------------------------------------------------- /clickhouse_mysql/reader/reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from clickhouse_mysql.observable import Observable 5 | 6 | 7 | class Reader(Observable): 8 | """Read data from source and notify observers""" 9 | 10 | converter = None 11 | 12 | event_handlers = { 13 | # called on each WriteRowsEvent 14 | 'WriteRowsEvent': [], 15 | 16 | # called on each row inside WriteRowsEvent (thus can be called multiple times per WriteRowsEvent) 17 | 'WriteRowsEvent.EachRow': [], 18 | 19 | # called when Reader has no data to read 20 | 'ReaderIdleEvent': [], 21 | } 22 | 23 | def __init__(self, converter=None, callbacks={}): 24 | self.converter = converter 25 | self.subscribe(callbacks) 26 | 27 | def read(self): 28 | pass 29 | -------------------------------------------------------------------------------- /package_distr_rpm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./package_clear_old.sh 4 | 5 | echo "##########################" 6 | echo "### Build RPM packages ###" 7 | echo "##########################" 8 | 9 | python3 setup.py bdist_rpm --packager="Vladislav Klimenko " 10 | # --spec-only 11 | 12 | echo "" 13 | echo "" 14 | echo "" 15 | echo "######################################" 16 | echo "### Results - .spec and .rpm files ###" 17 | echo "######################################" 18 | ls -la ./build/bdist.linux-x86_64/rpm/SPECS/*.spec 19 | ls -la ./dist/*.rpm 20 | 21 | # build RPMs with 22 | # rpmbuild -ba ./build/bdist.linux-x86_64/rpm/SPECS/clickhouse-mysql.spec 23 | 24 | # https://docs.python.org/2.0/dist/creating-rpms.html 25 | # for deb-based distro 26 | # sudo apt install rpm 27 | 28 | # 1. install python3 29 | # ensure python3 is available or run sudo ln -s python3.6 /usr/bin/python3 30 | # 2. 
install sudo yum install rpm-build -------------------------------------------------------------------------------- /clickhouse_mysql/converter/converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from clickhouse_mysql.event.event import Event 7 | 8 | 9 | class Converter(object): 10 | 11 | column_skip = [] 12 | 13 | def __init__(self, column_skip): 14 | logging.debug("Converter __init__()") 15 | self.column_skip = [] if column_skip is None else column_skip 16 | logging.debug(self.column_skip) 17 | 18 | def row(self, row): 19 | return row 20 | 21 | def rows(self, rows): 22 | if rows is None: 23 | return None 24 | 25 | res = [] 26 | for row in rows: 27 | res.append(self.row(row)) 28 | 29 | return res 30 | 31 | def convert(self, event_or_row): 32 | if isinstance(event_or_row, Event): 33 | return event_or_row.convert(self) 34 | else: 35 | return self.row(event_or_row) 36 | -------------------------------------------------------------------------------- /clickhouse_mysql/pool/pool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Pool(object): 6 | 7 | writer_builder = None 8 | key_builder = None 9 | 10 | key_generator = None 11 | 12 | max_bucket_size = None 13 | max_belt_size = None 14 | max_interval_between_rotations = None 15 | 16 | def __init__( 17 | self, 18 | writer_builder=None, 19 | key_builder=None, 20 | max_bucket_size=10000, 21 | max_belt_size=1, 22 | max_interval_between_rotations=60, 23 | ): 24 | self.writer_builder = writer_builder 25 | self.key_builder = key_builder 26 | self.key_generator = self.key_builder.get() 27 | 28 | self.max_bucket_size = max_bucket_size 29 | self.max_belt_size = max_belt_size 30 | self.max_interval_between_rotations = max_interval_between_rotations 31 | 32 | def insert(self, item): 33 | pass 34 | 35 | def flush(self, key=None): 36 | pass 37 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/datatypes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 4 | 5 | PYTHON="" 6 | PYTHON="python3.6" 7 | PYTHON="python3" 8 | PYTHON="/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy" 9 | 10 | CH_MYSQL="/usr/bin/clickhouse-mysql" 11 | CH_MYSQL="-m clickhouse_mysql.main" 12 | 13 | $PYTHON $CH_MYSQL ${*:1} \ 14 | --src-resume \ 15 | --src-wait \ 16 | --src-host=127.0.0.1 \ 17 | --src-user=reader \ 18 | --src-password=qwerty \ 19 | --dst-host=192.168.74.251 \ 20 | --csvpool \ 21 | --csvpool-file-path-prefix=qwe_ \ 22 | --column-default-value \ 23 | date_1=2000-01-01 \ 24 | datetime_1=2000-01-01\ 01:02:03 \ 25 | time_1=2001-01-01\ 01:02:03 \ 26 | timestamp_1=2002-01-01\ 01:02:03 \ 27 | --mempool-max-flush-interval=600 \ 28 | --mempool-max-events-num=900000 \ 29 | --pump-data 30 | 31 | # --mempool 32 | # --mempool-max-events-num=3 33 | # --mempool-max-flush-interval=30 34 | # --dst-file=dst.csv 35 | # --dst-schema=db 36 | # --dst-table=datatypes 37 | # --csvpool-keep-files 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Altinity 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dev_run_cli_options.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ugly stub to suppress unsufficient sockets 4 | #sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 5 | 6 | # run data reader with specified Python version 7 | 8 | PYTHON="python3" 9 | 10 | CH_MYSQL="-m clickhouse_mysql.main" 11 | 12 | if [ ! -d "clickhouse_mysql" ]; then 13 | # no clickhouse_mysql dir available - step out of examples dir 14 | cd .. 15 | fi 16 | 17 | $PYTHON $CH_MYSQL ${*:1} \ 18 | --src-server-id=1 \ 19 | --src-resume \ 20 | --src-wait \ 21 | --nice-pause=1 \ 22 | --log-level=debug \ 23 | --src-host=127.0.0.1 \ 24 | --src-user=reader \ 25 | --src-password=qwerty \ 26 | --src-tables-prefixes=log_ \ 27 | --dst-host=127.0.0.1 \ 28 | --dst-table=logunified \ 29 | --csvpool \ 30 | --csvpool-file-path-prefix=qwe_ \ 31 | --mempool-max-flush-interval=60 \ 32 | --mempool-max-events-num=10000 \ 33 | --pump-data 34 | 35 | # --log-file=ontime.log \ 36 | # --mempool 37 | # --mempool-max-events-num=3 38 | # --mempool-max-flush-interval=30 39 | # --dst-file=dst.csv 40 | # --dst-schema=db 41 | # --dst-table=datatypes 42 | # --csvpool-keep-files 43 | # --log-level=info \ 44 | -------------------------------------------------------------------------------- /clickhouse_mysql/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import logging 6 | import pprint 7 | import sys 8 | import importlib.util 9 | 10 | 11 | class Util(object): 12 | 13 | @staticmethod 14 | def join_lists(*args): 15 | res = [] 16 | for l in args: 17 | if isinstance(l, list): 18 | res += l 19 | 20 | return res 21 | 22 | @staticmethod 23 | def log_row(row, header="log row"): 24 | log_row = header + "\n"; 25 | if isinstance(row, dict): 26 | for column, value in row.items(): 27 | log_row += "column: {}={}\n".format(column, value) 28 | else: 29 | for value in row: 30 | log_row += "value: {}\n".format(value) 31 | logging.info(log_row) 32 | 33 | @staticmethod 34 | def class_from_file(file_name, class_name): 35 | logging.info("sys.path") 36 | logging.info(pprint.pformat(sys.path)) 37 | spec = importlib.util.spec_from_file_location("file_module", file_name) 38 | module = importlib.util.module_from_spec(spec) 39 | spec.loader.exec_module(module) 40 | _class = getattr(module, class_name) 41 | 
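        # getattr() above pulled the requested class object out of the freshly loaded module; hand it back to the caller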
return _class 42 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_mysql_data_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # import airline.ontime test dataset into MySQL 3 | 4 | # looking for csv files in this dir 5 | #FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" 6 | FILES_TO_IMPORT_DIR="csv" 7 | 8 | # how many files to skip from the beginning of the list 9 | FILES_TO_SKIP_NUM=0 10 | 11 | # how many files to import 12 | FILES_TO_IMPORT_NUM=10 13 | 14 | # which file would be the first to import 15 | FILE_TO_START_IMPORT_FROM=$((FILES_TO_SKIP_NUM+1)) 16 | 17 | i=1 18 | for file in $(ls "$FILES_TO_IMPORT_DIR"/*.csv|sort|tail -n +"$FILE_TO_START_IMPORT_FROM"|head -n "$FILES_TO_IMPORT_NUM"); do 19 | echo "$i. Prepare. Make link to $file" 20 | rm -f ontime 21 | ln -s $file ontime 22 | 23 | echo "$i. Import. $file" 24 | time mysqlimport \ 25 | --ignore-lines=1 \ 26 | --fields-terminated-by=, \ 27 | --fields-enclosed-by=\" \ 28 | --local \ 29 | -u root \ 30 | airline ontime 31 | 32 | #--local reads files locally on the client host, bot on the server 33 | #--lock-tables Lock all tables for writing before processing any text files. This ensures that all tables are synchronized on the server. 34 | 35 | echo "$i. Cleanup. $file" 36 | rm -f ontime 37 | 38 | i=$((i+1)) 39 | done 40 | -------------------------------------------------------------------------------- /clickhouse_mysql/converter/csvwriteconverter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from clickhouse_mysql.converter.converter import Converter 7 | 8 | 9 | class CSVWriteConverter(Converter): 10 | 11 | # default values for columns - dict 12 | defaults = None 13 | 14 | def __init__(self, defaults=None, column_skip=None): 15 | logging.debug("CSVWriteConverter __init__()") 16 | self.defaults = [] if defaults is None else defaults 17 | super().__init__(column_skip=column_skip) 18 | 19 | def row(self, row): 20 | if row is None: 21 | return None 22 | 23 | # replace empty columns with default values (for default we have) 24 | for column in row: 25 | if (row[column] is None) and (column in self.defaults): 26 | row[column] = self.defaults[column] 27 | 28 | # delete columns according to the list of columns to skip 29 | for column in self.column_skip: 30 | if column in row: 31 | row.pop(column) 32 | 33 | return row 34 | 35 | def convert(self, event_or_row): 36 | # nothing to convert 37 | if not self.defaults and not self.column_skip: 38 | return event_or_row 39 | 40 | # have some convert setup 41 | return super().convert(event_or_row) 42 | -------------------------------------------------------------------------------- /clickhouse_mysql/pumper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Pumper(object): 6 | """ 7 | Pump data - read data from reader and push into writer 8 | """ 9 | 10 | reader = None 11 | writer = None 12 | 13 | def __init__(self, reader=None, writer=None): 14 | 15 | self.reader = reader 16 | self.writer = writer 17 | 18 | if self.reader: 19 | # subscribe on reader's event notifications 20 | self.reader.subscribe({ 21 | 'WriteRowsEvent': self.write_rows_event, 22 | # 'WriteRowsEvent.EachRow': self.write_rows_event_each_row, 23 | 'ReaderIdleEvent': 
self.reader_idle_event, 24 | }) 25 | 26 | def run(self): 27 | self.reader.read() 28 | 29 | def write_rows_event(self, event=None): 30 | """ 31 | WriteRowsEvent handler 32 | :param event: 33 | """ 34 | self.writer.insert(event) 35 | 36 | def write_rows_event_each_row(self, event=None): 37 | """ 38 | WriteRowsEvent.EachRow handler 39 | :param event: 40 | """ 41 | self.writer.insert(event) 42 | 43 | def reader_idle_event(self): 44 | """ 45 | ReaderIdleEvent handler 46 | """ 47 | self.writer.flush() 48 | 49 | if __name__ == '__main__': 50 | print("pumper") 51 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_data_mysql_to_ch_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # read airline.ontime test dataset from MySQL and write it to CH 3 | 4 | # ugly stub to suppress unsufficient sockets 5 | #sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 6 | 7 | # run data reader with specified Python version 8 | 9 | PYTHON="" 10 | PYTHON="/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy" 11 | PYTHON="python3.6" 12 | PYTHON="python3" 13 | 14 | CH_MYSQL="/usr/bin/clickhouse-mysql" 15 | CH_MYSQL="-m clickhouse_mysql.main" 16 | 17 | if [ ! -d "clickhouse_mysql" ] && [ -d "../clickhouse_mysql" ]; then 18 | # no clickhouse_mysql dir available - step out of examples dir 19 | cd .. 20 | fi 21 | 22 | $PYTHON $CH_MYSQL ${*:1} \ 23 | --src-server-id=1 \ 24 | --src-resume \ 25 | --src-wait \ 26 | --nice-pause=1 \ 27 | --log-level=info \ 28 | --log-file=ontime.log \ 29 | --src-host=127.0.0.1 \ 30 | --src-user=root \ 31 | --dst-host=127.0.0.1 \ 32 | --csvpool \ 33 | --csvpool-file-path-prefix=qwe_ \ 34 | --mempool-max-flush-interval=60 \ 35 | --mempool-max-events-num=10000 \ 36 | --pump-data 37 | 38 | # --mempool 39 | # --mempool-max-events-num=3 40 | # --mempool-max-flush-interval=30 41 | # --dst-file=dst.csv 42 | # --dst-schema=db 43 | # --dst-table=datatypes 44 | # --csvpool-keep-files 45 | # --log-level=info \ 46 | -------------------------------------------------------------------------------- /clickhouse_mysql/dbclient/chclient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import sys 6 | 7 | from clickhouse_driver.client import Client 8 | 9 | 10 | class CHClient(Client): 11 | """ClickHouse Client""" 12 | 13 | def __init__(self, connection_settings): 14 | logging.info("CHClient() connection_settings={}".format(connection_settings)) 15 | self.verify_connection_settings(connection_settings) 16 | super().__init__(**connection_settings) 17 | 18 | def verify_connection_settings(self, connection_settings): 19 | if not connection_settings: 20 | logging.critical("Need CH connection settings") 21 | sys.exit(0) 22 | 23 | if 'host' not in connection_settings: 24 | logging.critical("Need CH host in connection settings") 25 | sys.exit(0) 26 | 27 | if not connection_settings['host']: 28 | logging.critical("Need CH host in connection settings") 29 | sys.exit(0) 30 | 31 | if 'port' not in connection_settings: 32 | logging.critical("Need CH port in connection settings") 33 | sys.exit(0) 34 | 35 | if not connection_settings['port']: 36 | logging.critical("Need CH port in connection settings") 37 | sys.exit(0) 38 | 39 | #self.client = CHClient(connection_settings) 40 | #self.client.execute(sql, rows) 41 | 
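# A minimal, hypothetical usage sketch (added for illustration, not part of the original module):
# it assumes a ClickHouse server reachable on 127.0.0.1:9000 with default credentials and an
# existing `test.example` table with columns (a UInt32, b String). Note that
# verify_connection_settings() terminates the process unless non-empty 'host' and 'port' are provided.
if __name__ == '__main__':
    client = CHClient({
        'host': '127.0.0.1',
        'port': 9000,
        'user': 'default',
        'password': '',
    })
    client.execute('INSERT INTO test.example (a, b) VALUES', [(1, 'one'), (2, 'two')])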
-------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_data_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # download airline.ontime test dataset 3 | 4 | ZIP_FILES_DIR=$(pwd)"/zip" 5 | CSV_FILES_DIR=$(pwd)"/csv" 6 | 7 | FROM_YEAR=1987 8 | TO_YEAR=2017 9 | 10 | FROM_MONTH=1 11 | TO_MONTH=12 12 | 13 | echo "Check required commands availability" 14 | if command -v wget && command -v unzip; then 15 | echo "Looks like all required commands are available" 16 | else 17 | echo "Please ensure availability of: wget && unzip" 18 | exit 1 19 | fi 20 | 21 | echo "Download dataset" 22 | 23 | echo "Create dir $ZIP_FILES_DIR for downloading zip files" 24 | mkdir -p "$ZIP_FILES_DIR" 25 | 26 | if [ ! -d "$ZIP_FILES_DIR" ]; then 27 | echo "Can't use dir: $ZIP_FILES_DIR - not available" 28 | exit 1 29 | fi 30 | 31 | echo "Download files into $ZIP_FILES_DIR" 32 | for year in `seq $FROM_YEAR $TO_YEAR`; do 33 | for month in `seq $FROM_MONTH $TO_MONTH`; do 34 | FILE_NAME="On_Time_On_Time_Performance_${year}_${month}.zip" 35 | wget -O "$ZIP_FILES_DIR/$FILE_NAME" "http://transtats.bts.gov/PREZIP/$FILE_NAME" 36 | done 37 | done 38 | 39 | echo "Unzip dataset" 40 | 41 | echo "Create dir $CSV_FILES_DIR for unzipped CSV files" 42 | mkdir -p "$CSV_FILES_DIR" 43 | 44 | if [ ! -d "$CSV_FILES_DIR" ]; then 45 | echo "Can't use dir: $CSV_FILES_DIR - not available" 46 | exit 1 47 | fi 48 | 49 | for ZIP_FILENAME in `ls "$ZIP_FILES_DIR"/*.zip`; do 50 | echo "Unzipping $ZIP_FILENAME to $CSV_FILES_DIR/" 51 | unzip -o "$ZIP_FILENAME" -d "$CSV_FILES_DIR/" 52 | done 53 | -------------------------------------------------------------------------------- /docs/usage-references.md: -------------------------------------------------------------------------------- 1 | ## Use Cases 2 | - [3 Step Migration of MySQL data to Clickhouse for faster analytics.](https://mydbops.wordpress.com/2020/02/21/3-step-migration-of-mysql-data-to-clickhouse-for-faster-analytics/) 3 | - [Hybrid OLTP/Analytics Database Workloads: Replicating MySQL Data to ClickHouse](https://severalnines.com/database-blog/hybrid-oltpanalytics-database-workloads-replicating-mysql-data-clickhouse) 4 | - [How to import and replicate data from MySQL to ClickHouse](https://minervadb.com/wp-content/uploads/2019/10/How-to-import-and-replicate-data-from-MySQL-to-ClickHouse.pdf) 5 | - [Use Yandex ClickHouse for Analytics with Data from MySQL](https://www.tienle.com/2018/05-04/use-yandex-clickhouse-for-analytics-with-data-from-mysql.html) 6 | 7 | ## Talks 8 | - [Opensource Column Store Databases: MariaDB ColumnStore vs. 
ClickHouse](https://www.percona.com/live/19/sites/default/files/slides/Opensource%20Column%20Store%20Databases_%20MariaDB%20ColumnStore%20vs.%20ClickHouse%20-%20FileId%20-%20188040.pdf) 9 | - [Replicating MySQL Data to TiDB For Near Real-Time Analytics](https://dataops.barcelona/wp-content/uploads/2019/06/Replicating-to-TiDb-francisco-Bordenave.pdf) 10 | 11 | ## TODOs and HOWTOs 12 | - [Clickhouse install and use /clickhouse-mysql installation](http://www.programmersought.com/article/7079240138/) 13 | - [Replication from MySQL to ClickHouse](https://www.goplardb.com/post/replication-from-mysql-to-clickhouse) 14 | 15 | 16 | ## Other References 17 | - [CH integrations](https://clickhouse.tech/docs/en/interfaces/third-party/integrations/) 18 | - [awesomeopensource](https://awesomeopensource.com/projects/clickhouse) 19 | 20 | -------------------------------------------------------------------------------- /clickhouse_mysql/observable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Observable(object): 6 | """ 7 | Implements Observable pattern 8 | """ 9 | 10 | # functions to be called when event to be notified upon 11 | event_handlers = { 12 | 'Event1': [], 13 | 'Event2': [], 14 | } 15 | 16 | def subscribe(self, event_handlers): 17 | # event_handlers has the same structure as self.event_handlers 18 | 19 | for event_name in event_handlers: 20 | if event_name in self.event_handlers: 21 | # this event is listed in Observable as 'subscribable' 22 | if callable(event_handlers[event_name]): 23 | # function itself 24 | self.event_handlers[event_name].append(event_handlers[event_name]) 25 | else: 26 | # assume list of functions - iterate over it and add each of them 27 | if isinstance(event_handlers[event_name], list): 28 | for callback in event_handlers[event_name]: 29 | if callable(callback): 30 | self.event_handlers[event_name].append(callback) 31 | 32 | def notify(self, event_name, **attrs): 33 | # notify (call function) each of subscribers of event_name event 34 | for callback in self.event_handlers[event_name]: 35 | callback(**attrs) 36 | 37 | def subscribers(self, event_name): 38 | # are there any (>0) subscribers for event event_name? 39 | return event_name in self.event_handlers and (len(self.event_handlers[event_name]) > 0) 40 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # clickhouse-mysql 2019-09-03 2 | 3 | ## improvements 4 | * fix --src-tables-where-clauses to accept both filenames (for long where-clauses) and where-clauses themselves (for shorter clauses) 5 | 6 | ## bugfixes 7 | * fix --src-port CLI option 8 | * ensure UTF8 for source migration 9 | 10 | # clickhouse-mysql 2019-03-25 11 | 12 | ## new features 13 | * added new CLI option `--dst-schema` - make table full name change to `schema`.`db__table` 14 | * added new CLI option `--dst-cluster` - support table create on cluster 15 | * added new CLI option `--dst-distribute` - make table full name change to `schema_all`.`db__table_all`, and engine change to Distributed 16 | 17 | # clickhouse-mysql 2018-03-14 18 | 19 | ## new features 20 | * added new CLI option `--create-table-sql` - make attempt to prepare ready-to-use **CREATE TABLE** statement 21 | * added new CLI option `--pump-data` - specifies that we'd like to pump data into ClickHouse. 
Was default behaviour previously 22 | * added new CLI option `--install` - Install service file(s) 23 | * added new CLI option `--dst-create-table` - tries to automatically create target table in ClickHouse before any data inserted 24 | 25 | ## improvements 26 | * modified/added new CLI option `--with-create-database` - used in combination with `--create-table-sql*` options in order to add **CREATE DATABASE** statement in additon to **CREATE TABLE** 27 | * modified/added new CLI option `--create-table-json-template` - prepare JSON **CREATE TABLE** data 28 | * modified/added new CLI option `--migrate-table` - was called `--table-migrate` previously 29 | * modified/added new CLI option `--create-table-sql-template` - prepare **CREATE TABLE** template 30 | 31 | ## bugfixes 32 | * config files vs CLI options order fixed 33 | 34 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/poolwriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | 6 | from clickhouse_mysql.writer.writer import Writer 7 | from clickhouse_mysql.event.event import Event 8 | from clickhouse_mysql.pool.bbpool import BBPool 9 | 10 | 11 | class PoolWriter(Writer): 12 | """Write with caching in Pool""" 13 | 14 | writer_builder = None 15 | max_pool_size = None 16 | pool = None 17 | 18 | def __init__( 19 | self, 20 | writer_builder=None, 21 | max_pool_size=10000, 22 | max_flush_interval=60 23 | ): 24 | logging.info("PoolWriter()") 25 | self.writer_builder = writer_builder 26 | self.max_pool_size = max_pool_size 27 | self.max_flush_interval = max_flush_interval 28 | 29 | self.pool = BBPool( 30 | writer_builder=self.writer_builder, 31 | max_bucket_size=self.max_pool_size, 32 | max_interval_between_rotations=self.max_flush_interval, 33 | ) 34 | 35 | def insert(self, event_or_events): 36 | """Insert data into Pool""" 37 | logging.debug('class:%s insert', __class__) 38 | self.pool.insert(event_or_events) 39 | 40 | def flush(self): 41 | self.pool.flush() 42 | 43 | if __name__ == '__main__': 44 | path = 'file.csv' 45 | 46 | writer = CSVWriter(path) 47 | writer.open() 48 | event = Event() 49 | event.row_converted={ 50 | 'a': 123, 51 | 'b': 456, 52 | 'c': 'qwe', 53 | 'd': 'rty', 54 | } 55 | writer.insert(event) 56 | event.row_converted={ 57 | 'a': 789, 58 | 'b': 987, 59 | 'c': 'asd', 60 | 'd': 'fgh', 61 | } 62 | writer.insert(event) 63 | writer.close() 64 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Writer(object): 6 | 7 | next_writer_builder = None 8 | converter_builder = None 9 | 10 | def __init__( 11 | self, 12 | next_writer_builder=None, 13 | converter_builder=None 14 | ): 15 | self.next_writer_builder = next_writer_builder 16 | self.converter_builder = converter_builder 17 | 18 | def opened(self): 19 | pass 20 | 21 | def open(self): 22 | pass 23 | 24 | def listify(self, obj_or_list): 25 | """Ensure list - create a list from an object as [obj] or keep a list if it is already a list""" 26 | 27 | if obj_or_list is None: 28 | # no value - return empty list 29 | return [] 30 | 31 | elif isinstance(obj_or_list, list) or isinstance(obj_or_list, set) or isinstance(obj_or_list, tuple): 32 | if len(obj_or_list) < 1: 33 | # list/set/tuple is empty - 
nothing to do 34 | return [] 35 | else: 36 | # list/set/tuple is good 37 | return obj_or_list 38 | 39 | else: 40 | # event_or_events is an object 41 | return [obj_or_list] 42 | 43 | def convert(self, data): 44 | """Convert an object if we have a converter or just return object 'as is' otherwise""" 45 | return self.converter_builder.get().convert(data) if self.converter_builder else data 46 | 47 | def insert(self, event_or_events=None): 48 | # event_or_events = [ 49 | # event: { 50 | # row: {'id': 3, 'a': 3} 51 | # }, 52 | # event: { 53 | # row: {'id': 3, 'a': 3} 54 | # }, 55 | # ] 56 | pass 57 | 58 | def flush(self): 59 | pass 60 | 61 | def push(self): 62 | pass 63 | 64 | def destroy(self): 65 | pass 66 | 67 | def close(self): 68 | pass 69 | -------------------------------------------------------------------------------- /clickhouse_mysql/reader/csvreader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | import os 6 | 7 | from clickhouse_mysql.reader.reader import Reader 8 | from clickhouse_mysql.event.event import Event 9 | from clickhouse_mysql.converter.csvreadconverter import CSVReadConverter 10 | 11 | 12 | class CSVReader(Reader): 13 | """Read data from CSV files""" 14 | 15 | csv_file_path = None 16 | csvfile = None 17 | sniffer = None 18 | dialect = None 19 | has_header = False 20 | reader = None 21 | 22 | def __init__( 23 | self, 24 | csv_file_path, 25 | converter=None, 26 | callbacks={} 27 | ): 28 | super().__init__(converter=converter, callbacks=callbacks) 29 | 30 | self.csv_file_path = csv_file_path 31 | self.csvfile = open(self.csv_file_path) 32 | self.sniffer = csv.Sniffer() 33 | self.dialect = self.sniffer.sniff(self.csvfile.read(1024)) 34 | self.csvfile.seek(0) 35 | self.has_header = self.sniffer.has_header(self.csvfile.read(1024)) 36 | self.csvfile.seek(0) 37 | self.reader = csv.DictReader(self.csvfile, dialect=self.dialect) 38 | if self.has_header: 39 | print('=======') 40 | print(self.reader.fieldnames) 41 | print('=======') 42 | else: 43 | # should raise error? 44 | pass 45 | 46 | def read(self): 47 | # fetch events 48 | try: 49 | event = Event() 50 | event.table = os.path.splitext(self.csv_file_path)[0] 51 | self.notify('WriteRowsEvent', event=event) 52 | for row in self.reader: 53 | event.row = row 54 | self.notify('WriteRowsEvent.EachRow', event=self.converter.convert(event) if self.converter else event) 55 | except KeyboardInterrupt: 56 | pass 57 | 58 | self.csvfile.close() 59 | 60 | if __name__ == '__main__': 61 | reader = CSVReader(filename='data.csv') 62 | reader.read() 63 | -------------------------------------------------------------------------------- /dev_run_cli_options_local_table_migration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script performs migration of one table test.books from local MySQL 4 | # into one table test.books in local ClickHouse 5 | # Tables are created manually by user and are expected by migrator to be in place 6 | # Migrator exists after all data from migrated table is copied into ClickHouse 7 | 8 | # ugly stub to suppress unsufficient sockets 9 | #sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" 10 | 11 | # run data reader with specified Python version 12 | 13 | PYTHON="python3" 14 | 15 | CH_MYSQL="-m clickhouse_mysql.main" 16 | 17 | if [ ! 
-d "clickhouse_mysql" ]; then 18 | # no clickhouse_mysql dir available - step out of examples dir 19 | cd .. 20 | fi 21 | 22 | MYSQL_USER=reader 23 | MYSQL_PASSWORD=qwerty 24 | SRC_TABLES=test.books 25 | DST_SCHEMA=test 26 | DST_TABLE=books 27 | 28 | MYSQL_USER=user1 29 | MYSQL_PASSWORD=qwerty 30 | SRC_TABLES=repl.foo 31 | DST_SCHEMA=repl1 32 | DST_TABLE=foo1 33 | 34 | $PYTHON $CH_MYSQL ${*:1} \ 35 | --src-server-id=1 \ 36 | --nice-pause=1 \ 37 | --log-level=debug \ 38 | \ 39 | --src-host=127.0.0.1 \ 40 | --src-user="${MYSQL_USER}" \ 41 | --src-password="${MYSQL_PASSWORD}" \ 42 | --src-tables="${SRC_TABLES}" \ 43 | \ 44 | --dst-host=127.0.0.1 \ 45 | --dst-create-table \ 46 | --with-create-database \ 47 | \ 48 | --csvpool \ 49 | --csvpool-file-path-prefix=qwe_ \ 50 | --mempool-max-flush-interval=60 \ 51 | --mempool-max-events-num=10000 \ 52 | \ 53 | --binlog-position-file=qwe.txt \ 54 | --pump-data \ 55 | --migrate-table \ 56 | --src-wait \ 57 | --src-resume 58 | 59 | # --dst-schema="${DST_SCHEMA}" \ 60 | # --dst-table="${DST_TABLE}" \ 61 | # --dst-table="${DST_SCHEMA}.${DST_TABLE}" \ 62 | # --dst-table-prefix="pr1_" \ 63 | # --log-file=ontime.log \ 64 | # --mempool 65 | # --mempool-max-events-num=3 66 | # --mempool-max-flush-interval=30 67 | # --dst-file=dst.csv 68 | # --dst-schema=db 69 | # --dst-table=datatypes 70 | # --csvpool-keep-files 71 | # --log-level=info \ 72 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/processwriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import multiprocessing as mp 5 | import logging 6 | 7 | from clickhouse_mysql.writer.writer import Writer 8 | 9 | 10 | class ProcessWriter(Writer): 11 | """Start write procedure as a separated process""" 12 | args = None 13 | 14 | def __init__(self, **kwargs): 15 | next_writer_builder = kwargs.pop('next_writer_builder', None) 16 | converter_builder = kwargs.pop('converter_builder', None) 17 | super().__init__(next_writer_builder=next_writer_builder, converter_builder=converter_builder) 18 | for arg in kwargs: 19 | self.next_writer_builder.param(arg, kwargs[arg]) 20 | 21 | def opened(self): 22 | pass 23 | 24 | def open(self): 25 | pass 26 | 27 | def process(self, event_or_events=None): 28 | """Separate process body to be run""" 29 | 30 | logging.debug('class:%s process()', __class__) 31 | writer = self.next_writer_builder.get() 32 | writer.insert(event_or_events) 33 | writer.close() 34 | writer.push() 35 | writer.destroy() 36 | logging.debug('class:%s process() done', __class__) 37 | 38 | def insert(self, event_or_events=None): 39 | # event_or_events = [ 40 | # event: { 41 | # row: {'id': 3, 'a': 3} 42 | # }, 43 | # event: { 44 | # row: {'id': 3, 'a': 3} 45 | # }, 46 | # ] 47 | 48 | # start separated process with event_or_events to be inserted 49 | 50 | logging.debug('class:%s insert', __class__) 51 | process = mp.Process(target=self.process, args=(event_or_events,)) 52 | 53 | logging.debug('class:%s insert.process.start()', __class__) 54 | process.start() 55 | 56 | #process.join() 57 | logging.debug('class:%s insert done', __class__) 58 | pass 59 | 60 | def flush(self): 61 | pass 62 | 63 | def push(self): 64 | pass 65 | 66 | def destroy(self): 67 | pass 68 | 69 | def close(self): 70 | pass 71 | -------------------------------------------------------------------------------- /clickhouse_mysql/converter/chwriteconverter.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from clickhouse_mysql.converter.converter import Converter 5 | 6 | import datetime 7 | import decimal 8 | import logging 9 | 10 | 11 | class CHWriteConverter(Converter): 12 | 13 | # do not include empty columns into converted row 14 | delete_empty_columns = False 15 | 16 | types_to_convert = [ 17 | datetime.timedelta, 18 | bytes, 19 | decimal.Decimal, 20 | 21 | # jsonify 22 | # object, 23 | dict, 24 | list, 25 | 26 | # set - how to migrate MySQL's `set` type and tell it from `json` type - both of which are presented as `dict`? 27 | set, 28 | ] 29 | 30 | def __init__(self, column_skip): 31 | logging.debug("CHWriteConverter __init__()") 32 | super().__init__(column_skip=column_skip) 33 | 34 | def column(self, column, value): 35 | for _type in self.types_to_convert: 36 | if isinstance(value, _type): 37 | # print("Converting column", column, "of type", type(event.row[column]), event.row[column]) 38 | return str(value) 39 | # print("Using asis column", column, "of type", type(event.row[column])) 40 | return value 41 | 42 | def row(self, row): 43 | """ 44 | Convert row 45 | :param row: row to convert 46 | :return: converted row 47 | """ 48 | if row is None: 49 | return None 50 | 51 | # init list of columns to delete 52 | columns_to_delete = self.column_skip 53 | 54 | for column in row: 55 | 56 | # skip columns already prepared for deletion 57 | if column in columns_to_delete: 58 | continue 59 | 60 | # convert column 61 | row[column] = self.column(column, row[column]) 62 | 63 | # include empty column to the list of to be deleted columns 64 | if (row[column] is None) and self.delete_empty_columns: 65 | columns_to_delete.append(column) 66 | 67 | # delete columns according to the list of columns to delete 68 | for column in columns_to_delete: 69 | row.pop(column) 70 | 71 | return row 72 | -------------------------------------------------------------------------------- /clickhouse_mysql.etc/clickhouse-mysql.conf: -------------------------------------------------------------------------------- 1 | # INTRODUCTION 2 | # 3 | # 1. Parameters accept both variants for naming - with '_' as a words separator or '-'. 4 | # The former is used for ENV VARS, the latter - for CLI options. 5 | # Config file accepts both, thus 'log_file' equals to 'log-file' 6 | # 7 | # 2. Lists are specified with ',' as a separator 8 | # For example, in case we'd like to specify two entries for `src_tables_prefixes` option: 9 | # src_tables_prefixes=performance_log_, indexing_log_ 10 | # 11 | # 3. Boolean parameters accept case-insensitive 'yes', 'no', 'on', 'off', 1 or 0 values 12 | # 13 | # 4. Each parameter must have a value, thus 14 | # 'dry=yes' is ok, 15 | # 'dry=' is not that good, but will be interpreted as an empty line, while 16 | # 'dry' is a syntax error - not recognized to as a value - no '=' and right part value specified 17 | # 18 | # 5. Comments are '#'-only. 
So, php-like ';' is not accepted as a comment symbol in config file 19 | # 20 | # general app section 21 | # 22 | 23 | #config_file=clickhouse-mysql.conf 24 | #log_file=/var/log/clickhouse-mysql/main.log 25 | log_level=debug 26 | nice_pause=1 27 | #dry=yes 28 | #daemon=yes 29 | #pid_file=/tmp/clickhouse-client.pid 30 | #binlog_position_file=/tmp/clickhouse-mysql-binlog.pos 31 | mempool=yes 32 | #mempool_max_events_num=10000 33 | #mempool_max_rows_num= 34 | #mempool_max_flush_interval=60 35 | #csvpool=yes 36 | #csvpool_file_path_prefix=qwe_ 37 | #csvpool_keep_files=yes 38 | #create_table_sql_template=yes 39 | #create_table_sql=yes 40 | #with_create_database=yes 41 | #create_table_json_template=yes 42 | migrate_table=yes 43 | pump_data=yes 44 | 45 | # 46 | # src section 47 | # 48 | 49 | src_server_id=1 50 | src_host=127.0.0.1 51 | #src_port=3306 52 | src_user=reader 53 | src_password=qwerty 54 | #src_schemas=db1, db2, db3 55 | src_tables=airline.ontime 56 | #src_tables_where_clauses=a=1 57 | #src_tables_prefixes=log_, log1_, log2_ 58 | src_wait=yes 59 | #src_resume=yes 60 | #src_file= 61 | #src_binlog_file=mysql-bin.000024 62 | #src_binlog_position=5307 63 | 64 | # 65 | # dst section 66 | # 67 | 68 | #dst_file=dst.csv 69 | dst_host=127.0.0.1 70 | #dst_port= 71 | #dst_user=default 72 | #dst_password= 73 | #dst_schema=db 74 | #dst_table=logunified 75 | dst_create_table=yes 76 | 77 | # 78 | # converters section 79 | # 80 | 81 | #column_default_value= 82 | #column_skip= 83 | #ch_converter_file= 84 | #ch_converter_class= 85 | -------------------------------------------------------------------------------- /clickhouse_mysql/objectbuilder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class ObjectBuilder(object): 6 | 7 | class_name = None 8 | constructor_params = None 9 | instance = None 10 | 11 | def __init__(self, class_name=None, constructor_params=None, instance=None): 12 | """ 13 | Builder/Wrapper for an object. 14 | In case instance is provided - operates as a wrapper 15 | In case class_name and (optional) constructor_params provided - return instance of specified class 16 | :param class_name: class to instantiate 17 | :param constructor_params: dict of class's contrcutor params. 
Used as **constructor_params 18 | :param instance: ready-to use instance 19 | """ 20 | self.class_name = class_name 21 | self.constructor_params = constructor_params 22 | self.instance = instance 23 | 24 | def param(self, name, value): 25 | """ 26 | Set constructor param for an object 27 | :param name: param name 28 | :param value: param value 29 | """ 30 | if not self.constructor_params: 31 | self.constructor_params = {} 32 | self.constructor_params[name] = value 33 | 34 | def get(self): 35 | """ 36 | Get object (in case wrapper) or an instance of a class (in case Object Builder) - each time the same object 37 | :return: object 38 | """ 39 | if not self.class_name: 40 | # no class name - return instance, it may be None 41 | return self.instance 42 | 43 | # create new object and save it as instance 44 | self.instance = self.new() 45 | 46 | # in order to return the same object instance on next get() call 47 | self.class_name = None 48 | 49 | return self.instance 50 | 51 | def new(self): 52 | """ 53 | Get object (in case wrapper) or an instance of a class (in case Object builder) - each time new object 54 | :return: object 55 | """ 56 | if not self.class_name: 57 | # no class name - return instance, it may be None 58 | return self.instance 59 | 60 | # have class name 61 | 62 | # instantiate object 63 | if self.constructor_params: 64 | return self.class_name(**self.constructor_params) 65 | else: 66 | return self.class_name() 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name="clickhouse-mysql", 8 | 9 | # version should comply with PEP440 10 | version='0.0.20200128', 11 | 12 | description='MySQL to ClickHouse data migrator', 13 | long_description='MySQL to ClickHouse data migrator', 14 | 15 | # homepage 16 | url="https://github.com/altinity/clickhouse-mysql-data-reader", 17 | 18 | author="Vladislav Klimenko", 19 | author_email="sunsingerus@gmail.com", 20 | 21 | license="MIT", 22 | 23 | # see https://pypi.python.org/pypi?:action=list_classifiers 24 | classifiers=[ 25 | # How mature is this project? Common values are 26 | # 3 - Alpha 27 | # 4 - Beta 28 | # 5 - Production/Stable 29 | 'Development Status :: 3 - Alpha', 30 | 31 | 'Intended Audience :: Developers', 32 | 'Intended Audience :: System Administrators', 33 | 34 | 'Topic :: Database', 35 | 36 | # should match license above 37 | 'License :: OSI Approved :: MIT License', 38 | 39 | # supported Python versions 40 | 'Programming Language :: Python', 41 | 'Programming Language :: Python :: 3', 42 | 'Programming Language :: Python :: 3.4', 43 | 'Programming Language :: Python :: 3.5', 44 | 'Programming Language :: Python :: 3.6', 45 | 'Programming Language :: Python :: 3 :: Only', 46 | ], 47 | 48 | # what does the project relate to? 
49 | keywords='clickhouse mysql data migration', 50 | 51 | # list of packages to be included into project 52 | packages=find_packages(exclude=[ 53 | 'contrib', 54 | 'docs', 55 | 'tests', 56 | ]), 57 | 58 | # list of additional package data to be attached to packages 59 | package_data={ 60 | 'clickhouse_mysql': [ 61 | # examples 62 | '../clickhouse_mysql_examples/*.sh', 63 | '../clickhouse_mysql_examples/*.sql', 64 | # converter examples 65 | '../clickhouse_mysql_converter/*.py', 66 | # init scripts 67 | '../clickhouse_mysql.init.d/*', 68 | # config files 69 | '../clickhouse_mysql.etc/*', 70 | ], 71 | }, 72 | 73 | # run-time dependencies 74 | # these will be installed by pip 75 | # https://packaging.python.org/en/latest/requirements.html 76 | install_requires=[ 77 | 'mysqlclient', 78 | 'mysql-replication', 79 | 'clickhouse-driver', 80 | 'configobj', 81 | 'setuptools', 82 | ], 83 | 84 | # cross-platform support for pip to create the appropriate form of executable 85 | entry_points={ 86 | 'console_scripts': [ 87 | # executable name=what to call 88 | 'clickhouse-mysql=clickhouse_mysql:main', 89 | ], 90 | }, 91 | 92 | #cmdclass={ 93 | # 'develop': PostDevelopCommand, 94 | # 'install': PostInstallCommand, 95 | #}, 96 | 97 | # python_requires='>=3.3', 98 | ) 99 | -------------------------------------------------------------------------------- /clickhouse_mysql/daemon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import sys 5 | import atexit 6 | import signal 7 | 8 | 9 | class Daemon(object): 10 | 11 | pidfile = None 12 | root = '/' 13 | 14 | def __init__(self, pidfile='/tmp/daemon.pid', root='/'): 15 | self.pidfile = pidfile 16 | self.root = root 17 | 18 | def background(self): 19 | # first fork 20 | # root process waits for the child in order not to have zombies in the system 21 | pid = os.fork() 22 | if pid > 0: 23 | # parent - root process wait for first child and exits 24 | os.wait() 25 | sys.exit(0) 26 | 27 | # first child 28 | # setup own environment 29 | os.chdir(self.root) 30 | os.umask(0) 31 | os.setsid() 32 | 33 | # second fork 34 | # first-fork child produces the real worker process and exits 35 | # first-fork child is being waited now by root process 36 | pid = os.fork() 37 | if pid > 0: 38 | sys.exit(0) 39 | 40 | # worker 41 | signal.signal(signal.SIGINT, self.shutdown) 42 | signal.signal(signal.SIGTERM, self.shutdown) 43 | 44 | # handle pid file 45 | atexit.register(self.delete_pidfile) 46 | self.write_pidfile() 47 | 48 | # handle streams 49 | self.redirect_std_streams() 50 | 51 | def shutdown(self): 52 | self.delete_pidfile() 53 | sys.exit(0) 54 | 55 | def redirect_std_streams(self): 56 | sys.stdout.flush() 57 | sys.stderr.flush() 58 | 59 | stdin = open(os.devnull, 'r') 60 | stdout = open(os.devnull, 'a+') 61 | stderr = open(os.devnull, 'a+') 62 | 63 | os.dup2(stdin.fileno(), sys.stdin.fileno()) 64 | os.dup2(stdout.fileno(), sys.stdout.fileno()) 65 | os.dup2(stderr.fileno(), sys.stderr.fileno()) 66 | 67 | def write_pidfile(self): 68 | pid = str(os.getpid()) 69 | with open(self.pidfile, 'w+') as f: 70 | f.write(pid) 71 | 72 | def delete_pidfile(self): 73 | try: 74 | os.remove(self.pidfile) 75 | except: 76 | pass 77 | 78 | def get_pid(self): 79 | try: 80 | with open(self.pidfile, 'r') as pf: 81 | pid = int(pf.read().strip()) 82 | except: 83 | pid = None 84 | return pid 85 | 86 | def start(self): 87 | pid = self.get_pid() 88 | if pid: 89 | return False 90 | self.background() 91 | self.run() 92 | 
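    # A hypothetical usage sketch (illustration only, not part of the original class): a concrete
    # daemon is expected to subclass Daemon, override run() with its actual work loop, and be
    # driven via start()/stop(). The names MyDaemon and do_work() below are placeholders.
    #
    #     class MyDaemon(Daemon):
    #         def run(self):
    #             while True:
    #                 do_work()
    #
    #     MyDaemon(pidfile='/tmp/my-daemon.pid').start()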
93 | def stop(self, sig=signal.SIGTERM): 94 | pid = self.get_pid() 95 | if not pid: 96 | return False 97 | try: 98 | os.kill(pid, sig) 99 | except OSError as err: 100 | estr = str(err.args) 101 | if estr.find("No such process") > 0: 102 | self.delete_pidfile() 103 | 104 | def restart(self): 105 | self.stop() 106 | self.start() 107 | 108 | def run(self): 109 | pass 110 | 111 | -------------------------------------------------------------------------------- /clickhouse_mysql.init.d/clickhouse-mysql.service: -------------------------------------------------------------------------------- 1 | # man 5 systemd.unit 2 | # man 5 systemd.service 3 | # man 5 systemd.install 4 | # man systemctl 5 | 6 | [Unit] 7 | 8 | Description=MySQL to ClickHouse data reader 9 | 10 | # After means: start service after specified service is up and running 11 | After=network.target 12 | After=syslog.target 13 | #After=mysql.service 14 | 15 | # Requires means: this service requires specified service, so we'd like to start required service in case it is not running yet. 16 | # In case service is listed in Requires and not listed in After - it will be launched "in parallel" with current service, and not after. 17 | # In this case, you need to understand possible race conditions, so it is better to explicitly specify in both After and Requires 18 | #Requires=mysql.service 19 | 20 | # Wants means: specified service would be nice to have, but it is optional, we can go without it 21 | #Wants=redis.service 22 | 23 | [Service] 24 | 25 | # Command to start the service 26 | ExecStart=/usr/bin/python3 -m clickhouse_mysql.main --config-file=clickhouse-mysql.conf --daemon --log-file=/var/log/clickhouse-mysql/clickhouse-mysql.log --pid-file=/var/run/clickhouse-mysql/clickhouse-mysql.pid 27 | 28 | # Command to stop the service 29 | # If this option is not specified, the process is terminated by sending the signal specified in KillSignal= 30 | # Specifier and environment variable substitution is supported (including $MAINPID) 31 | #ExecStop=/bin/kill -SIGINT $(cat /var/run/clickhouse-mysql/clickhouse-mysql.pid) 32 | 33 | # Specifies which signal to use when killing a service. This controls the signal that is sent as first step of shutting down a unit 34 | KillSignal=SIGINT 35 | 36 | # Possible values: control-group, process, mixed, none 37 | KillMode=process 38 | 39 | # Whether to send SIGKILL to remaining processes after a timeout, if the normal shutdown procedure left processes of the service around 40 | SendSIGKILL=yes 41 | 42 | # Command to reload the service 43 | #ExecReload= 44 | 45 | #ExecStartPre= 46 | #ExecStartPost= 47 | #ExecStopPost= 48 | 49 | # How long to wait for start/stop command(s) 50 | #TimeoutSec=300 51 | 52 | # How to restart the service in case it is down. Check by PID file 53 | Restart=always 54 | #Restart=on-failure 55 | 56 | #RestartSec=30 57 | 58 | # Run service as this user/group 59 | User=user 60 | Group=user 61 | 62 | # Service is a simple one and can't fork by itself 63 | #Type=simple 64 | 65 | # Service can fork by itself, exiting main process. Classic 'daemon' schema. 
66 | Type=forking 67 | 68 | # Monitor PID file 69 | PIDFile=/var/run/clickhouse-mysql/clickhouse-mysql.pid 70 | 71 | # Perform `cd` into this dir before starting the service 72 | WorkingDirectory=/home/user/dev/mysqlbinlog/clickhouse-mysql-data-reader 73 | 74 | # Env vars 75 | #Environment=RACK_ENV=production 76 | 77 | # How this service must be treated in case of "Out Of Memory" situation 78 | # -1000 : do not kill this service at all (like sshd) 79 | # IMPORTANT: touch this in case of real need only 80 | #OOMScoreAdjust=-100 81 | 82 | #LimitCORE=infinity 83 | #LimitNOFILE=500000 84 | 85 | [Install] 86 | 87 | Alias=clickhouse-mysql clickhouse-mysql.service 88 | 89 | # runtime level 90 | # multi-user.target is the same as old known runlevel=3 91 | WantedBy=multi-user.target 92 | 93 | -------------------------------------------------------------------------------- /clickhouse_mysql/dbclient/mysqlclient.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import MySQLdb 6 | from MySQLdb.cursors import Cursor 7 | 8 | 9 | class MySQLClient(object): 10 | 11 | connection = None 12 | cursor = None 13 | cursorclass = Cursor 14 | 15 | host = None 16 | port = None 17 | user = None 18 | password = None 19 | 20 | def __init__(self, connection_settings): 21 | """ 22 | :param host: string MySQL host 23 | :param port: int MySQL port 24 | :param user: string MySQL user 25 | :param password: string MySQL password 26 | """ 27 | self.host = connection_settings['host'] 28 | self.port = connection_settings['port'] 29 | self.user = connection_settings['user'] 30 | self.password = connection_settings['password'] 31 | 32 | def disconnect(self): 33 | """ 34 | Destroy connection objects 35 | :return: 36 | """ 37 | if self.cursor: 38 | try: 39 | self.cursor.close() 40 | del self.cursor 41 | except: 42 | pass 43 | 44 | if self.connection: 45 | try: 46 | del self.connection 47 | except: 48 | pass 49 | 50 | def connect(self, db): 51 | """ 52 | Connect to MySQL 53 | :param db: string schema/db name 54 | :return: 55 | """ 56 | 57 | self.disconnect() 58 | try: 59 | self.connection = MySQLdb.connect( 60 | host=self.host, 61 | port=self.port, 62 | user=self.user, 63 | passwd=self.password, 64 | db=db, 65 | cursorclass=self.cursorclass, 66 | charset='utf8', 67 | use_unicode=True, 68 | ) 69 | self.cursor = self.connection.cursor() 70 | logging.debug("Connect to the database host={} port={} user={} password={} db={}".format( 71 | self.host, 72 | self.port, 73 | self.user, 74 | self.password, 75 | db 76 | )) 77 | except: 78 | raise Exception("Can not connect to the database host={} port={} user={} password={} db={}".format( 79 | self.host, 80 | self.port, 81 | self.user, 82 | self.password, 83 | db 84 | )) 85 | 86 | def tables_list(self, db): 87 | """ 88 | List tables in specified DB 89 | 90 | :param db: database to list tables in 91 | :return: ['table1', 'table2', ...] 
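        Illustrative call (database name is hypothetical):
            client.tables_list('airline')  # -> ['ontime']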
92 | """ 93 | try: 94 | self.cursorclass = Cursor 95 | self.connect(db=db) 96 | 97 | sql = "USE " + db 98 | logging.debug(sql) 99 | self.cursor.execute(sql) 100 | 101 | sql = "SHOW TABLES" 102 | logging.debug(sql) 103 | self.cursor.execute(sql) 104 | 105 | tables = [] 106 | for row in self.cursor: 107 | table_name = row[0] 108 | tables.append(table_name) 109 | 110 | except Exception as err: 111 | logging.debug("Unexpected error: {}".format(str(err))) 112 | raise Exception("Can not list tables on host={} port={} user={} password={} db={}".format( 113 | self.host, 114 | self.port, 115 | self.user, 116 | self.password, 117 | db 118 | )) 119 | 120 | return tables 121 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/chcsvwriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import logging 6 | import shlex 7 | 8 | from clickhouse_mysql.writer.writer import Writer 9 | from clickhouse_mysql.tableprocessor import TableProcessor 10 | 11 | 12 | class CHCSVWriter(Writer): 13 | """Write into ClickHouse via CSV file and clickhouse-client tool""" 14 | 15 | dst_schema = None 16 | dst_table = None 17 | dst_distribute = None 18 | 19 | host = None 20 | port = None 21 | user = None 22 | password = None 23 | 24 | def __init__( 25 | self, 26 | connection_settings, 27 | dst_schema=None, 28 | dst_table=None, 29 | dst_table_prefix=None, 30 | dst_distribute=False, 31 | ): 32 | if dst_distribute and dst_schema is not None: 33 | dst_schema += "_all" 34 | if dst_distribute and dst_table is not None: 35 | dst_table += "_all" 36 | logging.info("CHCSWriter() connection_settings={} dst_schema={} dst_table={}".format(connection_settings, dst_schema, dst_table)) 37 | self.host = connection_settings['host'] 38 | self.port = connection_settings['port'] 39 | self.user = connection_settings['user'] 40 | self.password = connection_settings['password'] 41 | self.dst_schema = dst_schema 42 | self.dst_table = dst_table 43 | self.dst_table_prefix = dst_table_prefix 44 | self.dst_distribute = dst_distribute 45 | 46 | def insert(self, event_or_events=None): 47 | # event_or_events = [ 48 | # event: { 49 | # row: {'id': 3, 'a': 3} 50 | # }, 51 | # event: { 52 | # row: {'id': 3, 'a': 3} 53 | # }, 54 | # ] 55 | 56 | events = self.listify(event_or_events) 57 | if len(events) < 1: 58 | logging.warning('No events to insert. 
class: %s', __class__) 59 | return 60 | 61 | # assume we have at least one Event 62 | 63 | logging.debug('class:%s insert %d rows', __class__, len(events)) 64 | 65 | for event in events: 66 | schema = self.dst_schema if self.dst_schema else event.schema 67 | table = None 68 | if self.dst_distribute: 69 | table = TableProcessor.create_distributed_table_name(db=event.schema, table=event.table) 70 | else: 71 | table = self.dst_table if self.dst_table else event.table 72 | if self.dst_schema: 73 | table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) 74 | 75 | sql = 'INSERT INTO `{0}`.`{1}` ({2}) FORMAT CSV'.format( 76 | schema, 77 | table, 78 | ', '.join(map(lambda column: '`%s`' % column, event.fieldnames)), 79 | ) 80 | 81 | choptions = "" 82 | if self.host: 83 | choptions += " --host=" + shlex.quote(self.host) 84 | if self.port: 85 | choptions += " --port=" + str(self.port) 86 | if self.user: 87 | choptions += " --user=" + shlex.quote(self.user) 88 | if self.password: 89 | choptions += " --password=" + shlex.quote(self.password) 90 | bash = "tail -n +2 '{0}' | clickhouse-client {1} --query='{2}'".format( 91 | event.filename, 92 | choptions, 93 | sql, 94 | ) 95 | 96 | logging.info('starting clickhouse-client process') 97 | logging.debug('starting %s', bash) 98 | os.system(bash) 99 | 100 | pass 101 | -------------------------------------------------------------------------------- /pack/clickhouse-mysql.spec: -------------------------------------------------------------------------------- 1 | %define name clickhouse-mysql 2 | %define version 0.0.20180321 3 | %define release 1 4 | 5 | Summary: MySQL to ClickHouse data migrator 6 | Name: %{name} 7 | Version: %{version} 8 | Release: %{release} 9 | Source0: %{name}-%{version}.tar.gz 10 | License: MIT 11 | Group: Development/Libraries 12 | BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot 13 | Prefix: %{_prefix} 14 | BuildArch: noarch 15 | Vendor: Altinity (Vladislav Klimenko ) 16 | Packager: Altinity (Vladislav Klimenko ) 17 | Url: https://github.com/altinity/clickhouse-mysql-data-reader 18 | Requires: python34 19 | Requires: python34-devel 20 | Requires: python34-libs 21 | Requires: python34-pip 22 | Requires: python34-setuptools 23 | Requires: clickhouse-client 24 | Requires: mysql-community-devel 25 | Requires: gcc 26 | Buildrequires: python34 27 | Buildrequires: python34-devel 28 | Buildrequires: python34-libs 29 | Buildrequires: python34-pip 30 | Buildrequires: python34-setuptools 31 | 32 | %description 33 | MySQL to ClickHouse data migrator 34 | 35 | %prep 36 | set -x 37 | %setup -n %{name}-%{version} -n %{name}-%{version} 38 | 39 | %build 40 | set -x 41 | python3 setup.py build 42 | 43 | %install 44 | set -x 45 | #python3 setup.py install --single-version-externally-managed -O1 --root=$RPM_BUILD_ROOT --record=INSTALLED_FILES 46 | python3 setup.py install --single-version-externally-managed -O1 --root=%{buildroot} --record=INSTALLED_FILES --prefix=/usr 47 | 48 | mkdir -p %{buildroot}/etc/clickhouse-mysql 49 | mkdir -p %{buildroot}/etc/init.d 50 | #mkdir -p %{buildroot}/etc/systemd/system 51 | mkdir -p %{buildroot}/var/run/clickhouse-mysql 52 | mkdir -p %{buildroot}/var/log/clickhouse-mysql 53 | 54 | cp %{_builddir}/%{buildsubdir}/clickhouse_mysql.etc/clickhouse-mysql.conf %{buildroot}/etc/clickhouse-mysql/clickhouse-mysql-example.conf 55 | cp %{_builddir}/%{buildsubdir}/clickhouse_mysql.init.d/clickhouse-mysql %{buildroot}/etc/init.d/clickhouse-mysql 56 | #cp 
%{_builddir}/%{buildsubdir}/init.d/clickhouse-mysql.service %{buildroot}/etc/systemd/system/clickhouse-mysql.service 57 | 58 | %clean 59 | rm -rf %{buildroot} 60 | 61 | %files -f INSTALLED_FILES 62 | %defattr(-,root,root) 63 | /etc/clickhouse-mysql/clickhouse-mysql-example.conf 64 | /etc/init.d/clickhouse-mysql 65 | #/etc/systemd/system/clickhouse-mysql.service 66 | /var/run/clickhouse-mysql 67 | /var/log/clickhouse-mysql 68 | 69 | %post 70 | CLICKHOUSE_USER=clickhouse 71 | CLICKHOUSE_GROUP=${CLICKHOUSE_USER} 72 | CLICKHOUSE_DATADIR=/var/lib/clickhouse 73 | 74 | function create_system_user() 75 | { 76 | USER=$1 77 | GROUP=$2 78 | HOMEDIR=$3 79 | 80 | echo "Create user ${USER}.${GROUP} with datadir ${HOMEDIR}" 81 | 82 | # Make sure the administrative user exists 83 | if ! getent passwd ${USER} > /dev/null; then 84 | adduser \ 85 | --system \ 86 | --no-create-home \ 87 | --home ${HOMEDIR} \ 88 | --shell /sbin/nologin \ 89 | --comment "Clickhouse server" \ 90 | clickhouse > /dev/null 91 | fi 92 | 93 | # if the user was created manually, make sure the group is there as well 94 | if ! getent group ${GROUP} > /dev/null; then 95 | addgroup --system ${GROUP} > /dev/null 96 | fi 97 | 98 | # make sure user is in the correct group 99 | if ! id -Gn ${USER} | grep -qw ${USER}; then 100 | adduser ${USER} ${GROUP} > /dev/null 101 | fi 102 | 103 | # check validity of user and group 104 | if [ "`id -u ${USER}`" -eq 0 ]; then 105 | echo "The ${USER} system user must not have uid 0 (root). Please fix this and reinstall this package." >&2 106 | exit 1 107 | fi 108 | 109 | if [ "`id -g ${GROUP}`" -eq 0 ]; then 110 | echo "The ${USER} system user must not have root as primary group. Please fix this and reinstall this package." >&2 111 | exit 1 112 | fi 113 | } 114 | 115 | create_system_user $CLICKHOUSE_USER clickhouse 116 | 117 | chown -R $CLICKHOUSE_USER:$CLICKHOUSE_GROUP /var/run/clickhouse-mysql 118 | chown -R $CLICKHOUSE_USER:$CLICKHOUSE_GROUP /var/log/clickhouse-mysql 119 | 120 | /usr/bin/pip3 install mysqlclient 121 | /usr/bin/pip3 install mysql-replication 122 | /usr/bin/pip3 install clickhouse-driver 123 | /usr/bin/pip3 install configobj 124 | 125 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/chwriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import sys 6 | 7 | from decimal import Decimal 8 | 9 | from clickhouse_mysql.dbclient.chclient import CHClient 10 | 11 | from clickhouse_mysql.writer.writer import Writer 12 | from clickhouse_mysql.tableprocessor import TableProcessor 13 | from clickhouse_mysql.event.event import Event 14 | 15 | 16 | class CHWriter(Writer): 17 | """ClickHouse writer""" 18 | 19 | client = None 20 | dst_schema = None 21 | dst_table = None 22 | dst_distribute = None 23 | 24 | def __init__( 25 | self, 26 | connection_settings, 27 | dst_schema=None, 28 | dst_table=None, 29 | dst_table_prefix=None, 30 | dst_distribute=False, 31 | next_writer_builder=None, 32 | converter_builder=None, 33 | ): 34 | if dst_distribute and dst_schema is not None: 35 | dst_schema += "_all" 36 | if dst_distribute and dst_table is not None: 37 | dst_table += "_all" 38 | logging.info("CHWriter() connection_settings={} dst_schema={} dst_table={} dst_distribute={}".format( 39 | connection_settings, dst_schema, dst_table, dst_distribute)) 40 | self.client = CHClient(connection_settings) 41 | self.dst_schema = dst_schema 42 | 
self.dst_table = dst_table 43 | self.dst_table_prefix = dst_table_prefix 44 | self.dst_distribute = dst_distribute 45 | 46 | def insert(self, event_or_events=None): 47 | # event_or_events = [ 48 | # event: { 49 | # row: {'id': 3, 'a': 3} 50 | # }, 51 | # event: { 52 | # row: {'id': 3, 'a': 3} 53 | # }, 54 | # ] 55 | 56 | events = self.listify(event_or_events) 57 | if len(events) < 1: 58 | logging.warning('No events to insert. class: %s', __class__) 59 | return 60 | 61 | # assume we have at least one Event 62 | 63 | logging.debug('class:%s insert %d event(s)', __class__, len(events)) 64 | 65 | # verify and converts events and consolidate converted rows from all events into one batch 66 | 67 | rows = [] 68 | event_converted = None 69 | for event in events: 70 | if not event.verify: 71 | logging.warning('Event verification failed. Skip one event. Event: %s Class: %s', event.meta(), __class__) 72 | continue # for event 73 | 74 | event_converted = self.convert(event) 75 | for row in event_converted: 76 | for key in row.keys(): 77 | # we need to convert Decimal value to str value for suitable for table structure 78 | if type(row[key]) == Decimal: 79 | row[key] = str(row[key]) 80 | rows.append(row) 81 | 82 | logging.debug('class:%s insert %d row(s)', __class__, len(rows)) 83 | 84 | # determine target schema.table 85 | 86 | schema = self.dst_schema if self.dst_schema else event_converted.schema 87 | table = None 88 | if self.dst_distribute: 89 | table = TableProcessor.create_distributed_table_name(db=event_converted.schema, table=event_converted.table) 90 | else: 91 | table = self.dst_table if self.dst_table else event_converted.table 92 | if self.dst_schema: 93 | table = TableProcessor.create_migrated_table_name(prefix=self.dst_table_prefix, table=table) 94 | 95 | logging.debug("schema={} table={} self.dst_schema={} self.dst_table={}".format(schema, table, self.dst_schema, self.dst_table)) 96 | 97 | # and INSERT converted rows 98 | 99 | sql = '' 100 | try: 101 | sql = 'INSERT INTO `{0}`.`{1}` ({2}) VALUES'.format( 102 | schema, 103 | table, 104 | ', '.join(map(lambda column: '`%s`' % column, rows[0].keys())) 105 | ) 106 | self.client.execute(sql, rows) 107 | except Exception as ex: 108 | logging.critical('QUERY FAILED') 109 | logging.critical('ex={}'.format(ex)) 110 | logging.critical('sql={}'.format(sql)) 111 | sys.exit(0) 112 | 113 | # all DONE 114 | 115 | 116 | if __name__ == '__main__': 117 | connection_settings = { 118 | 'host': '192.168.74.230', 119 | 'port': 9000, 120 | 'user': 'default', 121 | 'passwd': '', 122 | } 123 | 124 | writer = CHWriter(connection_settings=connection_settings) 125 | writer.insert() 126 | -------------------------------------------------------------------------------- /clickhouse_mysql.init.d/clickhouse-mysql: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### BEGIN INIT INFO 3 | # Provides: clickhouse-mysql 4 | # Required-Start: $local_fs $network $named $time $syslog 5 | # Required-Stop: $local_fs $network $named $time $syslog 6 | # Default-Start: 2 3 4 5 7 | # Default-Stop: 0 1 6 8 | # Short-Description: clickhouse-mysql data migrator 9 | # Description: clickhouse-mysql helps to move data from MySQL to ClickHouse in background 10 | ### END INIT INFO 11 | 12 | 13 | #SERVICE_NAME=$(basename $0) 14 | SERVICE_NAME=clickhouse-mysql 15 | 16 | PID_FILE_DIR="/var/run/${SERVICE_NAME}" 17 | LOG_FILE_DIR="/var/log/${SERVICE_NAME}" 18 | 19 | PID_FILE="${PID_FILE_DIR}/${SERVICE_NAME}.pid" 20 | 
LOG_FILE="${LOG_FILE_DIR}/${SERVICE_NAME}.log" 21 | 22 | #RUN_CMD="python3 -m clickhouse_mysql.main --config-file=clickhouse-mysql.conf --daemon --log-file=$LOG_FILE --pid-file=$PID_FILE" 23 | RUN_CMD="/usr/bin/clickhouse-mysql --daemon --log-file=$LOG_FILE --pid-file=$PID_FILE" 24 | 25 | IS_CMD_CAPABLE="yes" 26 | #IS_CMD_CAPABLE="no" 27 | 28 | #RUN_CMD_AS_USER=user 29 | RUN_CMD_AS_USER=clickhouse 30 | WORK_DIR="/" 31 | 32 | SOFT_STOP_SIGNAL=SIGINT 33 | HARD_STOP_SIGNAL=SIGKILL 34 | 35 | DEBUG="no" 36 | #DEBUG="yes" 37 | 38 | 39 | function is_root() 40 | { 41 | [ "$EUID" -eq 0 ] 42 | } 43 | 44 | function get_pid() 45 | { 46 | [ -f "$PID_FILE" ] && cat "$PID_FILE" 47 | } 48 | 49 | function delete_pid_file() 50 | { 51 | if [ -f "$PID_FILE" ]; then 52 | rm "$PID_FILE" 53 | fi 54 | } 55 | 56 | function is_fs_item_writable_by_user() 57 | { 58 | local FS_ITEM_PATH=$1 59 | sudo -u $RUN_CMD_AS_USER bash -c "[ -w \"$FS_ITEM_PATH\" ]" 60 | } 61 | 62 | function is_file_usable() 63 | { 64 | local FILE=$1 65 | local DIR=$(dirname "${FILE}") 66 | is_fs_item_writable_by_user "$FILE" || is_fs_item_writable_by_user "$DIR" 67 | } 68 | 69 | function is_running() 70 | { 71 | kill -0 $(get_pid) &> /dev/null 72 | } 73 | 74 | function start_capable_service() 75 | { 76 | sudo -u $RUN_CMD_AS_USER bash -c "$RUN_CMD" 77 | } 78 | 79 | function start_incapable_service() 80 | { 81 | local CMD="${RUN_CMD} &> \"$LOG_FILE\" & echo \$! > \"$PID_FILE\"" 82 | sudo -u $RUN_CMD_AS_USER bash -c "$CMD" 83 | } 84 | 85 | function create_dirs() 86 | { 87 | if [ ! -d "${PID_FILE_DIR}" ]; then 88 | mkdir -p "${PID_FILE_DIR}" 89 | chown "${RUN_CMD_AS_USER}" "${PID_FILE_DIR}" 90 | fi 91 | 92 | if [ ! -d "${LOG_FILE_DIR}" ]; then 93 | mkdir -p "${LOG_FILE_DIR}" 94 | chown "${RUN_CMD_AS_USER}" "${LOG_FILE_DIR}" 95 | fi 96 | } 97 | 98 | function start() 99 | { 100 | if is_running; then 101 | echo "$SERVICE_NAME already running as: " 102 | ps -p $(get_pid) 103 | echo 104 | exit 1 105 | fi 106 | 107 | create_dirs 108 | 109 | if ! is_file_usable "$LOG_FILE"; then 110 | echo "Log file $LOG_FILE is not writable by user $RUN_CMD_AS_USER. Please check path" 111 | exit 1 112 | fi 113 | 114 | if ! is_file_usable "$PID_FILE"; then 115 | echo "Pid file $PID_FILE is not writable by user $RUN_CMD_AS_USER. Please check path" 116 | exit 1 117 | fi 118 | 119 | echo "Starting $SERVICE_NAME" 120 | cd $WORK_DIR 121 | 122 | if [ $IS_CMD_CAPABLE == "yes" ]; then 123 | start_capable_service 124 | else 125 | start_incapable_service 126 | fi 127 | 128 | cd - 129 | 130 | if ! is_running; then 131 | echo "ERROR: unable to start $SERVICE_NAME" 132 | echo "Check log file(s)" 133 | echo $LOG_FILE 134 | exit 1 135 | fi 136 | } 137 | 138 | function stop() 139 | { 140 | local STOP_SIGNAL=$1 141 | 142 | if ! is_running; then 143 | echo "$SERVICE_NAME is not running" 144 | exit 1 145 | fi 146 | 147 | echo "Send $STOP_SIGNAL to $SERVICE_NAME" 148 | kill -${STOP_SIGNAL} $(get_pid) 149 | 150 | echo "Wait for $SERVICE_NAME to exit" 151 | while is_running; do 152 | echo -n '.' 
153 | sleep 1 154 | done 155 | echo "$SERVICE_NAME is not running" 156 | 157 | delete_pid_file 158 | } 159 | 160 | function stop_soft() 161 | { 162 | stop $SOFT_STOP_SIGNAL 163 | } 164 | 165 | function stop_hard() 166 | { 167 | stop $HARD_STOP_SIGNAL 168 | } 169 | 170 | function restart() 171 | { 172 | stop_soft 173 | start 174 | } 175 | 176 | function status() 177 | { 178 | echo "$SERVICE_NAME status" 179 | if is_running; then 180 | echo "$SERVICE_NAME is running as: " 181 | ps -p $(get_pid) 182 | echo 183 | else 184 | # is not running 185 | local PID=$(get_pid) 186 | if [ -z "$PID" ]; then 187 | echo "$SERVICE_NAME is not running" 188 | else 189 | echo "$SERVICE_NAME is not running, but PID file $PID_FILE exists" 190 | fi 191 | fi 192 | } 193 | 194 | if [ "$DEBUG" == "yes" ]; then 195 | set -x 196 | set -e 197 | fi 198 | 199 | if ! is_root; then 200 | echo "Please run as root" 201 | exit 1 202 | fi 203 | 204 | case "$1" in 205 | start) 206 | start 207 | ;; 208 | stop) 209 | stop_soft 210 | ;; 211 | stop_hard) 212 | stop_hard 213 | ;; 214 | restart) 215 | restart 216 | ;; 217 | status) 218 | status 219 | ;; 220 | *) 221 | echo "Usage: $0 {start|stop|stop_hard|restart|status}" 222 | esac 223 | 224 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_schema_ch.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS `airline`; 2 | CREATE TABLE IF NOT EXISTS `airline`.`ontime` ( 3 | `Year` UInt16, 4 | `Quarter` Nullable(UInt8), 5 | `Month` UInt8, 6 | `DayofMonth` Nullable(UInt8), 7 | `DayOfWeek` Nullable(UInt8), 8 | `FlightDate` Date, 9 | `UniqueCarrier` Nullable(String), 10 | `AirlineID` Nullable(UInt32), 11 | `Carrier` Nullable(String), 12 | `TailNum` Nullable(String), 13 | `FlightNum` Nullable(String), 14 | `OriginAirportID` Nullable(UInt32), 15 | `OriginAirportSeqID` Nullable(UInt32), 16 | `OriginCityMarketID` Nullable(UInt32), 17 | `Origin` Nullable(String), 18 | `OriginCityName` Nullable(String), 19 | `OriginState` Nullable(String), 20 | `OriginStateFips` Nullable(String), 21 | `OriginStateName` Nullable(String), 22 | `OriginWac` Nullable(UInt32), 23 | `DestAirportID` Nullable(UInt32), 24 | `DestAirportSeqID` Nullable(UInt32), 25 | `DestCityMarketID` Nullable(UInt32), 26 | `Dest` Nullable(String), 27 | `DestCityName` Nullable(String), 28 | `DestState` Nullable(String), 29 | `DestStateFips` Nullable(String), 30 | `DestStateName` Nullable(String), 31 | `DestWac` Nullable(UInt32), 32 | `CRSDepTime` Nullable(UInt32), 33 | `DepTime` Nullable(UInt32), 34 | `DepDelay` Nullable(Float32), 35 | `DepDelayMinutes` Nullable(Float32), 36 | `DepDel15` Nullable(Float32), 37 | `DepartureDelayGroups` Nullable(Int32), 38 | `DepTimeBlk` Nullable(String), 39 | `TaxiOut` Nullable(Float32), 40 | `WheelsOff` Nullable(UInt32), 41 | `WheelsOn` Nullable(UInt32), 42 | `TaxiIn` Nullable(Float32), 43 | `CRSArrTime` Nullable(UInt32), 44 | `ArrTime` Nullable(UInt32), 45 | `ArrDelay` Nullable(Float32), 46 | `ArrDelayMinutes` Nullable(Float32), 47 | `ArrDel15` Nullable(Float32), 48 | `ArrivalDelayGroups` Nullable(Int32), 49 | `ArrTimeBlk` Nullable(String), 50 | `Cancelled` Nullable(Float32), 51 | `CancellationCode` Nullable(String), 52 | `Diverted` Nullable(Float32), 53 | `CRSElapsedTime` Nullable(Float32), 54 | `ActualElapsedTime` Nullable(Float32), 55 | `AirTime` Nullable(Float32), 56 | `Flights` Nullable(Float32), 57 | `Distance` Nullable(Float32), 58 | `DistanceGroup` 
Nullable(Float32), 59 | `CarrierDelay` Nullable(Float32), 60 | `WeatherDelay` Nullable(Float32), 61 | `NASDelay` Nullable(Float32), 62 | `SecurityDelay` Nullable(Float32), 63 | `LateAircraftDelay` Nullable(Float32), 64 | `FirstDepTime` Nullable(String), 65 | `TotalAddGTime` Nullable(String), 66 | `LongestAddGTime` Nullable(String), 67 | `DivAirportLandings` Nullable(String), 68 | `DivReachedDest` Nullable(String), 69 | `DivActualElapsedTime` Nullable(String), 70 | `DivArrDelay` Nullable(String), 71 | `DivDistance` Nullable(String), 72 | `Div1Airport` Nullable(String), 73 | `Div1AirportID` Nullable(UInt32), 74 | `Div1AirportSeqID` Nullable(UInt32), 75 | `Div1WheelsOn` Nullable(String), 76 | `Div1TotalGTime` Nullable(String), 77 | `Div1LongestGTime` Nullable(String), 78 | `Div1WheelsOff` Nullable(String), 79 | `Div1TailNum` Nullable(String), 80 | `Div2Airport` Nullable(String), 81 | `Div2AirportID` Nullable(UInt32), 82 | `Div2AirportSeqID` Nullable(UInt32), 83 | `Div2WheelsOn` Nullable(String), 84 | `Div2TotalGTime` Nullable(String), 85 | `Div2LongestGTime` Nullable(String), 86 | `Div2WheelsOff` Nullable(String), 87 | `Div2TailNum` Nullable(String), 88 | `Div3Airport` Nullable(String), 89 | `Div3AirportID` Nullable(UInt32), 90 | `Div3AirportSeqID` Nullable(UInt32), 91 | `Div3WheelsOn` Nullable(String), 92 | `Div3TotalGTime` Nullable(String), 93 | `Div3LongestGTime` Nullable(String), 94 | `Div3WheelsOff` Nullable(String), 95 | `Div3TailNum` Nullable(String), 96 | `Div4Airport` Nullable(String), 97 | `Div4AirportID` Nullable(UInt32), 98 | `Div4AirportSeqID` Nullable(UInt32), 99 | `Div4WheelsOn` Nullable(String), 100 | `Div4TotalGTime` Nullable(String), 101 | `Div4LongestGTime` Nullable(String), 102 | `Div4WheelsOff` Nullable(String), 103 | `Div4TailNum` Nullable(String), 104 | `Div5Airport` Nullable(String), 105 | `Div5AirportID` Nullable(UInt32), 106 | `Div5AirportSeqID` Nullable(UInt32), 107 | `Div5WheelsOn` Nullable(String), 108 | `Div5TotalGTime` Nullable(String), 109 | `Div5LongestGTime` Nullable(String), 110 | `Div5WheelsOff` Nullable(String), 111 | `Div5TailNum` Nullable(String) 112 | ) ENGINE = MergeTree(FlightDate, (FlightDate, Year, Month), 8192); 113 | 114 | -------------------------------------------------------------------------------- /clickhouse_mysql/event/event.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Event(object): 6 | 7 | # main payload - one or multiple rows 8 | 9 | # native mysql replication event 10 | # one of from pymysqlreplication.row_event import 11 | # contains rows internally 12 | pymysqlreplication_event = None 13 | 14 | # one row payload 15 | # {'id':1, 'col1':1} 16 | row = None 17 | 18 | # multi-rows payload 19 | # [{'id': 1, 'col1':1}, {'id': 2, 'col1': 2}, {'id': 3, 'col1': 3}] 20 | rows = None 21 | 22 | # additional meta-information 23 | # source-dependent 24 | 25 | # db name 26 | schema = None 27 | 28 | # table name 29 | table = None 30 | 31 | # /path/to/csv/file.csv 32 | filename = None 33 | 34 | # ['id', 'col1', 'col2'] 35 | fieldnames = None 36 | 37 | # payload rows iterator 38 | _iter = None 39 | 40 | def __iter__(self): 41 | # create payload rows iterator 42 | 43 | if self.pymysqlreplication_event is not None: 44 | # we have native replication event - would iterate over its rows 45 | self._iter = iter(self.pymysqlreplication_event.rows) 46 | 47 | elif self.row is not None: 48 | # we have one row - would iterate over list of one row 49 | 
self._iter = iter([self.row]) 50 | 51 | else: 52 | # assume multiple rows - would iterate over them 53 | self._iter = iter(self.rows) 54 | 55 | return self 56 | 57 | def __next__(self): 58 | # fetch next item from iterator 59 | 60 | item = next(self._iter) 61 | 62 | if self.pymysqlreplication_event is not None: 63 | # in native replication event actual data are in row['values'] dict item 64 | return item['values'] 65 | else: 66 | # local-kept data 67 | return item 68 | 69 | def convert(self, converter): 70 | self.row = converter.row(self.row) 71 | self.rows = converter.rows(self.rows) 72 | 73 | def first_row(self): 74 | return next(iter(self or []), None) 75 | 76 | def verify(self): 77 | # verify Event has correct data structure 78 | 79 | if self.pymysqlreplication_event is not None: 80 | # have native replication event must have some data 81 | # must have non-empty list of rows with 'value' 82 | # data item of reasonable len 83 | if (self.pymysqlreplication_event.rows is not None) \ 84 | and isinstance(self.pymysqlreplication_event.rows, list) \ 85 | and (len(self.pymysqlreplication_event.rows) > 0) \ 86 | and ('values' in self.pymysqlreplication_event.rows[0]) \ 87 | and (len(self.pymysqlreplication_event.rows[0]['values']) > 0): 88 | return True 89 | else: 90 | return False 91 | 92 | if self.row is not None: 93 | # one row of data must be of a reasonable len 94 | if isinstance(self.row, dict) and (len(self.row) > 0): 95 | return True 96 | else: 97 | return False 98 | 99 | if self.rows is not None: 100 | # rows of data must contain list of dicts 101 | if isinstance(self.rows, list) \ 102 | and (len(self.rows) > 0) \ 103 | and isinstance(self.rows[0], dict) \ 104 | and (len(self.rows[0]) > 0): 105 | return True 106 | else: 107 | return False 108 | 109 | # no data available? 
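        # (neither pymysqlreplication_event nor row nor rows is set, so there is nothing to verify)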
110 | return False 111 | 112 | def meta(self): 113 | # meta info 114 | 115 | meta = '' 116 | if self.pymysqlreplication_event is not None: 117 | meta += ' mysql_event set' 118 | if (self.pymysqlreplication_event.rows is not None): 119 | meta += ' mysql_event.rows set' 120 | if isinstance(self.pymysqlreplication_event.rows, list): 121 | meta += ' mysql_event.rows is a list' 122 | meta += ' len(mysql_event.rows)=' + len(self.pymysqlreplication_event.rows) 123 | if (len(self.pymysqlreplication_event.rows) > 0) \ 124 | and ('values' in self.pymysqlreplication_event.rows[0]): 125 | meta += ' mysql_event.rows[0][values] is set' 126 | 127 | if len(self.pymysqlreplication_event.rows[0]['values']) > 0: 128 | meta += ' len(mysql_event.rows[0][values])=' + len(self.pymysqlreplication_event.rows[0]['values']) 129 | else: 130 | meta += ' mysql_event not set' 131 | 132 | if self.row is not None: 133 | meta += ' row set' 134 | if isinstance(self.row, dict): 135 | meta += ' is dict len()=' + len(self.row) 136 | else: 137 | meta += ' is not a dict' 138 | else: 139 | meta += ' row not set' 140 | 141 | if self.rows is not None: 142 | meta += ' rows set' 143 | if isinstance(self.rows, list): 144 | meta += ' rows is a list len(rows)=' + len(self.rows) 145 | if (len(self.rows) > 0) and isinstance(self.rows[0], dict): 146 | meta += ' with dict() len(dict)=' + len(self.rows[0]) 147 | else: 148 | meta += ' rows is not a list' 149 | else: 150 | meta += ' rows not set' 151 | 152 | return meta 153 | 154 | def column_names(self): 155 | # fetch column names from data 156 | return self.first_row().keys() 157 | -------------------------------------------------------------------------------- /clickhouse_mysql/writer/csvwriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | import os.path 6 | import logging 7 | import copy 8 | import time 9 | import uuid 10 | 11 | from clickhouse_mysql.writer.writer import Writer 12 | from clickhouse_mysql.event.event import Event 13 | 14 | 15 | class CSVWriter(Writer): 16 | """Write CSV files""" 17 | 18 | file = None 19 | path = None 20 | writer = None 21 | dst_schema = None 22 | dst_table = None 23 | fieldnames = None 24 | header_written = False 25 | path_prefix = None 26 | path_suffix_parts = [] 27 | delete = False 28 | 29 | def __init__( 30 | self, 31 | csv_file_path=None, 32 | csv_file_path_prefix=None, 33 | csv_file_path_suffix_parts=[], 34 | csv_keep_file=False, 35 | dst_schema=None, 36 | dst_table=None, 37 | dst_table_prefix=None, 38 | next_writer_builder=None, 39 | converter_builder=None, 40 | ): 41 | logging.info("CSVWriter() " 42 | "csv_file_path={} " 43 | "csv_file_path_prefix={} " 44 | "csv_file_path_suffix_parts={} " 45 | "csv_keep_file={} " 46 | "dst_schema={} " 47 | "dst_table={} ".format( 48 | csv_file_path, 49 | csv_file_path_prefix, 50 | csv_file_path_suffix_parts, 51 | csv_keep_file, 52 | dst_schema, 53 | dst_table, 54 | )) 55 | super().__init__(next_writer_builder=next_writer_builder, converter_builder=converter_builder) 56 | 57 | self.path = csv_file_path 58 | self.path_prefix = csv_file_path_prefix 59 | self.path_suffix_parts = csv_file_path_suffix_parts 60 | self.dst_schema = dst_schema 61 | self.dst_table = dst_table 62 | self.dst_table_prefix = dst_table_prefix 63 | 64 | if self.path is None: 65 | if not self.path_suffix_parts: 66 | # no suffix parts specified - use default ones 67 | # 1. 
current UNIX timestamp with fractions Ex.: 1521813908.1152523 68 | # 2. random-generated UUID Ex.: f42d7297-9d25-43d8-8468-a59810ce9f77 69 | # result would be filename like csvpool_1521813908.1152523_f42d7297-9d25-43d8-8468-a59810ce9f77.csv 70 | self.path_suffix_parts.append(str(time.time())) 71 | self.path_suffix_parts.append(str(uuid.uuid4())) 72 | self.path = self.path_prefix + '_'.join(self.path_suffix_parts) + '.csv' 73 | self.delete = not csv_keep_file 74 | 75 | logging.info("CSVWriter() self.path={}".format(self.path)) 76 | 77 | def __del__(self): 78 | self.destroy() 79 | 80 | def opened(self): 81 | return bool(self.file) 82 | 83 | def open(self): 84 | if not self.opened(): 85 | # do not write header to already existing file 86 | # assume it was written earlier 87 | if os.path.isfile(self.path): 88 | self.header_written = True 89 | # open file for write-at-the-end mode 90 | self.file = open(self.path, 'a+') 91 | 92 | def insert(self, event_or_events): 93 | # event_or_events = [ 94 | # event: { 95 | # row: {'id': 3, 'a': 3} 96 | # }, 97 | # event: { 98 | # row: {'id': 3, 'a': 3} 99 | # }, 100 | # ] 101 | 102 | events = self.listify(event_or_events) 103 | if len(events) < 1: 104 | logging.warning('No events to insert. class: %s', __class__) 105 | return 106 | 107 | # assume we have at least one Event 108 | 109 | logging.debug('class:%s insert %d events', __class__, len(events)) 110 | 111 | if not self.opened(): 112 | self.open() 113 | 114 | if not self.writer: 115 | # pick any event from the list 116 | event = events[0] 117 | if not event.verify: 118 | logging.warning('Event verification failed. Skip insert(). Event: %s Class: %s', event.meta(), __class__) 119 | return 120 | 121 | self.fieldnames = sorted(self.convert(copy.copy(event.first_row())).keys()) 122 | if self.dst_schema is None: 123 | self.dst_schema = event.schema 124 | if self.dst_table is None: 125 | self.dst_table = event.table 126 | 127 | self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames) 128 | if not self.header_written: 129 | self.writer.writeheader() 130 | 131 | for event in events: 132 | if not event.verify: 133 | logging.warning('Event verification failed. Skip one event. 
Event: %s Class: %s', event.meta(), __class__) 134 | continue # for event 135 | for row in event: 136 | self.writer.writerow(self.convert(row)) 137 | 138 | def push(self): 139 | if not self.next_writer_builder or not self.fieldnames: 140 | return 141 | 142 | event = Event() 143 | event.schema = self.dst_schema 144 | event.table = self.dst_table 145 | event.filename = self.path 146 | event.fieldnames = self.fieldnames 147 | self.next_writer_builder.get().insert(event) 148 | 149 | def close(self): 150 | if self.opened(): 151 | self.file.flush() 152 | self.file.close() 153 | self.file = None 154 | self.writer = None 155 | 156 | def destroy(self): 157 | if self.delete and os.path.isfile(self.path): 158 | self.close() 159 | os.remove(self.path) 160 | 161 | if __name__ == '__main__': 162 | path = 'file.csv' 163 | 164 | writer = CSVWriter(path) 165 | writer.open() 166 | event = Event() 167 | event.row_converted={ 168 | 'a': 123, 169 | 'b': 456, 170 | 'c': 'qwe', 171 | 'd': 'rty', 172 | } 173 | writer.insert(event) 174 | event.row_converted={ 175 | 'a': 789, 176 | 'b': 987, 177 | 'c': 'asd', 178 | 'd': 'fgh', 179 | } 180 | writer.insert(event) 181 | writer.close() 182 | -------------------------------------------------------------------------------- /clickhouse_mysql_examples/airline_ontime_schema_mysql.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS `airline`; 2 | CREATE TABLE IF NOT EXISTS `airline`.`ontime` ( 3 | `Year` SMALLINT UNSIGNED, -- maps to UInt16, 4 | `Quarter` TINYINT UNSIGNED, -- maps to UInt8, 5 | `Month` TINYINT UNSIGNED, -- maps to UInt8, 6 | `DayofMonth` TINYINT UNSIGNED, -- maps to UInt8, 7 | `DayOfWeek` TINYINT UNSIGNED, -- maps to UInt8, 8 | `FlightDate` DATE, -- maps to Date, 9 | `UniqueCarrier` LONGTEXT, -- maps to String, 10 | `AirlineID` INTEGER UNSIGNED, -- maps to UInt32, 11 | `Carrier` LONGTEXT, -- maps to String, 12 | `TailNum` LONGTEXT, -- maps to String, 13 | `FlightNum` LONGTEXT, -- maps to String, 14 | `OriginAirportID` INTEGER UNSIGNED, -- maps to UInt32, 15 | `OriginAirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 16 | `OriginCityMarketID` INTEGER UNSIGNED, -- maps to UInt32, 17 | `Origin` LONGTEXT, -- maps to String, 18 | `OriginCityName` LONGTEXT, -- maps to String, 19 | `OriginState` LONGTEXT, -- maps to String, 20 | `OriginStateFips` LONGTEXT, -- maps to String, 21 | `OriginStateName` LONGTEXT, -- maps to String, 22 | `OriginWac` INTEGER UNSIGNED, -- maps to UInt32, 23 | `DestAirportID` INTEGER UNSIGNED, -- maps to UInt32, 24 | `DestAirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 25 | `DestCityMarketID` INTEGER UNSIGNED, -- maps to UInt32, 26 | `Dest` LONGTEXT, -- maps to String, 27 | `DestCityName` LONGTEXT, -- maps to String, 28 | `DestState` LONGTEXT, -- maps to String, 29 | `DestStateFips` LONGTEXT, -- maps to String, 30 | `DestStateName` LONGTEXT, -- maps to String, 31 | `DestWac` INTEGER UNSIGNED, -- maps to UInt32, 32 | `CRSDepTime` INTEGER UNSIGNED, -- maps to UInt32, 33 | `DepTime` INTEGER UNSIGNED, -- maps to UInt32, 34 | `DepDelay` FLOAT, -- maps to Float32, 35 | `DepDelayMinutes` FLOAT, -- maps to Float32, 36 | `DepDel15` FLOAT, -- maps to Float32, 37 | `DepartureDelayGroups` INTEGER, -- maps to Int32, 38 | `DepTimeBlk` LONGTEXT, -- maps to String, 39 | `TaxiOut` FLOAT, -- maps to Float32, 40 | `WheelsOff` INTEGER UNSIGNED, -- maps to UInt32, 41 | `WheelsOn` INTEGER UNSIGNED, -- maps to UInt32, 42 | `TaxiIn` FLOAT, -- maps to Float32, 43 | `CRSArrTime` INTEGER UNSIGNED, 
-- maps to UInt32, 44 | `ArrTime` INTEGER UNSIGNED, -- maps to UInt32, 45 | `ArrDelay` FLOAT, -- maps to Float32, 46 | `ArrDelayMinutes` FLOAT, -- maps to Float32, 47 | `ArrDel15` FLOAT, -- maps to Float32, 48 | `ArrivalDelayGroups` INTEGER, -- maps to Int32, 49 | `ArrTimeBlk` LONGTEXT, -- maps to String, 50 | `Cancelled` FLOAT, -- maps to Float32, 51 | `CancellationCode` LONGTEXT, -- maps to String, 52 | `Diverted` FLOAT, -- maps to Float32, 53 | `CRSElapsedTime` FLOAT, -- maps to Float32, 54 | `ActualElapsedTime` FLOAT, -- maps to Float32, 55 | `AirTime` FLOAT, -- maps to Float32, 56 | `Flights` FLOAT, -- maps to Float32, 57 | `Distance` FLOAT, -- maps to Float32, 58 | `DistanceGroup` FLOAT, -- maps to Float32, 59 | `CarrierDelay` FLOAT, -- maps to Float32, 60 | `WeatherDelay` FLOAT, -- maps to Float32, 61 | `NASDelay` FLOAT, -- maps to Float32, 62 | `SecurityDelay` FLOAT, -- maps to Float32, 63 | `LateAircraftDelay` FLOAT, -- maps to Float32, 64 | `FirstDepTime` LONGTEXT, -- maps to String, 65 | `TotalAddGTime` LONGTEXT, -- maps to String, 66 | `LongestAddGTime` LONGTEXT, -- maps to String, 67 | `DivAirportLandings` LONGTEXT, -- maps to String, 68 | `DivReachedDest` LONGTEXT, -- maps to String, 69 | `DivActualElapsedTime` LONGTEXT, -- maps to String, 70 | `DivArrDelay` LONGTEXT, -- maps to String, 71 | `DivDistance` LONGTEXT, -- maps to String, 72 | `Div1Airport` LONGTEXT, -- maps to String, 73 | `Div1AirportID` INTEGER UNSIGNED, -- maps to UInt32, 74 | `Div1AirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 75 | `Div1WheelsOn` LONGTEXT, -- maps to String, 76 | `Div1TotalGTime` LONGTEXT, -- maps to String, 77 | `Div1LongestGTime` LONGTEXT, -- maps to String, 78 | `Div1WheelsOff` LONGTEXT, -- maps to String, 79 | `Div1TailNum` LONGTEXT, -- maps to String, 80 | `Div2Airport` LONGTEXT, -- maps to String, 81 | `Div2AirportID` INTEGER UNSIGNED, -- maps to UInt32, 82 | `Div2AirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 83 | `Div2WheelsOn` LONGTEXT, -- maps to String, 84 | `Div2TotalGTime` LONGTEXT, -- maps to String, 85 | `Div2LongestGTime` LONGTEXT, -- maps to String, 86 | `Div2WheelsOff` LONGTEXT, -- maps to String, 87 | `Div2TailNum` LONGTEXT, -- maps to String, 88 | `Div3Airport` LONGTEXT, -- maps to String, 89 | `Div3AirportID` INTEGER UNSIGNED, -- maps to UInt32, 90 | `Div3AirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 91 | `Div3WheelsOn` LONGTEXT, -- maps to String, 92 | `Div3TotalGTime` LONGTEXT, -- maps to String, 93 | `Div3LongestGTime` LONGTEXT, -- maps to String, 94 | `Div3WheelsOff` LONGTEXT, -- maps to String, 95 | `Div3TailNum` LONGTEXT, -- maps to String, 96 | `Div4Airport` LONGTEXT, -- maps to String, 97 | `Div4AirportID` INTEGER UNSIGNED, -- maps to UInt32, 98 | `Div4AirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 99 | `Div4WheelsOn` LONGTEXT, -- maps to String, 100 | `Div4TotalGTime` LONGTEXT, -- maps to String, 101 | `Div4LongestGTime` LONGTEXT, -- maps to String, 102 | `Div4WheelsOff` LONGTEXT, -- maps to String, 103 | `Div4TailNum` LONGTEXT, -- maps to String, 104 | `Div5Airport` LONGTEXT, -- maps to String, 105 | `Div5AirportID` INTEGER UNSIGNED, -- maps to UInt32, 106 | `Div5AirportSeqID` INTEGER UNSIGNED, -- maps to UInt32, 107 | `Div5WheelsOn` LONGTEXT, -- maps to String, 108 | `Div5TotalGTime` LONGTEXT, -- maps to String, 109 | `Div5LongestGTime` LONGTEXT, -- maps to String, 110 | `Div5WheelsOff` LONGTEXT, -- maps to String, 111 | `Div5TailNum` LONGTEXT -- maps to String 112 | ); 113 | 
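-- Illustrative manual load (not part of the schema): assuming a downloaded on-time CSV whose
-- column order matches the table above and a server/client pair with local_infile enabled,
-- one month of data could be imported roughly like this (file path is hypothetical):
--
--   LOAD DATA LOCAL INFILE '/tmp/ontime_2017_01.csv'
--   INTO TABLE `airline`.`ontime`
--   FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
--   IGNORE 1 LINES;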
-------------------------------------------------------------------------------- /clickhouse_mysql/pool/bbpool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import time 5 | import logging 6 | 7 | from clickhouse_mysql.pool.pool import Pool 8 | from clickhouse_mysql.objectbuilder import ObjectBuilder 9 | 10 | 11 | # Buckets Belts' Index Generator 12 | class BBIndexGenerator(object): 13 | 14 | def generate(self, item): 15 | # build key of the belt on which to place item 16 | return str(item.schema) + '.' + str(item.table) 17 | 18 | 19 | # Buckets Belts Pool 20 | class BBPool(Pool): 21 | 22 | # buckets on the belts 23 | belts = { 24 | # pour data into 0-index bucket 25 | # 'key.1': [[item,], [item, item, item,], [item, item, item,]] 26 | # 'key.2': [[item,], [item, item, item,], [item, item, item,]] 27 | } 28 | 29 | belts_rotated_at = { 30 | # 'key.1': UNIX TIMESTAMP 31 | # 'key.2': UNIX TIMESTAMP 32 | } 33 | 34 | buckets_num_total = 0 35 | items_num_total = 0; 36 | 37 | prev_time = None 38 | prev_buckets_count = 0 39 | prev_items_count = 0; 40 | 41 | def __init__( 42 | self, 43 | writer_builder=None, 44 | key_builder=None, 45 | max_bucket_size=10000, 46 | max_belt_size=1, 47 | max_interval_between_rotations=60, 48 | ): 49 | super().__init__( 50 | writer_builder=writer_builder, 51 | key_builder=ObjectBuilder(class_name=BBIndexGenerator), 52 | max_bucket_size=max_bucket_size, 53 | max_belt_size=max_belt_size, 54 | max_interval_between_rotations=max_interval_between_rotations, 55 | ) 56 | 57 | def create_belt(self, belt_index): 58 | """create belt with one empty bucket""" 59 | 60 | self.belts[belt_index] = [[]] 61 | self.belts_rotated_at[belt_index] = int(time.time()) 62 | 63 | def insert(self, item): 64 | """Insert item into pool""" 65 | 66 | # which belt we'll insert item? 
67 | belt_index = self.key_generator.generate(item) 68 | 69 | # register belt if not yet 70 | if belt_index not in self.belts: 71 | self.create_belt(belt_index) 72 | 73 | # append item to the 0-indexed bucket of the specified belt 74 | self.belts[belt_index][0].append(item) 75 | 76 | # try to rotate belt - may it it already should be rotated 77 | self.rotate_belt(belt_index) 78 | 79 | def flush(self, key=None): 80 | """Flush all buckets from the belt and delete the belt itself""" 81 | 82 | belt_index = key 83 | empty_belts_indexes = [] 84 | 85 | if belt_index is None: 86 | for b_index in self.belts: 87 | if self.rotate_belt(b_index, flush=True): 88 | empty_belts_indexes.append(b_index) 89 | else: 90 | if self.rotate_belt(belt_index, flush=True): 91 | empty_belts_indexes.append(belt_index) 92 | 93 | # delete belt 94 | for b_index in empty_belts_indexes: 95 | self.belts.pop(b_index) 96 | self.belts_rotated_at.pop(b_index) 97 | 98 | def rotate_belt(self, belt_index, flush=False): 99 | """Try to rotate belt""" 100 | 101 | now = time.time() 102 | 103 | if flush: 104 | # explicit flush requested 105 | rotate_reason = "FLUSH" 106 | 107 | elif len(self.belts[belt_index][0]) >= self.max_bucket_size: 108 | # 0-index bucket is full 109 | rotate_reason = "SIZE" 110 | 111 | elif now >= self.belts_rotated_at[belt_index] + self.max_interval_between_rotations: 112 | # time interval reached 113 | rotate_reason = "TIME" 114 | 115 | else: 116 | # no need to rotate belt - belt not rotated 117 | return False 118 | 119 | # belt(s) needs rotation 120 | 121 | # insert empty bucket into the beginning of the belt 122 | self.belts[belt_index].insert(0, []) 123 | self.belts_rotated_at[belt_index] = now 124 | 125 | # in case we flush belt we'll keep one just inserted empty bucket 126 | buckets_num_left_on_belt = 1 if flush else self.max_belt_size 127 | 128 | while len(self.belts[belt_index]) > buckets_num_left_on_belt: 129 | # too many buckets on the belt 130 | # time to rotate belt and flush the most-right-bucket 131 | 132 | buckets_on_belt_num = len(self.belts[belt_index]) 133 | most_right_bucket_size = len(self.belts[belt_index][buckets_on_belt_num-1]) 134 | 135 | self.buckets_num_total += 1 136 | self.items_num_total += most_right_bucket_size 137 | 138 | logging.info('rot now:%f bktttl:%d bktitemsttl: %d index:%s reason:%s bktsonbelt:%d bktsize:%d beltnum:%d', 139 | now, 140 | self.buckets_num_total, 141 | self.items_num_total, 142 | str(belt_index), 143 | rotate_reason, 144 | buckets_on_belt_num, 145 | most_right_bucket_size, 146 | len(self.belts), 147 | ) 148 | 149 | # time to flush data for specified key 150 | #self.writer_builder.param('csv_file_path_suffix_parts', [str(int(now)), str(self.buckets_num_total)]) 151 | writer = self.writer_builder.new() 152 | writer.insert(self.belts[belt_index].pop()) 153 | writer.close() 154 | writer.push() 155 | writer.destroy() 156 | del writer 157 | 158 | if self.prev_time is not None: 159 | # have previous time - meaning this is at least second rotate 160 | # can calculate belt speed 161 | window_size = now - self.prev_time 162 | if window_size > 0: 163 | buckets_per_sec = (self.buckets_num_total - self.prev_buckets_count) / window_size 164 | items_per_sec = (self.items_num_total - self.prev_items_count) / window_size 165 | logging.info( 166 | 'PERF - buckets_per_sec:%f items_per_sec:%f for last %d sec', 167 | buckets_per_sec, 168 | items_per_sec, 169 | window_size 170 | ) 171 | else: 172 | logging.info("PERF - buckets window size=0 can not calc performance for this 
window") 173 | 174 | self.prev_time = now 175 | self.prev_buckets_count = self.buckets_num_total 176 | self.prev_items_count = self.items_num_total 177 | 178 | # belt rotated 179 | return True 180 | -------------------------------------------------------------------------------- /clickhouse_mysql/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import multiprocessing as mp 6 | import logging 7 | import traceback 8 | import pprint 9 | import json 10 | import os 11 | import pkg_resources 12 | import shutil 13 | 14 | if sys.version_info < (3, 4): 15 | print("Python version is NOT OK, need 3.4 at least") 16 | sys.exit(1) 17 | 18 | from clickhouse_mysql.clioptions import CLIOptions 19 | from clickhouse_mysql.pumper import Pumper 20 | from clickhouse_mysql.daemon import Daemon 21 | from clickhouse_mysql.config import Config 22 | 23 | 24 | class Main(Daemon): 25 | 26 | config = None 27 | 28 | def __init__(self): 29 | 30 | # append 'converter' folder into sys.path 31 | # this helps to load custom modules 32 | converter_folder = os.path.dirname(os.path.realpath(__file__)) + '/converter' 33 | if converter_folder not in sys.path: 34 | sys.path.insert(0, converter_folder) 35 | 36 | # parse CLI options 37 | self.config = Config() 38 | 39 | # first action after config available - setup requested logging level 40 | logging.basicConfig( 41 | filename=self.config.log_file(), 42 | level=self.config.log_level(), 43 | format='%(asctime)s/%(created)f:%(levelname)s:%(message)s' 44 | ) 45 | 46 | # and call parent 47 | super().__init__(pidfile=self.config.pid_file()) 48 | 49 | # some verbosity 50 | logging.info('Starting') 51 | logging.debug(self.config) 52 | logging.info("sys.path") 53 | logging.info(pprint.pformat(sys.path)) 54 | # mp.set_start_method('forkserver') 55 | 56 | @staticmethod 57 | def install(): 58 | # install service file 59 | src_service_filepath = os.path.abspath( 60 | pkg_resources.resource_filename('clickhouse_mysql', '../clickhouse_mysql.init.d/clickhouse-mysql')) 61 | dst_service_dir = '/etc/init.d/' 62 | dst_service_filepath = dst_service_dir + 'clickhouse-mysql' 63 | try: 64 | print("Install service file ", src_service_filepath, ' to ', dst_service_filepath) 65 | # os.makedirs wants '/' at the end of the folder name 66 | os.makedirs(name=dst_service_dir, exist_ok=True) 67 | shutil.copy2(src_service_filepath, dst_service_filepath) 68 | except Exception as e: 69 | print(e) 70 | print('Looks like you have no permissions to write to ', dst_service_filepath) 71 | sys.exit(1) 72 | 73 | # install config example file 74 | src_service_filepath = os.path.abspath( 75 | pkg_resources.resource_filename('clickhouse_mysql', '../clickhouse_mysql.etc/clickhouse-mysql.conf')) 76 | dst_service_dir = '/etc/clickhouse-mysql/' 77 | dst_service_filepath = dst_service_dir + 'clickhouse-mysql-example.conf' 78 | try: 79 | print("Install config file ", src_service_filepath, ' to ', dst_service_filepath) 80 | # os.makedirs wants '/' at the end of the folder name 81 | os.makedirs(name=dst_service_dir, exist_ok=True) 82 | shutil.copy2(src_service_filepath, dst_service_filepath) 83 | except Exception as e: 84 | print(e) 85 | print('Looks like you have no permissions to write to ', dst_service_filepath) 86 | sys.exit(1) 87 | 88 | print("Ensure clickhouse user exists") 89 | os.system("useradd clickhouse") 90 | 91 | print("Ensure var/run folders available") 92 | os.system("mkdir -p 
/var/log/clickhouse-mysql") 93 | os.system("chown clickhouse /var/log/clickhouse-mysql") 94 | os.system("mkdir -p /var/run/clickhouse-mysql") 95 | os.system("chown clickhouse /var/run/clickhouse-mysql") 96 | 97 | def run(self): 98 | try: 99 | # what action are we going to do 100 | 101 | # first of all, check for auxiliary activities (such as CREATE TABLE SQL statements, etc) 102 | 103 | # run installation process 104 | if self.config.is_install(): 105 | Main.install() 106 | return 107 | 108 | # get JSON-ed SQL statements for review 109 | if self.config.is_create_table_json_template(): 110 | # we are going to prepare table templates in JSON form 111 | print(json.dumps(self.config.table_sql_builder().templates())) 112 | return 113 | 114 | # get SQL statements for review 115 | if self.config.is_create_table_sql() or self.config.is_create_table_sql_template(): 116 | 117 | templates = self.config.table_sql_builder().templates() 118 | 119 | for db in templates: 120 | for table in templates[db]: 121 | if self.config.is_with_create_database(): 122 | print("{};".format(templates[db][table]['create_database'])) 123 | if self.config.is_create_table_sql_template(): 124 | print("{};".format(templates[db][table]['create_table_template'])) 125 | if self.config.is_create_table_sql(): 126 | print("{};".format(templates[db][table]['create_table'])) 127 | 128 | # auxiliary activities completed 129 | # run main activities (such as migrate or pump data) 130 | 131 | # main activities may be prepended with dst tables creation 132 | if self.config.is_dst_create_table(): 133 | migrator = self.config.table_migrator() 134 | migrator.migrate_all_tables(self.config.is_dst_create_table()) 135 | 136 | # run data migration 137 | if self.config.is_migrate_table(): 138 | # we are going to migrate data 139 | migrator = self.config.table_migrator() 140 | migrator.migrate_all_tables_data() 141 | 142 | # pump data to Clickhouse 143 | if self.config.is_pump_data(): 144 | pumper = Pumper( 145 | reader=self.config.reader(), 146 | writer=self.config.writer(), 147 | ) 148 | pumper.run() 149 | 150 | except Exception as ex: 151 | logging.critical(ex) 152 | print('=============') 153 | traceback.print_exc(file=sys.stdout) 154 | print('=============') 155 | print(ex) 156 | sys.exit(1); 157 | 158 | def start(self): 159 | if self.config.is_daemon(): 160 | if not super().start(): 161 | logging.error("Error going background. 
The process already running?") 162 | else: 163 | self.run() 164 | 165 | 166 | if __name__ == '__main__': 167 | main = Main() 168 | main.start() 169 | -------------------------------------------------------------------------------- /clickhouse_mysql/tablemigrator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import os.path 6 | 7 | from MySQLdb.cursors import SSDictCursor,Cursor 8 | from clickhouse_mysql.tableprocessor import TableProcessor 9 | from clickhouse_mysql.tablesqlbuilder import TableSQLBuilder 10 | from clickhouse_mysql.event.event import Event 11 | 12 | 13 | class TableMigrator(TableSQLBuilder): 14 | """ 15 | Migrate data from MySQL to ClickHouse 16 | """ 17 | 18 | chwriter = None 19 | chclient = None 20 | pool_max_rows_num = 100000 21 | 22 | # { 23 | # 'db1': { 24 | # 'table1': "a = 1 and b = 2" 25 | # 'table2': "c = 1 and d = 2" 26 | # }, 27 | # 'db2': { 28 | # 'table1': "e = 2 and f = 3" 29 | # 'table2': "g = 1 and h = 2" 30 | # } 31 | # } 32 | where_clauses = {} 33 | 34 | def __init__( 35 | self, 36 | host=None, 37 | port=None, 38 | user=None, 39 | password=None, 40 | dbs=None, 41 | dst_schema=None, 42 | dst_table=None, 43 | dst_table_prefix=None, 44 | distribute=None, 45 | cluster=None, 46 | tables=None, 47 | tables_prefixes=None, 48 | tables_where_clauses=None, 49 | column_skip=[], 50 | ): 51 | super().__init__( 52 | host=host, 53 | port=port, 54 | user=user, 55 | password=password, 56 | dbs=dbs, 57 | dst_schema=dst_schema, 58 | dst_table=dst_table, 59 | dst_table_prefix=dst_table_prefix, 60 | distribute=distribute, 61 | cluster=cluster, 62 | tables=tables, 63 | tables_prefixes=tables_prefixes, 64 | column_skip=column_skip 65 | ) 66 | self.client.cursorclass = SSDictCursor 67 | 68 | # parse tables where clauses 69 | if not tables_where_clauses: 70 | return 71 | 72 | # tables_where_clauses contains: 73 | # [ 74 | # 'db1.t1=where_filename_1', 75 | # 'db2.t2=where_filename_2' 76 | # ] 77 | 78 | # debug info 79 | logging.info("tables_where_clauses={}".format(tables_where_clauses)) 80 | for table_where in tables_where_clauses: 81 | logging.info("table_where={}".format(table_where)) 82 | 83 | # process WHERE migration clauses 84 | for table_where_clause in tables_where_clauses: 85 | # table_where_clause contains 'db1.t1=where_filename_1' 86 | full_table_name, equals, where_file_name = table_where_clause.partition('=') 87 | 88 | # sanity check 89 | if not full_table_name or not equals or not where_file_name: 90 | continue 91 | if not TableProcessor.is_full_table_name(full_table_name): 92 | continue 93 | 94 | # prepare the following data structure: 95 | # { 96 | # 'db1': { 97 | # 'table1': "a = 1 and b = 2" 98 | # 'table2': "c = 1 and d = 2" 99 | # }, 100 | # 'db2': { 101 | # 'table1': "e = 2 and f = 3" 102 | # 'table2': "g = 1 and h = 2" 103 | # } 104 | # } 105 | db, table = TableProcessor.parse_full_table_name(full_table_name) 106 | if not db in self.where_clauses: 107 | self.where_clauses[db] = {} 108 | 109 | if os.path.isfile(where_file_name): 110 | self.where_clauses[db][table] = open(where_file_name, 'r').read().strip("\n") 111 | else: 112 | self.where_clauses[db][table] = where_file_name 113 | 114 | # debug info 115 | logging.info("migration where clauses") 116 | for db, tables in self.where_clauses.items(): 117 | for table, where in tables.items(): 118 | logging.info("{}.{}.where={}".format(db, table, where)) 119 | 120 | def migrate_all_tables(self, 
with_create_database): 121 | """ 122 | High-level migration function. Loops over tables and migrate each of them 123 | :return: 124 | """ 125 | 126 | # what tables are we going to migrate 127 | dbs = self.dbs_tables_lists() 128 | 129 | # sanity check 130 | if dbs is None: 131 | logging.info("Nothing to migrate") 132 | return None 133 | 134 | # debug info 135 | logging.info("List for migration:") 136 | for db in dbs: 137 | for table in dbs[db]: 138 | logging.info(" {}.{}".format(db, table)) 139 | 140 | # migration templates 141 | templates = self.templates() 142 | 143 | # migrate table-by-table 144 | for db in dbs: 145 | for table in dbs[db]: 146 | logging.info("Start migration {}.{}".format(db, table)) 147 | if with_create_database: 148 | print("Running with chclient {};".format(templates[db][table]['create_database'])) 149 | self.chclient.execute(templates[db][table]['create_database']) 150 | print("Running with chclient {};".format(templates[db][table]['create_table'])) 151 | self.chclient.execute(templates[db][table]['create_table']) 152 | 153 | def migrate_all_tables_data(self): 154 | """ 155 | High-level migration function. Loops over tables and migrate each of them 156 | :return: 157 | """ 158 | 159 | # what tables are we going to migrate 160 | dbs = self.dbs_tables_lists() 161 | 162 | # sanity check 163 | if dbs is None: 164 | logging.info("Nothing to migrate") 165 | return None 166 | 167 | # debug info 168 | logging.info("List for migration:") 169 | for db in dbs: 170 | for table in dbs[db]: 171 | logging.info(" {}.{}".format(db, table)) 172 | 173 | # migrate table-by-table 174 | for db in dbs: 175 | for table in dbs[db]: 176 | logging.info("Start migration {}.{}".format(db, table)) 177 | self.migrate_one_table_data(db=db, table=table) 178 | 179 | def migrate_one_table_data(self, db=None, table=None): 180 | """ 181 | Migrate one table 182 | :param db: db 183 | :param table: table 184 | :return: number of migrated rows 185 | """ 186 | 187 | 188 | # build SQL statement 189 | full_table_name = self.create_full_table_name(db=db, table=table) 190 | sql = "SELECT {0} FROM {1}".format(",".join(self.get_columns(db, full_table_name)), full_table_name) 191 | # in case we have WHERE clause for this db.table - add it to SQL 192 | if db in self.where_clauses and table in self.where_clauses[db]: 193 | sql += " WHERE {}".format(self.where_clauses[db][table]) 194 | 195 | try: 196 | logging.info("migrate_table. 
sql={}".format(sql)) 197 | self.client.cursorclass = SSDictCursor 198 | self.client.connect(db=db) 199 | self.client.cursor.execute(sql) 200 | cnt = 0; 201 | while True: 202 | # fetch multiple rows from MySQL 203 | rows = self.client.cursor.fetchmany(self.pool_max_rows_num) 204 | if not rows: 205 | break 206 | 207 | # insert Event with multiple rows into ClickHouse writer 208 | event = Event() 209 | event.schema = db 210 | event.table = table 211 | event.rows = rows 212 | self.chwriter.insert(event) 213 | self.chwriter.flush() 214 | 215 | cnt += len(rows) 216 | except Exception as ex: 217 | logging.critical("Critical error: {}".format(str(ex))) 218 | raise Exception("Can not migrate table on db={} table={}".format( 219 | db, 220 | table, 221 | )) 222 | 223 | return cnt 224 | 225 | def get_columns(self,db,full_table_name): 226 | self.client.cursorclass = Cursor 227 | self.client.connect(db=db) 228 | self.client.cursor.execute("DESC {}".format(full_table_name)) 229 | fields = [] 230 | for (_field, _type, _null, _key, _default, _extra,) in self.client.cursor: 231 | if self.column_skip.__contains__(_field): 232 | logging.debug("skip column %s",_field) 233 | continue 234 | fields.append('`{}`'.format(_field)) 235 | 236 | return fields 237 | 238 | if __name__ == '__main__': 239 | tb = TableBuilder( 240 | host='127.0.0.1', 241 | user='reader', 242 | password='qwerty', 243 | dbs=['db'], 244 | # tables='datatypes, enum_datatypes, json_datatypes', 245 | tables=['datatypes', 'enum_datatypes', 'json_datatypes'], 246 | ) 247 | templates = tb.templates() 248 | for db in templates: 249 | for table in templates[db]: 250 | print(table, '=', templates[db][table]) 251 | -------------------------------------------------------------------------------- /clickhouse_mysql/tableprocessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | import MySQLdb 6 | from clickhouse_mysql.dbclient.mysqlclient import MySQLClient 7 | 8 | 9 | class TableProcessor(object): 10 | 11 | client = None 12 | 13 | dbs = None 14 | tables = None 15 | tables_prefixes = None 16 | 17 | ACTION_FAIL = 1 18 | ACTION_IGNORE_TABLE = 2 19 | ACTION_INCLUDE_TABLE = 3 20 | 21 | def __init__( 22 | self, 23 | host=None, 24 | port=None, 25 | user=None, 26 | password=None, 27 | dbs=None, 28 | dst_schema=None, 29 | dst_table=None, 30 | dst_table_prefix=None, 31 | distribute=None, 32 | cluster=None, 33 | tables=None, 34 | tables_prefixes=None, 35 | column_skip=[], 36 | ): 37 | """ 38 | :param host: string MySQL host 39 | :param port: int MySQL port 40 | :param user: string MySQL user 41 | :param password: string MySQL password 42 | :param dbs: list of string MySQL databases. May be omitted, in this case tables has to contain full table names, Ex.: db.table1 43 | :param tables: list of string list of table names. Table names may be short or full form 44 | :param tables_prefixes: list of string list of table prefixes. 
May be short or full form 45 | """ 46 | self.dbs = [] if dbs is None else dbs 47 | self.tables = [] if tables is None else tables 48 | self.tables_prefixes = [] if tables_prefixes is None else tables_prefixes 49 | self.client = MySQLClient({ 50 | 'host': host, 51 | 'port': port, 52 | 'user': user, 53 | 'password': password, 54 | }) 55 | self.dst_schema = dst_schema 56 | self.dst_table = dst_table 57 | self.dst_table_prefix = dst_table_prefix 58 | self.cluster = cluster 59 | self.distribute = distribute 60 | self.column_skip = column_skip 61 | 62 | def dbs_tables_lists(self): 63 | """ 64 | Prepare dict of databases and with list of tables for each db 65 | Include all tables into db tables list in case to tables are explicitly specified 66 | It still can be no tables - in case db really has no tables 67 | For convenient iteration over all tables 68 | 69 | :return: 70 | { 71 | 'db1' : ('table1', 'table2', ...), 72 | 'db2' : (all tables listed in 'db2'), 73 | 'db3' : ('table1', ...), 74 | } 75 | """ 76 | 77 | # process explicitly specified tables 78 | # prepare list of tables for each db 79 | res = TableProcessor.group_tables(self.dbs, self.tables) 80 | if res is None: 81 | logging.warning("Can't group tables for explicitly specified db/tables") 82 | return None 83 | else: 84 | logging.debug("{} group tables for explicitly specified db/tables".format(res)) 85 | 86 | # process implicitly specified tables - when db name only specified and we add all tables from this db 87 | # for dbs with no tables list specified - meaning all tables - list tables directly from DB 88 | for db in res: 89 | if not res[db]: 90 | # no tables in db, try to add all tables from DB 91 | tables = self.tables_list(db) 92 | res[db].add(tables) 93 | logging.debug("add {} tables to {} db".format(tables, db)) 94 | 95 | # process tables specified with prefix 96 | # prepare list of prefixes 97 | prefixes = TableProcessor.group_tables(tables=self.tables_prefixes) 98 | logging.debug("{} group tables for prefix specified db/tables".format(prefixes)) 99 | for db, prefixes in prefixes.items(): 100 | for prefix in prefixes: 101 | # match all tables for specified prefix 102 | tables_match = self.tables_match(db, prefix) 103 | if tables_match: 104 | logging.debug("{} tables match prefix {}.{}".format(tables_match, db, prefix)) 105 | # we have tables which match specified prefix 106 | for table in tables_match: 107 | # ensure {'db': set()} 108 | if db not in res: 109 | res[db] = set() 110 | # add table to the set of tables 111 | res[db].add(table) 112 | else: 113 | logging.debug("No tables match prefix {}.{}".format(db, prefix)) 114 | # dict of sets 115 | return res 116 | 117 | def tables_list(self, db): 118 | """ 119 | List tables in specified DB 120 | 121 | :param db: database to list tables in 122 | :return: ['table1', 'table2', ...] 123 | """ 124 | return self.client.tables_list(db) 125 | 126 | def tables_match(self, db, prefix): 127 | """ 128 | List tables which match specified prefix 129 | 130 | :param db: database to match tables in 131 | :param prefix: prefix to match tables 132 | :return: ['table1', 'table2', ...] 
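Ex. (hypothetical names): prefix 'user_' would match tables 'user_profile' and 'user_log'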
133 | """ 134 | res = [] 135 | # list all tables in db 136 | tables = self.tables_list(db) 137 | logging.debug("{} tables {}".format(db, tables)) 138 | for table in tables: 139 | logging.debug("check {}.{} match prefix {}".format(db, table, prefix)) 140 | if table.startswith(prefix): 141 | res.append(table) 142 | logging.debug("{}.{} match prefix {}".format(db, table, prefix)) 143 | return res 144 | 145 | @staticmethod 146 | def create_full_table_name(dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, distribute=None): 147 | """ 148 | Create fully-specified table name as `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` 149 | 150 | :param dst_schema: 151 | :param db: 152 | :param table: 153 | :param distribute: 154 | :return: `schema_all`.`db__table_all` or `schema`.`db__table` or just `db`.`table` 155 | """ 156 | 157 | # target table can be renamed with dst_table 158 | table = dst_table if dst_table is not None else table 159 | 160 | # simple case - do not move table into another db 161 | if dst_schema is None: 162 | return '`{0}`.`{1}`'.format(db, table) if db else '`{0}`'.format(table) 163 | 164 | if distribute: 165 | dst_schema += "_all" 166 | table += "_all" 167 | 168 | return \ 169 | '`{0}`.`{1}`'.format(dst_schema, TableProcessor.create_migrated_table_name(prefix=dst_table_prefix, table=table)) \ 170 | if db else \ 171 | '`{0}`'.format(table) 172 | 173 | @staticmethod 174 | def create_migrated_table_name(prefix=None, table=None): 175 | prefix = prefix if prefix is not None else "" 176 | return prefix + table 177 | 178 | @staticmethod 179 | def create_distributed_table_name(db=None, table=None): 180 | return db + "__" + table + "_all" 181 | 182 | @staticmethod 183 | def is_full_table_name(full_name): 184 | """ 185 | Checks whether it is a fully-specified table name. 186 | 187 | :param full_name: `db`.`name` or `name` or something else 188 | :return: bool 189 | """ 190 | db, dot, name = full_name.partition('.') 191 | if not dot: 192 | # dot not found - treat this as a short name 193 | return False 194 | else: 195 | # dot found - treat it as a long name 196 | return True 197 | 198 | @staticmethod 199 | def parse_full_table_name(full_name): 200 | """ 201 | Extract db and table names from fully-specified table name. 202 | Ex.: extract 'db', 'name' out of `db`.`name` 203 | 204 | :param full_name: `db`.`name` 205 | :return: (db, name, ) or None in case of short name 206 | """ 207 | db, dot, name = full_name.partition('.') 208 | if not dot: 209 | name = db 210 | db = None 211 | 212 | return None if db is None else db.strip('`'), name.strip('`') 213 | 214 | @staticmethod 215 | def group_tables(dbs=[], tables=[], unsettled_tables_action=ACTION_FAIL): 216 | """ 217 | Prepare dict of databases with list of tables for each db 218 | For convenient iteration over all tables 219 | :param dbs [ db1, db2, ... ] 220 | :param tables [ db1.table1, table2, ... ] 221 | :param unsettled_tables_action what to do with a table in case can't decide which db it belongs to. 
222 | See ACTION_* values for full list of possible actions 223 | ACTION_FAIL - return None 224 | ACTION_IGNORE_TABLE - ignore table 225 | ACTION_INCLUDE_TABLE - add table to special db called '_' (just in order to include it somewhere) 226 | :return: 227 | { 228 | 'db1' : ('table1', 'table2', ...), 229 | 'db2' : (), 230 | 'db3' : ('table1', ...), 231 | '_' : ('tableX', 'tableY', ...), 232 | } 233 | OR 234 | None 235 | """ 236 | if dbs is None: 237 | dbs = [] 238 | if tables is None: 239 | tables = [] 240 | 241 | # prepare dict of dbs with empty sets of tables 242 | # { 243 | # 'db1': () 244 | # 'db2': () 245 | # } 246 | res = {db: set() for db in dbs} 247 | 248 | # set of tables with short names 249 | short_name_tables = set() 250 | 251 | # set of tables with full names 252 | full_name_tables = set() 253 | for table in tables: 254 | if TableProcessor.is_full_table_name(table): 255 | full_name_tables.add(table) 256 | else: 257 | short_name_tables.add(table) 258 | 259 | # now - what to do with short-name tables 260 | # if there is only one db name in dbs list we can treat this short tables as belonging to this one table 261 | # but if there is none OR many dbs listed, where those short-named tables should be included? 262 | if len(short_name_tables) > 0: 263 | # we have shot table names 264 | # where to include them? 265 | if len(dbs) == 1: 266 | # ok, let's include these short tables into the single db specified 267 | res[next(iter(res))].update(short_name_tables) 268 | else: 269 | # where to include these short tables? 270 | # Either none or Multiple databases specified 271 | if unsettled_tables_action == TableProcessor.ACTION_IGNORE_TABLE: 272 | # just ignore this tables 273 | pass 274 | elif unsettled_tables_action == TableProcessor.ACTION_FAIL: 275 | # fail process 276 | return None 277 | else: 278 | # include table 279 | # use fake '_' db for this 280 | if '_' not in res: 281 | res['_'] = set() 282 | res['_'].update(short_name_tables) 283 | else: 284 | # we do not have short table names - nothing to bother about 285 | pass 286 | 287 | # now deal with full name tables 288 | for table in full_name_tables: 289 | db, table = TableProcessor.parse_full_table_name(table) 290 | # add table to databases dict 291 | if db not in res: 292 | res[db] = set() 293 | res[db].add(table) 294 | 295 | return res 296 | 297 | @staticmethod 298 | def extract_dbs(dbs=[], tables=[]): 299 | """ 300 | Extract db/schema names from list of dbs and tables - which can contain full names, as db.table - expanding list 301 | provided as dbs[] 302 | :param dbs: list of dbs 303 | :param tables: list of tables with (otional) full names 304 | :return: set of db names 305 | """ 306 | dbs_group = TableProcessor.group_tables(dbs=dbs, 307 | tables=tables, 308 | unsettled_tables_action=TableProcessor.ACTION_IGNORE_TABLE) 309 | 310 | return dbs_group.keys() 311 | 312 | @staticmethod 313 | def extract_tables(tables=[]): 314 | """ 315 | Extract short tabke names from list of (possibly) full names 316 | :param tables: list of (possibly) full names 317 | :return: set of short names 318 | """ 319 | dbs_group = TableProcessor.group_tables(tables=tables, 320 | unsettled_tables_action=TableProcessor.ACTION_INCLUDE_TABLE) 321 | res = set() 322 | for db in dbs_group: 323 | res.update(dbs_group[db]) 324 | 325 | return res 326 | -------------------------------------------------------------------------------- /clickhouse_mysql/reader/mysqlreader.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import time 5 | import logging 6 | import sys 7 | 8 | from pymysqlreplication import BinLogStreamReader 9 | from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent 10 | 11 | from clickhouse_mysql.reader.reader import Reader 12 | from clickhouse_mysql.event.event import Event 13 | from clickhouse_mysql.tableprocessor import TableProcessor 14 | from clickhouse_mysql.util import Util 15 | #from pymysqlreplication.event import QueryEvent, RotateEvent, FormatDescriptionEvent 16 | 17 | 18 | class MySQLReader(Reader): 19 | """Read data from MySQL as replication ls""" 20 | 21 | connection_settings = None 22 | server_id = None 23 | log_file = None 24 | log_pos = None 25 | schemas = None 26 | tables = None 27 | tables_prefixes = None 28 | blocking = None 29 | resume_stream = None 30 | binlog_stream = None 31 | nice_pause = 0 32 | 33 | write_rows_event_num = 0 34 | write_rows_event_each_row_num = 0; 35 | 36 | binlog_position_file = None 37 | 38 | def __init__( 39 | self, 40 | connection_settings, 41 | server_id, 42 | log_file=None, 43 | log_pos=None, 44 | schemas=None, 45 | tables=None, 46 | tables_prefixes=None, 47 | blocking=None, 48 | resume_stream=None, 49 | nice_pause=None, 50 | binlog_position_file=None, 51 | callbacks={}, 52 | ): 53 | super().__init__(callbacks=callbacks) 54 | 55 | self.connection_settings = connection_settings 56 | self.server_id = server_id 57 | self.log_file = log_file 58 | self.log_pos = log_pos 59 | self.schemas = None if not TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) else TableProcessor.extract_dbs(schemas, Util.join_lists(tables, tables_prefixes)) 60 | self.tables = None if tables is None else TableProcessor.extract_tables(tables) 61 | self.tables_prefixes = None if tables_prefixes is None else TableProcessor.extract_tables(tables_prefixes) 62 | self.blocking = blocking 63 | self.resume_stream = resume_stream 64 | self.nice_pause = nice_pause 65 | self.binlog_position_file=binlog_position_file 66 | 67 | logging.info("raw dbs list. len()=%d", 0 if schemas is None else len(schemas)) 68 | if schemas is not None: 69 | for schema in schemas: 70 | logging.info(schema) 71 | logging.info("normalised dbs list. len()=%d", 0 if self.schemas is None else len(self.schemas)) 72 | if self.schemas is not None: 73 | for schema in self.schemas: 74 | logging.info(schema) 75 | 76 | logging.info("raw tables list. len()=%d", 0 if tables is None else len(tables)) 77 | if tables is not None: 78 | for table in tables: 79 | logging.info(table) 80 | logging.info("normalised tables list. len()=%d", 0 if self.tables is None else len(self.tables)) 81 | if self.tables is not None: 82 | for table in self.tables: 83 | logging.info(table) 84 | 85 | logging.info("raw tables-prefixes list. len()=%d", 0 if tables_prefixes is None else len(tables_prefixes)) 86 | if tables_prefixes is not None: 87 | for table in tables_prefixes: 88 | logging.info(table) 89 | logging.info("normalised tables-prefixes list. len()=%d", 0 if self.tables_prefixes is None else len(self.tables_prefixes)) 90 | if self.tables_prefixes is not None: 91 | for table in self.tables_prefixes: 92 | logging.info(table) 93 | 94 | if not isinstance(self.server_id, int): 95 | raise Exception("Please specify server_id of src server as int. 
Ex.: --src-server-id=1") 96 | 97 | self.binlog_stream = BinLogStreamReader( 98 | # MySQL server - data source 99 | connection_settings=self.connection_settings, 100 | server_id=self.server_id, 101 | # we are interested in reading CH-repeatable events only 102 | only_events=[ 103 | # Possible events 104 | #BeginLoadQueryEvent, 105 | DeleteRowsEvent, 106 | #ExecuteLoadQueryEvent, 107 | #FormatDescriptionEvent, 108 | #GtidEvent, 109 | #HeartbeatLogEvent, 110 | #IntvarEvent 111 | #NotImplementedEvent, 112 | #QueryEvent, 113 | #RotateEvent, 114 | #StopEvent, 115 | #TableMapEvent, 116 | UpdateRowsEvent, 117 | WriteRowsEvent, 118 | #XidEvent, 119 | ], 120 | only_schemas=self.schemas, 121 | # in case we have any prefixes - this means we need to listen to all tables within specified schemas 122 | only_tables=self.tables if not self.tables_prefixes else None, 123 | log_file=self.log_file, 124 | log_pos=self.log_pos, 125 | freeze_schema=True, # If true do not support ALTER TABLE. It's faster. 126 | blocking=False, 127 | resume_stream=self.resume_stream, 128 | ) 129 | 130 | def performance_report(self, start, rows_num, rows_num_per_event_min=None, rows_num_per_event_max=None, now=None): 131 | # log performance report 132 | 133 | if now is None: 134 | now = time.time() 135 | 136 | window_size = now - start 137 | if window_size > 0: 138 | rows_per_sec = rows_num / window_size 139 | logging.info( 140 | 'PERF - %f rows/sec, min(rows/event)=%d max(rows/event)=%d for last %d rows %f sec', 141 | rows_per_sec, 142 | rows_num_per_event_min if rows_num_per_event_min is not None else -1, 143 | rows_num_per_event_max if rows_num_per_event_max is not None else -1, 144 | rows_num, 145 | window_size, 146 | ) 147 | else: 148 | logging.info("PERF - can not calc performance for time size=0") 149 | 150 | def is_table_listened(self, table): 151 | """ 152 | Check whether table name in either directly listed in tables or starts with prefix listed in tables_prefixes 153 | :param table: table name 154 | :return: bool is table listened 155 | """ 156 | 157 | # check direct table name match 158 | if self.tables: 159 | if table in self.tables: 160 | return True 161 | 162 | # check prefixes 163 | if self.tables_prefixes: 164 | for prefix in self.tables_prefixes: 165 | if table.startswith(prefix): 166 | # table name starts with prefix list 167 | return True 168 | 169 | return False 170 | 171 | first_rows_passed = [] 172 | start_timestamp = 0 173 | start = 0 174 | rows_num = 0 175 | rows_num_since_interim_performance_report = 0 176 | rows_num_per_event_min = None 177 | rows_num_per_event_max = None 178 | 179 | def init_read_events(self): 180 | self.start_timestamp = int(time.time()) 181 | self.first_rows_passed = [] 182 | 183 | def init_fetch_loop(self): 184 | self.start = time.time() 185 | 186 | def stat_init_fetch_loop(self): 187 | self.rows_num = 0 188 | self.rows_num_since_interim_performance_report = 0 189 | self.rows_num_per_event_min = None 190 | self.rows_num_per_event_max = None 191 | 192 | def stat_close_fetch_loop(self): 193 | if self.rows_num > 0: 194 | # we have some rows processed 195 | now = time.time() 196 | if now > self.start + 60: 197 | # and processing was long enough 198 | self.performance_report(self.start, self.rows_num, now) 199 | 200 | def stat_write_rows_event_calc_rows_num_min_max(self, rows_num_per_event): 201 | # populate min value 202 | if (self.rows_num_per_event_min is None) or (rows_num_per_event < self.rows_num_per_event_min): 203 | self.rows_num_per_event_min = rows_num_per_event 204 | 205 | # 
populate max value 206 | if (self.rows_num_per_event_max is None) or (rows_num_per_event > self.rows_num_per_event_max): 207 | self.rows_num_per_event_max = rows_num_per_event 208 | 209 | def stat_write_rows_event_all_rows(self, mysql_event): 210 | self.write_rows_event_num += 1 211 | self.rows_num += len(mysql_event.rows) 212 | self.rows_num_since_interim_performance_report += len(mysql_event.rows) 213 | logging.debug('WriteRowsEvent #%d rows: %d', self.write_rows_event_num, len(mysql_event.rows)) 214 | 215 | def stat_write_rows_event_each_row(self): 216 | self.write_rows_event_each_row_num += 1 217 | logging.debug('WriteRowsEvent.EachRow #%d', self.write_rows_event_each_row_num) 218 | 219 | def stat_write_rows_event_each_row_for_each_row(self): 220 | self.rows_num += 1 221 | self.rows_num_since_interim_performance_report += 1 222 | 223 | def stat_write_rows_event_finalyse(self): 224 | if self.rows_num_since_interim_performance_report >= 100000: 225 | # speed report each N rows 226 | self.performance_report( 227 | start=self.start, 228 | rows_num=self.rows_num, 229 | rows_num_per_event_min=self.rows_num_per_event_min, 230 | rows_num_per_event_max=self.rows_num_per_event_max, 231 | ) 232 | self.rows_num_since_interim_performance_report = 0 233 | self.rows_num_per_event_min = None 234 | self.rows_num_per_event_max = None 235 | 236 | def process_first_event(self, event): 237 | if "{}.{}".format(event.schema, event.table) not in self.first_rows_passed: 238 | Util.log_row(event.first_row(), "first row in replication {}.{}".format(event.schema, event.table)) 239 | self.first_rows_passed.append("{}.{}".format(event.schema, event.table)) 240 | logging.info(self.first_rows_passed) 241 | 242 | def process_write_rows_event(self, mysql_event): 243 | """ 244 | Process specific MySQL event - WriteRowsEvent 245 | :param mysql_event: WriteRowsEvent instance 246 | :return: 247 | """ 248 | if self.tables_prefixes: 249 | # we have prefixes specified 250 | # need to find whether current event is produced by table in 'looking-into-tables' list 251 | if not self.is_table_listened(mysql_event.table): 252 | # this table is not listened 253 | # processing is over - just skip event 254 | return 255 | 256 | # statistics 257 | self.stat_write_rows_event_calc_rows_num_min_max(rows_num_per_event=len(mysql_event.rows)) 258 | 259 | if self.subscribers('WriteRowsEvent'): 260 | # dispatch event to subscribers 261 | 262 | # statistics 263 | self.stat_write_rows_event_all_rows(mysql_event=mysql_event) 264 | 265 | # dispatch Event 266 | event = Event() 267 | event.schema = mysql_event.schema 268 | event.table = mysql_event.table 269 | event.pymysqlreplication_event = mysql_event 270 | 271 | self.process_first_event(event=event) 272 | self.notify('WriteRowsEvent', event=event) 273 | 274 | if self.subscribers('WriteRowsEvent.EachRow'): 275 | # dispatch event to subscribers 276 | 277 | # statistics 278 | self.stat_write_rows_event_each_row() 279 | 280 | # dispatch Event per each row 281 | for row in mysql_event.rows: 282 | # statistics 283 | self.stat_write_rows_event_each_row_for_each_row() 284 | 285 | # dispatch Event 286 | event = Event() 287 | event.schema = mysql_event.schema 288 | event.table = mysql_event.table 289 | event.row = row['values'] 290 | 291 | self.process_first_event(event=event) 292 | self.notify('WriteRowsEvent.EachRow', event=event) 293 | 294 | self.stat_write_rows_event_finalyse() 295 | 296 | def process_update_rows_event(self, mysql_event): 297 | logging.info("Skip update rows") 298 | 299 | def 
process_delete_rows_event(self, mysql_event): 300 | logging.info("Skip delete rows") 301 | 302 | def process_binlog_position(self, file, pos): 303 | if self.binlog_position_file: 304 | with open(self.binlog_position_file, "w") as f: 305 | f.write("{}:{}".format(file, pos)) 306 | logging.debug("Next event binlog pos: {}.{}".format(file, pos)) 307 | 308 | def read(self): 309 | # main function - read data from source 310 | 311 | self.init_read_events() 312 | 313 | # fetch events 314 | try: 315 | while True: 316 | logging.debug('Check events in binlog stream') 317 | 318 | self.init_fetch_loop() 319 | 320 | # statistics 321 | self.stat_init_fetch_loop() 322 | 323 | try: 324 | logging.debug('Pre-start binlog position: ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos) if self.binlog_stream.log_pos is not None else "undef") 325 | 326 | # fetch available events from MySQL 327 | for mysql_event in self.binlog_stream: 328 | # new event has come 329 | # check what to do with it 330 | 331 | logging.debug('Got Event ' + self.binlog_stream.log_file + ":" + str(self.binlog_stream.log_pos)) 332 | 333 | # process event based on its type 334 | if isinstance(mysql_event, WriteRowsEvent): 335 | self.process_write_rows_event(mysql_event) 336 | elif isinstance(mysql_event, DeleteRowsEvent): 337 | self.process_delete_rows_event(mysql_event) 338 | elif isinstance(mysql_event, UpdateRowsEvent): 339 | self.process_update_rows_event(mysql_event) 340 | else: 341 | # skip other unhandled events 342 | pass 343 | 344 | # after event processed, we need to handle current binlog position 345 | self.process_binlog_position(self.binlog_stream.log_file, self.binlog_stream.log_pos) 346 | 347 | except KeyboardInterrupt: 348 | # pass SIGINT further 349 | logging.info("SIGINT received. Pass it further.") 350 | raise 351 | except Exception as ex: 352 | if self.blocking: 353 | # we'd like to continue waiting for data 354 | # report and continue cycle 355 | logging.warning("Got an exception, skip it in blocking mode") 356 | logging.warning(ex) 357 | else: 358 | # do not continue, report error and exit 359 | logging.critical("Got an exception, abort it in non-blocking mode") 360 | logging.critical(ex) 361 | sys.exit(1) 362 | 363 | # all events fetched (or none of them available) 364 | 365 | # statistics 366 | self.stat_close_fetch_loop() 367 | 368 | if not self.blocking: 369 | # do not wait for more data - all done 370 | break # while True 371 | 372 | # blocking - wait for more data 373 | if self.nice_pause > 0: 374 | time.sleep(self.nice_pause) 375 | 376 | self.notify('ReaderIdleEvent') 377 | 378 | except KeyboardInterrupt: 379 | logging.info("SIGINT received. 
Time to exit.") 380 | except Exception as ex: 381 | logging.warning("Got an exception, handle it") 382 | logging.warning(ex) 383 | 384 | try: 385 | self.binlog_stream.close() 386 | except Exception as ex: 387 | logging.warning("Unable to close binlog stream correctly") 388 | logging.warning(ex) 389 | 390 | end_timestamp = int(time.time()) 391 | 392 | logging.info('start %d', self.start_timestamp) 393 | logging.info('end %d', end_timestamp) 394 | logging.info('len %d', end_timestamp - self.start_timestamp) 395 | 396 | if __name__ == '__main__': 397 | connection_settings = { 398 | 'host': '127.0.0.1', 399 | 'port': 3306, 400 | 'user': 'reader', 401 | 'passwd': 'qwerty', 402 | } 403 | server_id = 1 404 | 405 | reader = Reader( 406 | connection_settings=connection_settings, 407 | server_id=server_id, 408 | ) 409 | 410 | reader.read() 411 | -------------------------------------------------------------------------------- /clickhouse_mysql/tablesqlbuilder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from clickhouse_mysql.tableprocessor import TableProcessor 5 | from MySQLdb.cursors import Cursor 6 | import logging 7 | 8 | 9 | class TableSQLBuilder(TableProcessor): 10 | """ 11 | Build ClickHouse table(s) 12 | """ 13 | 14 | def templates(self): 15 | """ 16 | Create ClickHouse tables templates for specified MySQL tables. 17 | In case no tables specified all tables from specified MySQL db are templated 18 | :return: dict of ClickHouse's CREATE TABLE () templates 19 | { 20 | 'db1': { 21 | 'table-db1-1': CREATE TABLE table1 statement template, 22 | 'table-db1-2': CREATE TABLE table2 statement template, 23 | }, 24 | 'db2': { 25 | 'table-db2-1': CREATE TABLE table1 statement template, 26 | 'table-db2-2': CREATE TABLE table2 statement template, 27 | } 28 | } 29 | """ 30 | dbs = self.dbs_tables_lists() 31 | if dbs is None: 32 | return None 33 | 34 | templates = {} 35 | for db in dbs: 36 | templates[db] = {} 37 | for table in dbs[db]: 38 | templates[db][table] = self.create_table_description( 39 | cluster=self.cluster, 40 | dst_schema=self.dst_schema, 41 | dst_table=self.dst_table, 42 | dst_table_prefix=self.dst_table_prefix, 43 | db=db, 44 | table=table) 45 | 46 | return templates 47 | 48 | def create_table_description(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None): 49 | """ 50 | High-level function. 
51 | Produce either text ClickHouse's table SQL CREATE TABLE() template or JSON ClikcHouse's table description 52 | :param db: string MySQL db name 53 | :param table: string MySQL table name 54 | :param json: bool what should return - json description or ClickHouse's SQL template 55 | :return: dict{"template":SQL, "fields": {}} or string SQL 56 | """ 57 | columns_description = self.create_table_columns_description(db=db, table=table) 58 | return { 59 | "create_table_template": self.create_table_sql_template(cluster=cluster, 60 | dst_schema=dst_schema, 61 | dst_table=dst_table, 62 | dst_table_prefix=dst_table_prefix, 63 | db=db, 64 | table=table, 65 | columns_description=columns_description), 66 | "create_table": self.create_table_sql(cluster=cluster, 67 | dst_schema=dst_schema, 68 | dst_table=dst_table, 69 | dst_table_prefix=dst_table_prefix, 70 | db=db, 71 | table=table, 72 | columns_description=columns_description), 73 | "create_database": self.create_database_sql(dst_schema=dst_schema, db=db), 74 | "fields": columns_description, 75 | } 76 | 77 | def create_table_sql_template(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): 78 | """ 79 | Produce table template for ClickHouse 80 | CREATE TABLE schema.table ( 81 | ... 82 | columns specification 83 | ... 84 | ) ENGINE = MergeTree(_, (), 8192) 85 | for specified MySQL's table 86 | 87 | :param db: string - name of the DB in MySQL 88 | :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template 89 | :return: string - almost-ready-to-use ClickHouse CREATE TABLE statement 90 | """ 91 | 92 | ch_columns = [] 93 | for column_description in columns_description: 94 | ch_columns.append('`{}` {}'.format(column_description['field'], column_description['clickhouse_type_nullable'])) 95 | 96 | sql = """CREATE TABLE IF NOT EXISTS {} {} ( 97 | {} 98 | ) 99 | ENGINE = MergeTree(, (), 8192) 100 | """.format( 101 | self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table), 102 | "on cluster {}".format(cluster) if cluster is not None else "", 103 | ",\n ".join(ch_columns), 104 | ) 105 | return sql 106 | 107 | def create_table_sql(self, cluster=None, dst_schema=None, dst_table=None, dst_table_prefix=None, db=None, table=None, columns_description=None): 108 | """ 109 | Produce table template for ClickHouse 110 | CREATE TABLE( 111 | ... 112 | columns specification 113 | ... 114 | ) ENGINE = MergeTree(PRIMARY DATE FIELD, (COMMA SEPARATED INDEX FIELDS LIST), 8192) 115 | for specified MySQL's table 116 | 117 | :param db: string - name of the DB in MySQL 118 | :param table: string - name of the table in MySQL which will be used as a base for CH's CREATE TABLE template 119 | :return: string - ready-to-use ClickHouse CREATE TABLE statement 120 | """ 121 | 122 | ch_columns = [] 123 | 124 | primary_date_field = self.fetch_primary_date_field(columns_description) 125 | primary_key_fields = self.fetch_primary_key_fields(columns_description) 126 | 127 | # if primary_date_field is None: 128 | # # No primary date field found. Make one 129 | # primary_date_field = 'primary_date_field' 130 | # ch_columns.append('`primary_date_field` Date default today()') 131 | 132 | if primary_key_fields is None: 133 | # No primary key fields found. 
Make PK from primary date field 134 | primary_key_fields = [] 135 | primary_key_fields.append(primary_date_field) 136 | 137 | for column_description in columns_description: 138 | field = column_description['field'] 139 | # primary date and primary key fields can't be nullable 140 | ch_type = column_description['clickhouse_type'] if (field == primary_date_field) or (field in primary_key_fields) else column_description['clickhouse_type_nullable'] 141 | ch_columns.append('`{}` {}'.format(field, ch_type)) 142 | 143 | sql = """CREATE TABLE IF NOT EXISTS {} {} ( 144 | {} 145 | ) 146 | {} 147 | """.format( 148 | self.create_full_table_name(dst_schema=dst_schema, dst_table=dst_table, dst_table_prefix=dst_table_prefix, db=db, table=table, distribute=self.distribute), 149 | "on cluster {}".format(cluster) if not self.distribute and cluster is not None else "", 150 | ",\n ".join(ch_columns), 151 | self.create_table_engine(self.cluster, 152 | self.dst_schema, 153 | self.create_migrated_table_name(prefix=dst_table_prefix, table=dst_table) if dst_table is not None else self.create_migrated_table_name(prefix=dst_table_prefix, table=table), 154 | primary_date_field, 155 | ",".join(primary_key_fields), 156 | self.distribute) 157 | ) 158 | return sql 159 | 160 | def create_database_sql(self, dst_schema=None, db=None): 161 | """ 162 | Produce create database statement for ClickHouse 163 | CREATE DATABASE 164 | for specified MySQL's db 165 | 166 | :param db: string - name of the DB 167 | :return: string - ready-to-use ClickHouse CREATE DATABASE statement 168 | """ 169 | sql = "CREATE DATABASE IF NOT EXISTS `{}`".format(dst_schema if dst_schema is not None else db) 170 | return sql 171 | 172 | def create_table_columns_description(self, db=None, table=None, ): 173 | # list of table columns specifications 174 | # [ 175 | # { 176 | # 'field': 'f1', 177 | # 'mysql_type': 'int', 178 | # 'clickhouse_type': 'UInt32' 179 | # 'nullable': True, 180 | # 'key': 'PRI', 181 | # 'default': 'CURRENT TIMESTAMP', 182 | # 'extra': 'on update CURRENT_TIMESTAMP', 183 | # }, 184 | # {...}, 185 | # ] 186 | columns_description = [] 187 | 188 | # issue 'DESCRIBE table' statement 189 | self.client.cursorclass = Cursor 190 | self.client.connect(db=db) 191 | self.client.cursor.execute("DESC {}".format(self.create_full_table_name(db=db, table=table))) 192 | for (_field, _type, _null, _key, _default, _extra,) in self.client.cursor: 193 | # Field | Type | Null | Key | Default | Extra 194 | 195 | # build ready-to-sql column specification Ex.: 196 | # `integer_1` Nullable(Int32) 197 | # `u_integer_1` Nullable(UInt32) 198 | if self.column_skip.__contains__(_field): 199 | logging.debug("table sql builder skip column %s",_field) 200 | continue 201 | columns_description.append({ 202 | 'field': _field, 203 | 'mysql_type': _type, 204 | 'clickhouse_type': self.map_type(mysql_type=_type), 205 | 'clickhouse_type_nullable': self.map_type_nullable(mysql_type=_type, nullable=self.is_field_nullable(_null)), 206 | 'nullable': self.is_field_nullable(_null), 207 | 'key': _key, 208 | 'default': _default, 209 | 'extra': _extra, 210 | }) 211 | 212 | return columns_description 213 | 214 | def fetch_primary_date_field(self, columns_description): 215 | """ 216 | Fetch first Date column name 217 | :param columns_description: 218 | :return: string|None 219 | """ 220 | for column_description in columns_description: 221 | if column_description['clickhouse_type'] == 'Date': 222 | return column_description['field'] 223 | if column_description['clickhouse_type'] == 
'DateTime': 224 | return column_description['field'] 225 | 226 | return None 227 | 228 | def fetch_primary_key_fields(self, columns_description): 229 | """ 230 | Fetch list of primary keys columns names 231 | :param columns_description: 232 | :return: list | None 233 | """ 234 | primary_key_fields = [] 235 | for column_description in columns_description: 236 | if self.is_field_primary_key(column_description['key']): 237 | primary_key_fields.append(column_description['field']) 238 | 239 | return None if not primary_key_fields else primary_key_fields 240 | 241 | def is_field_nullable(self, field): 242 | """ 243 | Check whether `nullable` field description value can be interpreted as True. 244 | Understand MySQL's "Yes" for nullable or just bool value 245 | :param field: bool, string 246 | :return: bool 247 | """ 248 | if isinstance(field, bool): 249 | # for bool - simple statement 250 | return field 251 | elif isinstance(field, str): 252 | # also accept case-insensitive string 'yes' 253 | return True if field.upper() == "YES" else False 254 | 255 | def is_field_primary_key(self, field): 256 | """ 257 | Check whether `key` field description value can be interpreted as True 258 | :param field: 259 | :return: 260 | """ 261 | return bool(field) 262 | 263 | def map_type(self, mysql_type): 264 | """ 265 | Map MySQL type (as a string from DESC table statement) to ClickHouse type (as string) 266 | :param mysql_type: string MySQL type (from DESC statement). Ex.: 'INT(10) UNSIGNED', 'BOOLEAN' 267 | :return: string ClickHouse type specification directly usable in CREATE TABLE statement. Ex.: 268 | Int32 269 | UInt32 270 | """ 271 | 272 | # deal with UPPER CASE strings for simplicity 273 | mysql_type = mysql_type.upper() 274 | 275 | # Numeric Types 276 | if mysql_type.startswith('BIT'): 277 | ch_type = 'String' 278 | elif mysql_type.startswith('TINYINT'): 279 | ch_type = 'UInt8' if mysql_type.endswith('UNSIGNED') else 'Int8' 280 | elif mysql_type.startswith('BOOLEAN') or mysql_type.startswith('BOOL'): 281 | ch_type = 'UInt8' 282 | elif mysql_type.startswith('SMALLINT'): 283 | ch_type = 'UInt16' if mysql_type.endswith('UNSIGNED') else 'Int16' 284 | elif mysql_type.startswith('MEDIUMINT'): 285 | ch_type = 'UInt32' if mysql_type.endswith('UNSIGNED') else 'Int32' 286 | elif mysql_type.startswith('INTEGER') or mysql_type.startswith('INT'): 287 | ch_type = 'UInt32' if mysql_type.endswith('UNSIGNED') else 'Int32' 288 | elif mysql_type.startswith('BIGINT'): 289 | ch_type = 'UInt64' if mysql_type.endswith('UNSIGNED') else 'Int64' 290 | elif mysql_type.startswith('SERIAL'): 291 | ch_type = 'UInt64' 292 | elif mysql_type.startswith('DECIMAL') or mysql_type.startswith('DEC') or mysql_type.startswith('FIXED') or mysql_type.startswith('NUMERIC'): 293 | ch_type = 'String' 294 | elif mysql_type.startswith('FLOAT'): 295 | ch_type = 'Float32' 296 | elif mysql_type.startswith('DOUBLE') or mysql_type.startswith('REAL'): 297 | ch_type = 'Float64' 298 | 299 | # Date and Time Types 300 | elif mysql_type.startswith('DATETIME'): 301 | ch_type = 'DateTime' 302 | elif mysql_type.startswith('DATE'): 303 | ch_type = 'Date' 304 | elif mysql_type.startswith('TIMESTAMP'): 305 | ch_type = 'DateTime' 306 | elif mysql_type.startswith('TIME'): 307 | ch_type = 'String' 308 | elif mysql_type.startswith('YEAR'): 309 | ch_type = 'UInt16' 310 | 311 | # String Types 312 | elif mysql_type.startswith('CHAR'): 313 | ch_type = 'String' 314 | elif mysql_type.startswith('VARCHAR'): 315 | ch_type = 'String' 316 | elif mysql_type.startswith('BINARY'): 
317 | ch_type = 'String' 318 | elif mysql_type.startswith('VARBINARY'): 319 | ch_type = 'String' 320 | elif mysql_type.startswith('TINYBLOB'): 321 | ch_type = 'String' 322 | elif mysql_type.startswith('TINYTEXT'): 323 | ch_type = 'String' 324 | elif mysql_type.startswith('BLOB'): 325 | ch_type = 'String' 326 | elif mysql_type.startswith('TEXT'): 327 | ch_type = 'String' 328 | elif mysql_type.startswith('MEDIUMBLOB'): 329 | ch_type = 'String' 330 | elif mysql_type.startswith('MEDIUMTEXT'): 331 | ch_type = 'String' 332 | elif mysql_type.startswith('LONGBLOB'): 333 | ch_type = 'String' 334 | elif mysql_type.startswith('LONGTEXT'): 335 | ch_type = 'String' 336 | 337 | # Set Types 338 | elif mysql_type.startswith('ENUM'): 339 | ch_type = 'Enum16' 340 | elif mysql_type.startswith('SET'): 341 | ch_type = 'Array(Int8)' 342 | 343 | # Custom Types 344 | elif mysql_type.startswith('JSON'): 345 | ch_type = 'String' 346 | 347 | else: 348 | ch_type = 'UNKNOWN' 349 | 350 | return ch_type 351 | 352 | def map_type_nullable(self, mysql_type, nullable=False): 353 | """ 354 | Map MySQL type (as a string from DESC table statement) to ClickHouse type (as string) 355 | :param mysql_type: string MySQL type (from DESC statement). Ex.: 'INT(10) UNSIGNED', 'BOOLEAN' 356 | :param nullable: bool is this field nullable 357 | :return: string ClickHouse type specification directly usable in CREATE TABLE statement. Ex.: 358 | Nullable(Int32) 359 | Nullable(UInt32) 360 | """ 361 | ch_type = self.map_type(mysql_type) 362 | 363 | # Deal with NULLs 364 | if nullable: 365 | ch_type = 'Nullable(' + ch_type + ')' 366 | 367 | return ch_type 368 | 369 | def create_table_engine(self, 370 | cluster=None, 371 | dst_schema=None, 372 | dst_table=None, 373 | primary_date_field=None, 374 | primary_key_fields=None, 375 | distribute=None): 376 | """ 377 | :param cluster: 378 | :param dst_schema: 379 | :param dst_table: 380 | :param primary_date_field: 381 | :param primary_key_fields: 382 | :param distribute: 383 | :return: 384 | """ 385 | if distribute: 386 | return "ENGINE = Distributed({}, '{}', '{}', rand())".format( 387 | cluster, 388 | dst_schema, 389 | dst_table 390 | ) 391 | else: 392 | engine = "ENGINE = ReplacingMergeTree() " 393 | if primary_date_field is not None: 394 | engine += "PARTITION BY toYYYYMM({}) ".format(primary_date_field) 395 | if primary_key_fields is not None: 396 | engine += "ORDER BY ({})".format(primary_key_fields) 397 | return engine 398 | 399 | if __name__ == '__main__': 400 | tb = TableSQLBuilder( 401 | host='127.0.0.1', 402 | user='reader', 403 | password='qwerty', 404 | dbs=['db'], 405 | # tables='datatypes, enum_datatypes, json_datatypes', 406 | tables=['datatypes', 'enum_datatypes', 'json_datatypes'], 407 | ) 408 | templates = tb.templates() 409 | for db in templates: 410 | for table in templates[db]: 411 | print(table, '=', templates[db][table]) 412 | -------------------------------------------------------------------------------- /clickhouse_mysql/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from clickhouse_mysql.reader.mysqlreader import MySQLReader 5 | from clickhouse_mysql.reader.csvreader import CSVReader 6 | 7 | from clickhouse_mysql.writer.chwriter import CHWriter 8 | from clickhouse_mysql.writer.csvwriter import CSVWriter 9 | from clickhouse_mysql.writer.chcsvwriter import CHCSVWriter 10 | from clickhouse_mysql.writer.poolwriter import PoolWriter 11 | from 
clickhouse_mysql.writer.processwriter import ProcessWriter 12 | from clickhouse_mysql.objectbuilder import ObjectBuilder 13 | 14 | from clickhouse_mysql.converter.csvwriteconverter import CSVWriteConverter 15 | from clickhouse_mysql.converter.chwriteconverter import CHWriteConverter 16 | from clickhouse_mysql.tablesqlbuilder import TableSQLBuilder 17 | from clickhouse_mysql.tablemigrator import TableMigrator 18 | from clickhouse_mysql.clioptions import Options, AggregatedOptions 19 | 20 | from clickhouse_mysql.dbclient.chclient import CHClient 21 | 22 | from clickhouse_mysql.util import Util 23 | 24 | import pprint 25 | 26 | CONVERTER_CSV = 1 27 | CONVERTER_CH = 2 28 | 29 | 30 | class Config(object): 31 | 32 | config = None 33 | options = None 34 | 35 | def __init__(self): 36 | 37 | # get aggregated options from all sources (config, cli, env) 38 | self.options = AggregatedOptions() 39 | 40 | log_file = None 41 | log_pos = None 42 | if self.options['binlog_position_file'] and self.options.get_bool('src_resume'): 43 | try: 44 | with open(self.options['binlog_position_file'], 'r') as f: 45 | position = f.read() 46 | log_file, log_pos = position.split(':') 47 | log_pos = int(log_pos) 48 | print("binlog position from file {} is {}:{}".format( 49 | self.options['binlog_position_file'], 50 | log_file, 51 | log_pos 52 | )) 53 | except: 54 | log_file = None 55 | log_pos = None 56 | print("can't read binlog position from file {}".format( 57 | self.options['binlog_position_file'], 58 | )) 59 | # build application config out of aggregated options 60 | self.config = { 61 | # 62 | # 63 | # 64 | 'app': { 65 | 'config_file': self.options['config_file'], 66 | 'log_file': self.options['log_file'], 67 | 'log_level': Options.log_level_from_string(self.options['log_level']), 68 | 'dry': self.options.get_bool('dry'), 69 | 'daemon': self.options.get_bool('daemon'), 70 | 'create_table_sql_template': self.options.get_bool('create_table_sql_template'), 71 | 'create_table_sql': self.options.get_bool('create_table_sql'), 72 | 'with_create_database': self.options.get_bool('with_create_database'), 73 | 'create_table_json_template': self.options.get_bool('create_table_json_template'), 74 | 'migrate_table': self.options.get_bool('migrate_table'), 75 | 'pid_file': self.options['pid_file'], 76 | 'binlog_position_file': self.options['binlog_position_file'], 77 | 'mempool': self.options.get_bool('mempool') or self.options.get_bool('csvpool'), # csvpool assumes mempool to be enabled 78 | 'mempool_max_events_num': self.options.get_int('mempool_max_events_num'), 79 | 'mempool_max_rows_num': self.options.get_int('mempool_max_rows_num'), 80 | 'mempool_max_flush_interval': self.options.get_int('mempool_max_flush_interval'), 81 | 'csvpool': self.options.get_bool('csvpool'), 82 | 'pump_data': self.options.get_bool('pump_data'), 83 | 'install': self.options.get_bool('install'), 84 | }, 85 | 86 | # 87 | # 88 | # 89 | 'converter': { 90 | 'clickhouse': { 91 | 'converter_file': self.options['ch_converter_file'], 92 | 'converter_class': self.options['ch_converter_class'], 93 | 'column_skip': self.options['column_skip'], 94 | }, 95 | 'csv': { 96 | 'column_default_value': self.options['column_default_value'], 97 | 'column_skip': self.options['column_skip'], 98 | }, 99 | }, 100 | 101 | # 102 | # 103 | # 104 | 'table_builder': { 105 | 'mysql': { 106 | 'host': self.options['src_host'], 107 | 'port': self.options.get_int('src_port'), 108 | 'user': self.options['src_user'], 109 | 'password': self.options['src_password'], 110 | 'schemas': 
self.options.get_list('src_schemas'), 111 | 'tables': self.options.get_list('src_tables'), 112 | 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 113 | 'column_skip': self.options['column_skip'] 114 | }, 115 | 'clickhouse': { 116 | 'connection_settings': { 117 | 'host': self.options['dst_host'], 118 | 'port': self.options.get_int('dst_port'), 119 | 'user': self.options['dst_user'], 120 | 'password': self.options['dst_password'], 121 | }, 122 | 'dst_schema': self.options['dst_schema'], 123 | 'dst_distribute': self.options['dst_distribute'], 124 | 'dst_cluster': self.options['dst_cluster'], 125 | 'dst_table': self.options['dst_table'], 126 | 'dst_table_prefix': self.options['dst_table_prefix'], 127 | 'dst_create_table': self.options.get_bool('dst_create_table'), 128 | }, 129 | }, 130 | 131 | # 132 | # 133 | # 134 | 'table_migrator': { 135 | 'mysql': { 136 | 'host': self.options['src_host'], 137 | 'port': self.options.get_int('src_port'), 138 | 'user': self.options['src_user'], 139 | 'password': self.options['src_password'], 140 | 'schemas': self.options.get_list('src_schemas'), 141 | 'tables': self.options.get_list('src_tables'), 142 | 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 143 | 'tables_where_clauses': self.options.get_list('src_tables_where_clauses'), 144 | 'column_skip': self.options['column_skip'] 145 | }, 146 | 'clickhouse': { 147 | 'connection_settings': { 148 | 'host': self.options['dst_host'], 149 | 'port': self.options.get_int('dst_port'), 150 | 'user': self.options['dst_user'], 151 | 'password': self.options['dst_password'], 152 | }, 153 | 'dst_schema': self.options['dst_schema'], 154 | 'dst_distribute': self.options['dst_distribute'], 155 | 'dst_cluster': self.options['dst_cluster'], 156 | 'dst_table': self.options['dst_table'], 157 | 'dst_table_prefix': self.options['dst_table_prefix'], 158 | 'dst_create_table': self.options.get_bool('dst_create_table'), 159 | }, 160 | }, 161 | 162 | # 163 | # 164 | # 165 | 'reader': { 166 | 'mysql': { 167 | 'connection_settings': { 168 | 'host': self.options['src_host'], 169 | 'port': self.options.get_int('src_port'), 170 | 'user': self.options['src_user'], 171 | 'password': self.options['src_password'], 172 | }, 173 | 'server_id': self.options.get_int('src_server_id'), 174 | 'schemas': self.options.get_list('src_schemas'), 175 | 'tables': self.options.get_list('src_tables'), 176 | 'tables_prefixes': self.options.get_list('src_tables_prefixes'), 177 | 'blocking': self.options.get_bool('src_wait'), 178 | 'resume_stream': self.options.get_bool('src_resume'), 179 | 'nice_pause': 0 if self.options.get_int('nice_pause') is None else self.options.get_int('nice_pause'), 180 | 'log_file': self.options['src_binlog_file'] if self.options['src_binlog_file'] else log_file, 181 | 'log_pos': self.options.get_int('src_binlog_position') if self.options.get_int('src_binlog_position') else log_pos, 182 | }, 183 | 'file': { 184 | 'csv_file_path': self.options['src_file'], 185 | 'nice_pause': 0 if self.options.get_int('nice_pause') is None else self.options.get_int('nice_pause'), 186 | }, 187 | }, 188 | 189 | # 190 | # 191 | # 192 | 'writer': { 193 | 'clickhouse': { 194 | 'connection_settings': { 195 | 'host': self.options['dst_host'], 196 | 'port': self.options.get_int('dst_port'), 197 | 'user': self.options['dst_user'], 198 | 'password': self.options['dst_password'], 199 | }, 200 | 'dst_schema': self.options['dst_schema'], 201 | 'dst_distribute': self.options['dst_distribute'], 202 | 'dst_table': 
self.options['dst_table'], 203 | 'dst_table_prefix': self.options['dst_table_prefix'], 204 | }, 205 | 'file': { 206 | 'csv_file_path': self.options['dst_file'], 207 | 'csv_file_path_prefix': self.options['csvpool_file_path_prefix'], 208 | 'csv_file_path_suffix_parts': [], 209 | 'csv_keep_file': self.options['csvpool_keep_files'], 210 | 'dst_schema': self.options['dst_schema'], 211 | 'dst_distribute': self.options['dst_distribute'], 212 | 'dst_table': self.options['dst_table'], 213 | 'dst_table_prefix': self.options['dst_table_prefix'], 214 | }, 215 | }, 216 | } 217 | 218 | def __str__(self): 219 | return pprint.pformat(self.config) 220 | 221 | def __getitem__(self, item): 222 | return self.config[item] 223 | 224 | def log_file(self): 225 | return self.config['app']['log_file'] 226 | 227 | def log_level(self): 228 | return self.config['app']['log_level'] 229 | 230 | def pid_file(self): 231 | return self.config['app']['pid_file'] 232 | 233 | def mempool_max_rows_num(self): 234 | return self.config['app']['mempool_max_rows_num'] 235 | 236 | def is_daemon(self): 237 | return self.config['app']['daemon'] 238 | 239 | def is_create_table_sql_template(self): 240 | return self.config['app']['create_table_sql_template'] 241 | 242 | def is_create_table_sql(self): 243 | return self.config['app']['create_table_sql'] 244 | 245 | def is_with_create_database(self): 246 | return self.config['app']['with_create_database'] 247 | 248 | def is_dst_create_table(self): 249 | return self.config['table_builder']['clickhouse']['dst_create_table'] 250 | 251 | def is_create_table_json_template(self): 252 | return self.config['app']['create_table_json_template'] 253 | 254 | def is_install(self): 255 | return self.config['app']['install'] 256 | 257 | def table_sql_builder(self): 258 | return TableSQLBuilder( 259 | host=self.config['table_builder']['mysql']['host'], 260 | port=self.config['table_builder']['mysql']['port'], 261 | user=self.config['table_builder']['mysql']['user'], 262 | password=self.config['table_builder']['mysql']['password'], 263 | dbs=self.config['table_builder']['mysql']['schemas'], 264 | dst_schema=self.config['table_builder']['clickhouse']['dst_schema'], 265 | dst_table=self.config['table_builder']['clickhouse']['dst_table'], 266 | dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], 267 | distribute=self.config['table_builder']['clickhouse']['dst_distribute'], 268 | cluster=self.config['table_builder']['clickhouse']['dst_cluster'], 269 | tables=self.config['table_builder']['mysql']['tables'], 270 | tables_prefixes=self.config['table_builder']['mysql']['tables_prefixes'], 271 | column_skip=self.config['converter']['clickhouse']['column_skip'], 272 | ) 273 | 274 | def is_migrate_table(self): 275 | return self.config['app']['migrate_table'] 276 | 277 | def is_pump_data(self): 278 | return self.config['app']['pump_data'] 279 | 280 | def chclient(self): 281 | return CHClient(self.config['writer']['clickhouse']['connection_settings']) 282 | 283 | def table_migrator(self): 284 | table_migrator = TableMigrator( 285 | host=self.config['table_migrator']['mysql']['host'], 286 | port=self.config['table_migrator']['mysql']['port'], 287 | user=self.config['table_migrator']['mysql']['user'], 288 | password=self.config['table_migrator']['mysql']['password'], 289 | dbs=self.config['table_migrator']['mysql']['schemas'], 290 | dst_schema=self.config['table_migrator']['clickhouse']['dst_schema'], 291 | dst_table=self.config['table_builder']['clickhouse']['dst_table'], 292 | 
dst_table_prefix=self.config['table_builder']['clickhouse']['dst_table_prefix'], 293 | distribute=self.config['table_migrator']['clickhouse']['dst_distribute'], 294 | cluster=self.config['table_migrator']['clickhouse']['dst_cluster'], 295 | tables=self.config['table_migrator']['mysql']['tables'], 296 | tables_prefixes=self.config['table_migrator']['mysql']['tables_prefixes'], 297 | tables_where_clauses=self.config['table_migrator']['mysql']['tables_where_clauses'], 298 | column_skip=self.config['converter']['clickhouse']['column_skip'], 299 | ) 300 | table_migrator.chwriter = self.writer_builder_chwriter().get() 301 | table_migrator.chclient = self.chclient() 302 | table_migrator.pool_max_rows_num = self.mempool_max_rows_num() 303 | 304 | return table_migrator 305 | 306 | def reader(self): 307 | if self.config['reader']['file']['csv_file_path']: 308 | return CSVReader( 309 | csv_file_path=self.config['reader']['file']['csv_file_path'], 310 | ) 311 | else: 312 | return MySQLReader( 313 | connection_settings={ 314 | 'host': self.config['reader']['mysql']['connection_settings']['host'], 315 | 'port': self.config['reader']['mysql']['connection_settings']['port'], 316 | 'user': self.config['reader']['mysql']['connection_settings']['user'], 317 | 'passwd': self.config['reader']['mysql']['connection_settings']['password'], 318 | }, 319 | server_id=self.config['reader']['mysql']['server_id'], 320 | log_file=self.config['reader']['mysql']['log_file'], 321 | log_pos=self.config['reader']['mysql']['log_pos'], 322 | schemas=self.config['reader']['mysql']['schemas'], 323 | tables=self.config['reader']['mysql']['tables'], 324 | tables_prefixes=self.config['reader']['mysql']['tables_prefixes'], 325 | blocking=self.config['reader']['mysql']['blocking'], 326 | resume_stream=self.config['reader']['mysql']['resume_stream'], 327 | nice_pause=self.config['reader']['mysql']['nice_pause'], 328 | binlog_position_file=self.config['app']['binlog_position_file'], 329 | ) 330 | 331 | def converter_builder(self, which): 332 | if which == CONVERTER_CSV: 333 | return ObjectBuilder( 334 | instance=CSVWriteConverter( 335 | defaults=self.config['converter']['csv']['column_default_value'], 336 | column_skip=self.config['converter']['csv']['column_skip'], 337 | )) 338 | 339 | elif which == CONVERTER_CH: 340 | if not self.config['converter']['clickhouse']['converter_file'] or not self.config['converter']['clickhouse']['converter_class']: 341 | # default converter 342 | return ObjectBuilder(instance=CHWriteConverter(column_skip=self.config['converter']['clickhouse']['column_skip'])) 343 | else: 344 | # explicitly specified converter 345 | _class = Util.class_from_file( 346 | self.config['converter']['clickhouse']['converter_file'], 347 | self.config['converter']['clickhouse']['converter_class'] 348 | ) 349 | return ObjectBuilder(instance=_class(column_skip=self.config['converter']['clickhouse']['column_skip'])) 350 | 351 | def writer_builder_csvpool(self): 352 | return ObjectBuilder(class_name=ProcessWriter, constructor_params={ 353 | 'next_writer_builder': ObjectBuilder(class_name=CSVWriter, constructor_params={ 354 | 'csv_file_path': self.config['writer']['file']['csv_file_path'], 355 | 'csv_file_path_prefix': self.config['writer']['file']['csv_file_path_prefix'], 356 | 'csv_file_path_suffix_parts': self.config['writer']['file']['csv_file_path_suffix_parts'], 357 | 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 358 | 'dst_schema': self.config['writer']['file']['dst_schema'], 359 | 'dst_table': 
self.config['writer']['file']['dst_table'], 360 | 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 361 | 'next_writer_builder': ObjectBuilder( 362 | class_name=CHCSVWriter, 363 | constructor_params=self.config['writer']['clickhouse'] 364 | ), 365 | 'converter_builder': self.converter_builder(CONVERTER_CSV), 366 | }) 367 | }) 368 | 369 | def writer_builder_csv_file(self): 370 | return ObjectBuilder(class_name=CSVWriter, constructor_params={ 371 | 'csv_file_path': self.config['writer']['file']['csv_file_path'], 372 | 'csv_file_path_prefix': self.config['writer']['file']['csv_file_path_prefix'], 373 | 'csv_file_path_suffix_parts': self.config['writer']['file']['csv_file_path_suffix_parts'], 374 | 'csv_keep_file': self.config['writer']['file']['csv_keep_file'], 375 | 'dst_schema': self.config['writer']['file']['dst_schema'], 376 | 'dst_table': self.config['writer']['file']['dst_table'], 377 | 'dst_table_prefix': self.config['writer']['file']['dst_table_prefix'], 378 | 'next_writer_builder': None, 379 | 'converter_builder': self.converter_builder(CONVERTER_CSV), 380 | }) 381 | 382 | def writer_builder_chwriter(self): 383 | return ObjectBuilder(class_name=CHWriter, constructor_params={ 384 | 'connection_settings': { 385 | 'host': self.config['writer']['clickhouse']['connection_settings']['host'], 386 | 'port': self.config['writer']['clickhouse']['connection_settings']['port'], 387 | 'user': self.config['writer']['clickhouse']['connection_settings']['user'], 388 | 'password': self.config['writer']['clickhouse']['connection_settings']['password'], 389 | }, 390 | 'dst_schema': self.config['writer']['clickhouse']['dst_schema'], 391 | 'dst_table': self.config['writer']['clickhouse']['dst_table'], 392 | 'dst_table_prefix': self.config['writer']['clickhouse']['dst_table_prefix'], 393 | 'dst_distribute': self.config['writer']['clickhouse']['dst_distribute'], 394 | 'next_writer_builder': None, 395 | 'converter_builder': self.converter_builder(CONVERTER_CH), 396 | }) 397 | 398 | def writer_builder(self): 399 | if self.config['app']['csvpool']: 400 | return self.writer_builder_csvpool() 401 | elif self.config['writer']['file']['csv_file_path']: 402 | return self.writer_builder_csv_file() 403 | else: 404 | return self.writer_builder_chwriter() 405 | 406 | def pool_writer(self): 407 | return PoolWriter( 408 | writer_builder=self.writer_builder(), 409 | max_pool_size=self.config['app']['mempool_max_events_num'], 410 | max_flush_interval=self.config['app']['mempool_max_flush_interval'], 411 | ) 412 | 413 | def writer(self): 414 | if self.config['app']['mempool']: 415 | return self.pool_writer() 416 | else: 417 | return self.writer_builder().get() 418 | 419 | def class_from_file(self, file, class_name): 420 | """ 421 | Load a class from the specified Python source file 422 | :param file: /path/to/file.py 423 | :param class_name: name of the class to load, Ex.: CHWriteConverter 424 | :return: instance of the loaded class 425 | """ 426 | import importlib.util 427 | spec = importlib.util.spec_from_file_location("custom.module", file) 428 | module = importlib.util.module_from_spec(spec) 429 | spec.loader.exec_module(module) 430 | _class = getattr(module, class_name) 431 | instance = _class() 432 | return instance 433 | -------------------------------------------------------------------------------- /clickhouse_mysql/clioptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import logging 6 | import pprint 7 | 8 | 9 | class Options(object): 10 | @staticmethod 11 | def 
join_lists_into_dict(lists_to_join): 12 | """ 13 | Join several lists into one dictionary 14 | 15 | :param lists_to_join: is a list of lists 16 | [['a=b', 'c=d'], ['e=f', 'z=x'], ] 17 | 18 | :return: None or dictionary 19 | {'a': 'b', 'c': 'd', 'e': 'f', 'z': 'x'} 20 | 21 | """ 22 | 23 | # lists_to_join must be a list 24 | if not isinstance(lists_to_join, list): 25 | return None 26 | 27 | res = {} 28 | # res = dict { 29 | # 'name1': 'value1', 30 | # 'name2': 'value2', 31 | # } 32 | for _list in lists_to_join: 33 | # _list = ['a=b', 'c=d'] 34 | for name_value_pair in _list: 35 | # name_value_pair contains 'a=b' 36 | name, value = name_value_pair.split('=', 1)  # split on the first '=' only - the value part may itself contain '=' 37 | res[name] = value 38 | 39 | # return with sanity check 40 | if len(res) > 0: 41 | return res 42 | else: 43 | return None 44 | 45 | @staticmethod 46 | def join_lists(lists_to_join): 47 | """ 48 | Join several lists into one 49 | :param lists_to_join: is a list of lists 50 | [['a', 'b'], ['c', 'd'], ['e', 'f']] 51 | :return: 52 | ['a', 'b', 'c', 'd', 'e', 'f'] 53 | """ 54 | 55 | # lists_to_join must be a list 56 | if not isinstance(lists_to_join, list): 57 | return None 58 | 59 | res = [] 60 | for _list in lists_to_join: 61 | for _item in _list: 62 | res.append(_item) 63 | 64 | return res 65 | 66 | @staticmethod 67 | def log_level_from_string(log_level_string): 68 | """Convert string representation of a log level into logging.XXX constant""" 69 | 70 | if isinstance(log_level_string, str): 71 | level = log_level_string.upper() 72 | 73 | if level == 'CRITICAL': 74 | return logging.CRITICAL 75 | if level == 'ERROR': 76 | return logging.ERROR 77 | if level == 'WARNING': 78 | return logging.WARNING 79 | if level == 'INFO': 80 | return logging.INFO 81 | if level == 'DEBUG': 82 | return logging.DEBUG 83 | if level == 'NOTSET': 84 | return logging.NOTSET 85 | 86 | return logging.NOTSET 87 | 88 | 89 | class CLIOptions(Options): 90 | """Options extracted from command line""" 91 | 92 | default_options = { 93 | # 94 | # general app section 95 | # 96 | 'config_file': '/etc/clickhouse-mysql/clickhouse-mysql.conf', 97 | 'log_file': None, 98 | 'log_level': None, 99 | 'nice_pause': None, 100 | 'dry': False, 101 | 'daemon': False, 102 | 'pid_file': '/tmp/reader.pid', 103 | 'binlog_position_file': None, 104 | 'mempool': False, 105 | 'mempool_max_events_num': 100000, 106 | 'mempool_max_rows_num': 100000, 107 | 'mempool_max_flush_interval': 60, 108 | 'csvpool': False, 109 | 'csvpool_file_path_prefix': '/tmp/csvpool_', 110 | 'csvpool_keep_files': False, 111 | 'create_table_sql_template': False, 112 | 'create_table_sql': False, 113 | 'with_create_database': False, 114 | 'create_table_json_template': False, 115 | 'migrate_table': False, 116 | 'pump_data': False, 117 | 'install': False, 118 | 119 | # 120 | # src section 121 | # 122 | 'src_server_id': None, 123 | 'src_host': None, 124 | 'src_port': 3306, 125 | 'src_user': None, 126 | 'src_password': None, 127 | 'src_schemas': None, 128 | 'src_tables': None, 129 | 'src_tables_where_clauses': None, 130 | 'src_tables_prefixes': None, 131 | 'src_wait': False, 132 | 'src_resume': False, 133 | 'src_binlog_file': None, 134 | 'src_binlog_position': None, 135 | 'src_file': None, 136 | 137 | # 138 | # dst section 139 | # 140 | 'dst_file': None, 141 | 'dst_host': None, 142 | 'dst_port': 9000, 143 | 'dst_user': 'default', 144 | 'dst_password': '', 145 | 'dst_schema': None, 146 | 'dst_distribute': False, 147 | 'dst_cluster': None, 148 | 'dst_table': None, 149 | 'dst_table_prefix': None, 150 | 'dst_create_table': False,
151 | 152 | # 153 | # converters section 154 | # 155 | 'column_default_value': None, 156 | 'column_skip': [], 157 | 'ch_converter_file': None, 158 | 'ch_converter_class': None, 159 | } 160 | 161 | def options(self): 162 | """Parse application's CLI options into options dictionary 163 | :return: instance of Config 164 | """ 165 | 166 | argparser = argparse.ArgumentParser( 167 | description='ClickHouse data reader', 168 | epilog='===============' 169 | ) 170 | 171 | # 172 | # general app section 173 | # 174 | argparser.add_argument( 175 | '--config-file', 176 | type=str, 177 | default=self.default_options['config_file'], 178 | help='Path to config file. Default - not specified' 179 | ) 180 | argparser.add_argument( 181 | '--log-file', 182 | type=str, 183 | default=self.default_options['log_file'], 184 | help='Path to log file. Default - not specified' 185 | ) 186 | argparser.add_argument( 187 | '--log-level', 188 | type=str, 189 | default=self.default_options['log_level'], 190 | help='Log Level. Default - NOTSET' 191 | ) 192 | argparser.add_argument( 193 | '--nice-pause', 194 | type=int, 195 | default=self.default_options['nice_pause'], 196 | help='Make specified (in sec) pause between attempts to read binlog stream' 197 | ) 198 | argparser.add_argument( 199 | '--dry', 200 | action='store_true', 201 | help='Dry mode - do not do anything that can harm. ' 202 | 'Useful for debugging.' 203 | ) 204 | argparser.add_argument( 205 | '--daemon', 206 | action='store_true', 207 | help='Daemon mode - go to background.' 208 | ) 209 | argparser.add_argument( 210 | '--pid-file', 211 | type=str, 212 | default=self.default_options['pid_file'], 213 | help='Pid file to be used by the app in daemon mode' 214 | ) 215 | argparser.add_argument( 216 | '--binlog-position-file', 217 | type=str, 218 | default=self.default_options['binlog_position_file'], 219 | help='File to write binlog position to during bin log reading and to read position from on start' 220 | ) 221 | argparser.add_argument( 222 | '--mempool', 223 | action='store_true', 224 | help='Cache data in mem.' 225 | ) 226 | argparser.add_argument( 227 | '--mempool-max-events-num', 228 | type=int, 229 | default=self.default_options['mempool_max_events_num'], 230 | help='Max events number to pool - triggering pool flush' 231 | ) 232 | argparser.add_argument( 233 | '--mempool-max-rows-num', 234 | type=int, 235 | default=self.default_options['mempool_max_rows_num'], 236 | help='Max rows number to pool - triggering pool flush' 237 | ) 238 | argparser.add_argument( 239 | '--mempool-max-flush-interval', 240 | type=int, 241 | default=self.default_options['mempool_max_flush_interval'], 242 | help='Max seconds number between pool flushes' 243 | ) 244 | argparser.add_argument( 245 | '--csvpool', 246 | action='store_true', 247 | help='Cache data in CSV pool files on disk. Requires memory pooling, ' 248 | 'thus enables --mempool even if it is not explicitly specified' 249 | ) 250 | argparser.add_argument( 251 | '--csvpool-file-path-prefix', 252 | type=str, 253 | default=self.default_options['csvpool_file_path_prefix'], 254 | help='File path prefix to CSV pool files' 255 | ) 256 | argparser.add_argument( 257 | '--csvpool-keep-files', 258 | action='store_true', 259 | help='Keep CSV pool files. Useful for debugging' 260 | ) 261 | argparser.add_argument( 262 | '--create-table-sql-template', 263 | action='store_true', 264 | help='Prepare CREATE TABLE SQL template(s).' 
265 | ) 266 | argparser.add_argument( 267 | '--create-table-sql', 268 | action='store_true', 269 | help='Prepare CREATE TABLE SQL statement(s).' 270 | ) 271 | argparser.add_argument( 272 | '--with-create-database', 273 | action='store_true', 274 | help='Prepend each CREATE TABLE SQL statement(s) with CREATE DATABASE statement' 275 | ) 276 | argparser.add_argument( 277 | '--create-table-json-template', 278 | action='store_true', 279 | help='Prepare CREATE TABLE template(s) as JSON. Useful for IPC' 280 | ) 281 | argparser.add_argument( 282 | '--migrate-table', 283 | action='store_true', 284 | help='Migrate table(s). Copy existing data from MySQL table(s) with SELECT statement. ' 285 | 'Binlog is not read during this procedure - just copy data from the src table(s). ' 286 | 'IMPORTANT!. Target table has to be created in ClickHouse ' 287 | 'or it has to be created with --dst-create-table and possibly with --with-create-database options. ' 288 | 'See --create-table-sql-template and --create-table-sql options for additional info. ' 289 | ) 290 | argparser.add_argument( 291 | '--pump-data', 292 | action='store_true', 293 | help='Pump data from MySQL binlog into ClickHouse. Copy rows from binlog until the end of binlog reached. ' 294 | 'When end of binlog reached, process ends. ' 295 | 'Use in combination with --src-wait in case would like to continue and wait for new rows ' 296 | 'after end of binlog reached' 297 | ) 298 | argparser.add_argument( 299 | '--install', 300 | action='store_true', 301 | help='Install service file(s)' 302 | ) 303 | 304 | # 305 | # src section 306 | # 307 | argparser.add_argument( 308 | '--src-server-id', 309 | type=int, 310 | default=self.default_options['src_server_id'], 311 | help='Set server_id to be used when reading date from MySQL src. Ex.: 1' 312 | ) 313 | argparser.add_argument( 314 | '--src-host', 315 | type=str, 316 | default=self.default_options['src_host'], 317 | help='Host to be used when reading from src. Ex.: 127.0.0.1' 318 | ) 319 | argparser.add_argument( 320 | '--src-port', 321 | type=int, 322 | default=self.default_options['src_port'], 323 | help='Port to be used when reading from src. Ex.: 3306' 324 | ) 325 | argparser.add_argument( 326 | '--src-user', 327 | type=str, 328 | default=self.default_options['src_user'], 329 | help='Username to be used when reading from src. Ex.: root' 330 | ) 331 | argparser.add_argument( 332 | '--src-password', 333 | type=str, 334 | default=self.default_options['src_password'], 335 | help='Password to be used when reading from src. Ex.: qwerty' 336 | ) 337 | argparser.add_argument( 338 | '--src-schemas', 339 | type=str, 340 | default=self.default_options['src_schemas'], 341 | help='Comma-separated list of databases (a.k.a schemas) to be used when reading from src. Ex.: db1,db2,db3' 342 | ) 343 | argparser.add_argument( 344 | '--src-tables', 345 | type=str, 346 | default=self.default_options['src_tables'], 347 | help='Comma-separated list of tables to be used when reading from src. ' 348 | 'Ex.: table1,table2,table3' 349 | 'Ex.: db1.table1,db2.table2,db3.table3' 350 | 'Ex.: table1,db2.table2,table3' 351 | ) 352 | argparser.add_argument( 353 | '--src-tables-where-clauses', 354 | type=str, 355 | default=self.default_options['src_tables_where_clauses'], 356 | help='Comma-separated list of WHERE clauses for tables to be migrated. ' 357 | 'Ex.: db1.t1="a=1 and b=2",db2.t2="c=3 and k=4". 
' 358 | 'Accepts both (comma-separated) clause (useful for short clauses) or ' 359 | 'file where clause is located (useful for long clauses)' 360 | ) 361 | argparser.add_argument( 362 | '--src-tables-prefixes', 363 | type=str, 364 | default=self.default_options['src_tables_prefixes'], 365 | help='Comma-separated list of table prefixes to be used when reading from src.' 366 | 'Useful when we need to process unknown-in-advance tables, say day-named log tables, as log_2017_12_27' 367 | 'Ex.: mylog_,anotherlog_,extralog_3' 368 | ) 369 | argparser.add_argument( 370 | '--src-wait', 371 | action='store_true', 372 | help='Wait indefinitely for new records to come.' 373 | ) 374 | argparser.add_argument( 375 | '--src-resume', 376 | action='store_true', 377 | help='Resume reading from previous position. Previous position is read from `binlog-position-file`' 378 | ) 379 | argparser.add_argument( 380 | '--src-binlog-file', 381 | type=str, 382 | default=self.default_options['src_binlog_file'], 383 | help='Binlog file to be used to read from src. Related to `binlog-position-file`. ' 384 | 'Ex.: mysql-bin.000024' 385 | ) 386 | argparser.add_argument( 387 | '--src-binlog-position', 388 | type=int, 389 | default=self.default_options['src_binlog_position'], 390 | help='Binlog position to be used when reading from src. Related to `binlog-position-file`. ' 391 | 'Ex.: 5703' 392 | ) 393 | argparser.add_argument( 394 | '--src-file', 395 | type=str, 396 | default=self.default_options['src_file'], 397 | help='Source file to read data from. CSV' 398 | ) 399 | 400 | # 401 | # dst section 402 | # 403 | argparser.add_argument( 404 | '--dst-file', 405 | type=str, 406 | default=self.default_options['dst_file'], 407 | help='Target file to be used when writing data. CSV' 408 | ) 409 | argparser.add_argument( 410 | '--dst-host', 411 | type=str, 412 | default=self.default_options['dst_host'], 413 | help='Host to be used when writing to dst. Ex.: 127.0.0.1' 414 | ) 415 | argparser.add_argument( 416 | '--dst-port', 417 | type=int, 418 | default=self.default_options['dst_port'], 419 | help='Port to be used when writing to dst. Ex.: 9000' 420 | ) 421 | argparser.add_argument( 422 | '--dst-user', 423 | type=str, 424 | default=self.default_options['dst_user'], 425 | help='Username to be used when writing to dst. Ex: default' 426 | ) 427 | argparser.add_argument( 428 | '--dst-password', 429 | type=str, 430 | default=self.default_options['dst_password'], 431 | help='Password to be used when writing to dst. Ex.: qwerty' 432 | ) 433 | argparser.add_argument( 434 | '--dst-schema', 435 | type=str, 436 | default=self.default_options['dst_schema'], 437 | help='Database (a.k.a schema) to be used to create tables in ClickHouse. ' 438 | 'It overwrites source database(s) name(s), so tables in ClickHouse ' 439 | 'would be located in differently named db than in MySQL. ' 440 | 'Ex.: db1' 441 | ) 442 | argparser.add_argument( 443 | '--dst-distribute', 444 | action='store_true', 445 | default=self.default_options['dst_distribute'], 446 | help='Whether to add distribute table' 447 | ) 448 | argparser.add_argument( 449 | '--dst-cluster', 450 | type=str, 451 | default=self.default_options['dst_cluster'], 452 | help='Cluster to be used when writing to dst. Ex.: cluster1' 453 | ) 454 | argparser.add_argument( 455 | '--dst-table', 456 | type=str, 457 | default=self.default_options['dst_table'], 458 | help='Table to be used when writing to dst. 
Ex.: table1' 459 | ) 460 | argparser.add_argument( 461 | '--dst-table-prefix', 462 | type=str, 463 | default=self.default_options['dst_table_prefix'], 464 | help='Prefix to be used when creating dst table. Ex.: copy_table_' 465 | ) 466 | argparser.add_argument( 467 | '--dst-create-table', 468 | action='store_true', 469 | help='Prepare and run CREATE TABLE SQL statement(s).' 470 | ) 471 | 472 | # 473 | # converters section 474 | # 475 | argparser.add_argument( 476 | '--column-default-value', 477 | type=str, 478 | nargs='*', 479 | action='append', 480 | default=self.default_options['column_default_value'], 481 | help='Set of key=value pairs for columns default values. ' 482 | 'Ex.: date_1=2000-01-01 timestamp_1=2002-01-01\ 01:02:03' 483 | ) 484 | argparser.add_argument( 485 | '--column-skip', 486 | type=str, 487 | nargs='*', 488 | action='append', 489 | default=self.default_options['column_skip'], 490 | help='Set of column names to skip. Ex.: column1 column2' 491 | ) 492 | argparser.add_argument( 493 | '--ch-converter-file', 494 | type=str, 495 | default=self.default_options['ch_converter_file'], 496 | help='Filename where to search for CH converter class' 497 | ) 498 | argparser.add_argument( 499 | '--ch-converter-class', 500 | type=str, 501 | default=self.default_options['ch_converter_class'], 502 | help='Converter class name in --ch-converter-file file' 503 | ) 504 | 505 | args = argparser.parse_args() 506 | 507 | return { 508 | # 509 | # general app section 510 | # 511 | 'config_file': args.config_file, 512 | 'log_file': args.log_file, 513 | 'log_level': args.log_level, 514 | 'nice_pause': args.nice_pause, 515 | 'dry': args.dry, 516 | 'daemon': args.daemon, 517 | 'pid_file': args.pid_file, 518 | 'binlog_position_file': args.binlog_position_file, 519 | 'mempool': args.mempool, # csvpool assumes mempool to be enabled 520 | 'mempool_max_events_num': args.mempool_max_events_num, 521 | 'mempool_max_rows_num': args.mempool_max_rows_num, 522 | 'mempool_max_flush_interval': args.mempool_max_flush_interval, 523 | 'csvpool': args.csvpool, 524 | 'csvpool_file_path_prefix': args.csvpool_file_path_prefix, 525 | 'csvpool_keep_files': args.csvpool_keep_files, 526 | 'create_table_sql_template': args.create_table_sql_template, 527 | 'create_table_sql': args.create_table_sql, 528 | 'with_create_database': args.with_create_database, 529 | 'create_table_json_template': args.create_table_json_template, 530 | 'migrate_table': args.migrate_table, 531 | 'pump_data': args.pump_data, 532 | 'install': args.install, 533 | 534 | # 535 | # src section 536 | # 537 | 'src_server_id': args.src_server_id, 538 | 'src_host': args.src_host, 539 | 'src_port': args.src_port, 540 | 'src_user': args.src_user, 541 | 'src_password': args.src_password, 542 | 'src_schemas': [x for x in args.src_schemas.split(',') if x] if args.src_schemas else self.default_options['src_schemas'], 543 | 'src_tables': [x for x in args.src_tables.split(',') if x] if args.src_tables else self.default_options['src_tables'], 544 | 'src_tables_where_clauses': [x for x in args.src_tables_where_clauses.split(',') if x] if args.src_tables_where_clauses else self.default_options['src_tables_where_clauses'], 545 | 'src_tables_prefixes': [x for x in args.src_tables_prefixes.split(',') if x] if args.src_tables_prefixes else self.default_options['src_tables_prefixes'], 546 | 'src_wait': args.src_wait, 547 | 'src_resume': args.src_resume, 548 | 'src_binlog_file': args.src_binlog_file, 549 | 'src_binlog_position': args.src_binlog_position, 550 | 'src_file': 
args.src_file, 551 | 552 | # 553 | # dst section 554 | # 555 | 'dst_file': args.dst_file, 556 | 'dst_host': args.dst_host, 557 | 'dst_port': args.dst_port, 558 | 'dst_user': args.dst_user, 559 | 'dst_password': args.dst_password, 560 | 'dst_schema': args.dst_schema, 561 | 'dst_distribute': args.dst_distribute, 562 | 'dst_cluster': args.dst_cluster, 563 | 'dst_table': args.dst_table, 564 | 'dst_table_prefix': args.dst_table_prefix, 565 | 'dst_create_table': args.dst_create_table, 566 | 567 | # 568 | # converters section 569 | # 570 | 'column_default_value': CLIOptions.join_lists_into_dict(args.column_default_value), 571 | 'column_skip': CLIOptions.join_lists(args.column_skip), 572 | 'ch_converter_file': args.ch_converter_file, 573 | 'ch_converter_class': args.ch_converter_class, 574 | } 575 | 576 | from configobj import ConfigObj 577 | 578 | 579 | class ConfigFileOptions(Options): 580 | """Options extracted from configuration files""" 581 | 582 | @staticmethod 583 | def options(filename): 584 | 585 | # 586 | def transform(section, key): 587 | new_key = key.replace('-', '_') 588 | section.rename(key, new_key) 589 | 590 | # fetch base config 591 | try: 592 | base_config = ConfigObj( 593 | infile='/etc/clickhouse-mysql/config.ini', 594 | encoding="utf-8", 595 | default_encoding="utf-8", 596 | list_values=True, 597 | create_empty=False, # create empty config file 598 | stringify=True, 599 | raise_errors=False, 600 | file_error=False, 601 | ) 602 | except: 603 | base_config = None 604 | 605 | # fetch user config 606 | try: 607 | user_config = ConfigObj( 608 | filename, 609 | encoding="utf-8", 610 | default_encoding="utf-8", 611 | list_values=True, 612 | create_empty=False, # create empty config file 613 | stringify=True, 614 | raise_errors=False, 615 | file_error=False, 616 | ) 617 | except: 618 | user_config = None 619 | 620 | # merge base and user configs 621 | # user config has priority over base config 622 | 623 | if base_config and user_config: 624 | base_config.merge(user_config) 625 | base_config.walk(transform, call_on_sections=True) 626 | return base_config 627 | 628 | if base_config: 629 | base_config.walk(transform, call_on_sections=True) 630 | return base_config 631 | 632 | if user_config: 633 | user_config.walk(transform, call_on_sections=True) 634 | return user_config 635 | 636 | return None 637 | 638 | 639 | class AggregatedOptions(object): 640 | """Aggregated and prioritized options""" 641 | 642 | cli_opts = None 643 | cfg_opts = None 644 | env_opts = None 645 | 646 | cli = None 647 | 648 | def __init__(self): 649 | """Build aggregated options""" 650 | self.cli = CLIOptions() 651 | 652 | self.cli_opts = self.cli.options() 653 | self.cfg_opts = ConfigFileOptions.options(self.cli_opts['config_file']) 654 | 655 | def get_from_src(self, src, *coordinates): 656 | """Fetch an option by specified coordinates from provided source""" 657 | 658 | first_iteration = True 659 | for coordinate in coordinates: 660 | try: 661 | section = src[coordinate] if first_iteration else section[coordinate] 662 | except: 663 | return None 664 | first_iteration = False 665 | 666 | return section 667 | 668 | def get(self, *coordinates): 669 | """ 670 | Fetch an option by specified coordinates according to source priorities. 671 | Priority would be: 672 | 1. config (lower priority) 673 | 2. 
CLI opts 674 | """ 675 | cfg_opt = self.get_from_src(self.cfg_opts, *coordinates) 676 | cli_opt = self.get_from_src(self.cli_opts, *coordinates) 677 | cli_def = self.get_from_src(self.cli.default_options, *coordinates) 678 | 679 | if cli_opt != cli_def: 680 | # CLI opt is set - it is not default one - top priority 681 | return cli_opt 682 | 683 | # here CLI option is a default one 684 | 685 | if cfg_opt is not None: 686 | # cfg opt - is set lower priority 687 | return cfg_opt 688 | 689 | # option not available - return CLI default 690 | return cli_def 691 | 692 | def get_int(self, *coordinates): 693 | value = self.get(*coordinates) 694 | if value is not None: 695 | value = int(value) 696 | return value 697 | 698 | def get_list(self, *coordinates): 699 | value = self.get(*coordinates) 700 | 701 | # return None as it is 702 | if value is None: 703 | return None 704 | 705 | # return list-like type as it is 706 | if isinstance(value, (list, set, dict, tuple)): 707 | return value 708 | 709 | # wrap value in a list 710 | return [value] 711 | 712 | def get_bool(self, *coordinates): 713 | value = self.get(*coordinates) 714 | 715 | if value is None: 716 | # None is not interpreted 717 | return None 718 | elif isinstance(value, bool): 719 | # bool is ready-to-use 720 | return value 721 | elif isinstance(value, str): 722 | # str can be interpreted as "yes", "1", "on" 723 | value = value.upper() 724 | if (value == '1') or (value == 'YES') or (value == 'ON'): 725 | return True 726 | else: 727 | return False 728 | else: 729 | # int and all the rest just cast into bool 730 | return bool(value) 731 | 732 | def __getitem__(self, coordinates_tuple): 733 | if isinstance(coordinates_tuple, tuple): 734 | return self.get(*coordinates_tuple) 735 | else: 736 | return self.get(coordinates_tuple) 737 | 738 | def __str__(self): 739 | str = 'OPTIONS:\n' 740 | if self.cli_opts: 741 | str += 'CLI: =================\n' 742 | str += pprint.pformat(self.cli_opts) 743 | str += '\n' 744 | 745 | if self.cfg_opts: 746 | dict = self.cfg_opts.walk(lambda section, key: section[key]) 747 | str += 'CFG: =================\n' 748 | str += pprint.pformat(dict) 749 | str += '\n' 750 | 751 | return str 752 | --------------------------------------------------------------------------------
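For orientation, the writer selection implemented by writer_builder()/writer() in the builder module above can be summarized as follows. This is an illustrative sketch only: describe_writer() is a hypothetical helper, not part of the project; it merely restates the branch order using the same config keys.

def describe_writer(config):
    # Pooling wrapper: --mempool (implied by --csvpool) wraps the writer into a PoolWriter
    # that flushes by mempool_max_events_num / mempool_max_flush_interval.
    wrapper = 'PoolWriter' if config['app']['mempool'] else 'no pooling wrapper'
    # Writer chain: csvpool goes through ProcessWriter over CSVWriter over CHCSVWriter,
    # a plain CSV file target uses CSVWriter only, otherwise rows go straight to CHWriter.
    if config['app']['csvpool']:
        chain = 'ProcessWriter -> CSVWriter -> CHCSVWriter'
    elif config['writer']['file']['csv_file_path']:
        chain = 'CSVWriter'
    else:
        chain = 'CHWriter'
    return wrapper, chain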
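The flattening helpers at the top of clioptions.py exist because argparse collects repeatable options declared with nargs='*' and action='append' (such as --column-default-value and --column-skip) as a list of lists. A minimal sketch of how they behave; the column names and values below are made up for the example.

from clickhouse_mysql.clioptions import Options

# one inner list per occurrence of the flag on the command line
defaults = Options.join_lists_into_dict([
    ['date_1=2000-01-01'],
    ['timestamp_1=2002-01-01 01:02:03'],
])
# -> {'date_1': '2000-01-01', 'timestamp_1': '2002-01-01 01:02:03'}

skip = Options.join_lists([['column1', 'column2'], ['column3']])
# -> ['column1', 'column2', 'column3']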
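A minimal sketch of CLIOptions at work: comma-separated values such as --src-tables are split into lists and the store_true flags become booleans in the returned dictionary. The host, credentials and table names are placeholders, and sys.argv is patched only to keep the example self-contained.

import sys
from clickhouse_mysql.clioptions import CLIOptions

sys.argv = [
    'clickhouse-mysql',
    '--src-host=127.0.0.1', '--src-user=reader', '--src-tables=db1.table1',
    '--dst-host=127.0.0.1', '--dst-create-table', '--migrate-table',
]
opts = CLIOptions().options()
print(opts['src_tables'])     # -> ['db1.table1']
print(opts['migrate_table'])  # -> True
print(opts['dst_port'])       # -> 9000 (default)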
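ConfigFileOptions can be exercised in isolation as well: dashed option names coming from a config file are renamed to underscores by the transform() walker, and a user config is merged over the optional base config in /etc/clickhouse-mysql/config.ini when that file exists. The temporary file below is only a stand-in for a real config file.

import os
import tempfile
from clickhouse_mysql.clioptions import ConfigFileOptions

cfg = tempfile.NamedTemporaryFile('w', suffix='.conf', delete=False)
cfg.write('log-level = DEBUG\n')
cfg.close()

opts = ConfigFileOptions.options(cfg.name)
print(opts['log_level'] if opts else None)  # -> 'DEBUG' (key renamed from 'log-level')

os.unlink(cfg.name)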
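Finally, a sketch of the aggregation layer: AggregatedOptions parses the real command line, loads the file given by --config-file, and answers lookups with CLI-over-config priority, falling back to the CLIOptions defaults. The typed getters coerce values read from the config file, which arrive as strings.

from clickhouse_mysql.clioptions import AggregatedOptions

opts = AggregatedOptions()
print(opts['dst_port'])              # 9000 unless overridden on the CLI or in the config file
print(opts.get_int('src_port'))      # config-file strings are cast to int
print(opts.get_bool('csvpool'))      # '1', 'yes' and 'on' from the config file count as True
print(opts.get_list('src_schemas'))  # None, a list, or a scalar wrapped into a one-element list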