├── toolset.sh ├── test_schema.txt ├── common_settings.sh ├── readme.md ├── common └── functions.sh └── zookeeper_recovery.pl /toolset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | source ${SCRIPT_DIR}/common_settings.sh 5 | source ${SCRIPT_DIR}/common/functions.sh 6 | 7 | operation=$1 8 | 9 | log "Script started at $(hostname), master replica status: ${MASTER_REPLICA}" 10 | 11 | ## TODO: add arg processing (to adjust vars in common_settings w/o editing files) 12 | 13 | if [ "$operation" = "create_local_backup" ]; then 14 | create_local_backup 15 | elif [ "$operation" = "reset_node" ]; then 16 | reset_node 17 | elif [ "$operation" = "show_status" ]; then 18 | show_status 19 | elif [ "$operation" = "recover_non_replicated" ]; then 20 | recover_schema_reattach_non_replicated_tables 21 | elif [ "$operation" = "refill_replicated_tables" ]; then 22 | refill_replicated_tables 23 | elif [ "$operation" = "recreate_kafka_tables" ]; then 24 | recreate_kafka_tables 25 | else 26 | log "You need to pass operation as a script argument!" 27 | log "Possible operations:" 28 | log " * create_local_backup" 29 | log " * reset_node" 30 | log " * show_status" 31 | log " * recover_non_replicated" 32 | log " * refill_replicated_tables" 33 | log " * recreate_kafka_tables" 34 | log "See readme & source code for details." 35 | exit 1 36 | fi 37 | 38 | log "Finished!" 39 | -------------------------------------------------------------------------------- /test_schema.txt: -------------------------------------------------------------------------------- 1 | systemctl stop clickhouse-server 2 | rm -rf /var/lib/clickhouse/* 3 | systemctl start clickhouse-server 4 | 5 | clickhouse-client -mn 6 | 7 | 8 | SET max_block_size = 1, min_insert_block_size_bytes = 1, min_insert_block_size_rows = 1; 9 | 10 | create table X engine=MergeTree order by tuple() as select * from numbers(1000); 11 | create table Y engine=MergeTree order by tuple() as select * from numbers(1000); 12 | 13 | create table X1 engine=ReplicatedMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000); 14 | create table Y2 engine=ReplicatedSummingMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000); 15 | 16 | create table Z1 engine=Log as select * from numbers(1000); 17 | create table Z2 engine=TinyLog as select * from numbers(1000); 18 | create materialized view AAA to Z2 AS SELECT * FROM Z1; 19 | create view BBBB AS SELECT * FROM Z1; 20 | 21 | CREATE TABLE kafka (number UInt64) ENGINE = Kafka() SETTINGS kafka_broker_list = 'localhost:123', kafka_topic_list = 'topic1', kafka_group_name = 'group_name', kafka_format = 'TSV'; 22 | 23 | create database xxx; 24 | create table xxx.Z engine=MergeTree order by tuple() as select * from numbers(1000); 25 | create table xxx.Z3 engine=ReplicatedMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000); 26 | 27 | 28 | systemctl stop clickhouse-server 29 | rm -rf /var/lib/clickhouse/data 30 | rm -rf /var/lib/clickhouse/metadata 31 | systemctl start clickhouse-server 32 | -------------------------------------------------------------------------------- /common_settings.sh: -------------------------------------------------------------------------------- 1 | # some settings 2 | set -e # stop 
on error 3 | #set -x # print the commands we execute 4 | 5 | ### ADJUST THESE: 6 | 7 | CLICKHOUSE_WORKING_FOLDER=/var/lib/clickhouse 8 | 9 | # should be same disk as CLICKHOUSE_WORKING_FOLDER! (otherwise we can't use hardlinks) 10 | CLICKHOUSE_TOOLSET_FOLDER=/var/lib/clickhouse/clickhouse-toolset 11 | 12 | BACKUP_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/backup2020-11-10" 13 | 14 | # if you need some adjustments - like username/password/port/listen host or another parameter - adjust them here. 15 | CLICKHOUSE_CLIENT='clickhouse-client --host=127.0.0.1 --max_query_size=10000000' 16 | 17 | CLICKHOUSE_EXTRACT_FROM_CONFIG='clickhouse-extract-from-config --config-file /etc/clickhouse-server/config.xml' 18 | 19 | # for replicated tables we should use data only 20 | # from a single replica (the others will sync via replication), 21 | # otherwise we will end up with duplicated data 22 | 23 | # if the last character of the hostname is 1, we are on the master replica. 24 | HOSTNAME_SHORT=$(hostname -s) 25 | MASTER_REPLICA=$( [ "${HOSTNAME_SHORT: -1}" == "1" ] && echo 'true' || echo 'false' ) 26 | 27 | ### TODO: expose settings above via command-line args 28 | 29 | ### these normally should not be changed 30 | 31 | METADATA_FOLDER="${CLICKHOUSE_WORKING_FOLDER}/metadata" 32 | DATA_FOLDER="${CLICKHOUSE_WORKING_FOLDER}/data" 33 | 34 | BACKUP_METADATA_FOLDER="${BACKUP_FOLDER}/metadata" 35 | BACKUP_DATA_FOLDER="${BACKUP_FOLDER}/data" 36 | 37 | # we do mv instead of rm -rf (just in case), that folder is used as a trashbin 38 | TRASHBIN_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/trashbin_$(date +%Y%m%d_%H%M%S)" 39 | 40 | # we will put some tmp files there 41 | TMP_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/tmp" 42 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Project is OBSOLETE / not supported / not maintained. 2 | 3 | On all modern versions of ClickHouse (starting from 21.7) you should use the embedded functionality instead (see [SYSTEM RESTORE REPLICA](https://clickhouse.com/docs/en/sql-reference/statements/system/#restore-replica)). 4 | 5 | The original content of the README is below. 6 | 7 |





8 |





9 | 10 | # ClickHouse zookeeper recovery tool 11 | 12 | [ClickHouse](https://clickhouse.tech/) uses [ZooKeeper](https://zookeeper.apache.org/) for replication and for coordinating distributed operations on a cluster. While no data is stored in zookeeper (only metadata, like the list of parts and their checksums), ZooKeeper and that metadata are required for ClickHouse to work. 13 | 14 | So if for some reason you lost your zookeeper data, or it is damaged / out of sync, your ClickHouse servers will not start (or will start in read-only mode). 15 | To return it to a healthy state, you need to recover the zookeeper meta information from the existing state of the ClickHouse tables. 16 | 17 | These scripts can help you automate that process, even for clusters / a big number of tables. 18 | 19 | In simple cases you can do it manually (attach the Replicated table as non-Replicated, create a new Replicated table, move all partitions from the old table to the new one). 20 | 21 | ## Before you start 22 | 23 | 1. Analyze what happened. 24 | 25 | Usually, if you lose the zookeeper data, it means you configured something wrong or did some inaccurate manual operations which led to that situation. 26 | 27 | 2. Review your setup, and try not to lose your zookeeper data anymore (otherwise you will need to repeat that recovery process again really soon): 28 | * use the [recommended settings](https://clickhouse.tech/docs/en/operations/tips/#zookeeper) for Zookeeper 29 | * use a 3-node zookeeper ensemble 30 | * set up good monitoring for your zookeeper. 31 | 32 | 3. Ensure the data can't be recovered in a better way. 33 | 34 | ## How to use it 35 | 36 | You can follow the sequence below cluster-wide using some automation scripts (like ansible) or just with cluster-ssh. 37 | 38 | All steps (except step 5) may be executed on different replicas at different times. So you can recover them one after another, or simultaneously. 39 | 40 | 1) Adjust paths/parameters in `common_settings.sh`. The parameters are not (yet) configurable via command line. 41 | 42 | 2) We will do direct interventions in the clickhouse working folder, so clickhouse should be offline. 43 | 44 | ``` 45 | sudo systemctl stop clickhouse-server 46 | ``` 47 | 48 | 3) Create a backup of the data (using hard links). 49 | 50 | ``` 51 | sudo ./toolset.sh create_local_backup 52 | ``` 53 | 54 | 4) If you have some dirty state in zookeeper - clean it up. Do a backup (if needed) and run `deleteall /clickhouse` in `zkCli`. 55 | 56 | 5) Run: 57 | 58 | ``` 59 | sudo ./toolset.sh reset_node 60 | ``` 61 | 62 | That will move the data & metadata of all known tables away. So generally, that will reset the state 63 | of your server - all tables & databases will disappear (they are safe inside the backup). 64 | 65 | 6) Start clickhouse back: 66 | 67 | ``` 68 | sudo systemctl start clickhouse-server 69 | ``` 70 | 71 | At that point, it should be clean - only system tables will be in place. The rest is saved inside the backup. 72 | 73 | 7) Check the settings related to replication. Examine if they are correct: 74 | 75 | ``` 76 | sudo ./toolset.sh show_status 77 | ``` 78 | 79 | 8) Run: 80 | 81 | ``` 82 | sudo ./toolset.sh recover_non_replicated | tee recover_non_replicated_$(date +%Y%m%d_%H%M%S).log 83 | ``` 84 | 85 | That will recover the schema and data from the backup created in step 3. Replicated tables will be recovered without replication under another name (with the `.recovered_non_repl.` prefix), as shown below. Merges will be stopped, and Kafka tables are skipped to avoid starting consumption.
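
For illustration, this is roughly how the metadata of a Replicated table from `test_schema.txt` is rewritten during this step (a hypothetical, simplified example - the exact DDL in your metadata files will differ): the ZooKeeper arguments are stripped from the engine and the table is attached under the prefixed name.

```
-- original metadata (ATTACH statement written by ClickHouse):
ATTACH TABLE X1 (`number` UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') ORDER BY tuple()

-- what recover_non_replicated actually attaches:
ATTACH TABLE `.recovered_non_repl.X1` (`number` UInt64) ENGINE = MergeTree() ORDER BY tuple()
```

The table's data directory is copied (via hardlinks) to the matching `%2Erecovered_non_repl%2E...` folder beforehand, so the attached table immediately sees all existing parts.
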
86 | 87 | 9) At that point, you can review the state of your data on different replicas. 88 | 89 | If needed, you can decide which of them will be used as the source for recovery. 90 | 91 | **WARNING:** Only a single replica should have `MASTER_REPLICA=true` (otherwise, you will get data duplicates); it will be used to resync all data. 92 | 93 | Adjust the parameters in `common_settings.sh` if needed. 94 | 95 | 10) Run 96 | ``` 97 | sudo ./toolset.sh refill_replicated_tables | tee refill_replicated_tables_$(date +%Y%m%d_%H%M%S).log 98 | ``` 99 | That will create the Replicated tables back again. 100 | * If `MASTER_REPLICA=true`, it will additionally copy partitions from the `.recovered_non_repl.` table. 101 | * The replicas which have `MASTER_REPLICA=false` will just create the table(s) and will sync the data from the other ('MASTER') replica. 102 | * You can monitor the progress in `system.replication_queue` and/or `system.replicas`. 103 | * That may use a lot of network bandwidth. 104 | * On replicas which have `MASTER_REPLICA=false` you can also see doubled disk usage (we refetch data from the 'MASTER' replica while keeping our own copy in the backup folder created in step 3). 105 | 106 | 11) Now all tables/replicas should be back online, and we can enable merges (they were disabled in step 8) and start Kafka consumption: 107 | ``` 108 | sudo ./toolset.sh recreate_kafka_tables | tee recreate_kafka_tables_$(date +%Y%m%d_%H%M%S).log 109 | ``` 110 | 111 | 112 | In case of any failures during the recovery: 113 | 1) fix the problem 114 | 2) stop clickhouse: `sudo systemctl stop clickhouse-server` 115 | 3) restart the recovery sequence from step 4. 116 | 117 | The tool does not clean the backup and trashbin folders. You can clean them manually after a successful recovery. 118 | 119 | ## Notes 120 | 121 | Provided 'as is', use it at your own risk. 122 | * All actions are transparent, and the log is quite verbose. 123 | * We don't take any responsibility for potential data damage caused by inaccurate user actions related to that toolset. 124 | * We used these scripts to recover the zookeeper data for a cluster with 10 nodes (5 shards / 2 replicas) with hundreds (about 700) of tables. 125 | * During all procedures, we keep the backup (using hard links). 126 | * In simpler cases (single table), recovery can be done manually (see the sketch at the end of this README). 127 | 128 | Limitations: 129 | * It is currently not possible to recover zookeeper without downtime. 130 | * Because of hard links, all actions executed on the source file will also affect the hard-link copy and vice versa. In most cases, files in clickhouse are immutable, but for the engine=Log family (which is typically not used widely), it can be a problem. If you start modifying an `engine=Log` table just after recovery, the backup copy (which is not a real copy, but a hardlink) will be affected by those changes. 131 | * Checked on recent Linux versions only (Ubuntu 20, CentOS 7). 132 | * It doesn't support database=Atomic (yet?) 133 | * It doesn't support multidisk setups (yet?) / s3 disks. 134 | 135 | In newer ClickHouse versions a special command to automate that process (also to avoid a full resync) may be added.
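
For reference, the manual single-table recovery mentioned in the notes looks roughly like this (a minimal sketch only; `db.events`, the column list and the zookeeper path are hypothetical examples - take the real DDL from `SHOW CREATE TABLE`):

```
-- 1. convert the broken Replicated table into a plain MergeTree table:
--    detach it, edit its metadata .sql file (replace ReplicatedMergeTree('/zk/path', 'replica') with MergeTree()),
--    then attach it back and rename it out of the way:
DETACH TABLE db.events;
ATTACH TABLE db.events;
RENAME TABLE db.events TO db.events_old;

-- 2. recreate the Replicated table with its original schema and a clean zookeeper path:
CREATE TABLE db.events (...) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}') ORDER BY ...;

-- 3. move the data back, partition by partition (partition ids can be listed from system.parts):
ALTER TABLE db.events REPLACE PARTITION ID '<partition_id>' FROM db.events_old;
```

This is essentially what `recover_non_replicated` + `refill_replicated_tables` automate for the whole server.
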
136 | -------------------------------------------------------------------------------- /common/functions.sh: -------------------------------------------------------------------------------- 1 | perl -v > /dev/null || (echo 'no perl installed!'; exit 1) 2 | 3 | mkdir -p $TMP_FOLDER || (echo "can not create tmp folder: $TMP_FOLDER (forget sudo?)"; exit 1) 4 | 5 | log() { 6 | echo "$(date "+%Y-%m-%d %H:%M:%S.%N") $1" 7 | } 8 | 9 | log_and_run_command() { 10 | log " Executing: '$*'" 11 | "$@" 12 | } 13 | 14 | copy_folder_by_hardlinks() { 15 | local source="$1" 16 | local target="$2" 17 | log_and_run_command cp -rla "$source" "$target" 18 | } 19 | 20 | execute_with_retries() { 21 | local i 22 | for i in {1..20}; do 23 | set +e 24 | "$@"; 25 | local result=$? 26 | set -e 27 | if [ "$result" == 0 ]; then 28 | return 0; 29 | else 30 | log " ! Error on try #${i}, will retry in 3 sec" 31 | sleep 3; 32 | fi 33 | done 34 | log " ! Too many attempts!" 35 | return 1; 36 | 37 | } 38 | 39 | clickhouse_client_call() { 40 | local db_fs_name="$1" 41 | local add='' 42 | if [ -n "$db_fs_name" ]; then 43 | add=" --database=\$'$( urldecode "$db_fs_name" )'" 44 | fi 45 | eval ${CLICKHOUSE_CLIENT}${add} 46 | } 47 | 48 | run_clickhouse_query() { 49 | local db_fs_name="$1" 50 | local query="$2" 51 | echo "$query" | clickhouse_client_call "$db_fs_name" 52 | } 53 | 54 | run_clickhouse_query_with_retries() { 55 | execute_with_retries run_clickhouse_query "$@" 56 | } 57 | 58 | execute_metadata_file() { 59 | local db_fs_name="$1" 60 | local metadata_file="$2" 61 | log " Executing $metadata_file: $(head -n 1 $metadata_file)" 62 | cat "$metadata_file" | clickhouse_client_call "$db_fs_name" 63 | } 64 | 65 | execute_metadata_file_with_retries() { 66 | execute_with_retries execute_metadata_file "$@" 67 | # we don't want to merge anything for now, 68 | # STOP MERGES only stop merges for existsing tables 69 | # so we repeat it after every table creation 70 | run_clickhouse_query_with_retries "" "SYSTEM STOP MERGES" 71 | } 72 | 73 | metadata_file_change_attach_to_create() { 74 | local metadata_file="$1" 75 | local new_metadata_file="$2" 76 | log " Changing $metadata_file to CREATE" 77 | perl -0777 -npe 's/^ATTACH/CREATE/;' "$metadata_file" > $new_metadata_file 78 | } 79 | 80 | metadata_file_change_to_non_replicated_with_prefix () { 81 | local metadata_file="$1" 82 | local new_metadata_file="$2" 83 | log " Changing $metadata_file to non-replacted with .recovered_non_repl. prefix" 84 | # https://regex101.com/r/pscML2/2 85 | # https://regex101.com/r/X4uwt5/2 86 | # + bash escaping 87 | # TODO: support for default_replica_path ? 
88 | perl -0777 -npe $'s/ENGINE\\s*=\\s*Replicated((?:[A-Z][a-z]+)?MergeTree\\()(\'((?:\\\\\'|.)*?)\'),\\s*(\'((?:\\\\\'|.)*?)\')(?:,\\s*)?/ENGINE = $1/; s/^ATTACH\\s+TABLE\\s+(?:`((?:\\\\`|.)+?)`|(\\S+))/ATTACH TABLE `.recovered_non_repl.$1$2`/;' "$metadata_file" > $new_metadata_file 89 | } 90 | 91 | create_object_from_metadata_file_with_retries() { 92 | local db_fs_name="$1" 93 | local metadata_file="$2" 94 | local new_metadata_file="$(mktemp --tmpdir="${TMP_FOLDER}" change_attach_2_create.XXXXXXX.sql)" 95 | metadata_file_change_attach_to_create "$metadata_file" "$new_metadata_file" 96 | 97 | execute_metadata_file_with_retries "$db_fs_name" "$new_metadata_file" 98 | rm $new_metadata_file 99 | } 100 | 101 | attach_object_as_non_replicated_with_retries() { 102 | local db_fs_name="$1" 103 | local metadata_file="$2" 104 | local new_metadata_file="$(mktemp --tmpdir="${TMP_FOLDER}" change_to_recovered_non_repl.XXXXXXX.sql)" 105 | metadata_file_change_to_non_replicated_with_prefix "$metadata_file" "$new_metadata_file" 106 | 107 | execute_metadata_file_with_retries "$db_fs_name" "$new_metadata_file" 108 | rm $new_metadata_file 109 | } 110 | 111 | # based on https://stackoverflow.com/a/37840948/1555175 112 | # clickhouse perfectly accepts \xFF sequences in the identifiers with backticks, 113 | # so can just directly map path path%20with%20special%20chars into DB object `path\0x20with0x20special0x20chars` 114 | # it's much simpler than dealing with backslash escaping 115 | urldecode() { 116 | : "${*//+/ }" 117 | echo "${_//%/\\x}" 118 | #echo -e "${_//%/\\x}" 119 | } 120 | 121 | # transofms database%201 table%201 => `database\x201`.`table\x201` 122 | get_db_object_name() { 123 | local db_fs_name="$1" 124 | local table_fs_name="$2" 125 | echo "\`$( urldecode "$db_fs_name" )\`.\`$( urldecode ${table_fs_name})\`" 126 | } 127 | 128 | create_database() { 129 | local db_fs_name="$1" 130 | local db_metadata_file="$2" 131 | if [ "$db_fs_name" = 'default' ]; then 132 | log " Database 'default' exists" 133 | else 134 | log " Creating database: $( urldecode "$db_fs_name" )" 135 | create_object_from_metadata_file_with_retries "" "$db_metadata_file" 136 | fi 137 | } 138 | 139 | do_nothing() { 140 | true 141 | } 142 | 143 | copy_table_datadir_by_hardlinks() 144 | { 145 | local db_fs_name="$1" 146 | local table_fs_name="$2" 147 | local new_table_fs_name="${3:-$table_fs_name}" 148 | if [ -d "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}" ]; then 149 | log " Copy data $( get_db_object_name "$db_fs_name" "${table_fs_name}") (by hardlinks):" 150 | copy_folder_by_hardlinks "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}" "${DATA_FOLDER}/${db_fs_name}/${new_table_fs_name}" 151 | else 152 | log " No datadir for $( get_db_object_name "$db_fs_name" "${table_fs_name}") in ${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}" 153 | fi 154 | } 155 | 156 | fill_replicated_table_by_reattaching_partitions() { 157 | local db_fs_name="$1" 158 | local source_table_fs_name="$2" 159 | local dest_table_fs_name="$3" 160 | 161 | local db_ch_name=$( urldecode "$db_fs_name" ) 162 | local source_table_ch_name=$( urldecode "$source_table_fs_name" ) 163 | local dest_table_ch_name=$( urldecode "$dest_table_fs_name" ) 164 | 165 | local source_table_ch_full_name=$( get_db_object_name "$db_fs_name" "$source_table_fs_name" ) 166 | local dest_table_ch_full_name=$( get_db_object_name "$db_fs_name" "$dest_table_fs_name" ) 167 | 168 | log " Stopping merges for the source table $source_table_ch_full_name." 
169 | run_clickhouse_query_with_retries "$db_fs_name" "SYSTEM STOP MERGES $source_table_ch_full_name" 170 | 171 | local i 172 | for i in {1..100}; do 173 | if [ $( run_clickhouse_query "" "select progress from system.merges where database='$db_ch_name' and table='$source_table_ch_name' limit 1") ]; 174 | then 175 | log " There are merges running on $source_table_ch_name, waiting for 3 seconds" 176 | run_clickhouse_query_with_retries "" "SYSTEM STOP MERGES $source_table_ch_full_name" 177 | sleep 3 178 | else 179 | break 180 | fi 181 | done 182 | 183 | while read partitionid ; do 184 | log " * Processing partition: $partitionid." 185 | run_clickhouse_query_with_retries "" "ALTER TABLE $dest_table_ch_full_name REPLACE PARTITION ID '$partitionid' FROM $source_table_ch_full_name"; 186 | done < <( run_clickhouse_query "" "select partition_id from system.parts where active and database='$db_ch_name' and table='$source_table_ch_name' GROUP BY partition_id ORDER BY partition_id FORMAT TSV" ) 187 | 188 | source_rows=$(run_clickhouse_query "" "select count() from $source_table_ch_full_name" ) 189 | target_rows=$(run_clickhouse_query "" "select count() from $dest_table_ch_full_name" ) 190 | 191 | log " The number of rows in ${source_table_ch_full_name}: ${source_rows}" 192 | log " The number of rows in ${dest_table_ch_full_name}: ${target_rows}" 193 | 194 | if [ "$source_rows" != "$target_rows" ]; then 195 | log "The number of rows in ${dest_table_ch_full_name} is different from the number of rows in ${dest_table_ch_full_name}" 196 | log "The migration is interrupted" 197 | exit 1 198 | fi 199 | } 200 | 201 | 202 | attach_local_tables_and_skip_kafka() 203 | { 204 | local db_fs_name="$1" 205 | local table_fs_name="$2" 206 | local table_metadata_full_filename="$3" 207 | 208 | if grep -qiE "Engine\\s*=\\s*Replicated\\w*MergeTree\\(" "$table_metadata_full_filename"; then 209 | log " ... Replicated, attaching as .recovered_non_repl.${table_fs_name}" 210 | copy_table_datadir_by_hardlinks "$db_fs_name" "$table_fs_name" "%2Erecovered_non_repl%2E${table_fs_name}" 211 | attach_object_as_non_replicated_with_retries "$db_fs_name" "$table_metadata_full_filename" 212 | 213 | elif grep -qiE "Engine\\s*=\\s*Kafka" "$table_metadata_full_filename"; then 214 | # TODO: skip also Rabbit 215 | log " ... Kafka, skipping for now" 216 | # we don't want to start inserts immediately 217 | else 218 | log " ... non Replicated, attaching as is." 219 | copy_table_datadir_by_hardlinks "$db_fs_name" "$table_fs_name" 220 | execute_metadata_file_with_retries "$db_fs_name" "$table_metadata_full_filename" 221 | # they can rely on each other but normally clickhouse allows to do ATTACH even 222 | # with non-satisfied dependancies 223 | fi 224 | } 225 | 226 | create_replicated_tables_and_reattach_parts() { 227 | local db_fs_name="$1" 228 | local table_fs_name="$2" 229 | local table_metadata_full_filename="$3" 230 | 231 | if grep -qiE "Engine\\s*=\\s*Replicated\\w*MergeTree\(" "$table_metadata_full_filename"; then 232 | 233 | # that will fail if table don't exists 234 | 235 | res=$( run_clickhouse_query "" "SHOW CREATE TABLE \`$(urldecode "$db_fs_name")\`.\`.recovered_non_repl.$(urldecode "$table_fs_name")\`" ) 236 | 237 | if [ -z "$res" ]; then 238 | log " Can not find recovered_non_repl for ${table_fs_name}. Did you run recover_non_replicated before?" 
239 | exit 1; 240 | fi 241 | 242 | create_object_from_metadata_file_with_retries "${db_fs_name}" "$table_metadata_full_filename" 243 | 244 | if [ "$MASTER_REPLICA" = 'true' ]; then 245 | log " Script is running on master replica, reattaching parts" 246 | fill_replicated_table_by_reattaching_partitions "$db_fs_name" ".recovered_non_repl.${table_fs_name}" "$table_fs_name" 247 | else 248 | log " Non-master replica, will sync the data from the another one" 249 | fi 250 | 251 | # ensure the data were flushed before removing 252 | sync 253 | 254 | log " ... Dropping .recovered_non_repl.${table_fs_name}." 255 | run_clickhouse_query_with_retries "" "DROP TABLE IF EXISTS \`$(urldecode "$db_fs_name")\`.\`.recovered_non_repl.$(urldecode "$table_fs_name")\`;" 256 | else 257 | log " ... non Replicated, skipping." 258 | fi 259 | } 260 | 261 | create_kafka_tables() 262 | { 263 | local db_fs_name="$1" 264 | local table_fs_name="$2" 265 | local table_metadata_full_filename="$3" 266 | 267 | if grep -qiE "Engine\\s*=\\s*Kafka" "$table_metadata_full_filename"; then 268 | log " Recreating the Kafka table" 269 | create_object_from_metadata_file_with_retries "${db_fs_name}" "$table_metadata_full_filename" 270 | else 271 | log " ... non Kafka, skipping." 272 | fi 273 | } 274 | 275 | 276 | ## TODO support for Atomic (/store folder & symlinks) 277 | ## TODO support for several disks 278 | 279 | iterate_databases_and_tables_in_metadata() { 280 | local on_new_database="$1" 281 | local on_new_table="$2" 282 | 283 | local db_metadata_full_filename 284 | 285 | shopt -s nullglob # avoid returning * on empty dir 286 | 287 | for db_metadata_full_filename in "${BACKUP_METADATA_FOLDER}"/*.sql; do 288 | local db_metadata_filename="${db_metadata_full_filename##*/}" 289 | 290 | # the name of db in filesystem (folders etc) 291 | local db_fs_name="${db_metadata_filename%.sql}" 292 | 293 | # the real name is urldecoded db_fs_name 294 | log "> Database $( urldecode "$db_fs_name" ) found in $db_metadata_full_filename" 295 | 296 | if [ "$db_fs_name" = 'system' ]; then 297 | log " ... skipping system database." 298 | continue 299 | fi 300 | 301 | $on_new_database "$db_fs_name" "$db_metadata_full_filename" 302 | 303 | log " Iterating tables metadata in ${BACKUP_METADATA_FOLDER}/${db_fs_name}" 304 | 305 | local table_metadata_full_filename 306 | for table_metadata_full_filename in "${BACKUP_METADATA_FOLDER}/${db_fs_name}"/*.sql; do 307 | local table_metadata_filename="${table_metadata_full_filename##*/}" 308 | 309 | # the name of filesystem in filesystem (folders etc) 310 | local table_fs_name="${table_metadata_filename%.sql}" 311 | 312 | log ">>> Table $( get_db_object_name "$db_fs_name" "${table_fs_name}") found in ${table_metadata_full_filename}" 313 | $on_new_table "$db_fs_name" "$table_fs_name" "$table_metadata_full_filename" 314 | done 315 | done 316 | } 317 | 318 | ensure_clickhouse_is_stopped() { 319 | log 'checking if clickhouse is active.' 320 | set +e 321 | $CLICKHOUSE_CLIENT --query="SELECT 1" > /dev/null 2>&1 322 | local result=$? 323 | set -e 324 | if [ "$result" == 0 ]; then 325 | log 'ClickHouse is running. We can not reset it while it is active. Shutdown clickhouse first to continue!..' 
326 | exit 1 327 | fi 328 | log 'It seems clickhouse is not running' 329 | } 330 | 331 | 332 | 333 | 334 | 335 | create_local_backup() { 336 | # we create/recover this 'backup' using hardlinks 337 | # warning: it's safe only when clickhouse is stopped 338 | # warning: file & its hardlink copy will have the same attributes (don't chown / chmod it!). 339 | # warning: data is not always immutable in clickhouse files (engine=Log, so after recovery backup can be affected by the running queries). 340 | 341 | 342 | if [ -d $BACKUP_FOLDER ]; then 343 | log "backup exists as $BACKUP_FOLDER . Can not continue" 344 | exit 1 345 | fi 346 | 347 | ensure_clickhouse_is_stopped 348 | 349 | log 'Creating backup folder' 350 | log_and_run_command mkdir -p "$BACKUP_FOLDER" 351 | 352 | log "Copy (by hardlinks) data & metadata folders" 353 | 354 | # TODO: we can do a real copy instead of hardlink copy for certain engines, and for metadata files. 355 | copy_folder_by_hardlinks "$METADATA_FOLDER" "$BACKUP_METADATA_FOLDER" 356 | copy_folder_by_hardlinks "$DATA_FOLDER" "$BACKUP_DATA_FOLDER" 357 | 358 | log 'Backup finished' 359 | log 'Now you can reset clickhouse node (reset_node) and clean up zookeeper (if it is broken)' 360 | } 361 | 362 | reset_node() { 363 | ## that script will move data & metadata aside to be able to start clickhouse 364 | ## second script will do the actual recovery. 365 | 366 | if [ ! -d $BACKUP_FOLDER ]; then 367 | log "backup does not exists at $BACKUP_FOLDER" 368 | exit 1 369 | fi 370 | 371 | ensure_clickhouse_is_stopped 372 | 373 | log "Creating trash bin" 374 | log_and_run_command mkdir -p "$TRASHBIN_FOLDER" 375 | 376 | log "Moving data and metadata to trash bin" 377 | log_and_run_command mv "$METADATA_FOLDER" "$TRASHBIN_FOLDER" 378 | log_and_run_command mv "$DATA_FOLDER" "$TRASHBIN_FOLDER" 379 | 380 | log "Recreating data & metadata folders" 381 | log_and_run_command mkdir -p "$METADATA_FOLDER" "$DATA_FOLDER" 382 | log_and_run_command chown -R clickhouse:clickhouse "$METADATA_FOLDER" "$DATA_FOLDER" 383 | 384 | log "Move back the system database (we don't expect any replicated tables there)" 385 | 386 | ### we don't expect any replicated tables in system database, 387 | ### and we want to put it into the correct place in advance 388 | ### otherwise clickhouse will recreate them automatically when it will be started 389 | 390 | if [ -d $BACKUP_METADATA_FOLDER/system ]; then 391 | copy_folder_by_hardlinks "$BACKUP_METADATA_FOLDER/system" "$METADATA_FOLDER" 392 | fi 393 | 394 | if [ -d $BACKUP_DATA_FOLDER/system ]; then 395 | copy_folder_by_hardlinks "$BACKUP_DATA_FOLDER/system" "$DATA_FOLDER" 396 | fi 397 | 398 | log 'Node reset finished. Now you can start it (it will be empty).' 399 | } 400 | 401 | show_status() { 402 | set +e 403 | 404 | ### Check if we are active 405 | log 'Check status:' 406 | $CLICKHOUSE_CLIENT --query="SELECT 'ClickHouse ' || version() || ' at ' || hostName() || ' is up and running. 
Start time: ' || toString( now() - uptime() )" --format=TSVRaw 407 | 408 | log 'Macros:' 409 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.macros" --format=PrettyCompactMonoBlock 410 | 411 | log 'Clusters:' 412 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.clusters WHERE cluster not like 'test\\\\_%' " --format=PrettyCompactMonoBlock 413 | 414 | log 'Zookeeper:' 415 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.zookeeper WHERE path='/'" --format=PrettyCompactMonoBlock 416 | $CLICKHOUSE_EXTRACT_FROM_CONFIG --key=zookeeper 417 | } 418 | 419 | recover_schema_reattach_non_replicated_tables() { 420 | log "===================================================================================" 421 | log "===================================================================================" 422 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}:" 423 | log " Create databases, recover simple tables by attach, Replicated as non-replicated, skip Kafka" 424 | iterate_databases_and_tables_in_metadata "create_database" "attach_local_tables_and_skip_kafka" 425 | sync 426 | } 427 | 428 | 429 | refill_replicated_tables() { 430 | log "===================================================================================" 431 | log "===================================================================================" 432 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}, recreate Replicated table and reattach parts" 433 | iterate_databases_and_tables_in_metadata "do_nothing" "create_replicated_tables_and_reattach_parts" 434 | sync 435 | } 436 | 437 | recreate_kafka_tables() { 438 | log "===================================================================================" 439 | log "===================================================================================" 440 | log " Enabling merges " 441 | run_clickhouse_query_with_retries "" "SYSTEM START MERGES" 442 | 443 | log "===================================================================================" 444 | log "===================================================================================" 445 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}, recreate Kafka tables" 446 | iterate_databases_and_tables_in_metadata "do_nothing" "create_kafka_tables" 447 | sync 448 | } 449 | 450 | 451 | ########## 452 | ## It is not used currently. 453 | ## It's safer to rely on ClickHouse to understand which folders need to be attached - because beside the tmp parts 454 | ## it can also contain same data in merged and unmerged form (and when you ATTACH part by part it will end up witj duplicates) 455 | ## In contrast when we attach whole folder as plain (non Replicated) MergeTree ClickHouse can understand that situations. 
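## (Illustrative example, hypothetical part names: if the backup folder contains both a merged part 202011_1_5_1
## and the source parts it covers, 202011_1_1_0 ... 202011_5_5_0, attaching them one by one would load the same
## rows twice, while attaching the whole folder as a plain MergeTree lets ClickHouse keep only the covering part.)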
456 | # reattach_parts() 457 | # { 458 | # local db_fs_name="$1" 459 | # local table_fs_name="$2" 460 | 461 | # log " Copy parts of the table $( get_db_object_name "$db_fs_name" "${table_fs_name}") (by hardlinks) from ${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name} to ${DATA_FOLDER}/${db_fs_name}/${table_fs_name}/detached" 462 | 463 | # IGNORE_PARTS="^(detached|broken.*|unexpected.*|ignored.*|noquorum.*|tmp_mut.*)$" 464 | # shopt -s nullglob # avoid returning * on empty dir 465 | 466 | # local part_path 467 | # for part_path in "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}"/*/; do 468 | # local part_name="${part_path%"${part_path##*[!/]}"}" # extglob-free multi-trailing-/ trim 469 | # part_name="${part_name##*/}" # remove everything before the last / 470 | # if [[ $part_name =~ $IGNORE_PARTS ]]; 471 | # then 472 | # log " - $part_name ignored ($part_path)" 473 | # continue 474 | # fi 475 | # log " * ${part_name} at $part_path" 476 | # copy_folder_by_hardlinks "$part_path" "${DATA_FOLDER}/${db_fs_name}/${table_fs_name}/detached" 477 | # run_clickhouse_query_with_retries "$db_fs_name" "ALTER TABLE $( get_db_object_name "$db_fs_name" "${table_fs_name}") ATTACH PART '${part_name}'" 478 | # done 479 | # } 480 | 481 | # # Not used: metadata filename is the url-encoded table name 482 | # extract_object_name_from_metadata_content() { 483 | # local db_fs_name="$1" 484 | # local metadata_file="$2" 485 | # # https://regex101.com/r/jea9p9/1/ 486 | # perl -0777 -npe $'s/^(?:ATTACH|CREATE)\\s+(?:OR\\s+REPLACE\\s+)?(?:IF\\s+NOT\\s+EXISTS\\s+)?(TEMPORARY\\s+)?(?:MATERIALIZED\\s+VIEW|VIEW|DICTIONARY|TABLE|DATABASE|LIVE\\s+VIEW)\\s+(?:`((?:\\\\`|.)+?)`|(\\S+)).*$/$2$3/' "$metadata_file" 487 | # } 488 | -------------------------------------------------------------------------------- /zookeeper_recovery.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Data::Dumper; # TODO: not a part of core modules on older perl 6 | use POSIX(); 7 | use Carp; 8 | $| = 1; # disable output buffering 9 | 10 | 11 | ##### params: ############# 12 | 13 | my $CLICKHOUSE_CLIENT = 'clickhouse-client'; 14 | 15 | # leave $CLUSTER_NAME to run on a single node (also check RECOVER_SCHEMA_ONLY) 16 | # or run with cluster name and it should do everything correct on the whole cluster. 17 | # the safe & handy way is to create a subcluster for every shard and run that tool shard by shard 18 | my $CLUSTER_NAME = ''; 19 | 20 | # if set the data will not be recovered (can make sense with empty CLUSTER NAME will be synced from the other replica). 21 | my $RECOVER_SCHEMA_ONLY = 0; 22 | 23 | # just output the commands which should be executed. 24 | my $DRY_RUN = 0; 25 | 26 | ########################### 27 | 28 | 29 | sub printlog { 30 | my $log_line = shift; 31 | print (POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime time), " ", sprintf($log_line, @_), "\n"); 32 | } 33 | 34 | sub escape_shell_arg { 35 | my ($arg) = @_; 36 | $arg =~ s/'/'\\''/g; 37 | $arg =~ s/^''//; $arg =~ s/''$//; 38 | return "'$arg'"; 39 | } 40 | 41 | sub escape_non_ascii_for_sql { 42 | my ($arg) = @_; 43 | $arg =~ s/([^A-Za-z0-9_])/sprintf("\\x%02X", ord($1))/seg; 44 | return $arg; 45 | } 46 | 47 | sub escape_sql_arg { 48 | my ($arg) = @_; 49 | return q{'} . escape_non_ascii_for_sql($arg) . 
q{'}; 50 | } 51 | 52 | # clickhouse perfectly accepts \xFF sequences in the identifiers with backticks, 53 | sub full_table_name { 54 | my ($database, $table) = @_; 55 | return join '.', map { q{`} . escape_non_ascii_for_sql($_) . q{`} } ($database, $table); 56 | } 57 | 58 | # TabSeparated: The following escape sequences are used for output: \b, \f, \r, \n, \t, \0, \', \\. 59 | my %mapping = ( 60 | "\\b" => "\b", "\\f" => "\f", "\\r" => "\r", "\\n" => "\n", "\\t" => "\t", "\\0" => "\0", 61 | "\\'" => "\'", "\\\\" => "\\", "\\" => "\\" 62 | ); 63 | 64 | # return array of array 65 | # tuples / maps / arrays - are not parsed 66 | sub parse_tsv 67 | { 68 | my ($tsv) = @_; 69 | my $res = [ map { [ map { s/(\\[bfrnt0'\\]|\\)/$mapping{$1}/seg; $_; } split "\t", $_, -1 ] } split "\n", $tsv, -1 ]; 70 | if ( scalar(@{pop @$res}) != 0 ) 71 | { 72 | confess("Newline at the end of TSV is missing!"); 73 | } 74 | return $res; 75 | } 76 | 77 | # return array of hashes 78 | sub parse_tsv_with_names 79 | { 80 | my ($tsv) = @_; 81 | my $raa = parse_tsv($tsv); 82 | my $column_names = shift @$raa; # get header row 83 | my $res = []; 84 | foreach my $row (@$raa) 85 | { 86 | my %h; 87 | @h{@$column_names} = @$row; 88 | push @$res, \%h; 89 | } 90 | return $res; 91 | } 92 | 93 | sub run_clickhouse_query 94 | { 95 | my $query = shift; 96 | my $extra_settings = shift || {}; 97 | 98 | my @args = ("${CLICKHOUSE_CLIENT}"); 99 | 100 | push @args, "--query=" . escape_shell_arg($query); 101 | 102 | while (my ($key, $value) = each (%$extra_settings)) { 103 | push @args, "--".$key."=" . escape_shell_arg($value); 104 | } 105 | 106 | my $cmd = join(' ', @args); 107 | my $output = `$cmd`; 108 | my $status = $?; 109 | return { 110 | status => $status, 111 | output => $output, 112 | cmd => $cmd, 113 | }; 114 | } 115 | 116 | sub run_ddl_command 117 | { 118 | my $query = shift; 119 | my $extra_settings = shift || {}; 120 | 121 | my $retries = 1; 122 | 123 | while ($retries <= 5) 124 | { 125 | printlog('Executing%s: %s', $retries > 1 ? "(attempt #$retries)" : '' , $query); 126 | 127 | if ($DRY_RUN) 128 | { 129 | printlog('Success! (DRY RUN)'); 130 | return 1; 131 | } 132 | 133 | my $res = run_clickhouse_query($query, $extra_settings); 134 | if ($res->{status} == 0) 135 | { 136 | printlog('Success!'); 137 | return 1; 138 | } 139 | 140 | printlog("Command failed: %s\n%s", $res->{cmd}, $res->{output}); 141 | 142 | sleep($retries); 143 | $retries += 1; 144 | } 145 | 146 | confess('Too many failed attempts!'); 147 | } 148 | 149 | # we print all planned commands, so in case something will break in the middle user can finish them manually. 
150 | sub run_ddl_command_sequence 151 | { 152 | my $ddl_commands = shift; 153 | printlog("Trying to execute the following commands: \n %s;\n", join(";\n", @$ddl_commands)); 154 | run_ddl_command($_) foreach (@$ddl_commands); 155 | } 156 | 157 | sub get_clickhouse_query_result 158 | { 159 | my $query = shift; 160 | my $extra_settings = shift || {}; 161 | 162 | my $res = run_clickhouse_query($query, $extra_settings); 163 | # print Dumper $res; 164 | if ($res->{status} != 0) 165 | { 166 | confess("Command failed: ", $res->{cmd}, "\n", $res->{output}); 167 | } 168 | my $output = $res->{output}; 169 | chomp $output; 170 | return $output; 171 | } 172 | 173 | sub run_clickhouse_query2 174 | { 175 | my $query = shift; 176 | my $extra_settings = shift || {}; 177 | 178 | my $res = run_clickhouse_query($query, {%$extra_settings, format=>'TSVWithNames'}); 179 | 180 | if ($res->{status} != 0) 181 | { 182 | confess("Can not connect: ", $res->{output}); 183 | } 184 | # print Dumper $res; 185 | 186 | return parse_tsv_with_names($res->{output}); 187 | 188 | } 189 | 190 | sub prompt_yn { 191 | my ($query) = @_; 192 | print "$query (Y/N) "; 193 | chomp(my $answer = ); 194 | return lc($answer) eq 'y'; 195 | } 196 | 197 | sub maybecluster { 198 | my $table_name = shift; 199 | return $CLUSTER_NAME ? 'clusterAllReplicas(' . $CLUSTER_NAME . ',' . $table_name . ')' : $table_name; 200 | } 201 | 202 | sub ddl_maybe_oncluster { 203 | return $CLUSTER_NAME ? 'ON CLUSTER ' . escape_sql_arg($CLUSTER_NAME) : ''; 204 | } 205 | 206 | sub maybe_add_on_cluster_to_create_statement { 207 | my ($create_statement) = @_; 208 | my $on_cluster = ddl_maybe_oncluster(); 209 | 210 | if ($on_cluster) 211 | { 212 | $create_statement =~ s/ 213 | ^ # from begining 214 | ( # start capture group #1 215 | (?:CREATE|ATTACH)\s+TABLE\s+ # CREATE OR ATTACH 216 | (?: 217 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+) 218 | \. 219 | )? # optional name of the database (maybe quoted with backticks or doublequotes) followed by dot 220 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes) 221 | (?:UUID\s+'[0-9a-fA-F-]+'\s+)? # optional uuid 222 | ) 223 | /$1 $on_cluster /isx; 224 | } 225 | 226 | return $create_statement; 227 | } 228 | 229 | sub rename_table_in_create_statement 230 | { 231 | my ($create_table,$new_name) = @_; 232 | print "0 $create_table\n"; 233 | $create_table =~ s/ 234 | ^ # from begining 235 | ( # start capture group #1 236 | CREATE 237 | \s+TABLE\s+ 238 | ) 239 | (?: 240 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+) 241 | \. 242 | )? # optional name of the database (maybe quoted with backticks or doublequotes) followed by dot 243 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes) 244 | /$1$new_name /sxi; 245 | print "1 $create_table\n"; 246 | return $create_table; 247 | 248 | } 249 | 250 | 251 | sub attach_as_non_replicated 252 | { 253 | my ($original_create_table) = @_; 254 | print "2 $original_create_table\n"; 255 | my $modified_attach_table = maybe_add_on_cluster_to_create_statement($original_create_table); 256 | print "3 $modified_attach_table\n"; 257 | $modified_attach_table =~ s/ 258 | ^ # from begining 259 | CREATE 260 | ( # start capture group #1 261 | \s+TABLE\s+ 262 | (?: 263 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+) 264 | \. 265 | )? 
# optional name of the database (maybe quoted with backticks or doublequotes) followed by dot 266 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes) 267 | ) # end capture group #1 268 | (?:UUID\s+'[0-9a-fA-F-]+'\s+)? # optional uuid 269 | (.*) # capture group #2 270 | ( \)\s+ENGINE\s*=\s* ) # capture group #3 271 | Replicated 272 | ([a-zA-Z]*MergeTree\() # capture group #4 273 | (?:\s*'(?:\\.|[^'])+'\s*,\s*'(?:\\.|[^'])+') # params of Replicated 274 | 275 | ([^\)]*\)) # capture group #5 - all other params + closing bracket. 276 | /ATTACH$1$2$3$4$5/sxi; 277 | print "4 $modified_attach_table\n"; 278 | return $modified_attach_table; 279 | } 280 | 281 | 282 | 283 | sub print_general_info { 284 | my $res = run_clickhouse_query("SELECT 1"); 285 | 286 | # check the conn is ok 287 | if ($res->{status} != 0 or $res->{output} != "1\n") 288 | { 289 | confess("Can not connect: ", $res->{output}); 290 | } 291 | 292 | printlog( "Clickhouse:\n%s\n", 293 | get_clickhouse_query_result( 294 | "SELECT 295 | hostName(), 296 | 'ClickHouse ' || version() as v, 297 | uptime(), 298 | toString( now() - uptime() ) as start_time 299 | FROM ".maybecluster('system.one')." 300 | ORDER BY hostName()", 301 | {format => 'PrettyCompactMonoBlock'} 302 | ) 303 | ); 304 | 305 | printlog("Defined macros:\n%s\n", 306 | get_clickhouse_query_result(" 307 | SELECT 308 | hostName(), 309 | * 310 | FROM " . maybecluster('system.macros') . " 311 | ORDER BY hostName(), macro", 312 | {format => 'PrettyCompactMonoBlock'} 313 | ) 314 | ); 315 | 316 | printlog("Defined clusters:\n%s\n", 317 | get_clickhouse_query_result(" 318 | SELECT 319 | hostName(), 320 | * 321 | FROM " . maybecluster('system.clusters') . " 322 | WHERE cluster not like 'test\\\\_%' 323 | ORDER BY hostName(), cluster, shard_num, replica_num", 324 | {format => 'PrettyCompactMonoBlock'} 325 | ) 326 | ); 327 | 328 | printlog("Zookeeper:\n%s\n%s\n", 329 | get_clickhouse_query_result(" 330 | SELECT 331 | hostName(), 332 | * 333 | FROM " . maybecluster('system.zookeeper') . " 334 | WHERE path = '/' 335 | ORDER BY hostName(), name", 336 | {format => 'PrettyCompactMonoBlock'} 337 | ), 338 | get_clickhouse_query_result(" 339 | SELECT 340 | hostName(), 341 | * 342 | FROM " . maybecluster('system.zookeeper') . " 343 | WHERE path = '/clickhouse' 344 | ORDER BY hostName(), name", 345 | {format => 'PrettyCompactMonoBlock'} 346 | ) 347 | ); 348 | } 349 | 350 | my $uuid_supported_cached_result = undef; 351 | 352 | sub is_uuid_supported 353 | { 354 | if (!defined($uuid_supported_cached_result)) 355 | { 356 | $uuid_supported_cached_result = get_clickhouse_query_result(" 357 | SELECT 358 | count() > 0 359 | FROM " . maybecluster('system.settings') . " 360 | WHERE name='show_table_uuid_in_table_create_query_if_not_nil'"); 361 | 362 | printlog( 'show_table_uuid_in_table_create_query_if_not_nil supported: %d', $uuid_supported_cached_result ); 363 | 364 | } 365 | return $uuid_supported_cached_result; 366 | } 367 | 368 | sub find_tables_with_zookeeper_data_missing 369 | { 370 | printlog( 'Detecting tables with zookeeper missing...' 
); 371 | return run_clickhouse_query2(" 372 | WITH 373 | is_readonly and not is_session_expired and zookeeper_exception like '%No node%' as zookeeper_data_missing 374 | SELECT 375 | database, 376 | table, 377 | uniqExact(zookeeper_path) as nr_of_zookeeper_paths, 378 | arrayStringConcat( groupArray((hostName() || ': ' || zookeeper_exception)), '\n') as zookeeper_exeptions, 379 | arrayStringConcat( groupArrayIf(hostName(),zookeeper_data_missing), ',') as hosts_with_zookeeper_data_missing, 380 | arrayStringConcat( groupArrayIf(hostName(),not zookeeper_data_missing), ',') as hosts_with_zookeeper_data 381 | FROM " . maybecluster('system.replicas') . " 382 | GROUP BY 383 | database, 384 | table 385 | HAVING countIf(zookeeper_data_missing) > 0 386 | ORDER BY 387 | database, 388 | table 389 | "); 390 | } 391 | 392 | sub get_table_info 393 | { 394 | my ($database_name, $table_name) = @_; 395 | my $uuid_supported = is_uuid_supported(); 396 | 397 | return run_clickhouse_query2( 398 | sprintf( 399 | 'SELECT 400 | hostName(), 401 | * 402 | FROM %s 403 | WHERE database=%s AND name=%s 404 | ORDER BY hostName()', 405 | maybecluster('system.tables'), 406 | escape_sql_arg($database_name), 407 | escape_sql_arg($table_name) 408 | ), 409 | { $uuid_supported ? (show_table_uuid_in_table_create_query_if_not_nil => 1) : () } 410 | ); 411 | } 412 | 413 | # table will be renamed to temporary name, recreated in place, and all partitions reattached back 414 | sub recover_table_zookeeper_data 415 | { 416 | my ($table_name, $database_name, $temporary_db_name) = @_; 417 | 418 | my $full_table_name = full_table_name($database_name, $table_name); 419 | 420 | my $target_table_name = $RECOVER_SCHEMA_ONLY ? "${database_name}.${table_name}_origdata" : "${database_name}.${table_name}"; 421 | my $full_tmp_table_name = full_table_name($temporary_db_name, $target_table_name); 422 | 423 | printlog( 'Processing %s, using %s as temporary table', $full_table_name, $full_tmp_table_name); 424 | 425 | my $original_table_rows_count = get_clickhouse_query_result(sprintf('SELECT count() FROM %s',$full_table_name)); 426 | 427 | my $table_info = get_table_info($database_name, $table_name); 428 | 429 | if (scalar(@$table_info) == 0) { 430 | confess('Empty result of system.tables query'); 431 | } 432 | 433 | my $target_table_info = get_table_info($temporary_db_name,$target_table_name); 434 | 435 | if (scalar(@$target_table_info) > 0) 436 | { 437 | print Dumper $target_table_info; 438 | confess("Temporary table $full_tmp_table_name already exists! 
Do cleanup manually to continue."); 439 | } 440 | 441 | # small consistency check - ensure the schema is the same for different nodes 442 | my $original_create_table = $table_info->[0]{create_table_query}; 443 | 444 | if ( scalar(@$table_info) > 1 ) 445 | { 446 | for my $v (@$table_info) 447 | { 448 | if ( $v->{create_table_query} ne $original_create_table) { 449 | printlog( '%s statement : %s', $v->{'hostName()'}, $v->{create_table_query}); 450 | printlog( '%s statement : %s', $table_info->[0]{'hostName()'}, $table_info->[0]{create_table_query}); 451 | confess('Table schema is inconsistant across the cluster nodes!'); 452 | } 453 | } 454 | } 455 | 456 | my $parts_info = run_clickhouse_query2( 457 | sprintf( 458 | 'SELECT 459 | partition_id, 460 | uniqExact(name) as parts_count 461 | FROM %s 462 | WHERE 463 | active 464 | AND database=%s AND table=%s 465 | GROUP BY partition_id 466 | ORDER BY partition_id', 467 | maybecluster('system.parts'), 468 | escape_sql_arg($database_name), 469 | escape_sql_arg($table_name) 470 | ) 471 | ); 472 | 473 | if (scalar(@$parts_info) == 0) 474 | { 475 | printlog('Empty result of system.parts query: table is empty, will just recreate it.'); 476 | 477 | run_ddl_command_sequence( 478 | [ 479 | sprintf('DROP TABLE IF EXISTS %s %s NO DELAY', $full_table_name, ddl_maybe_oncluster()), 480 | maybe_add_on_cluster_to_create_statement($original_create_table), 481 | ] 482 | ); 483 | 484 | return; 485 | } 486 | 487 | my $max_part_per_partition = 0; 488 | my $overall_number_of_parts = 0; 489 | 490 | for my $p (@$parts_info) 491 | { 492 | if ($p->{parts_count} > $max_part_per_partition) 493 | { 494 | $max_part_per_partition = $p->{parts_count}; 495 | $overall_number_of_parts += $p->{parts_count}; 496 | } 497 | } 498 | 499 | # TODO: do we care of replicated_deduplication_window here? 500 | printlog("max_part_per_partition: %d, overall_number_of_parts: %d", $max_part_per_partition, $overall_number_of_parts); 501 | 502 | 503 | my @command_sequence = (); 504 | 505 | # inside Atomic database that doesn't work: 506 | # DB::Exception: Mapping for table with UUID=ccbe67e0-eb08-4897-80f1-404c3b488810 already exists. It happened due to UUID collision, most likely because some not random UUIDs were manually specified in CREATE queries. (version 21.7.1.7029 (official build)) 507 | 508 | # so we do that reattach only in after moving the table to ordinary to 'drop' atomic nature of it. 
509 | 510 | # push @command_sequence, sprintf('DETACH TABLE IF EXISTS %s %s NO DELAY', $full_table_name, ddl_maybe_oncluster()); 511 | # push @command_sequence, attach_as_non_replicated($original_create_table); 512 | 513 | # SYSTEM STOP MERGES don't work cluster-wide 514 | # the safest way to use that is to create a subcluster for every shard and do it shard by shard 515 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_table_name); 516 | 517 | # direct rename or r/o table was not working before 20.5, see https://github.com/ClickHouse/ClickHouse/pull/11652/ 518 | push @command_sequence, sprintf('RENAME TABLE %s TO %s %s', $full_table_name, $full_tmp_table_name, ddl_maybe_oncluster()); 519 | 520 | push @command_sequence, sprintf('DETACH TABLE IF EXISTS %s %s NO DELAY', $full_tmp_table_name, ddl_maybe_oncluster()); 521 | push @command_sequence, attach_as_non_replicated(rename_table_in_create_statement($original_create_table, $full_tmp_table_name)); 522 | 523 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_tmp_table_name); 524 | 525 | push @command_sequence, maybe_add_on_cluster_to_create_statement($original_create_table); 526 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_table_name); 527 | 528 | if (!$RECOVER_SCHEMA_ONLY) 529 | { 530 | for my $p (@$parts_info) 531 | { 532 | push @command_sequence, sprintf('ALTER TABLE %s %s REPLACE PARTITION ID %s FROM %s', 533 | $full_table_name, 534 | ddl_maybe_oncluster(), 535 | escape_sql_arg($p->{partition_id}), 536 | $full_tmp_table_name); 537 | } 538 | } 539 | 540 | run_ddl_command_sequence(\@command_sequence); 541 | 542 | my $new_table_row_count = get_clickhouse_query_result(sprintf('SELECT count() FROM %s',$full_table_name)); 543 | 544 | printlog('original_table_rows_count: %d, new_table_row_count: %d', $original_table_rows_count, $new_table_row_count); 545 | 546 | run_ddl_command(sprintf('SYSTEM START MERGES %s', $full_table_name)); 547 | } 548 | 549 | 550 | printlog('Started %s [pid:%d]:', $0, $$); 551 | 552 | print_general_info(); 553 | 554 | my $readonly_tables = find_tables_with_zookeeper_data_missing(); 555 | 556 | printlog( '%d tables with zookeeper_data_missing found.', scalar(@$readonly_tables)); 557 | 558 | if (scalar(@$readonly_tables) == 0) { 559 | printlog( 'Nothing to to!' ); 560 | exit; 561 | } 562 | 563 | printlog( 'WARNING: Please stop the insertion to all the tables, detach all the Kafka / RabbitMQ / Buffer / Distributed tables!'); 564 | prompt_yn('Continue?') || exit(1); 565 | 566 | my $temporary_db_name = '_tmp_zk_rcvry'; 567 | run_ddl_command(sprintf('CREATE DATABASE IF NOT EXISTS %s %s engine=Ordinary', $temporary_db_name, ddl_maybe_oncluster())); 568 | 569 | foreach my $table (@$readonly_tables) { 570 | next if $table->{'database'} eq $temporary_db_name; 571 | 572 | recover_table_zookeeper_data($table->{'table'}, $table->{'database'}, $temporary_db_name); 573 | } 574 | 575 | printlog('Done! Cross check everything and remove %s database', $temporary_db_name); 576 | --------------------------------------------------------------------------------
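
A minimal usage sketch for `zookeeper_recovery.pl` (assumptions: `clickhouse-client` can connect without extra options, and `$CLICKHOUSE_CLIENT`, `$CLUSTER_NAME`, `$RECOVER_SCHEMA_ONLY` and `$DRY_RUN` have been adjusted at the top of the script first; the log file names are just examples):

```
# dry run first: edit the script so that $DRY_RUN = 1, then
perl zookeeper_recovery.pl | tee zookeeper_recovery_dry_run.log

# real run: set $DRY_RUN = 0 and repeat
perl zookeeper_recovery.pl | tee zookeeper_recovery_$(date +%Y%m%d_%H%M%S).log
```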