├── toolset.sh
├── test_schema.txt
├── common_settings.sh
├── readme.md
├── common
│   └── functions.sh
└── zookeeper_recovery.pl
/toolset.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 | source "${SCRIPT_DIR}/common_settings.sh"
5 | source "${SCRIPT_DIR}/common/functions.sh"
6 |
7 | operation=$1
8 |
9 | log "Script started at $(hostname), master replica status: ${MASTER_REPLICA}"
10 |
11 | ## TODO: add arg processing (to adjust vars in common_settings w/o editing files)
12 |
13 | if [ "$operation" = "create_local_backup" ]; then
14 | create_local_backup
15 | elif [ "$operation" = "reset_node" ]; then
16 | reset_node
17 | elif [ "$operation" = "show_status" ]; then
18 | show_status
19 | elif [ "$operation" = "recover_non_replicated" ]; then
20 | recover_schema_reattach_non_replicated_tables
21 | elif [ "$operation" = "refill_replicated_tables" ]; then
22 | refill_replicated_tables
23 | elif [ "$operation" = "recreate_kafka_tables" ]; then
24 | recreate_kafka_tables
25 | else
26 | log "You need to pass operation as a script argument!"
27 | log "Possible operations:"
28 | log " * create_local_backup"
29 | log " * reset_node"
30 | log " * show_status"
31 | log " * recover_non_replicated"
32 | log " * refill_replicated_tables"
33 | log " * recreate_kafka_tables"
34 | log "See readme & source code for details."
35 | exit 1
36 | fi
37 |
38 | log "Finished!"
39 |
--------------------------------------------------------------------------------
/test_schema.txt:
--------------------------------------------------------------------------------
1 | systemctl stop clickhouse-server
2 | rm -rf /var/lib/clickhouse/*
3 | systemctl start clickhouse-server
4 |
5 | clickhouse-client -mn
6 |
7 |
8 | SET max_block_size = 1, min_insert_block_size_bytes = 1, min_insert_block_size_rows = 1;
9 |
10 | create table X engine=MergeTree order by tuple() as select * from numbers(1000);
11 | create table Y engine=MergeTree order by tuple() as select * from numbers(1000);
12 |
13 | create table X1 engine=ReplicatedMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000);
14 | create table Y2 engine=ReplicatedSummingMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000);
15 |
16 | create table Z1 engine=Log as select * from numbers(1000);
17 | create table Z2 engine=TinyLog as select * from numbers(1000);
18 | create materialized view AAA to Z2 AS SELECT * FROM Z1;
19 | create view BBBB AS SELECT * FROM Z1;
20 |
21 | CREATE TABLE kafka (number UInt64) ENGINE = Kafka() SETTINGS kafka_broker_list = 'localhost:123', kafka_topic_list = 'topic1', kafka_group_name = 'group_name', kafka_format = 'TSV';
22 |
23 | create database xxx;
24 | create table xxx.Z engine=MergeTree order by tuple() as select * from numbers(1000);
25 | create table xxx.Z3 engine=ReplicatedMergeTree('/clickhouse/tables/{database}/{shard}/{table}', '{replica}') order by tuple() as select * from numbers(1000);
26 |
27 |
28 | systemctl stop clickhouse-server
29 | rm -rf /var/lib/clickhouse/data
30 | rm -rf /var/lib/clickhouse/metadata
31 | systemctl start clickhouse-server
32 |
--------------------------------------------------------------------------------
/common_settings.sh:
--------------------------------------------------------------------------------
1 | # some settings
2 | set -e # stop on error
3 | #set -x # print the commands we execute
4 |
5 | ### ADJUST THOSE:
6 |
7 | CLICKHOUSE_WORKING_FOLDER=/var/lib/clickhouse
8 |
9 | # should be same disk as CLICKHOUSE_WORKING_FOLDER! (otherwise we can't use hardlinks)
10 | CLICKHOUSE_TOOLSET_FOLDER=/var/lib/clickhouse/clickhouse-toolset
11 |
12 | BACKUP_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/backup2020-11-10"
13 |
14 | # if you need some adjustments - like username/password/port/listen host or some other parameter - make them here.
15 | CLICKHOUSE_CLIENT='clickhouse-client --host=127.0.0.1 --max_query_size=10000000'
16 |
17 | CLICKHOUSE_EXTRACT_FROM_CONFIG='clickhouse-extract-from-config --config-file /etc/clickhouse-server/config.xml'
18 |
19 | # for replicated tables we should use data only
20 | # from a single replica (the others will replicate it)
21 | # otherwise we will get duplicated data
22 |
23 | # if last character of the hostname is 1 we are on the master replica.
24 | HOSTNAME_SHORT=$(hostname -s)
25 | MASTER_REPLICA=$( [ "${HOSTNAME_SHORT: -1}" == "1" ] && echo 'true' || echo 'false' )
26 |
27 | ### TODO: expose settings above via command-line args
28 |
29 | ### those normally should not be changed
30 |
31 | METADATA_FOLDER="${CLICKHOUSE_WORKING_FOLDER}/metadata"
32 | DATA_FOLDER="${CLICKHOUSE_WORKING_FOLDER}/data"
33 |
34 | BACKUP_METADATA_FOLDER="${BACKUP_FOLDER}/metadata"
35 | BACKUP_DATA_FOLDER="${BACKUP_FOLDER}/data"
36 |
37 | # we do mv instead of rm -rf (just in case), that folder is used as trashbin
38 | TRASHBIN_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/trashbin_$(date +%Y%m%d_%H%M%S)"
39 |
40 | # we will put some tmp files there
41 | TMP_FOLDER="${CLICKHOUSE_TOOLSET_FOLDER}/tmp"
42 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Project is OBSOLETE / unsupported / unmaintained.
2 |
3 | On all modern versions of ClickHouse (starting from 21.7) you should use the built-in functionality instead (see [SYSTEM RESTORE REPLICA](https://clickhouse.com/docs/en/sql-reference/statements/system/#restore-replica)).
4 |
5 | The original content of the README is below.
6 |
7 |
8 |
9 |
10 | # ClickHouse zookeeper recovery tool
11 |
12 | [ClickHouse](https://clickhouse.tech/) uses [ZooKeeper](https://zookeeper.apache.org/) for replication and for coordinating distributed operations on a cluster. While no data is stored in zookeeper (only metadata, such as the list of parts and their checksums), that metadata in zookeeper is required for ClickHouse to work.
13 |
14 | So if for some reason you lost your zookeeper data, or it is damaged / out of sync, your ClickHouse servers will not start (or will start in read-only mode).
15 | To return to a healthy state, you need to recover the zookeeper meta information from the existing state of the ClickHouse tables.
16 |
17 | These scripts can help you automate that process, even for whole clusters / a large number of tables.
18 |
19 | In simple cases you can do it manually (attach the Replicated table as non-Replicated, create a new Replicated table, move all partitions from the old table to the new one) - see the sketch below.
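As a rough illustration of that manual approach (a hedged sketch only - the table name `db.events`, its zookeeper path and the statements are placeholders; take the real column list / PARTITION BY / ORDER BY from `SHOW CREATE TABLE`):

```
# 1) stop the server and edit the table's metadata file, replacing the Replicated engine
#    with its plain counterpart (ReplicatedMergeTree(...) -> MergeTree):
sudo systemctl stop clickhouse-server
sudo nano /var/lib/clickhouse/metadata/db/events.sql
sudo systemctl start clickhouse-server

# 2) recreate the Replicated table under a new name, reusing the original CREATE statement
#    (same columns / PARTITION BY / ORDER BY, only the name and the engine differ):
clickhouse-client --query="CREATE TABLE db.events_new (...) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events', '{replica}') ORDER BY ..."

# 3) move the data partition by partition (take the partition ids from system.parts), then swap the names:
clickhouse-client --query="ALTER TABLE db.events_new REPLACE PARTITION ID 'all' FROM db.events"
clickhouse-client --query="RENAME TABLE db.events TO db.events_old, db.events_new TO db.events"
```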
20 |
21 | ## Before you start
22 |
23 | 1. Analyze what happened.
24 |
25 | Usually, if you lose the zookeeper data, it means you configured something wrong or did some inaccurate manual operations which led to that situation.
26 |
27 | 2. Review your setup, and try not to lose your zookeeper data again (otherwise you will need to repeat that recovery process really soon)
28 | * use [recommended settings](https://clickhouse.tech/docs/en/operations/tips/#zookeeper) for Zookeeper
29 | * use a 3-node zookeeper ensemble
30 | * set up good monitoring for your zookeeper.
31 |
32 | 3. Ensure the data can't be recovered in a better way.
33 |
34 | ## How to use it
35 |
36 | You can follow the sequence below cluster-wide using some automation scripts (like ansible) or just with cluster-ssh.
37 |
38 | All steps (except step 5) may be executed on different replicas at different times. So you can recover them one after another, or simultaneously.
39 |
40 | 1) adjust paths/parameters in `common_settings.sh`. The parameters are not (yet) configurable via command-line.
41 |
42 | 2) We will do direct interventions in the clickhouse working folder, so clickhouse should be offline.
43 |
44 | ```
45 | sudo systemctl stop clickhouse-server
46 | ```
47 |
48 | 3) Create a backup of the data (using hard links).
49 |
50 | ```
51 | sudo ./toolset.sh create_local_backup
52 | ```
53 |
54 | 4) If you have some dirty state in zookeeper - clean it up. Do a backup (if needed) and run `deleteall /clickhouse` in `zkCli`.
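For example (assuming the default `/clickhouse` root path and the zkCli shipped with ZooKeeper 3.5+; on older versions the command is `rmr` instead of `deleteall`):

```
zkCli.sh -server localhost:2181
[zk: localhost:2181(CONNECTED) 0] deleteall /clickhouse
```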
55 |
56 | 5) Run:
57 |
58 | ```
59 | sudo ./toolset.sh reset_node
60 | ```
61 |
62 | That will move the data & metadata of all known tables away. So generally, that will reset the state
63 | of your server - all tables & databases will disappear (they are safe inside the backup).
64 |
65 | 6) Start clickhouse back:
66 |
67 | ```
68 | sudo systemctl start clickhouse-server
69 | ```
70 |
71 | At that point, it should be clean - only system tables will be in place. The rest is saved inside the backup.
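A quick sanity check at this point could be (just an illustration, not part of the toolset):

```
clickhouse-client --query="SHOW DATABASES"
clickhouse-client --query="SHOW TABLES FROM default"
```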
72 |
73 | 7) Check the settings related to replication and verify that they are correct:
74 |
75 | ```
76 | sudo ./toolset.sh show_status
77 | ```
78 |
79 | 8) Run:
80 |
81 | ```
82 | sudo ./toolset.sh recover_non_replicated | tee recover_non_replicated_$(date +%Y%m%d_%H%M%S).log
83 | ```
84 |
85 | That will recover the schema and data from the backup created on p. 3. Replicated tables will be recovered w/o replication under another name (with the `.recovered_non_repl.` prefix). Merges will be stopped, and we skip Kafka tables to avoid starting consumption.
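To see what was recovered, you can, for example, check the sizes of the recovered non-replicated tables (an illustrative query against the standard `system.parts` table):

```
clickhouse-client --query="
  SELECT database, table, sum(rows) AS rows, count() AS parts
  FROM system.parts
  WHERE active AND startsWith(table, '.recovered_non_repl.')
  GROUP BY database, table
  ORDER BY database, table" --format=PrettyCompactMonoBlock
```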
86 |
87 | 9) At that point, you can review the state of your data on different replicas.
88 |
89 | If needed, you can adjust/decide which of them will be used as the source for recovery.
90 |
91 | **WARNING:** Only a single replica should have `MASTER_REPLICA=true` (otherwise, you will get data duplicates); that replica will be used as the source to resync all data.
92 |
93 | Adjust parameters in `common_settings.sh` if needed.
94 |
95 | 10) Run
96 | ```
97 | sudo ./toolset.sh refill_replicated_tables | tee refill_replicated_tables_$(date +%Y%m%d_%H%M%S).log
98 | ```
99 | That will create the Replicated tables back again.
100 | * If `MASTER_REPLICA=true` it will additionally copy partitions from the `.recovered_non_repl.` table.
101 | * The replicas which have `MASTER_REPLICA=false` will just create the table(s) and will sync the data from the other ('MASTER') replica.
102 | * You can monitor the progress in `system.replication_queue` and/or `system.replicas` (see the example queries after this list).
103 | * That may use a lot of network bandwidth.
104 | * On replicas which have `MASTER_REPLICA=false` you can also see doubled disk usage (we refetch data from the 'MASTER' replica while keeping our own copy in the backup folder created in p.3)
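For example (illustrative queries against the standard `system.replication_queue` and `system.replicas` tables):

```
clickhouse-client --query="
  SELECT database, table, count() AS queued_entries
  FROM system.replication_queue
  GROUP BY database, table
  ORDER BY queued_entries DESC" --format=PrettyCompactMonoBlock

clickhouse-client --query="
  SELECT database, table, is_readonly, queue_size, absolute_delay
  FROM system.replicas
  ORDER BY database, table" --format=PrettyCompactMonoBlock
```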
105 |
106 | 11) Now, all tables/replicas should be back online. Now we can enable merges (they were disabled on p.8) and start Kafka consumption:
107 | ```
108 | sudo ./toolset.sh recreate_kafka_tables | tee recreate_kafka_tables_$(date +%Y%m%d_%H%M%S).log
109 | ```
110 |
111 |
112 | In case of any failures during the recovery:
113 | 1) fix the problem
114 | 2) stop clickhouse: `sudo systemctl stop clickhouse-server`
115 | 3) restart the recovery sequence from p.4.
116 |
117 | The tool does not clean the backup and trashbin folders. You can clean them manually after a successful recovery.
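For example, with the default paths from `common_settings.sh` that could look like this (double-check first that the recovery really succeeded and nothing from the backup is still needed):

```
sudo rm -rf /var/lib/clickhouse/clickhouse-toolset/backup2020-11-10
sudo rm -rf /var/lib/clickhouse/clickhouse-toolset/trashbin_*
```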
118 |
119 | ## Notes
120 |
121 | Provided 'as is', use it at your own risk.
122 | * All actions are transparent, and the log is quite verbose.
123 | * We don't take any responsibility for potential data damage caused by inaccurate user actions related to that toolset.
124 | * We used these scripts to recover the zookeeper data for a cluster with 10 nodes (5 shards / 2 replicas) and several hundred (about 700) tables.
125 | * During all procedures, we keep the backup (using hard-links).
126 | * In simpler cases (single table), recovery can be done manually.
127 |
128 | Limitations:
129 | * It is currently not possible to recover zookeeper without downtime.
130 | * Because of hard links, all actions executed on the source file will also affect the hard-link copy and vice versa. In most cases, files in clickhouse are immutable, but for the engine=Log family (which is typically not used widely) it can be a problem. If you start modifying an `engine=Log` table just after recovery, the backup copy (which is not a real copy, but a hardlink) will be affected by those changes.
131 | * Checked on recent versions of Linux only (Ubuntu 20, CentOS 7).
132 | * It doesn't support database=Atomic (yet?)
133 | * It doesn't support multidisk setups (yet?) / s3 disks.
134 |
135 | In newer ClickHouse versions a special command automating that process (and avoiding a full resync) may be added.
136 |
--------------------------------------------------------------------------------
/common/functions.sh:
--------------------------------------------------------------------------------
1 | perl -v > /dev/null || { echo 'no perl installed!'; exit 1; }
2 |
3 | mkdir -p "$TMP_FOLDER" || { echo "can not create tmp folder: $TMP_FOLDER (forgot sudo?)"; exit 1; }
4 |
5 | log() {
6 | echo "$(date "+%Y-%m-%d %H:%M:%S.%N") $1"
7 | }
8 |
9 | log_and_run_command() {
10 | log " Executing: '$*'"
11 | "$@"
12 | }
13 |
14 | copy_folder_by_hardlinks() {
15 | local source="$1"
16 | local target="$2"
17 | log_and_run_command cp -rla "$source" "$target"
18 | }
19 |
20 | execute_with_retries() {
21 | local i
22 | for i in {1..20}; do
23 | set +e
24 | "$@";
25 | local result=$?
26 | set -e
27 | if [ "$result" == 0 ]; then
28 | return 0;
29 | else
30 | log " ! Error on try #${i}, will retry in 3 sec"
31 | sleep 3;
32 | fi
33 | done
34 | log " ! Too many attempts!"
35 | return 1;
36 |
37 | }
38 |
39 | clickhouse_client_call() {
40 | local db_fs_name="$1"
41 | local add=''
42 | if [ -n "$db_fs_name" ]; then
43 | add=" --database=\$'$( urldecode "$db_fs_name" )'"
44 | fi
45 | eval ${CLICKHOUSE_CLIENT}${add}
46 | }
47 |
48 | run_clickhouse_query() {
49 | local db_fs_name="$1"
50 | local query="$2"
51 | echo "$query" | clickhouse_client_call "$db_fs_name"
52 | }
53 |
54 | run_clickhouse_query_with_retries() {
55 | execute_with_retries run_clickhouse_query "$@"
56 | }
57 |
58 | execute_metadata_file() {
59 | local db_fs_name="$1"
60 | local metadata_file="$2"
61 | log " Executing $metadata_file: $(head -n 1 $metadata_file)"
62 | cat "$metadata_file" | clickhouse_client_call "$db_fs_name"
63 | }
64 |
65 | execute_metadata_file_with_retries() {
66 | execute_with_retries execute_metadata_file "$@"
67 | # we don't want to merge anything for now,
68 | # STOP MERGES only stops merges for existing tables
69 | # so we repeat it after every table creation
70 | run_clickhouse_query_with_retries "" "SYSTEM STOP MERGES"
71 | }
72 |
73 | metadata_file_change_attach_to_create() {
74 | local metadata_file="$1"
75 | local new_metadata_file="$2"
76 | log " Changing $metadata_file to CREATE"
77 | perl -0777 -npe 's/^ATTACH/CREATE/;' "$metadata_file" > $new_metadata_file
78 | }
79 |
80 | metadata_file_change_to_non_replicated_with_prefix () {
81 | local metadata_file="$1"
82 | local new_metadata_file="$2"
83 | log " Changing $metadata_file to non-replicated with .recovered_non_repl. prefix"
84 | # https://regex101.com/r/pscML2/2
85 | # https://regex101.com/r/X4uwt5/2
86 | # + bash escaping
87 | # TODO: support for default_replica_path ?
88 | perl -0777 -npe $'s/ENGINE\\s*=\\s*Replicated((?:[A-Z][a-z]+)?MergeTree\\()(\'((?:\\\\\'|.)*?)\'),\\s*(\'((?:\\\\\'|.)*?)\')(?:,\\s*)?/ENGINE = $1/; s/^ATTACH\\s+TABLE\\s+(?:`((?:\\\\`|.)+?)`|(\\S+))/ATTACH TABLE `.recovered_non_repl.$1$2`/;' "$metadata_file" > $new_metadata_file
89 | }
90 |
91 | create_object_from_metadata_file_with_retries() {
92 | local db_fs_name="$1"
93 | local metadata_file="$2"
94 | local new_metadata_file="$(mktemp --tmpdir="${TMP_FOLDER}" change_attach_2_create.XXXXXXX.sql)"
95 | metadata_file_change_attach_to_create "$metadata_file" "$new_metadata_file"
96 |
97 | execute_metadata_file_with_retries "$db_fs_name" "$new_metadata_file"
98 | rm $new_metadata_file
99 | }
100 |
101 | attach_object_as_non_replicated_with_retries() {
102 | local db_fs_name="$1"
103 | local metadata_file="$2"
104 | local new_metadata_file="$(mktemp --tmpdir="${TMP_FOLDER}" change_to_recovered_non_repl.XXXXXXX.sql)"
105 | metadata_file_change_to_non_replicated_with_prefix "$metadata_file" "$new_metadata_file"
106 |
107 | execute_metadata_file_with_retries "$db_fs_name" "$new_metadata_file"
108 | rm $new_metadata_file
109 | }
110 |
111 | # based on https://stackoverflow.com/a/37840948/1555175
112 | # clickhouse perfectly accepts \xFF sequences in the identifiers with backticks,
113 | # so we can just directly map path%20with%20special%20chars into the DB object `path\x20with\x20special\x20chars`
114 | # it's much simpler than dealing with backslash escaping
115 | urldecode() {
116 | : "${*//+/ }"
117 | echo "${_//%/\\x}"
118 | #echo -e "${_//%/\\x}"
119 | }
120 |
121 | # transforms database%201 table%201 => `database\x201`.`table\x201`
122 | get_db_object_name() {
123 | local db_fs_name="$1"
124 | local table_fs_name="$2"
125 | echo "\`$( urldecode "$db_fs_name" )\`.\`$( urldecode "${table_fs_name}" )\`"
126 | }
127 |
128 | create_database() {
129 | local db_fs_name="$1"
130 | local db_metadata_file="$2"
131 | if [ "$db_fs_name" = 'default' ]; then
132 | log " Database 'default' exists"
133 | else
134 | log " Creating database: $( urldecode "$db_fs_name" )"
135 | create_object_from_metadata_file_with_retries "" "$db_metadata_file"
136 | fi
137 | }
138 |
139 | do_nothing() {
140 | true
141 | }
142 |
143 | copy_table_datadir_by_hardlinks()
144 | {
145 | local db_fs_name="$1"
146 | local table_fs_name="$2"
147 | local new_table_fs_name="${3:-$table_fs_name}"
148 | if [ -d "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}" ]; then
149 | log " Copy data $( get_db_object_name "$db_fs_name" "${table_fs_name}") (by hardlinks):"
150 | copy_folder_by_hardlinks "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}" "${DATA_FOLDER}/${db_fs_name}/${new_table_fs_name}"
151 | else
152 | log " No datadir for $( get_db_object_name "$db_fs_name" "${table_fs_name}") in ${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}"
153 | fi
154 | }
155 |
156 | fill_replicated_table_by_reattaching_partitions() {
157 | local db_fs_name="$1"
158 | local source_table_fs_name="$2"
159 | local dest_table_fs_name="$3"
160 |
161 | local db_ch_name=$( urldecode "$db_fs_name" )
162 | local source_table_ch_name=$( urldecode "$source_table_fs_name" )
163 | local dest_table_ch_name=$( urldecode "$dest_table_fs_name" )
164 |
165 | local source_table_ch_full_name=$( get_db_object_name "$db_fs_name" "$source_table_fs_name" )
166 | local dest_table_ch_full_name=$( get_db_object_name "$db_fs_name" "$dest_table_fs_name" )
167 |
168 | log " Stopping merges for the source table $source_table_ch_full_name."
169 | run_clickhouse_query_with_retries "$db_fs_name" "SYSTEM STOP MERGES $source_table_ch_full_name"
170 |
171 | local i
172 | for i in {1..100}; do
173 | if [ -n "$( run_clickhouse_query "" "select progress from system.merges where database='$db_ch_name' and table='$source_table_ch_name' limit 1" )" ];
174 | then
175 | log " There are merges running on $source_table_ch_name, waiting for 3 seconds"
176 | run_clickhouse_query_with_retries "" "SYSTEM STOP MERGES $source_table_ch_full_name"
177 | sleep 3
178 | else
179 | break
180 | fi
181 | done
182 |
183 | while read partitionid ; do
184 | log " * Processing partition: $partitionid."
185 | run_clickhouse_query_with_retries "" "ALTER TABLE $dest_table_ch_full_name REPLACE PARTITION ID '$partitionid' FROM $source_table_ch_full_name";
186 | done < <( run_clickhouse_query "" "select partition_id from system.parts where active and database='$db_ch_name' and table='$source_table_ch_name' GROUP BY partition_id ORDER BY partition_id FORMAT TSV" )
187 |
188 | source_rows=$(run_clickhouse_query "" "select count() from $source_table_ch_full_name" )
189 | target_rows=$(run_clickhouse_query "" "select count() from $dest_table_ch_full_name" )
190 |
191 | log " The number of rows in ${source_table_ch_full_name}: ${source_rows}"
192 | log " The number of rows in ${dest_table_ch_full_name}: ${target_rows}"
193 |
194 | if [ "$source_rows" != "$target_rows" ]; then
195 | log "The number of rows in ${dest_table_ch_full_name} is different from the number of rows in ${source_table_ch_full_name}"
196 | log "The migration is interrupted"
197 | exit 1
198 | fi
199 | }
200 |
201 |
202 | attach_local_tables_and_skip_kafka()
203 | {
204 | local db_fs_name="$1"
205 | local table_fs_name="$2"
206 | local table_metadata_full_filename="$3"
207 |
208 | if grep -qiE "Engine\\s*=\\s*Replicated\\w*MergeTree\\(" "$table_metadata_full_filename"; then
209 | log " ... Replicated, attaching as .recovered_non_repl.${table_fs_name}"
210 | copy_table_datadir_by_hardlinks "$db_fs_name" "$table_fs_name" "%2Erecovered_non_repl%2E${table_fs_name}"
211 | attach_object_as_non_replicated_with_retries "$db_fs_name" "$table_metadata_full_filename"
212 |
213 | elif grep -qiE "Engine\\s*=\\s*Kafka" "$table_metadata_full_filename"; then
214 | # TODO: skip also Rabbit
215 | log " ... Kafka, skipping for now"
216 | # we don't want to start inserts immediately
217 | else
218 | log " ... non Replicated, attaching as is."
219 | copy_table_datadir_by_hardlinks "$db_fs_name" "$table_fs_name"
220 | execute_metadata_file_with_retries "$db_fs_name" "$table_metadata_full_filename"
221 | # they can rely on each other, but normally clickhouse allows ATTACH even
222 | # with non-satisfied dependencies
223 | fi
224 | }
225 |
226 | create_replicated_tables_and_reattach_parts() {
227 | local db_fs_name="$1"
228 | local table_fs_name="$2"
229 | local table_metadata_full_filename="$3"
230 |
231 | if grep -qiE "Engine\\s*=\\s*Replicated\\w*MergeTree\(" "$table_metadata_full_filename"; then
232 |
233 | # that will fail if the table doesn't exist
234 |
235 | res=$( run_clickhouse_query "" "SHOW CREATE TABLE \`$(urldecode "$db_fs_name")\`.\`.recovered_non_repl.$(urldecode "$table_fs_name")\`" )
236 |
237 | if [ -z "$res" ]; then
238 | log " Can not find recovered_non_repl for ${table_fs_name}. Did you run recover_non_replicated before?"
239 | exit 1;
240 | fi
241 |
242 | create_object_from_metadata_file_with_retries "${db_fs_name}" "$table_metadata_full_filename"
243 |
244 | if [ "$MASTER_REPLICA" = 'true' ]; then
245 | log " Script is running on master replica, reattaching parts"
246 | fill_replicated_table_by_reattaching_partitions "$db_fs_name" ".recovered_non_repl.${table_fs_name}" "$table_fs_name"
247 | else
248 | log " Non-master replica, will sync the data from another one"
249 | fi
250 |
251 | # ensure the data were flushed before removing
252 | sync
253 |
254 | log " ... Dropping .recovered_non_repl.${table_fs_name}."
255 | run_clickhouse_query_with_retries "" "DROP TABLE IF EXISTS \`$(urldecode "$db_fs_name")\`.\`.recovered_non_repl.$(urldecode "$table_fs_name")\`;"
256 | else
257 | log " ... non Replicated, skipping."
258 | fi
259 | }
260 |
261 | create_kafka_tables()
262 | {
263 | local db_fs_name="$1"
264 | local table_fs_name="$2"
265 | local table_metadata_full_filename="$3"
266 |
267 | if grep -qiE "Engine\\s*=\\s*Kafka" "$table_metadata_full_filename"; then
268 | log " Recreating the Kafka table"
269 | create_object_from_metadata_file_with_retries "${db_fs_name}" "$table_metadata_full_filename"
270 | else
271 | log " ... non Kafka, skipping."
272 | fi
273 | }
274 |
275 |
276 | ## TODO support for Atomic (/store folder & symlinks)
277 | ## TODO support for several disks
278 |
279 | iterate_databases_and_tables_in_metadata() {
280 | local on_new_database="$1"
281 | local on_new_table="$2"
282 |
283 | local db_metadata_full_filename
284 |
285 | shopt -s nullglob # avoid returning * on empty dir
286 |
287 | for db_metadata_full_filename in "${BACKUP_METADATA_FOLDER}"/*.sql; do
288 | local db_metadata_filename="${db_metadata_full_filename##*/}"
289 |
290 | # the name of db in filesystem (folders etc)
291 | local db_fs_name="${db_metadata_filename%.sql}"
292 |
293 | # the real name is urldecoded db_fs_name
294 | log "> Database $( urldecode "$db_fs_name" ) found in $db_metadata_full_filename"
295 |
296 | if [ "$db_fs_name" = 'system' ]; then
297 | log " ... skipping system database."
298 | continue
299 | fi
300 |
301 | $on_new_database "$db_fs_name" "$db_metadata_full_filename"
302 |
303 | log " Iterating tables metadata in ${BACKUP_METADATA_FOLDER}/${db_fs_name}"
304 |
305 | local table_metadata_full_filename
306 | for table_metadata_full_filename in "${BACKUP_METADATA_FOLDER}/${db_fs_name}"/*.sql; do
307 | local table_metadata_filename="${table_metadata_full_filename##*/}"
308 |
309 | # the name of the table in filesystem (folders etc)
310 | local table_fs_name="${table_metadata_filename%.sql}"
311 |
312 | log ">>> Table $( get_db_object_name "$db_fs_name" "${table_fs_name}") found in ${table_metadata_full_filename}"
313 | $on_new_table "$db_fs_name" "$table_fs_name" "$table_metadata_full_filename"
314 | done
315 | done
316 | }
317 |
318 | ensure_clickhouse_is_stopped() {
319 | log 'checking if clickhouse is active.'
320 | set +e
321 | $CLICKHOUSE_CLIENT --query="SELECT 1" > /dev/null 2>&1
322 | local result=$?
323 | set -e
324 | if [ "$result" == 0 ]; then
325 | log 'ClickHouse is running. We can not reset it while it is active. Shut down clickhouse first to continue!'
326 | exit 1
327 | fi
328 | log 'It seems clickhouse is not running'
329 | }
330 |
331 |
332 |
333 |
334 |
335 | create_local_backup() {
336 | # we create/recover this 'backup' using hardlinks
337 | # warning: it's safe only when clickhouse is stopped
338 | # warning: file & its hardlink copy will have the same attributes (don't chown / chmod it!).
339 | # warning: data is not always immutable in clickhouse files (engine=Log family), so after recovery the backup can be affected by the running queries.
340 |
341 |
342 | if [ -d "$BACKUP_FOLDER" ]; then
343 | log "backup already exists at $BACKUP_FOLDER . Can not continue"
344 | exit 1
345 | fi
346 |
347 | ensure_clickhouse_is_stopped
348 |
349 | log 'Creating backup folder'
350 | log_and_run_command mkdir -p "$BACKUP_FOLDER"
351 |
352 | log "Copy (by hardlinks) data & metadata folders"
353 |
354 | # TODO: we can do a real copy instead of hardlink copy for certain engines, and for metadata files.
355 | copy_folder_by_hardlinks "$METADATA_FOLDER" "$BACKUP_METADATA_FOLDER"
356 | copy_folder_by_hardlinks "$DATA_FOLDER" "$BACKUP_DATA_FOLDER"
357 |
358 | log 'Backup finished'
359 | log 'Now you can reset clickhouse node (reset_node) and clean up zookeeper (if it is broken)'
360 | }
361 |
362 | reset_node() {
363 | ## that script will move data & metadata aside to be able to start clickhouse
364 | ## second script will do the actual recovery.
365 |
366 | if [ ! -d "$BACKUP_FOLDER" ]; then
367 | log "backup does not exist at $BACKUP_FOLDER"
368 | exit 1
369 | fi
370 |
371 | ensure_clickhouse_is_stopped
372 |
373 | log "Creating trash bin"
374 | log_and_run_command mkdir -p "$TRASHBIN_FOLDER"
375 |
376 | log "Moving data and metadata to trash bin"
377 | log_and_run_command mv "$METADATA_FOLDER" "$TRASHBIN_FOLDER"
378 | log_and_run_command mv "$DATA_FOLDER" "$TRASHBIN_FOLDER"
379 |
380 | log "Recreating data & metadata folders"
381 | log_and_run_command mkdir -p "$METADATA_FOLDER" "$DATA_FOLDER"
382 | log_and_run_command chown -R clickhouse:clickhouse "$METADATA_FOLDER" "$DATA_FOLDER"
383 |
384 | log "Move back the system database (we don't expect any replicated tables there)"
385 |
386 | ### we don't expect any replicated tables in the system database,
387 | ### and we want to put it into the correct place in advance,
388 | ### otherwise clickhouse will recreate them automatically when it is started
389 |
390 | if [ -d $BACKUP_METADATA_FOLDER/system ]; then
391 | copy_folder_by_hardlinks "$BACKUP_METADATA_FOLDER/system" "$METADATA_FOLDER"
392 | fi
393 |
394 | if [ -d $BACKUP_DATA_FOLDER/system ]; then
395 | copy_folder_by_hardlinks "$BACKUP_DATA_FOLDER/system" "$DATA_FOLDER"
396 | fi
397 |
398 | log 'Node reset finished. Now you can start it (it will be empty).'
399 | }
400 |
401 | show_status() {
402 | set +e
403 |
404 | ### Check if we are active
405 | log 'Check status:'
406 | $CLICKHOUSE_CLIENT --query="SELECT 'ClickHouse ' || version() || ' at ' || hostName() || ' is up and running. Start time: ' || toString( now() - uptime() )" --format=TSVRaw
407 |
408 | log 'Macros:'
409 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.macros" --format=PrettyCompactMonoBlock
410 |
411 | log 'Clusters:'
412 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.clusters WHERE cluster not like 'test\\\\_%' " --format=PrettyCompactMonoBlock
413 |
414 | log 'Zookeeper:'
415 | $CLICKHOUSE_CLIENT --query="SELECT * FROM system.zookeeper WHERE path='/'" --format=PrettyCompactMonoBlock
416 | $CLICKHOUSE_EXTRACT_FROM_CONFIG --key=zookeeper
417 | }
418 |
419 | recover_schema_reattach_non_replicated_tables() {
420 | log "==================================================================================="
421 | log "==================================================================================="
422 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}:"
423 | log " Create databases, recover simple tables by attach, Replicated as non-replicated, skip Kafka"
424 | iterate_databases_and_tables_in_metadata "create_database" "attach_local_tables_and_skip_kafka"
425 | sync
426 | }
427 |
428 |
429 | refill_replicated_tables() {
430 | log "==================================================================================="
431 | log "==================================================================================="
432 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}, recreate Replicated table and reattach parts"
433 | iterate_databases_and_tables_in_metadata "do_nothing" "create_replicated_tables_and_reattach_parts"
434 | sync
435 | }
436 |
437 | recreate_kafka_tables() {
438 | log "==================================================================================="
439 | log "==================================================================================="
440 | log " Enabling merges "
441 | run_clickhouse_query_with_retries "" "SYSTEM START MERGES"
442 |
443 | log "==================================================================================="
444 | log "==================================================================================="
445 | log " Iterating databases metadata in ${BACKUP_METADATA_FOLDER}, recreate Kafka tables"
446 | iterate_databases_and_tables_in_metadata "do_nothing" "create_kafka_tables"
447 | sync
448 | }
449 |
450 |
451 | ##########
452 | ## It is not used currently.
453 | ## It's safer to rely on ClickHouse to understand which folders need to be attached - because besides the tmp parts
454 | ## it can also contain the same data in merged and unmerged form (and when you ATTACH part by part it will end up with duplicates).
455 | ## In contrast, when we attach the whole folder as a plain (non Replicated) MergeTree, ClickHouse can handle that situation.
456 | # reattach_parts()
457 | # {
458 | # local db_fs_name="$1"
459 | # local table_fs_name="$2"
460 |
461 | # log " Copy parts of the table $( get_db_object_name "$db_fs_name" "${table_fs_name}") (by hardlinks) from ${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name} to ${DATA_FOLDER}/${db_fs_name}/${table_fs_name}/detached"
462 |
463 | # IGNORE_PARTS="^(detached|broken.*|unexpected.*|ignored.*|noquorum.*|tmp_mut.*)$"
464 | # shopt -s nullglob # avoid returning * on empty dir
465 |
466 | # local part_path
467 | # for part_path in "${BACKUP_DATA_FOLDER}/${db_fs_name}/${table_fs_name}"/*/; do
468 | # local part_name="${part_path%"${part_path##*[!/]}"}" # extglob-free multi-trailing-/ trim
469 | # part_name="${part_name##*/}" # remove everything before the last /
470 | # if [[ $part_name =~ $IGNORE_PARTS ]];
471 | # then
472 | # log " - $part_name ignored ($part_path)"
473 | # continue
474 | # fi
475 | # log " * ${part_name} at $part_path"
476 | # copy_folder_by_hardlinks "$part_path" "${DATA_FOLDER}/${db_fs_name}/${table_fs_name}/detached"
477 | # run_clickhouse_query_with_retries "$db_fs_name" "ALTER TABLE $( get_db_object_name "$db_fs_name" "${table_fs_name}") ATTACH PART '${part_name}'"
478 | # done
479 | # }
480 |
481 | # # Not used: metadata filename is the url-encoded table name
482 | # extract_object_name_from_metadata_content() {
483 | # local db_fs_name="$1"
484 | # local metadata_file="$2"
485 | # # https://regex101.com/r/jea9p9/1/
486 | # perl -0777 -npe $'s/^(?:ATTACH|CREATE)\\s+(?:OR\\s+REPLACE\\s+)?(?:IF\\s+NOT\\s+EXISTS\\s+)?(TEMPORARY\\s+)?(?:MATERIALIZED\\s+VIEW|VIEW|DICTIONARY|TABLE|DATABASE|LIVE\\s+VIEW)\\s+(?:`((?:\\\\`|.)+?)`|(\\S+)).*$/$2$3/' "$metadata_file"
487 | # }
488 |
--------------------------------------------------------------------------------
/zookeeper_recovery.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 | use Data::Dumper; # TODO: not a part of core modules on older perl
6 | use POSIX();
7 | use Carp;
8 | $| = 1; # disable output buffering
9 |
10 |
11 | ##### params: #############
12 |
13 | my $CLICKHOUSE_CLIENT = 'clickhouse-client';
14 |
15 | # leave $CLUSTER_NAME empty to run on a single node (also check RECOVER_SCHEMA_ONLY)
16 | # or run with a cluster name and it should do everything correctly on the whole cluster.
17 | # the safe & handy way is to create a subcluster for every shard and run that tool shard by shard
18 | my $CLUSTER_NAME = '';
19 |
20 | # if set, the data will not be recovered (can make sense with an empty CLUSTER_NAME - the data will be synced from the other replica).
21 | my $RECOVER_SCHEMA_ONLY = 0;
22 |
23 | # just output the commands which should be executed.
24 | my $DRY_RUN = 0;
25 |
26 | ###########################
27 |
28 |
29 | sub printlog {
30 | my $log_line = shift;
31 | print (POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime time), " ", sprintf($log_line, @_), "\n");
32 | }
33 |
34 | sub escape_shell_arg {
35 | my ($arg) = @_;
36 | $arg =~ s/'/'\\''/g;
37 | $arg =~ s/^''//; $arg =~ s/''$//;
38 | return "'$arg'";
39 | }
40 |
41 | sub escape_non_ascii_for_sql {
42 | my ($arg) = @_;
43 | $arg =~ s/([^A-Za-z0-9_])/sprintf("\\x%02X", ord($1))/seg;
44 | return $arg;
45 | }
46 |
47 | sub escape_sql_arg {
48 | my ($arg) = @_;
49 | return q{'} . escape_non_ascii_for_sql($arg) . q{'};
50 | }
51 |
52 | # clickhouse perfectly accepts \xFF sequences in the identifiers with backticks,
53 | sub full_table_name {
54 | my ($database, $table) = @_;
55 | return join '.', map { q{`} . escape_non_ascii_for_sql($_) . q{`} } ($database, $table);
56 | }
57 |
58 | # TabSeparated: The following escape sequences are used for output: \b, \f, \r, \n, \t, \0, \', \\.
59 | my %mapping = (
60 | "\\b" => "\b", "\\f" => "\f", "\\r" => "\r", "\\n" => "\n", "\\t" => "\t", "\\0" => "\0",
61 | "\\'" => "\'", "\\\\" => "\\", "\\" => "\\"
62 | );
63 |
64 | # return array of array
65 | # tuples / maps / arrays - are not parsed
66 | sub parse_tsv
67 | {
68 | my ($tsv) = @_;
69 | my $res = [ map { [ map { s/(\\[bfrnt0'\\]|\\)/$mapping{$1}/seg; $_; } split "\t", $_, -1 ] } split "\n", $tsv, -1 ];
70 | if ( scalar(@{pop @$res}) != 0 )
71 | {
72 | confess("Newline at the end of TSV is missing!");
73 | }
74 | return $res;
75 | }
76 |
77 | # return array of hashes
78 | sub parse_tsv_with_names
79 | {
80 | my ($tsv) = @_;
81 | my $raa = parse_tsv($tsv);
82 | my $column_names = shift @$raa; # get header row
83 | my $res = [];
84 | foreach my $row (@$raa)
85 | {
86 | my %h;
87 | @h{@$column_names} = @$row;
88 | push @$res, \%h;
89 | }
90 | return $res;
91 | }
92 |
93 | sub run_clickhouse_query
94 | {
95 | my $query = shift;
96 | my $extra_settings = shift || {};
97 |
98 | my @args = ("${CLICKHOUSE_CLIENT}");
99 |
100 | push @args, "--query=" . escape_shell_arg($query);
101 |
102 | while (my ($key, $value) = each (%$extra_settings)) {
103 | push @args, "--".$key."=" . escape_shell_arg($value);
104 | }
105 |
106 | my $cmd = join(' ', @args);
107 | my $output = `$cmd`;
108 | my $status = $?;
109 | return {
110 | status => $status,
111 | output => $output,
112 | cmd => $cmd,
113 | };
114 | }
115 |
116 | sub run_ddl_command
117 | {
118 | my $query = shift;
119 | my $extra_settings = shift || {};
120 |
121 | my $retries = 1;
122 |
123 | while ($retries <= 5)
124 | {
125 | printlog('Executing%s: %s', $retries > 1 ? "(attempt #$retries)" : '' , $query);
126 |
127 | if ($DRY_RUN)
128 | {
129 | printlog('Success! (DRY RUN)');
130 | return 1;
131 | }
132 |
133 | my $res = run_clickhouse_query($query, $extra_settings);
134 | if ($res->{status} == 0)
135 | {
136 | printlog('Success!');
137 | return 1;
138 | }
139 |
140 | printlog("Command failed: %s\n%s", $res->{cmd}, $res->{output});
141 |
142 | sleep($retries);
143 | $retries += 1;
144 | }
145 |
146 | confess('Too many failed attempts!');
147 | }
148 |
149 | # we print all planned commands, so in case something breaks in the middle the user can finish them manually.
150 | sub run_ddl_command_sequence
151 | {
152 | my $ddl_commands = shift;
153 | printlog("Trying to execute the following commands: \n %s;\n", join(";\n", @$ddl_commands));
154 | run_ddl_command($_) foreach (@$ddl_commands);
155 | }
156 |
157 | sub get_clickhouse_query_result
158 | {
159 | my $query = shift;
160 | my $extra_settings = shift || {};
161 |
162 | my $res = run_clickhouse_query($query, $extra_settings);
163 | # print Dumper $res;
164 | if ($res->{status} != 0)
165 | {
166 | confess("Command failed: ", $res->{cmd}, "\n", $res->{output});
167 | }
168 | my $output = $res->{output};
169 | chomp $output;
170 | return $output;
171 | }
172 |
173 | sub run_clickhouse_query2
174 | {
175 | my $query = shift;
176 | my $extra_settings = shift || {};
177 |
178 | my $res = run_clickhouse_query($query, {%$extra_settings, format=>'TSVWithNames'});
179 |
180 | if ($res->{status} != 0)
181 | {
182 | confess("Can not connect: ", $res->{output});
183 | }
184 | # print Dumper $res;
185 |
186 | return parse_tsv_with_names($res->{output});
187 |
188 | }
189 |
190 | sub prompt_yn {
191 | my ($query) = @_;
192 | print "$query (Y/N) ";
193 | chomp(my $answer = <STDIN>);
194 | return lc($answer) eq 'y';
195 | }
196 |
197 | sub maybecluster {
198 | my $table_name = shift;
199 | return $CLUSTER_NAME ? 'clusterAllReplicas(' . $CLUSTER_NAME . ',' . $table_name . ')' : $table_name;
200 | }
201 |
202 | sub ddl_maybe_oncluster {
203 | return $CLUSTER_NAME ? 'ON CLUSTER ' . escape_sql_arg($CLUSTER_NAME) : '';
204 | }
205 |
206 | sub maybe_add_on_cluster_to_create_statement {
207 | my ($create_statement) = @_;
208 | my $on_cluster = ddl_maybe_oncluster();
209 |
210 | if ($on_cluster)
211 | {
212 | $create_statement =~ s/
213 | ^ # from begining
214 | ( # start capture group #1
215 | (?:CREATE|ATTACH)\s+TABLE\s+ # CREATE OR ATTACH
216 | (?:
217 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)
218 | \.
219 | )? # optional name of the database (maybe quoted with backticks or doublequotes) followed by dot
220 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes)
221 | (?:UUID\s+'[0-9a-fA-F-]+'\s+)? # optional uuid
222 | )
223 | /$1 $on_cluster /isx;
224 | }
225 |
226 | return $create_statement;
227 | }
228 |
229 | sub rename_table_in_create_statement
230 | {
231 | my ($create_table,$new_name) = @_;
232 | print "0 $create_table\n";
233 | $create_table =~ s/
234 | ^ # from begining
235 | ( # start capture group #1
236 | CREATE
237 | \s+TABLE\s+
238 | )
239 | (?:
240 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)
241 | \.
242 | )? # optional name of the database (maybe quoted with backticks or doublequotes) followed by dot
243 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes)
244 | /$1$new_name /sxi;
245 | print "1 $create_table\n";
246 | return $create_table;
247 |
248 | }
249 |
250 |
251 | sub attach_as_non_replicated
252 | {
253 | my ($original_create_table) = @_;
254 | print "2 $original_create_table\n";
255 | my $modified_attach_table = maybe_add_on_cluster_to_create_statement($original_create_table);
256 | print "3 $modified_attach_table\n";
257 | $modified_attach_table =~ s/
258 | ^ # from begining
259 | CREATE
260 | ( # start capture group #1
261 | \s+TABLE\s+
262 | (?:
263 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)
264 | \.
265 | )? # optional name of the database (maybe quoted with backticks or doublequotes) followed by dot
266 | (?:`(?:\\.|[^`])+`|"(?:\\.|[^"])+"|[a-zA-Z0-9_]+)\s+ # name of the table (maybe quoted with backticks or doublequotes)
267 | ) # end capture group #1
268 | (?:UUID\s+'[0-9a-fA-F-]+'\s+)? # optional uuid
269 | (.*) # capture group #2
270 | ( \)\s+ENGINE\s*=\s* ) # capture group #3
271 | Replicated
272 | ([a-zA-Z]*MergeTree\() # capture group #4
273 | (?:\s*'(?:\\.|[^'])+'\s*,\s*'(?:\\.|[^'])+') # params of Replicated
274 |
275 | ([^\)]*\)) # capture group #5 - all other params + closing bracket.
276 | /ATTACH$1$2$3$4$5/sxi;
277 | print "4 $modified_attach_table\n";
278 | return $modified_attach_table;
279 | }
280 |
281 |
282 |
283 | sub print_general_info {
284 | my $res = run_clickhouse_query("SELECT 1");
285 |
286 | # check the conn is ok
287 | if ($res->{status} != 0 or $res->{output} ne "1\n")
288 | {
289 | confess("Can not connect: ", $res->{output});
290 | }
291 |
292 | printlog( "Clickhouse:\n%s\n",
293 | get_clickhouse_query_result(
294 | "SELECT
295 | hostName(),
296 | 'ClickHouse ' || version() as v,
297 | uptime(),
298 | toString( now() - uptime() ) as start_time
299 | FROM ".maybecluster('system.one')."
300 | ORDER BY hostName()",
301 | {format => 'PrettyCompactMonoBlock'}
302 | )
303 | );
304 |
305 | printlog("Defined macros:\n%s\n",
306 | get_clickhouse_query_result("
307 | SELECT
308 | hostName(),
309 | *
310 | FROM " . maybecluster('system.macros') . "
311 | ORDER BY hostName(), macro",
312 | {format => 'PrettyCompactMonoBlock'}
313 | )
314 | );
315 |
316 | printlog("Defined clusters:\n%s\n",
317 | get_clickhouse_query_result("
318 | SELECT
319 | hostName(),
320 | *
321 | FROM " . maybecluster('system.clusters') . "
322 | WHERE cluster not like 'test\\\\_%'
323 | ORDER BY hostName(), cluster, shard_num, replica_num",
324 | {format => 'PrettyCompactMonoBlock'}
325 | )
326 | );
327 |
328 | printlog("Zookeeper:\n%s\n%s\n",
329 | get_clickhouse_query_result("
330 | SELECT
331 | hostName(),
332 | *
333 | FROM " . maybecluster('system.zookeeper') . "
334 | WHERE path = '/'
335 | ORDER BY hostName(), name",
336 | {format => 'PrettyCompactMonoBlock'}
337 | ),
338 | get_clickhouse_query_result("
339 | SELECT
340 | hostName(),
341 | *
342 | FROM " . maybecluster('system.zookeeper') . "
343 | WHERE path = '/clickhouse'
344 | ORDER BY hostName(), name",
345 | {format => 'PrettyCompactMonoBlock'}
346 | )
347 | );
348 | }
349 |
350 | my $uuid_supported_cached_result = undef;
351 |
352 | sub is_uuid_supported
353 | {
354 | if (!defined($uuid_supported_cached_result))
355 | {
356 | $uuid_supported_cached_result = get_clickhouse_query_result("
357 | SELECT
358 | count() > 0
359 | FROM " . maybecluster('system.settings') . "
360 | WHERE name='show_table_uuid_in_table_create_query_if_not_nil'");
361 |
362 | printlog( 'show_table_uuid_in_table_create_query_if_not_nil supported: %d', $uuid_supported_cached_result );
363 |
364 | }
365 | return $uuid_supported_cached_result;
366 | }
367 |
368 | sub find_tables_with_zookeeper_data_missing
369 | {
370 | printlog( 'Detecting tables with zookeeper data missing...' );
371 | return run_clickhouse_query2("
372 | WITH
373 | is_readonly and not is_session_expired and zookeeper_exception like '%No node%' as zookeeper_data_missing
374 | SELECT
375 | database,
376 | table,
377 | uniqExact(zookeeper_path) as nr_of_zookeeper_paths,
378 | arrayStringConcat( groupArray((hostName() || ': ' || zookeeper_exception)), '\n') as zookeeper_exceptions,
379 | arrayStringConcat( groupArrayIf(hostName(),zookeeper_data_missing), ',') as hosts_with_zookeeper_data_missing,
380 | arrayStringConcat( groupArrayIf(hostName(),not zookeeper_data_missing), ',') as hosts_with_zookeeper_data
381 | FROM " . maybecluster('system.replicas') . "
382 | GROUP BY
383 | database,
384 | table
385 | HAVING countIf(zookeeper_data_missing) > 0
386 | ORDER BY
387 | database,
388 | table
389 | ");
390 | }
391 |
392 | sub get_table_info
393 | {
394 | my ($database_name, $table_name) = @_;
395 | my $uuid_supported = is_uuid_supported();
396 |
397 | return run_clickhouse_query2(
398 | sprintf(
399 | 'SELECT
400 | hostName(),
401 | *
402 | FROM %s
403 | WHERE database=%s AND name=%s
404 | ORDER BY hostName()',
405 | maybecluster('system.tables'),
406 | escape_sql_arg($database_name),
407 | escape_sql_arg($table_name)
408 | ),
409 | { $uuid_supported ? (show_table_uuid_in_table_create_query_if_not_nil => 1) : () }
410 | );
411 | }
412 |
413 | # table will be renamed to temporary name, recreated in place, and all partitions reattached back
414 | sub recover_table_zookeeper_data
415 | {
416 | my ($table_name, $database_name, $temporary_db_name) = @_;
417 |
418 | my $full_table_name = full_table_name($database_name, $table_name);
419 |
420 | my $target_table_name = $RECOVER_SCHEMA_ONLY ? "${database_name}.${table_name}_origdata" : "${database_name}.${table_name}";
421 | my $full_tmp_table_name = full_table_name($temporary_db_name, $target_table_name);
422 |
423 | printlog( 'Processing %s, using %s as temporary table', $full_table_name, $full_tmp_table_name);
424 |
425 | my $original_table_rows_count = get_clickhouse_query_result(sprintf('SELECT count() FROM %s',$full_table_name));
426 |
427 | my $table_info = get_table_info($database_name, $table_name);
428 |
429 | if (scalar(@$table_info) == 0) {
430 | confess('Empty result of system.tables query');
431 | }
432 |
433 | my $target_table_info = get_table_info($temporary_db_name,$target_table_name);
434 |
435 | if (scalar(@$target_table_info) > 0)
436 | {
437 | print Dumper $target_table_info;
438 | confess("Temporary table $full_tmp_table_name already exists! Do cleanup manually to continue.");
439 | }
440 |
441 | # small consistency check - ensure the schema is the same for different nodes
442 | my $original_create_table = $table_info->[0]{create_table_query};
443 |
444 | if ( scalar(@$table_info) > 1 )
445 | {
446 | for my $v (@$table_info)
447 | {
448 | if ( $v->{create_table_query} ne $original_create_table) {
449 | printlog( '%s statement : %s', $v->{'hostName()'}, $v->{create_table_query});
450 | printlog( '%s statement : %s', $table_info->[0]{'hostName()'}, $table_info->[0]{create_table_query});
451 | confess('Table schema is inconsistent across the cluster nodes!');
452 | }
453 | }
454 | }
455 |
456 | my $parts_info = run_clickhouse_query2(
457 | sprintf(
458 | 'SELECT
459 | partition_id,
460 | uniqExact(name) as parts_count
461 | FROM %s
462 | WHERE
463 | active
464 | AND database=%s AND table=%s
465 | GROUP BY partition_id
466 | ORDER BY partition_id',
467 | maybecluster('system.parts'),
468 | escape_sql_arg($database_name),
469 | escape_sql_arg($table_name)
470 | )
471 | );
472 |
473 | if (scalar(@$parts_info) == 0)
474 | {
475 | printlog('Empty result of system.parts query: table is empty, will just recreate it.');
476 |
477 | run_ddl_command_sequence(
478 | [
479 | sprintf('DROP TABLE IF EXISTS %s %s NO DELAY', $full_table_name, ddl_maybe_oncluster()),
480 | maybe_add_on_cluster_to_create_statement($original_create_table),
481 | ]
482 | );
483 |
484 | return;
485 | }
486 |
487 | my $max_part_per_partition = 0;
488 | my $overall_number_of_parts = 0;
489 |
490 | for my $p (@$parts_info)
491 | {
492 | $overall_number_of_parts += $p->{parts_count};
493 | if ($p->{parts_count} > $max_part_per_partition)
494 | {
495 | $max_part_per_partition = $p->{parts_count};
496 | }
497 | }
498 |
499 | # TODO: do we care of replicated_deduplication_window here?
500 | printlog("max_part_per_partition: %d, overall_number_of_parts: %d", $max_part_per_partition, $overall_number_of_parts);
501 |
502 |
503 | my @command_sequence = ();
504 |
505 | # inside Atomic database that doesn't work:
506 | # DB::Exception: Mapping for table with UUID=ccbe67e0-eb08-4897-80f1-404c3b488810 already exists. It happened due to UUID collision, most likely because some not random UUIDs were manually specified in CREATE queries. (version 21.7.1.7029 (official build))
507 |
508 | # so we do that reattach only after moving the table to an Ordinary database, to 'drop' its atomic nature.
509 |
510 | # push @command_sequence, sprintf('DETACH TABLE IF EXISTS %s %s NO DELAY', $full_table_name, ddl_maybe_oncluster());
511 | # push @command_sequence, attach_as_non_replicated($original_create_table);
512 |
513 | # SYSTEM STOP MERGES doesn't work cluster-wide
514 | # the safest way to use that is to create a subcluster for every shard and do it shard by shard
515 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_table_name);
516 |
517 | # direct rename of a r/o table was not working before 20.5, see https://github.com/ClickHouse/ClickHouse/pull/11652/
518 | push @command_sequence, sprintf('RENAME TABLE %s TO %s %s', $full_table_name, $full_tmp_table_name, ddl_maybe_oncluster());
519 |
520 | push @command_sequence, sprintf('DETACH TABLE IF EXISTS %s %s NO DELAY', $full_tmp_table_name, ddl_maybe_oncluster());
521 | push @command_sequence, attach_as_non_replicated(rename_table_in_create_statement($original_create_table, $full_tmp_table_name));
522 |
523 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_tmp_table_name);
524 |
525 | push @command_sequence, maybe_add_on_cluster_to_create_statement($original_create_table);
526 | push @command_sequence, sprintf('SYSTEM STOP MERGES %s', $full_table_name);
527 |
528 | if (!$RECOVER_SCHEMA_ONLY)
529 | {
530 | for my $p (@$parts_info)
531 | {
532 | push @command_sequence, sprintf('ALTER TABLE %s %s REPLACE PARTITION ID %s FROM %s',
533 | $full_table_name,
534 | ddl_maybe_oncluster(),
535 | escape_sql_arg($p->{partition_id}),
536 | $full_tmp_table_name);
537 | }
538 | }
539 |
540 | run_ddl_command_sequence(\@command_sequence);
541 |
542 | my $new_table_row_count = get_clickhouse_query_result(sprintf('SELECT count() FROM %s',$full_table_name));
543 |
544 | printlog('original_table_rows_count: %d, new_table_row_count: %d', $original_table_rows_count, $new_table_row_count);
545 |
546 | run_ddl_command(sprintf('SYSTEM START MERGES %s', $full_table_name));
547 | }
548 |
549 |
550 | printlog('Started %s [pid:%d]:', $0, $$);
551 |
552 | print_general_info();
553 |
554 | my $readonly_tables = find_tables_with_zookeeper_data_missing();
555 |
556 | printlog( '%d tables with zookeeper_data_missing found.', scalar(@$readonly_tables));
557 |
558 | if (scalar(@$readonly_tables) == 0) {
559 | printlog( 'Nothing to do!' );
560 | exit;
561 | }
562 |
563 | printlog( 'WARNING: Please stop insertion into all the tables, and detach all the Kafka / RabbitMQ / Buffer / Distributed tables!');
564 | prompt_yn('Continue?') || exit(1);
565 |
566 | my $temporary_db_name = '_tmp_zk_rcvry';
567 | run_ddl_command(sprintf('CREATE DATABASE IF NOT EXISTS %s %s engine=Ordinary', $temporary_db_name, ddl_maybe_oncluster()));
568 |
569 | foreach my $table (@$readonly_tables) {
570 | next if $table->{'database'} eq $temporary_db_name;
571 |
572 | recover_table_zookeeper_data($table->{'table'}, $table->{'database'}, $temporary_db_name);
573 | }
574 |
575 | printlog('Done! Cross check everything and remove %s database', $temporary_db_name);
576 |
--------------------------------------------------------------------------------