├── .github └── workflows │ └── main.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yml ├── LICENSE ├── Makefile ├── README.md ├── api └── swagger.yml ├── cmd └── qumomf │ └── main.go ├── config ├── qumomf.conf.yml └── qumomf.daemon.min.conf.yml ├── example ├── docker-compose.yml ├── qumomf.yml ├── router │ ├── Dockerfile │ ├── init_router.lua │ └── router.lua └── storage │ ├── Dockerfile │ ├── init_storage.lua │ └── storage.lua ├── go.mod ├── go.sum ├── internal ├── api │ ├── api.go │ └── data.go ├── config │ ├── config.go │ ├── config_test.go │ ├── testdata │ │ ├── bad-elector.conf.yml │ │ └── qumomf-full.conf.yml │ └── validator.go ├── coordinator │ └── coordinator.go ├── metrics │ └── metrics.go ├── qumhttp │ ├── api.go │ ├── api_test.go │ ├── data.go │ ├── http.go │ └── routing.go ├── quorum │ ├── elector.go │ ├── elector_test.go │ ├── idle.go │ ├── idle_test.go │ ├── smart.go │ └── smart_test.go ├── storage │ ├── data.go │ ├── sqlite │ │ ├── sqlite.go │ │ └── sqlite_test.go │ └── storage.go ├── util │ └── util.go └── vshard │ ├── alert.go │ ├── cluster.go │ ├── cluster_test.go │ ├── instance.go │ ├── mock.go │ ├── orchestrator │ ├── analysis.go │ ├── config.go │ ├── failover.go │ ├── failover_test.go │ ├── hook.go │ ├── hook_test.go │ ├── instance_utils.go │ ├── instance_utils_test.go │ ├── monitor.go │ ├── monitor_test.go │ ├── recovery.go │ ├── recovery_test.go │ └── sampler.go │ ├── parser.go │ ├── parser_test.go │ ├── replicaset.go │ ├── replicaset_test.go │ ├── router.go │ ├── snapshot.go │ ├── tarantool.go │ └── tarantool_test.go └── scripts ├── etc └── systemd │ └── qumomf.service ├── postinstall.sh └── preremove.sh /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'master' 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | 17 | - name: Set up Go 1.13 18 | uses: actions/setup-go@v1 19 | with: 20 | go-version: 1.13 21 | id: go 22 | 23 | - name: Check out code into the Go module directory 24 | uses: actions/checkout@v2 25 | 26 | - name: Get dependencies 27 | run: | 28 | go get -v -t -d ./... 
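# Legacy fallback: fetch dependencies with dep if the project still uses Gopkg.toml.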
29 | if [ -f Gopkg.toml ]; then 30 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 31 | dep ensure 32 | fi 33 | 34 | - name: Build 35 | run: make build 36 | 37 | - name: Test 38 | run: | 39 | make env_up 40 | make run_tests 41 | make env_down 42 | 43 | - name: install golangci-lint 44 | run: | 45 | curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh| sh -s -- -b $GITHUB_WORKSPACE v1.23.6 46 | 47 | - name: Lint 48 | run: $GITHUB_WORKSPACE/golangci-lint run 49 | 50 | - name: Run GoReleaser 51 | uses: goreleaser/goreleaser-action@v2 52 | with: 53 | args: release --snapshot --skip-publish --rm-dist -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | vendor/ 16 | 17 | # IDE 18 | .idea/ 19 | **/.DS_Store 20 | 21 | # vim 22 | *.swp 23 | *.swo 24 | 25 | bin/ 26 | dist/ 27 | /*.db 28 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | skip-dirs: 3 | - example 4 | 5 | linters-settings: 6 | govet: 7 | check-shadowing: true 8 | settings: 9 | printf: 10 | funcs: 11 | - (github.com/golangci/golangci-lint/pkg/logutils.Log).Infof 12 | - (github.com/golangci/golangci-lint/pkg/logutils.Log).Warnf 13 | - (github.com/golangci/golangci-lint/pkg/logutils.Log).Errorf 14 | - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf 15 | golint: 16 | min-confidence: 0 17 | gocyclo: 18 | min-complexity: 15 19 | maligned: 20 | suggest-new: true 21 | dupl: 22 | threshold: 100 23 | goconst: 24 | min-len: 2 25 | min-occurrences: 3 26 | misspell: 27 | locale: US 28 | goimports: 29 | local-prefixes: github.com/golangci/golangci-lint 30 | gocritic: 31 | enabled-tags: 32 | - diagnostic 33 | - experimental 34 | - opinionated 35 | - performance 36 | - style 37 | disabled-checks: 38 | - wrapperFunc 39 | - dupImport # https://github.com/go-critic/go-critic/issues/845 40 | - ifElseChain 41 | - octalLiteral 42 | - whyNoLint 43 | - hugeParam 44 | funlen: 45 | lines: 300 46 | statements: 200 47 | 48 | issues: 49 | exclude-rules: 50 | - path: _test\.go 51 | linters: 52 | - dupl 53 | - gosec 54 | 55 | linters: 56 | disable-all: true 57 | enable: 58 | - bodyclose 59 | - deadcode 60 | - depguard 61 | - dogsled 62 | - dupl 63 | - errcheck 64 | - funlen 65 | - goconst 66 | - gocritic 67 | - gocyclo 68 | - gofmt 69 | - goimports 70 | - golint 71 | - gosec 72 | - gosimple 73 | - govet 74 | - ineffassign 75 | - interfacer 76 | - misspell 77 | - nakedret 78 | - scopelint 79 | - staticcheck 80 | - structcheck 81 | - stylecheck 82 | - typecheck 83 | - unconvert 84 | - unparam 85 | - unused 86 | - varcheck 87 | - whitespace 88 | - prealloc 89 | - maligned 90 | 91 | service: 92 | golangci-lint-version: 1.21.x # use the fixed version to not introduce new linters unexpectedly -------------------------------------------------------------------------------- /.goreleaser.yml: -------------------------------------------------------------------------------- 1 | before: 2 | hooks: 3 | - go mod download 4 | 5 | 
builds: 6 | - env: 7 | - CGO_ENABLED=1 8 | main: ./cmd/qumomf/main.go 9 | ldflags: 10 | - -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.buildDate={{.Date}} 11 | goarch: 12 | - amd64 13 | goos: 14 | - linux 15 | 16 | archives: 17 | - files: 18 | - LICENSE 19 | - README.md 20 | - config/* 21 | 22 | checksum: 23 | name_template: 'checksums.txt' 24 | # Algorithm to be used. 25 | # Accepted options are sha256, sha512, sha1, crc32, md5, sha224 and sha384. 26 | # Default is sha256. 27 | algorithm: sha256 28 | 29 | snapshot: 30 | name_template: "{{ .Tag }}-SNAPSHOT-{{.ShortCommit}}" 31 | 32 | changelog: 33 | skip: true 34 | 35 | nfpms: 36 | - id: default 37 | package_name: qumomf 38 | 39 | vendor: citymobil 40 | maintainer: Pavel Parshin , Aleksandr Petrukhin 41 | homepage: https://github.com/shmel1k/qumomf 42 | description: Tarantool vshard HA tool supports auto discovery and recovery 43 | license: MIT 44 | 45 | formats: 46 | - deb 47 | - rpm 48 | 49 | dependencies: ~ 50 | recommends: ~ 51 | suggests: ~ 52 | conflicts: ~ 53 | 54 | bindir: /usr/local/bin 55 | 56 | epoch: 1 57 | release: 1 58 | 59 | scripts: 60 | postinstall: "scripts/postinstall.sh" 61 | preremove: "scripts/preremove.sh" 62 | 63 | files: 64 | "scripts/etc/systemd/**": "/etc/systemd/system" 65 | 66 | config_files: 67 | "config/qumomf.daemon.min.conf.yml": "/etc/qumomf/conf.yml" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BINARY=qumomf 2 | VERSION=`git describe --tags --dirty --always` 3 | COMMIT=`git rev-parse HEAD` 4 | BUILD_DATE=`date +%FT%T%z` 5 | LDFLAGS=-ldflags "-w -s -X main.version=${VERSION} -X main.commit=${COMMIT} -X main.buildDate=${BUILD_DATE}" 6 | 7 | all: build 8 | 9 | .PHONY: build 10 | build: 11 | go build ${LDFLAGS} -o bin/${BINARY} cmd/qumomf/main.go 12 | 13 | .PHONY: release 14 | release: 15 | goreleaser build --snapshot --rm-dist 16 | 17 | .PHONY: run 18 | run: build 19 | bin/qumomf -config=example/qumomf.yml 20 | 21 | .PHONY: env_up 22 | env_up: 23 | docker-compose -f example/docker-compose.yml up -d 24 | sleep 2 25 | docker-compose -f example/docker-compose.yml ps 26 | 27 | .PHONY: env_down 28 | env_down: 29 | docker-compose -f example/docker-compose.yml down -v --rmi local --remove-orphans 30 | 31 | .PHONY: fmt 32 | fmt: 33 | go fmt ./... 34 | 35 | .PHONY: lint 36 | lint: 37 | golangci-lint run -v ./... 38 | 39 | .PHONY: run_short_tests 40 | run_short_tests: 41 | go test -count=1 -v -short ./... 42 | 43 | .PHONY: run_tests 44 | run_tests: env_up 45 | go test -count=1 -v -race ./... 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/shmel1k/qumomf?sort=semver&style=for-the-badge) 2 | ![GitHub Workflow Status](https://img.shields.io/github/workflow/status/shmel1k/qumomf/CI?style=for-the-badge) 3 | 4 | # Qumomf 5 | 6 | Qumomf is a Tarantool vshard high availability tool which supports discovery and recovery. 7 | 8 | # Table of Contents 9 | 10 | * [Discovery](#discovery) 11 | * [Configuration](#configuration) 12 | * [How to add a new cluster](#how-to-add-a-new-cluster) 13 | * [Topology recovery](#topology-recovery) 14 | * [Idle](#idle) 15 | * [Smart](#smart) 16 | * [Recovery hooks](#recovery-hooks) 17 | * [Hooks arguments and environment](#hooks-arguments-and-environment) 18 | * [API](#api) 19 | * [Hacking](#hacking) 20 | 21 | ## Discovery 22 | 23 | Qumomf actively crawls through your topologies and analyzes them. 24 | It reads basic vshard info such as replication status and configuration. 25 | 26 | You should provide at least one router which will be an entrypoint to the discovery process. 27 | 28 | ## Configuration 29 | 30 | For a sample qumomf configuration and its description see [example](config/qumomf.conf.yml). 31 | 32 | ### How to add a new cluster 33 | 34 | Edit your configuration file and add a new cluster, e.g.: 35 | 36 | ```yaml 37 | clusters: 38 | my_cluster: 39 | routers: 40 | - name: 'my_cluster_router_1' 41 | addr: 'localhost:3301' 42 | ``` 43 | 44 | You might override default connection settings for each cluster. 45 | 46 | ```yaml 47 | clusters: 48 | my_cluster: 49 | connection: 50 | user: 'tnt' 51 | password: 'tnt' 52 | connect_timeout: 10s 53 | request_timeout: 10s 54 | 55 | routers: 56 | - name: 'my_cluster_router_1' 57 | addr: 'localhost:3301' 58 | ``` 59 | 60 | For a sample vshard configuration, 61 | see [qumomf example](/example) or [Tarantool documentation](https://www.tarantool.io/en/doc/1.10/reference/reference_rock/vshard/vshard_quick/#vshard-config-cluster-example). 62 | 63 | Start qumomf, and it will discover all clusters defined in the configuration. 
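To verify that discovery picked your cluster up, you can query the HTTP API (a quick check, assuming the default `:8080` port from the sample configuration):

```bash
# List the discovered clusters along with their shard/router counts and health level.
curl -s http://localhost:8080/api/v0/snapshots
```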
64 | 65 | ## Topology recovery 66 | 67 | Currently, qumomf supports only automated master recovery. 68 | It is a configurable option and can be disabled completely or per cluster via the configuration. 69 | 70 | Master election supports two modes: `idle` and `smart`. 71 | The election mode can be configured for each cluster independently. 72 | 73 | Both electors support the following options: 74 | 75 | - `reasonable_follower_lsn_lag` - on crash recovery, followers that are lagging 76 | more than the given LSN must not participate in the election. 77 | - `reasonable_follower_idle` - on crash recovery, followers that are lagging 78 | more than the given duration must not participate in the election. 79 | 80 | A value of 0 disables these features. 81 | 82 | ### Idle 83 | 84 | A naive and simple elector which picks the alive replica that most recently communicated with the failed master (received data or a heartbeat signal). 85 | Followers with a negative priority are excluded from the master election. 86 | 87 | ### Smart 88 | 89 | This elector takes as many metrics into account as it can: 90 | - vshard configuration consistency (preferring a replica which has the same configuration as the master), 91 | - the upstream status each replica had before the crash, 92 | - how far the replica lags behind the master, comparing its LSN to the master's LSN, 93 | - the last time the replica received data or a heartbeat signal from the master, 94 | - user-defined promotion rules based on the instance priorities. 95 | 96 | You can define your own promotion rules which will influence master election during a failover. 97 | Each instance has a priority set via the config. A negative priority excludes a follower from the election process. 98 | 99 | ## Recovery hooks 100 | 101 | Hooks are invoked throughout the recovery process via a shell, in particular bash. 102 | 103 | These hooks are available: 104 | 105 | - `PreFailover`: executed immediately before qumomf takes recovery action. Failure (non-zero exit code) of any of these processes aborts the recovery. Hint: this gives you the opportunity to abort recovery based on some internal state of your system. 106 | - `PostSuccessfulFailover`: executed at the end of a successful recovery. 107 | - `PostUnsuccessfulFailover`: executed at the end of an unsuccessful recovery. 108 | 109 | Any process command that starts with "&" will be executed asynchronously, and a failure of such a process is ignored. 110 | 111 | Qumomf executes lists of commands sequentially, in order of definition. 112 | 113 | A naive implementation might look like: 114 | 115 | ```yaml 116 | hooks: 117 | shell: bash 118 | pre_failover: 119 | - "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log" 120 | post_successful_failover: 121 | - "echo 'Recovered from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log" 122 | post_unsuccessful_failover: 123 | - "echo 'Failed to recover from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log" 124 | ``` 125 | 126 | ### Hooks arguments and environment 127 | 128 | Qumomf provides all hooks with failure/recovery-related information, such as the UUID/URI of the failed instance, 129 | UUID/URI of the promoted instance, type of failure, name of cluster, etc.
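As an illustration, a hook can delegate to a standalone script that reads this context from its environment (a hypothetical script; the `QUM_*` variables it uses are listed below):

```bash
#!/usr/bin/env bash
# notify_failover.sh -- hypothetical hook target; the QUM_* variables are exported by qumomf.
{
  echo "$(date) ${QUM_FAILURE_TYPE} on cluster ${QUM_FAILURE_CLUSTER}"
  echo "  failed instance: ${QUM_FAILED_URI} (${QUM_FAILED_UUID})"
  echo "  recovery successful: ${QUM_IS_SUCCESSFUL}"
} >> /tmp/qumomf_recovery.log
```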
130 | 131 | This information is passed independently in two ways, and you may choose to use one or both: 132 | 133 | **Environment variables**: 134 | 135 | - `QUM_FAILURE_TYPE` 136 | - `QUM_FAILED_UUID` 137 | - `QUM_FAILED_URI` 138 | - `QUM_FAILURE_CLUSTER` 139 | - `QUM_FAILURE_REPLICA_SET_UUID` 140 | - `QUM_COUNT_FOLLOWERS` 141 | - `QUM_COUNT_WORKING_FOLLOWERS` 142 | - `QUM_COUNT_REPLICATING_FOLLOWERS` 143 | - `QUM_COUNT_INCONSISTENT_VSHARD_CONF` 144 | - `QUM_IS_SUCCESSFUL` 145 | 146 | And, if a recovery was successful: 147 | 148 | - `QUM_SUCCESSOR_UUID` 149 | - `QUM_SUCCESSOR_URI` 150 | 151 | **Command line text replacement**. 152 | 153 | Qumomf replaces the following tokens in your hook commands: 154 | 155 | - `{failureType}` 156 | - `{failedUUID}` 157 | - `{failedURI}` 158 | - `{failureCluster}` 159 | - `{failureReplicaSetUUID}` 160 | - `{countFollowers}` 161 | - `{countWorkingFollowers}` 162 | - `{countReplicatingFollowers}` 163 | - `{countInconsistentVShardConf}` 164 | - `{isSuccessful}` 165 | 166 | And, if a recovery was successful: 167 | 168 | - `{successorUUID}` 169 | - `{successorURI}` 170 | 171 | ## API 172 | 173 | Qumomf exposes several debug endpoints: 174 | 175 | - `/debug/metrics` - runtime and app metrics in Prometheus format, 176 | - `/debug/health` - health check, 177 | - `/debug/about` - the app version and build date. 178 | 179 | See the [API documentation](api/swagger.yml) for the endpoints providing information about cluster states, recoveries, and problems. 180 | 181 | ## Hacking 182 | 183 | Feel free to open issues and pull requests with your ideas on how to improve qumomf. 184 | 185 | To run unit and integration tests: 186 | 187 | ```bash 188 | make env_up 189 | make run_tests 190 | make env_down 191 | ``` 192 | -------------------------------------------------------------------------------- /api/swagger.yml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.0 2 | info: 3 | title: QUMOMF API 4 | version: 0.0.1 5 | 6 | paths: 7 | /api/v0/snapshots: 8 | get: 9 | summary: "Get list of clusters" 10 | responses: 11 | '200': 12 | description: 'Request successfully finished' 13 | content: 14 | application/json: 15 | schema: 16 | $ref: '#/components/schemas/ClusterInfo' 17 | '500': 18 | description: 'Internal error' 19 | 20 | /api/v0/snapshots/{cluster_name}: 21 | get: 22 | summary: "Get all information about a cluster" 23 | parameters: 24 | - $ref: '#/components/parameters/cluster_name' 25 | responses: 26 | '200': 27 | description: 'Request successfully finished' 28 | '400': 29 | description: 'Invalid request' 30 | '500': 31 | description: 'Internal error' 32 | 33 | /api/v0/snapshots/{cluster_name}/{shard_uuid}: 34 | get: 35 | summary: "Get all information about a shard" 36 | parameters: 37 | - $ref: '#/components/parameters/cluster_name' 38 | - $ref: '#/components/parameters/shard_uuid' 39 | responses: 40 | '200': 41 | description: 'Request successfully finished' 42 | '400': 43 | description: 'Invalid request' 44 | '500': 45 | description: 'Internal error' 46 | /api/v0/snapshots/{cluster_name}/{shard_uuid}/{instance_uuid}: 47 | get: 48 | summary: "Get all information about an instance" 49 | parameters: 50 | - $ref: '#/components/parameters/cluster_name' 51 | - $ref: '#/components/parameters/shard_uuid' 52 | - $ref: '#/components/parameters/instance_uuid' 53 | responses: 54 | '200': 55 | description: 'Request successfully finished' 56 | '400': 57 | description: 'Invalid request' 58 | '500': 59 | description: 'Internal error' 60 | 61 |
/api/v0/recoveries/{cluster_name}/{shard_uuid}: 62 | get: 63 | summary: "Get all recoveries for a shard" 64 | parameters: 65 | - $ref: '#/components/parameters/cluster_name' 66 | - $ref: '#/components/parameters/shard_uuid' 67 | responses: 68 | '200': 69 | description: 'Request successfully finished' 70 | '400': 71 | description: 'Invalid request' 72 | '500': 73 | description: 'Internal error' 74 | /api/v0/alerts: 75 | get: 76 | summary: "Get all active problems" 77 | responses: 78 | '200': 79 | description: 'Request successfully finished' 80 | content: 81 | application/json: 82 | schema: 83 | $ref: '#/components/schemas/AlertsResponse' 84 | '500': 85 | description: 'Internal error' 86 | /api/v0/alerts/{cluster_name}: 87 | get: 88 | summary: "Get all active problems for a cluster" 89 | parameters: 90 | - $ref: '#/components/parameters/cluster_name' 91 | responses: 92 | '200': 93 | description: 'Request successfully finished' 94 | content: 95 | application/json: 96 | schema: 97 | $ref: '#/components/schemas/AlertsResponse' 98 | '400': 99 | description: 'Invalid request' 100 | '500': 101 | description: 'Internal error' 102 | components: 103 | schemas: 104 | ClusterInfo: 105 | type: array 106 | items: 107 | properties: 108 | name: 109 | type: string 110 | example: qumomf_sandbox 111 | shards_count: 112 | type: integer 113 | example: 2 114 | routers_count: 115 | type: integer 116 | example: 1 117 | discovered_at: 118 | type: integer 119 | example: 1611231096 120 | health_level: 121 | type: string 122 | example: green 123 | AlertsResponse: 124 | properties: 125 | instances_alerts: 126 | $ref: '#/components/schemas/InstanceAlerts' 127 | routers_alerts: 128 | $ref: '#/components/schemas/RoutersAlerts' 129 | InstanceAlerts: 130 | properties: 131 | cluster_name: 132 | type: string 133 | shard_uuid: 134 | type: string 135 | instance_uri: 136 | type: string 137 | alerts: 138 | type: array 139 | items: 140 | $ref: '#/components/schemas/Alert' 141 | RoutersAlerts: 142 | properties: 143 | uri: 144 | type: string 145 | alerts: 146 | type: array 147 | items: 148 | $ref: '#/components/schemas/Alert' 149 | Alert: 150 | properties: 151 | Type: 152 | type: string 153 | Description: 154 | type: string 155 | parameters: 156 | cluster_name: 157 | in: path 158 | name: cluster_name 159 | schema: 160 | type: string 161 | required: true 162 | description: Cluster name 163 | shard_uuid: 164 | in: path 165 | name: shard_uuid 166 | schema: 167 | type: string 168 | required: true 169 | description: Shard UUID 170 | instance_uuid: 171 | in: path 172 | name: instance_uuid 173 | schema: 174 | type: string 175 | required: true 176 | description: Instance UUID -------------------------------------------------------------------------------- /cmd/qumomf/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "log/syslog" 9 | "net/http" 10 | "os" 11 | "os/signal" 12 | "path" 13 | "syscall" 14 | "time" 15 | 16 | "github.com/shmel1k/qumomf/internal/api" 17 | 18 | "github.com/gorilla/mux" 19 | 20 | "github.com/shmel1k/qumomf/internal/storage" 21 | "github.com/shmel1k/qumomf/internal/storage/sqlite" 22 | 23 | "github.com/rs/zerolog" 24 | "github.com/rs/zerolog/log" 25 | "golang.org/x/sys/unix" 26 | "gopkg.in/natefinch/lumberjack.v2" 27 | 28 | "github.com/shmel1k/qumomf/internal/config" 29 | "github.com/shmel1k/qumomf/internal/coordinator" 30 | "github.com/shmel1k/qumomf/internal/qumhttp" 31 | ) 32 | 33 | var ( 34 | version = 
"dev" 35 | commit = "none" 36 | buildDate = "unknown" 37 | ) 38 | 39 | var ( 40 | configPath = flag.String("config", "", "Config file path") 41 | ) 42 | 43 | func main() { 44 | flag.Parse() 45 | cfg, err := config.Setup(*configPath) 46 | if err != nil { 47 | log.Fatal().Err(err).Msgf("failed to read config") 48 | } 49 | 50 | logger := initLogger(cfg) 51 | 52 | db, err := newStorage(cfg) 53 | if err != nil { 54 | logger.Fatal().Err(err).Msg("failed to init persistent storage") 55 | } 56 | 57 | service := api.NewService(db) 58 | server := initHTTPServer(logger, service, cfg.Qumomf.Port) 59 | 60 | logger.Info().Msgf("Starting qumomf %s, commit %s, built at %s", version, commit, buildDate) 61 | 62 | go func() { 63 | logger.Info().Msgf("Listening on %s", cfg.Qumomf.Port) 64 | 65 | err = server.ListenAndServe() 66 | if err != http.ErrServerClosed { 67 | logger.Fatal().Err(err).Msg("Failed to listen HTTP server") 68 | } 69 | }() 70 | 71 | if len(cfg.Clusters) == 0 { 72 | logger.Warn().Msg("No clusters are found in the configuration") 73 | } 74 | 75 | qCoordinator := coordinator.New(logger, db) 76 | for clusterName, clusterCfg := range cfg.Clusters { 77 | err = qCoordinator.RegisterCluster(clusterName, clusterCfg, cfg) 78 | if err != nil { 79 | logger.Err(err).Msgf("Could not register cluster with name %s", clusterName) 80 | continue 81 | } 82 | logger.Info().Msgf("New cluster '%s' has been registered", clusterName) 83 | } 84 | 85 | interrupt := make(chan os.Signal, 1) 86 | signal.Notify(interrupt, syscall.SIGINT, syscall.SIGTERM) 87 | sig := <-interrupt 88 | 89 | logger.Info().Msgf("Received system signal: %s. Shutting down qumomf", sig) 90 | qCoordinator.Shutdown() 91 | 92 | err = server.Shutdown(context.Background()) 93 | if err != nil { 94 | logger.Err(err).Msg("Failed to shutting down the HTTP server gracefully") 95 | } 96 | } 97 | 98 | func newStorage(cfg *config.Config) (storage.Storage, error) { 99 | return sqlite.New(sqlite.Config{ 100 | FileName: cfg.Qumomf.Storage.Filename, 101 | ConnectTimeout: cfg.Qumomf.Storage.ConnectTimeout, 102 | QueryTimeout: cfg.Qumomf.Storage.QueryTimeout, 103 | }) 104 | } 105 | 106 | func initLogger(cfg *config.Config) zerolog.Logger { 107 | zerolog.TimeFieldFormat = zerolog.TimeFormatUnix 108 | 109 | loggingCfg := cfg.Qumomf.Logging 110 | 111 | logLevel, err := zerolog.ParseLevel(loggingCfg.Level) 112 | if err != nil { 113 | log.Warn().Msgf("Unknown Level String: '%s', defaulting to DebugLevel", loggingCfg.Level) 114 | logLevel = zerolog.DebugLevel 115 | } 116 | 117 | zerolog.SetGlobalLevel(logLevel) 118 | 119 | writers := make([]io.Writer, 0, 1) 120 | writers = append(writers, os.Stdout) 121 | 122 | if loggingCfg.SysLogEnabled { 123 | w, err := syslog.New(syslog.LOG_INFO, "qumomf") 124 | if err != nil { 125 | log.Warn().Err(err).Msg("Unable to connect to the system log daemon") 126 | } else { 127 | writers = append(writers, zerolog.SyslogLevelWriter(w)) 128 | } 129 | } 130 | 131 | if loggingCfg.FileLoggingEnabled { 132 | w, err := newRollingLogFile(&loggingCfg) 133 | if err != nil { 134 | log.Warn().Err(err).Msg("Unable to init file logger") 135 | } else { 136 | writers = append(writers, w) 137 | } 138 | } 139 | 140 | var baseLogger zerolog.Logger 141 | if len(writers) == 1 { 142 | baseLogger = zerolog.New(writers[0]) 143 | } else { 144 | return zerolog.New(zerolog.MultiLevelWriter(writers...)) 145 | } 146 | 147 | return baseLogger.Level(logLevel).With().Timestamp().Logger() 148 | } 149 | 150 | func newRollingLogFile(cfg *config.Logging) (io.Writer, error) { 
151 | dir := path.Dir(cfg.Filename) 152 | if unix.Access(dir, unix.W_OK) != nil { 153 | return nil, fmt.Errorf("no permissions to write logs to dir: %s", dir) 154 | } 155 | 156 | return &lumberjack.Logger{ 157 | Filename: cfg.Filename, 158 | MaxBackups: cfg.MaxBackups, 159 | MaxSize: cfg.MaxSize, 160 | MaxAge: cfg.MaxAge, 161 | }, nil 162 | } 163 | 164 | func initHTTPServer(logger zerolog.Logger, service api.Service, port string) *http.Server { 165 | r := mux.NewRouter() 166 | qumhttp.RegisterDebugHandlers(r, version, commit, buildDate) 167 | qumhttp.RegisterAPIHandlers(r, qumhttp.NewHandler(logger, service)) 168 | 169 | return &http.Server{ 170 | Addr: port, 171 | Handler: r, 172 | ReadTimeout: 5 * time.Second, 173 | WriteTimeout: 5 * time.Second, 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /config/qumomf.conf.yml: -------------------------------------------------------------------------------- 1 | qumomf: 2 | # TCP port to listen on. 3 | port: ':8080' 4 | logging: 5 | # Verbosity level of logging: trace, debug, info, warn, error, fatal, panic. 6 | # To disable logging, pass an empty string. 7 | level: 'debug' 8 | # Write logs to the local syslog daemon. 9 | syslog_enabled: false 10 | # Write logs to the file. 11 | file_enabled: true 12 | # Absolute path to the log output file. 13 | file_name: '/var/log/qumomf.log' 14 | # The max size in MB of the logfile before it's rolled. 15 | file_max_size: 256 16 | # The max number of rolled files to keep. 17 | file_max_backups: 3 18 | # The max age in days to keep a logfile. 19 | file_max_age: 5 20 | # Indicates whether qumomf should run in the readonly mode: 21 | # no auto failover will be executed. 22 | # Can be overwritten by cluster-specific options. 23 | readonly: true 24 | # How often should qumomf discover the cluster topology. 25 | cluster_discovery_time: '5s' 26 | # How often should qumomf analyze the cluster state. 27 | cluster_recovery_time: '1s' 28 | # Qumomf avoids flapping (cascading failures causing continuous outage and elimination of resources) 29 | # by introducing a block period, where on any given cluster, qumomf will not kick in automated recovery 30 | # on an interval smaller than said period. 31 | # It only applies to recoveries on the same cluster. 32 | # There is nothing to prevent concurrent recoveries running on different clusters. 33 | shard_recovery_block_time: '30m' 34 | # Similar to the shard_recovery_block_time option but defines the recovery block period 35 | # only for a single instance. Used during the vshard configuration recovery. 36 | instance_recovery_block_time: '10m' 37 | 38 | # How should qumomf choose a new master during the failover. 39 | # Available options: idle, smart. 40 | # See README for the description. 41 | # Can be overwritten by cluster-specific options. 42 | elector: 'smart' 43 | # On crash recovery, followers that are lagging more than the given LSN must not participate in the election. 44 | # A value of 0 disables this feature. 45 | reasonable_follower_lsn_lag: 500 46 | # On crash recovery, followers that are lagging more than the given duration must not participate in the election. 47 | # A value of 0 disables this feature. 48 | reasonable_follower_idle: '1m' 49 | 50 | # Hooks invoked throughout the recovery process. 51 | # These are arrays of commands invoked via a shell, in particular bash. 52 | hooks: 53 | # Shell used to invoke hooks, in the form "shell -c <command>". 54 | shell: bash 55 | # Deadline timeout for basic hooks.
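# (Hooks prefixed with "&" run asynchronously and are governed by timeout_async below; see README.)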
56 | timeout: 5s 57 | # Deadline timeout for async hooks. 58 | timeout_async: 10m 59 | # PreFailover hooks executed before the recovery process. 60 | pre_failover: 61 | - "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log" 62 | # PostSuccessfulFailover hooks executed after the successful recovery process. 63 | post_successful_failover: 64 | - "echo 'Recovered from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log" 65 | # PostUnsuccessfulFailover hooks executed after the unsuccessful recovery process. 66 | post_unsuccessful_failover: 67 | - "echo 'Failed to recover from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log" 68 | 69 | # Local persistent storage to save snapshots, recoveries and other useful data 70 | storage: 71 | filename: 'qumomf.db' 72 | connect_timeout: '1s' 73 | query_timeout: '1s' 74 | 75 | # Tarantool connection options. 76 | # Can be overwritten by cluster-specific options. 77 | connection: 78 | user: 'qumomf' 79 | password: 'qumomf' 80 | connect_timeout: '500ms' 81 | request_timeout: '1s' 82 | 83 | # List of all clusters. 84 | clusters: 85 | # Cluster unique name. 86 | qumomf_sandbox: 87 | readonly: false 88 | 89 | # During the autodiscovery qumomf will use the information 90 | # read from tarantool instances. 91 | # You may want to override the URI of the instances. 92 | override_uri_rules: 93 | 'qumomf_1_m.ddk:3301': '127.0.0.1:9303' 94 | 'qumomf_1_s.ddk:3301': '127.0.0.1:9304' 95 | 'qumomf_2_m.ddk:3301': '127.0.0.1:9305' 96 | 'qumomf_2_s_1.ddk:3301': '127.0.0.1:9306' 97 | 'qumomf_2_s_2.ddk:3301': '127.0.0.1:9307' 98 | 99 | # List of all routers in the cluster. 100 | # Used to discover the cluster topology. 101 | routers: 102 | - name: 'router_1' 103 | uuid: 'router_1_uuid' 104 | addr: '127.0.0.1:9301' 105 | 106 | qumomf_sandbox_2: 107 | elector: 'idle' 108 | 109 | connection: 110 | user: 'tnt' 111 | password: 'tnt' 112 | connect_timeout: 10s 113 | request_timeout: 10s 114 | 115 | # List of priorities for the cluster instances. 116 | priorities: 117 | 'a3ef657e-eb9a-4730-b420-7ea78d52797d': 0 118 | 'bd64dd00-161e-4c99-8b3c-d3c4635e18d2': 10 119 | 'cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e': -1 # exclude from the election process 120 | 121 | routers: 122 | - name: 'sandbox2-router1' 123 | uuid: '38dbe90b-9bca-4766-a98c-f02e56ddf986' 124 | addr: '127.0.0.1:7301' -------------------------------------------------------------------------------- /config/qumomf.daemon.min.conf.yml: -------------------------------------------------------------------------------- 1 | qumomf: 2 | port: ':8080' 3 | logging: 4 | level: 'debug' 5 | syslog_enabled: false 6 | file_enabled: true 7 | file_name: '/var/log/qumomf.log' 8 | file_max_size: 256 9 | file_max_backups: 3 10 | file_max_age: 5 11 | readonly: true 12 | cluster_discovery_time: '5s' 13 | cluster_recovery_time: '1s' 14 | shard_recovery_block_time: '30m' 15 | instance_recovery_block_time: '10m' 16 | 17 | elector: 'smart' 18 | reasonable_follower_lsn_lag: 500 19 | reasonable_follower_idle: '1m' 20 | 21 | hooks: 22 | shell: bash 23 | timeout: 5s 24 | timeout_async: 10m 25 | pre_failover: 26 | - "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log" 27 | post_successful_failover: 28 | - "echo 'Recovered from {failureType} on {failureCluster}. 
Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log" 29 | post_unsuccessful_failover: 30 | - "echo 'Failed to recover from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log" 31 | 32 | connection: 33 | user: 'qumomf' 34 | password: 'qumomf' 35 | connect_timeout: '500ms' 36 | request_timeout: '1s' 37 | 38 | clusters: ~ -------------------------------------------------------------------------------- /example/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | 3 | services: 4 | router_1: 5 | build: './router' 6 | container_name: qumomf_router.ddk 7 | networks: 8 | - qumomf 9 | ports: 10 | - '9301:3301' 11 | depends_on: 12 | - storage_1_m 13 | - storage_1_s 14 | - storage_2_m 15 | - storage_2_s_1 16 | - storage_2_s_2 17 | 18 | storage_1_m: 19 | build: './storage' 20 | container_name: qumomf_1_m.ddk 21 | networks: 22 | - qumomf 23 | ports: 24 | - '9303:3301' 25 | environment: 26 | - STORAGE_UUID=a94e7310-13f0-4690-b136-169599e87ba0 27 | 28 | storage_1_s: 29 | build: './storage' 30 | container_name: qumomf_1_s.ddk 31 | depends_on: 32 | - storage_1_m 33 | networks: 34 | - qumomf 35 | ports: 36 | - '9304:3301' 37 | environment: 38 | - STORAGE_UUID=bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1 39 | 40 | storage_2_m: 41 | build: './storage' 42 | container_name: qumomf_2_m.ddk 43 | networks: 44 | - qumomf 45 | ports: 46 | - '9305:3301' 47 | environment: 48 | - STORAGE_UUID=a3ef657e-eb9a-4730-b420-7ea78d52797d 49 | 50 | storage_2_s_1: 51 | build: './storage' 52 | container_name: qumomf_2_s_1.ddk 53 | networks: 54 | - qumomf 55 | ports: 56 | - '9306:3301' 57 | depends_on: 58 | - storage_2_m 59 | environment: 60 | - STORAGE_UUID=bd64dd00-161e-4c99-8b3c-d3c4635e18d2 61 | 62 | storage_2_s_2: 63 | build: './storage' 64 | container_name: qumomf_2_s_2.ddk 65 | networks: 66 | - qumomf 67 | ports: 68 | - '9307:3301' 69 | depends_on: 70 | - storage_2_m 71 | environment: 72 | # - FAKETIME=-1m 73 | - STORAGE_UUID=cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e 74 | 75 | networks: 76 | qumomf: 77 | driver: bridge 78 | -------------------------------------------------------------------------------- /example/qumomf.yml: -------------------------------------------------------------------------------- 1 | qumomf: 2 | port: ':8080' 3 | logging: 4 | level: 'info' 5 | syslog_enabled: false 6 | file_enabled: true 7 | file_name: '/home/pavel/work/qumomf/src/github.com/shmel1k/qumomf/bin/qumomf.log' 8 | file_max_size: 256 9 | file_max_backups: 3 10 | file_max_age: 5 11 | readonly: true 12 | cluster_discovery_time: '5s' 13 | cluster_recovery_time: '1s' 14 | shard_recovery_block_time: '30m' 15 | instance_recovery_block_time: '10m' 16 | elector: 'smart' 17 | 18 | hooks: 19 | shell: bash 20 | timeout: 2s 21 | timeout_async: 1m 22 | pre_failover: 23 | - "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log" 24 | post_successful_failover: 25 | - "echo 'Recovered from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log" 26 | post_unsuccessful_failover: 27 | - "echo 'Failed to recover from {failureType} on {failureCluster}. 
Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log" 28 | storage: 29 | filename: 'qumomf.db' 30 | connect_timeout: '1s' 31 | query_timeout: '1s' 32 | 33 | connection: 34 | user: 'qumomf' 35 | password: 'qumomf' 36 | connect_timeout: '500ms' 37 | request_timeout: '1s' 38 | 39 | clusters: 40 | qumomf_sandbox: 41 | readonly: false 42 | 43 | override_uri_rules: 44 | 'qumomf_1_m.ddk:3301': '127.0.0.1:9303' 45 | 'qumomf_1_s.ddk:3301': '127.0.0.1:9304' 46 | 'qumomf_2_m.ddk:3301': '127.0.0.1:9305' 47 | 'qumomf_2_s_1.ddk:3301': '127.0.0.1:9306' 48 | 'qumomf_2_s_2.ddk:3301': '127.0.0.1:9307' 49 | 50 | priorities: 51 | 'bd64dd00-161e-4c99-8b3c-d3c4635e18d2': 10 52 | 'cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e': 5 53 | 54 | routers: 55 | - name: 'router_1' 56 | addr: '127.0.0.1:9301' -------------------------------------------------------------------------------- /example/router/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tarantool/tarantool:2.3.1 2 | 3 | COPY init_router.lua /etc/tarantool/instances.enabled/init_router.lua 4 | COPY router.lua /etc/tarantool/instances.enabled/qumomf/router/router.lua 5 | CMD ["tarantool", "/etc/tarantool/instances.enabled/init_router.lua"] 6 | -------------------------------------------------------------------------------- /example/router/init_router.lua: -------------------------------------------------------------------------------- 1 | vshard = require('vshard') 2 | 3 | local cfg = { 4 | memtx_memory = 100 * 1024 * 1024, 5 | bucket_count = 120, 6 | rebalancer_disbalance_threshold = 10, 7 | rebalancer_max_receiving = 1000, 8 | 9 | sharding = { 10 | ['7432f072-c00b-4498-b1a6-6d9547a8a150'] = { -- replicaset #1 11 | replicas = { 12 | ['a94e7310-13f0-4690-b136-169599e87ba0'] = { 13 | uri = 'qumomf:qumomf@qumomf_1_m.ddk:3301', 14 | name = 'qumomf_1_m', 15 | master = true 16 | }, 17 | ['bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1'] = { 18 | uri = 'qumomf:qumomf@qumomf_1_s.ddk:3301', 19 | name = 'qumomf_1_s' 20 | } 21 | }, 22 | }, -- replicaset #1 23 | ['5065fb5f-5f40-498e-af79-43887ba3d1ec'] = { -- replicaset #2 24 | replicas = { 25 | ['a3ef657e-eb9a-4730-b420-7ea78d52797d'] = { 26 | uri = 'qumomf:qumomf@qumomf_2_m.ddk:3301', 27 | name = 'qumomf_2_m', 28 | master = true 29 | }, 30 | ['bd64dd00-161e-4c99-8b3c-d3c4635e18d2'] = { 31 | uri = 'qumomf:qumomf@qumomf_2_s_1.ddk:3301', 32 | name = 'qumomf_2_s_1' 33 | }, 34 | ['cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e'] = { 35 | uri = 'qumomf:qumomf@qumomf_2_s_2.ddk:3301', 36 | name = 'qumomf_2_s_2' 37 | } 38 | }, 39 | }, -- replicaset #2 40 | }, -- sharding 41 | } 42 | 43 | cfg.listen = 3301 44 | vshard.router.cfg(cfg) 45 | 46 | box.once("init", function() 47 | box.schema.user.create('qumomf', { password = 'qumomf', if_not_exists = true }) 48 | box.schema.user.grant('qumomf', 'read,write,create,execute', 'universe') 49 | end) 50 | 51 | vshard.router.bootstrap() 52 | vshard.router.discovery_wakeup() 53 | 54 | dofile('/etc/tarantool/instances.enabled/qumomf/router/router.lua') 55 | 56 | -------------------------------------------------------------------------------- /example/router/router.lua: -------------------------------------------------------------------------------- 1 | vshard = require('vshard') 2 | 3 | local DEFAULT_TIMEOUT = 1 4 | 5 | local OP_GET = 'qumomf_get' 6 | local OP_SET = 'qumomf_set' 7 | 8 | function qumomf_get(key) 9 | local bucket_id = vshard.router.bucket_id(key) 10 | local netbox, err = vshard.router.route(bucket_id) 11 | if err ~= nil then 
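-- the bucket could not be routed to any replica set; propagate the error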
12 | error(err) 13 | end 14 | 15 | local result, err = netbox:callbre(OP_GET, {key}, { 16 | timeout = DEFAULT_TIMEOUT, 17 | }) 18 | if err ~= nil then 19 | error(err) 20 | end 21 | return result 22 | end 23 | 24 | function qumomf_set(key, value) 25 | local bucket_id = vshard.router.bucket_id(key) 26 | local netbox, err = vshard.router.route(bucket_id) 27 | if err ~= nil then 28 | error(err) 29 | end 30 | 31 | local result, err = netbox:callrw(OP_SET, { key, value }, { 32 | timeout = DEFAULT_TIMEOUT, 33 | }) 34 | if err ~= nil then 35 | error(err) 36 | end 37 | 38 | return result 39 | end -------------------------------------------------------------------------------- /example/storage/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tarantool/tarantool:2.3.1 2 | 3 | COPY --from=trajano/alpine-libfaketime /faketime.so /lib/faketime.so 4 | ENV LD_PRELOAD=/lib/faketime.so 5 | 6 | COPY init_storage.lua /etc/tarantool/instances.enabled/init_storage.lua 7 | COPY storage.lua /etc/tarantool/instances.enabled/qumomf/storage/storage.lua 8 | CMD ["tarantool", "/etc/tarantool/instances.enabled/init_storage.lua"] 9 | -------------------------------------------------------------------------------- /example/storage/init_storage.lua: -------------------------------------------------------------------------------- 1 | os = require('os') 2 | vshard = require('vshard') 3 | 4 | local IDX_KEY = 1 5 | local IDX_VALUE = 2 6 | 7 | local cfg = { 8 | memtx_memory = 100 * 1024 * 1024, 9 | bucket_count = 120, 10 | rebalancer_disbalance_threshold = 10, 11 | rebalancer_max_receiving = 1000, 12 | 13 | sharding = { 14 | ['7432f072-c00b-4498-b1a6-6d9547a8a150'] = { -- replicaset #1 15 | replicas = { 16 | ['a94e7310-13f0-4690-b136-169599e87ba0'] = { 17 | uri = 'qumomf:qumomf@qumomf_1_m.ddk:3301', 18 | name = 'qumomf_1_m', 19 | master = true 20 | }, 21 | ['bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1'] = { 22 | uri = 'qumomf:qumomf@qumomf_1_s.ddk:3301', 23 | name = 'qumomf_1_s' 24 | } 25 | }, 26 | }, -- replicaset #1 27 | ['5065fb5f-5f40-498e-af79-43887ba3d1ec'] = { -- replicaset #2 28 | replicas = { 29 | ['a3ef657e-eb9a-4730-b420-7ea78d52797d'] = { 30 | uri = 'qumomf:qumomf@qumomf_2_m.ddk:3301', 31 | name = 'qumomf_2_m', 32 | master = true 33 | }, 34 | ['bd64dd00-161e-4c99-8b3c-d3c4635e18d2'] = { 35 | uri = 'qumomf:qumomf@qumomf_2_s_1.ddk:3301', 36 | name = 'qumomf_2_s_1' 37 | }, 38 | ['cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e'] = { 39 | uri = 'qumomf:qumomf@qumomf_2_s_2.ddk:3301', 40 | name = 'qumomf_2_s_2' 41 | } 42 | }, 43 | }, -- replicaset #2 44 | }, -- sharding 45 | } 46 | 47 | local UUID = os.getenv("STORAGE_UUID") 48 | 49 | cfg.listen = 3301 50 | vshard.storage.cfg(cfg, UUID) 51 | 52 | box.once("init", function() 53 | if UUID == 'a94e7310-13f0-4690-b136-169599e87ba0' then 54 | vshard.storage.bucket_force_create(0, 60, {}) 55 | end 56 | 57 | if UUID == 'a3ef657e-eb9a-4730-b420-7ea78d52797d' then 58 | vshard.storage.bucket_force_create(61, 60, {}) 59 | end 60 | 61 | box.schema.user.create('qumomf', { password = 'qumomf', if_not_exists = true }) 62 | box.schema.user.grant('qumomf', 'read,write,create,execute', 'universe') 63 | 64 | local space = box.schema.create_space("qumomf", { 65 | if_not_exists = true, 66 | }) 67 | 68 | space:create_index('key', { 69 | type = 'TREE', 70 | if_not_exists = true, 71 | parts = { 72 | IDX_KEY, 73 | 'string', 74 | }, 75 | unique = true, 76 | }) 77 | end) 78 | 79 | dofile('/etc/tarantool/instances.enabled/qumomf/storage/storage.lua') 
-------------------------------------------------------------------------------- /example/storage/storage.lua: -------------------------------------------------------------------------------- 1 | require('strict').on() 2 | os = require('os') 3 | 4 | local IDX_KEY = 1 5 | local IDX_VALUE = 2 6 | 7 | function qumomf_set(key, value) 8 | box.space.qumomf:insert({ key, value, 0 }) 9 | return {} 10 | end 11 | 12 | function qumomf_get(key) 13 | local tuple = box.space.qumomf:select(key) 14 | if #tuple == 0 then 15 | return nil 16 | end 17 | tuple = tuple[1] 18 | 19 | return tuple[IDX_VALUE] 20 | end -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/shmel1k/qumomf 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/google/uuid v1.1.2 // indirect 7 | github.com/gorilla/mux v1.8.0 8 | github.com/mattn/go-sqlite3 v1.14.5 9 | github.com/philhofer/fwd v1.0.0 // indirect 10 | github.com/prometheus/client_golang v1.5.1 11 | github.com/rs/zerolog v1.18.0 12 | github.com/satori/go.uuid v1.2.0 // indirect 13 | github.com/stretchr/testify v1.4.0 14 | github.com/tarantool/go-tarantool v0.0.0-20191229181800-f4ece3508d87 // indirect 15 | github.com/tinylib/msgp v1.1.1 // indirect 16 | github.com/viciious/go-tarantool v0.0.0-20190828171136-ede812c03707 17 | golang.org/x/sys v0.0.0-20200122134326-e047566fdf82 18 | google.golang.org/appengine v1.6.5 // indirect 19 | google.golang.org/genproto v0.0.0-20210113195801-ae06605f4595 // indirect 20 | gopkg.in/natefinch/lumberjack.v2 v2.0.0 21 | gopkg.in/vmihailenco/msgpack.v2 v2.9.1 // indirect 22 | gopkg.in/yaml.v2 v2.2.8 23 | ) 24 | -------------------------------------------------------------------------------- /internal/api/api.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | 7 | "github.com/shmel1k/qumomf/internal/storage" 8 | "github.com/shmel1k/qumomf/internal/storage/sqlite" 9 | "github.com/shmel1k/qumomf/internal/vshard" 10 | "github.com/shmel1k/qumomf/internal/vshard/orchestrator" 11 | ) 12 | 13 | var ( 14 | ErrClusterNotFound = errors.New("cluster not found") 15 | ErrReplicaSetNotFound = errors.New("replica set not found") 16 | ErrInstanceNotFound = errors.New("instance not found") 17 | ) 18 | 19 | type Service interface { 20 | ClustersList(context.Context) ([]ClusterInfo, error) 21 | ClusterSnapshot(context.Context, string) (vshard.Snapshot, error) 22 | ReplicaSet(context.Context, string, vshard.ReplicaSetUUID) (vshard.ReplicaSet, error) 23 | Instance(context.Context, string, vshard.ReplicaSetUUID, vshard.InstanceUUID) (vshard.Instance, error) 24 | Recoveries(context.Context, string, vshard.ReplicaSetUUID) ([]orchestrator.Recovery, error) 25 | Alerts(context.Context) (AlertsResponse, error) 26 | ClusterAlerts(context.Context, string) (AlertsResponse, error) 27 | } 28 | 29 | func NewService(db storage.Storage) Service { 30 | return &service{ 31 | db: db, 32 | } 33 | } 34 | 35 | type service struct { 36 | db storage.Storage 37 | } 38 | 39 | func (s *service) ClustersList(ctx context.Context) ([]ClusterInfo, error) { 40 | clustersList, err := s.db.GetClusters(ctx) 41 | if err != nil { 42 | return nil, err 43 | } 44 | 45 | resp := make([]ClusterInfo, 0, len(clustersList)) 46 | for _, cluster := range clustersList { 47 | resp = append(resp, ClusterInfo{ 48 | Name: cluster.Name, 49 | ShardsCount: 
len(cluster.Snapshot.ReplicaSets), 50 | RoutersCount: len(cluster.Snapshot.Routers), 51 | DiscoveredAt: cluster.Snapshot.Created, 52 | HealthLevel: cluster.Snapshot.ClusterHealthLevel(), 53 | }) 54 | } 55 | 56 | return resp, nil 57 | } 58 | 59 | func (s *service) ClusterSnapshot(ctx context.Context, clusterName string) (vshard.Snapshot, error) { 60 | snap, err := s.db.GetClusterSnapshot(ctx, clusterName) 61 | if err == sqlite.ErrEmptyResult { 62 | return vshard.Snapshot{}, ErrClusterNotFound 63 | } 64 | 65 | return snap, err 66 | } 67 | 68 | func (s *service) ReplicaSet(ctx context.Context, clusterName string, replicaSetUUID vshard.ReplicaSetUUID) (vshard.ReplicaSet, error) { 69 | snap, err := s.db.GetClusterSnapshot(ctx, clusterName) 70 | if err != nil { 71 | if err == sqlite.ErrEmptyResult { 72 | return vshard.ReplicaSet{}, ErrClusterNotFound 73 | } 74 | return vshard.ReplicaSet{}, err 75 | } 76 | 77 | replicaSet, err := snap.ReplicaSet(replicaSetUUID) 78 | if err != nil { 79 | if err == vshard.ErrReplicaSetNotFound { 80 | return vshard.ReplicaSet{}, ErrReplicaSetNotFound 81 | } 82 | 83 | return vshard.ReplicaSet{}, err 84 | } 85 | 86 | return replicaSet, nil 87 | } 88 | 89 | func (s *service) Instance(ctx context.Context, clusterName string, replicaSetUUID vshard.ReplicaSetUUID, instanceUUID vshard.InstanceUUID) (vshard.Instance, error) { 90 | replicaSet, err := s.ReplicaSet(ctx, clusterName, replicaSetUUID) 91 | if err != nil { 92 | return vshard.Instance{}, err 93 | } 94 | 95 | for i := range replicaSet.Instances { 96 | if replicaSet.Instances[i].UUID == instanceUUID { 97 | return replicaSet.Instances[i], nil 98 | } 99 | } 100 | 101 | return vshard.Instance{}, ErrInstanceNotFound 102 | } 103 | 104 | func (s *service) Recoveries(ctx context.Context, clusterName string, replicaSetUUID vshard.ReplicaSetUUID) ([]orchestrator.Recovery, error) { 105 | recoveries, err := s.db.GetRecoveries(ctx, clusterName) 106 | if err != nil { 107 | return nil, err 108 | } 109 | 110 | resp := make([]orchestrator.Recovery, 0, len(recoveries)) 111 | for i := range recoveries { 112 | if recoveries[i].SetUUID == replicaSetUUID { 113 | resp = append(resp, recoveries[i]) 114 | } 115 | } 116 | 117 | return resp, nil 118 | } 119 | 120 | func (s *service) Alerts(ctx context.Context) (AlertsResponse, error) { 121 | clusters, err := s.db.GetClusters(ctx) 122 | if err != nil { 123 | return AlertsResponse{}, err 124 | } 125 | 126 | instanceAlertsList := make([]InstanceAlerts, 0) 127 | routerAlertsList := make([]RoutersAlerts, 0) 128 | for i := range clusters { 129 | routerAlertsList = append(routerAlertsList, routersAlerts(clusters[i].Snapshot.Routers)...) 130 | instanceAlertsList = append(instanceAlertsList, instanceAlerts(clusters[i].Name, clusters[i].Snapshot.ReplicaSets)...) 
131 | } 132 | 133 | return AlertsResponse{ 134 | InstancesAlerts: instanceAlertsList, 135 | RoutersAlerts: routerAlertsList, 136 | }, nil 137 | } 138 | 139 | func (s *service) ClusterAlerts(ctx context.Context, clusterName string) (AlertsResponse, error) { 140 | cluster, err := s.ClusterSnapshot(ctx, clusterName) 141 | if err != nil { 142 | return AlertsResponse{}, err 143 | } 144 | 145 | return AlertsResponse{ 146 | InstancesAlerts: instanceAlerts(clusterName, cluster.ReplicaSets), 147 | RoutersAlerts: routersAlerts(cluster.Routers), 148 | }, nil 149 | } 150 | 151 | func routersAlerts(routers []vshard.Router) []RoutersAlerts { 152 | result := make([]RoutersAlerts, 0) 153 | for i := range routers { 154 | if len(routers[i].Info.Alerts) > 0 { 155 | result = append(result, RoutersAlerts{ 156 | URI: routers[i].URI, 157 | Alerts: routers[i].Info.Alerts, 158 | }) 159 | } 160 | } 161 | 162 | return result 163 | } 164 | 165 | func instanceAlerts(clusterName string, replicaSets []vshard.ReplicaSet) []InstanceAlerts { 166 | resp := make([]InstanceAlerts, 0) 167 | 168 | for i := range replicaSets { 169 | instances := replicaSets[i].Instances 170 | for j := range instances { 171 | alerts := instances[j].StorageInfo.Alerts 172 | if len(alerts) != 0 { 173 | resp = append(resp, InstanceAlerts{ 174 | ClusterName: clusterName, 175 | ShardUUID: replicaSets[i].UUID, 176 | InstanceURI: instances[j].URI, 177 | Alerts: alerts, 178 | }) 179 | } 180 | } 181 | } 182 | 183 | return resp 184 | } 185 | -------------------------------------------------------------------------------- /internal/api/data.go: -------------------------------------------------------------------------------- 1 | package api 2 | 3 | import "github.com/shmel1k/qumomf/internal/vshard" 4 | 5 | type ClusterInfo struct { 6 | Name string `json:"name"` 7 | ShardsCount int `json:"shards_count"` 8 | RoutersCount int `json:"routers_count"` 9 | DiscoveredAt int64 `json:"discovered_at"` 10 | HealthLevel vshard.HealthLevel `json:"health_level"` 11 | } 12 | 13 | type AlertsResponse struct { 14 | InstancesAlerts []InstanceAlerts `json:"instances_alerts"` 15 | RoutersAlerts []RoutersAlerts `json:"routers_alerts"` 16 | } 17 | 18 | type InstanceAlerts struct { 19 | ClusterName string `json:"cluster_name"` 20 | ShardUUID vshard.ReplicaSetUUID `json:"shard_uuid"` 21 | InstanceURI string `json:"instance_uri"` 22 | Alerts []vshard.Alert `json:"alerts"` 23 | } 24 | 25 | type RoutersAlerts struct { 26 | URI string `json:"uri"` 27 | Alerts []vshard.Alert `json:"alerts"` 28 | } 29 | -------------------------------------------------------------------------------- /internal/config/config.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "io/ioutil" 5 | "os" 6 | "time" 7 | 8 | "gopkg.in/yaml.v2" 9 | ) 10 | 11 | const ( 12 | defaultLogLevel = "debug" 13 | defaultSysLogEnabled = false 14 | defaultFileLoggingEnabled = false 15 | defaultLogFilename = "/var/log/qumomf.log" 16 | defaultLogFileMaxSize = 256 17 | defaultLogFileMaxBackups = 3 18 | defaultLogFileMaxAge = 5 19 | defaultReadOnly = true 20 | defaultUser = "guest" 21 | defaultPassword = "guest" 22 | defaultConnectTimeout = 500 * time.Millisecond 23 | defaultRequestTimeout = 1 * time.Second 24 | defaultClusterDiscoveryTime = 5 * time.Second 25 | defaultClusterRecoveryTime = 1 * time.Second 26 | defaultShardRecoveryBlockTime = 30 * time.Minute 27 | defaultInstanceRecoveryBlockTime = 10 * time.Minute 28 | defaultElectorType = "smart" 29 | 
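// Failover hooks run through the shell configured below; synchronous hooks
// get a short timeout while asynchronous ones get a much longer budget
// (see the Hooks section of Config).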
defaultShellCommand = "bash"
30 |     defaultHookTimeout = 5 * time.Second
31 |     defaultAsyncHookTimeout = 10 * time.Minute
32 |     defaultMaxFollowerLSNLag = 1000
33 |     defaultMaxFollowerIdle = 5 * time.Minute
34 |     defaultStorageFileName = "qumomf.db"
35 |     defaultStorageConnectTimeout = time.Second
36 |     defaultStorageQueryTimeout = time.Second
37 | )
38 |
39 | type Config struct {
40 |     // Qumomf is a set of global options that determines qumomf's behavior.
41 |     Qumomf struct {
42 |         Port string `yaml:"port"`
43 |         Logging Logging `yaml:"logging"`
44 |         ReadOnly bool `yaml:"readonly"`
45 |         ClusterDiscoveryTime time.Duration `yaml:"cluster_discovery_time"`
46 |         ClusterRecoveryTime time.Duration `yaml:"cluster_recovery_time"`
47 |         ShardRecoveryBlockTime time.Duration `yaml:"shard_recovery_block_time"`
48 |         InstanceRecoveryBlockTime time.Duration `yaml:"instance_recovery_block_time"`
49 |         ElectionMode string `yaml:"elector"`
50 |         ReasonableFollowerLSNLag int64 `yaml:"reasonable_follower_lsn_lag"`
51 |         ReasonableFollowerIdle time.Duration `yaml:"reasonable_follower_idle"`
52 |         Hooks struct {
53 |             Shell string `yaml:"shell"`
54 |             PreFailover []string `yaml:"pre_failover"`
55 |             PostSuccessfulFailover []string `yaml:"post_successful_failover"`
56 |             PostUnsuccessfulFailover []string `yaml:"post_unsuccessful_failover"`
57 |             Timeout time.Duration `yaml:"timeout"`
58 |             TimeoutAsync time.Duration `yaml:"timeout_async"`
59 |         } `yaml:"hooks"`
60 |         Storage struct {
61 |             Filename string `yaml:"filename"`
62 |             QueryTimeout time.Duration `yaml:"query_timeout"`
63 |             ConnectTimeout time.Duration `yaml:"connect_timeout"`
64 |         } `yaml:"storage"`
65 |     } `yaml:"qumomf"`
66 |
67 |     // Connection contains the default connection options for each instance in clusters.
68 |     // These options may be overridden by cluster-level options.
69 |     Connection *ConnectConfig `yaml:"connection,omitempty"`
70 |     Clusters map[string]ClusterConfig `yaml:"clusters"`
71 | }
72 |
73 | type Logging struct {
74 |     Level string `yaml:"level"`
75 |     SysLogEnabled bool `yaml:"syslog_enabled"`
76 |     FileLoggingEnabled bool `yaml:"file_enabled"`
77 |     Filename string `yaml:"file_name"`
78 |     MaxSize int `yaml:"file_max_size"` // megabytes
79 |     MaxBackups int `yaml:"file_max_backups"` // files
80 |     MaxAge int `yaml:"file_max_age"` // days
81 | }
82 |
83 | type ConnectConfig struct {
84 |     User *string `yaml:"user"`
85 |     Password *string `yaml:"password"`
86 |     ConnectTimeout *time.Duration `yaml:"connect_timeout"`
87 |     RequestTimeout *time.Duration `yaml:"request_timeout"`
88 | }
89 |
90 | type ClusterConfig struct {
91 |     // Connection contains connection options which qumomf should
92 |     // use to connect to routers and instances in the cluster.
93 |     Connection *ConnectConfig `yaml:"connection,omitempty"`
94 |
95 |     // ReadOnly indicates whether qumomf can run a failover
96 |     // or should just observe the cluster topology.
97 |     ReadOnly *bool `yaml:"readonly,omitempty"`
98 |
99 |     // ElectionMode is the master election mode of the given cluster.
100 |     ElectionMode *string `yaml:"elector"`
101 |
102 |     // OverrideURIRules contains a list of URIs used in Tarantool replication and
103 |     // their mappings which qumomf will use in its connection pool.
104 |     //
105 |     // Use it if qumomf should not connect to the instances by the URIs
106 |     // obtained from the replication configuration during auto discovery.
107 |     OverrideURIRules map[string]string `yaml:"override_uri_rules,omitempty"`
108 |
109 |     // Priorities contains a list of instance UUIDs and their priorities.
110 |     Priorities map[string]int `yaml:"priorities,omitempty"`
111 |
112 |     // Routers contains the list of all cluster routers.
113 |     //
114 |     // All cluster nodes must share a common topology.
115 |     // An administrator must ensure that the configurations are identical.
116 |     // The administrator must provide the list of all routers so that qumomf is able
117 |     // to update their configuration when a failover is running.
118 |     // Otherwise, a failover might break the topology.
119 |     Routers []RouterConfig `yaml:"routers"`
120 | }
121 |
122 | type RouterConfig struct {
123 |     Name string `yaml:"name"`
124 |     Addr string `yaml:"addr"`
125 | }
126 |
127 | func Setup(path string) (*Config, error) {
128 |     file, err := os.Open(path)
129 |     if err != nil {
130 |         return nil, err
131 |     }
132 |     defer func() {
133 |         _ = file.Close()
134 |     }()
135 |
136 |     data, err := ioutil.ReadAll(file)
137 |     if err != nil {
138 |         return nil, err
139 |     }
140 |
141 |     var cfg Config
142 |     cfg.withDefaults()
143 |     err = yaml.Unmarshal(data, &cfg)
144 |     if err != nil {
145 |         return nil, err
146 |     }
147 |
148 |     cfg.overrideEmptyByGlobalConfigs()
149 |
150 |     err = validate(&cfg)
151 |     if err != nil {
152 |         return nil, err
153 |     }
154 |
155 |     return &cfg, nil
156 | }
157 |
158 | func (c *Config) withDefaults() {
159 |     if c == nil {
160 |         return
161 |     }
162 |
163 |     base := &c.Qumomf
164 |     base.ReadOnly = defaultReadOnly
165 |
166 |     base.Logging.Level = defaultLogLevel
167 |     base.Logging.SysLogEnabled = defaultSysLogEnabled
168 |     base.Logging.FileLoggingEnabled = defaultFileLoggingEnabled
169 |     base.Logging.Filename = defaultLogFilename
170 |     base.Logging.MaxSize = defaultLogFileMaxSize
171 |     base.Logging.MaxBackups = defaultLogFileMaxBackups
172 |     base.Logging.MaxAge = defaultLogFileMaxAge
173 |
174 |     base.ClusterDiscoveryTime = defaultClusterDiscoveryTime
175 |     base.ClusterRecoveryTime = defaultClusterRecoveryTime
176 |     base.ShardRecoveryBlockTime = defaultShardRecoveryBlockTime
177 |     base.InstanceRecoveryBlockTime = defaultInstanceRecoveryBlockTime
178 |     base.ElectionMode = defaultElectorType
179 |     base.ReasonableFollowerLSNLag = defaultMaxFollowerLSNLag
180 |     base.ReasonableFollowerIdle = defaultMaxFollowerIdle
181 |     base.Hooks.Shell = defaultShellCommand
182 |     base.Hooks.Timeout = defaultHookTimeout
183 |     base.Hooks.TimeoutAsync = defaultAsyncHookTimeout
184 |
185 |     base.Storage.Filename = defaultStorageFileName
186 |     base.Storage.ConnectTimeout = defaultStorageConnectTimeout
187 |     base.Storage.QueryTimeout = defaultStorageQueryTimeout
188 |
189 |     connection := &ConnectConfig{}
190 |     connection.User = newString(defaultUser)
191 |     connection.Password = newString(defaultPassword)
192 |     connection.ConnectTimeout = newDuration(defaultConnectTimeout)
193 |     connection.RequestTimeout = newDuration(defaultRequestTimeout)
194 |     c.Connection = connection
195 | }
196 |
197 | func (c *Config) overrideEmptyByGlobalConfigs() {
198 |     for clusterUUID, clusterCfg := range c.Clusters {
199 |         if clusterCfg.ReadOnly == nil {
200 |             clusterCfg.ReadOnly = newBool(c.Qumomf.ReadOnly)
201 |         }
202 |
203 |         if clusterCfg.ElectionMode == nil {
204 |             clusterCfg.ElectionMode = newString(c.Qumomf.ElectionMode)
205 |         }
206 |
207 |         if clusterCfg.Connection == nil {
208 |             clusterCfg.Connection = c.Connection
209 |         } else {
210 |             opts := clusterCfg.Connection
211 |             if opts.ConnectTimeout == nil {
212 |                 opts.ConnectTimeout = c.Connection.ConnectTimeout
213 |             }
214 |             if opts.RequestTimeout == nil {
215 |                 opts.RequestTimeout = c.Connection.RequestTimeout
216 |             }
217 |             if
opts.User == nil { 218 | opts.User = c.Connection.User 219 | } 220 | if opts.Password == nil { 221 | opts.Password = c.Connection.Password 222 | } 223 | } 224 | 225 | c.Clusters[clusterUUID] = clusterCfg 226 | } 227 | } 228 | 229 | func validate(c *Config) error { 230 | err := validateElector(&c.Qumomf.ElectionMode) 231 | if err != nil { 232 | return err 233 | } 234 | 235 | for _, clusterCfg := range c.Clusters { 236 | err = validateElector(clusterCfg.ElectionMode) 237 | if err != nil { 238 | return err 239 | } 240 | } 241 | 242 | return nil 243 | } 244 | 245 | func newBool(v bool) *bool { 246 | return &v 247 | } 248 | 249 | func newDuration(v time.Duration) *time.Duration { 250 | return &v 251 | } 252 | 253 | func newString(v string) *string { 254 | return &v 255 | } 256 | -------------------------------------------------------------------------------- /internal/config/config_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "path/filepath" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestSetup_InvalidPath(t *testing.T) { 13 | cfg, err := Setup("invalid_path") 14 | assert.NotNil(t, err) 15 | assert.Nil(t, cfg) 16 | } 17 | 18 | func TestSetup_ValidPath(t *testing.T) { 19 | testConfigPath, err := filepath.Abs("testdata/qumomf-full.conf.yml") 20 | require.Nil(t, err) 21 | 22 | cfg, err := Setup(testConfigPath) 23 | require.Nil(t, err) 24 | require.NotNil(t, cfg) 25 | 26 | assert.Equal(t, ":8080", cfg.Qumomf.Port) 27 | 28 | loggingCfg := cfg.Qumomf.Logging 29 | assert.Equal(t, "debug", loggingCfg.Level) 30 | assert.True(t, loggingCfg.SysLogEnabled) 31 | assert.True(t, loggingCfg.FileLoggingEnabled) 32 | assert.Equal(t, "/var/log/qumomf.log", loggingCfg.Filename) 33 | assert.Equal(t, 256, loggingCfg.MaxSize) 34 | assert.Equal(t, 3, loggingCfg.MaxBackups) 35 | assert.Equal(t, 5, loggingCfg.MaxAge) 36 | 37 | assert.True(t, cfg.Qumomf.ReadOnly) 38 | assert.Equal(t, 60*time.Second, cfg.Qumomf.ClusterDiscoveryTime) 39 | assert.Equal(t, 5*time.Second, cfg.Qumomf.ClusterRecoveryTime) 40 | assert.Equal(t, 30*time.Minute, cfg.Qumomf.ShardRecoveryBlockTime) 41 | assert.Equal(t, 10*time.Minute, cfg.Qumomf.InstanceRecoveryBlockTime) 42 | assert.Equal(t, int64(500), cfg.Qumomf.ReasonableFollowerLSNLag) 43 | assert.Equal(t, 1*time.Minute, cfg.Qumomf.ReasonableFollowerIdle) 44 | 45 | hooks := cfg.Qumomf.Hooks 46 | assert.Equal(t, "bash", hooks.Shell) 47 | assert.Equal(t, 5*time.Second, hooks.Timeout) 48 | assert.Equal(t, 10*time.Minute, hooks.TimeoutAsync) 49 | assert.Equal(t, []string{"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log"}, hooks.PreFailover) 50 | assert.Equal(t, []string{"echo 'Recovered from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log"}, hooks.PostSuccessfulFailover) 51 | assert.Equal(t, []string{"echo 'Failed to recover from {failureType} on {failureCluster}. 
Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log"}, hooks.PostUnsuccessfulFailover) 52 | 53 | storage := cfg.Qumomf.Storage 54 | assert.Equal(t, "sqlite.db", storage.Filename) 55 | assert.Equal(t, time.Second, storage.QueryTimeout) 56 | assert.Equal(t, time.Second, storage.ConnectTimeout) 57 | 58 | assert.Equal(t, 500*time.Millisecond, *cfg.Connection.ConnectTimeout) 59 | assert.Equal(t, 1*time.Second, *cfg.Connection.RequestTimeout) 60 | 61 | connOpts := cfg.Connection 62 | require.NotNil(t, connOpts) 63 | assert.Equal(t, "qumomf", *connOpts.User) 64 | assert.Equal(t, "qumomf", *connOpts.Password) 65 | assert.Equal(t, 500*time.Millisecond, *connOpts.ConnectTimeout) 66 | assert.Equal(t, 1*time.Second, *connOpts.RequestTimeout) 67 | 68 | expected := map[string]ClusterConfig{ 69 | "qumomf_sandbox_1": { 70 | Connection: &ConnectConfig{ 71 | User: newString("qumomf"), 72 | Password: newString("qumomf"), 73 | ConnectTimeout: newDuration(500 * time.Millisecond), 74 | RequestTimeout: newDuration(1 * time.Second), 75 | }, 76 | ReadOnly: newBool(false), 77 | ElectionMode: newString("smart"), 78 | OverrideURIRules: map[string]string{ 79 | "qumomf_1_m.ddk:3301": "127.0.0.1:9303", 80 | }, 81 | Routers: []RouterConfig{ 82 | { 83 | Name: "sandbox1-router1", 84 | Addr: "127.0.0.1:9301", 85 | }, 86 | { 87 | Name: "sandbox1-router2", 88 | Addr: "127.0.0.1:9302", 89 | }, 90 | }, 91 | }, 92 | "qumomf_sandbox_2": { 93 | Connection: &ConnectConfig{ 94 | User: newString("tnt"), 95 | Password: newString("tnt"), 96 | ConnectTimeout: newDuration(10 * time.Second), 97 | RequestTimeout: newDuration(10 * time.Second), 98 | }, 99 | ReadOnly: newBool(true), 100 | ElectionMode: newString("idle"), 101 | Priorities: map[string]int{ 102 | "bd64dd00-161e-4c99-8b3c-d3c4635e18d2": 10, 103 | "cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e": 5, 104 | "a3ef657e-eb9a-4730-b420-7ea78d52797d": -1, 105 | }, 106 | Routers: []RouterConfig{ 107 | { 108 | Name: "sandbox2-router1", 109 | Addr: "127.0.0.1:7301", 110 | }, 111 | }, 112 | }, 113 | } 114 | 115 | assert.Equal(t, expected, cfg.Clusters) 116 | } 117 | 118 | func TestSetup_InvalidElectorOption(t *testing.T) { 119 | testConfigPath, err := filepath.Abs("testdata/bad-elector.conf.yml") 120 | require.Nil(t, err) 121 | 122 | cfg, err := Setup(testConfigPath) 123 | require.NotNil(t, err) 124 | assert.Nil(t, cfg) 125 | } 126 | -------------------------------------------------------------------------------- /internal/config/testdata/bad-elector.conf.yml: -------------------------------------------------------------------------------- 1 | qumomf: 2 | elector: 'smart' 3 | 4 | clusters: 5 | qumomf_sandbox_1: 6 | elector: 'unknown' 7 | 8 | routers: 9 | - name: 'sandbox1-router1' 10 | addr: '127.0.0.1:9301' 11 | uuid: 'a94e7310-13f0-4690-b136-169599e87ba0' -------------------------------------------------------------------------------- /internal/config/testdata/qumomf-full.conf.yml: -------------------------------------------------------------------------------- 1 | qumomf: 2 | port: ':8080' 3 | logging: 4 | level: 'debug' 5 | syslog_enabled: true 6 | file_enabled: true 7 | file_name: '/var/log/qumomf.log' 8 | file_max_size: 256 9 | file_max_backups: 3 10 | file_max_age: 5 11 | readonly: true 12 | cluster_discovery_time: '60s' 13 | cluster_recovery_time: '5s' 14 | shard_recovery_block_time: '30m' 15 | instance_recovery_block_time: '10m' 16 | 17 | elector: 'smart' 18 | reasonable_follower_lsn_lag: 500 19 | reasonable_follower_idle: '1m' 20 | 21 | hooks: 22 | shell: bash 23 
| timeout: 5s
24 |     timeout_async: 10m
25 |     pre_failover:
26 |       - "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/qumomf_recovery.log"
27 |     post_successful_failover:
28 |       - "echo 'Recovered from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}; Successor: {successorURI}' >> /tmp/qumomf_recovery.log"
29 |     post_unsuccessful_failover:
30 |       - "echo 'Failed to recover from {failureType} on {failureCluster}. Set: {failureReplicaSetUUID}; Failed: {failedURI}' >> /tmp/qumomf_recovery.log"
31 |   storage:
32 |     filename: 'sqlite.db'
33 |     connect_timeout: '1s'
34 |     query_timeout: '1s'
35 |
36 | connection:
37 |   user: 'qumomf'
38 |   password: 'qumomf'
39 |   connect_timeout: '500ms'
40 |   request_timeout: '1s'
41 |
42 | clusters:
43 |   qumomf_sandbox_1:
44 |     readonly: false
45 |
46 |     override_uri_rules:
47 |       'qumomf_1_m.ddk:3301': '127.0.0.1:9303'
48 |
49 |     routers:
50 |       - name: 'sandbox1-router1'
51 |         addr: '127.0.0.1:9301'
52 |         uuid: 'a94e7310-13f0-4690-b136-169599e87ba0'
53 |       - name: 'sandbox1-router2'
54 |         addr: '127.0.0.1:9302'
55 |         uuid: 'a3ef657e-eb9a-4730-b420-7ea78d52797d'
56 |
57 |   qumomf_sandbox_2:
58 |     elector: 'idle'
59 |
60 |     connection:
61 |       user: 'tnt'
62 |       password: 'tnt'
63 |       connect_timeout: 10s
64 |       request_timeout: 10s
65 |
66 |     priorities:
67 |       'bd64dd00-161e-4c99-8b3c-d3c4635e18d2': 10
68 |       'cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e': 5
69 |       'a3ef657e-eb9a-4730-b420-7ea78d52797d': -1
70 |
71 |     routers:
72 |       - name: 'sandbox2-router1'
73 |         uuid: '38dbe90b-9bca-4766-a98c-f02e56ddf986'
74 |         addr: '127.0.0.1:7301'
--------------------------------------------------------------------------------
/internal/config/validator.go:
--------------------------------------------------------------------------------
1 | package config
2 |
3 | import "fmt"
4 |
5 | func validateElector(v *string) error {
6 |     if v == nil {
7 |         return fmt.Errorf("option 'elector' must not be empty")
8 |     }
9 |
10 |     if *v != "idle" && *v != "smart" {
11 |         return fmt.Errorf("option 'elector' has a wrong value: %s", *v)
12 |     }
13 |
14 |     return nil
15 | }
16 |
--------------------------------------------------------------------------------
/internal/coordinator/coordinator.go:
--------------------------------------------------------------------------------
1 | package coordinator
2 |
3 | import (
4 |     "context"
5 |     "errors"
6 |
7 |     "github.com/shmel1k/qumomf/internal/config"
8 |     "github.com/shmel1k/qumomf/internal/quorum"
9 |     "github.com/shmel1k/qumomf/internal/storage"
10 |     "github.com/shmel1k/qumomf/internal/vshard"
11 |     "github.com/shmel1k/qumomf/internal/vshard/orchestrator"
12 |
13 |     "github.com/rs/zerolog"
14 | )
15 |
16 | var (
17 |     ErrClusterAlreadyExist = errors.New("cluster with such a name is already registered")
18 | )
19 |
20 | type shutdownTask func()
21 |
22 | type Coordinator struct {
23 |     logger zerolog.Logger
24 |
25 |     // clusters contains registered Tarantool clusters
26 |     // which Qumomf observes.
27 |     clusters map[string]*vshard.Cluster
28 |
29 |     // shutdownQueue contains all shutdown tasks to be
30 |     // executed when the coordinator is going to exit.
31 | shutdownQueue []shutdownTask 32 | 33 | db storage.Storage 34 | } 35 | 36 | func New(logger zerolog.Logger, db storage.Storage) *Coordinator { 37 | return &Coordinator{ 38 | logger: logger, 39 | clusters: make(map[string]*vshard.Cluster), 40 | db: db, 41 | } 42 | } 43 | 44 | func (c *Coordinator) RegisterCluster(name string, cfg config.ClusterConfig, globalCfg *config.Config) error { 45 | if _, exist := c.clusters[name]; exist { 46 | return ErrClusterAlreadyExist 47 | } 48 | 49 | clusterLogger := c.logger.With().Str("cluster", name).Logger() 50 | 51 | cluster := vshard.NewCluster(name, cfg) 52 | cluster.SetLogger(clusterLogger) 53 | cluster.SetOnClusterDiscovered(c.onClusterDiscovered) 54 | c.clusters[name] = cluster 55 | c.addShutdownTask(cluster.Shutdown) 56 | 57 | mon := orchestrator.NewMonitor(cluster, orchestrator.Config{ 58 | RecoveryPollTime: globalCfg.Qumomf.ClusterRecoveryTime, 59 | DiscoveryPollTime: globalCfg.Qumomf.ClusterDiscoveryTime, 60 | }, clusterLogger) 61 | c.addShutdownTask(mon.Shutdown) 62 | 63 | hooker := initHooker(globalCfg, clusterLogger) 64 | elector := quorum.New(quorum.Mode(*cfg.ElectionMode), quorum.Options{ 65 | ReasonableFollowerLSNLag: globalCfg.Qumomf.ReasonableFollowerLSNLag, 66 | ReasonableFollowerIdle: globalCfg.Qumomf.ReasonableFollowerIdle.Seconds(), 67 | }) 68 | failover := orchestrator.NewDefaultFailover(cluster, orchestrator.FailoverConfig{ 69 | Hooker: hooker, 70 | Elector: elector, 71 | ReplicaSetRecoveryBlockTime: globalCfg.Qumomf.ShardRecoveryBlockTime, 72 | InstanceRecoveryBlockTime: globalCfg.Qumomf.InstanceRecoveryBlockTime, 73 | }, clusterLogger) 74 | failover.SetOnClusterRecovered(c.onClusterRecovered) 75 | 76 | c.addShutdownTask(failover.Shutdown) 77 | 78 | analysisStream := mon.Serve() 79 | failover.Serve(analysisStream) 80 | 81 | return nil 82 | } 83 | 84 | func (c *Coordinator) onClusterDiscovered(clusterName string, snapshot vshard.Snapshot) { 85 | err := c.db.SaveSnapshot(context.Background(), clusterName, snapshot) 86 | if err != nil { 87 | c.logger.Err(err).Str("cluster_name", clusterName).Msg("failed to save cluster snapshot") 88 | } 89 | } 90 | 91 | func (c *Coordinator) onClusterRecovered(recovery orchestrator.Recovery) { 92 | err := c.db.SaveRecovery(context.Background(), recovery) 93 | if err != nil { 94 | c.logger.Err(err).Str("cluster_name", recovery.ClusterName).Msg("failed to save cluster recovery data") 95 | } 96 | } 97 | 98 | func (c *Coordinator) Shutdown() { 99 | for i := len(c.shutdownQueue) - 1; i >= 0; i-- { 100 | task := c.shutdownQueue[i] 101 | task() 102 | } 103 | } 104 | 105 | func (c *Coordinator) addShutdownTask(task shutdownTask) { 106 | c.shutdownQueue = append(c.shutdownQueue, task) 107 | } 108 | 109 | func initHooker(cfg *config.Config, logger zerolog.Logger) *orchestrator.Hooker { 110 | hooksCfg := cfg.Qumomf.Hooks 111 | hooker := orchestrator.NewHooker(hooksCfg.Shell, logger) 112 | hooker.SetTimeout(hooksCfg.Timeout) 113 | hooker.SetTimeoutAsync(hooksCfg.TimeoutAsync) 114 | 115 | hooker.AddHook(orchestrator.HookPreFailover, hooksCfg.PreFailover...) 116 | hooker.AddHook(orchestrator.HookPostSuccessfulFailover, hooksCfg.PostSuccessfulFailover...) 117 | hooker.AddHook(orchestrator.HookPostUnsuccessfulFailover, hooksCfg.PostUnsuccessfulFailover...) 
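// At this point all three failover stages from the config are registered:
// pre-failover, post-successful-failover and post-unsuccessful-failover hooks.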
118 | 119 | return hooker 120 | } 121 | -------------------------------------------------------------------------------- /internal/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "github.com/prometheus/client_golang/prometheus" 5 | ) 6 | 7 | const ( 8 | discoveryInstanceDurations = "instance_durations" 9 | discoveryClusterDurations = "cluster_durations" 10 | shardCriticalLevel = "critical_level" 11 | shardState = "state" 12 | shardStateEvent = "shard_state_event" 13 | ) 14 | 15 | const ( 16 | labelClusterName = "cluster_name" 17 | labelHostName = "hostname" 18 | labelShardState = "shard_state" 19 | labelShardUUID = "shard_uuid" 20 | ) 21 | 22 | var ( 23 | discoveryInstanceDurationsBuckets = prometheus.ExponentialBuckets(.001, 2.5, 10) 24 | discoveryClusterDurationsBuckets = prometheus.ExponentialBuckets(.001, 2.5, 10) 25 | ) 26 | 27 | var ( 28 | discoveryInstanceDurationsSum = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 29 | Subsystem: "discovery", 30 | Name: discoveryInstanceDurations, 31 | Help: "Instance discovery latencies in seconds", 32 | Buckets: discoveryInstanceDurationsBuckets, 33 | }, []string{labelClusterName, labelHostName}) 34 | 35 | discoveryClusterDurationsSum = prometheus.NewHistogramVec(prometheus.HistogramOpts{ 36 | Subsystem: "discovery", 37 | Name: discoveryClusterDurations, 38 | Help: "Cluster discovery latencies in seconds", 39 | Buckets: discoveryClusterDurationsBuckets, 40 | }, []string{labelClusterName}) 41 | 42 | shardCriticalLevelGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 43 | Subsystem: "shard", 44 | Name: shardCriticalLevel, 45 | Help: "Critical level of the replica set", 46 | }, []string{labelClusterName, labelShardUUID}) 47 | 48 | shardStateGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 49 | Subsystem: "shard", 50 | Name: shardState, 51 | Help: "The state of each shard in the cluster; it will have one line for each possible state of each shard. 
A value of 1 means the shard is in the state specified by the state label, a value of 0 means it is not.", 52 | }, []string{labelClusterName, labelShardUUID, labelShardState}) 53 | 54 | discoveryErrors = prometheus.NewCounter(prometheus.CounterOpts{ 55 | Subsystem: "discovery", 56 | Name: "errors", 57 | Help: "Errors that happen during discovery process", 58 | }) 59 | 60 | shardStateCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 61 | Subsystem: "orchestrator", 62 | Name: shardStateEvent, 63 | Help: "Discovered shard state event", 64 | }, []string{labelClusterName, labelShardUUID, labelShardState}) 65 | ) 66 | 67 | func init() { 68 | discoveryErrors.Add(0) 69 | prometheus.MustRegister( 70 | discoveryInstanceDurationsSum, 71 | discoveryClusterDurationsSum, 72 | shardCriticalLevelGauge, 73 | shardStateGauge, 74 | discoveryErrors, 75 | shardStateCounter, 76 | ) 77 | } 78 | 79 | type Transaction interface { 80 | Start() Transaction 81 | End() 82 | } 83 | 84 | type timeTransaction struct { 85 | labels []string 86 | summary *prometheus.HistogramVec 87 | timer *prometheus.Timer 88 | } 89 | 90 | func (txn *timeTransaction) Start() Transaction { 91 | txn.timer = prometheus.NewTimer(txn.summary.WithLabelValues(txn.labels...)) 92 | return txn 93 | } 94 | 95 | func (txn *timeTransaction) End() { 96 | txn.timer.ObserveDuration() 97 | } 98 | 99 | func StartInstanceDiscovery(clusterName, hostname string) Transaction { 100 | txn := &timeTransaction{ 101 | summary: discoveryInstanceDurationsSum, 102 | labels: []string{clusterName, hostname}, 103 | } 104 | return txn.Start() 105 | } 106 | 107 | func StartClusterDiscovery(clusterName string) Transaction { 108 | txn := &timeTransaction{ 109 | summary: discoveryClusterDurationsSum, 110 | labels: []string{clusterName}, 111 | } 112 | return txn.Start() 113 | } 114 | 115 | func SetShardCriticalLevel(clusterName, uuid string, level int) { 116 | shardCriticalLevelGauge.WithLabelValues(clusterName, uuid).Set(float64(level)) 117 | } 118 | 119 | func SetShardState(clusterName, uuid, state string, active bool) { 120 | v := float64(0) 121 | if active { 122 | v = 1 123 | } 124 | shardStateGauge.With(prometheus.Labels{ 125 | labelClusterName: clusterName, 126 | labelShardUUID: uuid, 127 | labelShardState: state, 128 | }).Set(v) 129 | } 130 | 131 | func RecordDiscoveryError() { 132 | discoveryErrors.Inc() 133 | } 134 | 135 | func RecordDiscoveredShardState(clusterName, shardUUID, state string) { 136 | shardStateCounter.With(prometheus.Labels{ 137 | labelClusterName: clusterName, 138 | labelShardUUID: shardUUID, 139 | labelShardState: state, 140 | }).Inc() 141 | } 142 | -------------------------------------------------------------------------------- /internal/qumhttp/api.go: -------------------------------------------------------------------------------- 1 | package qumhttp 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | 7 | "github.com/gorilla/mux" 8 | "github.com/rs/zerolog" 9 | 10 | "github.com/shmel1k/qumomf/internal/api" 11 | ) 12 | 13 | const ( 14 | paramClusterName = "cluster_name" 15 | paramShardUUID = "shard_uuid" 16 | paramInstanceUUID = "instance_uuid" 17 | ) 18 | 19 | const ( 20 | msgMarshallingError = "failed to marshal data" 21 | msgInvalidParams = "one or more parameters are invalid" 22 | ) 23 | 24 | type APIHandler interface { 25 | ClusterList(http.ResponseWriter, *http.Request) 26 | ClusterSnapshot(http.ResponseWriter, *http.Request) 27 | ShardSnapshot(http.ResponseWriter, *http.Request) 28 | InstanceSnapshot(http.ResponseWriter, 
*http.Request)
29 |     ShardRecoveries(http.ResponseWriter, *http.Request)
30 |     Alerts(http.ResponseWriter, *http.Request)
31 |     ClusterAlerts(http.ResponseWriter, *http.Request)
32 | }
33 |
34 | type apiHandler struct {
35 |     apiSrv api.Service
36 |     logger zerolog.Logger
37 | }
38 |
39 | func NewHandler(logger zerolog.Logger, apiSrv api.Service) APIHandler {
40 |     return &apiHandler{
41 |         logger: logger,
42 |         apiSrv: apiSrv,
43 |     }
44 | }
45 |
46 | func (a *apiHandler) ClusterList(w http.ResponseWriter, r *http.Request) {
47 |     resp, err := a.apiSrv.ClustersList(r.Context())
48 |     if err != nil {
49 |         a.writeResponse(w, newInternalErrResponse("failed to get cluster list", err))
50 |         return
51 |     }
52 |
53 |     data, err := json.Marshal(resp)
54 |     if err != nil {
55 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
56 |         return
57 |     }
58 |
59 |     a.writeResponse(w, newOKResponse(data))
60 | }
61 |
62 | // nolint: dupl
63 | func (a *apiHandler) ClusterSnapshot(w http.ResponseWriter, r *http.Request) {
64 |     reqParams := parseParams(mux.Vars(r))
65 |     if reqParams.clusterName == "" {
66 |         a.writeResponse(w, newBadRequestResponse(msgInvalidParams))
67 |         return
68 |     }
69 |
70 |     snap, err := a.apiSrv.ClusterSnapshot(r.Context(), reqParams.clusterName)
71 |     if err != nil {
72 |         if isNotFoundTypeErr(err) {
73 |             a.writeResponse(w, newBadRequestResponse(parseNotFoundTypeErr(err)))
74 |             return
75 |         }
76 |         a.writeResponse(w, newInternalErrResponse("failed to get cluster snapshot", err))
77 |         return
78 |     }
79 |
80 |     data, err := json.Marshal(snap)
81 |     if err != nil {
82 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
83 |         return
84 |     }
85 |
86 |     a.writeResponse(w, newOKResponse(data))
87 | }
88 |
89 | func (a *apiHandler) ShardSnapshot(w http.ResponseWriter, r *http.Request) {
90 |     reqParams := parseParams(mux.Vars(r))
91 |     if reqParams.clusterName == "" || reqParams.shardUUID == "" {
92 |         a.writeResponse(w, newBadRequestResponse(msgInvalidParams))
93 |         return
94 |     }
95 |
96 |     shard, err := a.apiSrv.ReplicaSet(r.Context(), reqParams.clusterName, reqParams.shardUUID)
97 |     if err != nil {
98 |         if isNotFoundTypeErr(err) {
99 |             a.writeResponse(w, newBadRequestResponse(parseNotFoundTypeErr(err)))
100 |             return
101 |         }
102 |         a.writeResponse(w, newInternalErrResponse("failed to get shard snapshot", err))
103 |         return
104 |     }
105 |
106 |     data, err := json.Marshal(shard)
107 |     if err != nil {
108 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
109 |         return
110 |     }
111 |
112 |     a.writeResponse(w, newOKResponse(data))
113 | }
114 |
115 | func (a *apiHandler) InstanceSnapshot(w http.ResponseWriter, r *http.Request) {
116 |     reqParams := parseParams(mux.Vars(r))
117 |     if reqParams.clusterName == "" || reqParams.shardUUID == "" || reqParams.instanceUUID == "" {
118 |         a.writeResponse(w, newBadRequestResponse(msgInvalidParams))
119 |         return
120 |     }
121 |
122 |     inst, err := a.apiSrv.Instance(r.Context(), reqParams.clusterName, reqParams.shardUUID, reqParams.instanceUUID)
123 |     if err != nil {
124 |         if isNotFoundTypeErr(err) {
125 |             a.writeResponse(w, newBadRequestResponse(parseNotFoundTypeErr(err)))
126 |             return
127 |         }
128 |         a.writeResponse(w, newInternalErrResponse("failed to get instance snapshot", err))
129 |
130 |         return
131 |     }
132 |
133 |     data, err := json.Marshal(inst)
134 |     if err != nil {
135 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
136 |         return
137 |     }
138 |
139 |     a.writeResponse(w, newOKResponse(data))
140 | }
141 |
142 | func (a *apiHandler) ShardRecoveries(w http.ResponseWriter, r *http.Request) {
143 |     reqParams := parseParams(mux.Vars(r))
144 |     if reqParams.clusterName == "" || reqParams.shardUUID == "" {
145 |         a.writeResponse(w, newBadRequestResponse(msgInvalidParams))
146 |         return
147 |     }
148 |
149 |     recoveries, err := a.apiSrv.Recoveries(r.Context(), reqParams.clusterName, reqParams.shardUUID)
150 |     if err != nil {
151 |         a.writeResponse(w, newInternalErrResponse("failed to get shard recoveries", err))
152 |         return
153 |     }
154 |
155 |     data, err := json.Marshal(recoveries)
156 |     if err != nil {
157 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
158 |         return
159 |     }
160 |
161 |     a.writeResponse(w, newOKResponse(data))
162 | }
163 |
164 | func (a *apiHandler) Alerts(w http.ResponseWriter, r *http.Request) {
165 |     alerts, err := a.apiSrv.Alerts(r.Context())
166 |     if err != nil {
167 |         a.writeResponse(w, newInternalErrResponse("failed to get alerts list", err))
168 |         return
169 |     }
170 |
171 |     data, err := json.Marshal(alerts)
172 |     if err != nil {
173 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
174 |         return
175 |     }
176 |
177 |     a.writeResponse(w, newOKResponse(data))
178 | }
179 |
180 | // nolint: dupl
181 | func (a *apiHandler) ClusterAlerts(w http.ResponseWriter, r *http.Request) {
182 |     reqParams := parseParams(mux.Vars(r))
183 |     if reqParams.clusterName == "" {
184 |         a.writeResponse(w, newBadRequestResponse(msgInvalidParams))
185 |         return
186 |     }
187 |
188 |     alerts, err := a.apiSrv.ClusterAlerts(r.Context(), reqParams.clusterName)
189 |     if err != nil {
190 |         if isNotFoundTypeErr(err) {
191 |             a.writeResponse(w, newBadRequestResponse(parseNotFoundTypeErr(err)))
192 |             return
193 |         }
194 |         a.writeResponse(w, newInternalErrResponse("failed to get cluster alerts", err))
195 |         return
196 |     }
197 |
198 |     data, err := json.Marshal(alerts)
199 |     if err != nil {
200 |         a.writeResponse(w, newInternalErrResponse(msgMarshallingError, err))
201 |         return
202 |     }
203 |
204 |     a.writeResponse(w, newOKResponse(data))
205 | }
206 |
207 | func isNotFoundTypeErr(err error) bool {
208 |     return err == api.ErrClusterNotFound || err == api.ErrReplicaSetNotFound || err == api.ErrInstanceNotFound
209 | }
210 |
211 | func parseNotFoundTypeErr(err error) string {
212 |     switch err {
213 |     case api.ErrClusterNotFound:
214 |         return "cluster snapshot not found"
215 |     case api.ErrReplicaSetNotFound:
216 |         return "shard snapshot not found"
217 |     case api.ErrInstanceNotFound:
218 |         return "instance snapshot not found"
219 |     }
220 |
221 |     return "cluster not found"
222 | }
223 |
224 | func (a *apiHandler) writeResponse(w http.ResponseWriter, resp response) {
225 |     if resp.err != nil {
226 |         a.logger.Err(resp.err).Msg(string(resp.data))
227 |     }
228 |
229 |     w.Header().Add("Content-Type", "application/json; charset=utf-8")
230 |     w.WriteHeader(resp.statusCode)
231 |
232 |     _, err := w.Write(resp.data)
233 |     if err != nil {
234 |         a.logger.Err(err).Msg("failed to write response")
235 |     }
236 | }
237 |
--------------------------------------------------------------------------------
/internal/qumhttp/data.go:
--------------------------------------------------------------------------------
1 | package qumhttp
2 |
3 | import (
4 |     "net/http"
5 |
6 |     "github.com/shmel1k/qumomf/internal/vshard"
7 | )
8 |
9 | type response struct {
10 |     statusCode int
11 |     data []byte
12 |     err error
13 | }
14 |
15 | func newOKResponse(data []byte) response {
16 |     return response{
17 |         statusCode: http.StatusOK,
18 |         data: data,
19 |     }
20 | }
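// The two constructors below mirror newOKResponse: they pair the payload with
// the right HTTP status, and the 500 variant also carries the original error
// so writeResponse can log it.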
21 | 22 | func newBadRequestResponse(msg string) response { 23 | return response{ 24 | statusCode: http.StatusBadRequest, 25 | data: []byte(msg), 26 | } 27 | } 28 | 29 | func newInternalErrResponse(msg string, err error) response { 30 | return response{ 31 | statusCode: http.StatusInternalServerError, 32 | data: []byte(msg), 33 | err: err, 34 | } 35 | } 36 | 37 | type params struct { 38 | clusterName string 39 | shardUUID vshard.ReplicaSetUUID 40 | instanceUUID vshard.InstanceUUID 41 | } 42 | 43 | func parseParams(vars map[string]string) params { 44 | return params{ 45 | clusterName: vars[paramClusterName], 46 | shardUUID: vshard.ReplicaSetUUID(vars[paramShardUUID]), 47 | instanceUUID: vshard.InstanceUUID(vars[paramInstanceUUID]), 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /internal/qumhttp/http.go: -------------------------------------------------------------------------------- 1 | package qumhttp 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | ) 7 | 8 | func HealthHandler() http.Handler { 9 | return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 10 | w.Header().Set("Content-Type", "application/json; charset=utf-8") 11 | w.WriteHeader(http.StatusOK) 12 | }) 13 | } 14 | 15 | func AboutHandler(version, commit, buildDate string) http.Handler { 16 | about := struct { 17 | Version string `json:"version"` 18 | Commit string `json:"commit"` 19 | Build string `json:"build"` 20 | }{ 21 | Version: version, 22 | Commit: commit, 23 | Build: buildDate, 24 | } 25 | 26 | aboutStr, _ := json.Marshal(about) 27 | 28 | return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 29 | w.Header().Set("Content-Type", "application/json; charset=utf-8") 30 | w.WriteHeader(http.StatusOK) 31 | _, _ = w.Write(aboutStr) 32 | }) 33 | } 34 | -------------------------------------------------------------------------------- /internal/qumhttp/routing.go: -------------------------------------------------------------------------------- 1 | package qumhttp 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/gorilla/mux" 7 | "github.com/prometheus/client_golang/prometheus/promhttp" 8 | ) 9 | 10 | func RegisterDebugHandlers(r *mux.Router, version, commit, buildDate string) { 11 | r.Handle("/debug/metrics", promhttp.Handler()).Methods(http.MethodGet) 12 | r.Handle("/debug/health", HealthHandler()).Methods(http.MethodGet) 13 | r.Handle("/debug/about", AboutHandler(version, commit, buildDate)).Methods(http.MethodGet) 14 | } 15 | 16 | func RegisterAPIHandlers(r *mux.Router, h APIHandler) { 17 | r.HandleFunc("/api/v0/snapshots", h.ClusterList).Methods(http.MethodGet) 18 | r.HandleFunc("/api/v0/snapshots/{cluster_name}", h.ClusterSnapshot).Methods(http.MethodGet) 19 | r.HandleFunc("/api/v0/snapshots/{cluster_name}/{shard_uuid}", h.ShardSnapshot).Methods(http.MethodGet) 20 | r.HandleFunc("/api/v0/snapshots/{cluster_name}/{shard_uuid}/{instance_uuid}", h.InstanceSnapshot).Methods(http.MethodGet) 21 | 22 | r.HandleFunc("/api/v0/recoveries/{cluster_name}/{shard_uuid}", h.ShardRecoveries).Methods(http.MethodGet) 23 | 24 | r.HandleFunc("/api/v0/alerts", h.Alerts).Methods(http.MethodGet) 25 | r.HandleFunc("/api/v0/alerts/{cluster_name}", h.ClusterAlerts).Methods(http.MethodGet) 26 | } 27 | -------------------------------------------------------------------------------- /internal/quorum/elector.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | 7 | 
"github.com/shmel1k/qumomf/internal/vshard" 8 | ) 9 | 10 | type Mode string 11 | 12 | const ( 13 | ModeIdle Mode = "idle" 14 | ModeSmart Mode = "smart" 15 | ) 16 | 17 | var ( 18 | ErrNoAliveFollowers = errors.New("quorum: replica set does not have any alive followers or all of them were excluded from the election") 19 | ErrNoCandidateFound = errors.New("quorum: no available candidate found") 20 | ) 21 | 22 | type Options struct { 23 | ReasonableFollowerLSNLag int64 24 | ReasonableFollowerIdle float64 25 | } 26 | 27 | type Elector interface { 28 | // ChooseMaster selects new master and returns back its uuid. 29 | ChooseMaster(set vshard.ReplicaSet) (vshard.InstanceUUID, error) 30 | // Mode returns the elector type. 31 | Mode() Mode 32 | } 33 | 34 | func New(m Mode, opts Options) Elector { 35 | switch m { 36 | case ModeIdle: 37 | return NewIdleElector(opts) 38 | case ModeSmart: 39 | return NewSmartElector(opts) 40 | } 41 | 42 | panic(fmt.Sprintf("Elector: got unknown mode %s", m)) 43 | } 44 | 45 | // filter filters out the instances which must not be promoted to the master. 46 | func filter(instances []vshard.Instance, opts Options) []vshard.Instance { 47 | filtered := make([]vshard.Instance, 0, len(instances)) 48 | 49 | for i := range instances { 50 | inst := &instances[i] 51 | 52 | // Exclude all followers with negative priority. 53 | if inst.Priority < 0 { 54 | continue 55 | } 56 | 57 | if opts.ReasonableFollowerLSNLag != 0 { 58 | // Exclude followers too far from the master. 59 | if inst.LSNBehindMaster > opts.ReasonableFollowerLSNLag { 60 | continue 61 | } 62 | } 63 | 64 | if opts.ReasonableFollowerIdle != 0 { 65 | // Exclude followers too far from the master. 66 | if inst.Idle() > opts.ReasonableFollowerIdle { 67 | continue 68 | } 69 | } 70 | 71 | filtered = append(filtered, *inst) 72 | } 73 | 74 | return filtered 75 | } 76 | -------------------------------------------------------------------------------- /internal/quorum/elector_test.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | 8 | "github.com/shmel1k/qumomf/internal/vshard" 9 | ) 10 | 11 | func Test_filter(t *testing.T) { 12 | tests := []struct { 13 | name string 14 | opts Options 15 | instances []vshard.Instance 16 | want []vshard.InstanceUUID 17 | }{ 18 | { 19 | name: "ExcludeByPriority", 20 | opts: Options{}, 21 | instances: []vshard.Instance{ 22 | { 23 | UUID: "1", 24 | Priority: -1, 25 | }, 26 | { 27 | UUID: "2", 28 | Priority: 0, 29 | }, 30 | { 31 | UUID: "3", 32 | Priority: 1, 33 | }, 34 | }, 35 | want: []vshard.InstanceUUID{ 36 | "2", "3", 37 | }, 38 | }, 39 | { 40 | name: "ExcludeByLSN", 41 | opts: Options{ 42 | ReasonableFollowerLSNLag: 100, 43 | }, 44 | instances: []vshard.Instance{ 45 | { 46 | UUID: "1", 47 | LSNBehindMaster: 1000, 48 | }, 49 | { 50 | UUID: "2", 51 | LSNBehindMaster: 100, 52 | }, 53 | { 54 | UUID: "3", 55 | LSNBehindMaster: 0, 56 | }, 57 | }, 58 | want: []vshard.InstanceUUID{ 59 | "2", "3", 60 | }, 61 | }, 62 | { 63 | name: "ExcludeByIdle", 64 | opts: Options{ 65 | ReasonableFollowerIdle: 5.5, 66 | }, 67 | instances: []vshard.Instance{ 68 | { 69 | UUID: "1", 70 | Upstream: &vshard.Upstream{ 71 | Status: vshard.UpstreamFollow, 72 | Idle: 7.2, 73 | }, 74 | }, 75 | { 76 | UUID: "2", 77 | Upstream: &vshard.Upstream{ 78 | Status: vshard.UpstreamFollow, 79 | Idle: 5.1, 80 | }, 81 | }, 82 | { 83 | UUID: "3", 84 | Upstream: &vshard.Upstream{ 85 | Status: vshard.UpstreamFollow, 
86 | Idle: 0.86981821060181, 87 | }, 88 | }, 89 | }, 90 | want: []vshard.InstanceUUID{ 91 | "2", "3", 92 | }, 93 | }, 94 | { 95 | name: "ExcludeAll", 96 | opts: Options{ 97 | ReasonableFollowerLSNLag: 100, 98 | ReasonableFollowerIdle: 5.5, 99 | }, 100 | instances: []vshard.Instance{ 101 | { 102 | UUID: "1", 103 | Priority: 0, 104 | LSNBehindMaster: 10, 105 | Upstream: &vshard.Upstream{ 106 | Status: vshard.UpstreamFollow, 107 | Idle: 7.2, 108 | }, 109 | }, 110 | { 111 | UUID: "2", 112 | Priority: -1, 113 | LSNBehindMaster: 0, 114 | Upstream: &vshard.Upstream{ 115 | Status: vshard.UpstreamFollow, 116 | Idle: 0.2, 117 | }, 118 | }, 119 | { 120 | UUID: "3", 121 | Priority: 100, 122 | LSNBehindMaster: 1000, 123 | Upstream: &vshard.Upstream{ 124 | Status: vshard.UpstreamFollow, 125 | Idle: 0.1, 126 | }, 127 | }, 128 | }, 129 | want: []vshard.InstanceUUID{}, 130 | }, 131 | } 132 | 133 | for _, tt := range tests { 134 | tt := tt 135 | t.Run(tt.name, func(t *testing.T) { 136 | got := filter(tt.instances, tt.opts) 137 | uuids := make([]vshard.InstanceUUID, len(got)) 138 | for i, inst := range got { 139 | uuids[i] = inst.UUID 140 | } 141 | assert.Equal(t, tt.want, uuids) 142 | }) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /internal/quorum/idle.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "math" 5 | 6 | "github.com/shmel1k/qumomf/internal/vshard" 7 | ) 8 | 9 | const ( 10 | maxIdle = math.MaxFloat64 11 | ) 12 | 13 | type idleElector struct { 14 | opts Options 15 | } 16 | 17 | // NewIdleElector returns a new elector based on the follower's idle value. 18 | // 19 | // This elector chooses the candidate to be a master selecting 20 | // the follower with a minimum idle value. 
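//
// Candidates are pre-filtered by the shared filter() helper, so followers with
// a negative priority or an unreasonably large LSN lag/idle are never chosen.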
21 | func NewIdleElector(opts Options) Elector { 22 | return &idleElector{ 23 | opts: opts, 24 | } 25 | } 26 | 27 | func (e *idleElector) ChooseMaster(set vshard.ReplicaSet) (vshard.InstanceUUID, error) { 28 | followers := filter(set.AliveFollowers(), e.opts) 29 | if len(followers) == 0 { 30 | return "", ErrNoAliveFollowers 31 | } 32 | 33 | minIdle := maxIdle 34 | minUUID := vshard.InstanceUUID("") 35 | for i := range followers { 36 | r := &followers[i] 37 | 38 | if r.Idle() < minIdle { 39 | minIdle = r.Idle() 40 | minUUID = r.UUID 41 | } 42 | } 43 | 44 | if minUUID == "" { 45 | return "", ErrNoCandidateFound 46 | } 47 | 48 | return minUUID, nil 49 | } 50 | 51 | func (*idleElector) Mode() Mode { 52 | return ModeIdle 53 | } 54 | -------------------------------------------------------------------------------- /internal/quorum/idle_test.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | 8 | "github.com/shmel1k/qumomf/internal/vshard" 9 | ) 10 | 11 | func TestIdleElector(t *testing.T) { 12 | var testData = []struct { 13 | name string 14 | set vshard.ReplicaSet 15 | expectedUUID vshard.InstanceUUID 16 | expectedErr error 17 | }{ 18 | { 19 | name: "ShouldSelectExpectedReplica", 20 | set: vshard.ReplicaSet{ 21 | Instances: []vshard.Instance{ 22 | { 23 | UUID: "1", 24 | LastCheckValid: false, 25 | StorageInfo: vshard.StorageInfo{ 26 | Replication: vshard.Replication{ 27 | Status: vshard.StatusMaster, 28 | }, 29 | }, 30 | }, 31 | { 32 | UUID: "2", 33 | LastCheckValid: true, 34 | Upstream: &vshard.Upstream{ 35 | Status: vshard.UpstreamFollow, 36 | Idle: 0.05, 37 | }, 38 | Downstream: &vshard.Downstream{ 39 | Status: vshard.DownstreamFollow, 40 | }, 41 | StorageInfo: vshard.StorageInfo{ 42 | Replication: vshard.Replication{ 43 | Status: vshard.StatusFollow, 44 | }, 45 | }, 46 | }, 47 | { 48 | UUID: "3", 49 | LastCheckValid: true, 50 | Upstream: &vshard.Upstream{ 51 | Status: vshard.UpstreamFollow, 52 | Idle: 0.1, 53 | }, 54 | Downstream: &vshard.Downstream{ 55 | Status: vshard.DownstreamFollow, 56 | }, 57 | StorageInfo: vshard.StorageInfo{ 58 | Replication: vshard.Replication{ 59 | Status: vshard.StatusFollow, 60 | }, 61 | }, 62 | }, 63 | }, 64 | }, 65 | expectedUUID: "2", 66 | }, 67 | { 68 | name: "NoAliveFollowers_ShouldReturnErr", 69 | set: vshard.ReplicaSet{ 70 | Instances: []vshard.Instance{ 71 | { 72 | UUID: "1", 73 | LastCheckValid: false, 74 | StorageInfo: vshard.StorageInfo{ 75 | Replication: vshard.Replication{ 76 | Status: vshard.StatusMaster, 77 | }, 78 | }, 79 | }, 80 | { 81 | UUID: "2", 82 | LastCheckValid: true, 83 | Upstream: &vshard.Upstream{ 84 | Status: vshard.UpstreamDisconnected, 85 | }, 86 | }, 87 | { // too far from the master 88 | UUID: "3", 89 | LastCheckValid: true, 90 | LSNBehindMaster: 1000, 91 | Upstream: &vshard.Upstream{ 92 | Status: vshard.UpstreamFollow, 93 | Idle: 0.1, 94 | }, 95 | Downstream: &vshard.Downstream{ 96 | Status: vshard.DownstreamFollow, 97 | }, 98 | StorageInfo: vshard.StorageInfo{ 99 | Replication: vshard.Replication{ 100 | Status: vshard.StatusFollow, 101 | }, 102 | }, 103 | }, 104 | { // too far from the master 105 | UUID: "4", 106 | LastCheckValid: true, 107 | LSNBehindMaster: 1, 108 | Upstream: &vshard.Upstream{ 109 | Status: vshard.UpstreamFollow, 110 | Idle: 10, 111 | }, 112 | Downstream: &vshard.Downstream{ 113 | Status: vshard.DownstreamFollow, 114 | }, 115 | StorageInfo: vshard.StorageInfo{ 116 | Replication: 
vshard.Replication{ 117 | Status: vshard.StatusFollow, 118 | }, 119 | }, 120 | }, 121 | }, 122 | }, 123 | expectedErr: ErrNoAliveFollowers, 124 | }, 125 | { 126 | name: "EmptySet_ShouldReturnErr", 127 | set: vshard.ReplicaSet{ 128 | Instances: nil, 129 | }, 130 | expectedErr: ErrNoAliveFollowers, 131 | }, 132 | } 133 | 134 | e := NewIdleElector(Options{ 135 | ReasonableFollowerLSNLag: 100, 136 | ReasonableFollowerIdle: 5, 137 | }) 138 | 139 | for _, v := range testData { 140 | vt := v 141 | t.Run(v.name, func(t *testing.T) { 142 | uuid, err := e.ChooseMaster(vt.set) 143 | assert.Equal(t, vt.expectedErr, err) 144 | assert.Equal(t, vt.expectedUUID, uuid) 145 | }) 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /internal/quorum/smart.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/shmel1k/qumomf/internal/vshard" 7 | ) 8 | 9 | // idleDiffDelta represents the max diff between 10 | // idle values of the followers after which 11 | // they are not treated as almost identical. 12 | const idleDiffDelta = 0.5 // in seconds 13 | 14 | type smartElector struct { 15 | opts Options 16 | } 17 | 18 | // NewSmartElector returns a new elector based on rules: 19 | // - compare vshard configuration consistency, 20 | // - compare upstream status, 21 | // - compare LSN behind the master, 22 | // - compare when replica got last heartbeat signal or data from master, 23 | // - user promotion rules based on instance priorities. 24 | func NewSmartElector(opts Options) Elector { 25 | return &smartElector{ 26 | opts: opts, 27 | } 28 | } 29 | 30 | func (e *smartElector) ChooseMaster(set vshard.ReplicaSet) (vshard.InstanceUUID, error) { 31 | followers := filter(set.AliveFollowers(), e.opts) 32 | if len(followers) == 0 { 33 | return "", ErrNoAliveFollowers 34 | } 35 | 36 | master, err := set.Master() 37 | if err != nil { 38 | return "", err 39 | } 40 | sorter := newInstanceSorter(master, followers) 41 | sort.Sort(sorter) 42 | 43 | return followers[0].UUID, nil 44 | } 45 | 46 | func (e *smartElector) Mode() Mode { 47 | return ModeSmart 48 | } 49 | 50 | // instanceSorter sorts instances by their priority to be a new master. 51 | type instanceSorter struct { 52 | master vshard.Instance 53 | instances []vshard.Instance 54 | } 55 | 56 | func newInstanceSorter(master vshard.Instance, instances []vshard.Instance) *instanceSorter { 57 | return &instanceSorter{ 58 | master: master, 59 | instances: instances, 60 | } 61 | } 62 | 63 | func (s *instanceSorter) Len() int { 64 | return len(s.instances) 65 | } 66 | 67 | func (s *instanceSorter) Swap(i, j int) { 68 | s.instances[i], s.instances[j] = s.instances[j], s.instances[i] 69 | } 70 | 71 | //nolint:gocyclo 72 | func (s *instanceSorter) Less(i, j int) bool { 73 | left, right := s.instances[i], s.instances[j] 74 | 75 | // Prefer replicas with the same vshard configuration as master. 76 | confHash := s.master.VShardFingerprint 77 | if left.VShardFingerprint == confHash && right.VShardFingerprint != confHash { 78 | return true 79 | } 80 | if left.VShardFingerprint != confHash && right.VShardFingerprint == confHash { 81 | return false 82 | } 83 | 84 | // Prefer replicas which have follow upstream status. 
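// (an instance that is actively streaming from the master is a safer
// promotion target than one whose upstream is disconnected or stopped)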
85 | if left.Upstream.Status == vshard.UpstreamFollow && right.Upstream.Status != vshard.UpstreamFollow { 86 | return true 87 | } 88 | if left.Upstream.Status != vshard.UpstreamFollow && right.Upstream.Status == vshard.UpstreamFollow { 89 | return false 90 | } 91 | 92 | // Prefer most up to date replica. 93 | if left.LSNBehindMaster != right.LSNBehindMaster { 94 | // Special case: when replication is broken and replica has been recovered from an old snapshot with 95 | // LSN in front of master LSN. 96 | if left.LSNBehindMaster > 0 && right.LSNBehindMaster < 0 { 97 | return true 98 | } 99 | if left.LSNBehindMaster < 0 && right.LSNBehindMaster > 0 { 100 | return false 101 | } 102 | 103 | return left.LSNBehindMaster < right.LSNBehindMaster 104 | } 105 | 106 | d1 := left.Idle() 107 | d2 := right.Idle() 108 | 109 | if left.Priority != right.Priority && inDelta(d1, d2, idleDiffDelta) { 110 | // If followers are almost equal, use user promotion rules. 111 | return left.Priority > right.Priority 112 | } 113 | 114 | return d1 < d2 115 | } 116 | 117 | func inDelta(d1, d2, delta float64) bool { 118 | diff := d1 - d2 119 | return diff >= -delta && diff <= delta 120 | } 121 | -------------------------------------------------------------------------------- /internal/quorum/smart_test.go: -------------------------------------------------------------------------------- 1 | package quorum 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | 8 | "github.com/shmel1k/qumomf/internal/vshard" 9 | ) 10 | 11 | func Test_smartElector_ChooseMaster(t *testing.T) { 12 | var testData = []struct { 13 | name string 14 | set vshard.ReplicaSet 15 | expectedUUID vshard.InstanceUUID 16 | expectedErr error 17 | }{ 18 | { 19 | name: "ShouldSelectExpectedReplica", 20 | set: vshard.ReplicaSet{ 21 | MasterUUID: "1", 22 | Instances: []vshard.Instance{ 23 | { 24 | UUID: "1", 25 | LastCheckValid: false, 26 | VShardFingerprint: 100, 27 | StorageInfo: vshard.StorageInfo{ 28 | Replication: vshard.Replication{ 29 | Status: vshard.StatusMaster, 30 | }, 31 | }, 32 | }, 33 | { // the best candidate 34 | UUID: "2", 35 | LastCheckValid: true, 36 | LSNBehindMaster: 0, 37 | VShardFingerprint: 100, 38 | Upstream: &vshard.Upstream{ 39 | Status: vshard.UpstreamFollow, 40 | Idle: 0.05, 41 | }, 42 | Downstream: &vshard.Downstream{ 43 | Status: vshard.DownstreamFollow, 44 | }, 45 | StorageInfo: vshard.StorageInfo{ 46 | Replication: vshard.Replication{ 47 | Status: vshard.StatusFollow, 48 | }, 49 | }, 50 | Priority: 100, 51 | }, 52 | { // good candidate but has lower priority 53 | UUID: "3", 54 | LastCheckValid: true, 55 | LSNBehindMaster: 0, 56 | VShardFingerprint: 100, 57 | Upstream: &vshard.Upstream{ 58 | Status: vshard.UpstreamFollow, 59 | Idle: 0.05, 60 | }, 61 | Downstream: &vshard.Downstream{ 62 | Status: vshard.DownstreamFollow, 63 | }, 64 | StorageInfo: vshard.StorageInfo{ 65 | Replication: vshard.Replication{ 66 | Status: vshard.StatusFollow, 67 | }, 68 | }, 69 | Priority: 10, 70 | }, 71 | { // too far from master 72 | UUID: "4", 73 | LastCheckValid: true, 74 | LSNBehindMaster: 10, 75 | VShardFingerprint: 100, 76 | Upstream: &vshard.Upstream{ 77 | Status: vshard.UpstreamFollow, 78 | Idle: 0.1, 79 | }, 80 | Downstream: &vshard.Downstream{ 81 | Status: vshard.DownstreamFollow, 82 | }, 83 | StorageInfo: vshard.StorageInfo{ 84 | Replication: vshard.Replication{ 85 | Status: vshard.StatusFollow, 86 | }, 87 | }, 88 | }, 89 | { // inconsistent vshard configuration 90 | UUID: "5", 91 | LastCheckValid: true, 92 | 
LSNBehindMaster: 0, 93 | VShardFingerprint: 10, 94 | Upstream: &vshard.Upstream{ 95 | Status: vshard.UpstreamFollow, 96 | Idle: 0.0001, 97 | }, 98 | Downstream: &vshard.Downstream{ 99 | Status: vshard.DownstreamFollow, 100 | }, 101 | StorageInfo: vshard.StorageInfo{ 102 | Replication: vshard.Replication{ 103 | Status: vshard.StatusFollow, 104 | }, 105 | }, 106 | }, 107 | }, 108 | }, 109 | expectedUUID: "2", 110 | }, 111 | { 112 | name: "NoAliveFollowers_ShouldReturnErr", 113 | set: vshard.ReplicaSet{ 114 | MasterUUID: "1", 115 | Instances: []vshard.Instance{ 116 | { 117 | UUID: "1", 118 | LastCheckValid: false, 119 | StorageInfo: vshard.StorageInfo{ 120 | Replication: vshard.Replication{ 121 | Status: vshard.StatusMaster, 122 | }, 123 | }, 124 | }, 125 | { 126 | UUID: "2", 127 | LastCheckValid: true, 128 | Upstream: &vshard.Upstream{ 129 | Status: vshard.UpstreamDisconnected, 130 | }, 131 | }, 132 | { // too far from the master 133 | UUID: "3", 134 | LastCheckValid: true, 135 | LSNBehindMaster: 1000, 136 | Upstream: &vshard.Upstream{ 137 | Status: vshard.UpstreamFollow, 138 | Idle: 0.1, 139 | }, 140 | Downstream: &vshard.Downstream{ 141 | Status: vshard.DownstreamFollow, 142 | }, 143 | StorageInfo: vshard.StorageInfo{ 144 | Replication: vshard.Replication{ 145 | Status: vshard.StatusFollow, 146 | }, 147 | }, 148 | }, 149 | { // too far from the master 150 | UUID: "4", 151 | LastCheckValid: true, 152 | LSNBehindMaster: 1, 153 | Upstream: &vshard.Upstream{ 154 | Status: vshard.UpstreamFollow, 155 | Idle: 10, 156 | }, 157 | Downstream: &vshard.Downstream{ 158 | Status: vshard.DownstreamFollow, 159 | }, 160 | StorageInfo: vshard.StorageInfo{ 161 | Replication: vshard.Replication{ 162 | Status: vshard.StatusFollow, 163 | }, 164 | }, 165 | }, 166 | }, 167 | }, 168 | expectedErr: ErrNoAliveFollowers, 169 | }, 170 | { 171 | name: "EmptySet_ShouldReturnErr", 172 | set: vshard.ReplicaSet{ 173 | Instances: nil, 174 | }, 175 | expectedErr: ErrNoAliveFollowers, 176 | }, 177 | } 178 | 179 | e := NewSmartElector(Options{ 180 | ReasonableFollowerLSNLag: 100, 181 | ReasonableFollowerIdle: 5, 182 | }) 183 | 184 | for _, v := range testData { 185 | vt := v 186 | t.Run(v.name, func(t *testing.T) { 187 | uuid, err := e.ChooseMaster(vt.set) 188 | assert.Equal(t, vt.expectedErr, err) 189 | assert.Equal(t, vt.expectedUUID, uuid) 190 | }) 191 | } 192 | } 193 | 194 | func Test_inDelta(t *testing.T) { 195 | tests := []struct { 196 | name string 197 | d1 float64 198 | d2 float64 199 | delta float64 200 | want bool 201 | }{ 202 | { 203 | name: "InDelta", 204 | d1: 0.23, 205 | d2: 0.532, 206 | delta: 1, 207 | want: true, 208 | }, 209 | { 210 | name: "NotInDelta", 211 | d1: 0.23, 212 | d2: 0.532, 213 | delta: 0.1, 214 | want: false, 215 | }, 216 | } 217 | for _, tt := range tests { 218 | tt := tt 219 | t.Run(tt.name, func(t *testing.T) { 220 | assert.Equal(t, tt.want, inDelta(tt.d1, tt.d2, tt.delta)) 221 | }) 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /internal/storage/data.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import "github.com/shmel1k/qumomf/internal/vshard" 4 | 5 | type ClusterSnapshotResp struct { 6 | Name string 7 | Snapshot vshard.Snapshot 8 | } 9 | -------------------------------------------------------------------------------- /internal/storage/sqlite/sqlite.go: -------------------------------------------------------------------------------- 1 | package sqlite 2 | 3 | import ( 4 
| "context" 5 | "database/sql" 6 | "encoding/json" 7 | "errors" 8 | "time" 9 | 10 | "github.com/shmel1k/qumomf/internal/storage" 11 | "github.com/shmel1k/qumomf/internal/vshard" 12 | "github.com/shmel1k/qumomf/internal/vshard/orchestrator" 13 | 14 | // sqlite3 driver 15 | _ "github.com/mattn/go-sqlite3" 16 | ) 17 | 18 | const ( 19 | querySaveSnapshot = `INSERT INTO snapshots(cluster_name, created_at, data) 20 | VALUES(?, ?, ?) 21 | ON CONFLICT(cluster_name) DO UPDATE SET 22 | created_at = excluded.created_at, 23 | data = excluded.data` 24 | querySaveRecoveries = `INSERT INTO recoveries(cluster_name, created_at, data) 25 | VALUES(?, ?, ?)` 26 | initDatabaseQueries = `CREATE TABLE IF NOT EXISTS snapshots ( 27 | "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, 28 | "cluster_name" TEXT UNIQUE, 29 | "created_at" INTEGER, 30 | "data" BLOB 31 | ); 32 | CREATE TABLE IF NOT EXISTS recoveries ( 33 | "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, 34 | "cluster_name" TEXT, 35 | "created_at" INTEGER, 36 | "data" BLOB 37 | )` 38 | queryGetLastSnapshot = `SELECT data 39 | FROM snapshots 40 | WHERE cluster_name = ? 41 | ORDER BY id DESC limit 1` 42 | queryGetRecoveries = `SELECT data 43 | FROM recoveries 44 | WHERE cluster_name = ?` 45 | queryGetClusters = `SELECT cluster_name, data 46 | FROM snapshots` 47 | ) 48 | 49 | var ( 50 | ErrEmptyResult = errors.New("empty result") 51 | ) 52 | 53 | type sqlite struct { 54 | db *sql.DB 55 | config Config 56 | } 57 | 58 | type Config struct { 59 | FileName string 60 | ConnectTimeout time.Duration 61 | QueryTimeout time.Duration 62 | } 63 | 64 | func New(cfg Config) (storage.Storage, error) { 65 | ctx, cancel := context.WithTimeout(context.Background(), cfg.QueryTimeout) 66 | defer cancel() 67 | 68 | db, err := sql.Open("sqlite3", cfg.FileName) 69 | if err != nil { 70 | return &sqlite{}, err 71 | } 72 | 73 | db.SetMaxOpenConns(1) 74 | 75 | err = createTables(ctx, db) 76 | if err != nil { 77 | return nil, err 78 | } 79 | 80 | return &sqlite{ 81 | db: db, 82 | config: cfg, 83 | }, nil 84 | } 85 | 86 | func (s *sqlite) GetClusters(ctx context.Context) ([]storage.ClusterSnapshotResp, error) { 87 | ctx, cancel := context.WithTimeout(ctx, s.config.QueryTimeout) 88 | defer cancel() 89 | 90 | rows, err := s.db.QueryContext(ctx, queryGetClusters) 91 | if err != nil { 92 | return nil, err 93 | } 94 | 95 | resp := make([]storage.ClusterSnapshotResp, 0) 96 | data := make([]byte, 0) 97 | for rows.Next() { 98 | snapResp := storage.ClusterSnapshotResp{} 99 | err = rows.Scan(&snapResp.Name, &data) 100 | if err != nil { 101 | return nil, err 102 | } 103 | 104 | err = json.Unmarshal(data, &snapResp.Snapshot) 105 | if err != nil { 106 | return nil, err 107 | } 108 | 109 | resp = append(resp, snapResp) 110 | } 111 | 112 | return resp, nil 113 | } 114 | 115 | func (s *sqlite) SaveSnapshot(ctx context.Context, clusterName string, snapshot vshard.Snapshot) error { 116 | ctx, cancel := context.WithTimeout(ctx, s.config.QueryTimeout) 117 | defer cancel() 118 | 119 | data, err := json.Marshal(snapshot) 120 | if err != nil { 121 | return err 122 | } 123 | 124 | _, err = s.db.ExecContext(ctx, querySaveSnapshot, clusterName, snapshot.Created, data) 125 | 126 | return err 127 | } 128 | 129 | func (s *sqlite) SaveRecovery(ctx context.Context, recovery orchestrator.Recovery) error { 130 | ctx, cancel := context.WithTimeout(ctx, s.config.QueryTimeout) 131 | defer cancel() 132 | 133 | data, err := json.Marshal(recovery) 134 | if err != nil { 135 | return err 136 | } 137 | 138 | _, err = 
s.db.ExecContext(ctx, querySaveRecoveries, recovery.ClusterName, recovery.EndTimestamp, data) 139 | 140 | return err 141 | } 142 | 143 | func (s *sqlite) GetClusterSnapshot(ctx context.Context, clusterName string) (vshard.Snapshot, error) { 144 | ctx, cancel := context.WithTimeout(ctx, s.config.QueryTimeout) 145 | defer cancel() 146 | 147 | data := make([]byte, 0) 148 | row := s.db.QueryRowContext(ctx, queryGetLastSnapshot, clusterName) 149 | var ns vshard.Snapshot 150 | err := row.Scan(&data) 151 | if err == sql.ErrNoRows { 152 | return ns, ErrEmptyResult 153 | } 154 | if err != nil { 155 | return ns, err 156 | } 157 | return ns, json.Unmarshal(data, &ns) 158 | } 159 | 160 | func (s *sqlite) GetRecoveries(ctx context.Context, clusterName string) ([]orchestrator.Recovery, error) { 161 | ctx, cancel := context.WithTimeout(ctx, s.config.QueryTimeout) 162 | defer cancel() 163 | 164 | data := make([]byte, 0) 165 | resp := make([]orchestrator.Recovery, 0) 166 | rows, err := s.db.QueryContext(ctx, queryGetRecoveries, clusterName) 167 | if err != nil { 168 | return nil, err 169 | } 170 | defer rows.Close() 171 | 172 | for rows.Next() { 173 | err = rows.Scan(&data) 174 | if err != nil { 175 | return nil, err 176 | } 177 | 178 | var recovery orchestrator.Recovery 179 | err = json.Unmarshal(data, &recovery) 180 | if err != nil { 181 | return nil, err 182 | } 183 | 184 | resp = append(resp, recovery) 185 | } 186 | 187 | return resp, rows.Err() 188 | } 189 | 190 | func createTables(ctx context.Context, db *sql.DB) error { 191 | _, err := db.ExecContext(ctx, initDatabaseQueries) 192 | 193 | return err 194 | } 195 | -------------------------------------------------------------------------------- /internal/storage/sqlite/sqlite_test.go: -------------------------------------------------------------------------------- 1 | package sqlite 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "os" 7 | "testing" 8 | "time" 9 | 10 | "github.com/stretchr/testify/assert" 11 | 12 | "github.com/shmel1k/qumomf/internal/storage" 13 | "github.com/shmel1k/qumomf/internal/vshard" 14 | "github.com/shmel1k/qumomf/internal/vshard/orchestrator" 15 | 16 | "github.com/stretchr/testify/require" 17 | "github.com/stretchr/testify/suite" 18 | ) 19 | 20 | var ( 21 | tFileName = "tFileName.db" 22 | tClusterName = "testCluster" 23 | tSnapshot = vshard.Snapshot{ 24 | Created: 123, 25 | Routers: []vshard.Router{}, 26 | ReplicaSets: []vshard.ReplicaSet{}, 27 | } 28 | tRecovery = orchestrator.Recovery{ 29 | Type: "test type", 30 | ClusterName: tClusterName, 31 | } 32 | ) 33 | 34 | var ( 35 | dummyContext = context.Background() 36 | ) 37 | 38 | type storageSuite struct { 39 | suite.Suite 40 | db storage.Storage 41 | sqliteDB *sql.DB 42 | } 43 | 44 | func TestStorage(t *testing.T) { 45 | suite.Run(t, &storageSuite{ 46 | Suite: suite.Suite{}, 47 | }) 48 | } 49 | 50 | func (s *storageSuite) BeforeTest(_, _ string) { 51 | t := s.T() 52 | 53 | sqliteDB, err := sql.Open("sqlite3", tFileName) 54 | require.NoError(t, err) 55 | s.sqliteDB = sqliteDB 56 | 57 | db, err := New(Config{ 58 | FileName: tFileName, 59 | ConnectTimeout: 3 * time.Second, 60 | QueryTimeout: 3 * time.Second, 61 | }) 62 | require.NoError(t, err) 63 | require.NotNil(t, db) 64 | 65 | s.db = db 66 | } 67 | 68 | func (s *storageSuite) AfterTest(_, _ string) { 69 | err := os.Remove(tFileName) 70 | require.NoError(s.T(), err) 71 | } 72 | 73 | func (s *storageSuite) TestEmptyResult() { 74 | t := s.T() 75 | _, err := s.db.GetClusterSnapshot(dummyContext, tClusterName) 76 | require.Equal(t, ErrEmptyResult, err) 77
| } 78 | 79 | func (s *storageSuite) TestSaveSnapshot() { 80 | t := s.T() 81 | err := s.db.SaveSnapshot(dummyContext, tClusterName, tSnapshot) 82 | require.NoError(t, err) 83 | 84 | snap, err := s.db.GetClusterSnapshot(dummyContext, tClusterName) 85 | require.NoError(t, err) 86 | require.Equal(t, tSnapshot, snap) 87 | } 88 | 89 | func (s *storageSuite) TestSaveRecovery() { 90 | t := s.T() 91 | err := s.db.SaveRecovery(dummyContext, tRecovery) 92 | require.NoError(t, err) 93 | 94 | results, err := s.db.GetRecoveries(dummyContext, tClusterName) 95 | require.NoError(t, err) 96 | require.Equal(t, []orchestrator.Recovery{tRecovery}, results) 97 | } 98 | 99 | func (s *storageSuite) TestSaveSnapshot_ShouldNotDuplicateSnapshots() { 100 | t := s.T() 101 | var lastCreatedAt int64 102 | for i := 0; i < 3; i++ { 103 | lastCreatedAt = time.Now().Unix() 104 | err := s.db.SaveSnapshot(dummyContext, tClusterName, vshard.Snapshot{ 105 | Created: lastCreatedAt, 106 | }) 107 | require.NoError(t, err) 108 | } 109 | 110 | snap, err := s.db.GetClusterSnapshot(dummyContext, tClusterName) 111 | require.NoError(t, err) 112 | assert.Equal(t, snap.Created, lastCreatedAt) 113 | 114 | expectedSnapshotsCount := 1 115 | var snapshotsCount int 116 | row := s.sqliteDB.QueryRow("select count(1) from snapshots where cluster_name = ?", tClusterName) 117 | err = row.Scan(&snapshotsCount) 118 | require.NoError(t, err) 119 | assert.Equal(t, expectedSnapshotsCount, snapshotsCount) 120 | } 121 | -------------------------------------------------------------------------------- /internal/storage/storage.go: -------------------------------------------------------------------------------- 1 | package storage 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/shmel1k/qumomf/internal/vshard" 7 | "github.com/shmel1k/qumomf/internal/vshard/orchestrator" 8 | ) 9 | 10 | type Storage interface { 11 | GetClusters(context.Context) ([]ClusterSnapshotResp, error) 12 | SaveSnapshot(context.Context, string, vshard.Snapshot) error 13 | SaveRecovery(context.Context, orchestrator.Recovery) error 14 | GetClusterSnapshot(context.Context, string) (vshard.Snapshot, error) 15 | GetRecoveries(context.Context, string) ([]orchestrator.Recovery, error) 16 | } 17 | -------------------------------------------------------------------------------- /internal/util/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | func Timestamp() int64 { 8 | return time.Now().Unix() 9 | } 10 | 11 | func NewBool(v bool) *bool { 12 | return &v 13 | } 14 | 15 | func NewDuration(v time.Duration) *time.Duration { 16 | return &v 17 | } 18 | 19 | func NewString(v string) *string { 20 | return &v 21 | } 22 | -------------------------------------------------------------------------------- /internal/vshard/alert.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import "strings" 4 | 5 | type AlertType string 6 | 7 | const ( 8 | AlertUnreachableMaster = "UNREACHABLE_MASTER" 9 | AlertUnreachableReplica = "UNREACHABLE_REPLICA" 10 | ) 11 | 12 | type Alert struct { 13 | Type AlertType `json:"type"` 14 | Description string `json:"description"` 15 | } 16 | 17 | func (a Alert) String() string { 18 | var sb strings.Builder 19 | sb.WriteString(string(a.Type)) 20 | sb.WriteString(": ") 21 | sb.WriteRune('"') 22 | sb.WriteString(a.Description) 23 | sb.WriteRune('"') 24 | return sb.String() 25 | } 26 | 
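// Illustrative usage (not part of the original file): String renders an alert
// in a compact, log-friendly form, e.g.
//
//	a := Alert{Type: AlertUnreachableMaster, Description: "master is unreachable"}
//	fmt.Println(a) // prints: UNREACHABLE_MASTER: "master is unreachable"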
-------------------------------------------------------------------------------- /internal/vshard/cluster_test.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import ( 4 | "sort" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | 10 | "github.com/shmel1k/qumomf/internal/util" 11 | ) 12 | 13 | type tExpSet struct { 14 | setUUID ReplicaSetUUID 15 | masterUUID InstanceUUID 16 | instances []tExpInst 17 | } 18 | 19 | type tExpInst struct { 20 | uuid InstanceUUID 21 | uri string 22 | readonly bool 23 | hasUpstream bool 24 | upstreamStatus UpstreamStatus 25 | upstreamPeer string 26 | replicationStatus ReplicationStatus 27 | priority int 28 | } 29 | 30 | func TestCluster_Discover(t *testing.T) { 31 | if testing.Short() { 32 | t.Skip("test requires dev env - skipping it in short mode.") 33 | } 34 | 35 | c := MockCluster() 36 | c.Discover() 37 | 38 | assert.InDelta(t, util.Timestamp(), c.LastDiscovered(), 1000) 39 | 40 | routers := c.Routers() 41 | require.Len(t, routers, 1) 42 | r := routers[0] 43 | assert.Equal(t, "127.0.0.1:9301", r.URI) 44 | 45 | sets := c.ReplicaSets() 46 | sort.SliceStable(sets, func(i, j int) bool { // predictable order 47 | return sets[j].UUID < sets[i].UUID 48 | }) 49 | 50 | expected := []tExpSet{ 51 | { 52 | setUUID: "7432f072-c00b-4498-b1a6-6d9547a8a150", 53 | masterUUID: "a94e7310-13f0-4690-b136-169599e87ba0", 54 | instances: []tExpInst{ 55 | { 56 | uuid: "a94e7310-13f0-4690-b136-169599e87ba0", 57 | uri: "qumomf_1_m.ddk:3301", 58 | readonly: false, 59 | hasUpstream: false, 60 | replicationStatus: StatusMaster, 61 | priority: 0, 62 | }, 63 | { 64 | uuid: "bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1", 65 | uri: "qumomf_1_s.ddk:3301", 66 | readonly: true, 67 | hasUpstream: true, 68 | upstreamStatus: UpstreamFollow, 69 | upstreamPeer: "qumomf@qumomf_1_s.ddk:3301", 70 | replicationStatus: StatusFollow, 71 | priority: 0, 72 | }, 73 | }, 74 | }, 75 | { 76 | setUUID: "5065fb5f-5f40-498e-af79-43887ba3d1ec", 77 | masterUUID: "a3ef657e-eb9a-4730-b420-7ea78d52797d", 78 | instances: []tExpInst{ 79 | { 80 | uuid: "a3ef657e-eb9a-4730-b420-7ea78d52797d", 81 | uri: "qumomf_2_m.ddk:3301", 82 | readonly: false, 83 | hasUpstream: false, 84 | replicationStatus: StatusMaster, 85 | priority: 0, 86 | }, 87 | { 88 | uuid: "bd64dd00-161e-4c99-8b3c-d3c4635e18d2", 89 | uri: "qumomf_2_s_1.ddk:3301", 90 | readonly: true, 91 | hasUpstream: true, 92 | upstreamStatus: UpstreamFollow, 93 | upstreamPeer: "qumomf@qumomf_2_s_1.ddk:3301", 94 | replicationStatus: StatusFollow, 95 | priority: 10, 96 | }, 97 | { 98 | uuid: "cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e", 99 | uri: "qumomf_2_s_2.ddk:3301", 100 | readonly: true, 101 | hasUpstream: true, 102 | upstreamStatus: UpstreamFollow, 103 | upstreamPeer: "qumomf@qumomf_2_s_2.ddk:3301", 104 | replicationStatus: StatusFollow, 105 | priority: 5, 106 | }, 107 | }, 108 | }, 109 | } 110 | 111 | require.Len(t, sets, len(expected)) 112 | 113 | for i, set := range sets { 114 | exp := expected[i] 115 | 116 | assert.Equal(t, exp.setUUID, set.UUID) 117 | assert.Equal(t, exp.masterUUID, set.MasterUUID) 118 | 119 | require.Len(t, set.Instances, len(exp.instances)) 120 | 121 | temp := set 122 | sort.SliceStable(set.Instances, func(i, j int) bool { // predictable order 123 | return temp.Instances[j].UUID > temp.Instances[i].UUID 124 | }) 125 | 126 | for j, inst := range set.Instances { 127 | expInst := exp.instances[j] 128 | 129 | assert.Equal(t, expInst.uuid, inst.UUID) 
130 | assert.Equal(t, expInst.uri, inst.URI) 131 | assert.Equal(t, expInst.readonly, inst.Readonly) 132 | assert.Equal(t, expInst.priority, inst.Priority) 133 | assert.True(t, inst.LastCheckValid) 134 | 135 | upstream := inst.Upstream 136 | if expInst.hasUpstream { 137 | require.NotNil(t, upstream) 138 | assert.Equal(t, expInst.upstreamStatus, upstream.Status) 139 | assert.Equal(t, expInst.upstreamPeer, inst.Upstream.Peer) 140 | assert.Empty(t, inst.Upstream.Message) 141 | } else { 142 | assert.Nil(t, upstream) 143 | } 144 | 145 | assert.Equal(t, expInst.replicationStatus, inst.StorageInfo.Replication.Status) 146 | } 147 | } 148 | } 149 | 150 | func TestCluster_Instance(t *testing.T) { 151 | sets := []ReplicaSet{ 152 | { 153 | UUID: "set_1", 154 | MasterUUID: "set_1_replica_1", 155 | Instances: []Instance{ 156 | { 157 | UUID: "set_1_replica_1", 158 | }, 159 | { 160 | UUID: "set_1_replica_2", 161 | }, 162 | { 163 | UUID: "set_1_replica_3", 164 | }, 165 | }, 166 | }, 167 | { 168 | UUID: "set_2", 169 | MasterUUID: "set_2_replica_2", 170 | Instances: []Instance{ 171 | { 172 | UUID: "set_2_replica_1", 173 | }, 174 | { 175 | UUID: "set_2_replica_2", 176 | }, 177 | }, 178 | }, 179 | } 180 | 181 | c := MockCluster() 182 | c.snapshot = Snapshot{ 183 | Created: util.Timestamp(), 184 | Routers: c.Routers(), 185 | ReplicaSets: sets, 186 | } 187 | 188 | tests := []struct { 189 | name string 190 | uuid InstanceUUID 191 | wantErr bool 192 | }{ 193 | { 194 | name: "KnownUUID_ShouldReturnInstance", 195 | uuid: "set_2_replica_1", 196 | wantErr: false, 197 | }, 198 | { 199 | name: "UnknownUUID_ShouldReturnErr", 200 | uuid: "set_2_replica_1000", 201 | wantErr: true, 202 | }, 203 | } 204 | 205 | for _, tv := range tests { 206 | tt := tv 207 | t.Run(tt.name, func(t *testing.T) { 208 | inst, err := c.Instance(tt.uuid) 209 | if tt.wantErr { 210 | require.NotNil(t, err) 211 | assert.Equal(t, ErrInstanceNotFound, err) 212 | } else { 213 | require.Nil(t, err) 214 | assert.Equal(t, tt.uuid, inst.UUID) 215 | } 216 | }) 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /internal/vshard/instance.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import "strings" 4 | 5 | type InstanceUUID string 6 | 7 | type ReplicationStatus string 8 | type UpstreamStatus string 9 | type DownstreamStatus string 10 | 11 | type HealthCode int 12 | type HealthLevel string 13 | 14 | const ( 15 | StatusFollow ReplicationStatus = "follow" 16 | StatusMaster ReplicationStatus = "master" 17 | StatusDisconnected ReplicationStatus = "disconnected" 18 | ) 19 | 20 | const ( 21 | UpstreamAuth UpstreamStatus = "auth" // the instance is getting authenticated to connect to a replication source. 22 | UpstreamConnecting UpstreamStatus = "connecting" // the instance is trying to connect to the replication source(s) listed in its replication parameter. 23 | UpstreamDisconnected UpstreamStatus = "disconnected" // the instance is not connected to the replica set (due to network problems, not replication errors). 24 | UpstreamFollow UpstreamStatus = "follow" // the replication is in progress. 25 | UpstreamRunning UpstreamStatus = "running" // the instance’s role is “master” (non read-only) and replication is in progress. 26 | UpstreamStopped UpstreamStatus = "stopped" // the replication was stopped due to a replication error (e.g. duplicate key).
27 | UpstreamOrphan UpstreamStatus = "orphan" // the instance has not (yet) succeeded in joining the required number of masters (see orphan status). 28 | UpstreamSync UpstreamStatus = "sync" // the master and replica are synchronizing to have the same data. 29 | ) 30 | 31 | const ( 32 | DownstreamFollow DownstreamStatus = "follow" // the downstream replication is in progress. 33 | DownstreamStopped DownstreamStatus = "stopped" // the downstream replication has stopped. 34 | ) 35 | 36 | const ( 37 | // A replica set works in a regular way. 38 | HealthCodeGreen HealthCode = 0 39 | // There are some issues, but they don’t affect the replica set’s efficiency 40 | // (worth noticing, but they don’t require immediate intervention). 41 | HealthCodeYellow HealthCode = 1 42 | // A replica set is in a degraded state. 43 | HealthCodeOrange HealthCode = 2 44 | // A replica set is disabled. 45 | HealthCodeRed HealthCode = 3 46 | // Fallback for any state not covered above. 47 | HealthCodeUnknown HealthCode = 4 48 | ) 49 | 50 | const ( 51 | HealthLevelGreen HealthLevel = "green" 52 | HealthLevelYellow HealthLevel = "yellow" 53 | HealthLevelOrange HealthLevel = "orange" 54 | HealthLevelRed HealthLevel = "red" 55 | HealthLevelUnknown HealthLevel = "unknown" // fallback for any state not covered above 56 | ) 57 | 58 | type Instance struct { 59 | // ID is a short numeric identifier of the instance within the replica set. 60 | ID uint64 `json:"id"` 61 | 62 | // UUID is a globally unique identifier of the instance. 63 | UUID InstanceUUID `json:"uuid"` 64 | 65 | // URI contains the host IP address and port number of the instance. 66 | URI string `json:"uri"` 67 | 68 | // Readonly indicates whether the instance is readonly or readwrite. 69 | Readonly bool `json:"readonly"` 70 | 71 | // LastCheckValid indicates whether the last check of the instance by qumomf was successful or not. 72 | LastCheckValid bool `json:"last_check_valid"` 73 | 74 | // LSN is the log sequence number (LSN) for the latest entry in the instance’s write ahead log (WAL). 75 | LSN int64 `json:"lsn"` 76 | 77 | // LSNBehindMaster is a measure of how far the replica is behind the master. 78 | LSNBehindMaster int64 `json:"lsn_behind_master"` 79 | 80 | // Upstream contains statistics for the replication data uploaded by the instance. 81 | Upstream *Upstream `json:"upstream"` 82 | 83 | // Downstream contains statistics for the replication data requested and downloaded from the instance. 84 | Downstream *Downstream `json:"downstream"` 85 | 86 | // StorageInfo contains the information about the storage instance. 87 | StorageInfo StorageInfo `json:"storage_info"` 88 | 89 | // VShardFingerprint is a CRC32 hash code of the shard topology configuration. 90 | VShardFingerprint uint64 `json:"vshard_fingerprint"` 91 | 92 | // Priority helps to choose the best candidate during the failover using 93 | // user promotion rules. 94 | // 95 | // If the priority is less than 0, the instance will not participate in the master election. 96 | Priority int `json:"priority"` 97 | } 98 | 99 | // InstanceIdent contains the unique UUID and URI of the instance. 100 | type InstanceIdent struct { 101 | UUID InstanceUUID 102 | URI string 103 | } 104 | 105 | func (ident InstanceIdent) String() string { 106 | var sb strings.Builder 107 | sb.Grow(len(ident.URI) + len(ident.UUID) + 1) 108 | sb.WriteString(string(ident.UUID)) 109 | sb.WriteRune('/') 110 | sb.WriteString(ident.URI) 111 | 112 | return sb.String() 113 | } 114 | 115 | // Upstream contains statistics for the replication data uploaded by the instance.
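// Note (added commentary): the electors in internal/quorum rely on the
// Status and Idle fields below when picking a new master; for example, the
// quorum tests treat a follower whose Idle exceeds the ReasonableFollowerIdle
// option as a non-viable candidate.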
116 | type Upstream struct { 117 | // Peer contains the replication user name, host IP address and port number used for the instance. 118 | Peer string `json:"peer"` 119 | 120 | // Status is the replication status of the instance. 121 | Status UpstreamStatus `json:"status"` 122 | 123 | // Idle is the time (in seconds) since the instance received the last event from a master. 124 | // This is the primary indicator of replication health. 125 | Idle float64 `json:"idle"` 126 | 127 | // Lag is the time difference between the local time at the instance, recorded when the event was received, 128 | // and the local time at another master recorded when the event was written to the write ahead log on that master. 129 | Lag float64 `json:"lag"` 130 | 131 | // Message contains an error message in case of a degraded state, empty otherwise. 132 | Message string `json:"message"` 133 | } 134 | 135 | type Downstream struct { 136 | // Status is the replication status for the downstream replication. 137 | Status DownstreamStatus `json:"status"` 138 | } 139 | 140 | func (i *Instance) Ident() InstanceIdent { 141 | return InstanceIdent{ 142 | UUID: i.UUID, 143 | URI: i.URI, 144 | } 145 | } 146 | 147 | func (i *Instance) HasAlert(t AlertType) bool { 148 | for _, a := range i.StorageInfo.Alerts { 149 | if a.Type == t { 150 | return true 151 | } 152 | } 153 | 154 | return false 155 | } 156 | 157 | func (i *Instance) CriticalCode() HealthCode { 158 | return i.StorageInfo.Status 159 | } 160 | 161 | func (i *Instance) CriticalLevel() HealthLevel { 162 | switch i.CriticalCode() { 163 | case HealthCodeGreen: 164 | return HealthLevelGreen 165 | case HealthCodeYellow: 166 | return HealthLevelYellow 167 | case HealthCodeOrange: 168 | return HealthLevelOrange 169 | case HealthCodeRed: 170 | return HealthLevelRed 171 | } 172 | 173 | return HealthLevelUnknown 174 | } 175 | 176 | func (i *Instance) Idle() float64 { 177 | if i.Upstream == nil { 178 | return 0 179 | } 180 | 181 | return i.Upstream.Idle 182 | } 183 | 184 | func (i *Instance) SameAs(another Instance) bool { 185 | return i.UUID == another.UUID && 186 | i.URI == another.URI && 187 | i.VShardFingerprint == another.VShardFingerprint && 188 | i.ID == another.ID && 189 | i.Readonly == another.Readonly && 190 | i.StorageInfo.Replication.Status == another.StorageInfo.Replication.Status && 191 | i.StorageInfo.Status == another.StorageInfo.Status 192 | } 193 | 194 | // InstanceInfo is a helper structure that contains 195 | // instance info in a custom format. 196 | type InstanceInfo struct { 197 | Readonly bool 198 | VShardFingerprint uint64 199 | StorageInfo StorageInfo 200 | } 201 | 202 | type StorageInfo struct { 203 | // Status indicates the current state of the ReplicaSet. 204 | // It ranges from 0 (green) up to 3 (red).
205 | Status HealthCode `json:"status"` 206 | Replication Replication `json:"replication"` 207 | Bucket InstanceBucket `json:"bucket"` 208 | Alerts []Alert `json:"alerts"` 209 | } 210 | 211 | type Replication struct { 212 | Status ReplicationStatus `json:"status"` 213 | } 214 | 215 | type InstanceBucket struct { 216 | Active int64 `json:"active"` 217 | Garbage int64 `json:"garbage"` 218 | Pinned int64 `json:"pinned"` 219 | Receiving int64 `json:"receiving"` 220 | Sending int64 `json:"sending"` 221 | Total int64 `json:"total"` 222 | } 223 | -------------------------------------------------------------------------------- /internal/vshard/mock.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/shmel1k/qumomf/internal/config" 7 | "github.com/shmel1k/qumomf/internal/util" 8 | ) 9 | 10 | func MockCluster() *Cluster { 11 | return NewCluster("sandbox", config.ClusterConfig{ 12 | Connection: &config.ConnectConfig{ 13 | User: util.NewString("qumomf"), 14 | Password: util.NewString("qumomf"), 15 | ConnectTimeout: util.NewDuration(1 * time.Second), 16 | RequestTimeout: util.NewDuration(1 * time.Second), 17 | }, 18 | ReadOnly: util.NewBool(true), 19 | OverrideURIRules: map[string]string{ 20 | "qumomf_1_m.ddk:3301": "127.0.0.1:9303", 21 | "qumomf_1_s.ddk:3301": "127.0.0.1:9304", 22 | "qumomf_2_m.ddk:3301": "127.0.0.1:9305", 23 | "qumomf_2_s_1.ddk:3301": "127.0.0.1:9306", 24 | "qumomf_2_s_2.ddk:3301": "127.0.0.1:9307", 25 | }, 26 | Priorities: map[string]int{ 27 | "bd64dd00-161e-4c99-8b3c-d3c4635e18d2": 10, 28 | "cc4cfb9c-11d8-4810-84d2-66cfbebb0f6e": 5, 29 | }, 30 | Routers: []config.RouterConfig{ 31 | { 32 | Name: "router_1", 33 | Addr: "127.0.0.1:9301", 34 | }, 35 | }, 36 | }) 37 | } 38 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/analysis.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/hex" 6 | "fmt" 7 | "strconv" 8 | 9 | "github.com/shmel1k/qumomf/internal/vshard" 10 | ) 11 | 12 | type AnalysisWriteStream chan<- *ReplicationAnalysis 13 | type AnalysisReadStream <-chan *ReplicationAnalysis 14 | 15 | func NewAnalysisStream() chan *ReplicationAnalysis { 16 | return make(chan *ReplicationAnalysis) 17 | } 18 | 19 | type ReplicaSetState string 20 | 21 | const ( 22 | NoProblem ReplicaSetState = "NoProblem" 23 | DeadMaster ReplicaSetState = "DeadMaster" 24 | DeadMasterAndFollowers ReplicaSetState = "DeadMasterAndFollowers" 25 | DeadMasterAndSomeFollowers ReplicaSetState = "DeadMasterAndSomeFollowers" 26 | DeadMasterWithoutFollowers ReplicaSetState = "DeadMasterWithoutFollowers" 27 | DeadFollowers ReplicaSetState = "DeadFollowers" 28 | AllMasterFollowersNotReplicating ReplicaSetState = "AllMasterFollowersNotReplicating" 29 | NetworkProblems ReplicaSetState = "NetworkProblems" 30 | MasterMasterReplication ReplicaSetState = "MasterMasterReplication" 31 | InconsistentVShardConfiguration ReplicaSetState = "InconsistentVShardConfiguration" 32 | ) 33 | 34 | var ( 35 | ReplicaSetStateEnum = []ReplicaSetState{ 36 | NoProblem, 37 | DeadMaster, 38 | DeadMasterAndFollowers, 39 | DeadMasterAndSomeFollowers, 40 | DeadMasterWithoutFollowers, 41 | DeadFollowers, 42 | AllMasterFollowersNotReplicating, 43 | NetworkProblems, 44 | MasterMasterReplication, 45 | InconsistentVShardConfiguration, 46 | } 47 | ) 48 | 49 | type ReplicationAnalysis struct { 
50 | Set vshard.ReplicaSet 51 | CountReplicas int // Total number of replicas in the set 52 | CountWorkingReplicas int // Total number of successfully discovered replicas 53 | CountReplicatingReplicas int // Total number of replicas with confirmed replication 54 | CountInconsistentVShardConf int // Total number of replicas whose vshard configuration differs from the master's 55 | State ReplicaSetState 56 | // DeadFollowers lists the followers that are not currently connected to the leader. 57 | DeadFollowers []string 58 | } 59 | 60 | func (a ReplicationAnalysis) String() string { 61 | return fmt.Sprintf( 62 | "[State: %s; CountReplicas: %d; CountWorkingReplicas: %d; CountReplicatingReplicas: %d]", 63 | a.State, a.CountReplicas, a.CountWorkingReplicas, a.CountReplicatingReplicas, 64 | ) 65 | } 66 | 67 | func (a ReplicationAnalysis) GetHash() (string, error) { 68 | h := sha256.New() 69 | 70 | for _, val := range []string{ 71 | string(a.State), 72 | strconv.Itoa(a.CountReplicas), 73 | strconv.Itoa(a.CountWorkingReplicas), 74 | strconv.Itoa(a.CountReplicatingReplicas), 75 | strconv.Itoa(a.CountInconsistentVShardConf), 76 | a.Set.String(), 77 | } { 78 | _, err := h.Write([]byte(val)) 79 | if err != nil { 80 | return "", err 81 | } 82 | } 83 | 84 | return hex.EncodeToString(h.Sum(nil)), nil 85 | } 86 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/config.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/shmel1k/qumomf/internal/quorum" 7 | ) 8 | 9 | type Config struct { 10 | RecoveryPollTime time.Duration 11 | DiscoveryPollTime time.Duration 12 | } 13 | 14 | type FailoverConfig struct { 15 | Hooker *Hooker 16 | Elector quorum.Elector 17 | InstanceRecoveryBlockTime time.Duration 18 | ReplicaSetRecoveryBlockTime time.Duration 19 | } 20 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/failover_test.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/rs/zerolog" 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | "github.com/stretchr/testify/suite" 11 | 12 | "github.com/shmel1k/qumomf/internal/quorum" 13 | "github.com/shmel1k/qumomf/internal/util" 14 | "github.com/shmel1k/qumomf/internal/vshard" 15 | ) 16 | 17 | var ( 18 | tests = []struct { 19 | name string 20 | mode quorum.Mode 21 | opts quorum.Options 22 | }{ 23 | { 24 | name: "IdleElector", 25 | mode: quorum.ModeIdle, 26 | opts: quorum.Options{ 27 | ReasonableFollowerLSNLag: 10, 28 | ReasonableFollowerIdle: 1, 29 | }, 30 | }, 31 | { 32 | name: "SmartElector", 33 | mode: quorum.ModeSmart, 34 | opts: quorum.Options{ 35 | ReasonableFollowerLSNLag: 10, 36 | ReasonableFollowerIdle: 1, 37 | }, 38 | }, 39 | } 40 | ) 41 | 42 | type failoverTestSuite struct { 43 | suite.Suite 44 | 45 | cluster *vshard.Cluster 46 | failover Failover 47 | 48 | logger zerolog.Logger 49 | } 50 | 51 | func newFailoverTestSuite() *failoverTestSuite { 52 | return &failoverTestSuite{ 53 | logger: zerolog.New(zerolog.NewConsoleWriter()).With().Timestamp().Logger(), 54 | } 55 | } 56 | 57 | func (s *failoverTestSuite) SetupTest() { 58 | s.cluster = vshard.MockCluster() 59 | s.cluster.SetReadOnly(false) 60 | } 61 | 62 | func (s *failoverTestSuite) AfterTest(_, _ string) { 63 | if s.failover != nil { 64 |
s.failover.Shutdown() 65 | } 66 | if s.cluster != nil { 67 | s.cluster.Shutdown() 68 | } 69 | } 70 | 71 | func (s *failoverTestSuite) Test_failover_promoteFollowerToMaster() { 72 | t := s.T() 73 | 74 | if testing.Short() { 75 | t.Skip("test requires dev env - skipping it in short mode.") 76 | } 77 | 78 | s.cluster.Discover() 79 | require.InDelta(t, util.Timestamp(), s.cluster.LastDiscovered(), 1) 80 | 81 | for _, tt := range tests { 82 | tt := tt 83 | s.Run(tt.name, func() { 84 | hooker := NewBashHooker(s.logger) 85 | elector := quorum.New(tt.mode, tt.opts) 86 | s.failover = NewDefaultFailover(s.cluster, FailoverConfig{ 87 | Hooker: hooker, 88 | Elector: elector, 89 | ReplicaSetRecoveryBlockTime: 2 * time.Second, 90 | }, s.logger) 91 | fv := s.failover.(*failover) 92 | 93 | stream := NewAnalysisStream() 94 | fv.Serve(stream) 95 | 96 | set, err := s.cluster.ReplicaSet("7432f072-c00b-4498-b1a6-6d9547a8a150") 97 | require.Nil(t, err) 98 | 99 | analysis := &ReplicationAnalysis{ 100 | Set: set, 101 | CountReplicas: 1, 102 | CountWorkingReplicas: 0, 103 | CountReplicatingReplicas: 0, 104 | State: DeadMaster, 105 | } 106 | stream <- analysis 107 | 108 | require.Eventually(t, func() bool { 109 | return fv.hasBlockedRecovery(string(set.UUID)) 110 | }, 5*time.Second, 100*time.Millisecond) 111 | require.Len(t, fv.recoveries, 1) 112 | recv := fv.recoveries[0] 113 | 114 | require.True(t, recv.IsSuccessful) 115 | assert.InDelta(t, util.Timestamp(), recv.StartTimestamp, 5) 116 | assert.InDelta(t, util.Timestamp(), recv.EndTimestamp, 2) 117 | assert.Equal(t, string(analysis.State), recv.Type) 118 | assert.Equal(t, set.MasterUUID, recv.Failed.UUID) 119 | 120 | recvSet, err := s.cluster.ReplicaSet("7432f072-c00b-4498-b1a6-6d9547a8a150") 121 | require.Nil(t, err) 122 | 123 | assert.Equal(t, recv.Successor.UUID, recvSet.MasterUUID) 124 | 125 | master, err := recvSet.Master() 126 | require.Nil(t, err) 127 | assert.False(t, master.Readonly) 128 | 129 | alive := recvSet.AliveFollowers() 130 | assert.Len(t, alive, 1) 131 | for i := range alive { 132 | assert.True(t, alive[i].Readonly) 133 | } 134 | 135 | // Ensure that anti-flapping is working. 136 | analysis.Set = recvSet 137 | stream <- analysis 138 | 139 | require.Len(t, fv.recoveries, 1) 140 | assert.Same(t, recv, fv.recoveries[0]) 141 | 142 | // Recreate the initial cluster. 
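// cleanup(true) resets the registered recoveries, so the replica set is no
// longer blocked and becomes eligible for a new failover (verified just below).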
143 | fv.cleanup(true) 144 | require.False(t, fv.hasBlockedRecovery(string(set.UUID))) 145 | 146 | stream <- analysis 147 | 148 | require.Eventually(t, func() bool { 149 | return fv.hasBlockedRecovery(string(set.UUID)) 150 | }, 5*time.Second, 100*time.Millisecond) 151 | require.Len(t, fv.recoveries, 1) 152 | assert.True(t, recv != fv.recoveries[0]) 153 | 154 | recv = fv.recoveries[0] 155 | assert.True(t, recv.IsSuccessful) 156 | assert.Equal(t, set.MasterUUID, recv.Successor.UUID) 157 | 158 | time.Sleep(1 * time.Second) 159 | }) 160 | } 161 | } 162 | 163 | func (s *failoverTestSuite) Test_failover_applyFollowerRoleToCoMasters() { 164 | t := s.T() 165 | 166 | if testing.Short() { 167 | t.Skip("test requires dev env - skipping it in short mode.") 168 | } 169 | 170 | s.cluster.Discover() 171 | require.InDelta(t, util.Timestamp(), s.cluster.LastDiscovered(), 1) 172 | 173 | for _, tt := range tests { 174 | tt := tt 175 | s.Run(tt.name, func() { 176 | hooker := NewBashHooker(s.logger) 177 | elector := quorum.New(tt.mode, tt.opts) 178 | s.failover = NewDefaultFailover(s.cluster, FailoverConfig{ 179 | Hooker: hooker, 180 | Elector: elector, 181 | ReplicaSetRecoveryBlockTime: 2 * time.Second, 182 | InstanceRecoveryBlockTime: 2 * time.Second, 183 | }, s.logger) 184 | fv := s.failover.(*failover) 185 | 186 | stream := NewAnalysisStream() 187 | fv.Serve(stream) 188 | 189 | set, err := s.cluster.ReplicaSet("7432f072-c00b-4498-b1a6-6d9547a8a150") 190 | require.Nil(t, err) 191 | 192 | invalidUUID := "bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1" 193 | 194 | for i := range set.Instances { 195 | inst := &set.Instances[i] 196 | if inst.UUID == vshard.InstanceUUID(invalidUUID) { 197 | inst.VShardFingerprint = 100 198 | break 199 | } 200 | } 201 | 202 | analysis := &ReplicationAnalysis{ 203 | Set: set, 204 | CountReplicas: 1, 205 | CountWorkingReplicas: 1, 206 | CountReplicatingReplicas: 1, 207 | CountInconsistentVShardConf: 1, 208 | State: MasterMasterReplication, 209 | } 210 | stream <- analysis 211 | 212 | require.Eventually(t, func() bool { 213 | return fv.hasBlockedRecovery(invalidUUID) 214 | }, 5*time.Second, 100*time.Millisecond) 215 | require.Len(t, fv.recoveries, 1) 216 | recv := fv.recoveries[0] 217 | 218 | assert.True(t, recv.IsSuccessful) 219 | assert.Equal(t, string(analysis.State), recv.Type) 220 | assert.Equal(t, invalidUUID, recv.ScopeKey()) 221 | assert.False(t, recv.Expired()) 222 | 223 | time.Sleep(1 * time.Second) 224 | }) 225 | } 226 | } 227 | 228 | func TestFailover(t *testing.T) { 229 | suite.Run(t, newFailoverTestSuite()) 230 | } 231 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/hook.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "strconv" 9 | "strings" 10 | "time" 11 | 12 | "github.com/rs/zerolog" 13 | ) 14 | 15 | type HookType string 16 | 17 | const ( 18 | HookPreFailover HookType = "PreFailover" 19 | HookPostSuccessfulFailover HookType = "PostSuccessfulFailover" 20 | HookPostUnsuccessfulFailover HookType = "PostUnsuccessfulFailover" 21 | ) 22 | 23 | const ( 24 | ShellBash = "bash" 25 | ) 26 | 27 | type Hooker struct { 28 | processesShellCommand string 29 | processes map[HookType][]string 30 | timeout time.Duration 31 | timeoutAsync time.Duration 32 | logger zerolog.Logger 33 | } 34 | 35 | func NewHooker(shell string, logger zerolog.Logger) *Hooker { 36 | return &Hooker{ 37 | 
processesShellCommand: shell, 38 | processes: make(map[HookType][]string), 39 | timeout: 2 * time.Second, 40 | timeoutAsync: 10 * time.Minute, 41 | logger: logger, 42 | } 43 | } 44 | 45 | func NewBashHooker(logger zerolog.Logger) *Hooker { 46 | return NewHooker(ShellBash, logger) 47 | } 48 | 49 | // SetTimeout sets timeout for basic hook. 50 | func (h *Hooker) SetTimeout(t time.Duration) { 51 | h.timeout = t 52 | } 53 | 54 | // SetTimeoutAsync sets timeout for async hook. 55 | func (h *Hooker) SetTimeoutAsync(t time.Duration) { 56 | h.timeoutAsync = t 57 | } 58 | 59 | func (h *Hooker) AddHook(t HookType, commands ...string) { 60 | hooks, ok := h.processes[t] 61 | if !ok { 62 | hooks = make([]string, 0, len(commands)) 63 | } 64 | hooks = append(hooks, commands...) 65 | h.processes[t] = hooks 66 | } 67 | 68 | // ExecuteProcesses executes a list of processes. 69 | func (h *Hooker) ExecuteProcesses(t HookType, recv *Recovery, failOnError bool) (err error) { 70 | processes := h.processes[t] 71 | if len(processes) == 0 { 72 | h.logger.Info().Msgf("No %s hooks to run", t) 73 | return nil 74 | } 75 | 76 | h.logger.Info().Msgf("Running %d %s hooks", len(processes), t) 77 | for i, process := range processes { 78 | command, async := prepareCommand(process, recv) 79 | env := applyEnvironmentVariables(recv) 80 | 81 | fullDescription := fmt.Sprintf("%s hook %d of %d", t, i+1, len(processes)) 82 | if async { 83 | fullDescription = fmt.Sprintf("%s (async)", fullDescription) 84 | } 85 | if async { 86 | go func() { 87 | ctx, cancel := context.WithTimeout(context.Background(), h.timeoutAsync) 88 | // Ignore errors, it is async process. 89 | _ = h.executeProcess(ctx, command, env, fullDescription) 90 | cancel() 91 | }() 92 | } else { 93 | ctx, cancel := context.WithTimeout(context.Background(), h.timeout) 94 | cmdErr := h.executeProcess(ctx, command, env, fullDescription) 95 | cancel() 96 | 97 | if cmdErr != nil { 98 | if failOnError { 99 | h.logger.Warn().Msgf("Not running further %s hooks", t) 100 | return cmdErr 101 | } 102 | if err == nil { 103 | // Keep first error encountered. 104 | err = cmdErr 105 | } 106 | } 107 | } 108 | } 109 | h.logger.Info().Msgf("Done running %s hooks", t) 110 | 111 | return err 112 | } 113 | 114 | func (h *Hooker) executeProcess(ctx context.Context, command string, env []string, fullDescription string) error { 115 | // Log the command to be run and record how long it takes as this may be useful. 116 | h.logger.Info().Msgf("Running %s: %s", fullDescription, command) 117 | start := time.Now() 118 | 119 | cmd := exec.CommandContext(ctx, h.processesShellCommand, "-c", command) //nolint:gosec 120 | cmd.Env = env 121 | 122 | err := cmd.Run() 123 | if err == nil { 124 | h.logger.Info().Msgf("Completed %s in %v", fullDescription, time.Since(start)) 125 | } else { 126 | h.logger.Error().Msgf("Execution of %s failed in %v with error: %v", fullDescription, time.Since(start), err) 127 | } 128 | 129 | return err 130 | } 131 | 132 | // prepareCommand replaces agreed-upon placeholders with recovery data. 
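// For example (illustrative values), the configured hook command
//
//	echo "{failureType} at {failedURI}"
//
// expands to something like: echo "DeadMaster at localhost:3301".
// A leading '&', as in "&sleep 3", marks the command to be run asynchronously.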
133 | func prepareCommand(command string, recv *Recovery) (result string, async bool) { 134 | command = strings.TrimSpace(command) 135 | if strings.HasPrefix(command, "&") { 136 | command = strings.TrimLeft(command, "&") 137 | async = true 138 | } 139 | 140 | analysis := recv.AnalysisEntry 141 | 142 | command = strings.Replace(command, "{failureType}", recv.Type, -1) 143 | command = strings.Replace(command, "{failedUUID}", string(recv.Failed.UUID), -1) 144 | command = strings.Replace(command, "{failedURI}", recv.Failed.URI, -1) 145 | command = strings.Replace(command, "{failureCluster}", recv.ClusterName, -1) 146 | command = strings.Replace(command, "{failureReplicaSetUUID}", string(recv.SetUUID), -1) 147 | command = strings.Replace(command, "{countFollowers}", strconv.Itoa(analysis.CountReplicas), -1) 148 | command = strings.Replace(command, "{countWorkingFollowers}", strconv.Itoa(analysis.CountWorkingReplicas), -1) 149 | command = strings.Replace(command, "{countReplicatingFollowers}", strconv.Itoa(analysis.CountReplicatingReplicas), -1) 150 | command = strings.Replace(command, "{countInconsistentVShardConf}", strconv.Itoa(analysis.CountInconsistentVShardConf), -1) 151 | command = strings.Replace(command, "{isSuccessful}", fmt.Sprint(recv.IsSuccessful), -1) 152 | 153 | if recv.IsSuccessful { 154 | command = strings.Replace(command, "{successorUUID}", string(recv.Successor.UUID), -1) 155 | command = strings.Replace(command, "{successorURI}", recv.Successor.URI, -1) 156 | } 157 | 158 | return command, async 159 | } 160 | 161 | // applyEnvironmentVariables sets the relevant environment variables for a recovery. 162 | //nolint:gocritic 163 | func applyEnvironmentVariables(recv *Recovery) []string { 164 | env := os.Environ() 165 | 166 | env = append(env, fmt.Sprintf("QUM_FAILURE_TYPE=%s", recv.Type)) 167 | env = append(env, fmt.Sprintf("QUM_FAILED_UUID=%s", string(recv.Failed.UUID))) 168 | env = append(env, fmt.Sprintf("QUM_FAILED_URI=%s", recv.Failed.URI)) 169 | env = append(env, fmt.Sprintf("QUM_FAILURE_CLUSTER=%s", recv.ClusterName)) 170 | env = append(env, fmt.Sprintf("QUM_FAILURE_REPLICA_SET_UUID=%s", recv.SetUUID)) 171 | env = append(env, fmt.Sprintf("QUM_COUNT_FOLLOWERS=%d", recv.AnalysisEntry.CountReplicas)) 172 | env = append(env, fmt.Sprintf("QUM_COUNT_WORKING_FOLLOWERS=%d", recv.AnalysisEntry.CountWorkingReplicas)) 173 | env = append(env, fmt.Sprintf("QUM_COUNT_REPLICATING_FOLLOWERS=%d", recv.AnalysisEntry.CountReplicatingReplicas)) 174 | env = append(env, fmt.Sprintf("QUM_COUNT_INCONSISTENT_VSHARD_CONF=%d", recv.AnalysisEntry.CountInconsistentVShardConf)) 175 | env = append(env, fmt.Sprintf("QUM_IS_SUCCESSFUL=%t", recv.IsSuccessful)) 176 | 177 | if recv.IsSuccessful { 178 | env = append(env, fmt.Sprintf("QUM_SUCCESSOR_UUID=%s", recv.Successor.UUID)) 179 | env = append(env, fmt.Sprintf("QUM_SUCCESSOR_URI=%s", recv.Successor.URI)) 180 | } 181 | 182 | return env 183 | } 184 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/hook_test.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | "path" 8 | "strconv" 9 | "strings" 10 | "testing" 11 | "time" 12 | 13 | "github.com/rs/zerolog" 14 | "github.com/stretchr/testify/assert" 15 | "github.com/stretchr/testify/require" 16 | "github.com/stretchr/testify/suite" 17 | 18 | "github.com/shmel1k/qumomf/internal/vshard" 19 | ) 20 | 21 | type hookerTestSuite struct { 22 | 
suite.Suite 23 | 24 | failed vshard.InstanceIdent 25 | analysis *ReplicationAnalysis 26 | recv *Recovery 27 | 28 | logger zerolog.Logger 29 | } 30 | 31 | func (s *hookerTestSuite) SetupTest() { 32 | s.analysis = mockAnalysis 33 | s.failed = vshard.InstanceIdent{ 34 | UUID: s.analysis.Set.MasterUUID, 35 | URI: "localhost:8080", 36 | } 37 | s.recv = NewRecovery(RecoveryScopeSet, s.failed, *s.analysis) 38 | s.recv.ClusterName = "sandbox" 39 | s.logger = zerolog.New(zerolog.NewConsoleWriter()).With().Timestamp().Logger() 40 | } 41 | 42 | func TestHooker(t *testing.T) { 43 | suite.Run(t, &hookerTestSuite{}) 44 | } 45 | 46 | func (s *hookerTestSuite) Test_ExecuteProcesses() { 47 | t := s.T() 48 | 49 | env := []string{ 50 | fmt.Sprintf("QUM_FAILURE_TYPE=%s", s.analysis.State), 51 | fmt.Sprintf("QUM_FAILED_UUID=%s", s.failed.UUID), 52 | fmt.Sprintf("QUM_FAILED_URI=%s", s.failed.URI), 53 | fmt.Sprintf("QUM_FAILURE_CLUSTER=%s", s.recv.ClusterName), 54 | fmt.Sprintf("QUM_FAILURE_REPLICA_SET_UUID=%s", s.analysis.Set.UUID), 55 | fmt.Sprintf("QUM_COUNT_FOLLOWERS=%d", s.analysis.CountReplicas), 56 | fmt.Sprintf("QUM_COUNT_WORKING_FOLLOWERS=%d", s.analysis.CountWorkingReplicas), 57 | fmt.Sprintf("QUM_COUNT_REPLICATING_FOLLOWERS=%d", s.analysis.CountReplicatingReplicas), 58 | fmt.Sprintf("QUM_COUNT_INCONSISTENT_VSHARD_CONF=%d", s.analysis.CountInconsistentVShardConf), 59 | fmt.Sprintf("QUM_IS_SUCCESSFUL=%t", s.recv.IsSuccessful), 60 | } 61 | 62 | hooker := NewBashHooker(s.logger) 63 | 64 | filename := genUniqueFilename(os.TempDir(), "qumomf-hook-test") 65 | require.NotEmpty(t, filename) 66 | defer func() { 67 | _ = os.Remove(filename) 68 | }() 69 | 70 | hooker.AddHook(HookPreFailover, fmt.Sprintf("touch %s", filename)) 71 | hooker.AddHook(HookPreFailover, fmt.Sprintf("echo $(printenv | grep QUM) >> %s", filename)) 72 | 73 | hooker.AddHook(HookPostSuccessfulFailover, fmt.Sprintf("rm -f %s", filename)) 74 | 75 | err := hooker.ExecuteProcesses(HookPreFailover, s.recv, true) 76 | require.Nil(t, err) 77 | 78 | f, err := os.Open(filename) 79 | require.Nil(t, err) 80 | defer func() { _ = f.Close() }() 81 | 82 | foundEnv := make([]string, 0, len(env)) 83 | scanner := bufio.NewScanner(f) 84 | for scanner.Scan() { 85 | line := scanner.Text() 86 | for _, e := range env { 87 | if strings.Contains(line, e) { 88 | foundEnv = append(foundEnv, e) 89 | } 90 | } 91 | } 92 | 93 | assert.Equal(t, env, foundEnv) 94 | 95 | err = hooker.ExecuteProcesses(HookPostSuccessfulFailover, s.recv, false) 96 | assert.Nil(t, err) 97 | } 98 | 99 | func (s *hookerTestSuite) Test_ExecuteProcesses_Async() { 100 | t := s.T() 101 | 102 | hooker := NewBashHooker(s.logger) 103 | 104 | start := time.Now() 105 | hooker.AddHook(HookPreFailover, "&sleep 3") 106 | err := hooker.ExecuteProcesses(HookPreFailover, s.recv, true) 107 | end := time.Now() 108 | assert.Nil(t, err) 109 | assert.WithinDuration(t, start, end, 1*time.Second) 110 | } 111 | 112 | func (s *hookerTestSuite) Test_ExecuteProcesses_CheckArguments() { 113 | t := s.T() 114 | 115 | s.recv.IsSuccessful = true 116 | s.recv.Successor = vshard.InstanceIdent{ 117 | UUID: "successor_uuid", 118 | URI: "successor_uri", 119 | } 120 | 121 | args := []string{ 122 | "failureType", 123 | "failedUUID", 124 | "failedURI", 125 | "failureCluster", 126 | "failureReplicaSetUUID", 127 | "countFollowers", 128 | "countWorkingFollowers", 129 | "countReplicatingFollowers", 130 | "countInconsistentVShardConf", 131 | "isSuccessful", 132 | "successorUUID", 133 | "successorURI", 134 | } 135 | expectedArgs := []string{ 136
| fmt.Sprintf("failureType=%s", s.analysis.State), 137 | fmt.Sprintf("failedUUID=%s", s.failed.UUID), 138 | fmt.Sprintf("failedURI=%s", s.failed.URI), 139 | fmt.Sprintf("failureCluster=%s", s.recv.ClusterName), 140 | fmt.Sprintf("failureReplicaSetUUID=%s", s.analysis.Set.UUID), 141 | fmt.Sprintf("countFollowers=%d", s.analysis.CountReplicas), 142 | fmt.Sprintf("countWorkingFollowers=%d", s.analysis.CountWorkingReplicas), 143 | fmt.Sprintf("countReplicatingFollowers=%d", s.analysis.CountReplicatingReplicas), 144 | fmt.Sprintf("countInconsistentVShardConf=%d", s.analysis.CountInconsistentVShardConf), 145 | fmt.Sprintf("isSuccessful=%t", s.recv.IsSuccessful), 146 | fmt.Sprintf("successorUUID=%s", s.recv.Successor.UUID), 147 | fmt.Sprintf("successorURI=%s", s.recv.Successor.URI), 148 | } 149 | 150 | hooker := NewBashHooker(s.logger) 151 | 152 | filename := genUniqueFilename(os.TempDir(), "qumomf-hook-test") 153 | require.NotEmpty(t, filename) 154 | defer func() { 155 | _ = os.Remove(filename) 156 | }() 157 | 158 | hooker.AddHook(HookPreFailover, fmt.Sprintf("touch %s", filename)) 159 | for _, arg := range args { 160 | hooker.AddHook(HookPreFailover, fmt.Sprintf("echo '%s={%s}' >> %s", arg, arg, filename)) 161 | } 162 | hooker.AddHook(HookPostSuccessfulFailover, fmt.Sprintf("rm -f %s", filename)) 163 | 164 | err := hooker.ExecuteProcesses(HookPreFailover, s.recv, true) 165 | require.Nil(t, err) 166 | 167 | f, err := os.Open(filename) 168 | require.Nil(t, err) 169 | defer func() { _ = f.Close() }() 170 | 171 | foundArgs := make([]string, 0, len(expectedArgs)) 172 | scanner := bufio.NewScanner(f) 173 | for scanner.Scan() { 174 | line := scanner.Text() 175 | for _, e := range expectedArgs { 176 | if strings.Contains(line, e) { 177 | foundArgs = append(foundArgs, e) 178 | } 179 | } 180 | } 181 | 182 | assert.Equal(t, expectedArgs, foundArgs) 183 | 184 | err = hooker.ExecuteProcesses(HookPostSuccessfulFailover, s.recv, false) 185 | assert.Nil(t, err) 186 | } 187 | 188 | func genUniqueFilename(dir, prefix string) string { 189 | name := "" 190 | rand := uint32(0) 191 | for i := 0; i < 1000; i++ { 192 | name = path.Join(dir, prefix+nextRandom(&rand)) 193 | _, err := os.Stat(name) 194 | if os.IsExist(err) { 195 | continue 196 | } 197 | break 198 | } 199 | return name 200 | } 201 | 202 | func reseed() uint32 { 203 | return uint32(time.Now().UnixNano() + int64(os.Getpid())) 204 | } 205 | 206 | func nextRandom(rand *uint32) string { 207 | r := *rand 208 | if r == 0 { 209 | r = reseed() 210 | } 211 | r = r*1664525 + 1013904223 // constants from Numerical Recipes 212 | *rand = r 213 | 214 | return strconv.Itoa(int(1e9 + r%1e9))[1:] 215 | } 216 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/instance_utils.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import "github.com/shmel1k/qumomf/internal/vshard" 4 | 5 | // InstanceFailoverSorter sorts instances by priority to update vshard configuration. 
6 | type InstanceFailoverSorter struct { 7 | instances []vshard.Instance 8 | } 9 | 10 | func NewInstanceFailoverSorter(instances []vshard.Instance) *InstanceFailoverSorter { 11 | return &InstanceFailoverSorter{ 12 | instances: instances, 13 | } 14 | } 15 | 16 | func (s *InstanceFailoverSorter) Len() int { 17 | return len(s.instances) 18 | } 19 | 20 | func (s *InstanceFailoverSorter) Swap(i, j int) { 21 | s.instances[i], s.instances[j] = s.instances[j], s.instances[i] 22 | } 23 | 24 | func (s *InstanceFailoverSorter) Less(i, j int) bool { 25 | left, right := s.instances[i], s.instances[j] 26 | 27 | // Prefer replicas that were polled successfully last time. 28 | if left.LastCheckValid && !right.LastCheckValid { 29 | return true 30 | } 31 | // Prefer instances whose master is unreachable. 32 | if left.HasAlert(vshard.AlertUnreachableMaster) && !right.HasAlert(vshard.AlertUnreachableMaster) { 33 | return true 34 | } 35 | if right.HasAlert(vshard.AlertUnreachableMaster) && !left.HasAlert(vshard.AlertUnreachableMaster) { 36 | return false 37 | } 38 | // Prefer the most up-to-date replica. 39 | return left.Idle() < right.Idle() 40 | } 41 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/instance_utils_test.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "sort" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | 9 | "github.com/shmel1k/qumomf/internal/vshard" 10 | ) 11 | 12 | func TestInstanceFailoverSorter(t *testing.T) { 13 | instances := []vshard.Instance{ 14 | { 15 | UUID: "replica_1", 16 | LastCheckValid: false, 17 | Upstream: &vshard.Upstream{ 18 | Status: vshard.UpstreamFollow, 19 | Idle: 0, 20 | }, 21 | StorageInfo: vshard.StorageInfo{ 22 | Replication: vshard.Replication{ 23 | Status: "", 24 | }, 25 | Alerts: nil, 26 | }, 27 | }, 28 | { 29 | UUID: "replica_2", 30 | LastCheckValid: true, 31 | Upstream: &vshard.Upstream{ 32 | Status: vshard.UpstreamFollow, 33 | Idle: 0.032492704689502716, 34 | }, 35 | StorageInfo: vshard.StorageInfo{ 36 | Replication: vshard.Replication{ 37 | Status: vshard.StatusDisconnected, 38 | }, 39 | Alerts: []vshard.Alert{ 40 | { 41 | Type: vshard.AlertUnreachableMaster, 42 | Description: "Master of replicaset is unreachable: disconnected", 43 | }, 44 | }, 45 | }, 46 | }, 47 | { 48 | UUID: "replica_3", 49 | LastCheckValid: true, 50 | Upstream: &vshard.Upstream{ 51 | Status: vshard.UpstreamFollow, 52 | Idle: 3.479430440813303, 53 | }, 54 | StorageInfo: vshard.StorageInfo{ 55 | Replication: vshard.Replication{ 56 | Status: vshard.StatusDisconnected, 57 | }, 58 | Alerts: []vshard.Alert{ 59 | { 60 | Type: vshard.AlertUnreachableMaster, 61 | Description: "Master of replicaset is unreachable: disconnected", 62 | }, 63 | }, 64 | }, 65 | }, 66 | { 67 | UUID: "replica_4", 68 | LastCheckValid: true, 69 | Upstream: &vshard.Upstream{ 70 | Status: vshard.UpstreamFollow, 71 | Idle: 0.079430440813303, 72 | }, 73 | StorageInfo: vshard.StorageInfo{ 74 | Replication: vshard.Replication{ 75 | Status: vshard.StatusFollow, 76 | }, 77 | Alerts: nil, 78 | }, 79 | }, 80 | { 81 | UUID: "replica_5", 82 | LastCheckValid: true, 83 | Upstream: &vshard.Upstream{ 84 | Status: vshard.UpstreamFollow, 85 | Idle: 0, 86 | }, 87 | StorageInfo: vshard.StorageInfo{ 88 | Replication: vshard.Replication{ 89 | Status: vshard.StatusMaster, 90 | }, 91 | Alerts: nil, 92 | }, 93 | }, 94 | } 95 | 96 | sort.Sort(NewInstanceFailoverSorter(instances)) 97 | 98 |
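// Expected order given the fixtures above: replica_2 and replica_3 come first
// (valid last check plus an unreachable-master alert, lower idle first), then
// replica_5 and replica_4 (valid last check, ordered by idle), and replica_1
// comes last (its last check failed).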
expected := []vshard.InstanceUUID{
99 | "replica_2", "replica_3", "replica_5", "replica_4", "replica_1",
100 | }
101 |
102 | got := make([]vshard.InstanceUUID, 0, len(instances))
103 | for _, inst := range instances {
104 | got = append(got, inst.UUID)
105 | }
106 |
107 | assert.Equal(t, expected, got)
108 | }
109 |
-------------------------------------------------------------------------------- /internal/vshard/orchestrator/monitor.go: --------------------------------------------------------------------------------
1 | package orchestrator
2 |
3 | import (
4 | "time"
5 |
6 | "github.com/rs/zerolog"
7 |
8 | "github.com/shmel1k/qumomf/internal/metrics"
9 | "github.com/shmel1k/qumomf/internal/vshard"
10 | )
11 |
12 | type Monitor interface {
13 | Serve() AnalysisReadStream
14 | Shutdown()
15 | }
16 |
17 | func NewMonitor(cluster *vshard.Cluster, cfg Config, logger zerolog.Logger) Monitor {
18 | return &storageMonitor{
19 | config: cfg,
20 | cluster: cluster,
21 | stop: make(chan struct{}, 1),
22 | logger: logger,
23 | }
24 | }
25 |
26 | type storageMonitor struct {
27 | config Config
28 |
29 | cluster *vshard.Cluster
30 | analyzed int64 // identifier of the last analyzed cluster topology
31 |
32 | stop chan struct{}
33 | logger zerolog.Logger
34 | }
35 |
36 | func (m *storageMonitor) Serve() AnalysisReadStream {
37 | stream := NewAnalysisStream()
38 | go m.continuousDiscovery(stream)
39 |
40 | return stream
41 | }
42 |
43 | func (m *storageMonitor) continuousDiscovery(stream AnalysisWriteStream) {
44 | recoveryTick := time.NewTicker(m.config.RecoveryPollTime)
45 | defer recoveryTick.Stop()
46 | discoveryTick := time.NewTicker(m.config.DiscoveryPollTime)
47 | defer discoveryTick.Stop()
48 |
49 | continuousDiscoveryStartTime := time.Now()
50 | checkAndRecoverWaitPeriod := 3 * m.config.DiscoveryPollTime
51 |
52 | runCheckAndRecoverOperationsTimeRipe := func() bool {
53 | return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod
54 | }
55 |
56 | for {
57 | select {
58 | case <-m.stop:
59 | return
60 | case <-discoveryTick.C:
61 | go m.cluster.Discover()
62 | case <-recoveryTick.C:
63 | // NOTE: we might improve this by checking the delay only on startup.
64 | if runCheckAndRecoverOperationsTimeRipe() {
65 | m.checkCluster(stream)
66 | } else {
67 | m.logger.Info().Msgf("Waiting for %+v seconds to pass before running failure detection/recovery", checkAndRecoverWaitPeriod.Seconds())
68 | }
69 | }
70 | }
71 | }
72 |
73 | func (m *storageMonitor) checkCluster(stream AnalysisWriteStream) {
74 | discovered := m.cluster.LastDiscovered()
75 | if discovered <= m.analyzed {
76 | // Prevent repeated analyses of the same cluster topology.
77 | return
78 | }
79 |
80 | for _, set := range m.cluster.ReplicaSets() {
81 | go func(set vshard.ReplicaSet) {
82 | logger := m.logger.With().Str("replica_set", string(set.UUID)).Logger()
83 | analysis := analyze(set, logger)
84 | if analysis != nil {
85 | stream <- analysis
86 |
87 | for _, state := range ReplicaSetStateEnum {
88 | active := state == analysis.State
89 | metrics.SetShardState(m.cluster.Name, string(set.UUID), string(state), active)
90 | }
91 | }
92 | }(set)
93 | }
94 |
95 | m.analyzed = discovered
96 | }
97 |
98 | func analyze(set vshard.ReplicaSet, logger zerolog.Logger) *ReplicationAnalysis { //nolint: gocyclo
99 | master, err := set.Master()
100 | if err != nil {
101 | // Something went really wrong: we have a data inconsistency here.
102 | // Master UUID not found in ReplicaSet.
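// Skip this analysis round: no recovery decision can be made safely from such a snapshot.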
103 | logger.Error().Msgf("Fatal analysis error: master '%s' not found in the given snapshot. Likely an internal error", set.MasterUUID)
104 | return nil
105 | }
106 |
107 | countReplicas := 0
108 | countWorkingReplicas := 0
109 | countReplicatingReplicas := 0
110 | countInconsistentVShardConf := 0
111 | masterMasterReplication := false
112 | followers := set.Followers()
113 | var deadFollowers []string
114 | for i := range followers {
115 | r := &followers[i]
116 | countReplicas++
117 | if r.LastCheckValid {
118 | countWorkingReplicas++
119 |
120 | status := r.StorageInfo.Replication.Status
121 | switch status {
122 | case vshard.StatusFollow:
123 | countReplicatingReplicas++
124 | case vshard.StatusMaster:
125 | countReplicatingReplicas++
126 | masterMasterReplication = true
127 | logger.Warn().Msgf("Found M-M replication ('%s'-'%s'), ('%s'-'%s')", set.MasterUUID, r.UUID, set.MasterURI, r.URI)
128 | default:
129 | deadFollowers = append(deadFollowers, string(r.UUID))
130 | }
131 |
132 | if r.VShardFingerprint != master.VShardFingerprint {
133 | countInconsistentVShardConf++
134 | }
135 | }
136 | }
137 |
138 | isMasterDead := !master.LastCheckValid // dead from qumomf's point of view
139 |
140 | state := NoProblem
141 | if isMasterDead && countWorkingReplicas == countReplicas && countReplicatingReplicas == 0 {
142 | if countReplicas == 0 {
143 | state = DeadMasterWithoutFollowers
144 | } else {
145 | state = DeadMaster
146 | }
147 | } else if isMasterDead && countWorkingReplicas <= countReplicas && countReplicatingReplicas == 0 {
148 | if countWorkingReplicas == 0 {
149 | state = DeadMasterAndFollowers
150 | } else {
151 | state = DeadMasterAndSomeFollowers
152 | }
153 | } else if isMasterDead && countReplicatingReplicas != 0 {
154 | state = NetworkProblems
155 | } else if !isMasterDead && countReplicas > 0 && countReplicatingReplicas == 0 {
156 | state = AllMasterFollowersNotReplicating
157 | } else if countInconsistentVShardConf > 0 {
158 | if masterMasterReplication {
159 | state = MasterMasterReplication
160 | } else {
161 | state = InconsistentVShardConfiguration
162 | }
163 | } else if !isMasterDead && countReplicas > 0 && countReplicatingReplicas < countReplicas {
164 | state = DeadFollowers
165 | }
166 |
167 | return &ReplicationAnalysis{
168 | Set: set,
169 | CountReplicas: countReplicas,
170 | CountWorkingReplicas: countWorkingReplicas,
171 | CountReplicatingReplicas: countReplicatingReplicas,
172 | CountInconsistentVShardConf: countInconsistentVShardConf,
173 | State: state,
174 | DeadFollowers: deadFollowers,
175 | }
176 | }
177 |
178 | func (m *storageMonitor) Shutdown() {
179 | m.stop <- struct{}{}
180 | }
181 |
-------------------------------------------------------------------------------- /internal/vshard/orchestrator/monitor_test.go: --------------------------------------------------------------------------------
1 | package orchestrator
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/rs/zerolog"
8 | "github.com/stretchr/testify/assert"
9 | "github.com/stretchr/testify/require"
10 |
11 | "github.com/shmel1k/qumomf/internal/vshard"
12 | )
13 |
14 | func Test_storageMonitor_analyze(t *testing.T) {
15 | logger := zerolog.Nop()
16 |
17 | tests := []struct {
18 | name string
19 | set vshard.ReplicaSet
20 | want *ReplicationAnalysis
21 | }{
22 | {
23 | name: "NoProblem",
24 | set: vshard.ReplicaSet{
25 | UUID: "set_1",
26 | MasterUUID: "replica_1",
27 | Instances: []vshard.Instance{
28 | mockInstance(1, true, vshard.StatusMaster),
29 | mockInstance(2, true, vshard.StatusFollow),
30 |
mockInstance(3, true, vshard.StatusFollow),
31 | },
32 | },
33 | want: &ReplicationAnalysis{
34 | CountReplicas: 2,
35 | CountWorkingReplicas: 2,
36 | CountReplicatingReplicas: 2,
37 | State: NoProblem,
38 | },
39 | },
40 | {
41 | name: "NoProblem_MasterMasterReplication",
42 | set: vshard.ReplicaSet{
43 | UUID: "set_1",
44 | MasterUUID: "replica_1",
45 | Instances: []vshard.Instance{
46 | mockInstance(1, true, vshard.StatusMaster),
47 | mockInstance(2, true, vshard.StatusMaster),
48 | mockInstance(3, true, vshard.StatusFollow),
49 | },
50 | },
51 | want: &ReplicationAnalysis{
52 | CountReplicas: 2,
53 | CountWorkingReplicas: 2,
54 | CountReplicatingReplicas: 2,
55 | State: NoProblem,
56 | },
57 | },
58 | {
59 | name: "DeadMaster",
60 | set: vshard.ReplicaSet{
61 | UUID: "set_1",
62 | MasterUUID: "replica_1",
63 | Instances: []vshard.Instance{
64 | mockInstance(1, false, vshard.StatusMaster),
65 | mockInstance(2, true, vshard.StatusDisconnected),
66 | mockInstance(3, true, vshard.StatusDisconnected),
67 | },
68 | },
69 | want: &ReplicationAnalysis{
70 | CountReplicas: 2,
71 | CountWorkingReplicas: 2,
72 | CountReplicatingReplicas: 0,
73 | State: DeadMaster,
74 | },
75 | },
94 | {
95 | name: "DeadMasterAndFollowers",
96 | set: vshard.ReplicaSet{
97 | UUID: "set_1",
98 | MasterUUID: "replica_1",
99 | Instances: []vshard.Instance{
100 | mockInstance(1, false, vshard.StatusMaster),
101 | mockInstance(2, false, vshard.StatusDisconnected),
102 | mockInstance(3, false, vshard.StatusDisconnected),
103 | },
104 | },
105 | want: &ReplicationAnalysis{
106 | CountReplicas: 2,
107 | CountWorkingReplicas: 0,
108 | CountReplicatingReplicas: 0,
109 | State: DeadMasterAndFollowers,
110 | },
111 | },
112 | {
113 | name: "DeadMasterAndSomeFollowers",
114 | set: vshard.ReplicaSet{
115 | UUID: "set_1",
116 | MasterUUID: "replica_1",
117 | Instances: []vshard.Instance{
118 | mockInstance(1, false, vshard.StatusMaster),
119 | mockInstance(2, false, vshard.StatusDisconnected),
120 | mockInstance(3, true, vshard.StatusDisconnected),
121 | },
122 | },
123 | want: &ReplicationAnalysis{
124 | CountReplicas: 2,
125 | CountWorkingReplicas: 1,
126 | CountReplicatingReplicas: 0,
127 | State: DeadMasterAndSomeFollowers,
128 | },
129 | },
130 | {
131 | name: "DeadMasterWithoutFollowers",
132 | set: vshard.ReplicaSet{
133 | UUID: "set_1",
134 | MasterUUID: "replica_1",
135 | Instances: []vshard.Instance{
136 | mockInstance(1, false, vshard.StatusMaster),
137 | },
138 | },
139 | want: &ReplicationAnalysis{
140 | CountReplicas: 0,
141 | CountWorkingReplicas: 0,
142 | CountReplicatingReplicas: 0,
143 | State: DeadMasterWithoutFollowers,
144 | },
145 | },
146 | {
147 | name: "DeadFollowers",
148 | set: vshard.ReplicaSet{
149 | UUID: "set_1",
150 | MasterUUID: "replica_1",
151 | Instances: []vshard.Instance{
152 | mockInstance(1, true, vshard.StatusMaster),
153 | mockInstance(2, true, vshard.StatusFollow),
154 | mockInstance(3, false, vshard.StatusDisconnected),
155 | mockInstance(4, false, vshard.StatusDisconnected),
156 | },
157 | },
158 | want:
&ReplicationAnalysis{ 159 | CountReplicas: 3, 160 | CountWorkingReplicas: 1, 161 | CountReplicatingReplicas: 1, 162 | State: DeadFollowers, 163 | }, 164 | }, 165 | { 166 | name: "AllMasterFollowersNotReplicating", 167 | set: vshard.ReplicaSet{ 168 | UUID: "set_1", 169 | MasterUUID: "replica_1", 170 | Instances: []vshard.Instance{ 171 | mockInstance(1, true, vshard.StatusMaster), 172 | mockInstance(2, false, vshard.StatusFollow), 173 | mockInstance(3, true, vshard.StatusDisconnected), 174 | }, 175 | }, 176 | want: &ReplicationAnalysis{ 177 | CountReplicas: 2, 178 | CountWorkingReplicas: 1, 179 | CountReplicatingReplicas: 0, 180 | State: AllMasterFollowersNotReplicating, 181 | }, 182 | }, 183 | { 184 | name: "NetworkProblems", 185 | set: vshard.ReplicaSet{ 186 | UUID: "set_1", 187 | MasterUUID: "replica_1", 188 | Instances: []vshard.Instance{ 189 | mockInstance(1, false, vshard.StatusMaster), 190 | mockInstance(2, true, vshard.StatusFollow), 191 | mockInstance(3, true, vshard.StatusFollow), 192 | }, 193 | }, 194 | want: &ReplicationAnalysis{ 195 | CountReplicas: 2, 196 | CountWorkingReplicas: 2, 197 | CountReplicatingReplicas: 2, 198 | State: NetworkProblems, 199 | }, 200 | }, 201 | { 202 | name: "MasterMasterReplication", 203 | set: vshard.ReplicaSet{ 204 | UUID: "set_1", 205 | MasterUUID: "replica_1", 206 | Instances: []vshard.Instance{ 207 | mockInstance(1, true, vshard.StatusMaster), 208 | mockInvalidVShardConf(mockInstance(2, true, vshard.StatusMaster)), 209 | mockInstance(3, true, vshard.StatusFollow), 210 | }, 211 | }, 212 | want: &ReplicationAnalysis{ 213 | CountReplicas: 2, 214 | CountWorkingReplicas: 2, 215 | CountReplicatingReplicas: 2, 216 | CountInconsistentVShardConf: 1, 217 | State: MasterMasterReplication, 218 | }, 219 | }, 220 | { 221 | name: "InconsistentVShardConfiguration", 222 | set: vshard.ReplicaSet{ 223 | UUID: "set_1", 224 | MasterUUID: "replica_1", 225 | Instances: []vshard.Instance{ 226 | mockInstance(1, true, vshard.StatusMaster), 227 | mockInstance(2, true, vshard.StatusFollow), 228 | mockInvalidVShardConf(mockInstance(3, true, vshard.StatusFollow)), 229 | }, 230 | }, 231 | want: &ReplicationAnalysis{ 232 | CountReplicas: 2, 233 | CountWorkingReplicas: 2, 234 | CountReplicatingReplicas: 2, 235 | CountInconsistentVShardConf: 1, 236 | State: InconsistentVShardConfiguration, 237 | }, 238 | }, 239 | } 240 | 241 | for _, tv := range tests { 242 | tt := tv 243 | t.Run(tt.name, func(t *testing.T) { 244 | got := analyze(tt.set, logger) 245 | require.NotNil(t, got) 246 | assert.Equal(t, tt.want.CountReplicas, got.CountReplicas) 247 | assert.Equal(t, tt.want.CountWorkingReplicas, got.CountWorkingReplicas) 248 | assert.Equal(t, tt.want.CountReplicatingReplicas, got.CountReplicatingReplicas) 249 | assert.Equal(t, tt.want.State, got.State) 250 | }) 251 | } 252 | } 253 | 254 | func mockInstance(id int, valid bool, status vshard.ReplicationStatus) vshard.Instance { 255 | return vshard.Instance{ 256 | UUID: vshard.InstanceUUID(fmt.Sprintf("replica_%d", id)), 257 | URI: fmt.Sprintf("replica_%d:3306", id), 258 | LastCheckValid: valid, 259 | StorageInfo: vshard.StorageInfo{ 260 | Replication: vshard.Replication{ 261 | Status: status, 262 | }, 263 | }, 264 | } 265 | } 266 | 267 | func mockInvalidVShardConf(inst vshard.Instance) vshard.Instance { 268 | inst.VShardFingerprint = 1000 269 | return inst 270 | } 271 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/recovery.go: 
--------------------------------------------------------------------------------
1 | package orchestrator
2 |
3 | import (
4 | "context"
5 | "strconv"
6 | "strings"
7 | "time"
8 |
9 | "github.com/shmel1k/qumomf/internal/util"
10 | "github.com/shmel1k/qumomf/internal/vshard"
11 | )
12 |
13 | // recoveryTimeFormat is a datetime format used in logs.
14 | const recoveryTimeFormat = time.RFC3339
15 |
16 | // RecoveryFunc is a function executed by the orchestrator in case of a failover.
17 | // It returns the list of recoveries applied to the cluster, replica set or instances.
18 | type RecoveryFunc func(ctx context.Context, analysis *ReplicationAnalysis) []*Recovery
19 |
20 | type RecoveryScope string
21 |
22 | const (
23 | RecoveryScopeInstance RecoveryScope = "instance"
24 | RecoveryScopeSet RecoveryScope = "replica set"
25 | )
26 |
27 | // Recovery describes a recovery applied to a cluster, replica set or instance.
28 | type Recovery struct {
29 | Type string
30 | Scope RecoveryScope
31 | AnalysisEntry ReplicationAnalysis
32 | ClusterName string
33 | SetUUID vshard.ReplicaSetUUID
34 | Failed vshard.InstanceIdent
35 | Successor vshard.InstanceIdent
36 | IsSuccessful bool
37 | StartTimestamp int64
38 | EndTimestamp int64
39 | Expiration int64
40 | }
41 |
42 | func NewRecovery(scope RecoveryScope, failed vshard.InstanceIdent, analysis ReplicationAnalysis) *Recovery {
43 | return &Recovery{
44 | Type: string(analysis.State),
45 | Scope: scope,
46 | AnalysisEntry: analysis,
47 | SetUUID: analysis.Set.UUID,
48 | Failed: failed,
49 | StartTimestamp: util.Timestamp(),
50 | }
51 | }
52 |
53 | func (r *Recovery) ExpireAfter(ttl time.Duration) {
54 | exp := time.Now().Add(ttl).Unix()
55 | r.Expiration = exp
56 | }
57 |
58 | // ScopeKey returns the UUID of the replica set or instance
59 | // on which the recovery has been applied.
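// It falls back to the cluster name when the scope is neither an instance nor a replica set.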
60 | func (r *Recovery) ScopeKey() string { 61 | switch r.Scope { 62 | case RecoveryScopeInstance: 63 | return string(r.Failed.UUID) 64 | case RecoveryScopeSet: 65 | return string(r.SetUUID) 66 | } 67 | 68 | return r.ClusterName 69 | } 70 | 71 | func (r *Recovery) Expired() bool { 72 | now := util.Timestamp() 73 | return r.Expiration < now 74 | } 75 | 76 | func (r *Recovery) String() string { 77 | start := time.Unix(r.StartTimestamp, 0).Format(recoveryTimeFormat) 78 | end := time.Unix(r.EndTimestamp, 0).Format(recoveryTimeFormat) 79 | duration := r.EndTimestamp - r.StartTimestamp 80 | 81 | var sb strings.Builder 82 | sb.WriteString("set: ") 83 | sb.WriteString(string(r.SetUUID)) 84 | sb.WriteString(", type: ") 85 | sb.WriteString(r.Type) 86 | sb.WriteString(", failed: ") 87 | sb.WriteString(string(r.Failed.UUID)) 88 | if r.Successor.UUID != "" { 89 | sb.WriteString(", successor: ") 90 | sb.WriteString(string(r.Successor.UUID)) 91 | } 92 | sb.WriteString(", success: ") 93 | sb.WriteString(strconv.FormatBool(r.IsSuccessful)) 94 | sb.WriteString(", period: ") 95 | sb.WriteString(start) 96 | sb.WriteString(" - ") 97 | sb.WriteString(end) 98 | sb.WriteString(", duration: ") 99 | sb.WriteString(strconv.FormatInt(duration, 10)) 100 | sb.WriteString("s") 101 | 102 | return sb.String() 103 | } 104 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/recovery_test.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | 9 | "github.com/shmel1k/qumomf/internal/util" 10 | "github.com/shmel1k/qumomf/internal/vshard" 11 | ) 12 | 13 | var mockAnalysis = &ReplicationAnalysis{ 14 | Set: vshard.ReplicaSet{ 15 | UUID: "set_uuid", 16 | MasterUUID: "master_uuid", 17 | }, 18 | CountReplicas: 3, 19 | CountWorkingReplicas: 0, 20 | CountReplicatingReplicas: 0, 21 | State: DeadMaster, 22 | } 23 | 24 | func TestNewRecovery(t *testing.T) { 25 | ttl := 100 * time.Second 26 | failed := vshard.InstanceIdent{ 27 | UUID: "master", 28 | URI: "localhost:3301", 29 | } 30 | r := NewRecovery(RecoveryScopeSet, failed, *mockAnalysis) 31 | r.ExpireAfter(ttl) 32 | 33 | assert.Equal(t, *mockAnalysis, r.AnalysisEntry) 34 | assert.Equal(t, mockAnalysis.Set.UUID, r.SetUUID) 35 | assert.Equal(t, failed.UUID, r.Failed.UUID) 36 | assert.Equal(t, failed.URI, r.Failed.URI) 37 | assert.Equal(t, string(DeadMaster), r.Type) 38 | assert.InDelta(t, util.Timestamp(), r.StartTimestamp, 5) 39 | assert.InDelta(t, time.Now().Add(ttl).UTC().Unix(), r.Expiration, 1) 40 | } 41 | 42 | func TestRecovery_Expired(t *testing.T) { 43 | ttl := 1 * time.Second 44 | failed := vshard.InstanceIdent{ 45 | UUID: "master", 46 | URI: "localhost:3301", 47 | } 48 | r := NewRecovery(RecoveryScopeInstance, failed, *mockAnalysis) 49 | r.ExpireAfter(ttl) 50 | 51 | assert.False(t, r.Expired()) 52 | time.Sleep(2 * ttl) 53 | assert.True(t, r.Expired()) 54 | } 55 | -------------------------------------------------------------------------------- /internal/vshard/orchestrator/sampler.go: -------------------------------------------------------------------------------- 1 | package orchestrator 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/rs/zerolog" 7 | ) 8 | 9 | type sampler struct { 10 | enabled bool 11 | fingerprints map[string]string 12 | mu *sync.RWMutex 13 | } 14 | 15 | func (s *sampler) sample(analysis *ReplicationAnalysis) zerolog.Level { 16 | if !s.enabled { 17 | 
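// Sampling is disabled: log every analysis at the info level.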
return zerolog.InfoLevel 18 | } 19 | 20 | got, err := analysis.GetHash() 21 | if err != nil { 22 | return zerolog.InfoLevel 23 | } 24 | s.mu.RLock() 25 | found, ok := s.fingerprints[string(analysis.Set.UUID)] 26 | s.mu.RUnlock() 27 | if ok && found == got { 28 | return zerolog.DebugLevel 29 | } 30 | 31 | s.mu.Lock() 32 | defer s.mu.Unlock() 33 | s.fingerprints[string(analysis.Set.UUID)] = got 34 | 35 | return zerolog.InfoLevel 36 | } 37 | -------------------------------------------------------------------------------- /internal/vshard/parser_test.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestParseRouterInfo(t *testing.T) { 13 | if testing.Short() { 14 | t.Skip("test requires dev env - skipping it in short mode.") 15 | } 16 | 17 | conn := setupConnection("127.0.0.1:9301", ConnOptions{ 18 | User: "qumomf", 19 | Password: "qumomf", 20 | ConnectTimeout: 1 * time.Second, 21 | QueryTimeout: 1 * time.Second, 22 | }) 23 | 24 | ctx := context.Background() 25 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 26 | defer cancel() 27 | 28 | resp := conn.Exec(ctx, vshardRouterInfoQuery) 29 | if resp.Error != nil { 30 | require.Nil(t, resp.Error, resp.String()) 31 | } 32 | 33 | info, err := ParseRouterInfo(resp.Data) 34 | require.Nil(t, err) 35 | 36 | assert.Equal(t, int64(0), info.Status) 37 | assert.Empty(t, info.Alerts) 38 | 39 | b := RouterBucket{ 40 | AvailableRO: 0, 41 | AvailableRW: 120, 42 | Unknown: 0, 43 | Unreachable: 0, 44 | } 45 | assert.Equal(t, b, info.Bucket) 46 | 47 | expected := RouterReplicaSetParameters{ 48 | "7432f072-c00b-4498-b1a6-6d9547a8a150": RouterInstanceParameters{ 49 | UUID: "a94e7310-13f0-4690-b136-169599e87ba0", 50 | Status: InstanceAvailable, 51 | URI: "qumomf_1_m.ddk:3301", 52 | NetworkTimeout: 0.5, 53 | }, 54 | "5065fb5f-5f40-498e-af79-43887ba3d1ec": RouterInstanceParameters{ 55 | UUID: "a3ef657e-eb9a-4730-b420-7ea78d52797d", 56 | Status: InstanceAvailable, 57 | URI: "qumomf_2_m.ddk:3301", 58 | NetworkTimeout: 0.5, 59 | }, 60 | } 61 | 62 | require.Len(t, info.ReplicaSets, len(expected)) 63 | for uuid, set := range info.ReplicaSets { 64 | expSet, ok := expected[uuid] 65 | require.True(t, ok) 66 | 67 | assert.Equal(t, expSet.UUID, set.UUID) 68 | assert.Equal(t, expSet.Status, set.Status) 69 | assert.Equal(t, expSet.URI, set.URI) 70 | assert.InDelta(t, expSet.NetworkTimeout, set.NetworkTimeout, 1.0) 71 | } 72 | } 73 | 74 | func TestParseReplication(t *testing.T) { 75 | if testing.Short() { 76 | t.Skip("test requires dev env - skipping it in short mode.") 77 | } 78 | 79 | conn := setupConnection("127.0.0.1:9303", ConnOptions{ 80 | User: "qumomf", 81 | Password: "qumomf", 82 | ConnectTimeout: 1 * time.Second, 83 | QueryTimeout: 1 * time.Second, 84 | }) 85 | 86 | ctx := context.Background() 87 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 88 | defer cancel() 89 | 90 | resp := conn.Exec(ctx, vshardBoxInfoQuery) 91 | if resp.Error != nil { 92 | require.Nil(t, resp.Error, resp.String()) 93 | } 94 | 95 | data, err := ParseReplication(resp.Data) 96 | require.Nil(t, err) 97 | 98 | assert.Len(t, data, 2) 99 | 100 | master := data[0] 101 | assert.Equal(t, uint64(1), master.ID) 102 | assert.Equal(t, InstanceUUID("a94e7310-13f0-4690-b136-169599e87ba0"), master.UUID) 103 | assert.Equal(t, "", master.URI) // No upstream data for master, URI must 
be set manually 104 | assert.Equal(t, int64(105), master.LSN) 105 | assert.Equal(t, int64(0), master.LSNBehindMaster) 106 | assert.Nil(t, master.Upstream) 107 | assert.Nil(t, master.Downstream) 108 | 109 | replica := data[1] 110 | assert.Equal(t, uint64(2), replica.ID) 111 | assert.Equal(t, InstanceUUID("bd1095d1-1e73-4ceb-8e2f-6ebdc7838cb1"), replica.UUID) 112 | assert.Equal(t, "qumomf_1_s.ddk:3301", replica.URI) 113 | assert.Equal(t, int64(0), replica.LSN) 114 | assert.Equal(t, int64(0), replica.LSNBehindMaster) 115 | require.NotNil(t, replica.Upstream) 116 | assert.Equal(t, UpstreamFollow, replica.Upstream.Status) 117 | require.NotNil(t, replica.Downstream) 118 | assert.Equal(t, DownstreamFollow, replica.Downstream.Status) 119 | } 120 | 121 | func TestParseInstanceInfo(t *testing.T) { 122 | if testing.Short() { 123 | t.Skip("test requires dev env - skipping it in short mode.") 124 | } 125 | 126 | conn := setupConnection("127.0.0.1:9304", ConnOptions{ 127 | User: "qumomf", 128 | Password: "qumomf", 129 | ConnectTimeout: 1 * time.Second, 130 | QueryTimeout: 1 * time.Second, 131 | }) 132 | 133 | ctx := context.Background() 134 | ctx, cancel := context.WithTimeout(ctx, 5*time.Second) 135 | defer cancel() 136 | 137 | resp := conn.Exec(ctx, vshardInstanceInfoQuery) 138 | if resp.Error != nil { 139 | require.Nil(t, resp.Error, resp.String()) 140 | } 141 | 142 | data, err := ParseInstanceInfo(resp.Data) 143 | require.Nil(t, err) 144 | 145 | assert.True(t, data.Readonly) 146 | assert.Equal(t, uint64(251215738), data.VShardFingerprint) 147 | 148 | storage := &data.StorageInfo 149 | assert.Equal(t, HealthCodeGreen, storage.Status) 150 | 151 | replication := &storage.Replication 152 | assert.Equal(t, StatusFollow, replication.Status) 153 | 154 | assert.Empty(t, storage.Alerts) 155 | 156 | b := InstanceBucket{ 157 | Active: 60, 158 | Garbage: 0, 159 | Pinned: 0, 160 | Receiving: 0, 161 | Sending: 0, 162 | Total: 60, 163 | } 164 | assert.Equal(t, b, storage.Bucket) 165 | } 166 | 167 | func TestParseReplication_TableTests(t *testing.T) { 168 | tests := []struct { 169 | name string 170 | data [][]interface{} 171 | want []Instance 172 | }{ 173 | { 174 | name: "regular_case", 175 | data: [][]interface{}{ 176 | { 177 | map[string]interface{}{ 178 | "id": int64(1), 179 | "uuid": "uuid", 180 | "lsn": int64(1), 181 | "lsn_behind_master": int64(1), 182 | "downstream": map[string]interface{}{ 183 | "idle": int64(1), 184 | "status": "follow", 185 | }, 186 | "upstream": map[string]interface{}{ 187 | "idle": int64(1), 188 | "lag": int64(1), 189 | "peer": "test@test", 190 | "status": "follow", 191 | }, 192 | }, 193 | }, 194 | }, 195 | want: []Instance{ 196 | { 197 | URI: "test", 198 | ID: 1, 199 | UUID: "uuid", 200 | LSN: 1, 201 | LSNBehindMaster: 1, 202 | Downstream: &Downstream{Status: DownstreamFollow}, 203 | Upstream: &Upstream{ 204 | Status: UpstreamFollow, 205 | Idle: 1, 206 | Peer: "test@test", 207 | Lag: 1, 208 | }, 209 | }, 210 | }, 211 | }, 212 | { 213 | name: "response_array_with_gaps_should_ignore_it", 214 | data: [][]interface{}{ 215 | { 216 | nil, 217 | map[string]interface{}{ 218 | "id": int64(1), 219 | "uuid": "uuid", 220 | "lsn": int64(1), 221 | "lsn_behind_master": int64(1), 222 | }, 223 | }, 224 | }, 225 | want: []Instance{ 226 | { 227 | ID: 1, 228 | UUID: "uuid", 229 | LSN: 1, 230 | LSNBehindMaster: 1, 231 | }, 232 | }, 233 | }, 234 | } 235 | 236 | for _, tt := range tests { 237 | tc := tt 238 | t.Run(tt.name, func(t *testing.T) { 239 | got, err := ParseReplication(tc.data) 240 | 241 | 
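// Both the complete entry and the sparse one with gaps must parse without an error.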
assert.NoError(t, err)
242 | assert.Equal(t, tc.want, got)
243 | })
244 | }
245 | }
246 |
-------------------------------------------------------------------------------- /internal/vshard/replicaset.go: --------------------------------------------------------------------------------
1 | package vshard
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 | "strconv"
7 | "strings"
8 | )
9 |
10 | type ReplicaSetUUID string
11 |
12 | type ReplicaSet struct {
13 | // UUID is a unique identifier of the replica set in the cluster.
14 | UUID ReplicaSetUUID `json:"uuid"`
15 |
16 | // MasterUUID is the UUID of the current master in the replica set.
17 | MasterUUID InstanceUUID `json:"master_uuid"`
18 |
19 | // MasterURI is the URI of the current master in the replica set.
20 | MasterURI string `json:"master_uri"`
21 |
22 | // Instances contains replication statistics and storage info
23 | // for all instances in the replica set with regard to the current master.
24 | Instances []Instance `json:"instances"`
25 | }
26 |
27 | func (set ReplicaSet) Copy() ReplicaSet {
28 | r := ReplicaSet{
29 | UUID: set.UUID,
30 | MasterUUID: set.MasterUUID,
31 | MasterURI: set.MasterURI,
32 | Instances: make([]Instance, len(set.Instances)),
33 | }
34 | copy(r.Instances, set.Instances)
35 |
36 | return r
37 | }
38 |
39 | func (set ReplicaSet) SameAs(another *ReplicaSet) bool {
40 | if set.UUID != another.UUID {
41 | return false
42 | }
43 |
44 | n := len(set.Instances)
45 | if set.MasterUUID != another.MasterUUID || n != len(another.Instances) {
46 | return false
47 | }
48 |
49 | instances := set.Instances
50 | anotherInstances := another.Instances
51 | sortInstances(instances)
52 | sortInstances(anotherInstances)
53 |
54 | for i := 0; i < n; i++ {
55 | if !instances[i].SameAs(anotherInstances[i]) {
56 | return false
57 | }
58 | }
59 |
60 | return true
61 | }
62 |
63 | func sortInstances(instances []Instance) {
64 | sort.Slice(instances, func(i, j int) bool {
65 | return instances[i].UUID < instances[j].UUID
66 | })
67 | }
68 |
69 | func (set ReplicaSet) HealthStatus() (code HealthCode, level HealthLevel) {
70 | master, err := set.Master()
71 | if err != nil {
72 | return HealthCodeUnknown, HealthLevelUnknown
73 | }
74 |
75 | return master.CriticalCode(), master.CriticalLevel()
76 | }
77 |
78 | func (set ReplicaSet) Followers() []Instance {
79 | if len(set.Instances) == 0 {
80 | return []Instance{}
81 | }
82 |
83 | followers := make([]Instance, 0, len(set.Instances)-1)
84 | for _, inst := range set.Instances { //nolint:gocritic
85 | if inst.UUID != set.MasterUUID {
86 | followers = append(followers, inst)
87 | }
88 | }
89 |
90 | return followers
91 | }
92 |
93 | func (set ReplicaSet) AliveFollowers() []Instance {
94 | if len(set.Instances) == 0 {
95 | return []Instance{}
96 | }
97 |
98 | followers := make([]Instance, 0, len(set.Instances)-1)
99 | for _, inst := range set.Instances { // nolint:gocritic
100 | if inst.UUID == set.MasterUUID {
101 | continue
102 | }
103 |
104 | if !inst.LastCheckValid {
105 | continue
106 | }
107 |
108 | upstream := inst.Upstream
109 | downstream := inst.Downstream
110 |
111 | if upstream == nil && downstream == nil {
112 | continue
113 | }
114 |
115 | if upstream != nil {
116 | if upstream.Status != UpstreamDisconnected && upstream.Status != UpstreamStopped {
117 | followers = append(followers, inst)
118 | }
119 | } else if downstream != nil {
120 | if downstream.Status != DownstreamStopped {
121 | followers = append(followers, inst)
122 | }
123 | }
124 | }
125 |
126 | return followers
127 | }
128 |
129 | func (set
ReplicaSet) Master() (Instance, error) { 130 | for i := range set.Instances { 131 | inst := &set.Instances[i] 132 | if inst.UUID == set.MasterUUID { 133 | return *inst, nil 134 | } 135 | } 136 | 137 | return Instance{}, fmt.Errorf("replica set `%s` has invalid topology snapshot: master `%s` not found", set.UUID, set.MasterUUID) 138 | } 139 | 140 | func (set ReplicaSet) String() string { 141 | // Minimal style, only important info. 142 | var sb strings.Builder 143 | sb.WriteString("id: ") 144 | sb.WriteString(string(set.UUID)) 145 | sb.WriteString("; master_uuid: ") 146 | sb.WriteString(string(set.MasterUUID)) 147 | sb.WriteString("; master_uri: ") 148 | sb.WriteString(set.MasterURI) 149 | sb.WriteString("; size: ") 150 | sb.WriteString(strconv.Itoa(len(set.Instances))) 151 | sb.WriteString("; health: ") 152 | _, cl := set.HealthStatus() 153 | sb.WriteString(string(cl)) 154 | 155 | if cl == HealthLevelGreen { 156 | return sb.String() 157 | } 158 | 159 | sb.WriteString("; alerts: [") 160 | prettyList := false 161 | for i := range set.Instances { 162 | inst := &set.Instances[i] 163 | alerts := inst.StorageInfo.Alerts 164 | if len(alerts) > 0 { 165 | if prettyList { 166 | sb.WriteString(", ") 167 | } 168 | sb.WriteString(inst.URI) 169 | sb.WriteString(" -> ") 170 | for j, alert := range alerts { 171 | sb.WriteString(alert.String()) 172 | if j != len(alerts)-1 { 173 | sb.WriteString(", ") 174 | } 175 | } 176 | prettyList = true 177 | } 178 | } 179 | sb.WriteString("]") 180 | 181 | return sb.String() 182 | } 183 | -------------------------------------------------------------------------------- /internal/vshard/replicaset_test.go: -------------------------------------------------------------------------------- 1 | package vshard 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/assert" 7 | ) 8 | 9 | func TestReplicaSet_Followers(t *testing.T) { 10 | type fields struct { 11 | UUID ReplicaSetUUID 12 | MasterUUID InstanceUUID 13 | Instances []Instance 14 | } 15 | 16 | tests := []struct { 17 | name string 18 | fields fields 19 | want []InstanceUUID 20 | }{ 21 | { 22 | name: "NoFollowers", 23 | fields: fields{ 24 | UUID: "uuid_1", 25 | MasterUUID: "master_uuid_1", 26 | Instances: []Instance{}, 27 | }, 28 | want: []InstanceUUID{}, 29 | }, 30 | { 31 | name: "MultipleFollowers", 32 | fields: fields{ 33 | UUID: "uuid_1", 34 | MasterUUID: "master_uuid_1", 35 | Instances: []Instance{ 36 | { 37 | UUID: "master_uuid_1", 38 | }, 39 | { 40 | UUID: "replica_uuid_1", 41 | }, 42 | { 43 | UUID: "replica_uuid_2", 44 | }, 45 | }, 46 | }, 47 | want: []InstanceUUID{"replica_uuid_1", "replica_uuid_2"}, 48 | }, 49 | } 50 | 51 | for _, tv := range tests { 52 | tt := tv 53 | t.Run(tt.name, func(t *testing.T) { 54 | set := ReplicaSet{ 55 | UUID: tt.fields.UUID, 56 | MasterUUID: tt.fields.MasterUUID, 57 | Instances: tt.fields.Instances, 58 | } 59 | 60 | followers := set.Followers() 61 | got := make([]InstanceUUID, 0, len(followers)) 62 | for _, f := range followers { 63 | got = append(got, f.UUID) 64 | } 65 | 66 | assert.Equal(t, tt.want, got) 67 | }) 68 | } 69 | } 70 | 71 | func TestReplicaSet_AliveFollowers(t *testing.T) { 72 | type fields struct { 73 | UUID ReplicaSetUUID 74 | MasterUUID InstanceUUID 75 | Instances []Instance 76 | } 77 | 78 | tests := []struct { 79 | name string 80 | fields fields 81 | want []InstanceUUID 82 | }{ 83 | { 84 | name: "NoFollowers", 85 | fields: fields{ 86 | UUID: "uuid_1", 87 | MasterUUID: "master_uuid_1", 88 | Instances: []Instance{}, 89 | }, 90 | want: []InstanceUUID{}, 
91 | },
92 | {
93 | name: "MultipleFollowers",
94 | fields: fields{
95 | UUID: "uuid_1",
96 | MasterUUID: "master_uuid_1",
97 | Instances: []Instance{
98 | {
99 | UUID: "master_uuid_1",
100 | Upstream: &Upstream{
101 | Status: UpstreamRunning,
102 | },
103 | },
104 | {
105 | UUID: "replica_uuid_1",
106 | LastCheckValid: true,
107 | Upstream: &Upstream{
108 | Status: UpstreamFollow,
109 | },
110 | Downstream: &Downstream{
111 | Status: DownstreamFollow,
112 | },
113 | },
114 | {
115 | UUID: "replica_uuid_2",
116 | LastCheckValid: true,
117 | Upstream: &Upstream{
118 | Status: UpstreamFollow,
119 | },
120 | Downstream: &Downstream{
121 | Status: DownstreamFollow,
122 | },
123 | },
124 | {
125 | UUID: "replica_uuid_3",
126 | LastCheckValid: true,
127 | Upstream: &Upstream{
128 | Status: UpstreamStopped,
129 | },
130 | },
131 | {
132 | UUID: "replica_uuid_4",
133 | LastCheckValid: false,
134 | Upstream: &Upstream{
135 | Status: UpstreamFollow,
136 | },
137 | },
138 | },
139 | },
140 | want: []InstanceUUID{"replica_uuid_1", "replica_uuid_2"},
141 | },
142 | }
143 |
144 | for _, tv := range tests {
145 | tt := tv
146 | t.Run(tt.name, func(t *testing.T) {
147 | set := &ReplicaSet{
148 | UUID: tt.fields.UUID,
149 | MasterUUID: tt.fields.MasterUUID,
150 | Instances: tt.fields.Instances,
151 | }
152 |
153 | followers := set.AliveFollowers()
154 | got := make([]InstanceUUID, 0, len(followers))
155 | for _, f := range followers {
156 | got = append(got, f.UUID)
157 | }
158 |
159 | assert.Equal(t, tt.want, got)
160 | })
161 | }
162 | }
163 |
-------------------------------------------------------------------------------- /internal/vshard/router.go: --------------------------------------------------------------------------------
1 | package vshard
2 |
3 | type InstanceStatus string
4 |
5 | const (
6 | InstanceAvailable InstanceStatus = "available"
7 | InstanceUnreachable InstanceStatus = "unreachable"
8 | InstanceMissing InstanceStatus = "missing"
9 | )
10 |
11 | type Router struct {
12 | URI string `json:"uri"`
13 | Info RouterInfo `json:"info"`
14 | }
15 |
16 | func NewRouter(uri string) Router {
17 | return Router{
18 | URI: uri,
19 | Info: RouterInfo{
20 | Status: -1,
21 | },
22 | }
23 | }
24 |
25 | type RouterInfo struct {
26 | LastSeen int64 `json:"last_seen"`
27 | ReplicaSets RouterReplicaSetParameters `json:"replica_sets"`
28 | Bucket RouterBucket `json:"bucket"`
29 | Status int64 `json:"status"`
30 | Alerts []Alert `json:"alerts"`
31 | }
32 |
33 | type RouterReplicaSetParameters map[ReplicaSetUUID]RouterInstanceParameters
34 |
35 | type RouterInstanceParameters struct {
36 | UUID InstanceUUID `json:"uuid"`
37 | Status InstanceStatus `json:"status"`
38 | URI string `json:"uri"`
39 | NetworkTimeout float64 `json:"network_timeout"`
40 | }
41 |
42 | // RouterBucket represents bucket parameters known to the router.
43 | type RouterBucket struct {
44 | // AvailableRO is the number of buckets known to the router
45 | // and available for read requests.
46 | AvailableRO int64 `json:"available_ro"`
47 |
48 | // AvailableRW is the number of buckets known to the router
49 | // and available for read and write requests.
50 | AvailableRW int64 `json:"available_rw"`
51 |
52 | // Unknown is the number of buckets
53 | // whose replica sets are not known to the router.
54 | Unknown int64 `json:"unknown"`
55 |
56 | // Unreachable is the number of buckets known to the router
57 | // but unavailable for any requests.
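// In a healthy cluster both Unknown and Unreachable are expected to stay at zero.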
58 | Unreachable int64 `json:"unreachable"`
59 | }
60 |
-------------------------------------------------------------------------------- /internal/vshard/snapshot.go: --------------------------------------------------------------------------------
1 | package vshard
2 |
3 | // Snapshot is a copy of the cluster topology at a given point in time.
4 | type Snapshot struct {
5 | Created int64 `json:"created"`
6 | Routers []Router `json:"routers"`
7 | ReplicaSets []ReplicaSet `json:"replica_sets"`
8 | priorities map[string]int
9 | }
10 |
11 | func (s *Snapshot) ClusterHealthLevel() HealthLevel {
12 | hc := HealthCodeGreen
13 | for _, replicaSet := range s.ReplicaSets {
14 | gotHC, _ := replicaSet.HealthStatus()
15 | if gotHC > hc {
16 | hc = gotHC
17 | }
18 | }
19 |
20 | return s.healthLevel(hc)
21 | }
22 |
23 | func (s *Snapshot) healthLevel(healthCode HealthCode) HealthLevel {
24 | switch healthCode {
25 | case HealthCodeGreen:
26 | return HealthLevelGreen
27 | case HealthCodeYellow:
28 | return HealthLevelYellow
29 | case HealthCodeOrange:
30 | return HealthLevelOrange
31 | case HealthCodeRed:
32 | return HealthLevelRed
33 | }
34 |
35 | return HealthLevelUnknown
36 | }
37 |
38 | func (s *Snapshot) Copy() Snapshot {
39 | dst := Snapshot{
40 | Created: s.Created,
41 | Routers: make([]Router, len(s.Routers)),
42 | ReplicaSets: make([]ReplicaSet, 0, len(s.ReplicaSets)),
43 | priorities: make(map[string]int),
44 | }
45 |
46 | for key, value := range s.priorities {
47 | dst.priorities[key] = value
48 | }
49 |
50 | for _, set := range s.ReplicaSets {
51 | dst.ReplicaSets = append(dst.ReplicaSets, set.Copy())
52 | }
53 |
54 | copy(dst.Routers, s.Routers)
55 |
56 | return dst
57 | }
58 |
59 | func (s *Snapshot) TopologyOf(uuid ReplicaSetUUID) ([]Instance, error) {
60 | for _, set := range s.ReplicaSets {
61 | if set.UUID == uuid {
62 | return set.Instances, nil
63 | }
64 | }
65 |
66 | return []Instance{}, ErrReplicaSetNotFound
67 | }
68 |
69 | func (s *Snapshot) ReplicaSet(uuid ReplicaSetUUID) (ReplicaSet, error) {
70 | for _, set := range s.ReplicaSets {
71 | if set.UUID == uuid {
72 | return set, nil
73 | }
74 | }
75 |
76 | return ReplicaSet{}, ErrReplicaSetNotFound
77 | }
78 |
79 | func (s *Snapshot) UpdatePriorities(priorities map[string]int) {
80 | s.priorities = priorities
81 |
82 | for i := range s.ReplicaSets {
83 | set := &s.ReplicaSets[i]
84 | for j := range set.Instances {
85 | inst := &set.Instances[j]
86 | if priority, ok := s.priorities[string(inst.UUID)]; ok {
87 | inst.Priority = priority
88 | } else {
89 | inst.Priority = 0
90 | }
91 | }
92 | }
93 | }
94 |
-------------------------------------------------------------------------------- /internal/vshard/tarantool.go: --------------------------------------------------------------------------------
1 | package vshard
2 |
3 | import (
4 | "context"
5 | "strings"
6 | "sync"
7 | "time"
8 |
9 | "github.com/viciious/go-tarantool"
10 | )
11 |
12 | const maxRetries = 2 // TODO: move to config
13 |
14 | var tntRetryableErrors = []uint{
15 | tarantool.ErrNoConnection,
16 | tarantool.ErrTimeout,
17 | }
18 |
19 | type ConnPool interface {
20 | Get(uri string) *Connector
21 | Close()
22 | }
23 |
24 | type ConnOptions struct {
25 | User string
26 | Password string
27 | ConnectTimeout time.Duration
28 | QueryTimeout time.Duration
29 | }
30 |
31 | type OverrideURIRules map[string]string
32 |
33 | type pool struct {
34 | template ConnOptions
35 | rules OverrideURIRules
36 |
37 | m map[string]*Connector
38 | mutex sync.RWMutex
39 | }
40 |
41 | func
NewConnPool(template ConnOptions, rules OverrideURIRules) ConnPool {
42 | return &pool{
43 | template: template,
44 | rules: rules,
45 | m: make(map[string]*Connector),
46 | }
47 | }
48 |
49 | func (p *pool) Get(uri string) *Connector {
50 | u := removeUserInfo(uri)
51 | u = overrideURI(u, p.rules)
52 |
53 | p.mutex.RLock()
54 | conn, ok := p.m[u]
55 | p.mutex.RUnlock()
56 | if ok {
57 | return conn
58 | }
59 |
60 | p.mutex.Lock()
61 | defer p.mutex.Unlock()
62 | // Double check under the write lock: another goroutine may have created the connector.
63 | // The deferred unlock also fixes a deadlock: the early return below used to leave the mutex locked.
64 | conn, ok = p.m[u]
65 | if ok {
66 | return conn
67 | }
68 | conn = setupConnection(u, p.template)
69 | p.m[u] = conn
70 |
71 | return conn
72 | }
73 |
74 | func overrideURI(uri string, rules OverrideURIRules) string {
75 | u, ok := rules[uri]
76 | if ok {
77 | return u
78 | }
79 | return uri
80 | }
81 |
82 | func (p *pool) Close() {
83 | p.mutex.Lock()
84 | for _, conn := range p.m {
85 | conn.Close()
86 | }
87 | p.mutex.Unlock()
88 | }
89 |
90 | func removeUserInfo(uri string) string {
91 | if idx := strings.IndexByte(uri, '@'); idx >= 0 {
92 | return uri[idx+1:]
93 | }
94 | return uri
95 | }
96 |
97 | type Connector struct {
98 | conn *tarantool.Connector
99 | }
100 |
101 | func (c *Connector) Exec(ctx context.Context, q tarantool.Query, opts ...tarantool.ExecOption) *tarantool.Result {
102 | var resp *tarantool.Result
103 | for i := 0; i < maxRetries; i++ {
104 | conn, err := c.conn.Connect()
105 | if err != nil {
106 | return &tarantool.Result{
107 | Error: err,
108 | }
109 | }
110 |
111 | select {
112 | case <-ctx.Done():
113 | return &tarantool.Result{
114 | Error: tarantool.NewContextError(ctx, conn, "Exec error"),
115 | ErrorCode: tarantool.ErrTimeout,
116 | }
117 | default:
118 | }
119 |
120 | resp = conn.Exec(ctx, q, opts...)
121 | if resp.Error != nil && isRetryable(resp.ErrorCode) {
122 | conn.Close()
123 | continue
124 | }
125 | return resp
126 | }
127 |
128 | return resp
129 | }
130 |
131 | func (c *Connector) Close() {
132 | c.conn.Close()
133 | }
134 |
135 | func setupConnection(uri string, c ConnOptions) *Connector {
136 | cfg := &tarantool.Options{
137 | User: c.User,
138 | Password: c.Password,
139 | ConnectTimeout: c.ConnectTimeout,
140 | QueryTimeout: c.QueryTimeout,
141 | }
142 |
143 | conn := tarantool.New(uri, cfg)
144 | return &Connector{
145 | conn: conn,
146 | }
147 | }
148 |
149 | func isRetryable(code uint) bool {
150 | for _, rc := range tntRetryableErrors {
151 | if rc == code {
152 | return true
153 | }
154 | }
155 |
156 | return false
157 | }
158 |
-------------------------------------------------------------------------------- /internal/vshard/tarantool_test.go: --------------------------------------------------------------------------------
1 | package vshard
2 |
3 | import (
4 | "strconv"
5 | "strings"
6 | "sync"
7 | "testing"
8 |
9 | "github.com/stretchr/testify/assert"
10 | "github.com/stretchr/testify/require"
11 | )
12 |
13 | func Test_removeUserInfo(t *testing.T) {
14 | tests := []struct {
15 | name string
16 | uri string
17 | want string
18 | }{
19 | {
20 | name: "NoUserInfo_ShouldReturnTheSameUri",
21 | uri: "tarantool.repl:3301",
22 | want: "tarantool.repl:3301",
23 | },
24 | {
25 | name: "Username_ShouldReturnHostAndPort",
26 | uri: "qumomf@tarantool.repl:3301",
27 | want: "tarantool.repl:3301",
28 | },
29 | {
30 | name: "UsernameAndPass_ShouldReturnHostAndPort",
31 | uri: "qumomf:qumomf@tarantool.repl:3301",
32 | want: "tarantool.repl:3301",
33 | },
34 | }
35 | for _, tv := range tests {
36 | tt := tv
37 | t.Run(tt.name, func(t *testing.T) {
38 | got :=
removeUserInfo(tt.uri) 39 | assert.Equal(t, tt.want, got) 40 | }) 41 | } 42 | } 43 | 44 | func Test_overrideURI(t *testing.T) { 45 | type args struct { 46 | uri string 47 | rules OverrideURIRules 48 | } 49 | tests := []struct { 50 | name string 51 | args args 52 | want string 53 | }{ 54 | { 55 | name: "NoRules_ShouldReturnTheSameUri", 56 | args: args{ 57 | uri: "tarantool.repl:3301", 58 | rules: nil, 59 | }, 60 | want: "tarantool.repl:3301", 61 | }, 62 | { 63 | name: "NoSuitableRule_ShouldReturnTheSameUri", 64 | args: args{ 65 | uri: "tarantool.repl:3301", 66 | rules: OverrideURIRules{ 67 | "tarantool2.repl:3301": "tnt2.repl:3301", 68 | "tarantool.repl:8801": "tnt.repl:8801", 69 | }, 70 | }, 71 | want: "tarantool.repl:3301", 72 | }, 73 | { 74 | name: "RuleApplied_ShouldReturnOverridden", 75 | args: args{ 76 | uri: "tarantool.repl:3301", 77 | rules: OverrideURIRules{ 78 | "tarantool.repl:3301": "tnt.repl:3301", 79 | "tarantool.repl:8801": "tnt.repl:8801", 80 | }, 81 | }, 82 | want: "tnt.repl:3301", 83 | }, 84 | } 85 | for _, tv := range tests { 86 | tt := tv 87 | t.Run(tt.name, func(t *testing.T) { 88 | got := overrideURI(tt.args.uri, tt.args.rules) 89 | assert.Equal(t, tt.want, got) 90 | }) 91 | } 92 | } 93 | 94 | func TestPool_Get(t *testing.T) { 95 | connOpts := ConnOptions{ 96 | User: "qumomf", 97 | Password: "qumomf", 98 | } 99 | p := NewConnPool(connOpts, nil) 100 | uri := "tarantool.repl:3301" 101 | n := 1000 102 | 103 | ch := make(chan *Connector, n) 104 | var wg sync.WaitGroup 105 | wg.Add(n) 106 | for i := 0; i < n; i++ { 107 | go func() { 108 | ch <- p.Get(uri) 109 | wg.Done() 110 | }() 111 | } 112 | wg.Wait() 113 | close(ch) 114 | 115 | var conn *Connector 116 | for c := range ch { 117 | if conn == nil { 118 | conn = c 119 | } 120 | require.Same(t, conn, c) 121 | } 122 | 123 | p.Close() 124 | } 125 | 126 | func BenchmarkPool_Get(b *testing.B) { 127 | connOpts := ConnOptions{ 128 | User: "qumomf", 129 | Password: "qumomf", 130 | } 131 | p := NewConnPool(connOpts, nil) 132 | 133 | var ub strings.Builder 134 | var uri string 135 | var conn *Connector 136 | 137 | b.ReportAllocs() 138 | for i := 0; i < b.N; i++ { 139 | ub.Reset() 140 | ub.WriteString("tnt-") 141 | ub.WriteString(strconv.Itoa(i)) 142 | ub.WriteString(":3301") 143 | uri = ub.String() 144 | 145 | conn = p.Get(uri) 146 | conn.Close() 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /scripts/etc/systemd/qumomf.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=qumomf: Tarantool vshard HA tool supports auto discovery and recovery. 
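# qumomf runs as a foreground process; systemd supervises it and captures its stdout/stderr in the journal by default.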
3 | Documentation=https://github.com/shmel1k/qumomf 4 | After=syslog.target network.target 5 | 6 | [Service] 7 | Type=simple 8 | WorkingDirectory=/usr/local/bin 9 | ExecStart=/usr/local/bin/qumomf -config /etc/qumomf/conf.yml 10 | TimeoutSec=30 11 | 12 | [Install] 13 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /scripts/postinstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | systemctl --system daemon-reload >/dev/null || true 4 | systemctl enable qumomf.service >/dev/null || true 5 | 6 | deb_systemctl=$(command -v deb-systemd-invoke || echo systemctl) 7 | ${deb_systemctl} restart qumomf.service >/dev/null || true 8 | -------------------------------------------------------------------------------- /scripts/preremove.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deb_systemctl=$(command -v deb-systemd-invoke || echo systemctl) 4 | ${deb_systemctl} stop qumomf.service >/dev/null || true 5 | 6 | systemctl disable qumomf.service >/dev/null || true 7 | systemctl --system daemon-reload >/dev/null || true 8 | --------------------------------------------------------------------------------