├── .gitignore ├── LICENSE.txt ├── README.md ├── api.go ├── control_test.go ├── go.mod ├── licenses ├── APL2.txt └── BSL-Couchbase.txt ├── misc.go ├── misc_test.go ├── moves.go ├── moves_test.go ├── orchestrate.go ├── orchestrate_test.go ├── plan.go └── plan_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.tmp 3 | *.out 4 | /tmp 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Source code in this repository is licensed under various licenses. The 2 | Business Source License 1.1 (BSL) is one such license. Each file indicates in 3 | a section at the beginning of the file the name of the license that applies to 4 | it. All licenses used in this repository can be found in the top-level 5 | licenses directory. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | blance 2 | ====== 3 | 4 | blance implements a straightforward partition assignment algorithm, 5 | using a greedy, heuristic, functional approach. 6 | 7 | blance provides features like multiple, user-configurable partition 8 | states (primary, replica, read-only, etc), multi-level containment 9 | hierarchy (shelf/rack/row/zone/datacenter awareness) with configurable 10 | inclusion/exclusion policies, heterogeneous partition weights, 11 | heterogeneous node weights, partition stickiness control, and multi-primary 12 | support. 
13 | 14 | [![Build Status](https://travis-ci.org/couchbase/blance.svg)](https://travis-ci.org/couchbase/blance) [![GoDoc](https://godoc.org/github.com/couchbase/blance?status.svg)](https://godoc.org/github.com/couchbase/blance) [![Coverage Status](https://coveralls.io/repos/couchbase/blance/badge.svg?branch=master&service=github)](https://coveralls.io/github/couchbase/blance?branch=master) 15 | 16 | LICENSE: Apache 2.0 17 | 18 | ### Usage 19 | 20 | See the PlanNextMap() function as a starting point. 21 | 22 | ### For developers 23 | 24 | To get local coverage reports with heatmaps... 25 | 26 | go test -coverprofile=coverage.out -covermode=count && go tool cover -html=coverage.out 27 | -------------------------------------------------------------------------------- /api.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | // Package blance provides a partition assignment library, using a 10 | // greedy, heuristic, functional approach. It supports multiple, 11 | // configurable partition states (primary, replica, read-only, etc), 12 | // configurable multi-level containment hierarchy 13 | // (shelf/rack/zone/datacenter awareness) with inclusion/exclusion 14 | // policies, heterogeneous partition weights, heterogeneous node 15 | // weights, partition stickiness control, and multi-primary support. 16 | package blance 17 | 18 | // A PartitionMap represents all the partitions for some logical 19 | // resource, where the partitions are assigned to different nodes and 20 | // with different states. 
For example, partition "A-thru-H" is 21 | // assigned to node "x" as a "primary" and to node "y" as a "replica". 22 | // And, partition "I-thru-Z" is assigned to node "y" as a "primary" and 23 | // to nodes "z" & "x" as "replica". 24 | type PartitionMap map[string]*Partition // Keyed by Partition.Name. 25 | 26 | // A Partition represents a distinct, non-overlapping subset (or a 27 | // shard) of some logical resource. 28 | type Partition struct { 29 | // The Name of a Partition must be unique within a PartitionMap. 30 | Name string `json:"name"` 31 | 32 | // NodesByState is keyed by stateName, and the values are an array 33 | // of node names. For example, {"primary": ["a"], "replica": ["b", 34 | // "c"]}. 35 | NodesByState map[string][]string `json:"nodesByState"` 36 | } 37 | 38 | // A PartitionModel lets applications define different states for each 39 | // partition per node, such as "primary", "replica", "dead", etc. Key is 40 | // stateName, like "primary", "replica", "dead", etc. 41 | type PartitionModel map[string]*PartitionModelState 42 | 43 | // A PartitionModelState lets applications define metadata per 44 | // partition model state. For example, "primary" state should have 45 | // different priority and constraints than a "replica" state. 46 | type PartitionModelState struct { 47 | // Priority of zero is the highest. e.g., "primary" Priority 48 | // should be less than "replica" Priority, so we can define that 49 | // as "primary" Priority of 0 and "replica" priority of 1. 50 | Priority int `json:"priority"` 51 | 52 | // A Constraint defines how many nodes the algorithm strives to 53 | // assign a partition. For example, for any given partition, 54 | // perhaps the application wants 1 node to have "primary" state and 55 | // wants 2 nodes to have "replica" state. That is, the "primary" 56 | // state has Constraints of 1, and the "replica" state has 57 | // Constraints of 2. 
Continuing the example, when the "primary" 58 | // state has Priority of 0 and the "replica" state has Priority of 59 | // 1, then "primary" partitions will be assigned to nodes before 60 | // "replica" partitions. 61 | Constraints int `json:"constraints"` 62 | } 63 | 64 | // HierarchyRules example: 65 | // {"replica":[{IncludeLevel:1,ExcludeLevel:0}]}, which means that after 66 | // a partition is assigned to a node as primary, then assign the first 67 | // replica to a node that is a close sibling node to the primary node 68 | // (e.g., same parent or same rack). Another example: 69 | // {"replica":[{IncludeLevel:1,ExcludeLevel:0}, 70 | // {IncludeLevel:2,ExcludeLevel:1}]}, which means assign the first 71 | // replica same as above, but assign the second replica to a node that is 72 | // not a sibling of the primary (not the same parent, so to a different 73 | // rack). 74 | type HierarchyRules map[string][]*HierarchyRule 75 | 76 | // A HierarchyRule is metadata for rack/zone awareness features. 77 | // First, IncludeLevel is processed to find a set of candidate nodes. 78 | // Then, ExcludeLevel is processed to remove or exclude nodes from 79 | // that set. For example, for this containment tree, (datacenter0 80 | // (rack0 (nodeA nodeB)) (rack1 (nodeC nodeD))), lets focus on nodeA. 81 | // If IncludeLevel is 1, that means go up 1 parent (so, from nodeA up 82 | // to rack0) and then take all of rack0's leaves: nodeA and nodeB. 83 | // So, the candidate nodes of nodeA and nodeB are all on the same rack 84 | // as nodeA, or a "same rack" policy. If instead the IncludeLevel was 85 | // 2 and ExcludeLevel was 1, then that means a "different rack" 86 | // policy. With IncludeLevel of 2, we go up 2 ancestors from node A 87 | // (from nodeA to rack0; and then from rack0 to datacenter0) to get to 88 | // datacenter0. The datacenter0 has leaves of nodeA, nodeB, nodeC, 89 | // nodeD, so those nodes comprise the inclusion candidate set. 
But, 90 | // with ExcludeLevel of 1, that means we go up 1 parent from nodeA to 91 | // rack0, take rack0's leaves, giving us an exclusion set of nodeA & 92 | // nodeB. The inclusion candidate set minus the exclusion set finally 93 | // gives us just nodeC & nodeD as our final candidate nodes. That 94 | // final candidate set of nodes (just nodeC & nodeD) are from a 95 | // different rack as nodeA. 96 | type HierarchyRule struct { 97 | // IncludeLevel defines how many parents or ancestors to traverse 98 | // upwards in a containment hierarchy to find candidate nodes. 99 | IncludeLevel int `json:"includeLevel"` 100 | 101 | // ExcludeLevel defines how many parents or ancestors to traverse 102 | // upwards in a containment hierarchy to find an exclusion set of 103 | // nodes. 104 | ExcludeLevel int `json:"excludeLevel"` 105 | } 106 | 107 | // PlanNextMap is deprecated. Applications should instead use the 108 | // PlanNextMapEx() and PlanNextMapOptions API's. 109 | func PlanNextMap( 110 | prevMap PartitionMap, 111 | partitionsToAssign PartitionMap, 112 | nodesAll []string, // Union of nodesBefore, nodesToAdd, nodesToRemove. 113 | nodesToRemove []string, 114 | nodesToAdd []string, 115 | model PartitionModel, 116 | modelStateConstraints map[string]int, // Keyed by stateName. 117 | partitionWeights map[string]int, // Keyed by partitionName. 118 | stateStickiness map[string]int, // Keyed by stateName. 119 | nodeWeights map[string]int, // Keyed by node. 120 | nodeHierarchy map[string]string, // Keyed by node, value is node's parent. 
121 | hierarchyRules HierarchyRules, 122 | ) (nextMap PartitionMap, warnings map[string][]string) { 123 | return PlanNextMapEx(prevMap, partitionsToAssign, nodesAll, nodesToRemove, nodesToAdd, 124 | model, PlanNextMapOptions{ 125 | ModelStateConstraints: modelStateConstraints, 126 | PartitionWeights: partitionWeights, 127 | StateStickiness: stateStickiness, 128 | NodeWeights: nodeWeights, 129 | NodeHierarchy: nodeHierarchy, 130 | HierarchyRules: hierarchyRules, 131 | }) 132 | } 133 | 134 | // PlanNextMapEx is the main entry point to the algorithm to assign 135 | // partitions to nodes. The partitionsToAssign must define the partitions. 136 | // prevMap contains existing partitions' placements, which may be used 137 | // to influence the location of the partitions to assign. 138 | // Partitions must be stable between PlanNextMapEx() runs. That is, 139 | // splitting and merging of partitions are an orthogonal concern and 140 | // must be done separately from PlanNextMapEx() invocations. The 141 | // nodesAll parameter is all nodes (union of existing nodes, nodes to 142 | // be added, nodes to be removed, nodes that aren't changing). The 143 | // nodesToRemove may be empty. The nodesToAdd may be empty. When 144 | // both nodesToRemove and nodesToAdd are empty, partitioning 145 | // assignment may still change, as another PlanNextMapEx() invocation 146 | // may reach more stabilization or balancedness. 147 | func PlanNextMapEx( 148 | prevMap PartitionMap, 149 | partitionsToAssign PartitionMap, 150 | nodesAll []string, // Union of nodesBefore, nodesToAdd, nodesToRemove. 151 | nodesToRemove []string, 152 | nodesToAdd []string, 153 | model PartitionModel, 154 | options PlanNextMapOptions) (nextMap PartitionMap, warnings map[string][]string) { 155 | return planNextMapEx(prevMap, partitionsToAssign, nodesAll, nodesToRemove, nodesToAdd, 156 | model, options) 157 | } 158 | 159 | // PlanNextMapOptions represents optional parameters to the PlanNextMapEx() API. 
160 | // 161 | // ModelStateConstraints allows the caller to override the constraints 162 | // defined in the model. The ModelStateConstraints is keyed by stateName 163 | // (like "primary", "replica", etc). 164 | // 165 | // PartitionWeights is optional and is keyed by partitionName; 166 | // it allows the caller to specify that some partitions are bigger than 167 | // others (e.g., California has more records than Hawaii); default 168 | // partition weight is 1. 169 | // 170 | // StateStickiness is optional and is keyed by stateName; 171 | // it allows the caller to prefer not moving data at the tradeoff of 172 | // potentially more imbalance; default state stickiness is 1.5. 173 | // 174 | // NodeWeights is optional and is keyed by node name; 175 | // it allows the caller to specify that some nodes can hold more 176 | // partitions than other nodes; default node weight is 1. 177 | // 178 | // NodeHierarchy defines optional parent relationships per node; 179 | // it is keyed by node and a value is the node's parent. 180 | // 181 | // HierarchyRules allows the caller to optionally define replica 182 | // placement policy (e.g., same/different rack; same/different zone; etc). 183 | type PlanNextMapOptions struct { 184 | ModelStateConstraints map[string]int // Keyed by stateName. 185 | PartitionWeights map[string]int // Keyed by partitionName. 186 | StateStickiness map[string]int // Keyed by stateName. 187 | NodeWeights map[string]int // Keyed by node. 188 | NodeHierarchy map[string]string // Keyed by node; value is node's parent. 189 | HierarchyRules HierarchyRules 190 | } 191 | -------------------------------------------------------------------------------- /control_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. 
As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | package blance 10 | 11 | import ( 12 | "fmt" 13 | "reflect" 14 | "testing" 15 | ) 16 | 17 | // Force partition's active on "c" and replica on "b" 18 | func TestControlCase1(t *testing.T) { 19 | NodeScoreBooster = func(w int, s float64) float64 { 20 | // As defined in couchbase/cbgt 21 | score := float64(-w) 22 | if score < s { 23 | score = s 24 | } 25 | return score 26 | } 27 | defer func() { 28 | NodeScoreBooster = nil 29 | }() 30 | 31 | partitionsToAssign := PartitionMap{ 32 | "X": &Partition{ 33 | Name: "X", 34 | NodesByState: map[string][]string{}, 35 | }, 36 | } 37 | 38 | nodes := []string{"a", "b", "c", "d", "e"} 39 | 40 | model := PartitionModel{ 41 | "primary": &PartitionModelState{ 42 | Priority: 0, 43 | Constraints: 1, 44 | }, 45 | "replica": &PartitionModelState{ 46 | Priority: 1, 47 | Constraints: 1, 48 | }, 49 | } 50 | 51 | r, warnings := PlanNextMapEx( 52 | PartitionMap{}, partitionsToAssign, 53 | nodes, 54 | nil, 55 | nil, 56 | model, 57 | PlanNextMapOptions{ 58 | ModelStateConstraints: nil, 59 | PartitionWeights: nil, 60 | StateStickiness: nil, 61 | NodeWeights: map[string]int{ 62 | "a": -2, 63 | "b": -1, 64 | "d": -2, 65 | "e": -2, 66 | }, 67 | NodeHierarchy: nil, 68 | HierarchyRules: nil, 69 | }) 70 | 71 | if len(warnings) > 0 { 72 | t.Errorf("WARNINGS: %v", warnings) 73 | } 74 | 75 | expect := PartitionMap{ 76 | "X": &Partition{ 77 | Name: "X", 78 | NodesByState: map[string][]string{ 79 | "primary": {"c"}, 80 | "replica": {"b"}, 81 | }, 82 | }, 83 | } 84 | 85 | if !reflect.DeepEqual(r, expect) { 86 | var rStr string 87 | for k, v := range r { 88 | rStr += fmt.Sprintf("%+v: %+v", k, v) 89 | } 90 | 91 | t.Fatalf("Mismatch: %+v", rStr) 92 | } 93 | } 94 | 95 | // Do not normalize node weights (meaning 
"enablePartitionNodeStickiness" in 96 | // cbgt) so single partitioned indexes (even with 1 replica) do not relocate 97 | // on node additions. 98 | func TestControlCase2(t *testing.T) { 99 | NodeScoreBooster = func(w int, s float64) float64 { 100 | // As defined in couchbase/cbgt 101 | score := float64(-w) 102 | if score < s { 103 | score = s 104 | } 105 | return score 106 | } 107 | defer func() { 108 | NodeScoreBooster = nil 109 | }() 110 | 111 | partitions := PartitionMap{ 112 | "X": &Partition{ 113 | Name: "X", 114 | NodesByState: map[string][]string{ 115 | "primary": {"a"}, 116 | "replica": {"b"}, 117 | }, 118 | }, 119 | "Y": &Partition{ 120 | Name: "Y", 121 | NodesByState: map[string][]string{ 122 | "primary": {"b"}, 123 | "replica": {"a"}, 124 | }, 125 | }, 126 | "Z": &Partition{ 127 | Name: "Z", 128 | NodesByState: map[string][]string{ 129 | "primary": {"a"}, 130 | "replica": {"b"}, 131 | }, 132 | }, 133 | } 134 | 135 | nodes := []string{"a", "b"} 136 | nodesToAdd := []string{"c"} 137 | 138 | model := PartitionModel{ 139 | "primary": &PartitionModelState{ 140 | Priority: 0, 141 | Constraints: 1, 142 | }, 143 | "replica": &PartitionModelState{ 144 | Priority: 1, 145 | Constraints: 1, 146 | }, 147 | } 148 | 149 | r, warnings := PlanNextMapEx( 150 | PartitionMap{}, partitions, 151 | nodes, 152 | nil, 153 | nodesToAdd, 154 | model, 155 | PlanNextMapOptions{ 156 | ModelStateConstraints: nil, 157 | PartitionWeights: nil, 158 | StateStickiness: nil, 159 | NodeWeights: nil, 160 | NodeHierarchy: nil, 161 | HierarchyRules: nil, 162 | }) 163 | 164 | if len(warnings) > 0 { 165 | t.Errorf("WARNINGS: %v", warnings) 166 | } 167 | 168 | expect := PartitionMap{ 169 | "X": &Partition{ 170 | Name: "X", 171 | NodesByState: map[string][]string{ 172 | "primary": {"a"}, 173 | "replica": {"b"}, 174 | }, 175 | }, 176 | "Y": &Partition{ 177 | Name: "Y", 178 | NodesByState: map[string][]string{ 179 | "primary": {"b"}, 180 | "replica": {"a"}, 181 | }, 182 | }, 183 | "Z": &Partition{ 
184 | Name: "Z", 185 | NodesByState: map[string][]string{ 186 | "primary": {"a"}, 187 | "replica": {"b"}, 188 | }, 189 | }, 190 | } 191 | 192 | if !reflect.DeepEqual(r, expect) { 193 | var rStr string 194 | for k, v := range r { 195 | rStr += fmt.Sprintf("%+v: %+v", k, v) 196 | } 197 | 198 | t.Fatalf("Mismatch: %+v", rStr) 199 | } 200 | } 201 | 202 | // If multiple nodes available for a single partitioned 203 | // index with 1 replica, control the new index to reside on 204 | // replica:"a" and primary:"b" 205 | func TestControlCase3(t *testing.T) { 206 | NodeScoreBooster = func(w int, s float64) float64 { 207 | // As defined in couchbase/cbgt 208 | score := float64(-w) 209 | if score < s { 210 | score = s 211 | } 212 | return score 213 | } 214 | defer func() { 215 | NodeScoreBooster = nil 216 | }() 217 | 218 | partitions := PartitionMap{ 219 | "X": &Partition{ 220 | Name: "X", 221 | NodesByState: map[string][]string{ 222 | "primary": {"a"}, 223 | "replica": {"b"}, 224 | }, 225 | }, 226 | "Y": &Partition{ 227 | Name: "Y", 228 | NodesByState: map[string][]string{ 229 | "primary": {"b"}, 230 | "replica": {"a"}, 231 | }, 232 | }, 233 | "Z": &Partition{ 234 | Name: "Z", 235 | NodesByState: map[string][]string{}, 236 | }, 237 | } 238 | 239 | nodes := []string{"a", "b", "c"} 240 | 241 | model := PartitionModel{ 242 | "primary": &PartitionModelState{ 243 | Priority: 0, 244 | Constraints: 1, 245 | }, 246 | "replica": &PartitionModelState{ 247 | Priority: 1, 248 | Constraints: 1, 249 | }, 250 | } 251 | 252 | r, warnings := PlanNextMapEx( 253 | PartitionMap{}, partitions, 254 | nodes, 255 | nil, 256 | nil, 257 | model, 258 | PlanNextMapOptions{ 259 | ModelStateConstraints: nil, 260 | PartitionWeights: nil, 261 | StateStickiness: nil, 262 | NodeWeights: map[string]int{ 263 | "c": -3, 264 | "a": -1, 265 | }, 266 | NodeHierarchy: nil, 267 | HierarchyRules: nil, 268 | }) 269 | 270 | if len(warnings) > 0 { 271 | t.Errorf("WARNINGS: %v", warnings) 272 | } 273 | 274 | expect := 
PartitionMap{ 275 | "X": &Partition{ 276 | Name: "X", 277 | NodesByState: map[string][]string{ 278 | "primary": {"a"}, 279 | "replica": {"b"}, 280 | }, 281 | }, 282 | "Y": &Partition{ 283 | Name: "Y", 284 | NodesByState: map[string][]string{ 285 | "primary": {"b"}, 286 | "replica": {"a"}, 287 | }, 288 | }, 289 | "Z": &Partition{ 290 | Name: "Z", 291 | NodesByState: map[string][]string{ 292 | "primary": {"b"}, 293 | "replica": {"a"}, 294 | }, 295 | }, 296 | } 297 | 298 | if !reflect.DeepEqual(r, expect) { 299 | var rStr string 300 | for k, v := range r { 301 | rStr += fmt.Sprintf("%+v: %+v", k, v) 302 | } 303 | 304 | t.Fatalf("Mismatch: %+v", rStr) 305 | } 306 | } 307 | 308 | // Expect even distribution of actives and replicas 309 | func TestControlCase4(t *testing.T) { 310 | NodeScoreBooster = func(w int, s float64) float64 { 311 | // As defined in couchbase/cbgt 312 | score := float64(-w) 313 | if score < s { 314 | score = s 315 | } 316 | return score 317 | } 318 | defer func() { 319 | NodeScoreBooster = nil 320 | }() 321 | 322 | partitions := PartitionMap{ 323 | "X": &Partition{ 324 | Name: "X", 325 | NodesByState: map[string][]string{ 326 | "primary": {"a"}, 327 | "replica": {"b"}, 328 | }, 329 | }, 330 | } 331 | 332 | partitionsToAssign := PartitionMap{ 333 | "X": &Partition{ 334 | Name: "X", 335 | NodesByState: map[string][]string{ 336 | "primary": {"a"}, 337 | "replica": {"b"}, 338 | }, 339 | }, 340 | "Y": &Partition{ 341 | Name: "Y", 342 | NodesByState: map[string][]string{}, 343 | }, 344 | } 345 | 346 | nodes := []string{"a", "b"} 347 | 348 | model := PartitionModel{ 349 | "primary": &PartitionModelState{ 350 | Priority: 0, 351 | Constraints: 1, 352 | }, 353 | "replica": &PartitionModelState{ 354 | Priority: 1, 355 | Constraints: 1, 356 | }, 357 | } 358 | 359 | r, warnings := PlanNextMapEx( 360 | partitions, partitionsToAssign, 361 | nodes, 362 | nil, 363 | nil, 364 | model, 365 | PlanNextMapOptions{ 366 | ModelStateConstraints: nil, 367 | PartitionWeights: 
nil, 368 | StateStickiness: nil, 369 | NodeWeights: map[string]int{ 370 | "a": -1, 371 | "b": -1, 372 | }, 373 | NodeHierarchy: map[string]string{ 374 | "a": "Group 1", 375 | "b": "Group 2", 376 | }, 377 | HierarchyRules: HierarchyRules{ 378 | "replica": []*HierarchyRule{ 379 | { 380 | IncludeLevel: 2, 381 | ExcludeLevel: 1, 382 | }, 383 | }, 384 | }, 385 | }) 386 | 387 | if len(warnings) > 0 { 388 | t.Errorf("WARNINGS: %v", warnings) 389 | } 390 | 391 | expect := PartitionMap{ 392 | "X": &Partition{ 393 | Name: "X", 394 | NodesByState: map[string][]string{ 395 | "primary": {"a"}, 396 | "replica": {"b"}, 397 | }, 398 | }, 399 | "Y": &Partition{ 400 | Name: "Y", 401 | NodesByState: map[string][]string{ 402 | "primary": {"b"}, 403 | "replica": {"a"}, 404 | }, 405 | }, 406 | } 407 | 408 | if !reflect.DeepEqual(r, expect) { 409 | var rStr string 410 | for k, v := range r { 411 | rStr += fmt.Sprintf("%+v: %+v", k, v) 412 | } 413 | 414 | t.Fatalf("Mismatch: %+v", rStr) 415 | } 416 | } 417 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/couchbase/blance 2 | 3 | go 1.13 4 | -------------------------------------------------------------------------------- /licenses/APL2.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /licenses/BSL-Couchbase.txt: -------------------------------------------------------------------------------- 1 | COUCHBASE BUSINESS SOURCE LICENSE AGREEMENT 2 | 3 | Business Source License 1.1 4 | Licensor: Couchbase, Inc. 5 | Licensed Work: Couchbase Server Version 7.0 6 | The Licensed Work is © 2021-Present Couchbase, Inc. 
7 | 8 | Additional Use Grant: You may make production use of the Licensed Work, provided 9 | you comply with the following conditions: 10 | 11 | (i) You may not prepare a derivative work based upon the Licensed Work and 12 | distribute or otherwise offer such derivative work, whether on a standalone 13 | basis or in combination with other products, applications, or services 14 | (including in any "as-a-service" offering, such as, by way of example, a 15 | software-as-a-service, database-as-a-service, or infrastructure-as-a-service 16 | offering, or any other offering based on a cloud computing or other type of 17 | hosted distribution model (collectively, "Hosted Offerings")), for a fee or 18 | otherwise on a commercial or other for-profit basis. 19 | 20 | (ii) You may not link the Licensed Work to, or otherwise include the Licensed 21 | Work in or with, any product, application, or service (including in any Hosted 22 | Offering) that is distributed or otherwise offered, whether on a standalone 23 | basis or in combination with other products, applications, or services for a fee 24 | or otherwise on a commercial or other for-profit basis. Condition (ii) shall not 25 | limit the generality of condition (i) above. 26 | 27 | 28 | Change Date: July 1, 2025 29 | 30 | Change License: Apache License, Version 2.0 31 | 32 | 33 | Notice 34 | 35 | The Business Source License (this document, or the "License") is not an Open 36 | Source license. However, the Licensed Work will eventually be made available 37 | under an Open Source License, as stated in this License. License text copyright 38 | © 2017 MariaDB Corporation Ab, All Rights Reserved. "Business Source License" is 39 | a trademark of MariaDB Corporation Ab. 40 | 41 | Terms 42 | 43 | The Licensor hereby grants You the right to copy, modify, create derivative 44 | works, redistribute, and make non-production use of the Licensed Work. 
The 45 | Licensor may make an Additional Use Grant, above, permitting limited production 46 | use. 47 | 48 | Effective on the Change Date, or the fourth anniversary of the first publicly 49 | available distribution of a specific version of the Licensed Work under this 50 | License, whichever comes first, the Licensor hereby grants you rights under the 51 | terms of the Change License, and the rights granted in the paragraph above 52 | terminate. 53 | 54 | If your use of the Licensed Work does not comply with the requirements currently 55 | in effect as described in this License, you must purchase a commercial license 56 | from the Licensor, its affiliated entities, or authorized resellers, or you must 57 | refrain from using the Licensed Work. 58 | 59 | All copies of the original and modified Licensed Work, and derivative works of 60 | the Licensed Work, are subject to this License. This License applies separately 61 | for each version of the Licensed Work and the Change Date may vary for each 62 | version of the Licensed Work released by Licensor. 63 | 64 | You must conspicuously display this License on each original or modified copy of 65 | the Licensed Work. If you receive the Licensed Work in original or modified form 66 | from a third party, the terms and conditions set forth in this License apply to 67 | your use of that work. 68 | 69 | Any use of the Licensed Work in violation of this License will automatically 70 | terminate your rights under this License for the current and all other versions 71 | of the Licensed Work. 72 | 73 | This License does not grant you any right in any trademark or logo of Licensor 74 | or its affiliates (provided that you may use a trademark or logo of Licensor as 75 | expressly required by this License). 76 | 77 | TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN 78 | "AS IS" BASIS. 
LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS 79 | OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY, 80 | FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE. 81 | 82 | MariaDB hereby grants you permission to use this License's text to license your 83 | works, and to refer to it using the trademark "Business Source License", as long 84 | as you comply with the Covenants of Licensor below. 85 | 86 | Covenants of Licensor 87 | 88 | In consideration of the right to use this License's text and the "Business 89 | Source License" name and trademark, Licensor covenants to MariaDB, and to all 90 | other recipients of the licensed work to be provided by Licensor: 91 | 92 | 1. To specify as the Change License the GPL Version 2.0 or any later version, or 93 | a license that is compatible with GPL Version 2.0 or a later version, where 94 | "compatible" means that software provided under the Change License can be 95 | included in a program with software provided under GPL Version 2.0 or a later 96 | version. Licensor may specify additional Change Licenses without limitation. 97 | 98 | 2. To either: (a) specify an additional grant of rights to use that does not 99 | impose any additional restriction on the right granted in this License, as the 100 | Additional Use Grant; or (b) insert the text "None". 101 | 102 | 3. To specify a Change Date. 103 | 104 | 4. Not to modify this License in any other way. 105 | -------------------------------------------------------------------------------- /misc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. 
As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | package blance 10 | 11 | // StringsToMap converts an array of strings to an map keyed by 12 | // strings, so the caller can have faster lookups. 13 | func StringsToMap(strsArr []string) map[string]bool { 14 | if strsArr == nil { 15 | return nil 16 | } 17 | strs := map[string]bool{} 18 | for _, str := range strsArr { 19 | strs[str] = true 20 | } 21 | return strs 22 | } 23 | 24 | // StringsRemoveStrings returns a copy of stringArr, but with any 25 | // strings from removeArr removed, keeping the same order as 26 | // stringArr. So, stringArr subtract removeArr. 27 | func StringsRemoveStrings(stringArr, removeArr []string) []string { 28 | removeMap := StringsToMap(removeArr) 29 | rv := make([]string, 0, len(stringArr)) 30 | for _, s := range stringArr { 31 | if !removeMap[s] { 32 | rv = append(rv, s) 33 | } 34 | } 35 | return rv 36 | } 37 | 38 | // StringsIntersectStrings returns a brand new array that has the 39 | // intersection of a and b. 40 | func StringsIntersectStrings(a, b []string) []string { 41 | bMap := StringsToMap(b) 42 | rMap := map[string]bool{} 43 | rv := make([]string, 0, len(a)) 44 | for _, s := range a { 45 | if bMap[s] && !rMap[s] { 46 | rMap[s] = true 47 | rv = append(rv, s) 48 | } 49 | } 50 | return rv 51 | } 52 | 53 | // stringsDeduplicate returns a brand new array that has the 54 | // all the unique of a preserving the order. 
55 | func stringsDeduplicate(a []string) []string { 56 | bMap := make(map[string]struct{}) 57 | rv := make([]string, 0, len(bMap)) 58 | for _, s := range a { 59 | if _, ok := bMap[s]; ok { 60 | continue 61 | } 62 | bMap[s] = struct{}{} 63 | rv = append(rv, s) 64 | } 65 | return rv 66 | } 67 | -------------------------------------------------------------------------------- /misc_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014-Present Couchbase, Inc. 3 | 4 | Use of this software is governed by the Business Source License included in 5 | the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 6 | file, in accordance with the Business Source License, use of this software will 7 | be governed by the Apache License, Version 2.0, included in the file 8 | licenses/APL2.txt. 9 | */ 10 | 11 | package blance 12 | 13 | import ( 14 | "reflect" 15 | "testing" 16 | ) 17 | 18 | func TestStringsToMap(t *testing.T) { 19 | s := []string{} 20 | m := StringsToMap(s) 21 | if m == nil || len(m) != 0 { 22 | t.Errorf("expected StringsToMap to work on empty array") 23 | } 24 | m = StringsToMap([]string{"a"}) 25 | if m == nil || !reflect.DeepEqual(m, map[string]bool{"a": true}) { 26 | t.Errorf("expected single string arr to work") 27 | } 28 | m = StringsToMap([]string{"a", "b", "a"}) 29 | if m == nil || !reflect.DeepEqual(m, map[string]bool{"a": true, "b": true}) { 30 | t.Errorf("expected 3 string arr to work with dupe removal") 31 | } 32 | } 33 | 34 | func TestStringsRemoveStrings(t *testing.T) { 35 | tests := []struct { 36 | a []string 37 | b []string 38 | exp []string 39 | }{ 40 | {[]string{}, []string{}, []string{}}, 41 | {[]string{"a"}, []string{}, []string{"a"}}, 42 | {[]string{"a"}, []string{"a"}, []string{}}, 43 | {[]string{"a"}, []string{"b"}, []string{"a"}}, 44 | {[]string{}, []string{"b"}, []string{}}, 45 | {[]string{"a", "b", "c"}, []string{"a"}, []string{"b", "c"}}, 46 | {[]string{"a", "b", 
"c"}, []string{"b"}, []string{"a", "c"}}, 47 | {[]string{"a", "b", "c"}, []string{"c"}, []string{"a", "b"}}, 48 | {[]string{"a", "b", "c"}, []string{"a", "b"}, []string{"c"}}, 49 | {[]string{"a", "b", "c"}, []string{"a", "b", "c"}, []string{}}, 50 | {[]string{"a", "b", "c"}, []string{"b", "c"}, []string{"a"}}, 51 | {[]string{"a", "b", "c"}, []string{"c", "c"}, []string{"a", "b"}}, 52 | } 53 | for i, c := range tests { 54 | r := StringsRemoveStrings(c.a, c.b) 55 | if !reflect.DeepEqual(r, c.exp) { 56 | t.Errorf("i: %d, a: %#v, b: %#v, exp: %#v, got: %#v", 57 | i, c.a, c.b, c.exp, r) 58 | } 59 | } 60 | } 61 | 62 | func TestStringsIntersectStrings(t *testing.T) { 63 | tests := []struct { 64 | a []string 65 | b []string 66 | exp []string 67 | }{ 68 | {[]string{}, []string{}, []string{}}, 69 | {[]string{"a"}, []string{}, []string{}}, 70 | {[]string{"a"}, []string{"a"}, []string{"a"}}, 71 | {[]string{"a"}, []string{"b"}, []string{}}, 72 | {[]string{}, []string{"b"}, []string{}}, 73 | {[]string{"a", "b", "c"}, []string{"a"}, []string{"a"}}, 74 | {[]string{"a", "b", "c"}, []string{"b"}, []string{"b"}}, 75 | {[]string{"a", "b", "c"}, []string{"c"}, []string{"c"}}, 76 | {[]string{"a", "b", "c"}, []string{"a", "b"}, []string{"a", "b"}}, 77 | {[]string{"a", "b", "c"}, []string{"a", "b", "c"}, []string{"a", "b", "c"}}, 78 | {[]string{"a", "b", "c"}, []string{"b", "c"}, []string{"b", "c"}}, 79 | {[]string{"a", "b", "c"}, []string{"c", "c"}, []string{"c"}}, 80 | {[]string{"a", "b", "a", "b"}, []string{"a", "b"}, []string{"a", "b"}}, 81 | } 82 | for i, c := range tests { 83 | r := StringsIntersectStrings(c.a, c.b) 84 | if !reflect.DeepEqual(r, c.exp) { 85 | t.Errorf("i: %d, a: %#v, b: %#v, exp: %#v, got: %#v", 86 | i, c.a, c.b, c.exp, r) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /moves.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-Present Couchbase, Inc. 
2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | package blance 10 | 11 | // A NodeStateOp associates a node with a state change and operation. 12 | // An array of NodeStateOp's could be interpreted as a series of 13 | // node-by-node state transitions for a partition. For example, for 14 | // partition X, the NodeState transitions might be: first add node A 15 | // to "primary", then demote node B to "replica", then remove (or del) 16 | // partition X from node C. 17 | type NodeStateOp struct { 18 | Node string 19 | State string 20 | Op string // Ex: "add", "del", "promote", "demote". 21 | } 22 | 23 | // CalcPartitionMoves computes the step-by-step moves to transition a 24 | // partition from begNodesByState to endNodesByState. 25 | // 26 | // The states is an array of state names, like ["primary", 27 | // "hotStandBy", "coldStandBy"], and should be ordered by more 28 | // superior or important states coming earlier. For example, "primary" 29 | // should come before "replica". 30 | // 31 | // The begNodesByState and endNodesByState are keyed by stateName, 32 | // where the values are an array of node names. For example, 33 | // {"primary": ["a"], "replica": ["b", "c"]}. 34 | // 35 | // The favorMinNodes should be true if the moves should be computed to 36 | // have the partition assigned to the least number of nodes at any 37 | // time (i.e., favoring max of single primary, if even there are 38 | // temporarily no primaries for a time); if false, then the algorithm 39 | // will instead try to assign the partition to 1 or more nodes, 40 | // favoring partition availability across multiple nodes during moves. 
41 | func CalcPartitionMoves( 42 | states []string, 43 | begNodesByState map[string][]string, 44 | endNodesByState map[string][]string, 45 | favorMinNodes bool, 46 | ) []NodeStateOp { 47 | var moves []NodeStateOp 48 | 49 | seen := map[string]bool{} 50 | 51 | addMoves := func(nodes []string, state, op string) { 52 | for _, node := range nodes { 53 | if !seen[node] { 54 | seen[node] = true 55 | moves = append(moves, NodeStateOp{node, state, op}) 56 | } 57 | } 58 | } 59 | 60 | begNodes := flattenNodesByState(begNodesByState) 61 | endNodes := flattenNodesByState(endNodesByState) 62 | 63 | adds := StringsRemoveStrings(endNodes, begNodes) 64 | dels := StringsRemoveStrings(begNodes, endNodes) 65 | 66 | if !favorMinNodes { 67 | for statei, state := range states { 68 | // Handle promotions of inferiorTo(state) to state. 69 | addMoves(findStateChanges(statei+1, len(states), 70 | state, states, begNodesByState, endNodesByState), 71 | state, "promote") 72 | 73 | // Handle demotions of superiorTo(state) to state. 74 | addMoves(findStateChanges(0, statei, 75 | state, states, begNodesByState, endNodesByState), 76 | state, "demote") 77 | 78 | // Handle clean additions of state. 79 | addMoves(StringsIntersectStrings(StringsRemoveStrings( 80 | endNodesByState[state], begNodesByState[state]), 81 | adds), 82 | state, "add") 83 | 84 | // Handle clean deletions of state. 85 | addMoves(StringsIntersectStrings(StringsRemoveStrings( 86 | begNodesByState[state], endNodesByState[state]), 87 | dels), 88 | "", "del") 89 | } 90 | } else { 91 | for statei := len(states) - 1; statei >= 0; statei-- { 92 | state := states[statei] 93 | 94 | // Handle clean deletions of state. 95 | addMoves(StringsIntersectStrings(StringsRemoveStrings( 96 | begNodesByState[state], endNodesByState[state]), 97 | dels), 98 | "", "del") 99 | 100 | // Handle demotions of superiorTo(state) to state. 
101 | addMoves(findStateChanges(0, statei, 102 | state, states, begNodesByState, endNodesByState), 103 | state, "demote") 104 | 105 | // Handle promotions of inferiorTo(state) to state. 106 | addMoves(findStateChanges(statei+1, len(states), 107 | state, states, begNodesByState, endNodesByState), 108 | state, "promote") 109 | 110 | // Handle clean additions of state. 111 | addMoves(StringsIntersectStrings(StringsRemoveStrings( 112 | endNodesByState[state], begNodesByState[state]), 113 | adds), 114 | state, "add") 115 | } 116 | } 117 | 118 | return moves 119 | } 120 | 121 | func findStateChanges(begStateIdx, endStateIdx int, 122 | state string, states []string, 123 | begNodesByState map[string][]string, 124 | endNodesByState map[string][]string) (rv []string) { 125 | for _, node := range endNodesByState[state] { 126 | for i := begStateIdx; i < endStateIdx; i++ { 127 | for _, n := range begNodesByState[states[i]] { 128 | if n == node { 129 | rv = append(rv, node) 130 | } 131 | } 132 | } 133 | } 134 | 135 | return rv 136 | } 137 | -------------------------------------------------------------------------------- /moves_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015-Present Couchbase, Inc. 3 | 4 | Use of this software is governed by the Business Source License included in 5 | the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that 6 | file, in accordance with the Business Source License, use of this software will 7 | be governed by the Apache License, Version 2.0, included in the file 8 | licenses/APL2.txt. 
9 | */ 10 | 11 | package blance 12 | 13 | import ( 14 | "reflect" 15 | "strings" 16 | "testing" 17 | ) 18 | 19 | func TestFindStateChanges(t *testing.T) { 20 | tests := []struct { 21 | begStateIdx int 22 | endStateIdx int 23 | state string 24 | states []string 25 | begNodesByState map[string][]string 26 | endNodesByState map[string][]string 27 | expected []string 28 | }{ 29 | {0, 0, "primary", 30 | []string{"primary", "replica"}, 31 | map[string][]string{ 32 | "primary": {"a"}, 33 | "replica": {"b", "c"}, 34 | }, 35 | map[string][]string{ 36 | "primary": {"a"}, 37 | "replica": {"b", "c"}, 38 | }, 39 | nil, 40 | }, 41 | {1, 2, "primary", 42 | []string{"primary", "replica"}, 43 | map[string][]string{ 44 | "primary": {"a"}, 45 | "replica": {"b", "c"}, 46 | }, 47 | map[string][]string{ 48 | "primary": {"a"}, 49 | "replica": {"b", "c"}, 50 | }, 51 | nil, 52 | }, 53 | {0, 0, "primary", 54 | []string{"primary", "replica"}, 55 | map[string][]string{ 56 | "primary": {}, 57 | "replica": {"a"}, 58 | }, 59 | map[string][]string{ 60 | "primary": {"a"}, 61 | "replica": {}, 62 | }, 63 | nil, 64 | }, 65 | {1, 2, "primary", 66 | []string{"primary", "replica"}, 67 | map[string][]string{ 68 | "primary": {}, 69 | "replica": {"a"}, 70 | }, 71 | map[string][]string{ 72 | "primary": {"a"}, 73 | "replica": {}, 74 | }, 75 | []string{"a"}, 76 | }, 77 | {0, 1, "replica", 78 | []string{"primary", "replica"}, 79 | map[string][]string{ 80 | "primary": {"a"}, 81 | "replica": {}, 82 | }, 83 | map[string][]string{ 84 | "primary": {}, 85 | "replica": {"a"}, 86 | }, 87 | []string{"a"}, 88 | }, 89 | {1, 2, "replica", 90 | []string{"primary", "replica"}, 91 | map[string][]string{ 92 | "primary": {"a"}, 93 | "replica": {}, 94 | }, 95 | map[string][]string{ 96 | "primary": {}, 97 | "replica": {"a"}, 98 | }, 99 | nil, 100 | }, 101 | {1, 2, "replica", 102 | []string{"primary", "replica"}, 103 | map[string][]string{ 104 | "primary": {}, 105 | "replica": {"a"}, 106 | }, 107 | map[string][]string{ 108 | 
"primary": {}, 109 | "replica": {}, 110 | }, 111 | nil, 112 | }, 113 | {1, 2, "primary", 114 | []string{"primary", "replica"}, 115 | map[string][]string{ 116 | "primary": {"a"}, 117 | "replica": {"b", "c", "d"}, 118 | }, 119 | map[string][]string{ 120 | "primary": {"b"}, 121 | "replica": {"a", "c", "d"}, 122 | }, 123 | []string{"b"}, 124 | }, 125 | {1, 2, "primary", 126 | []string{"primary", "replica"}, 127 | map[string][]string{ 128 | "primary": {"a"}, 129 | "replica": {"b", "c", "d"}, 130 | }, 131 | map[string][]string{ 132 | "primary": {"x"}, 133 | "replica": {"a", "c", "d"}, 134 | }, 135 | nil, 136 | }, 137 | } 138 | 139 | for i, test := range tests { 140 | got := findStateChanges(test.begStateIdx, test.endStateIdx, 141 | test.state, test.states, 142 | test.begNodesByState, 143 | test.endNodesByState) 144 | if !reflect.DeepEqual(got, test.expected) { 145 | t.Errorf("i: %d, got: %#v, expected: %#v, test: %#v", 146 | i, got, test.expected, test) 147 | } 148 | } 149 | } 150 | 151 | func TestCalcPartitionMoves(t *testing.T) { 152 | states := []string{"primary", "replica"} 153 | 154 | tests := []struct { 155 | before string 156 | moves string 157 | after string 158 | 159 | favorMinNodes bool 160 | }{ 161 | // primary | replica 162 | // -------|-------- 163 | { // Test #0. 164 | " a", 165 | "", 166 | " a", 167 | false, 168 | }, 169 | { 170 | " a", 171 | "", 172 | " a", 173 | true, 174 | }, 175 | { 176 | " | a", 177 | "", 178 | " | a", 179 | false, 180 | }, 181 | { 182 | " | a", 183 | "", 184 | " | a", 185 | true, 186 | }, 187 | { 188 | " a | b", 189 | "", 190 | " a | b", 191 | false, 192 | }, 193 | { // Test #5. 194 | " a | b", 195 | "", 196 | " a | b", 197 | true, 198 | }, 199 | { 200 | "", 201 | "+a", 202 | " a", 203 | false, 204 | }, 205 | { 206 | "", 207 | "+a", 208 | " a", 209 | true, 210 | }, 211 | { 212 | " a", 213 | "-a", 214 | "", 215 | false, 216 | }, 217 | { 218 | " a", 219 | "-a", 220 | "", 221 | true, 222 | }, 223 | { // Test #10. 
224 | "", 225 | `+a | 226 | a |+b`, 227 | " a | b", 228 | false, 229 | }, 230 | { 231 | "", 232 | ` |+b 233 | +a | b`, 234 | " a | b", 235 | true, 236 | }, 237 | { 238 | " a | b", 239 | ` a |-b`, 240 | " a", 241 | false, 242 | }, 243 | { 244 | " a | b", 245 | ` a |-b`, 246 | " a", 247 | true, 248 | }, 249 | { 250 | " a | b", 251 | `-a | b`, 252 | " | b", 253 | false, 254 | }, 255 | { // Test #15. 256 | " a | b", 257 | `-a | b`, 258 | " | b", 259 | true, 260 | }, 261 | { 262 | " a | b", 263 | `-a | b 264 | |-b`, // NOTE: Some may say we should remove replica first. 265 | "", 266 | false, 267 | }, 268 | { 269 | " a | b", 270 | ` a |-b 271 | -a |`, 272 | "", 273 | true, 274 | }, 275 | { 276 | " a", 277 | ` a +b | 278 | -a b |`, 279 | " b", 280 | false, 281 | }, 282 | { 283 | " a", 284 | `-a | 285 | +b |`, 286 | " b", 287 | true, 288 | }, 289 | { // Test #20. 290 | " a | b c", 291 | ` a +b |-b c 292 | -a b | c 293 | b | c +d`, 294 | " b | c d", 295 | false, 296 | }, 297 | { // Test #21. 298 | " a | b c", 299 | ` a | b c +d 300 | -a | b c d 301 | +b |-b c d`, 302 | " b | c d", 303 | true, 304 | }, 305 | { 306 | " a | b", 307 | ` a +b | -b 308 | -a b |+a`, 309 | " b | a", 310 | false, 311 | }, 312 | { 313 | " a | b", 314 | `-a |+a b 315 | +b | a -b`, 316 | " b | a", 317 | true, 318 | }, 319 | { 320 | " a | b", 321 | ` a +c | b 322 | -a c |+a b 323 | c | a -b`, 324 | " c | a", 325 | false, 326 | }, 327 | { // Test #25. 
328 | " a | b", 329 | ` a | -b 330 | -a |+a 331 | +c | a`, 332 | " c | a", 333 | true, 334 | }, 335 | { 336 | " a | b", 337 | ` a +c | b 338 | -a c | b 339 | c | b +d 340 | c |-b d`, 341 | " c | d", 342 | false, 343 | }, 344 | { 345 | " a | b", 346 | ` a |-b 347 | a | +d 348 | -a | d 349 | +c | d`, 350 | " c | d", 351 | true, 352 | }, 353 | { 354 | " a | b", 355 | `-a |+a b 356 | | a b +c`, 357 | " | a b c", 358 | false, 359 | }, 360 | } 361 | 362 | negate := map[string]string{ 363 | "+": "-", 364 | "-": "+", 365 | } 366 | 367 | ops := map[string]string{ 368 | "+": "add", 369 | "-": "del", 370 | } 371 | 372 | for testi, test := range tests { 373 | before := convertLineToNodesByState(test.before, states) 374 | after := convertLineToNodesByState(test.after, states) 375 | 376 | var movesExp []map[string][]string 377 | 378 | if test.moves != "" { 379 | moveLines := strings.Split(test.moves, "\n") 380 | for _, moveLine := range moveLines { 381 | moveExp := convertLineToNodesByState(moveLine, states) 382 | movesExp = append(movesExp, moveExp) 383 | } 384 | } 385 | 386 | movesGot := CalcPartitionMoves(states, before, after, test.favorMinNodes) 387 | 388 | if len(movesGot) != len(movesExp) { 389 | t.Errorf("testi: %d, mismatch lengths,"+ 390 | " before: %#v, after: %#v,"+ 391 | " movesExp: %#v, movesGot: %#v, test: %#v", 392 | testi, before, after, movesExp, movesGot, test) 393 | 394 | continue 395 | } 396 | 397 | for moveExpi, moveExp := range movesExp { 398 | moveGot := movesGot[moveExpi] 399 | 400 | found := false 401 | 402 | for statei, state := range states { 403 | if found { 404 | continue 405 | } 406 | 407 | for _, move := range moveExp[state] { 408 | if found { 409 | continue 410 | } 411 | 412 | op := move[0:1] 413 | if op == "+" || op == "-" { 414 | found = true 415 | 416 | if moveGot.Node != move[1:] { 417 | t.Errorf("testi: %d, wrong node,"+ 418 | " before: %#v, after: %#v,"+ 419 | " movesExp: %#v, movesGot: %#v,"+ 420 | " test: %#v", 421 | testi, before, after, 
422 | movesExp, movesGot, test) 423 | } 424 | 425 | flipSideFound := "" 426 | flipSideState := "" 427 | flipSide := negate[op] + move[1:] 428 | for j := statei + 1; j < len(states); j++ { 429 | for _, x := range moveExp[states[j]] { 430 | if x == flipSide { 431 | flipSideFound = flipSide 432 | flipSideState = states[j] 433 | } 434 | } 435 | } 436 | 437 | stateExp := state 438 | if flipSideFound != "" { 439 | if op == "-" { 440 | stateExp = flipSideState 441 | } 442 | } else { 443 | if op == "-" { 444 | stateExp = "" 445 | } 446 | } 447 | 448 | if moveGot.State != stateExp { 449 | t.Errorf("testi: %d, not stateExp: %q,"+ 450 | " before: %#v, after: %#v,"+ 451 | " movesExp: %#v, movesGot: %#v,"+ 452 | " test: %#v, move: %s,"+ 453 | " flipSideFound: %q, flipSideState: %q", 454 | testi, stateExp, before, after, 455 | movesExp, movesGot, test, move, 456 | flipSideFound, flipSideState) 457 | } 458 | 459 | if flipSideFound != "" { 460 | if moveGot.Op != "promote" && 461 | moveGot.Op != "demote" { 462 | t.Errorf("testi: %d, wanted pro/demote,"+ 463 | " before: %#v, after: %#v,"+ 464 | " movesExp: %#v, movesGot: %#v,"+ 465 | " test: %#v, move: %s,"+ 466 | " flipSideFound: %q, flipSideState: %q", 467 | testi, before, after, 468 | movesExp, movesGot, test, move, 469 | flipSideFound, flipSideState) 470 | } 471 | } else if moveGot.Op != ops[op] { 472 | t.Errorf("testi: %d, wanted op: %q,"+ 473 | " before: %#v, after: %#v,"+ 474 | " movesExp: %#v, movesGot: %#v,"+ 475 | " test: %#v, move: %s,"+ 476 | " flipSideFound: %q, flipSideState: %q", 477 | testi, ops[op], before, after, 478 | movesExp, movesGot, test, move, 479 | flipSideFound, flipSideState) 480 | } 481 | } 482 | } 483 | } 484 | } 485 | } 486 | } 487 | 488 | // Converts an input line string like " a b | +c -d", with input 489 | // states of ["primary", "replica"] to something like {"primary": ["a", 490 | // "b"], "replica": ["+c", "-d"]}. 
491 | func convertLineToNodesByState( 492 | line string, states []string) map[string][]string { 493 | nodesByState := map[string][]string{} 494 | 495 | line = strings.Trim(line, " ") 496 | for { 497 | linex := strings.Replace(line, " ", " ", -1) 498 | if linex == line { 499 | break 500 | } 501 | line = linex 502 | } 503 | 504 | parts := strings.Split(line, "|") 505 | for i, state := range states { 506 | if i >= len(parts) { 507 | break 508 | } 509 | part := strings.Trim(parts[i], " ") 510 | if part != "" { 511 | nodes := strings.Split(part, " ") 512 | nodesByState[state] = append(nodesByState[state], nodes...) 513 | } 514 | } 515 | 516 | return nodesByState 517 | } 518 | -------------------------------------------------------------------------------- /orchestrate.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | package blance 10 | 11 | import ( 12 | "errors" 13 | "fmt" 14 | "sync" 15 | ) 16 | 17 | // ErrorStopped is returned when an operation was stopped 18 | var ErrorStopped = errors.New("stopped") 19 | 20 | // ErrorInterrupt is returned when an operation was interrupted 21 | var ErrorInterrupt = errors.New("interrupt") 22 | 23 | /* 24 | We let the app have detailed control of the prioritization heuristics 25 | via the FindMoveFunc callback. Here are some move prioritization 26 | ideas or heuristics for FindMoveFunc implementors to consider. 
27 | 28 | Some apps might first favor easy, single-node promotions and demotions 29 | (e.g., a replica partition graduating to primary on the same node) 30 | because single-node state changes should be fast and so that clients 31 | can have more coverage across all partitions. The 32 | LowestWeightPartitionMoveForNode() implementation does this now. 33 | 34 | Next, favor assignments of partitions that have no replicas assigned 35 | anywhere, where we want to get to that first data partition instance 36 | or replica as soon as possible. Once we have that first replica for a 37 | data partition, though, we should consider favoring other kinds of 38 | moves over building even more replicas of that data partition. 39 | 40 | Next, favor reassignments that utilize capacity on newly added nodes, 41 | as the new nodes may be able to help with existing, overtaxed 42 | nodes. But be aware: starting off more KV backfills, for example, may 43 | push existing nodes running at the limit over the edge. 44 | 45 | Next, favor reassignments that help get partitions off of nodes that 46 | are leaving the cluster. The idea is to allow us to remove nodes 47 | (which may need servicing) sooner. 48 | 49 | Next, favor removals of partitions that are over-replicated. For 50 | example, there might be too many replicas of a partition remaining on 51 | new/existing nodes. 52 | 53 | Lastly, favor reassignments that move partitions amongst nodes than 54 | are neither joining nor leaving the cluster. In this case, the system 55 | may need to shuffle partitions to achieve better balance or meet 56 | replication constraints. 57 | 58 | Other, more advanced factors to consider in the heuristics, which may 59 | be addressed in future releases, but would just be additions to the 60 | ordering/sorting algorithm. 61 | 62 | Some nodes might be slower, less powerful and more impacted than 63 | others. 64 | 65 | Some partitions might be way behind compared to others. 
66 | 67 | Some partitions might be much larger than others. 68 | 69 | Some partitions might have data sources under more pressure than 70 | others and less able to handle yet another a request for a data source 71 | full-scan (backfill). 72 | 73 | Perhaps consider how about some randomness? 74 | */ 75 | 76 | // ------------------------------------------ 77 | 78 | // An Orchestrator instance holds the runtime state during an 79 | // OrchestrateMoves() operation. 80 | type Orchestrator struct { 81 | model PartitionModel 82 | 83 | options OrchestratorOptions 84 | 85 | nodesAll []string // Union of all nodes (entering, leaving, remaining). 86 | 87 | begMap PartitionMap // The map state that we start with. 88 | endMap PartitionMap // The map state we want to end up with. 89 | 90 | assignPartitions AssignPartitionsFunc 91 | findMove FindMoveFunc 92 | 93 | progressCh chan OrchestratorProgress 94 | 95 | // Keyed by node name. 96 | mapNodeToPartitionMoveReqCh map[string]chan partitionMoveReq 97 | 98 | m sync.Mutex // Protects the fields that follow. 99 | 100 | stopCh chan struct{} // Becomes nil when stopped. 101 | pauseCh chan struct{} // May be nil; non-nil when paused. 102 | progress OrchestratorProgress 103 | 104 | // Keyed by partition name. 105 | mapPartitionToNextMoves map[string]*NextMoves 106 | } 107 | 108 | // OrchestratorOptions represents advanced config parameters for 109 | // OrchestrateMoves(). 110 | type OrchestratorOptions struct { 111 | MaxConcurrentPartitionMovesPerNode int 112 | 113 | // See blance.CalcPartitionMoves(favorMinNodes). 114 | FavorMinNodes bool 115 | } 116 | 117 | // OrchestratorProgress represents progress counters and/or error 118 | // information as the OrchestrateMoves() operation proceeds. 
119 | type OrchestratorProgress struct { 120 | Errors []error 121 | 122 | TotStop int 123 | TotPauseNewAssignments int 124 | TotResumeNewAssignments int 125 | TotRunMover int 126 | TotRunMoverDone int 127 | TotRunMoverDoneErr int 128 | TotMoverLoop int 129 | TotMoverAssignPartition int 130 | TotMoverAssignPartitionOk int 131 | TotMoverAssignPartitionErr int 132 | TotRunSupplyMovesLoop int 133 | TotRunSupplyMovesLoopDone int 134 | TotRunSupplyMovesFeeding int 135 | TotRunSupplyMovesFeedingDone int 136 | TotRunSupplyMovesDone int 137 | TotRunSupplyMovesDoneErr int 138 | TotRunSupplyMovesPause int 139 | TotRunSupplyMovesResume int 140 | TotProgressClose int 141 | } 142 | 143 | // AssignPartitionsFunc is a callback invoked by OrchestrateMoves() 144 | // when it wants to synchronously assign one more more partitions to 145 | // a node at a given state, or change the state of an existing partition 146 | // on a node. The state will be "" if the partition should be removed or 147 | // deleted from the node. 148 | type AssignPartitionsFunc func(stopCh chan struct{}, 149 | node string, 150 | partitions []string, 151 | states []string, 152 | ops []string) error 153 | 154 | // FindMoveFunc is a callback invoked by OrchestrateMoves() when it 155 | // wants to find the best partition move out of a set of available 156 | // partition moves for node. It should return the array index of the 157 | // partition move that should be used next. 158 | type FindMoveFunc func(node string, moves []PartitionMove) int 159 | 160 | // A PartitionMove struct represents a state change or operation on a 161 | // partition on a node. 162 | type PartitionMove struct { 163 | Partition string 164 | 165 | Node string 166 | 167 | // Ex: "primary", "replica". 168 | State string 169 | 170 | // Same as NodeStateOp.Op: "add", "del", "promote", "demote". 
171 | Op string 172 | } 173 | 174 | // LowestWeightPartitionMoveForNode implements the FindMoveFunc 175 | // callback signature, by using the MoveOpWeight lookup table to find 176 | // the lowest weight partition move for a node. 177 | func LowestWeightPartitionMoveForNode( 178 | node string, moves []PartitionMove) int { 179 | r := 0 180 | for i, move := range moves { 181 | if MoveOpWeight[moves[r].Op] > MoveOpWeight[move.Op] { 182 | r = i 183 | } 184 | } 185 | return r 186 | } 187 | 188 | // MoveOpWeight sets the weight associated with each op 189 | var MoveOpWeight = map[string]int{ 190 | "promote": 1, 191 | "demote": 2, 192 | "add": 3, 193 | "del": 4, 194 | } 195 | 196 | // A NextMoves struct is used to track a sequence of moves of a 197 | // partition, including the next move that that needs to be taken. 198 | type NextMoves struct { 199 | Partition string // Immutable. 200 | 201 | // Mutable index or current position in the moves array that 202 | // represents the next available move for a partition. 203 | Next int 204 | 205 | // The sequence of moves can come from the output of the 206 | // CalcPartitionMoves() function and is immutable. 207 | Moves []NodeStateOp 208 | 209 | // When non-nil, it means the move is already in-flight (was 210 | // successfully fed to a mover) but hasn't finished yet, and the 211 | // move supplier needs to wait for the nextDoneCh to be closed. 212 | // The nextDoneCh == partitionMoveReq.doneCh. 213 | nextDoneCh chan error 214 | } 215 | 216 | // ------------------------------------------ 217 | 218 | // A partitionMoveReq wraps one or many partitionMoves, allowing the receiver (a 219 | // mover) to signal that the move request is completed by closing the doneCh. 
220 | type partitionMoveReq struct { 221 | partitionMove []PartitionMove 222 | doneCh chan error 223 | } 224 | 225 | // ------------------------------------------ 226 | 227 | // OrchestrateMoves asynchronously begins reassigning partitions 228 | // amongst nodes in order to transition from the begMap to the endMap 229 | // state, invoking the assignPartition() to affect changes. 230 | // Additionally, the caller must read the progress channel until it's 231 | // closed by OrchestrateMoves to avoid blocking the orchestration, and 232 | // as a way to monitor progress. 233 | // 234 | // The nodesAll must be a union or superset of all the nodes during 235 | // the orchestration (nodes added, removed, unchanged). 236 | // 237 | // The findMove callback is invoked when OrchestrateMoves needs to 238 | // find the best move for a node from amongst a set of available 239 | // moves. 240 | func OrchestrateMoves( 241 | model PartitionModel, 242 | options OrchestratorOptions, 243 | nodesAll []string, 244 | begMap PartitionMap, 245 | endMap PartitionMap, 246 | assignPartitions AssignPartitionsFunc, 247 | findMove FindMoveFunc) (*Orchestrator, error) { 248 | if len(begMap) != len(endMap) { 249 | return nil, fmt.Errorf("mismatched begMap and endMap") 250 | } 251 | 252 | if assignPartitions == nil { 253 | return nil, fmt.Errorf("callback implementation for " + 254 | "AssignPartitionsFunc is expected") 255 | } 256 | 257 | // Populate the mapNodeToPartitionMoveReqCh, keyed by node name. 258 | mapNodeToPartitionMoveReqCh := map[string]chan partitionMoveReq{} 259 | for _, node := range nodesAll { 260 | mapNodeToPartitionMoveReqCh[node] = make(chan partitionMoveReq) 261 | } 262 | 263 | states := sortStateNames(model) 264 | 265 | // Populate the mapPartitionToNextMoves, keyed by partition name, 266 | // with the output from CalcPartitionMoves(). 
267 | // 268 | // As an analogy, this step calculates a bunch of airplane flight 269 | // plans, without consideration to what the other airplanes are 270 | // doing, where each flight plan has multi-city, multi-leg hops. 271 | mapPartitionToNextMoves := map[string]*NextMoves{} 272 | 273 | for partitionName, begPartition := range begMap { 274 | endPartition := endMap[partitionName] 275 | 276 | moves := CalcPartitionMoves(states, 277 | begPartition.NodesByState, 278 | endPartition.NodesByState, 279 | options.FavorMinNodes, 280 | ) 281 | 282 | mapPartitionToNextMoves[partitionName] = &NextMoves{ 283 | Partition: partitionName, 284 | Next: 0, 285 | Moves: moves, 286 | } 287 | } 288 | 289 | o := &Orchestrator{ 290 | model: model, 291 | options: options, 292 | nodesAll: nodesAll, 293 | begMap: begMap, 294 | endMap: endMap, 295 | assignPartitions: assignPartitions, 296 | findMove: findMove, 297 | progressCh: make(chan OrchestratorProgress), 298 | 299 | mapNodeToPartitionMoveReqCh: mapNodeToPartitionMoveReqCh, 300 | 301 | stopCh: make(chan struct{}), 302 | pauseCh: nil, 303 | 304 | mapPartitionToNextMoves: mapPartitionToNextMoves, 305 | } 306 | 307 | stopCh := o.stopCh 308 | 309 | runMoverDoneCh := make(chan error) 310 | 311 | // Start a concurrent mover per node. 312 | // 313 | // Following the airplane analogy, a runMover() represents 314 | // a takeoff runway at a city airport (or node). 315 | // A single runMover is capable of dispatching multiple 316 | // partition move requests in a batch. 317 | // All the partitions in a batch steps together through the 318 | // various stages of movement like replica add, primary promote etc. 319 | for _, node := range o.nodesAll { 320 | go o.runMover(stopCh, runMoverDoneCh, node) 321 | } 322 | 323 | // Supply moves to movers. 
324 | // 325 | // Following the airplane/airport analogy, a runSupplyMoves() 326 | // goroutine is like some global, supreme airport controller, 327 | // remotely controlling all the city airports across the entire 328 | // realm, and deciding which plane can take off next at each 329 | // airport. Each plane is following its multi-leg flight plan 330 | // that was computed from earlier (via CalcPartitionMoves), but 331 | // when multiple planes are concurrently ready to takeoff from a 332 | // city's airport (or node), this global, supreme airport 333 | // controller chooses which plane (or partition) gets to takeoff 334 | // next. 335 | go o.runSupplyMoves(stopCh, runMoverDoneCh) 336 | 337 | return o, nil 338 | } 339 | 340 | // Stop asynchronously requests the orchestrator to stop, where the 341 | // caller will eventually see a closed progress channel. 342 | func (o *Orchestrator) Stop() { 343 | o.m.Lock() 344 | if o.stopCh != nil { 345 | o.progress.TotStop++ 346 | close(o.stopCh) 347 | o.stopCh = nil 348 | } 349 | o.m.Unlock() 350 | } 351 | 352 | // ProgressCh returns a channel that is updated occasionally when 353 | // the orchestrator has made some progress on one or more partition 354 | // reassignments, or has reached an error. The channel is closed by 355 | // the orchestrator when it is finished, either naturally, or due to 356 | // an error, or via a Stop(), and all the orchestrator's resources 357 | // have been released. 358 | func (o *Orchestrator) ProgressCh() chan OrchestratorProgress { 359 | return o.progressCh 360 | } 361 | 362 | // PauseNewAssignments disallows the orchestrator from starting any 363 | // new assignments of partitions to nodes. Any inflight partition 364 | // moves will continue to be finished. The caller can monitor the 365 | // ProgressCh to determine when to pause and/or resume partition 366 | // assignments. PauseNewAssignments is idempotent. 
func (o *Orchestrator) PauseNewAssignments() error {
	o.m.Lock()
	// Only the first pause in a row creates pauseCh and counts.
	if o.pauseCh == nil {
		o.pauseCh = make(chan struct{})
		o.progress.TotPauseNewAssignments++
	}
	o.m.Unlock()
	return nil
}

// ResumeNewAssignments tells the orchestrator that it may resume
// assignments of partitions to nodes, and is idempotent.
func (o *Orchestrator) ResumeNewAssignments() error {
	o.m.Lock()
	// Closing pauseCh releases runSupplyMoves if it is blocked on it.
	if o.pauseCh != nil {
		o.progress.TotResumeNewAssignments++
		close(o.pauseCh)
		o.pauseCh = nil
	}
	o.m.Unlock()
	return nil
}

// -------------------------------------------------

// VisitNextMoves invokes the supplied callback with the map of
// partitions to *NextMoves, which should be treated as immutable by
// the callback. The orchestrator's lock is held for the duration of
// the callback, so the callback must not block or re-enter the
// orchestrator.
func (o *Orchestrator) VisitNextMoves(cb func(map[string]*NextMoves)) {
	o.m.Lock()
	cb(o.mapPartitionToNextMoves)
	o.m.Unlock()
}

// -------------------------------------------------

// runMover handles partition moves for a single node.
//
// There will only be a single runMover for a node which is
// capable of handling multiple partition movements for higher
// concurrency.
func (o *Orchestrator) runMover(
	stopCh chan struct{}, runMoverDoneCh chan error, node string) {
	o.updateProgress(func() {
		o.progress.TotRunMover++
	})

	// The partitionMoveReqCh has commands from the global, supreme
	// airport controller on which airplane (or partition) should
	// takeoff from the city airport next (but the supreme airport
	// controller doesn't care which takeoff runway at that airport is
	// used).
	partitionMoveReqCh := o.mapNodeToPartitionMoveReqCh[node]

	// Blocks until the loop exits; the result is collected by
	// waitForAllMoversDone().
	runMoverDoneCh <- o.moverLoop(stopCh, partitionMoveReqCh, node)
}

// moverLoop handles partitionMoveReq's by invoking the
// assignPartitions callback. It exits when stopCh fires or when
// partitionMoveReqCh is closed (by runSupplyMoves).
func (o *Orchestrator) moverLoop(stopCh chan struct{},
	partitionMoveReqCh chan partitionMoveReq, node string) error {
	for {
		o.updateProgress(func() {
			o.progress.TotMoverLoop++
		})

		select {
		case <-stopCh:
			return nil

		case partitionMoveReqVal, ok := <-partitionMoveReqCh:
			if !ok {
				return nil
			}

			// batch all the partition moves requested.
			var partitions, states, ops []string
			partitionMove := partitionMoveReqVal.partitionMove
			for _, pm := range partitionMove {
				partitions = append(partitions, pm.Partition)
				states = append(states, pm.State)
				ops = append(ops, pm.Op)
			}

			o.updateProgress(func() {
				o.progress.TotMoverAssignPartition++
			})

			err := o.assignPartitions(stopCh, node, partitions,
				states, ops)

			o.updateProgress(func() {
				if err != nil {
					o.progress.TotMoverAssignPartitionErr++
				} else {
					o.progress.TotMoverAssignPartitionOk++
				}
			})

			if partitionMoveReqVal.doneCh != nil {
				if err != nil {
					// Try to hand the error to the supplier, but
					// don't block forever if we were stopped.
					select {
					case <-stopCh:
						// NO-OP.
					case partitionMoveReqVal.doneCh <- err:
						// NO-OP.
					}
				}

				// On success, the close alone signals completion:
				// the receiver sees a nil (zero-value) error.
				close(partitionMoveReqVal.doneCh)
			}
		}
	}
}

// filterNextPlausibleMovesForNode picks up to
// MaxConcurrentPartitionMovesPerNode of the best moves for a node,
// using the findNextMoves (FindMoveFunc) callback repeatedly.
func (o *Orchestrator) filterNextPlausibleMovesForNode(node string,
	nextMovesArr []*NextMoves) (nxtMoves []*NextMoves) {
	count := o.options.MaxConcurrentPartitionMovesPerNode
	if count <= 0 {
		count = 1
	}
	if count > len(nextMovesArr) {
		count = len(nextMovesArr)
	}

	// pick sufficient number of the best possible moves per node.
	for count > 0 {
		i := o.findNextMoves(node, nextMovesArr)
		nxtMoves = append(nxtMoves, nextMovesArr[i])

		count--
		// Swap-with-last removal (O(1)); ordering of the remaining
		// candidates doesn't matter since findNextMoves re-ranks.
		nextMovesArr[i] = nextMovesArr[len(nextMovesArr)-1]
		nextMovesArr[len(nextMovesArr)-1] = nil
		nextMovesArr = nextMovesArr[:len(nextMovesArr)-1]
	}

	return nxtMoves
}

// runSupplyMoves "broadcasts" available partitionMoveReq's to movers.
// The broadcast is implemented via repeated "rounds" of spawning off
// concurrent helper goroutines of runSupplyMove()'s for each node.
func (o *Orchestrator) runSupplyMoves(stopCh chan struct{},
	runMoverDoneCh chan error) {
	var errOuter error

	for errOuter == nil {
		o.updateProgress(func() {
			o.progress.TotRunSupplyMovesLoop++
		})

		o.m.Lock()

		// The availableMoves is keyed by node name.
		availableMoves := o.findAvailableMovesUnlocked()

		pauseCh := o.pauseCh

		o.m.Unlock()

		// No remaining moves for any node means we're naturally done.
		if len(availableMoves) <= 0 {
			break
		}

		// The main pause/resume handling is via pausing/resuming the
		// runSupplyMoves loop. If caller needs to rebalancer.Stop()
		// while paused, they should resume before Stop()'ing.
		if pauseCh != nil {
			o.updateProgress(func() {
				o.progress.TotRunSupplyMovesPause++
			})

			// Blocks until ResumeNewAssignments closes pauseCh.
			<-pauseCh

			o.updateProgress(func() {
				o.progress.TotRunSupplyMovesResume++
			})
		}

		// Broadcast to every node mover their next, best move.
		broadcastStopCh := make(chan struct{})
		broadcastDoneCh := make(chan error)

		for node, nextMovesArr := range availableMoves {
			nxtMoves := o.filterNextPlausibleMovesForNode(node, nextMovesArr)

			go o.runSupplyMove(stopCh, node,
				nxtMoves,
				broadcastStopCh, broadcastDoneCh)
		}

		o.updateProgress(func() {
			o.progress.TotRunSupplyMovesFeeding++
		})

		// When the one or more node movers is successfully "fed" (via
		// broadcastDoneCh), then stop the broadcast (via
		// broadcastStopCh) so that we can repeat the outer loop to
		// re-calculate another round of available moves.
		broadcastStopChClosed := false

		// Collect exactly one result from each runSupplyMove spawned
		// above; a nil error means one mover was fed successfully.
		for range availableMoves {
			err := <-broadcastDoneCh
			if err == nil && !broadcastStopChClosed {
				close(broadcastStopCh)
				broadcastStopChClosed = true
			}

			// ErrorInterrupt is the expected result of our own
			// broadcast interruption, so don't treat it as failure.
			if err != nil &&
				err != ErrorInterrupt &&
				errOuter == nil {
				errOuter = err
			}
		}

		o.updateProgress(func() {
			o.progress.TotRunSupplyMovesFeedingDone++
		})

		if !broadcastStopChClosed {
			close(broadcastStopCh)
		}

		close(broadcastDoneCh)
	}

	o.updateProgress(func() {
		o.progress.TotRunSupplyMovesLoopDone++
	})

	// Closing the per-node request channels makes every moverLoop
	// exit (the !ok receive case).
	for _, partitionMoveReqCh := range o.mapNodeToPartitionMoveReqCh {
		close(partitionMoveReqCh)
	}

	o.updateProgress(func() {
		o.progress.TotRunSupplyMovesDone++
		if errOuter != nil &&
			errOuter != ErrorStopped {
			o.progress.Errors = append(o.progress.Errors, errOuter)
			o.progress.TotRunSupplyMovesDoneErr++
		}
	})

	// Wait for movers to finish.
	o.waitForAllMoversDone(1, runMoverDoneCh)

	o.updateProgress(func() {
		o.progress.TotProgressClose++
	})

	// Closing progressCh is the caller-visible "all done" signal.
	close(o.progressCh)
}

// runSupplyMove tries to send a single partitionMoveReq to a single
// node, along with handling the broadcast interruptions.
func (o *Orchestrator) runSupplyMove(stopCh chan struct{},
	node string, nextMoves []*NextMoves,
	broadcastStopCh chan struct{},
	broadcastDoneCh chan error) {
	var nextDoneCh chan error
	// check whether any of the nextMoves entry is already in flight,
	// If so, then wait for that movement to complete before
	// submitting any subsequent partition movement requests.
	o.m.Lock()
	for _, nm := range nextMoves {
		if nm.nextDoneCh != nil {
			nextDoneCh = nm.nextDoneCh
			break
		}
	}
	o.m.Unlock()

	if nextDoneCh == nil {
		nextDoneCh = make(chan error)

		pmr := partitionMoveReq{
			partitionMove: make([]PartitionMove, 0, len(nextMoves)),
			doneCh:        nextDoneCh,
		}

		// Snapshot each partition's current move under the lock,
		// since Next is mutated concurrently by other supply rounds.
		o.m.Lock()
		for _, nm := range nextMoves {
			pmr.partitionMove = append(pmr.partitionMove, PartitionMove{
				Partition: nm.Partition,
				Node:      nm.Moves[nm.Next].Node,
				State:     nm.Moves[nm.Next].State,
				Op:        nm.Moves[nm.Next].Op,
			})
		}
		o.m.Unlock()

		select {
		case <-stopCh:
			broadcastDoneCh <- ErrorStopped
			return

		case <-broadcastStopCh:
			broadcastDoneCh <- ErrorInterrupt
			return

		case o.mapNodeToPartitionMoveReqCh[node] <- pmr:
			// Fed successfully; mark every batched move as
			// in-flight so later rounds wait on the same doneCh.
			o.m.Lock()
			for i := range nextMoves {
				nextMoves[i].nextDoneCh = nextDoneCh
			}
			o.m.Unlock()
		}
	}

	select {
	case <-stopCh:
		broadcastDoneCh <- ErrorStopped

	case <-broadcastStopCh:
		broadcastDoneCh <- ErrorInterrupt

	case err := <-nextDoneCh:
		// The mover closed (or sent on) doneCh: the batch finished.
		o.m.Lock()
		for i := range nextMoves {
			// check the inflight status.
			if nextMoves[i].nextDoneCh == nextDoneCh {
				nextMoves[i].nextDoneCh = nil
				nextMoves[i].Next++
			}
		}
		o.m.Unlock()

		broadcastDoneCh <- err
	}
}

// findNextMoves invokes the application's FindMoveFunc callback.
func (o *Orchestrator) findNextMoves(
	node string, nextMovesArr []*NextMoves) int {
	moves := make([]PartitionMove, len(nextMovesArr))

	for i, nextMoves := range nextMovesArr {
		m := nextMoves.Moves[nextMoves.Next]

		moves[i] = PartitionMove{
			Partition: nextMoves.Partition,
			Node:      m.Node,
			State:     m.State,
			Op:        m.Op,
		}
	}
	return o.findMove(node, moves)
}

// waitForAllMoversDone returns when all concurrent movers have
// finished, propagating any of their errors to the progressCh.
// The multiplier m scales the expected number of mover results
// (callers here pass 1 — one mover per node).
func (o *Orchestrator) waitForAllMoversDone(
	m int, runMoverDoneCh chan error) {
	for i := 0; i < len(o.nodesAll)*m; i++ {
		err := <-runMoverDoneCh

		o.updateProgress(func() {
			o.progress.TotRunMoverDone++
			if err != nil {
				o.progress.Errors = append(o.progress.Errors, err)
				o.progress.TotRunMoverDoneErr++
			}
		})
	}
}

// updateProgress is a helper func to allow for progress updates and
// sends progress events to the progressCh. Note: the send blocks
// until the caller of OrchestrateMoves reads from ProgressCh(), which
// is why callers are required to drain the progress channel.
func (o *Orchestrator) updateProgress(f func()) {
	o.m.Lock()

	f()

	// Copy the snapshot under the lock; the send happens outside it.
	progress := o.progress

	o.m.Unlock()

	o.progressCh <- progress
}

// findAvailableMovesUnlocked returns the next round of available
// moves. The "Unlocked" suffix means the caller must already hold
// o.m (see runSupplyMoves).
func (o *Orchestrator) findAvailableMovesUnlocked() (
	availableMoves map[string][]*NextMoves) {
	// The availableMoves is keyed by node name.
	availableMoves = map[string][]*NextMoves{}

	// A partition still has an available move only while its Next
	// cursor hasn't run past the end of its planned Moves sequence.
	for _, nextMoves := range o.mapPartitionToNextMoves {
		if nextMoves.Next < len(nextMoves.Moves) {
			node := nextMoves.Moves[nextMoves.Next].Node
			availableMoves[node] =
				append(availableMoves[node], nextMoves)
		}
	}

	return availableMoves
}
--------------------------------------------------------------------------------
/orchestrate_test.go:
--------------------------------------------------------------------------------
/*
Copyright 2015-Present Couchbase, Inc.

Use of this software is governed by the Business Source License included in
the file licenses/BSL-Couchbase.txt. As of the Change Date specified in that
file, in accordance with the Business Source License, use of this software will
be governed by the Apache License, Version 2.0, included in the file
licenses/APL2.txt.
*/

package blance

import (
	"fmt"
	"reflect"
	"sort"
	"sync"
	"testing"
)

// assignPartitionRec records one invocation of the test's
// AssignPartitionsFunc, for later verification.
type assignPartitionRec struct {
	partition string
	node      string
	state     string
	op        string
}

// mrPartitionModel is a simple primary/replica partition model shared
// by the orchestration tests.
var mrPartitionModel = PartitionModel{
	"primary": &PartitionModelState{
		Priority: 0,
	},
	"replica": &PartitionModelState{
		Constraints: 1,
	},
}

var options1 = OrchestratorOptions{
	MaxConcurrentPartitionMovesPerNode: 1,
}

// TestOrchestrateBadMoves checks that OrchestrateMoves rejects
// mismatched beg/end maps up front.
func TestOrchestrateBadMoves(t *testing.T) {
	o, err := OrchestrateMoves(
		mrPartitionModel,
		options1,
		nil,
		PartitionMap{
			"00": &Partition{
				Name:         "00",
				NodesByState: map[string][]string{},
			},
			"01": &Partition{
				Name:         "01",
				NodesByState: map[string][]string{},
			},
		},
		PartitionMap{
			"01": &Partition{
				Name:         "01",
				NodesByState: map[string][]string{},
			},
		},
		nil,
		nil,
	)
	if err == nil || o != nil {
		t.Errorf("expected err on mismatched beg/end maps")
	}
}

// TestOrchestrateErrAssignPartitionFunc checks that an error returned
// by the AssignPartitionsFunc surfaces in the final progress.Errors.
func TestOrchestrateErrAssignPartitionFunc(t *testing.T) {
	theErr := fmt.Errorf("theErr")

	errAssignPartitionsFunc := func(stopCh chan struct{},
		node string, partition, state, op []string) error {
		return theErr
	}

	o, err := OrchestrateMoves(
		mrPartitionModel,
		OrchestratorOptions{},
		[]string{"a", "b"},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"a"},
				},
			},
		},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"b"},
				},
			},
		},
		errAssignPartitionsFunc,
		LowestWeightPartitionMoveForNode,
	)
	if err != nil || o == nil {
		t.Errorf("expected nil err")
	}

	gotProgress := 0
	var lastProgress OrchestratorProgress

	// Drain the progress channel until the orchestrator closes it.
	for progress := range o.ProgressCh() {
		gotProgress++
		lastProgress = progress
	}

	o.Stop()

	if gotProgress <= 0 {
		t.Errorf("expected progress")
	}

	if len(lastProgress.Errors) <= 0 {
		t.Errorf("expected errs")
	}

	o.VisitNextMoves(func(x map[string]*NextMoves) {
		if x == nil {
			t.Errorf("expected x")
		}
	})
}

// testMkFuncs returns a mutex-guarded currStates map, a recording of
// every assign callback invocation, and an AssignPartitionsFunc that
// populates both.
func testMkFuncs() (
	map[string]map[string]string,
	map[string][]assignPartitionRec,
	AssignPartitionsFunc,
) {
	var m sync.Mutex

	// Map of partition -> node -> state.
	currStates := map[string]map[string]string{}

	assignPartitionRecs := map[string][]assignPartitionRec{}

	assignPartitionsFunc := func(stopCh chan struct{},
		node string, partitions, states, ops []string) error {
		m.Lock()

		// Record the first entry of the batch and track its state.
		assignPartitionRecs[partitions[0]] =
			append(assignPartitionRecs[partitions[0]],
				assignPartitionRec{partitions[0], node, states[0], ops[0]})

		nodes := currStates[partitions[0]]
		if nodes == nil {
			nodes = map[string]string{}
			currStates[partitions[0]] = nodes
		}

		nodes[node] = states[0]

		m.Unlock()

		return nil
	}

	return currStates, assignPartitionRecs, assignPartitionsFunc
}

func TestOrchestrateEarlyPauseResume(t *testing.T) {
	testOrchestratePauseResume(t, 1)
}

func TestOrchestrateMidPauseResume(t *testing.T) {
	testOrchestratePauseResume(t, 2)
}

// testOrchestratePauseResume pauses/resumes after numProgress
// progress events and verifies the idempotent pause/resume counters.
func testOrchestratePauseResume(t *testing.T, numProgress int) {
	_, _, assignPartitionsFunc := testMkFuncs()

	pauseCh := make(chan struct{})

	// Block every assignment until pauseCh is closed below.
	slowAssignPartitionsFunc := func(stopCh chan struct{},
		node string, partitions, states, ops []string) error {
		<-pauseCh
		return assignPartitionsFunc(stopCh, node, partitions, states, ops)
	}

	o, err := OrchestrateMoves(
		mrPartitionModel,
		OrchestratorOptions{},
		[]string{"a", "b"},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"a"},
					"replica": {"b"},
				},
			},
			"01": &Partition{
				Name: "01",
				NodesByState: map[string][]string{
					"primary": {"a"},
					"replica": {"b"},
				},
			},
			"02": &Partition{
				Name: "02",
				NodesByState: map[string][]string{
					"primary": {"a"},
					"replica": {"b"},
				},
			},
		},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"b"},
					"replica": {"a"},
				},
			},
			"01": &Partition{
				Name: "01",
				NodesByState: map[string][]string{
					"primary": {"b"},
					"replica": {"a"},
				},
			},
			"02": &Partition{
				Name: "02",
				NodesByState: map[string][]string{
					"primary": {"b"},
					"replica": {"a"},
				},
			},
		},
		slowAssignPartitionsFunc,
		LowestWeightPartitionMoveForNode,
	)
	if err != nil || o == nil {
		t.Errorf("expected nil err")
	}

	for i := 0; i < numProgress; i++ {
		<-o.ProgressCh()
	}

	// Repeated pauses should count only once.
	o.PauseNewAssignments()
	o.PauseNewAssignments()
	o.PauseNewAssignments()

	// Repeated resumes should count only once.
	o.ResumeNewAssignments()
	o.ResumeNewAssignments()

	close(pauseCh)

	gotProgress := 0
	var lastProgress OrchestratorProgress

	for progress := range o.ProgressCh() {
		gotProgress++
		lastProgress = progress

		o.ResumeNewAssignments()
	}

	o.Stop()

	if gotProgress <= 0 {
		t.Errorf("expected progress")
	}

	if len(lastProgress.Errors) > 0 {
		t.Errorf("expected no errs")
	}

	if lastProgress.TotPauseNewAssignments != 1 ||
		lastProgress.TotResumeNewAssignments != 1 {
		t.Errorf("numProgress: %d, expected pause/resume of 1, got: %#v",
			numProgress, lastProgress)
	}
}

// Another attempt at pause/resume testing that tries to exercise
// pause/resume code paths in the moves supplier.
func TestOrchestratePauseResumeIntoMovesSupplier(t *testing.T) {
	testOrchestratePauseResumeIntoMovesSupplier(t, 2, 1)
}

// testOrchestratePauseResumeIntoMovesSupplier lets the first
// numFastAssignPartitionFuncs assignments run, then blocks the rest
// until slowCh is closed, pausing/resuming in between.
func testOrchestratePauseResumeIntoMovesSupplier(t *testing.T,
	numProgressBeforePause, numFastAssignPartitionFuncs int) {
	_, _, assignPartitionsFunc := testMkFuncs()

	var m sync.Mutex
	numAssignPartitionFuncs := 0

	slowCh := make(chan struct{})

	slowAssignPartitionsFunc := func(stopCh chan struct{},
		node string, partitions, states, ops []string) error {
		m.Lock()
		numAssignPartitionFuncs++
		n := numAssignPartitionFuncs
		m.Unlock()

		// Only the callbacks after the first few block on slowCh.
		if n > numFastAssignPartitionFuncs {
			<-slowCh
		}

		return assignPartitionsFunc(stopCh, node, partitions, states, ops)
	}

	o, err := OrchestrateMoves(
		mrPartitionModel,
		OrchestratorOptions{},
		[]string{"a", "b", "c"},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"a"},
					"replica": {"b"},
				},
			},
			"01": &Partition{
				Name: "01",
				NodesByState: map[string][]string{
					"primary": {"b"},
					"replica": {"c"},
				},
			},
		},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"b"},
					"replica": {"c"},
				},
			},
			"01": &Partition{
				Name: "01",
				NodesByState: map[string][]string{
					"primary": {"c"},
					"replica": {"a"},
				},
			},
		},
		slowAssignPartitionsFunc,
		LowestWeightPartitionMoveForNode,
	)
	if err != nil || o == nil {
		t.Errorf("expected nil err")
	}

	for i := 0; i < numProgressBeforePause; i++ {
		<-o.ProgressCh()
	}

	// Repeated pauses should count only once.
	o.PauseNewAssignments()
	o.PauseNewAssignments()
	o.PauseNewAssignments()

	// Repeated resumes should count only once.
	o.ResumeNewAssignments()
	o.ResumeNewAssignments()

	close(slowCh)

	gotProgress := 0
	var lastProgress OrchestratorProgress

	for progress := range o.ProgressCh() {
		gotProgress++
		lastProgress = progress

		o.ResumeNewAssignments()
	}

	o.Stop()

	if gotProgress <= 0 {
		t.Errorf("expected progress")
	}

	if len(lastProgress.Errors) > 0 {
		t.Errorf("expected no errs")
	}

	if lastProgress.TotPauseNewAssignments != 1 ||
		lastProgress.TotResumeNewAssignments != 1 {
		t.Errorf("numProgressBeforePause: %d,"+
			" expected pause/resume of 1, got: %#v",
			numProgressBeforePause, lastProgress)
	}
}

// TestOrchestrateEarlyStop checks that repeated Stop() calls count
// only once and that stopping early still closes the progress channel
// without errors.
func TestOrchestrateEarlyStop(t *testing.T) {
	_, _, assignPartitionFunc := testMkFuncs()

	o, err := OrchestrateMoves(
		mrPartitionModel,
		OrchestratorOptions{},
		[]string{"a", "b"},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"a"},
				},
			},
		},
		PartitionMap{
			"00": &Partition{
				Name: "00",
				NodesByState: map[string][]string{
					"primary": {"b"},
				},
			},
		},
		assignPartitionFunc,
		LowestWeightPartitionMoveForNode,
	)
	if err != nil || o == nil {
		t.Errorf("expected nil err")
	}

	<-o.ProgressCh()

	// Stop is idempotent; only the first call should count.
	o.Stop()
	o.Stop()
	o.Stop()

	gotProgress := 0
	var lastProgress OrchestratorProgress

	for progress := range o.ProgressCh() {
		gotProgress++
		lastProgress = progress
	}

	if gotProgress <= 0 {
		t.Errorf("expected some progress")
	}

	if len(lastProgress.Errors) > 0 {
		t.Errorf("expected no errs")
	}

	if lastProgress.TotStop != 1 {
		t.Errorf("expected stop of 1")
	}
}

func TestOrchestrateConcurrentMoves(t *testing.T) {
453 | options := OrchestratorOptions{} 454 | 455 | tests := []struct { 456 | skip bool 457 | label string 458 | partitionModel PartitionModel 459 | maxConcurrentMoves int 460 | numProgress int 461 | nodesAll []string 462 | begMap PartitionMap 463 | endMap PartitionMap 464 | assignPartitionsFunc AssignPartitionsFunc 465 | skipCallbacks int 466 | expConcurrentMovesCount int 467 | expNode string 468 | expMovePartitions []string 469 | expMoveStates []string 470 | expMoveOps []string 471 | expectErr error 472 | }{ 473 | { 474 | label: "2 node, 2 partition movement", 475 | partitionModel: mrPartitionModel, 476 | maxConcurrentMoves: 2, 477 | numProgress: 1, 478 | nodesAll: []string{"a", "b"}, 479 | begMap: PartitionMap{ 480 | "00": &Partition{ 481 | Name: "00", 482 | NodesByState: map[string][]string{ 483 | "primary": {"a"}, 484 | "replica": {}, 485 | }, 486 | }, 487 | "01": &Partition{ 488 | Name: "01", 489 | NodesByState: map[string][]string{ 490 | "primary": {"a"}, 491 | "replica": {}, 492 | }, 493 | }, 494 | "02": &Partition{ 495 | Name: "02", 496 | NodesByState: map[string][]string{ 497 | "primary": {"a"}, 498 | "replica": {}, 499 | }, 500 | }, 501 | "03": &Partition{ 502 | Name: "03", 503 | NodesByState: map[string][]string{ 504 | "primary": {"a"}, 505 | "replica": {}, 506 | }, 507 | }, 508 | }, 509 | endMap: PartitionMap{ 510 | "00": &Partition{ 511 | Name: "00", 512 | NodesByState: map[string][]string{ 513 | "primary": {"a"}, 514 | "replica": {}, 515 | }, 516 | }, 517 | "01": &Partition{ 518 | Name: "01", 519 | NodesByState: map[string][]string{ 520 | "primary": {"a"}, 521 | "replica": {}, 522 | }, 523 | }, 524 | "02": &Partition{ 525 | Name: "02", 526 | NodesByState: map[string][]string{ 527 | "primary": {"b"}, 528 | "replica": {}, 529 | }, 530 | }, 531 | "03": &Partition{ 532 | Name: "3", 533 | NodesByState: map[string][]string{ 534 | "primary": {"b"}, 535 | "replica": {}, 536 | }, 537 | }, 538 | }, 539 | expNode: "b", 540 | expConcurrentMovesCount: 2, 541 | 
expMovePartitions: []string{"02", "03"}, 542 | expMoveStates: []string{"primary", "primary"}, 543 | expMoveOps: []string{"add", "add"}, 544 | expectErr: nil, 545 | }, 546 | { 547 | label: "1 node, 4 partition movement", 548 | partitionModel: mrPartitionModel, 549 | maxConcurrentMoves: 4, 550 | numProgress: 1, 551 | nodesAll: []string{"a"}, 552 | begMap: PartitionMap{ 553 | "00": &Partition{ 554 | Name: "00", 555 | NodesByState: map[string][]string{}, 556 | }, 557 | "01": &Partition{ 558 | Name: "01", 559 | NodesByState: map[string][]string{}, 560 | }, 561 | "02": &Partition{ 562 | Name: "02", 563 | NodesByState: map[string][]string{}, 564 | }, 565 | "03": &Partition{ 566 | Name: "03", 567 | NodesByState: map[string][]string{}, 568 | }, 569 | }, 570 | endMap: PartitionMap{ 571 | "00": &Partition{ 572 | Name: "00", 573 | NodesByState: map[string][]string{ 574 | "primary": {"a"}, 575 | "replica": {}, 576 | }, 577 | }, 578 | "01": &Partition{ 579 | Name: "01", 580 | NodesByState: map[string][]string{ 581 | "primary": {"a"}, 582 | "replica": {}, 583 | }, 584 | }, 585 | "02": &Partition{ 586 | Name: "02", 587 | NodesByState: map[string][]string{ 588 | "primary": {"a"}, 589 | "replica": {}, 590 | }, 591 | }, 592 | "03": &Partition{ 593 | Name: "3", 594 | NodesByState: map[string][]string{ 595 | "primary": {"a"}, 596 | "replica": {}, 597 | }, 598 | }, 599 | }, 600 | expConcurrentMovesCount: 4, 601 | expNode: "a", 602 | expMovePartitions: []string{"00", "01", "02", "03"}, 603 | expMoveStates: []string{"primary", "primary", "primary", "primary"}, 604 | expMoveOps: []string{"add", "add", "add", "add"}, 605 | expectErr: nil, 606 | }, 607 | { 608 | skip: true, 609 | label: "empty assignPartitions callback", 610 | partitionModel: mrPartitionModel, 611 | maxConcurrentMoves: 2, 612 | expConcurrentMovesCount: 2, 613 | numProgress: 0, 614 | nodesAll: []string{"a", "b"}, 615 | begMap: PartitionMap{}, 616 | endMap: PartitionMap{}, 617 | expectErr: fmt.Errorf("callback implementation 
for " + 618 | "AssignPartitionsFunc is expected"), 619 | }, 620 | { 621 | label: "1 node delete, 2 partition promote", 622 | partitionModel: mrPartitionModel, 623 | maxConcurrentMoves: 4, 624 | numProgress: 1, 625 | nodesAll: []string{"a"}, 626 | begMap: PartitionMap{ 627 | "00": &Partition{ 628 | Name: "00", 629 | NodesByState: map[string][]string{ 630 | "primary": {"a"}, 631 | "replica": {"b"}, 632 | }, 633 | }, 634 | "01": &Partition{ 635 | Name: "01", 636 | NodesByState: map[string][]string{ 637 | "primary": {"a"}, 638 | "replica": {"b"}, 639 | }, 640 | }, 641 | "02": &Partition{ 642 | Name: "02", 643 | NodesByState: map[string][]string{ 644 | "primary": {"b"}, 645 | "replica": {"a"}, 646 | }, 647 | }, 648 | "03": &Partition{ 649 | Name: "03", 650 | NodesByState: map[string][]string{ 651 | "primary": {"b"}, 652 | "replica": {"a"}, 653 | }, 654 | }, 655 | }, 656 | endMap: PartitionMap{ 657 | "00": &Partition{ 658 | Name: "00", 659 | NodesByState: map[string][]string{ 660 | "primary": {"a"}, 661 | "replica": {}, 662 | }, 663 | }, 664 | "01": &Partition{ 665 | Name: "01", 666 | NodesByState: map[string][]string{ 667 | "primary": {"a"}, 668 | "replica": {}, 669 | }, 670 | }, 671 | "02": &Partition{ 672 | Name: "02", 673 | NodesByState: map[string][]string{ 674 | "primary": {"a"}, 675 | "replica": {}, 676 | }, 677 | }, 678 | "03": &Partition{ 679 | Name: "3", 680 | NodesByState: map[string][]string{ 681 | "primary": {"a"}, 682 | "replica": {}, 683 | }, 684 | }, 685 | }, 686 | expConcurrentMovesCount: 2, 687 | expNode: "a", 688 | expMovePartitions: []string{"02", "03"}, 689 | expMoveStates: []string{"primary", "primary"}, 690 | expMoveOps: []string{"promote", "promote"}, 691 | expectErr: nil, 692 | }, 693 | { 694 | label: "1 node delete, 2 partition del", 695 | partitionModel: mrPartitionModel, 696 | maxConcurrentMoves: 2, 697 | numProgress: 2, 698 | nodesAll: []string{"a", "b"}, 699 | begMap: PartitionMap{ 700 | "00": &Partition{ 701 | Name: "00", 702 | 
NodesByState: map[string][]string{ 703 | "primary": {"a"}, 704 | "replica": {"b"}, 705 | }, 706 | }, 707 | "01": &Partition{ 708 | Name: "01", 709 | NodesByState: map[string][]string{ 710 | "primary": {"a"}, 711 | "replica": {"b"}, 712 | }, 713 | }, 714 | "02": &Partition{ 715 | Name: "02", 716 | NodesByState: map[string][]string{ 717 | "primary": {"b"}, 718 | "replica": {"a"}, 719 | }, 720 | }, 721 | "03": &Partition{ 722 | Name: "03", 723 | NodesByState: map[string][]string{ 724 | "primary": {"b"}, 725 | "replica": {"a"}, 726 | }, 727 | }, 728 | }, 729 | endMap: PartitionMap{ 730 | "00": &Partition{ 731 | Name: "00", 732 | NodesByState: map[string][]string{ 733 | "primary": {"a"}, 734 | "replica": {}, 735 | }, 736 | }, 737 | "01": &Partition{ 738 | Name: "01", 739 | NodesByState: map[string][]string{ 740 | "primary": {"a"}, 741 | "replica": {}, 742 | }, 743 | }, 744 | "02": &Partition{ 745 | Name: "02", 746 | NodesByState: map[string][]string{ 747 | "primary": {"a"}, 748 | "replica": {}, 749 | }, 750 | }, 751 | "03": &Partition{ 752 | Name: "03", 753 | NodesByState: map[string][]string{ 754 | "primary": {"a"}, 755 | "replica": {}, 756 | }, 757 | }, 758 | }, 759 | expConcurrentMovesCount: 2, 760 | expNode: "b", 761 | expMovePartitions: []string{"00", "01"}, 762 | expMoveStates: []string{"", ""}, 763 | expMoveOps: []string{"del", "del"}, 764 | expectErr: nil, 765 | }, 766 | { 767 | label: "2 node deletions out of 3 node cluster", 768 | partitionModel: mrPartitionModel, 769 | maxConcurrentMoves: 2, 770 | numProgress: 6, 771 | nodesAll: []string{"a", "b", "c"}, 772 | begMap: PartitionMap{ 773 | "00": &Partition{ 774 | Name: "00", 775 | NodesByState: map[string][]string{ 776 | "primary": {"a"}, 777 | "replica": {"b"}, 778 | }, 779 | }, 780 | "01": &Partition{ 781 | Name: "01", 782 | NodesByState: map[string][]string{ 783 | "primary": {"a"}, 784 | "replica": {"c"}, 785 | }, 786 | }, 787 | "02": &Partition{ 788 | Name: "02", 789 | NodesByState: map[string][]string{ 790 
| "primary": {"b"}, 791 | "replica": {"a"}, 792 | }, 793 | }, 794 | "03": &Partition{ 795 | Name: "03", 796 | NodesByState: map[string][]string{ 797 | "primary": {"b"}, 798 | "replica": {"c"}, 799 | }, 800 | }, 801 | "04": &Partition{ 802 | Name: "04", 803 | NodesByState: map[string][]string{ 804 | "primary": {"c"}, 805 | "replica": {"a"}, 806 | }, 807 | }, 808 | "05": &Partition{ 809 | Name: "05", 810 | NodesByState: map[string][]string{ 811 | "primary": {"c"}, 812 | "replica": {"b"}, 813 | }, 814 | }, 815 | }, 816 | endMap: PartitionMap{ 817 | "00": &Partition{ 818 | Name: "00", 819 | NodesByState: map[string][]string{ 820 | "primary": {"a"}, 821 | "replica": {}, 822 | }, 823 | }, 824 | "01": &Partition{ 825 | Name: "01", 826 | NodesByState: map[string][]string{ 827 | "primary": {"a"}, 828 | "replica": {}, 829 | }, 830 | }, 831 | "02": &Partition{ 832 | Name: "02", 833 | NodesByState: map[string][]string{ 834 | "primary": {"a"}, 835 | "replica": {}, 836 | }, 837 | }, 838 | "03": &Partition{ 839 | Name: "03", 840 | NodesByState: map[string][]string{ 841 | "primary": {"a"}, 842 | "replica": {}, 843 | }, 844 | }, 845 | "04": &Partition{ 846 | Name: "04", 847 | NodesByState: map[string][]string{ 848 | "primary": {"a"}, 849 | "replica": {}, 850 | }, 851 | }, 852 | "05": &Partition{ 853 | Name: "05", 854 | NodesByState: map[string][]string{ 855 | "primary": {"a"}, 856 | "replica": {}, 857 | }, 858 | }, 859 | }, 860 | expConcurrentMovesCount: 2, 861 | skipCallbacks: 1, 862 | expNode: "a", 863 | expMovePartitions: []string{"03", "05"}, 864 | expMoveStates: []string{"primary", "primary"}, 865 | expMoveOps: []string{"add", "add"}, 866 | expectErr: nil, 867 | }, 868 | { 869 | label: "2 node deletions out of 3 node cluster", 870 | partitionModel: mrPartitionModel, 871 | maxConcurrentMoves: 4, 872 | numProgress: 6, 873 | nodesAll: []string{"a", "b", "c"}, 874 | begMap: PartitionMap{ 875 | "00": &Partition{ 876 | Name: "00", 877 | NodesByState: map[string][]string{ 878 | 
"primary": {"a"}, 879 | "replica": {"b"}, 880 | }, 881 | }, 882 | "01": &Partition{ 883 | Name: "01", 884 | NodesByState: map[string][]string{ 885 | "primary": {"a"}, 886 | "replica": {"c"}, 887 | }, 888 | }, 889 | "02": &Partition{ 890 | Name: "02", 891 | NodesByState: map[string][]string{ 892 | "primary": {"b"}, 893 | "replica": {"a"}, 894 | }, 895 | }, 896 | "03": &Partition{ 897 | Name: "03", 898 | NodesByState: map[string][]string{ 899 | "primary": {"b"}, 900 | "replica": {"c"}, 901 | }, 902 | }, 903 | "04": &Partition{ 904 | Name: "04", 905 | NodesByState: map[string][]string{ 906 | "primary": {"c"}, 907 | "replica": {"a"}, 908 | }, 909 | }, 910 | "05": &Partition{ 911 | Name: "05", 912 | NodesByState: map[string][]string{ 913 | "primary": {"c"}, 914 | "replica": {"b"}, 915 | }, 916 | }, 917 | }, 918 | endMap: PartitionMap{ 919 | "00": &Partition{ 920 | Name: "00", 921 | NodesByState: map[string][]string{ 922 | "primary": {"a"}, 923 | "replica": {}, 924 | }, 925 | }, 926 | "01": &Partition{ 927 | Name: "01", 928 | NodesByState: map[string][]string{ 929 | "primary": {"a"}, 930 | "replica": {}, 931 | }, 932 | }, 933 | "02": &Partition{ 934 | Name: "02", 935 | NodesByState: map[string][]string{ 936 | "primary": {"a"}, 937 | "replica": {}, 938 | }, 939 | }, 940 | "03": &Partition{ 941 | Name: "03", 942 | NodesByState: map[string][]string{ 943 | "primary": {"a"}, 944 | "replica": {}, 945 | }, 946 | }, 947 | "04": &Partition{ 948 | Name: "04", 949 | NodesByState: map[string][]string{ 950 | "primary": {"a"}, 951 | "replica": {}, 952 | }, 953 | }, 954 | "05": &Partition{ 955 | Name: "05", 956 | NodesByState: map[string][]string{ 957 | "primary": {"a"}, 958 | "replica": {}, 959 | }, 960 | }, 961 | }, 962 | expConcurrentMovesCount: 4, 963 | expNode: "a", 964 | expMovePartitions: []string{"02", "03", "04", "05"}, 965 | expMoveStates: []string{"primary", "primary", "primary", "primary"}, 966 | expMoveOps: []string{"promote", "promote", "add", "add"}, 967 | expectErr: 
nil, 968 | }, 969 | } 970 | 971 | for testi, test := range tests { 972 | _, _, assignPartitionsFunc := testMkFuncs() 973 | 974 | if !test.skip { 975 | test.assignPartitionsFunc = func(stopCh chan struct{}, 976 | node string, partitions []string, states []string, ops []string) error { 977 | if test.expNode != node { 978 | return nil 979 | } 980 | if test.skipCallbacks > 0 { 981 | test.skipCallbacks-- 982 | return nil 983 | } 984 | 985 | if len(partitions) != test.expConcurrentMovesCount { 986 | t.Errorf("testi: %d, label: %s, concurrent partition moves expected: %d, but got only: %d", 987 | testi, test.label, test.expConcurrentMovesCount, len(partitions)) 988 | } 989 | 990 | sort.Strings(partitions) 991 | if !reflect.DeepEqual(test.expMovePartitions, partitions) { 992 | t.Errorf("testi: %d, label: %s, moving partitions expected: %+v, but got: %+v", 993 | testi, test.label, test.expMovePartitions, partitions) 994 | } 995 | 996 | sort.Strings(states) 997 | if !reflect.DeepEqual(test.expMoveStates, states) { 998 | t.Errorf("testi: %d, label: %s, moving states expected: %+v, but got: %+v", 999 | testi, test.label, test.expMoveStates, states) 1000 | } 1001 | 1002 | if !reflect.DeepEqual(test.expMoveOps, ops) { 1003 | t.Errorf("testi: %d, label: %s, moving ops expected: %+v, but got: %+v", 1004 | testi, test.label, test.expMoveStates, ops) 1005 | } 1006 | 1007 | assignPartitionsFunc(stopCh, node, partitions, states, ops) 1008 | return nil 1009 | } 1010 | } 1011 | 1012 | options.MaxConcurrentPartitionMovesPerNode = test.maxConcurrentMoves 1013 | 1014 | o, err := OrchestrateMoves( 1015 | test.partitionModel, 1016 | options, 1017 | test.nodesAll, 1018 | test.begMap, 1019 | test.endMap, 1020 | test.assignPartitionsFunc, 1021 | LowestWeightPartitionMoveForNode, 1022 | ) 1023 | if test.expectErr == nil && o == nil { 1024 | t.Errorf("testi: %d, label: %s,"+ 1025 | " expected o", 1026 | testi, test.label) 1027 | } 1028 | if err != nil && test.expectErr != nil && err.Error() != 
test.expectErr.Error() { 1029 | t.Errorf("testi: %d, label: %s,"+ 1030 | " expectErr: %v, got: %v", 1031 | testi, test.label, 1032 | test.expectErr, err) 1033 | } 1034 | 1035 | if !test.skip { 1036 | for { 1037 | prog := <-o.ProgressCh() 1038 | 1039 | if prog.TotMoverAssignPartitionOk >= test.numProgress { 1040 | break 1041 | } 1042 | } 1043 | 1044 | o.Stop() 1045 | } 1046 | } 1047 | } 1048 | 1049 | func TestOrchestrateMoves(t *testing.T) { 1050 | tests := []struct { 1051 | skip bool 1052 | label string 1053 | partitionModel PartitionModel 1054 | options OrchestratorOptions 1055 | nodesAll []string 1056 | begMap PartitionMap 1057 | endMap PartitionMap 1058 | expectErr error 1059 | 1060 | // Keyed by partition. 1061 | expectAssignPartitions map[string][]assignPartitionRec 1062 | }{ 1063 | { 1064 | label: "do nothing", 1065 | partitionModel: mrPartitionModel, 1066 | options: options1, 1067 | nodesAll: []string(nil), 1068 | begMap: PartitionMap{}, 1069 | endMap: PartitionMap{}, 1070 | expectErr: nil, 1071 | }, 1072 | { 1073 | label: "1 node, no assignments or changes", 1074 | partitionModel: mrPartitionModel, 1075 | options: options1, 1076 | nodesAll: []string{"a"}, 1077 | begMap: PartitionMap{}, 1078 | endMap: PartitionMap{}, 1079 | expectErr: nil, 1080 | }, 1081 | { 1082 | label: "no nodes, but some partitions", 1083 | partitionModel: mrPartitionModel, 1084 | options: options1, 1085 | nodesAll: []string(nil), 1086 | begMap: PartitionMap{ 1087 | "00": &Partition{ 1088 | Name: "00", 1089 | NodesByState: map[string][]string{}, 1090 | }, 1091 | "01": &Partition{ 1092 | Name: "01", 1093 | NodesByState: map[string][]string{}, 1094 | }, 1095 | }, 1096 | endMap: PartitionMap{ 1097 | "00": &Partition{ 1098 | Name: "00", 1099 | NodesByState: map[string][]string{}, 1100 | }, 1101 | "01": &Partition{ 1102 | Name: "01", 1103 | NodesByState: map[string][]string{}, 1104 | }, 1105 | }, 1106 | expectErr: nil, 1107 | }, 1108 | { 1109 | label: "add node a, 1 partition", 1110 | 
partitionModel: mrPartitionModel, 1111 | options: options1, 1112 | nodesAll: []string{"a"}, 1113 | begMap: PartitionMap{ 1114 | "00": &Partition{ 1115 | Name: "00", 1116 | NodesByState: map[string][]string{}, 1117 | }, 1118 | }, 1119 | endMap: PartitionMap{ 1120 | "00": &Partition{ 1121 | Name: "00", 1122 | NodesByState: map[string][]string{ 1123 | "primary": {"a"}, 1124 | }, 1125 | }, 1126 | }, 1127 | expectAssignPartitions: map[string][]assignPartitionRec{ 1128 | "00": { 1129 | { 1130 | partition: "00", node: "a", state: "primary", 1131 | }, 1132 | }, 1133 | }, 1134 | expectErr: nil, 1135 | }, 1136 | { 1137 | label: "add node a & b, 1 partition", 1138 | partitionModel: mrPartitionModel, 1139 | options: options1, 1140 | nodesAll: []string{"a", "b"}, 1141 | begMap: PartitionMap{ 1142 | "00": &Partition{ 1143 | Name: "00", 1144 | NodesByState: map[string][]string{}, 1145 | }, 1146 | }, 1147 | endMap: PartitionMap{ 1148 | "00": &Partition{ 1149 | Name: "00", 1150 | NodesByState: map[string][]string{ 1151 | "primary": {"a"}, 1152 | "replica": {"b"}, 1153 | }, 1154 | }, 1155 | }, 1156 | expectAssignPartitions: map[string][]assignPartitionRec{ 1157 | "00": { 1158 | { 1159 | partition: "00", node: "a", state: "primary", 1160 | }, 1161 | { 1162 | partition: "00", node: "b", state: "replica", 1163 | }, 1164 | }, 1165 | }, 1166 | expectErr: nil, 1167 | }, 1168 | { 1169 | label: "add node a & b & c, 1 partition", 1170 | partitionModel: mrPartitionModel, 1171 | options: options1, 1172 | nodesAll: []string{"a", "b", "c"}, 1173 | begMap: PartitionMap{ 1174 | "00": &Partition{ 1175 | Name: "00", 1176 | NodesByState: map[string][]string{}, 1177 | }, 1178 | }, 1179 | endMap: PartitionMap{ 1180 | "00": &Partition{ 1181 | Name: "00", 1182 | NodesByState: map[string][]string{ 1183 | "primary": {"a"}, 1184 | "replica": {"b"}, 1185 | }, 1186 | }, 1187 | }, 1188 | expectAssignPartitions: map[string][]assignPartitionRec{ 1189 | "00": { 1190 | { 1191 | partition: "00", node: "a", state: 
"primary", 1192 | }, 1193 | { 1194 | partition: "00", node: "b", state: "replica", 1195 | }, 1196 | }, 1197 | }, 1198 | expectErr: nil, 1199 | }, 1200 | { 1201 | label: "del node a, 1 partition", 1202 | partitionModel: mrPartitionModel, 1203 | options: options1, 1204 | nodesAll: []string{"a"}, 1205 | begMap: PartitionMap{ 1206 | "00": &Partition{ 1207 | Name: "00", 1208 | NodesByState: map[string][]string{ 1209 | "primary": {"a"}, 1210 | }, 1211 | }, 1212 | }, 1213 | endMap: PartitionMap{ 1214 | "00": &Partition{ 1215 | Name: "00", 1216 | NodesByState: map[string][]string{}, 1217 | }, 1218 | }, 1219 | expectAssignPartitions: map[string][]assignPartitionRec{ 1220 | "00": { 1221 | { 1222 | partition: "00", node: "a", state: "", 1223 | }, 1224 | }, 1225 | }, 1226 | expectErr: nil, 1227 | }, 1228 | { 1229 | label: "swap a to b, 1 partition", 1230 | partitionModel: mrPartitionModel, 1231 | options: options1, 1232 | nodesAll: []string{"a", "b"}, 1233 | begMap: PartitionMap{ 1234 | "00": &Partition{ 1235 | Name: "00", 1236 | NodesByState: map[string][]string{ 1237 | "primary": {"a"}, 1238 | }, 1239 | }, 1240 | }, 1241 | endMap: PartitionMap{ 1242 | "00": &Partition{ 1243 | Name: "00", 1244 | NodesByState: map[string][]string{ 1245 | "primary": {"b"}, 1246 | }, 1247 | }, 1248 | }, 1249 | expectAssignPartitions: map[string][]assignPartitionRec{ 1250 | "00": { 1251 | { 1252 | partition: "00", node: "b", state: "primary", 1253 | }, 1254 | { 1255 | partition: "00", node: "a", state: "", 1256 | }, 1257 | }, 1258 | }, 1259 | expectErr: nil, 1260 | }, 1261 | { 1262 | label: "swap a to b, 1 partition, c unchanged", 1263 | partitionModel: mrPartitionModel, 1264 | options: options1, 1265 | nodesAll: []string{"a", "b", "c"}, 1266 | begMap: PartitionMap{ 1267 | "00": &Partition{ 1268 | Name: "00", 1269 | NodesByState: map[string][]string{ 1270 | "primary": {"a"}, 1271 | "replica": {"c"}, 1272 | }, 1273 | }, 1274 | }, 1275 | endMap: PartitionMap{ 1276 | "00": &Partition{ 1277 | Name: 
"00", 1278 | NodesByState: map[string][]string{ 1279 | "primary": {"b"}, 1280 | "replica": {"c"}, 1281 | }, 1282 | }, 1283 | }, 1284 | expectAssignPartitions: map[string][]assignPartitionRec{ 1285 | "00": { 1286 | { 1287 | partition: "00", node: "b", state: "primary", 1288 | }, 1289 | { 1290 | partition: "00", node: "a", state: "", 1291 | }, 1292 | }, 1293 | }, 1294 | expectErr: nil, 1295 | }, 1296 | { 1297 | label: "1 partition from a|b to c|a", 1298 | partitionModel: mrPartitionModel, 1299 | options: options1, 1300 | nodesAll: []string{"a", "b", "c"}, 1301 | begMap: PartitionMap{ 1302 | "00": &Partition{ 1303 | Name: "00", 1304 | NodesByState: map[string][]string{ 1305 | "primary": {"a"}, 1306 | "replica": {"b"}, 1307 | }, 1308 | }, 1309 | }, 1310 | endMap: PartitionMap{ 1311 | "00": &Partition{ 1312 | Name: "00", 1313 | NodesByState: map[string][]string{ 1314 | "primary": {"c"}, 1315 | "replica": {"a"}, 1316 | }, 1317 | }, 1318 | }, 1319 | expectAssignPartitions: map[string][]assignPartitionRec{ 1320 | "00": { 1321 | { 1322 | partition: "00", node: "c", state: "primary", 1323 | }, 1324 | { 1325 | partition: "00", node: "a", state: "replica", 1326 | }, 1327 | { 1328 | partition: "00", node: "b", state: "", 1329 | }, 1330 | }, 1331 | }, 1332 | expectErr: nil, 1333 | }, 1334 | { 1335 | label: "add node a & b, 2 partitions", 1336 | partitionModel: mrPartitionModel, 1337 | options: options1, 1338 | nodesAll: []string{"a", "b"}, 1339 | begMap: PartitionMap{ 1340 | "00": &Partition{ 1341 | Name: "00", 1342 | NodesByState: map[string][]string{}, 1343 | }, 1344 | "01": &Partition{ 1345 | Name: "01", 1346 | NodesByState: map[string][]string{}, 1347 | }, 1348 | }, 1349 | endMap: PartitionMap{ 1350 | "00": &Partition{ 1351 | Name: "00", 1352 | NodesByState: map[string][]string{ 1353 | "primary": {"a"}, 1354 | "replica": {"b"}, 1355 | }, 1356 | }, 1357 | "01": &Partition{ 1358 | Name: "01", 1359 | NodesByState: map[string][]string{ 1360 | "primary": {"b"}, 1361 | "replica": 
{"a"}, 1362 | }, 1363 | }, 1364 | }, 1365 | expectAssignPartitions: map[string][]assignPartitionRec{ 1366 | "00": { 1367 | { 1368 | partition: "00", node: "a", state: "primary", 1369 | }, 1370 | { 1371 | partition: "00", node: "b", state: "replica", 1372 | }, 1373 | }, 1374 | "01": { 1375 | { 1376 | partition: "01", node: "b", state: "primary", 1377 | }, 1378 | { 1379 | partition: "01", node: "a", state: "replica", 1380 | }, 1381 | }, 1382 | }, 1383 | expectErr: nil, 1384 | }, 1385 | { 1386 | label: "swap ab to cd, 2 partitions", 1387 | partitionModel: mrPartitionModel, 1388 | options: options1, 1389 | nodesAll: []string{"a", "b", "c", "d"}, 1390 | begMap: PartitionMap{ 1391 | "00": &Partition{ 1392 | Name: "00", 1393 | NodesByState: map[string][]string{ 1394 | "primary": {"a"}, 1395 | "replica": {"b"}, 1396 | }, 1397 | }, 1398 | "01": &Partition{ 1399 | Name: "01", 1400 | NodesByState: map[string][]string{ 1401 | "primary": {"b"}, 1402 | "replica": {"a"}, 1403 | }, 1404 | }, 1405 | }, 1406 | endMap: PartitionMap{ 1407 | "00": &Partition{ 1408 | Name: "00", 1409 | NodesByState: map[string][]string{ 1410 | "primary": {"c"}, 1411 | "replica": {"d"}, 1412 | }, 1413 | }, 1414 | "01": &Partition{ 1415 | Name: "01", 1416 | NodesByState: map[string][]string{ 1417 | "primary": {"d"}, 1418 | "replica": {"c"}, 1419 | }, 1420 | }, 1421 | }, 1422 | expectAssignPartitions: map[string][]assignPartitionRec{ 1423 | "00": { 1424 | { 1425 | partition: "00", node: "c", state: "primary", 1426 | }, 1427 | { 1428 | partition: "00", node: "a", state: "", 1429 | }, 1430 | { 1431 | partition: "00", node: "d", state: "replica", 1432 | }, 1433 | { 1434 | partition: "00", node: "b", state: "", 1435 | }, 1436 | }, 1437 | "01": { 1438 | { 1439 | partition: "01", node: "d", state: "primary", 1440 | }, 1441 | { 1442 | partition: "01", node: "b", state: "", 1443 | }, 1444 | { 1445 | partition: "01", node: "c", state: "replica", 1446 | }, 1447 | { 1448 | partition: "01", node: "a", state: "", 1449 
| }, 1450 | }, 1451 | }, 1452 | expectErr: nil, 1453 | }, 1454 | { 1455 | // TODO: This test is intended to get coverage on 1456 | // LowestWeightPartitionMoveForNode() on its inner 1457 | // MoveOpWeight if statement, but seems to be 1458 | // intermittent -- perhaps goroutine race? 1459 | label: "concurrent moves on b, 2 partitions", 1460 | partitionModel: mrPartitionModel, 1461 | options: options1, 1462 | nodesAll: []string{"a", "b", "c"}, 1463 | begMap: PartitionMap{ 1464 | "00": &Partition{ 1465 | Name: "00", 1466 | NodesByState: map[string][]string{ 1467 | "primary": {"b"}, 1468 | "replica": {"a"}, 1469 | }, 1470 | }, 1471 | "01": &Partition{ 1472 | Name: "01", 1473 | NodesByState: map[string][]string{ 1474 | "primary": {"b"}, 1475 | "replica": {"a"}, 1476 | }, 1477 | }, 1478 | }, 1479 | endMap: PartitionMap{ 1480 | "00": &Partition{ 1481 | Name: "00", 1482 | NodesByState: map[string][]string{ 1483 | "primary": {"a"}, 1484 | "replica": {"b"}, 1485 | }, 1486 | }, 1487 | "01": &Partition{ 1488 | Name: "01", 1489 | NodesByState: map[string][]string{ 1490 | "primary": {"c"}, 1491 | "replica": {"a"}, 1492 | }, 1493 | }, 1494 | }, 1495 | expectAssignPartitions: map[string][]assignPartitionRec{ 1496 | "00": { 1497 | { 1498 | partition: "00", node: "a", state: "primary", 1499 | }, 1500 | { 1501 | partition: "00", node: "b", state: "replica", 1502 | }, 1503 | }, 1504 | "01": { 1505 | { 1506 | partition: "01", node: "c", state: "primary", 1507 | }, 1508 | { 1509 | partition: "01", node: "b", state: "", 1510 | }, 1511 | }, 1512 | }, 1513 | expectErr: nil, 1514 | }, 1515 | { 1516 | label: "nodes with not much work", 1517 | partitionModel: mrPartitionModel, 1518 | options: options1, 1519 | nodesAll: []string{"a", "b", "c", "d", "e"}, 1520 | begMap: PartitionMap{ 1521 | "00": &Partition{ 1522 | Name: "00", 1523 | NodesByState: map[string][]string{ 1524 | "primary": {"b"}, 1525 | "replica": {"a", "d", "e"}, 1526 | }, 1527 | }, 1528 | "01": &Partition{ 1529 | Name: "01", 
1530 | NodesByState: map[string][]string{ 1531 | "primary": {"b"}, 1532 | "replica": {"a", "d", "e"}, 1533 | }, 1534 | }, 1535 | }, 1536 | endMap: PartitionMap{ 1537 | "00": &Partition{ 1538 | Name: "00", 1539 | NodesByState: map[string][]string{ 1540 | "primary": {"a"}, 1541 | "replica": {"b", "d", "e"}, 1542 | }, 1543 | }, 1544 | "01": &Partition{ 1545 | Name: "01", 1546 | NodesByState: map[string][]string{ 1547 | "primary": {"c"}, 1548 | "replica": {"a", "d", "e"}, 1549 | }, 1550 | }, 1551 | }, 1552 | expectAssignPartitions: map[string][]assignPartitionRec{ 1553 | "00": { 1554 | { 1555 | partition: "00", node: "a", state: "primary", 1556 | }, 1557 | { 1558 | partition: "00", node: "b", state: "replica", 1559 | }, 1560 | }, 1561 | "01": { 1562 | { 1563 | partition: "01", node: "c", state: "primary", 1564 | }, 1565 | { 1566 | partition: "01", node: "b", state: "", 1567 | }, 1568 | }, 1569 | }, 1570 | expectErr: nil, 1571 | }, 1572 | { 1573 | label: "more concurrent moves", 1574 | partitionModel: mrPartitionModel, 1575 | options: options1, 1576 | nodesAll: []string{"a", "b", "c", "d", "e", "f", "g"}, 1577 | begMap: PartitionMap{ 1578 | "00": &Partition{ 1579 | Name: "00", 1580 | NodesByState: map[string][]string{ 1581 | "primary": {"a"}, 1582 | "replica": {"b"}, 1583 | }, 1584 | }, 1585 | "01": &Partition{ 1586 | Name: "01", 1587 | NodesByState: map[string][]string{ 1588 | "primary": {"b"}, 1589 | "replica": {"c"}, 1590 | }, 1591 | }, 1592 | "02": &Partition{ 1593 | Name: "02", 1594 | NodesByState: map[string][]string{ 1595 | "primary": {"c"}, 1596 | "replica": {"d"}, 1597 | }, 1598 | }, 1599 | "03": &Partition{ 1600 | Name: "03", 1601 | NodesByState: map[string][]string{ 1602 | "primary": {"d"}, 1603 | "replica": {"e"}, 1604 | }, 1605 | }, 1606 | "04": &Partition{ 1607 | Name: "04", 1608 | NodesByState: map[string][]string{ 1609 | "primary": {"e"}, 1610 | "replica": {"f"}, 1611 | }, 1612 | }, 1613 | "05": &Partition{ 1614 | Name: "05", 1615 | NodesByState: 
map[string][]string{ 1616 | "primary": {"f"}, 1617 | "replica": {"g"}, 1618 | }, 1619 | }, 1620 | }, 1621 | endMap: PartitionMap{ 1622 | "00": &Partition{ 1623 | Name: "00", 1624 | NodesByState: map[string][]string{ 1625 | "primary": {"b"}, 1626 | "replica": {"c"}, 1627 | }, 1628 | }, 1629 | "01": &Partition{ 1630 | Name: "01", 1631 | NodesByState: map[string][]string{ 1632 | "primary": {"c"}, 1633 | "replica": {"d"}, 1634 | }, 1635 | }, 1636 | "02": &Partition{ 1637 | Name: "02", 1638 | NodesByState: map[string][]string{ 1639 | "primary": {"d"}, 1640 | "replica": {"e"}, 1641 | }, 1642 | }, 1643 | "03": &Partition{ 1644 | Name: "03", 1645 | NodesByState: map[string][]string{ 1646 | "primary": {"e"}, 1647 | "replica": {"f"}, 1648 | }, 1649 | }, 1650 | "04": &Partition{ 1651 | Name: "04", 1652 | NodesByState: map[string][]string{ 1653 | "primary": {"f"}, 1654 | "replica": {"g"}, 1655 | }, 1656 | }, 1657 | "05": &Partition{ 1658 | Name: "05", 1659 | NodesByState: map[string][]string{ 1660 | "primary": {"g"}, 1661 | "replica": {"a"}, 1662 | }, 1663 | }, 1664 | }, 1665 | expectAssignPartitions: map[string][]assignPartitionRec{ 1666 | "00": { 1667 | { 1668 | partition: "00", node: "b", state: "primary", 1669 | }, 1670 | { 1671 | partition: "00", node: "a", state: "", 1672 | }, 1673 | { 1674 | partition: "00", node: "c", state: "replica", 1675 | }, 1676 | }, 1677 | "01": { 1678 | { 1679 | partition: "01", node: "c", state: "primary", 1680 | }, 1681 | { 1682 | partition: "01", node: "b", state: "", 1683 | }, 1684 | { 1685 | partition: "01", node: "d", state: "replica", 1686 | }, 1687 | }, 1688 | "02": { 1689 | { 1690 | partition: "02", node: "d", state: "primary", 1691 | }, 1692 | { 1693 | partition: "02", node: "c", state: "", 1694 | }, 1695 | { 1696 | partition: "02", node: "e", state: "replica", 1697 | }, 1698 | }, 1699 | "03": { 1700 | { 1701 | partition: "03", node: "e", state: "primary", 1702 | }, 1703 | { 1704 | partition: "03", node: "d", state: "", 1705 | }, 1706 
| { 1707 | partition: "03", node: "f", state: "replica", 1708 | }, 1709 | }, 1710 | "04": { 1711 | { 1712 | partition: "04", node: "f", state: "primary", 1713 | }, 1714 | { 1715 | partition: "04", node: "e", state: "", 1716 | }, 1717 | { 1718 | partition: "04", node: "g", state: "replica", 1719 | }, 1720 | }, 1721 | "05": { 1722 | { 1723 | partition: "05", node: "g", state: "primary", 1724 | }, 1725 | { 1726 | partition: "05", node: "f", state: "", 1727 | }, 1728 | { 1729 | partition: "05", node: "a", state: "replica", 1730 | }, 1731 | }, 1732 | }, 1733 | expectErr: nil, 1734 | }, 1735 | } 1736 | 1737 | for testi, test := range tests { 1738 | if test.skip { 1739 | continue 1740 | } 1741 | 1742 | _, assignPartitionRecs, assignPartitionFunc := testMkFuncs() 1743 | 1744 | o, err := OrchestrateMoves( 1745 | test.partitionModel, 1746 | test.options, 1747 | test.nodesAll, 1748 | test.begMap, 1749 | test.endMap, 1750 | assignPartitionFunc, 1751 | LowestWeightPartitionMoveForNode, 1752 | ) 1753 | if o == nil { 1754 | t.Errorf("testi: %d, label: %s,"+ 1755 | " expected o", 1756 | testi, test.label) 1757 | } 1758 | if err != test.expectErr { 1759 | t.Errorf("testi: %d, label: %s,"+ 1760 | " expectErr: %v, got: %v", 1761 | testi, test.label, 1762 | test.expectErr, err) 1763 | } 1764 | 1765 | debug := false 1766 | 1767 | if debug { 1768 | o.m.Lock() 1769 | fmt.Printf("test: %q\n START progress: %#v\n", 1770 | test.label, o.progress) 1771 | o.m.Unlock() 1772 | } 1773 | 1774 | for progress := range o.ProgressCh() { 1775 | if debug { 1776 | fmt.Printf("test: %q\n progress: %#v\n", 1777 | test.label, progress) 1778 | } 1779 | } 1780 | 1781 | o.Stop() 1782 | 1783 | if len(assignPartitionRecs) != len(test.expectAssignPartitions) { 1784 | t.Errorf("testi: %d, label: %s,"+ 1785 | " len(assignPartitionRecs == %d)"+ 1786 | " != len(test.expectAssignPartitions == %d),"+ 1787 | " assignPartitionRecs: %#v,"+ 1788 | " test.expectAssignPartitions: %#v", 1789 | testi, test.label, 1790 | 
len(assignPartitionRecs), 1791 | len(test.expectAssignPartitions), 1792 | assignPartitionRecs, 1793 | test.expectAssignPartitions) 1794 | } 1795 | 1796 | for partition, eapm := range test.expectAssignPartitions { 1797 | for eapi, eap := range eapm { 1798 | apr := assignPartitionRecs[partition][eapi] 1799 | if eap.partition != apr.partition || 1800 | eap.node != apr.node || 1801 | eap.state != apr.state { 1802 | t.Errorf("testi: %d, label: %s,"+ 1803 | " mismatched assignment,"+ 1804 | " eapi: %d, eap: %#v, apr: %#v", 1805 | testi, test.label, 1806 | eapi, eap, apr) 1807 | } 1808 | } 1809 | } 1810 | } 1811 | } 1812 | -------------------------------------------------------------------------------- /plan.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014-Present Couchbase, Inc. 2 | // 3 | // Use of this software is governed by the Business Source License included 4 | // in the file licenses/BSL-Couchbase.txt. As of the Change Date specified 5 | // in that file, in accordance with the Business Source License, use of this 6 | // software will be governed by the Apache License, Version 2.0, included in 7 | // the file licenses/APL2.txt. 8 | 9 | package blance 10 | 11 | import ( 12 | "fmt" 13 | "reflect" 14 | "sort" 15 | "strconv" 16 | ) 17 | 18 | // MaxIterationsPerPlan controls how many iterations blance will 19 | // attempt to try to converge to a stabilized plan. Usually, blance 20 | // only needs only 1 or 2 iterations. 21 | var MaxIterationsPerPlan = 10 22 | 23 | func planNextMapEx( 24 | prevMap PartitionMap, 25 | partitionsToAssign PartitionMap, 26 | nodesAll []string, // Union of nodesBefore, nodesToAdd, nodesToRemove. 27 | nodesToRemove []string, 28 | nodesToAdd []string, 29 | model PartitionModel, 30 | opts PlanNextMapOptions, 31 | ) (nextMap PartitionMap, warnings map[string][]string) { 32 | for i := 0; i < MaxIterationsPerPlan; i++ { // Loop for convergence. 
33 | nextMap, warnings = planNextMapInnerEx(prevMap, partitionsToAssign, 34 | nodesAll, nodesToRemove, nodesToAdd, model, opts) 35 | // Only check if the partitions to be assigned match. 36 | notMatch := false 37 | for _, partition := range nextMap { 38 | if !reflect.DeepEqual(partition, prevMap[partition.Name]) { 39 | notMatch = true 40 | break 41 | } 42 | } 43 | if !notMatch { 44 | break 45 | } 46 | // If there's nothing to be changed since it's the best fit heuristically, 47 | // then abort. 48 | // Need to replace only the _new_ partitions - not truncate them all. 49 | for _, partition := range nextMap { 50 | prevMap[partition.Name] = partition 51 | partitionsToAssign[partition.Name] = partition 52 | } 53 | nodesAll = StringsRemoveStrings(nodesAll, nodesToRemove) 54 | nodesToRemove = []string{} 55 | nodesToAdd = []string{} 56 | } 57 | return nextMap, warnings 58 | } 59 | 60 | func planNextMapInnerEx( 61 | prevMap PartitionMap, 62 | partitionsToAssign PartitionMap, 63 | nodesAll []string, // Union of nodesBefore, nodesToAdd, nodesToRemove. 64 | nodesToRemove []string, 65 | nodesToAdd []string, 66 | model PartitionModel, 67 | opts PlanNextMapOptions, 68 | ) (PartitionMap, map[string][]string) { 69 | // map of partition name to warnings for that partition 70 | partitionWarnings := make(map[string][]string, len(prevMap)) 71 | 72 | nodePositions := map[string]int{} 73 | for i, node := range nodesAll { 74 | nodePositions[node] = i 75 | } 76 | 77 | nodesNext := StringsRemoveStrings(nodesAll, nodesToRemove) 78 | 79 | hierarchyChildren := mapParentsToMapChildren(opts.NodeHierarchy) 80 | 81 | // Start by filling out nextPartitions as a deep clone of 82 | // partitionsToAssign.Partitions, but filter out the to-be-removed nodes. 
83 | nextPartitions := partitionsToAssign.toArrayCopy() 84 | for _, partition := range nextPartitions { 85 | partition.NodesByState = 86 | removeNodesFromNodesByState(partition.NodesByState, 87 | nodesToRemove, nil) 88 | } 89 | sort.Sort(&partitionSorter{a: nextPartitions}) 90 | 91 | // Key is stateName, value is {node: count}. 92 | var stateNodeCounts map[string]map[string]int 93 | 94 | stateNodeCounts = countStateNodes(prevMap, opts.PartitionWeights) 95 | 96 | // Helper function that returns an ordered array of candidates 97 | // nodes to assign to a partition, ordered by best heuristic fit. 98 | findBestNodes := func( 99 | partition *Partition, 100 | stateName string, 101 | constraints int, 102 | nodeToNodeCounts map[string]map[string]int, 103 | ) []string { 104 | stickiness := 1.5 105 | if opts.PartitionWeights != nil { 106 | w, exists := opts.PartitionWeights[partition.Name] 107 | if exists { 108 | stickiness = float64(w) 109 | } else if opts.StateStickiness != nil { 110 | s, exists := opts.StateStickiness[stateName] 111 | if exists { 112 | stickiness = float64(s) 113 | } 114 | } 115 | } 116 | 117 | // Keyed by node, value is sum of partitions on that node. 118 | nodePartitionCounts := make(map[string]int) 119 | for _, nodeCounts := range stateNodeCounts { 120 | for node, nodeCount := range nodeCounts { 121 | nodePartitionCounts[node] = 122 | nodePartitionCounts[node] + nodeCount 123 | } 124 | } 125 | 126 | topPriorityStateName := "" 127 | for stateName, state := range model { 128 | if topPriorityStateName == "" || 129 | state.Priority < model[topPriorityStateName].Priority { 130 | topPriorityStateName = stateName 131 | } 132 | } 133 | 134 | topPriorityNode := "" 135 | topPriorityStateNodes := partition.NodesByState[topPriorityStateName] 136 | if len(topPriorityStateNodes) > 0 { 137 | topPriorityNode = topPriorityStateNodes[0] 138 | } 139 | 140 | statePriority := model[stateName].Priority 141 | 142 | candidateNodes := append([]string(nil), nodesNext...) 
143 | 144 | // Filter out nodes of a higher priority state; e.g., if we're 145 | // assigning replicas, leave the primaries untouched. 146 | excludeHigherPriorityNodes := func(remainingNodes []string) []string { 147 | for stateName, stateNodes := range partition.NodesByState { 148 | if model[stateName].Priority < statePriority { 149 | remainingNodes = 150 | StringsRemoveStrings(remainingNodes, stateNodes) 151 | } 152 | } 153 | return remainingNodes 154 | } 155 | 156 | candidateNodes = excludeHigherPriorityNodes(candidateNodes) 157 | 158 | config := &NodeSorterConfig{ 159 | StateName: stateName, 160 | Partition: partition, 161 | NumPartitions: len(prevMap), 162 | TopPriorityNode: topPriorityNode, 163 | StateNodeCounts: stateNodeCounts, 164 | NodeToNodeCounts: nodeToNodeCounts, 165 | NodePartitionCounts: nodePartitionCounts, 166 | NodePositions: nodePositions, 167 | NodeWeights: opts.NodeWeights, 168 | Stickiness: stickiness, 169 | Nodes: candidateNodes, 170 | } 171 | nodeSorter := CustomNodeSorter(config) 172 | sort.Sort(nodeSorter) 173 | 174 | if opts.HierarchyRules != nil { 175 | hierarchyNodes := []string{} 176 | 177 | for _, hierarchyRule := range opts.HierarchyRules[stateName] { 178 | h := topPriorityNode 179 | if h == "" && len(hierarchyNodes) > 0 { 180 | h = hierarchyNodes[0] 181 | } 182 | 183 | // Pick the nodes for the partition until the given constaint is met. 184 | // {h} + {hierarchyNodes} contains the list of the nodes assigned for the 185 | // partition so far, so that the hierarchial inclusion or exclusion of 186 | // future node selections can be cognizant of the previous node assignments. 
187 | for i := 0; i < constraints; i++ { 188 | hierarchyCandidates := includeExcludeNodesIntersect( 189 | append([]string{h}, hierarchyNodes...), 190 | hierarchyRule.IncludeLevel, 191 | hierarchyRule.ExcludeLevel, 192 | opts.NodeHierarchy, hierarchyChildren) 193 | hierarchyCandidates = 194 | StringsIntersectStrings(hierarchyCandidates, nodesNext) 195 | hierarchyCandidates = 196 | excludeHigherPriorityNodes(hierarchyCandidates) 197 | 198 | config := &NodeSorterConfig{ 199 | StateName: stateName, 200 | Partition: partition, 201 | NumPartitions: len(prevMap), 202 | TopPriorityNode: topPriorityNode, 203 | StateNodeCounts: stateNodeCounts, 204 | NodeToNodeCounts: nodeToNodeCounts, 205 | NodePartitionCounts: nodePartitionCounts, 206 | NodePositions: nodePositions, 207 | NodeWeights: opts.NodeWeights, 208 | Stickiness: stickiness, 209 | Nodes: hierarchyCandidates, 210 | } 211 | nodeSorter := CustomNodeSorter(config) 212 | sort.Sort(nodeSorter) 213 | 214 | if len(hierarchyCandidates) > 0 { 215 | hierarchyNodes = append(hierarchyNodes, 216 | hierarchyCandidates[0]) 217 | } else if len(candidateNodes) > 0 { 218 | hierarchyNodes = append(hierarchyNodes, 219 | candidateNodes[0]) 220 | } 221 | } 222 | } 223 | 224 | candidateNodes = append(hierarchyNodes, candidateNodes...) 225 | candidateNodes = stringsDeduplicate(candidateNodes) 226 | } 227 | 228 | if len(candidateNodes) >= constraints { 229 | candidateNodes = candidateNodes[0:constraints] 230 | } else { 231 | partitionWarnings[partition.Name] = append(partitionWarnings[partition.Name], 232 | fmt.Sprintf("could not meet constraints: %d,"+ 233 | " stateName: %s, partitionName: %s", 234 | constraints, stateName, partition.Name)) 235 | } 236 | 237 | // Keep nodeToNodeCounts updated. 
238 | for _, candidateNode := range candidateNodes { 239 | m, exists := nodeToNodeCounts[topPriorityNode] 240 | if !exists { 241 | m = make(map[string]int) 242 | nodeToNodeCounts[topPriorityNode] = m 243 | } 244 | m[candidateNode] = m[candidateNode] + 1 245 | } 246 | 247 | return candidateNodes 248 | } 249 | 250 | // Helper function that given a PartitionModel state name and its 251 | // constraints, for every partition, assign nodes by mutating 252 | // nextPartitions. 253 | assignStateToPartitions := func(stateName string, constraints int) { 254 | // Sort the partitions to help reach a better assignment. 255 | p := &partitionSorter{ 256 | stateName: stateName, 257 | prevMap: prevMap, 258 | nodesToRemove: nodesToRemove, 259 | nodesToAdd: nodesToAdd, 260 | partitionWeights: opts.PartitionWeights, 261 | a: append([]*Partition(nil), nextPartitions...), 262 | } 263 | sort.Sort(p) 264 | 265 | // Key is higherPriorityNode, value is {lowerPriorityNode: count}. 266 | nodeToNodeCounts := make(map[string]map[string]int) 267 | 268 | for _, partition := range p.a { 269 | partitionWeight := 1 270 | if opts.PartitionWeights != nil { 271 | w, exists := opts.PartitionWeights[partition.Name] 272 | if exists { 273 | partitionWeight = w 274 | } 275 | } 276 | 277 | incStateNodeCounts := func(stateName string, nodes []string) { 278 | adjustStateNodeCounts(stateNodeCounts, stateName, nodes, 279 | partitionWeight) 280 | } 281 | decStateNodeCounts := func(stateName string, nodes []string) { 282 | adjustStateNodeCounts(stateNodeCounts, stateName, nodes, 283 | -partitionWeight) 284 | } 285 | 286 | nodesToAssign := 287 | findBestNodes(partition, 288 | stateName, constraints, nodeToNodeCounts) 289 | 290 | partition.NodesByState = 291 | removeNodesFromNodesByState(partition.NodesByState, 292 | partition.NodesByState[stateName], 293 | decStateNodeCounts) 294 | partition.NodesByState = 295 | removeNodesFromNodesByState(partition.NodesByState, 296 | nodesToAssign, 297 | decStateNodeCounts) 298 | 
299 | partition.NodesByState[stateName] = nodesToAssign 300 | 301 | incStateNodeCounts(stateName, nodesToAssign) 302 | } 303 | } 304 | 305 | // Run through the sorted partition states (primary, replica, etc) 306 | // that have constraints and invoke assignStateToPartitions(). 307 | for _, stateName := range sortStateNames(model) { 308 | constraints := 0 309 | 310 | modelState, exists := model[stateName] 311 | if exists && modelState != nil { 312 | constraints = modelState.Constraints 313 | } 314 | if opts.ModelStateConstraints != nil { 315 | modelStateConstraints, exists := opts.ModelStateConstraints[stateName] 316 | if exists { 317 | constraints = modelStateConstraints 318 | } 319 | } 320 | 321 | if constraints > 0 { 322 | assignStateToPartitions(stateName, constraints) 323 | } 324 | } 325 | 326 | rv := PartitionMap{} 327 | for _, partition := range nextPartitions { 328 | rv[partition.Name] = partition 329 | } 330 | return rv, partitionWarnings 331 | } 332 | 333 | // Makes a deep copy of the PartitionMap as an array. 334 | func (m PartitionMap) toArrayCopy() []*Partition { 335 | rv := make([]*Partition, 0, len(m)) 336 | for _, partition := range m { 337 | rv = append(rv, &Partition{ 338 | Name: partition.Name, 339 | NodesByState: copyNodesByState(partition.NodesByState), 340 | }) 341 | } 342 | return rv 343 | } 344 | 345 | func copyNodesByState(nbs map[string][]string) map[string][]string { 346 | rv := make(map[string][]string) 347 | for stateName, nodes := range nbs { 348 | rv[stateName] = append([]string(nil), nodes...) 
349 | } 350 | return rv 351 | } 352 | 353 | func adjustStateNodeCounts(stateNodeCounts map[string]map[string]int, 354 | stateName string, nodes []string, amt int) { 355 | for _, node := range nodes { 356 | s, exists := stateNodeCounts[stateName] 357 | if !exists || s == nil { 358 | s = make(map[string]int) 359 | stateNodeCounts[stateName] = s 360 | } 361 | s[node] = s[node] + amt 362 | } 363 | } 364 | 365 | // Example, with input partitionMap of... 366 | // 367 | // { "0": { NodesByState: {"primary": ["a"], "replica": ["b", "c"]} }, 368 | // "1": { NodesByState: {"primary": ["b"], "replica": ["c"]} } } 369 | // 370 | // then return value will be... 371 | // 372 | // { "primary": { "a": 1, "b": 1 }, 373 | // "replica": { "b": 1, "c": 2 } } 374 | func countStateNodes( 375 | partitionMap PartitionMap, 376 | partitionWeights map[string]int, 377 | ) map[string]map[string]int { 378 | rv := make(map[string]map[string]int) 379 | for partitionName, partition := range partitionMap { 380 | for stateName, nodes := range partition.NodesByState { 381 | s := rv[stateName] 382 | if s == nil { 383 | s = make(map[string]int) 384 | rv[stateName] = s 385 | } 386 | for _, node := range nodes { 387 | partitionWeight := 1 388 | if partitionWeights != nil { 389 | w, exists := partitionWeights[partitionName] 390 | if exists { 391 | partitionWeight = w 392 | } 393 | } 394 | s[node] = s[node] + partitionWeight 395 | } 396 | } 397 | } 398 | return rv 399 | } 400 | 401 | // -------------------------------------------------------- 402 | 403 | // Returns a copy of nodesByState but with nodes removed. Example, 404 | // when removeNodes == ["a"] and nodesByState == {"primary": ["a"], 405 | // "replica": ["b"]}, then result will be {"primary": [], "replica": 406 | // ["b"]}. Optional callback is invoked with the nodes that will 407 | // actually be removed. 
408 | func removeNodesFromNodesByState( 409 | nodesByState map[string][]string, 410 | removeNodes []string, 411 | cb func(stateName string, nodesToBeRemoved []string), 412 | ) map[string][]string { 413 | rv := make(map[string][]string) 414 | for stateName, nodes := range nodesByState { 415 | if cb != nil { 416 | cb(stateName, StringsIntersectStrings(nodes, removeNodes)) 417 | } 418 | rv[stateName] = StringsRemoveStrings(nodes, removeNodes) 419 | } 420 | return rv 421 | } 422 | 423 | // Given a nodesByState, like {"primary": ["a"], "replica": ["b", "c"]}, 424 | // this function might return something like ["b", "c", "a"]. 425 | func flattenNodesByState(nodesByState map[string][]string) []string { 426 | rv := make([]string, 0) 427 | for _, nodes := range nodesByState { 428 | rv = append(rv, nodes...) 429 | } 430 | return rv 431 | } 432 | 433 | // -------------------------------------------------------- 434 | 435 | // Returns state names ordered by model.States[stateName].Priority 436 | // ASC, stateName ASC. 437 | func sortStateNames(model PartitionModel) []string { 438 | pms := &stateNameSorter{ 439 | m: model, 440 | s: make([]string, 0, len(model)), 441 | } 442 | for stateName := range model { 443 | pms.s = append(pms.s, stateName) 444 | } 445 | sort.Sort(pms) 446 | return pms.s 447 | } 448 | 449 | // Does ORDER BY m.States[stateName].Priority ASC, stateName ASC. 
450 | type stateNameSorter struct { 451 | m PartitionModel 452 | s []string // This array is mutated during a sort.Sort() 453 | } 454 | 455 | func (pms *stateNameSorter) Len() int { 456 | return len(pms.s) 457 | } 458 | 459 | func (pms *stateNameSorter) Less(i, j int) bool { 460 | iname, jname := pms.s[i], pms.s[j] 461 | 462 | if pms.m != nil && 463 | pms.m[iname] != nil && 464 | pms.m[jname] != nil && 465 | pms.m[iname].Priority < pms.m[jname].Priority { 466 | return true 467 | } 468 | 469 | return iname < jname 470 | } 471 | 472 | func (pms *stateNameSorter) Swap(i, j int) { 473 | pms.s[i], pms.s[j] = pms.s[j], pms.s[i] 474 | } 475 | 476 | // -------------------------------------------------------- 477 | 478 | // Does ORDER BY partitions-on-nodes-to-be-removed, then by 479 | // partitions-who-haven't-been-assigned-anywhere-yet, then by 480 | // partition-weight, then by partition-name. 481 | type partitionSorter struct { 482 | stateName string // When "", just sort by partition name. 483 | prevMap PartitionMap 484 | nodesToRemove []string 485 | nodesToAdd []string 486 | partitionWeights map[string]int // Keyed by partition name. 487 | 488 | a []*Partition // This array is mutated during sort.Sort(). 
489 | } 490 | 491 | func (r *partitionSorter) Len() int { 492 | return len(r.a) 493 | } 494 | 495 | func (r *partitionSorter) Less(i, j int) bool { 496 | ei := r.Score(i) 497 | ej := r.Score(j) 498 | for x := 0; x < len(ei) && x < len(ej); x++ { 499 | if ei[x] < ej[x] { 500 | return true 501 | } 502 | if ei[x] > ej[x] { 503 | return false 504 | } 505 | } 506 | if len(ei) < len(ej) { 507 | return true 508 | } 509 | if len(ei) > len(ej) { 510 | return false 511 | } 512 | return r.a[i].Name < r.a[j].Name 513 | } 514 | 515 | func (r *partitionSorter) Swap(i, j int) { 516 | r.a[i], r.a[j] = r.a[j], r.a[i] 517 | } 518 | 519 | func (r *partitionSorter) Score(i int) []string { 520 | partitionName := r.a[i].Name 521 | partitionNameStr := partitionName 522 | 523 | // If the partitionName looks like a positive integer, then 524 | // zero-pad it for sortability. 525 | partitionN, err := strconv.Atoi(partitionName) 526 | if err == nil && partitionN >= 0 { 527 | partitionNameStr = fmt.Sprintf("%10d", partitionN) 528 | } 529 | 530 | // Calculate partition weight, and zero-pad it for sortability, 531 | // where the nine 9's magic number is to to allow heavier 532 | // partitions to come first. 533 | partitionWeight := 1 534 | if r.partitionWeights != nil { 535 | if w, exists := r.partitionWeights[partitionName]; exists { 536 | partitionWeight = w 537 | } 538 | } 539 | partitionWeightStr := fmt.Sprintf("%10d", 999999999-partitionWeight) 540 | 541 | // First, favor partitions on nodes that are to-be-removed. 542 | if r.prevMap != nil && 543 | r.nodesToRemove != nil && len(r.nodesToRemove) > 0 { 544 | lastPartition := r.prevMap[partitionName] 545 | lpnbs := lastPartition.NodesByState[r.stateName] 546 | if lpnbs != nil && 547 | len(StringsIntersectStrings(lpnbs, r.nodesToRemove)) > 0 { 548 | return []string{"0", partitionWeightStr, partitionNameStr} 549 | } 550 | } 551 | 552 | // Then, favor partitions who haven't yet been assigned to any 553 | // newly added nodes yet for any state. 
554 | if r.nodesToAdd != nil { 555 | fnbs := flattenNodesByState(r.a[i].NodesByState) 556 | if len(StringsIntersectStrings(fnbs, r.nodesToAdd)) <= 0 { 557 | return []string{"1", partitionWeightStr, partitionNameStr} 558 | } 559 | } 560 | 561 | return []string{"2", partitionWeightStr, partitionNameStr} 562 | } 563 | 564 | // -------------------------------------------------------- 565 | 566 | type NodeSorterConfig struct { 567 | StateName string 568 | Partition *Partition 569 | NumPartitions int 570 | TopPriorityNode string 571 | StateNodeCounts map[string]map[string]int 572 | NodeToNodeCounts map[string]map[string]int 573 | NodePartitionCounts map[string]int 574 | NodePositions map[string]int 575 | NodeWeights map[string]int 576 | Stickiness float64 577 | Nodes []string 578 | } 579 | 580 | var CustomNodeSorter = defaultNodeSorter 581 | 582 | func defaultNodeSorter(config *NodeSorterConfig) sort.Interface { 583 | return &nodeSorter{ 584 | stateName: config.StateName, 585 | partition: config.Partition, 586 | numPartitions: config.NumPartitions, 587 | topPriorityNode: config.TopPriorityNode, 588 | stateNodeCounts: config.StateNodeCounts, 589 | nodeToNodeCounts: config.NodeToNodeCounts, 590 | nodePartitionCounts: config.NodePartitionCounts, 591 | nodePositions: config.NodePositions, 592 | nodeWeights: config.NodeWeights, 593 | stickiness: config.Stickiness, 594 | a: config.Nodes, 595 | } 596 | } 597 | 598 | type nodeSorter struct { 599 | stateName string 600 | partition *Partition 601 | numPartitions int 602 | topPriorityNode string 603 | stateNodeCounts map[string]map[string]int 604 | nodeToNodeCounts map[string]map[string]int 605 | nodePartitionCounts map[string]int 606 | nodePositions map[string]int 607 | nodeWeights map[string]int 608 | stickiness float64 609 | 610 | a []string // Entries are node names. 
611 | } 612 | 613 | func (ns *nodeSorter) Len() int { 614 | return len(ns.a) 615 | } 616 | 617 | func (ns *nodeSorter) Less(i, j int) bool { 618 | si := ns.Score(i) 619 | sj := ns.Score(j) 620 | if si < sj { 621 | return true 622 | } 623 | if si > sj { 624 | return false 625 | } 626 | 627 | return ns.nodePositions[ns.a[i]] < ns.nodePositions[ns.a[j]] 628 | } 629 | 630 | func (ns *nodeSorter) Swap(i, j int) { 631 | ns.a[i], ns.a[j] = ns.a[j], ns.a[i] 632 | } 633 | 634 | func (ns *nodeSorter) Score(i int) float64 { 635 | node := ns.a[i] 636 | 637 | lowerPriorityBalanceFactor := 0.0 638 | if ns.nodeToNodeCounts != nil && ns.numPartitions > 0 { 639 | m, exists := ns.nodeToNodeCounts[ns.topPriorityNode] 640 | if exists { 641 | lowerPriorityBalanceFactor = 642 | float64(m[node]) / float64(ns.numPartitions) 643 | } 644 | } 645 | 646 | filledFactor := 0.0 647 | if ns.nodePartitionCounts != nil && ns.numPartitions > 0 { 648 | c, exists := ns.nodePartitionCounts[node] 649 | if exists { 650 | filledFactor = (0.001 * float64(c)) / float64(ns.numPartitions) 651 | } 652 | } 653 | 654 | currentFactor := 0.0 655 | if ns.partition != nil { 656 | for _, stateNode := range ns.partition.NodesByState[ns.stateName] { 657 | if stateNode == node { 658 | // Minimise movement. 
659 | currentFactor = ns.stickiness 660 | } 661 | } 662 | } 663 | 664 | r := 0.0 665 | if ns.stateNodeCounts != nil { 666 | nodeCounts, exists := ns.stateNodeCounts[ns.stateName] 667 | if exists && nodeCounts != nil { 668 | r = float64(nodeCounts[node]) 669 | } 670 | } 671 | 672 | r = r + lowerPriorityBalanceFactor 673 | r = r + filledFactor 674 | 675 | if ns.nodeWeights != nil { 676 | w, exists := ns.nodeWeights[node] 677 | if exists { 678 | if w > 0 { 679 | r = r / float64(w) 680 | } else if w < 0 && NodeScoreBooster != nil { 681 | r += NodeScoreBooster(w, currentFactor) 682 | } 683 | } 684 | } 685 | 686 | r = r - currentFactor 687 | 688 | return r 689 | } 690 | 691 | // NodeScoreBooster lets the clients override their optional 692 | // score booster callback implementations. 693 | var NodeScoreBooster CustomNodeScoreBooster 694 | 695 | // CustomNodeScoreBooster is an optional callback that helps the clients to 696 | // override the node weights and thereby control the partition placements. 697 | type CustomNodeScoreBooster func(weight int, stickiness float64) float64 698 | 699 | // -------------------------------------------------------- 700 | 701 | // The mapParents is keyed by node, value is parent node. Returns a 702 | // map keyed by node, value is array of child nodes. 703 | func mapParentsToMapChildren( 704 | mapParents map[string]string) map[string][]string { 705 | nodes := make([]string, 0) // Sort for stability. 706 | for node := range mapParents { 707 | nodes = append(nodes, node) 708 | } 709 | sort.Strings(nodes) 710 | 711 | rv := make(map[string][]string) 712 | for _, child := range nodes { 713 | parent := mapParents[child] 714 | rv[parent] = append(rv[parent], child) 715 | } 716 | return rv 717 | } 718 | 719 | // The includeLevel is tree ancestor inclusion level, and excludeLevel 720 | // is tree ancestor exclusion level. 
Example: includeLevel of 2 and 721 | // excludeLevel of 1 means include nodes with the same grandparent 722 | // (level 2), but exclude nodes with the same parent (level 1). 723 | func includeExcludeNodes(node string, 724 | includeLevel int, 725 | excludeLevel int, 726 | mapParents map[string]string, 727 | mapChildren map[string][]string) []string { 728 | incNodes := 729 | findLeaves(findAncestor(node, mapParents, includeLevel), mapChildren) 730 | excNodes := 731 | findLeaves(findAncestor(node, mapParents, excludeLevel), mapChildren) 732 | 733 | return StringsRemoveStrings(incNodes, excNodes) 734 | } 735 | 736 | // includeExcludeNodesIntersect gives back the set of filtered nodes 737 | // according to the given inclusion and exclusion parameter values. 738 | func includeExcludeNodesIntersect(nodes []string, 739 | includeLevel int, 740 | excludeLevel int, 741 | mapParents map[string]string, 742 | mapChildren map[string][]string) (rv []string) { 743 | for _, node := range nodes { 744 | res := includeExcludeNodes(node, includeLevel, excludeLevel, 745 | mapParents, mapChildren) 746 | if len(rv) == 0 { 747 | rv = res 748 | continue 749 | } 750 | rv = StringsIntersectStrings(rv, res) 751 | } 752 | return 753 | } 754 | 755 | func findAncestor(node string, 756 | mapParents map[string]string, level int) string { 757 | for level > 0 { 758 | node = mapParents[node] 759 | level-- 760 | } 761 | return node 762 | } 763 | 764 | func findLeaves(node string, mapChildren map[string][]string) []string { 765 | children := mapChildren[node] 766 | if len(children) <= 0 { 767 | return []string{node} // Node is a leaf. 768 | } 769 | rv := make([]string, 0) 770 | for _, c := range children { 771 | rv = append(rv, findLeaves(c, mapChildren)...) 772 | } 773 | return rv 774 | } 775 | --------------------------------------------------------------------------------