├── AUTHORS ├── go.mod ├── testdata └── templates │ └── config.tmpl ├── version.go ├── .gitignore ├── Makefile ├── enable.go ├── ChangeLog.md ├── .github └── workflows │ ├── go.yml │ ├── codecov.yml │ └── codeql-analysis.yml ├── make_version.sh ├── LICENSE ├── globals.go ├── TODO.md ├── health_test.go ├── status_test.go ├── cslb_test.go ├── README.md ├── dial.go ├── http_test.go ├── dial_test.go ├── cslb.go ├── health.go ├── status.go ├── srv_test.go ├── doc.go └── srv.go /AUTHORS: -------------------------------------------------------------------------------- 1 | Mark Delany 2 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/markdingo/cslb 2 | 3 | go 1.21 4 | -------------------------------------------------------------------------------- /testdata/templates/config.tmpl: -------------------------------------------------------------------------------- 1 | {{define "config"}} 2 |

CSLB Global State - which is completely empty but nonetheless functional

3 | {{end}} 4 | -------------------------------------------------------------------------------- /version.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | const ( 4 | // Version is auto-generated from ChangeLog.md 5 | Version = "v1.1.0" 6 | // ReleaseDate is also auto-generated from ChangeLog.md 7 | ReleaseDate = "2023-03-05" 8 | ) 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dev environment 15 | .DS_Store 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | 3 | help: 4 | @echo Targets: clean, fmt, test and version 5 | 6 | .PHONY: clean 7 | clean: 8 | go clean 9 | 10 | .PHONY: fmt 11 | fmt: 12 | gofmt -s -w . 13 | 14 | .PHONY: test tests 15 | test tests: 16 | go vet ./... 17 | go test ./... 18 | 19 | .PHONY: testrace 20 | testrace: 21 | go test -race ./... 22 | 23 | .PHONY: version 24 | version: 25 | sh make_version.sh ChangeLog.md >version.go 26 | -------------------------------------------------------------------------------- /enable.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "net/http" 5 | ) 6 | 7 | // Enable activates cslb processing for the http.Transport. 
The same transport is returned as a 8 | // convenience to the caller so they can make the Enable function part of a wrapper chain, thus: 9 | // 10 | // client := &http.Client{Transport: cslb.Enable(&http.Transport{})} 11 | // 12 | // The Enable function replaces the http.Transport.DialContext with cslb's dialContext. 13 | func Enable(ht *http.Transport) *http.Transport { 14 | ht.DialContext = getCSLB().dialContext 15 | 16 | return ht 17 | } 18 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # cslb Change Log 2 | ### v1.1.0 -- 2023-03-05 3 | * Incorporate all minor package changes since v1.0.0 4 | ### v1.0.0 -- 2021-12-16 5 | * Move from Travis to github actions 6 | * Tag package 7 | ### v0.1.0 -- 2021-03-21 8 | * Added go.mod 9 | * Bumped travis upper go version to 1.16 10 | * Started tagging releases to support go modules 11 | ### v0.0.2 -- 2019-10-05 12 | * Fixed typos and clarified comments - no bug fixes or functionality changes 13 | * Bumped travis upper go version to 1.13.1 14 | ### v0.0.1 -- 2019-07-15 15 | * Initial public release. 16 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | build: 7 | name: Build and Test 8 | strategy: 9 | matrix: 10 | os: [ ubuntu-latest ] 11 | go: [ 1.21.x ] 12 | runs-on: ${{ matrix.os }} 13 | steps: 14 | - name: Set up Go 15 | uses: actions/setup-go@v3 16 | with: 17 | go-version: ${{ matrix.go }} 18 | 19 | - name: Check out code 20 | uses: actions/checkout@v3 21 | 22 | - name: Build 23 | run: go build -v ./... 24 | 25 | - name: Test 26 | run: go test -v ./...
27 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: codecov 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | run: 7 | strategy: 8 | matrix: 9 | os: [ ubuntu-latest ] 10 | go: [ 1.21.x ] 11 | runs-on: ${{ matrix.os }} 12 | steps: 13 | - uses: actions/checkout@main 14 | 15 | - name: Set up Go 16 | uses: actions/setup-go@v3 17 | with: 18 | go-version: ${{ matrix.go }} 19 | 20 | - name: Generate Coverage Report 21 | run: go test ./... -coverprofile=coverage.txt -covermode=atomic 22 | 23 | - name: Upload Coverage Report to Codecov 24 | uses: codecov/codecov-action@v3 25 | with: 26 | file: ./coverage.txt 27 | -------------------------------------------------------------------------------- /make_version.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # Extract version and release date info from the ChangeLog.md 4 | 5 | cl=$1 6 | if [ -z "$cl" ]; then 7 | echo Error: Need the changelog file as parameter one >&2 8 | exit 1 9 | fi 10 | 11 | # Looking for '### version -- date' 12 | # $1 $2 $3 $4 13 | 14 | recent=`grep '^### v' $cl | head -1` 15 | if [ $? 
-ne 0 ]; then 16 | echo Error: changelog $cl does not contain a version heading >&2 17 | exit 1 18 | fi 19 | 20 | set -- $recent 21 | version=$2 22 | date=$4 23 | printf 'package cslb\n\nconst (\n' 24 | printf '\t// Version is auto-generated from ChangeLog.md\n' 25 | printf '\tVersion = "%s"\n' "${version}" 26 | printf '\t// ReleaseDate is also auto-generated from ChangeLog.md\n' 27 | printf '\tReleaseDate = "%s"\n' "${date}" 28 | printf ')\n' 29 | 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, 2020, Mark Delany 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /globals.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | /* 4 | 5 | globals and the manipulation of the current cslb predominantly exist for tests. If we didn't have 6 | tests, globals would solely consist of an init() function something like: 7 | 8 | func init() { 9 | currentCSLB = newCslb() 10 | Enable(http.DefaultTransport.(*http.Transport)) 11 | } 12 | 13 | because in a normal application only one cslb is created and it lives for the life of the program. 14 | 15 | */ 16 | 17 | import ( 18 | "net/http" 19 | "sync" 20 | ) 21 | 22 | var ( 23 | cslbMu sync.RWMutex // For go test -race as tests creates new cslbs so they can 24 | currentCSLB *cslb // be sure they are working with a known initial state. 25 | ) 26 | 27 | // init enables the http DefaultTransport for CSLB processing. Perhaps we should have an env 28 | // variable to control this or possibly not even enable it by default? 29 | func init() { 30 | realInit().start() 31 | } 32 | 33 | // realInit is separated out from init() so tests can call it multiple times without knowing the 34 | // innards of what is needed to reset the globals to their initial state. 
35 | func realInit() *cslb { 36 | cslb := setCSLB(newCslb()) 37 | Enable(http.DefaultTransport.(*http.Transport)) 38 | 39 | return cslb 40 | } 41 | 42 | func setCSLB(c *cslb) *cslb { 43 | cslbMu.Lock() 44 | defer cslbMu.Unlock() 45 | 46 | currentCSLB = c 47 | 48 | return c 49 | } 50 | 51 | func getCSLB() *cslb { 52 | cslbMu.RLock() 53 | defer cslbMu.RUnlock() 54 | 55 | return currentCSLB 56 | } 57 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '25 0 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'go' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 
44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v3 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v3 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v3 71 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ## TODO List for cslb 2 | 3 | A dumping ground for unresolved issues and discussion topics. 4 | 5 | ### Health Check Interface 6 | 7 | Cslb only knows about network connections, not application success. It may be that an application 8 | could communicate application success or otherwise via a call back into cslb. This would help cslb 9 | make better choices over which targets to use. The main difficulty is correlating the application's 10 | URL with the actual target causing the problem. Even a single-threaded application is tricky due 11 | to connection caching by net/http.
12 | 13 | ``` 14 | resp, err := http.Get(url) 15 | if err != nil || resp.StatusCode != http.StatusOK { 16 | cslb.Failed(url) 17 | } 18 | ``` 19 | 20 | Seems obvious, but which target did cslb use? 21 | 22 | ### Re-fetch active SRVs 23 | 24 | To avoid adding DNS delays to application requests, cslb could re-fetch active SRVs in anticipation 25 | of their reuse. Triggering a fetch, say, five seconds or so before expiry should do the trick. 26 | 27 | ### Placing a weight in the health-check response 28 | 29 | Weights in SRV are obviously a relatively static value. While a GSLB can be used to selectively 30 | respond to SRV queries a more refined approach might be to have the health-check return some sort of 31 | structured content (such as json or xml) containing a utilization value which biases SRV 32 | weights. For example if 2 out of 3 targets are returning 50% utilization and the third is returning 33 | 1% utilization, cslb could bias more connections toward the third server. 34 | 35 | ### Different algorithms beyond weight? 36 | 37 | Server-side load-balancers have traditionally offered a variety of different selection algorithms, 38 | such as least-load, least-latency, round-robin, least-connections and more. In the client's case we 39 | can also add network latency as a desirable attribute to consider. Could some of these algorithms be 40 | incorporated into cslb? A service operator could communicate a preferred strategy by, e.g., encoding 41 | algorithm information into weights using an unlikely signal value. 42 | 43 | Purpose | Bits | Value(s) 44 | --------- | ---- | -------- 45 | Weight | 8 | Normal weight ranges from 0-255 46 | Signal | 5 | 0x1F (all ones) 47 | Algorithm | 3 | 0 = RR, 1 = Latency, 2 = Least Connections, ...
7 = Last 48 | 49 | Given three servers: s1, s2, s3 with a weight of 10, 15 and 20 respectively, the 16-bit value of the 50 | weights would then look like: 51 | 52 | Server | Calculation | Encoded Weight | Ratios 53 | ------- | ----------- | ----- | ------ 54 | s1 | 10 << 8 + 0x1F << 3 + 0 | = 2808 | 10.0 55 | s2 | 15 << 8 + 0x1F << 3 + 0 | = 4088 | 14.6 56 | s3 | 20 << 8 + 0x1F << 3 + 0 | = 5368 | 19.1 57 | 58 | The rationale for having the original weight in the top eight bits as opposed to the more obvious 59 | bottom eight bits is that implementations that do not understand this encoding will still work as 60 | the encoded weights still approximate the same ratios as the original values. 61 | 62 | 63 | ### ------ 64 | -------------------------------------------------------------------------------- /health_test.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | // Much of health is checked indirectly via srv_test so this test module only tests those things 10 | // that are missed.
11 | 12 | // Test the health check go-routine 13 | func TestHealthFetchAndRun(t *testing.T) { 14 | cslb := realInit() 15 | mr := newMockResolver() 16 | mr.appendTXT("_80"+cslb.HealthCheckTXTPrefix+"s1.example.net", []string{"http://google.com"}) 17 | mr.appendTXT("_80"+cslb.HealthCheckTXTPrefix+"s2.example.net", []string{"http:\ngoogle.badurl.com"}) 18 | cslb.netResolver = mr 19 | 20 | cslb.HealthTTL = time.Second * 5 21 | cslb.HealthCheckFrequency = time.Second * 5 22 | cslb.HealthCheckContentOk = "No Way This is in Google" // Make it highly unlikely content 23 | 24 | cslb.start() // Safe to start now that all configs have been set 25 | defer cslb.stop() 26 | 27 | ceh := &ceHealth{expires: time.Now().Add(time.Second * 3)} 28 | 29 | cslb.fetchAndRunHealthCheck(makeHealthStoreKey("s2.example.net", 80), ceh) 30 | if ceh.unHealthy { 31 | t.Error("Fetch should have failed as URL in TXT is bogus", ceh.url) 32 | } 33 | ceh = &ceHealth{expires: time.Now().Add(time.Second * 3)} 34 | cslb.fetchAndRunHealthCheck(makeHealthStoreKey("s1.example.net", 80), ceh) 35 | if !ceh.unHealthy { 36 | t.Error("Expected unhealthy to be set as Content should't match") 37 | } 38 | cslb.HealthCheckContentOk = "html" // Make it something google is bound to return 39 | ceh = &ceHealth{expires: time.Now().Add(time.Second * 3)} 40 | cslb.fetchAndRunHealthCheck(makeHealthStoreKey("s1.example.net", 80), ceh) 41 | if ceh.unHealthy { 42 | t.Error("Expected healthy to be set as Content should match") 43 | } 44 | } 45 | 46 | // Test that the cache cleaner works 47 | func TestHealthCleaner(t *testing.T) { 48 | cslb := realInit() // Do not start cleaners automatically 49 | now := time.Now() 50 | yesterday := now.AddDate(0, 0, -1) // Yesterday 51 | 52 | cslb.setDialResult(now, "residual.example.net", 443, fmt.Errorf("")) 53 | for ix := 0; ix < 99; ix++ { 54 | cslb.setDialResult(yesterday, fmt.Sprintf("%d.example.net", ix), 80, nil) 55 | } 56 | cslb.healthStore.start(time.Second / 2) 57 | defer 
cslb.healthStore.stop() 58 | 59 | time.Sleep(time.Second) // Give it a chance to do its job 60 | cslb.healthStore.RLock() 61 | origLen := len(cslb.healthStore.cache) 62 | cslb.healthStore.RUnlock() 63 | if origLen != 1 { 64 | t.Error("Expected one entry, not", origLen) 65 | } 66 | } 67 | 68 | func TestTrimTo(t *testing.T) { 69 | s1 := "Not truncated at all" 70 | s := trimTo(s1, 100) 71 | if s1 != s { 72 | t.Error("trimTo truncated when not expected to", s1, s) 73 | } 74 | 75 | s2 := "Is truncated somewhat" 76 | s = trimTo(s2, 10) 77 | if s == s2 { 78 | t.Error("trimTo did not truncated when expected to", s2) 79 | } 80 | if len(s) > 10 { 81 | t.Error("trimTo did not trim enough. Wanted 10, got", len(s), s) 82 | } 83 | 84 | s = trimTo("xxxxx", 2) 85 | if s != "..." { 86 | t.Error("Extremely short trimTo not converted to ...", s) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /status_test.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "context" 5 | "io/ioutil" 6 | "net/http" 7 | "strings" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func TestStatusTemplates(t *testing.T) { 13 | ss := newStatusServer(newCslb()) 14 | for _, tn := range []string{"config", "cslb", "srv", "health"} { // Check that all templates have parsed ok 15 | tmpl := ss.allTmpl.Lookup(tn) 16 | if tmpl == nil { 17 | t.Error("Template", tn, "missing from parsed template allTmpl") 18 | } 19 | } 20 | 21 | if ss.trailerTmpl.Lookup("trailer") == nil { 22 | t.Error("Template 'trailer' missing from parsed template trailerTmpl") 23 | } 24 | } 25 | 26 | const ( 27 | sssListen = "127.0.0.1:55080" 28 | ) 29 | 30 | // Test that the web server starts and ostensibly serves the intended web page 31 | func TestStatusStartStop(t *testing.T) { 32 | cslb := newCslb() 33 | cslb.StatusServerAddress = sssListen 34 | ss := newStatusServer(cslb) 35 | go ss.start() 36 | time.Sleep(time.Second) // Give 
server a chance to start 37 | resp, err := http.Get("http://" + sssListen + "/") 38 | if err != nil { 39 | t.Fatal(err) 40 | } 41 | body, err := ioutil.ReadAll(resp.Body) 42 | resp.Body.Close() 43 | if err != nil { 44 | t.Fatal(err) 45 | } 46 | str := string(body) 47 | if !strings.Contains(str, "Client Side Load Balancing") { 48 | t.Error("GET of status page did not return title 'Client Side Load Balancing'", trimTo(str, 200)) 49 | } 50 | 51 | if !strings.Contains(str, "Brought to you by") { 52 | t.Error("GET of status page did not return trailer 'Brought to you by'", trimTo(str, 200)) 53 | } 54 | ss.stop(context.Background()) 55 | time.Sleep(time.Second) 56 | resp, err = http.Get("http://" + sssListen + "/") // Should get a connection failed if stop() worked 57 | if err == nil { 58 | t.Error("Expected 'Connection refused' since server is meant to have stopped") 59 | } 60 | } 61 | 62 | // Check that the SRV/HC caches are represented in the status server output 63 | func TestStatusCacheEntries(t *testing.T) { 64 | cslb := newCslb() 65 | cslb.InterceptTimeout = time.Second 66 | cslb.StatusServerAddress = sssListen 67 | mr := newMockResolver() 68 | mr.appendSRV("http", "tcp", "localhost", "localhost", 50087, 1, 1) 69 | mr.appendSRV("http", "tcp", "localhost", "localhost", 50088, 1, 1) 70 | mr.appendSRV("http", "tcp", "localhost", "localhost", 50089, 1, 1) 71 | mr.appendSRV("https", "tcp", "notfound.example.net", "", 0, 0, 0) 72 | mr.appendTXT("_50088"+cslb.HealthCheckTXTPrefix+"localhost", []string{"http://google.com"}) 73 | cslb.netResolver = mr 74 | cslb.start() 75 | defer cslb.stop() 76 | 77 | cslb.dialContext(context.Background(), "tcp", "localhost:80") 78 | cslb.dialContext(context.Background(), "tcp", "notfound.example.net:443") 79 | cslb.dialContext(context.Background(), "tcp", "nxdomain.example.net:80") 80 | 81 | time.Sleep(2 * time.Second) // Give both status server and HC a chance to get started 82 | 83 | resp, err := http.Get("http://" + sssListen + "/") 84 
| if err != nil { 85 | t.Fatal(err) 86 | } 87 | body, err := ioutil.ReadAll(resp.Body) 88 | resp.Body.Close() 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | str := string(body) 93 | for _, expect := range []string{"_http._tcp.localhost", "http://google.com", "**NXDomain**"} { 94 | if !strings.Contains(str, expect) { 95 | t.Error("Status page", sssListen, "does not have", expect, trimTo(str, 300)) 96 | } 97 | } 98 | } 99 | 100 | // Test template globbing 101 | func TestStatusGlob(t *testing.T) { 102 | cslb := newCslb() 103 | cslb.StatusServerTemplates = "testdata/templates/*.tmpl" 104 | cslb.StatusServerAddress = sssListen 105 | ss := newStatusServer(cslb) // Should load the testdata templates 106 | go ss.start() 107 | time.Sleep(time.Second) // Give server a chance to start 108 | resp, err := http.Get("http://" + sssListen + "/") 109 | if err != nil { 110 | t.Fatal(err) 111 | } 112 | body, err := ioutil.ReadAll(resp.Body) 113 | resp.Body.Close() 114 | if err != nil { 115 | t.Fatal(err) 116 | } 117 | str := string(body) 118 | if !strings.Contains(str, "empty but nonetheless functional") { 119 | t.Error("GET of status page did not return 'empty but nonetheless functional' from template file", 120 | trimTo(str, 200)) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /cslb_test.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "io/ioutil" 5 | "net/http" 6 | "os" 7 | "strings" 8 | "testing" 9 | "time" 10 | ) 11 | 12 | func unsetAll() { 13 | os.Unsetenv(cslbEnvPrefix + "options") 14 | os.Unsetenv(cslbEnvPrefix + "hc_ok") 15 | 16 | os.Unsetenv(cslbEnvPrefix + "dial_veto") 17 | os.Unsetenv(cslbEnvPrefix + "hc_freq") 18 | os.Unsetenv(cslbEnvPrefix + "nxd_ttl") 19 | os.Unsetenv(cslbEnvPrefix + "srv_ttl") 20 | os.Unsetenv(cslbEnvPrefix + "tar_ttl") 21 | os.Unsetenv(cslbEnvPrefix + "timeout") 22 | } 23 | 24 | // Test that newCslb notices good env 
variables. This blows away any env variables that might have 25 | // been inherited by the test executable. 26 | func TestCSLBGoodOptions(t *testing.T) { 27 | os.Setenv(cslbEnvPrefix+"options", "dhirsHCN") 28 | os.Setenv(cslbEnvPrefix+"hc_ok", "BIG OK") 29 | 30 | os.Setenv(cslbEnvPrefix+"dial_veto", "5m") 31 | os.Setenv(cslbEnvPrefix+"hc_freq", "10m") 32 | os.Setenv(cslbEnvPrefix+"nxd_ttl", "15m") 33 | os.Setenv(cslbEnvPrefix+"srv_ttl", "20m") 34 | os.Setenv(cslbEnvPrefix+"tar_ttl", "25m") 35 | os.Setenv(cslbEnvPrefix+"timeout", "30m") 36 | 37 | cslb := newCslb() 38 | if !cslb.PrintHCResults || !cslb.PrintIntercepts || !cslb.PrintSRVLookup || !cslb.PrintDialContext || 39 | !cslb.PrintDialResults || !cslb.DisableHealthChecks || !cslb.DisableInterception || 40 | !cslb.AllowNumericServices { 41 | t.Error("At least one option not set", cslb.config) 42 | } 43 | 44 | if cslb.HealthCheckContentOk != "BIG OK" { 45 | t.Error("HealthCheckContentOk not set") 46 | } 47 | if cslb.DialVetoDuration != time.Minute*5 { 48 | t.Error("DialVetoDuration not set") 49 | } 50 | if cslb.HealthCheckFrequency != time.Minute*10 { 51 | t.Error("HealthCheckFrequency not set") 52 | } 53 | if cslb.NotFoundSRVTTL != time.Minute*15 { 54 | t.Error("NotFoundSRVTTL not set") 55 | } 56 | if cslb.FoundSRVTTL != time.Minute*20 { 57 | t.Error("FoundSRVTTL not set") 58 | } 59 | if cslb.HealthTTL != time.Minute*25 { 60 | t.Error("HealthTTL not set") 61 | } 62 | if cslb.InterceptTimeout != time.Minute*30 { 63 | t.Error("InterceptTimeout not set") 64 | } 65 | 66 | unsetAll() 67 | } 68 | 69 | func TestCSLBBadOptions(t *testing.T) { 70 | os.Setenv(cslbEnvPrefix+"options", "xxXX") 71 | 72 | os.Setenv(cslbEnvPrefix+"dial_veto", "0s") // Cover 73 | os.Setenv(cslbEnvPrefix+"hc_freq", "2h") // all 74 | os.Setenv(cslbEnvPrefix+"nxd_ttl", "junk") // error paths 75 | os.Setenv(cslbEnvPrefix+"srv_ttl", "junk") 76 | os.Setenv(cslbEnvPrefix+"tar_ttl", "junk") 77 | os.Setenv(cslbEnvPrefix+"timeout", "junk") 78 | 79 | cslb 
:= newCslb() 80 | if cslb.PrintHCResults || cslb.PrintIntercepts || cslb.PrintSRVLookup || cslb.PrintDialContext || 81 | cslb.DisableHealthChecks || cslb.DisableInterception { 82 | t.Error("At least one option unexpectedly set", cslb.config) 83 | } 84 | 85 | if cslb.DialVetoDuration != defaultDialVetoDuration { 86 | t.Error("DialVetoDuration was set") 87 | } 88 | if cslb.HealthCheckFrequency != defaultHealthCheckFrequency { 89 | t.Error("HealthCheckFrequency was set") 90 | } 91 | if cslb.NotFoundSRVTTL != defaultNotFoundSRVTTL { 92 | t.Error("NotFoundSRVTTL was set") 93 | } 94 | if cslb.FoundSRVTTL != defaultFoundSRVTTL { 95 | t.Error("FoundSRVTTL was set") 96 | } 97 | if cslb.HealthTTL != defaultHealthTTL { 98 | t.Error("HealthTTL was set") 99 | } 100 | if cslb.InterceptTimeout != defaultInterceptTimeout { 101 | t.Error("InterceptTimeout was set") 102 | } 103 | 104 | unsetAll() 105 | } 106 | 107 | func TestCloneStats(t *testing.T) { 108 | cslb := newCslb() 109 | 110 | var ls cslbStats 111 | ls.FailedDials = 23 112 | ls.Deadline = 12 113 | ls.DialContext = 101 114 | 115 | cslb.addStats(&ls) 116 | 117 | s := cslb.cloneStats() 118 | 119 | if s.FailedDials != ls.FailedDials || s.Deadline != ls.Deadline || s.DialContext != ls.DialContext { 120 | t.Error("cloneStats does not agree with added stats", ls, s) 121 | } 122 | } 123 | 124 | func TestCslbStartStop(t *testing.T) { 125 | cslb := newCslb() 126 | cslb.StatusServerAddress = sssListen 127 | cslb.start() 128 | time.Sleep(time.Second) // Give server a chance to start 129 | 130 | resp, err := http.Get("http://" + sssListen + "/") 131 | if err != nil { 132 | t.Fatal(err) 133 | } 134 | body, err := ioutil.ReadAll(resp.Body) 135 | resp.Body.Close() 136 | if err != nil { 137 | t.Fatal(err) 138 | } 139 | str := string(body) 140 | if !strings.Contains(str, "Client Side Load Balancing") { 141 | t.Error("GET of status page did not return title 'Client Side Load Balancing'", trimTo(str, 200)) 142 | } 143 | 144 | cslb.stop() 145 
| time.Sleep(time.Second) 146 | 147 | resp, err = http.Get("http://" + sssListen + "/") 148 | if err == nil { 149 | t.Error("Expected connection refused after cslb.stop()") 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## CSLB - A Go Client-Side Load-Balancer for HTTP/HTTPS 2 | 3 | `Cslb` is a client-side load-balancer for Go HTTP/HTTPS applications. `Cslb` is an alternative to 4 | server-side load-balancers which add deployment and diagnostic complexity, cost, throughput 5 | constraints and which also create an additional point of possible failure. `Cslb` puts load-balancer 6 | intelligence into your Go clients so you can simplify your deployment and potentially eliminate 7 | server-side load-balancers. 8 | 9 | In many cases the only action needed to take advantage of `cslb` is to import the package and add an 10 | SRV entry to your DNS. At that point, on behalf of your application, `cslb` automatically deals with 11 | failed servers and spreads load across serving targets according to your load-distribution rules. In 12 | addition, once you have `cslb` in place you can also run a "canary" alerting service which can 13 | notify you when clients are failing to reach their correct services. 14 | 15 | The primary goal of `cslb` is to make client-side load-balancing a no-brainer for your Go application. 
16 | 17 | [![Build](https://github.com/markdingo/cslb/actions/workflows/go.yml/badge.svg)](https://github.com/markdingo/cslb/actions/workflows/go.yml) 18 | [![Go Report Card](https://goreportcard.com/badge/github.com/markdingo/cslb)](https://goreportcard.com/report/github.com/markdingo/cslb) 19 | [![codecov](https://codecov.io/gh/markdingo/cslb/branch/main/graph/badge.svg)](https://codecov.io/gh/markdingo/cslb) 20 | [![](https://godoc.org/github.com/markdingo/cslb?status.svg)](https://godoc.org/github.com/markdingo/cslb) 21 | 22 | ### Installation 23 | 24 | `Cslb` is a standard Go package thus if your program is go-module aware (which is to say 25 | you've run "go mod init") then `cslb` is pulled in when you run `"go mod tidy"` or you can 26 | specifically pull it in with: 27 | 28 | 29 | ```sh 30 | $ go get -u github.com/markdingo/cslb 31 | ``` 32 | 33 | At this stage `cslb` has no package dependencies beyond the standard packages shipped with the Go 34 | compiler. `Cslb` requires Go 1.21 or greater. 35 | 36 | ### Application Changes 37 | 38 | To take advantage of `cslb` a program simply imports the package at which point `cslb` automatically 39 | starts performing client-side load-balancing by overriding the `DialContext` of the 40 | `http.DefaultTransport`. If the program uses its own `http.Transport` then its `DialContext` needs to 41 | be similarly replaced. Here is the before and after code which shows the application changes needed: 42 | 43 | ### Before 44 | 45 | ```go 46 | 47 | import ( 48 | "net/http" 49 | ) 50 | 51 | func main() { 52 | resp, err := http.Get("http://example.net/resource") 53 | ... 54 | ``` 55 | 56 | ### After 57 | 58 | ```go 59 | 60 | import ( 61 | "net/http" 62 | _ "github.com/markdingo/cslb" 63 | ) 64 | 65 | func main() { 66 | resp, err := http.Get("http://example.net/resource") 67 | ... 68 | ``` 69 | 70 | and that's it! 71 | 72 | One line of import code and no changes to application code fetching HTTP resources.
The package 73 | documentation describes what to do if your application uses its own `http.Transport`. Essentially 74 | you have to enable `cslb` for that Transport. 75 | 76 | ### SRV Resource Records 77 | 78 | `Cslb` processing is activated by the presence of SRV Resource Records (RRs) matching the requested 79 | hostname using the prescribed [RFC2782](https://tools.ietf.org/rfc/rfc2782.txt) formulation. If no 80 | SRV RR exists, `cslb` is completely transparent and passive. `Cslb` caches the presence or otherwise of 81 | SRVs to minimize its impact in a non-SRV environment. This means you can deploy with `cslb` at any 82 | time and activate the functionality at a later stage. 83 | 84 | ### Fail-Over and Load-Balancing Mechanism 85 | 86 | `Cslb` intercepts `DialContext` Requests made by `net/http` and makes internal `DialContext` Requests 87 | to the SRV targets. The first successful connection is returned to the calling application. In 88 | effect this provides a fail-over capability. 89 | 90 | `Cslb` achieves load-balancing by implementing the SRV selection algorithm which provides a lot of 91 | flexibility in terms of preferring targets by priority and distributing connections by weight within 92 | priority. The package documentation shows how this works. 93 | 94 | ### No Server-side changes required 95 | 96 | No server-side changes are required to use `cslb` - apart from possibly dispensing with your 97 | server-side load-balancers! You can even use `cslb`-enabled applications on third-party services with 98 | appropriate DNS finagling. It's also possible to use `cslb` in conjunction with an existing 99 | server-side load-balancer deployment by placing the load-balancer's targets in an SRV RR.
100 | 101 | ### Active Health Checks 102 | 103 | In addition to the passive collection of fail-over data based on `DialContext` results, `cslb` has an 104 | optional "active mode" where a per-target health-check URL is periodically polled to determine the 105 | health of a target. If a health-check URL fails, that target is removed from the target candidate 106 | list for a configured time period. The health-check URL is defined by a TXT RR in the DNS. The 107 | package documentation describes the naming convention and syntax. 108 | 109 | ### Status Web Page 110 | 111 | Insights into the behaviour of `cslb` within your application are available via an optional status 112 | page. When activated via an environment variable the web page provides statistics on intercepted 113 | `DialContext` Requests, information on cached SRV and healthcheck results. 114 | 115 | ### Community 116 | 117 | If you have any problems using `cslb` or suggestions on how it can do a better job, don't hesitate to 118 | create an [issue](https://github.com/markdingo/cslb/issues) or email the 119 | [authors](https://github.com/markdingo/cslb/blob/master/AUTHORS) directly. This package can only 120 | improve with your feedback. 121 | 122 | ### Copyright and License 123 | 124 | Cslb is Copyright :copyright: 2019,2020,2021 Mark Delany. This software is licensed under the BSD 2-Clause "Simplified" License. 125 | -------------------------------------------------------------------------------- /dial.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // dialResult is passed thru a channel back to the interceptor 12 | type dialResult struct { 13 | conn net.Conn 14 | err error 15 | } 16 | 17 | // dialContext replaces the DialContext function of net.Dialer used in http.Transport. 
It looks up 18 | // the deduced SRV in the DNS and if present, runs the load-balancing logic against the returned 19 | // targets before calling net.DialContext. Multiple connection attempts to different targets are 20 | // tried in an effort to select a functioning target. 21 | // 22 | // dialContext effectively implements what http clients should have implemented years ago but the 23 | // http crowd seem very reluctant to add latency to each web request by preceding it with an 24 | // additional DNS lookup so it hasn't happened thus far. Maybe the proposed HTTPSSVC, or whatever it 25 | // ends up being, will solve that problem? We'll see. 26 | // 27 | // If the supplied context contains a deadline dialContext honors that deadline, otherwise it 28 | // creates a "WithTimeout" context using the configured timeout. Unlike net.Dialer.DialContext the 29 | // deadline is not amortized across all targets. In part because we want to prefer the earlier 30 | // targets because that's how we've been instructed via the SRV; in part because we don't really 31 | // know how many address records there are across the different targets and finally in part because 32 | // a large number of targets implies an absurdly small amortized deadline per target - particularly 33 | // as net.Dialer.DialContext is doing yet more amortization per target of our amortization. All of 34 | // which can be coded around to arrive at a workable compromise, but it's unclear the additional 35 | // complexity buys us very much and determining the benefit is tough.
36 | func (t *cslb) dialContext(ctx context.Context, network, address string) (net.Conn, error) { 37 | var ls cslbStats // Accumulate stats locally then 38 | defer t.addStats(&ls) // transfer to cslb at the end 39 | 40 | ls.DialContext++ 41 | host, port := extractHostPort(strings.ToLower(address)) // Slough off trailing :port 42 | if t.PrintDialContext { 43 | fmt.Println("cslb.dialContext:intercept", network, address, "gives", host, "and", port) 44 | } 45 | 46 | // Convert the numeric port number back to a service name to formulate the SRV qName. This 47 | // is error prone as there is not necessarily any correlation between the two. E.g. with 48 | // http.Get("https://example.net:80/resource") the conversion results in qName of 49 | // _http._tcp.example.net which is unlikely to be what the caller wanted, but what can you 50 | // do? The problem is that the scheme on the original URL is not visible to us in any 51 | // way. Hardly surprising since net.DialContent is a generalized service. The only real 52 | // solution is if the net/http package were to introduce its own dialer interface which 53 | // passes scheme and port down to the Dial functions. 54 | 55 | service := "" 56 | switch port { // Map services that we can enable (which is only net/http for now) 57 | case "80": 58 | service = "http" 59 | case "443": 60 | service = "https" 61 | default: 62 | if t.AllowNumericServices { // Are we allowed to try _1443._tcp.$domain ? 63 | service = port 64 | } 65 | } 66 | 67 | // Everything has to be "just right" before we run the intercept logic. If not, pass thru to 68 | // the system dialContext and fuggedaboutit! 69 | if len(host) == 0 || len(service) == 0 || t.DisableInterception { 70 | ls.MissHostService++ 71 | return t.systemDialContext(ctx, network, address) 72 | } 73 | 74 | now := time.Now() 75 | ls.StartTime = now 76 | 77 | // If the supplied context does not have a deadline, derive WithTimeout context and set it 78 | // with our configured timeout. 
79 | 80 | if deadline, ok := ctx.Deadline(); !ok || deadline.IsZero() { 81 | subCtx, cancel := context.WithTimeout(ctx, t.InterceptTimeout) 82 | defer cancel() 83 | ctx = subCtx // The WithTimeout context becomes our default context 84 | } 85 | 86 | cesrv := t.lookupSRV(ctx, now, service, network, host) 87 | if t.PrintSRVLookup { 88 | fmt.Println("cslb.dialContext:lookupSRV", service, network, host, cesrv.uniqueTargets(), cesrv) 89 | } 90 | if cesrv.uniqueTargets() == 0 { // Empty or non-existent SRV means revert to system Dailer 91 | ls.NoSRV++ 92 | return t.systemDialContext(ctx, network, address) 93 | } 94 | 95 | // Because we need to select on the cancel channel, run the iteration in a separate 96 | // go-routine and have it return the results via a channel that we can also select on. The 97 | // dialIterate function is responsible for closing the channel to ensure we don't leak. 98 | 99 | returned := make(chan dialResult) 100 | go t.dialIterate(ctx, cesrv, network, address, returned) 101 | select { 102 | case result := <-returned: // Some sort of response from dialIterate 103 | return result.conn, result.err 104 | 105 | case <-ctx.Done(): // Cancel or deadline exceeded 106 | return nil, ctx.Err() 107 | } 108 | 109 | // NOT REACHED 110 | } 111 | 112 | // dialIterate iterates over bestTargets until it gets a good connection, runs out of time or runs 113 | // out of unique targets. Because a failed target is put at the bottom of the pile in terms of 114 | // isGood() and nextDialAttempt it should only recur if bestTarget() has cycled thru *all* possible 115 | // good targets and all targets with a closer nextDialAttempt. 116 | // 117 | // Results are returned via the result channel as we're started as a separate go-routine. 
118 | func (t *cslb) dialIterate(ctx context.Context, cesrv *ceSRV, network, address string, result chan dialResult) { 119 | var ls cslbStats // Do not set StartTime for nested stats 120 | var lastError error 121 | 122 | defer t.addStats(&ls) // Transfer counters back to the parent when we're done 123 | defer close(result) // This function is responsible for closing the dialResult channel 124 | 125 | dupes := make(map[string]bool) // Track targets to detect bestTarget() cycling 126 | for { 127 | ls.BestTarget++ 128 | srv := t.bestTarget(cesrv) // Returns a single synthesized *net.SRV with target 129 | newAddress := fmt.Sprintf("%s:%d", srv.Target, int(srv.Port)) 130 | if dupes[newAddress] { // If we've iterated over all targets, stop 131 | ls.DupesStopped++ 132 | result <- dialResult{nil, 133 | fmt.Errorf("cslb: All unique targets failed for %s/%s. Tried: %d. Last Error: %s", 134 | address, newAddress, len(dupes), lastError)} 135 | return 136 | } 137 | dupes[newAddress] = true 138 | if t.PrintIntercepts { 139 | fmt.Println("cslb.dialContext:SRV", address, "to target", network, newAddress) 140 | } 141 | nc, err := t.systemDialContext(ctx, network, newAddress) 142 | lastError = err 143 | now := time.Now() 144 | t.setDialResult(now, srv.Target, int(srv.Port), err) 145 | if t.PrintDialResults { 146 | fmt.Println("cslb.systemDialContext:Results", network, newAddress, err) 147 | } 148 | if err == nil { // Success! 149 | ls.GoodDials++ 150 | result <- dialResult{nc, nil} 151 | return 152 | } 153 | ls.FailedDials++ 154 | } 155 | 156 | // NOT REACHED 157 | } 158 | 159 | // extractHostPort extracts the hostname from the address, if there is one. Possible inputs are: 160 | // example.com:80, 127.0.0.1:80 and [::1]:443 only the first of which returns a non-zero host of 161 | // "example.com". 
// Unlike net.SplitHostPort, extractHostPort does not consider IP addresses to be valid hosts for
// splitting purposes, nor is there an error return, as the caller doesn't want to impose its own
// (possibly different) constraints on an address which is interpreted by net.DialContext.
//
// Both returned strings are empty whenever the address is not of the form "hostname:port": an
// empty address, a bracketed IPv6 literal, a missing or leading colon, an empty port, a host part
// which itself contains a colon or an IP address in the host position.
func extractHostPort(address string) (host, port string) {
	if len(address) == 0 { // Nothing to split - and indexing address[0] would panic
		return "", ""
	}
	if address[0] == '[' { // Does it look like a wrapped ipv6 address?
		return "", ""
	}

	lastColon := strings.LastIndex(address, ":")
	if lastColon < 1 { // Unrecognized format - no colon at all or nothing in front of it
		return "", ""
	}
	if lastColon+1 == len(address) { // Unrecognized format - trailing colon with no port
		return "", ""
	}

	host = address[:lastColon]
	port = address[lastColon+1:]

	// A hostname can never contain a colon, so whatever this is (most likely an unwrapped ipv6
	// address, possibly with a zone) it is not something to form an SRV qName from.
	if strings.Contains(host, ":") {
		return "", ""
	}
	if net.ParseIP(host) != nil { // Easiest way to determine whether hostname or IP address
		return "", ""
	}

	return host, port
}
34 | } 35 | 36 | return strings.ReplaceAll(s, "\n", " ") + "\n" 37 | } 38 | 39 | func makeHTTPServer(srv *server, startHC bool) { 40 | mux := http.NewServeMux() 41 | mux.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) { 42 | srv.Lock() 43 | defer srv.Unlock() 44 | if printServerSide { 45 | fmt.Println(srv.name, "hit. Have:", srv.get) 46 | } 47 | io.WriteString(w, "Hello from ") 48 | io.WriteString(w, srv.name) 49 | io.WriteString(w, " I have: ") 50 | io.WriteString(w, srv.get) 51 | io.WriteString(w, "\n") 52 | srv.getHits++ 53 | }) 54 | 55 | if startHC { 56 | mux.HandleFunc("/health", func(w http.ResponseWriter, req *http.Request) { 57 | srv.Lock() 58 | defer srv.Unlock() 59 | if printServerSide { 60 | fmt.Println(srv.name, "HC Hit. Have", srv.hc) 61 | } 62 | io.WriteString(w, "Hello from ") 63 | io.WriteString(w, srv.name) 64 | io.WriteString(w, " I have: ") 65 | io.WriteString(w, srv.hc) 66 | io.WriteString(w, "\n") 67 | srv.hcHits++ 68 | }) 69 | } 70 | 71 | srv.httpServer = &http.Server{Addr: fmt.Sprintf("127.0.0.1:%d", srv.port), Handler: mux} 72 | } 73 | 74 | // Run four real servers on localhost ports 5001-5004 75 | func startAllServers(startHC bool) { 76 | srv1 = &server{name: "s1", port: 5001, get: "ONE", hc: "OK"} 77 | srv2 = &server{name: "s2", port: 5002, get: "TWO", hc: "OK"} 78 | srv3 = &server{name: "s3", port: 5003, get: "THREE", hc: "OK"} 79 | srv4 = &server{name: "s4", port: 5004, get: "FOUR", hc: "OK"} 80 | makeHTTPServer(srv1, startHC) 81 | makeHTTPServer(srv2, startHC) 82 | makeHTTPServer(srv3, startHC) 83 | makeHTTPServer(srv4, startHC) 84 | go runServer(srv1) 85 | go runServer(srv2) 86 | go runServer(srv3) 87 | go runServer(srv4) 88 | time.Sleep(time.Second / 2) // Given them a chance to listen 89 | } 90 | 91 | func runServer(srv *server) { 92 | err := srv.httpServer.ListenAndServe() 93 | if err != nil { 94 | if !strings.Contains(err.Error(), "Server closed") { 95 | fmt.Println(err) 96 | } 97 | } 98 | } 99 | 100 | func 
stopAllServers() { 101 | srv1.httpServer.Shutdown(context.Background()) 102 | srv2.httpServer.Shutdown(context.Background()) 103 | srv3.httpServer.Shutdown(context.Background()) 104 | srv4.httpServer.Shutdown(context.Background()) 105 | } 106 | 107 | // Test cslb with servers shutting down and thus having connections fail. This is a real E2E test. 108 | func TestHTTPServerShutdowns(t *testing.T) { 109 | startAllServers(false) 110 | defer stopAllServers() 111 | 112 | mr := newMockResolver() // Construct DNS entries used by intercept 113 | 114 | // Randomized order should not affect results 115 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5001, 10, 10) // srv1 116 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5004, 40, 10) // srv4 117 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5003, 20, 10) // srv3 118 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5002, 20, 10) // srv2 119 | url := "http://example.net/" 120 | 121 | cslb := realInit() 122 | cslb.netResolver = mr 123 | /* 124 | cslb.PrintDialContext = true 125 | cslb.PrintIntercepts = true 126 | cslb.PrintSRVLookup = true 127 | cslb.PrintDialResults = true 128 | */ 129 | 130 | cslb.start() 131 | defer cslb.stop() 132 | 133 | start := time.Now() 134 | get(t, url) // Should latch on to srv1 as that has the lowest priority 135 | str := get(t, url) // Should still be srv1 136 | if !strings.Contains(str, srv1.get) { 137 | t.Error("Expected", srv1.get, "got", trunc(str)) 138 | } 139 | elapse := time.Now().Sub(start) 140 | if elapse > time.Second { 141 | t.Error("Get to all up servers took way too long", elapse) 142 | } 143 | 144 | srv1.httpServer.Shutdown(context.Background()) // Kill srv1 so cslb is forced to move to srv2/srv3 145 | 146 | start = time.Now() 147 | str = get(t, url) 148 | if !strings.Contains(str, "TWO") && !strings.Contains(str, "THREE") { 149 | t.Error("Expected srv2 or srv3 to respond with srv1 shut down, but got", trunc(str)) 150 | } 151 | elapse = 
time.Now().Sub(start) 152 | if elapse > time.Second { 153 | t.Error("Get after srv1 downed took way too long", elapse) 154 | } 155 | 156 | srv2.httpServer.Shutdown(context.Background()) // Kill all servers except srv4 157 | srv3.httpServer.Shutdown(context.Background()) 158 | 159 | start = time.Now() 160 | str = get(t, url) 161 | if !strings.Contains(str, "FOUR") { 162 | t.Error("Expected srv4 to respond, but got", trunc(str)) 163 | } 164 | elapse = time.Now().Sub(start) 165 | if elapse > time.Second { 166 | t.Error("Get after srv1, 2 &3 downed took way too long", elapse) 167 | } 168 | } 169 | 170 | // Test cslb with real servers and running HC on them to make sure HC turns off "downed" servers 171 | func TestHTTPHealthCheckFailures(t *testing.T) { 172 | startAllServers(true) 173 | defer stopAllServers() 174 | 175 | mr := newMockResolver() // Construct DNS entries used by intercept 176 | 177 | // Randomized order should not affect results 178 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5001, 10, 10) // srv1 179 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5004, 40, 10) // srv4 180 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5003, 20, 10) // srv3 181 | mr.appendSRV("http", "tcp", "example.net", "localhost", 5002, 20, 10) // srv2 182 | mr.appendTXT("_5001._cslb.localhost", []string{"http", "://127.0.0.1:5001/", "health"}) 183 | mr.appendTXT("_5002._cslb.localhost", []string{"http://127.0", ".0.1:5002/health"}) 184 | mr.appendTXT("_5003._cslb.localhost", []string{"http://127.0.0.1:500", "3/health"}) 185 | mr.appendTXT("_5004._cslb.localhost", []string{"http://127.0.0.1:5004/healt", "h"}) 186 | url := "http://example.net/" 187 | 188 | cslb := realInit() 189 | cslb.netResolver = mr 190 | cslb.HealthCheckFrequency = time.Second // HC should hit every second 191 | cslb.start() 192 | defer cslb.stop() 193 | 194 | str := get(t, url) // Should be ONE, but we only care that it's something 195 | if len(str) == 0 { 196 | t.Error("Did not 
get a response from any server. They should all be running") 197 | } 198 | 199 | // The get will have caused a DNS lookup of the SRV which in turn would have populated the 200 | // healthStore which in turn would have looked up the TXT RRs for the HC URLs and started 201 | // running the HCs. Since the HCs have "OK" set they should be setting the targets as good - 202 | // even if we set them down the subsequent HC should over-ride them. After all, a connection 203 | // is just a TCP 3-way handshake, not a successful HTTP exchange. 204 | 205 | // To test that the HCs are running we should see the hcHits increase from zero 206 | 207 | time.Sleep(time.Second * 4) 208 | 209 | srv1.Lock() // Bah humbug. Keep go test -race quiet. Not really a meanful race, but still... 210 | if srv1.hcHits < 2 { 211 | t.Error("srv1 hcHits too small at", srv1.hcHits) 212 | } 213 | srv1.Unlock() 214 | 215 | srv2.Lock() 216 | if srv2.hcHits < 2 { 217 | t.Error("srv2 hcHits too small at", srv2.hcHits) 218 | } 219 | srv2.Unlock() 220 | 221 | srv3.Lock() 222 | if srv3.hcHits < 2 { 223 | t.Error("srv3 hcHits too small at", srv3.hcHits) 224 | } 225 | srv3.Unlock() 226 | 227 | srv4.Lock() 228 | if srv4.hcHits < 2 { 229 | t.Error("srv4 hcHits too small at", srv4.hcHits) 230 | } 231 | srv4.Unlock() 232 | 233 | // Shutdown srv1 and rotate out srv2 & 3 234 | 235 | srv1.httpServer.Shutdown(context.Background()) 236 | srv2.Lock() 237 | srv2.hc = "Bad" 238 | srv2.Unlock() 239 | 240 | srv3.Lock() 241 | srv3.hc = "Bad" 242 | srv3.Unlock() 243 | 244 | time.Sleep(time.Second * 2) // Give health check time to notice 245 | str = get(t, url) // get should now hit srv4 246 | 247 | if !strings.Contains(str, "FOUR") { 248 | t.Error("Expected srv4 to respond with srv2, 3 HC down, but got", trunc(str)) 249 | } 250 | } 251 | 252 | func get(t *testing.T, url string) string { 253 | resp, err := http.Get(url) 254 | if err != nil { 255 | t.Log("Error:", err) 256 | return "" 257 | } 258 | body, err := 
ioutil.ReadAll(resp.Body) 259 | if err != nil { 260 | panic(err) 261 | } 262 | resp.Body.Close() 263 | 264 | return string(body) 265 | } 266 | -------------------------------------------------------------------------------- /dial_test.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "strings" 8 | "sync" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | type extractTestCase struct { 14 | address string 15 | host, port string // Expected values 16 | } 17 | 18 | var extractTestCases = []extractTestCase{ 19 | {"example.net:80", "example.net", "80"}, 20 | {"www.example.net:443", "www.example.net", "443"}, 21 | {":www.example.net", "", ""}, 22 | {"www.example.net:", "", ""}, 23 | {"127.0.0.1", "", ""}, 24 | {"127.0.0.1:80", "", ""}, 25 | {"[::1]", "", ""}, 26 | {"[::1]:80", "", ""}, 27 | {"[fe80::3c:740d:aca7:dea0]:443", "", ""}, 28 | } 29 | 30 | func TestExtractHostPort(t *testing.T) { 31 | for _, tc := range extractTestCases { 32 | t.Run(tc.address, func(t *testing.T) { 33 | h, p := extractHostPort(tc.address) 34 | if h != tc.host || p != tc.port { 35 | t.Error("Expected", tc.host, tc.port, "got", h, p) 36 | } 37 | }) 38 | } 39 | } 40 | 41 | type mockDialer struct { 42 | mu sync.RWMutex // Ensure go test -race doesn't barf 43 | delay time.Duration 44 | conn net.Conn 45 | err error 46 | networkS, addressS string 47 | networkSList, addressSList []string 48 | } 49 | 50 | func (t *mockDialer) network() string { 51 | t.mu.RLock() 52 | defer t.mu.RUnlock() 53 | return t.networkS 54 | } 55 | 56 | func (t *mockDialer) networkList() []string { 57 | t.mu.RLock() 58 | defer t.mu.RUnlock() 59 | return t.networkSList[:] 60 | } 61 | 62 | func (t *mockDialer) address() string { 63 | t.mu.RLock() 64 | defer t.mu.RUnlock() 65 | return t.addressS 66 | } 67 | 68 | func (t *mockDialer) addressList() []string { 69 | t.mu.RLock() 70 | defer t.mu.RUnlock() 71 | return t.addressSList[:] 72 | } 
73 | 74 | func newMockDialer() *mockDialer { 75 | t := &mockDialer{} 76 | t.reset() // Init all lists 77 | 78 | return t 79 | } 80 | 81 | func (t *mockDialer) reset() { 82 | t.conn = nil 83 | t.err = nil 84 | t.networkS = "" 85 | t.addressS = "" 86 | t.networkSList = []string{} 87 | t.addressSList = []string{} 88 | } 89 | 90 | func (t *mockDialer) dialContext(ctx context.Context, network, address string) (net.Conn, error) { 91 | t.mu.Lock() 92 | 93 | t.networkS = network 94 | t.addressS = address 95 | t.networkSList = append(t.networkSList, network) 96 | t.addressSList = append(t.addressSList, address) 97 | conn := t.conn 98 | err := t.err 99 | delay := t.delay 100 | 101 | t.mu.Unlock() // Don't lock across sleep 102 | 103 | if delay > 0 { 104 | time.Sleep(delay) 105 | } 106 | 107 | return conn, err 108 | } 109 | 110 | func TestDialContext(t *testing.T) { 111 | cslb := realInit() 112 | mr := newMockResolver() // Empty DNS 113 | cslb.netResolver = mr 114 | dialer := newMockDialer() 115 | cslb.systemDialContext = dialer.dialContext 116 | 117 | cslb.start() 118 | defer cslb.stop() 119 | 120 | _, err := cslb.dialContext(context.Background(), "tcp", "localhost:81") // Not port 80 or 443 121 | if dialer.network() != "tcp" || dialer.address() != "localhost:81" { 122 | t.Error("Intercept did not call global cslb.dialContext with port 81", dialer.address(), err) 123 | } 124 | if len(mr.lastSRV) > 0 { 125 | t.Error("Non-standard port should not have attempted an SRV lookup", mr.lastSRV) 126 | } 127 | 128 | cslb.AllowNumericServices = true // Test that non-standard port is now ok 129 | cslb.dialContext(context.Background(), "tcp", "localhost:81") // Not port 80 or 443 130 | if len(mr.lastSRV) == 0 { 131 | t.Error("Non-standard port should have attempted an SRV lookup with AllowNumericServices set") 132 | } 133 | 134 | cslb.AllowNumericServices = false // Back to normal for rest of test 135 | 136 | dialer.reset() 137 | _, err = cslb.dialContext(context.Background(), "tcp", 
"localhost:80") // Should do an SRV Lookup and fail 138 | if dialer.network() != "tcp" || dialer.address() != "localhost:80" { 139 | t.Error("Intercept did not call global cslb.dialContext with port 80", dialer.address(), err) 140 | } 141 | if mr.lastSRV != "_http._tcp.localhost" { 142 | t.Error("Intercept did not attempt srv lookup", mr.lastSRV) 143 | } 144 | 145 | mr.appendSRV("http", "tcp", "localhost", "", 1, 1, 1) 146 | dialer.reset() 147 | _, err = cslb.dialContext(context.Background(), "tcp", "localhost:80") // Should do an SRV Lookup and fail on bestTarget() 148 | if dialer.network() != "tcp" || dialer.address() != "localhost:80" { 149 | t.Error("Intercept did not call system dialContext with port 80 after empty SRV", dialer.address(), err) 150 | } 151 | if mr.lastSRV != "_http._tcp.localhost" { 152 | t.Error("dial did not attempt srv lookup", mr.lastSRV) 153 | } 154 | 155 | mr.appendSRV("http", "tcp", "example.net", "realtarget", 8080, 1, 1) 156 | dialer.reset() 157 | _, err = cslb.dialContext(context.Background(), "tcp", "example.net:80") // Go all the way thru 158 | if dialer.network() != "tcp" || dialer.address() != "realtarget:8080" { 159 | t.Error("dial did not call system dialContext with realtarget:8080, rather", 160 | dialer.network(), dialer.address(), err) 161 | } 162 | } 163 | 164 | // Test that an iteration over all targets stops iterating and stops at the appopriate point. 
165 | func TestDialExhaustUniqueTargets(t *testing.T) { 166 | cslb := realInit() 167 | mr := newMockResolver() // Empty DNS 168 | cslb.netResolver = mr 169 | dialer := newMockDialer() 170 | cslb.systemDialContext = dialer.dialContext // Intercept calls to system dialer 171 | 172 | dialer.err = fmt.Errorf("Dial Exhaustion Mock error") 173 | mr.appendSRV("https", "tcp", "localhost", "s1.localhost", 4000, 0, 0) 174 | mr.appendSRV("https", "tcp", "localhost", "s2.localhost", 4001, 0, 0) 175 | mr.appendSRV("https", "tcp", "localhost", "s3.localhost", 4002, 0, 0) 176 | now := time.Now() 177 | 178 | cslb.start() 179 | defer cslb.stop() 180 | 181 | cslb.setDialResult(now.Add(-time.Second*40), "s1.localhost", 4000, dialer.err) // Comes good third 182 | cslb.setDialResult(now.Add(-time.Second*60), "s2.localhost", 4001, dialer.err) // Comes good first 183 | cslb.setDialResult(now.Add(-time.Second*50), "s3.localhost", 4002, dialer.err) // Comes good second 184 | 185 | // Order of bestTarget() should be s2, s1 then s3 which should show up in the mock dailer's 186 | // addressList. 187 | 188 | _, err := cslb.dialContext(context.Background(), "tcp", "localhost:443") 189 | if err == nil { 190 | t.Fatal("Expected an error return with Exhausted set") 191 | } 192 | if !strings.Contains(err.Error(), "All unique targets failed") { 193 | t.Error("Expected error to contain 'All unique targets...' but got", err.Error()) 194 | } 195 | if len(dialer.addressList()) != 3 { 196 | t.Fatal("Expected three dial attempts by intercept, not", dialer.addressList()) 197 | } 198 | if dialer.addressList()[0] != "s2.localhost:4001" || dialer.addressList()[1] != "s3.localhost:4002" { 199 | t.Error("bestTarget() did not present unhealthy in age order", dialer.addressList()) 200 | } 201 | } 202 | 203 | // Test that the deadline is honoured. This also exercises the IsZero for the passed in context. 
204 | func TestDialDeadline(t *testing.T) { 205 | cslb := realInit() 206 | mr := newMockResolver() // Empty DNS 207 | cslb.netResolver = mr 208 | dialer := newMockDialer() 209 | dialer.delay = time.Second * 2 210 | dialer.err = fmt.Errorf("Dial Deadline error") 211 | cslb.InterceptTimeout = 5 * time.Second 212 | cslb.systemDialContext = dialer.dialContext // Mock up calls to the system dialer 213 | mr.appendSRV("https", "tcp", "localhost", "s1.localhost", 4000, 0, 0) 214 | mr.appendSRV("https", "tcp", "localhost", "s2.localhost", 4001, 1, 0) 215 | mr.appendSRV("https", "tcp", "localhost", "s3.localhost", 4002, 2, 0) 216 | 217 | cslb.start() 218 | defer cslb.stop() 219 | 220 | _, err := cslb.dialContext(context.Background(), "tcp", "localhost:443") 221 | if err == nil { 222 | t.Fatal("Expected a timeout error due to deadline exceeded") 223 | } 224 | if !strings.Contains(err.Error(), "deadline exceed") { 225 | t.Error("Expected 'deadline exceed...' error, got", err.Error()) 226 | } 227 | 228 | // Should have tried all three targets 229 | 230 | if len(dialer.addressList()) != 3 { 231 | t.Error("Expected intercept to try three targets before timing out, not", len(dialer.addressList())) 232 | } 233 | } 234 | 235 | // Test that context cancel is honored. Also exercise all the print functions while we're here. 
236 | func TestDialCancel(t *testing.T) { 237 | cslb := realInit() 238 | 239 | cslb.PrintDialContext = true 240 | cslb.PrintHCResults = true 241 | cslb.PrintIntercepts = true 242 | cslb.PrintDialResults = true 243 | cslb.PrintSRVLookup = true 244 | 245 | mr := newMockResolver() // Empty DNS 246 | cslb.netResolver = mr 247 | dialer := newMockDialer() 248 | dialer.delay = time.Second * 2 // Each lookup takes 2 seconds 249 | dialer.err = fmt.Errorf("Dial Deadline error") 250 | cslb.systemDialContext = dialer.dialContext // Mock up calls to the system dialer 251 | mr.appendSRV("https", "tcp", "localhost", "s1.localhost", 4000, 0, 0) 252 | mr.appendSRV("https", "tcp", "localhost", "s2.localhost", 4001, 1, 0) 253 | mr.appendSRV("https", "tcp", "localhost", "s3.localhost", 4002, 2, 0) 254 | 255 | cancelContext, cancelFunc := context.WithCancel(context.Background()) 256 | 257 | go func() { // Trigger a cancel one second from now 258 | time.Sleep(time.Second) 259 | cancelFunc() 260 | }() 261 | 262 | cslb.start() 263 | defer cslb.stop() 264 | 265 | start := time.Now() 266 | _, err := cslb.dialContext(cancelContext, "tcp", "localhost:443") 267 | if err == nil { 268 | t.Fatal("Expected a timeout error due to deadline exceeded") 269 | } 270 | dur := time.Now().Sub(start) 271 | if dur > (time.Second * 2) { // Allow some wiggle room, but not the six seconds it would take 272 | t.Error("Cancel did not terminate request within 2 seconds", dur) 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /cslb.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "context" 5 | "math/rand" 6 | "net" 7 | "net/http" 8 | "os" 9 | "sync" 10 | "time" 11 | ) 12 | 13 | // limitedResolver is a subset of net.Resolver interface that is used by cslb. The idea is to create 14 | // a smaller interface that's easier to mock. 
// Only the SRV and TXT lookups are needed. newCslb() installs net.DefaultResolver and tests
// substitute a mock resolver.
type limitedResolver interface {
	LookupSRV(ctx context.Context, service, proto, name string) (cname string, addrs []*net.SRV, err error)
	LookupTXT(ctx context.Context, name string) ([]string, error)
}

// Package defaults. The default* values mirror the similarly named fields of the config struct.
const (
	cslbEnvPrefix               = "cslb_"   // All cslb environment variables are prefixed with this
	defaultHealthCheckTXTPrefix = "._cslb." // Prepended to target name to form a TXT qName containing URL

	defaultHealthCheckContentOk = "OK"             // Must be in the body of a good health check response
	defaultHealthCheckFrequency = time.Second * 50 // How often to run the health check query
	defaultInterceptTimeout     = time.Minute      // Default context duration for dialContext
	defaultDialVetoDuration     = time.Minute      // Ignore targets for this duration after dial fails

	// We need to configure our own TTLs because the go DNS APIs don't return TTLs. Most DNS
	// libraries don't, but they all should as it is vital data for long-running programs that
	// mistakenly hold onto DNS results for the lifetime of the program instead of the lifetime
	// of the DNS response.

	defaultNotFoundSRVTTL = time.Minute * 20 // How long a NXDomain SRV is retained in the cache
	defaultFoundSRVTTL    = time.Minute * 5  // How long a found SRV is retained in the cache
	defaultHealthTTL      = time.Minute * 5  // How long a target stays in the cache
)

// config holds the parameters manipulated by tests or possibly external options. It is embedded
// in the cslb struct so every setting is reachable directly from a *cslb.
type config struct {
	Version string

	PrintDialContext bool // "d" - diagnostics settings are lowercase
	PrintHCResults   bool // "h"
	PrintIntercepts  bool // "i"
	PrintDialResults bool // "r"
	PrintSRVLookup   bool // "s"

	DisableInterception  bool // "C" - behaviour settings are uppercase
	DisableHealthChecks  bool // "H"
	AllowNumericServices bool // "N"

	StatusServerAddress   string // Listen address of status server
	StatusServerTemplates string // filepath.Glob of replacement templates for status server

	HealthCheckTXTPrefix string        // Prepended to target name to form a TXT URL
	HealthCheckContentOk string        // Must be in the body of the health check response
	HealthCheckFrequency time.Duration // How often to run the health check query
	InterceptTimeout     time.Duration // Maximum time to run connect attempts with an intercept call
	DialVetoDuration     time.Duration // Ignore targets for this duration after dial fails

	NotFoundSRVTTL time.Duration // How long a not-found SRV is retained in the cache
	FoundSRVTTL    time.Duration // How long a found SRV is retained in the cache
	HealthTTL      time.Duration // How long a target stays in the cache
}

// cslbStats holds all statistics for the cslb package. See addStats() for typical usage.
68 | type cslbStats struct { 69 | StartTime time.Time 70 | Duration time.Duration // Total elapse time in DialContext 71 | DialContext int // intercepted calls to DialContext 72 | MissHostService int // Host or service don't match or interception disabled 73 | NoSRV int // Times SRV lookup returned zero targets 74 | BestTarget int // Calls to bestTarget() 75 | DupesStopped int // Times that a dupe target stopped the bestTarget() iteration (all failed) 76 | GoodDials int // system DialContext returned a good connection 77 | FailedDials int // system DialContext returned an error 78 | Deadline int // Times intercept deadline expired 79 | } 80 | 81 | // cloneStats creates a safe copy of the stats - primarily for the status server 82 | func (t *cslb) cloneStats() cslbStats { 83 | t.statsMu.RLock() 84 | clone := t.cslbStats 85 | t.statsMu.RUnlock() 86 | 87 | return clone 88 | } 89 | 90 | // addStats safely transfers a local copy of the cslbStats to the cslb's version. Rather than 91 | // updating a cslb's stats directly, callers tend to update a local version of cslbStats then 92 | // transfer it via addStats() to minimize locking calls (or more likely minimizing the risk of 93 | // forgetting a locking call). 94 | func (t *cslb) addStats(ls *cslbStats) { 95 | t.statsMu.Lock() 96 | defer t.statsMu.Unlock() 97 | 98 | if !ls.StartTime.IsZero() { // Nested local cslbStats must not set StartTime else we'll double count 99 | t.Duration += time.Now().Sub(ls.StartTime) 100 | } 101 | 102 | t.DialContext += ls.DialContext 103 | t.MissHostService += ls.MissHostService 104 | t.NoSRV += ls.NoSRV 105 | t.BestTarget += ls.BestTarget 106 | t.DupesStopped += ls.DupesStopped 107 | t.GoodDials += ls.GoodDials 108 | t.FailedDials += ls.FailedDials 109 | t.Deadline += ls.Deadline 110 | } 111 | 112 | // cslb is the main structure which holds all the state for the life of the application. The main 113 | // reason it's a struct rather than a big lump of globals is to make it easy to test. 
Normally there 114 | // will only be one of these structs created per program. 115 | type cslb struct { 116 | config 117 | 118 | netResolver limitedResolver // Replaceable functions for test mocks 119 | netDialer *net.Dialer // Not used - only here in case we later decide to modify Dialer values 120 | systemDialContext func(ctx context.Context, network, addr string) (net.Conn, error) 121 | randIntn func(int) int // Sufficient rand function used to select weight by bestTarget() 122 | 123 | srvStore *srvCache 124 | healthStore *healthCache 125 | 126 | statusServer *statusServer // Optional status web server 127 | hcClient *http.Client // Shared Health Check Client - it purposely avoids a cslb-intercepted transport 128 | 129 | statsMu sync.RWMutex // Protects everything below here 130 | cslbStats 131 | } 132 | 133 | // newCslb is the cslb constructor. It must be used in preference to a raw &cslb{} construction as 134 | // there are numerous variables which must be set for any cslb methods to work. 135 | func newCslb() *cslb { 136 | t := &cslb{} 137 | t.netResolver = net.DefaultResolver 138 | t.netDialer = &net.Dialer{ // Set up a net.Dialer identical to the 139 | Timeout: 30 * time.Second, // way that net.http does. 
140 | KeepAlive: 30 * time.Second, 141 | DualStack: true, 142 | Resolver: net.DefaultResolver, 143 | } 144 | t.systemDialContext = t.netDialer.DialContext 145 | t.randIntn = rand.Intn 146 | 147 | t.srvStore = newSrvCache() 148 | t.healthStore = newHealthCache() 149 | t.hcClient = &http.Client{Transport: &http.Transport{}} // Use a non-cslb http.Transport 150 | 151 | // Transfer in all the default config values and then over-ride them 152 | 153 | t.Version = Version 154 | 155 | t.HealthCheckTXTPrefix = defaultHealthCheckTXTPrefix 156 | t.HealthCheckContentOk = defaultHealthCheckContentOk 157 | t.HealthCheckFrequency = defaultHealthCheckFrequency 158 | t.InterceptTimeout = defaultInterceptTimeout 159 | t.DialVetoDuration = defaultDialVetoDuration 160 | 161 | t.NotFoundSRVTTL = defaultNotFoundSRVTTL 162 | t.FoundSRVTTL = defaultFoundSRVTTL 163 | t.HealthTTL = defaultHealthTTL 164 | 165 | // Check for environment variable over-rides 166 | 167 | flags := os.Getenv(cslbEnvPrefix + "options") 168 | for _, opt := range []byte(flags) { 169 | switch opt { 170 | case 'd': 171 | t.PrintDialContext = true 172 | case 'h': 173 | t.PrintHCResults = true 174 | case 'i': 175 | t.PrintIntercepts = true 176 | case 'r': 177 | t.PrintDialResults = true 178 | case 's': 179 | t.PrintSRVLookup = true 180 | 181 | case 'C': 182 | t.DisableInterception = true 183 | case 'H': 184 | t.DisableHealthChecks = true 185 | case 'N': 186 | t.AllowNumericServices = true 187 | default: 188 | } 189 | } 190 | 191 | e := os.Getenv(cslbEnvPrefix + "hc_ok") 192 | if len(e) > 0 { 193 | t.HealthCheckContentOk = e 194 | } 195 | 196 | t.StatusServerAddress = os.Getenv(cslbEnvPrefix + "listen") 197 | t.StatusServerTemplates = os.Getenv(cslbEnvPrefix + "templates") 198 | 199 | t.HealthCheckFrequency = getAndParseDuration(cslbEnvPrefix+"hc_freq", t.HealthCheckFrequency) 200 | t.InterceptTimeout = getAndParseDuration(cslbEnvPrefix+"timeout", t.InterceptTimeout) 201 | t.DialVetoDuration = 
getAndParseDuration(cslbEnvPrefix+"dial_veto", t.DialVetoDuration) 202 | 203 | t.NotFoundSRVTTL = getAndParseDuration(cslbEnvPrefix+"nxd_ttl", t.NotFoundSRVTTL) 204 | t.FoundSRVTTL = getAndParseDuration(cslbEnvPrefix+"srv_ttl", t.FoundSRVTTL) 205 | t.HealthTTL = getAndParseDuration(cslbEnvPrefix+"tar_ttl", t.HealthTTL) 206 | 207 | t.StartTime = time.Now() 208 | 209 | return t 210 | } 211 | 212 | // start starts up the cache cleaners and optionally the status web server. It is called *after* all 213 | // config settings have been over-ridden so as to avoid any race conditions - particularly with 214 | // tests. 215 | func (t *cslb) start() *cslb { 216 | t.srvStore.start((t.FoundSRVTTL / 5) + time.Second) 217 | t.healthStore.start((t.HealthTTL / 5) + time.Second) 218 | 219 | if len(t.StatusServerAddress) > 0 { 220 | t.statusServer = newStatusServer(t) 221 | go t.statusServer.start() 222 | } 223 | 224 | return t 225 | } 226 | 227 | // stop stops what start started. Go figure. 228 | func (t *cslb) stop() { 229 | t.srvStore.stop() 230 | t.healthStore.stop() 231 | 232 | if t.statusServer != nil { 233 | t.statusServer.stop(context.Background()) 234 | t.statusServer = nil 235 | } 236 | } 237 | 238 | const ( 239 | lowerDurationLimit = time.Second // Arbitrary limits to avoid 240 | upperDurationLimit = time.Hour // absurd values being used 241 | ) 242 | 243 | // getAndParseDuration is a helper to get the env variable and convert it to a reasonable 244 | // duration. Returns the current value if the proposed value is outside reasonable limits. 
245 | func getAndParseDuration(name string, currValue time.Duration) time.Duration { 246 | e := os.Getenv(name) 247 | if len(e) == 0 { 248 | return currValue 249 | } 250 | d, err := time.ParseDuration(e) 251 | if err != nil { 252 | return currValue 253 | } 254 | if d < lowerDurationLimit || d > upperDurationLimit { 255 | return currValue 256 | } 257 | 258 | return d 259 | } 260 | -------------------------------------------------------------------------------- /health.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | /* 4 | The health structs track the health of the target systems in terms of whether we have been able to 5 | establish successful connections to them or not and what the results of the health check is - if one 6 | is running. 7 | 8 | The health check URL is used as by a background check to pre-determine the state of the target 9 | rather than waiting for a failed connection. Defining a health check URL is recommended as a 10 | successful connect() does not necessarily imply a successful service - it merely implies a 11 | successful TCP setup. Furthermore a health check URL can be used to administratively turn a target 12 | on and off. Or take it "out of rotation" in devop parlance. 13 | 14 | Most of these functions are actually cslb functions rather than healthCache functions because they 15 | need access to cslb variables such as config and resolver. This could be restructured to bring all 16 | those values within a healthStore, but there's not a lot of value in that apart from slightly better 17 | encapsulation. 
18 | */ 19 | 20 | import ( 21 | "bytes" 22 | "context" 23 | "fmt" 24 | "io/ioutil" 25 | "net/http" 26 | "net/url" 27 | "strconv" 28 | "strings" 29 | "sync" 30 | "time" 31 | ) 32 | 33 | type healthCache struct { 34 | sync.RWMutex // Protects everything within this struct 35 | done chan bool // Shuts down the cache cleaner 36 | cache map[string]*ceHealth // The key is ToLower(target:port) - use makeHealthStoreKey() 37 | } 38 | 39 | type ceHealth struct { 40 | expires time.Time // When this entry expire out of the cache 41 | goodDials int 42 | failedDials int 43 | nextDialAttempt time.Time // When we can next consider this target - IsZero() means now 44 | lastDialAttempt time.Time 45 | lastDialStatus string 46 | lastHealthCheck time.Time 47 | lastHealthCheckStatus string // From http.Get() 48 | url string // URL to probe to confirm target is healthy 49 | unHealthy bool // True if last health check failed 50 | } 51 | 52 | // isGood returns whether a target can be used. Caller must have locked beforehand. 53 | func (t *ceHealth) isGood(now time.Time) bool { 54 | return !t.unHealthy && !t.nextDialAttempt.After(now) // Don't use Before as it might be right now! 55 | } 56 | 57 | // makeHealthStoreKey generates the lookup key for the healthStore. 
It's of the form host:port 58 | func makeHealthStoreKey(host string, port int) string { 59 | return host + ":" + strconv.FormatUint(uint64(port), 10) 60 | } 61 | 62 | func unpackHealthStoreKey(targetKey string) (host, port string) { 63 | colon := strings.IndexByte(targetKey, ':') 64 | if colon > 0 { 65 | host = targetKey[:colon] 66 | port = targetKey[colon+1:] 67 | } 68 | 69 | return 70 | } 71 | 72 | func newHealthCache() *healthCache { 73 | return &healthCache{cache: make(map[string]*ceHealth), done: make(chan bool)} 74 | } 75 | 76 | func (t *healthCache) start(cacheInterval time.Duration) { 77 | go t.cleaner(cacheInterval) 78 | } 79 | 80 | func (t *healthCache) stop() { 81 | close(t.done) 82 | } 83 | 84 | // populateHealthStore adds a list of targets to the healthStore. Supplied keys are fully formed 85 | // cache keys, that is, target:port. It also starts off the health check for each new target if HC 86 | // is enabled. 87 | func (t *cslb) populateHealthStore(now time.Time, healthStoreKeys []string) { 88 | t.healthStore.Lock() 89 | defer t.healthStore.Unlock() 90 | 91 | for _, healthStoreKey := range healthStoreKeys { 92 | ceh := t.healthStore.cache[healthStoreKey] 93 | if ceh == nil { 94 | ceh = &ceHealth{expires: now.Add(t.HealthTTL)} 95 | t.healthStore.cache[healthStoreKey] = ceh 96 | if !t.DisableHealthChecks { 97 | go t.fetchAndRunHealthCheck(healthStoreKey, ceh) 98 | } 99 | } 100 | } 101 | } 102 | 103 | var zeroTime time.Time 104 | 105 | // setDialResult records the results of the last dial attempt. If this is a previously unknown 106 | // target then a health check is start for the target, if HC is enabled. This should rarely be the 107 | // case but it can happen if the HealthTTL is shorter than the SRV TTL or if a connection runs 108 | // across a target expiration. 
109 | func (t *cslb) setDialResult(now time.Time, host string, port int, err error) { 110 | t.healthStore.Lock() 111 | defer t.healthStore.Unlock() 112 | 113 | healthStoreKey := makeHealthStoreKey(host, port) 114 | ceh := t.healthStore.cache[healthStoreKey] 115 | if ceh == nil { // I would expect an entry to be here 116 | ceh = &ceHealth{expires: now.Add(t.HealthTTL)} 117 | t.healthStore.cache[healthStoreKey] = ceh 118 | if !t.DisableHealthChecks { 119 | go t.fetchAndRunHealthCheck(healthStoreKey, ceh) 120 | } 121 | } 122 | ceh.lastDialAttempt = now 123 | if err == nil { 124 | ceh.goodDials++ 125 | ceh.nextDialAttempt = zeroTime 126 | ceh.lastDialStatus = "" 127 | } else { 128 | ceh.failedDials++ 129 | ceh.nextDialAttempt = now.Add(t.DialVetoDuration) 130 | ceh.lastDialStatus = err.Error() 131 | } 132 | } 133 | 134 | // fetchAndRunHealthCheck is normally started as a separate go-routine when a target is added to the 135 | // healthStore. It fetches the health check URL and if present runs a periodic GET check until the 136 | // ceHealth entry expires. The health check URL is stored in a TXT RR. It could be a TypeURI RR 137 | // (RFC7553) I suppose, but who supports/uses those? The qName for the TXT RR is of the form 138 | // _$port._cslb.$target, thus something like _80._cslb.example.net where port is from the SRV RR. 139 | func (t *cslb) fetchAndRunHealthCheck(healthStoreKey string, ceh *ceHealth) { 140 | host, port := unpackHealthStoreKey(healthStoreKey) 141 | qName := "_" + port + t.HealthCheckTXTPrefix + host 142 | txts, err := t.netResolver.LookupTXT(context.Background(), qName) 143 | if err != nil { 144 | return // No TXT 145 | } 146 | hcURL := strings.Join(txts, "") // TXT is a slice of sub-strings so bang them all together 147 | if len(hcURL) == 0 { 148 | return // Empty string can't be fetched! 
149 | } 150 | t.healthStore.Lock() 151 | ceh.url = hcURL // For reporting purposes only 152 | expires := ceh.expires // Extract under protection of the lock 153 | t.healthStore.Unlock() 154 | 155 | _, err = url.Parse(hcURL) // Check that the URL is in fact a URL 156 | if err != nil { 157 | return // Doesn't look like it! 158 | } 159 | 160 | // Run the health check until the ceh expires. 161 | 162 | sleepFor := time.Second // Only wait a short time for the first health check 163 | for { 164 | time.Sleep(sleepFor) 165 | sleepFor = t.HealthCheckFrequency // Second and subsequents wait a normal amount of time 166 | now := time.Now() 167 | if expires.Before(now) { 168 | return 169 | } 170 | 171 | resp, err := t.hcClient.Get(hcURL) 172 | if err != nil { 173 | if t.PrintHCResults { 174 | fmt.Println("Health Check:", healthStoreKey, err) 175 | } 176 | t.healthStore.Lock() 177 | ceh.unHealthy = true 178 | ceh.lastHealthCheck = now 179 | ceh.lastHealthCheckStatus = err.Error() 180 | t.healthStore.Unlock() 181 | return // Fatal error - leave the ceh to its own devices 182 | } 183 | body, err := ioutil.ReadAll(resp.Body) 184 | resp.Body.Close() 185 | if err != nil { 186 | if t.PrintHCResults { 187 | fmt.Println("Health Check:", healthStoreKey, err) 188 | } 189 | continue 190 | } 191 | 192 | ok := resp.StatusCode == http.StatusOK && bytes.Contains(body, []byte(t.HealthCheckContentOk)) 193 | if t.PrintHCResults { 194 | fmt.Println("Health Check Set:", healthStoreKey, ok) 195 | } 196 | t.healthStore.Lock() 197 | ceh.unHealthy = !ok 198 | ceh.lastHealthCheck = now 199 | ceh.lastHealthCheckStatus = resp.Status 200 | t.healthStore.Unlock() 201 | } 202 | } 203 | 204 | // cleaner periodically scans the cache to delete expired entries. Normally run as a go-routine. 
205 | func (t *healthCache) cleaner(cleanInterval time.Duration) { 206 | ticker := time.NewTicker(cleanInterval) 207 | defer ticker.Stop() 208 | 209 | for { 210 | select { 211 | case <-t.done: 212 | return 213 | case now := <-ticker.C: 214 | t.clean(now) 215 | } 216 | } 217 | } 218 | 219 | func (t *healthCache) clean(now time.Time) { 220 | t.Lock() 221 | defer t.Unlock() 222 | 223 | for key, ceh := range t.cache { 224 | if ceh.expires.Before(now) { 225 | delete(t.cache, key) 226 | } 227 | } 228 | } 229 | 230 | // ceHealthAsStats is a clone of ceHealth with exported variables for html.Template 231 | type ceHealthAsStats struct { 232 | Key string 233 | GoodDials int 234 | FailedDials int 235 | Expires time.Duration // In the future 236 | NextDialAttempt time.Duration // In the future 237 | LastDialAttempt time.Duration // In the past 238 | LastDialStatus string 239 | LastHealthCheck time.Duration // In the past 240 | LastHealthCheckStatus string 241 | Url string 242 | IsGood bool 243 | } 244 | 245 | type healthStats struct { 246 | Targets []ceHealthAsStats 247 | } 248 | 249 | // getStats clones all the ceHealth entries into a struct suitable for the status service. This 250 | // shouldn't be too expensive as we don't expect a huge number of targets, but who knows? 
251 | func (t *healthCache) getStats() *healthStats { 252 | now := time.Now() 253 | s := &healthStats{} 254 | t.RLock() 255 | defer t.RUnlock() 256 | 257 | s.Targets = make([]ceHealthAsStats, 0, len(t.cache)) 258 | for k, v := range t.cache { 259 | entry := ceHealthAsStats{ 260 | Key: k, 261 | GoodDials: v.goodDials, 262 | FailedDials: v.failedDials, 263 | LastDialStatus: trimTo(v.lastDialStatus, 60), 264 | Url: v.url, 265 | IsGood: v.isGood(now), 266 | } 267 | if !v.expires.IsZero() { 268 | entry.Expires = v.expires.Sub(now).Truncate(time.Second) 269 | } 270 | if !v.nextDialAttempt.IsZero() { 271 | entry.NextDialAttempt = v.nextDialAttempt.Sub(now).Truncate(time.Second) 272 | } 273 | if !v.lastDialAttempt.IsZero() { 274 | entry.LastDialAttempt = now.Sub(v.lastDialAttempt).Truncate(time.Second) 275 | } 276 | if !v.lastHealthCheck.IsZero() { 277 | entry.LastHealthCheck = now.Sub(v.lastHealthCheck).Truncate(time.Second) 278 | entry.LastHealthCheckStatus = trimTo(v.lastHealthCheckStatus, 90) 279 | } 280 | s.Targets = append(s.Targets, entry) 281 | } 282 | 283 | return s 284 | } 285 | 286 | func trimTo(s string, max int) string { 287 | if len(s) > max { 288 | if max <= 3 { 289 | return "..." 290 | } 291 | s = s[:max-3] + "..." 292 | } 293 | 294 | return s 295 | } 296 | -------------------------------------------------------------------------------- /status.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | /* 4 | The status server presents a read-only web page of insights into cslb. This includes the contents of 5 | the SRV and health caches as well as the results of any health checks and connection attempts. 6 | 7 | There are default html templates uses to render the pages but most of these can be over-ridden with 8 | user-supplied templates defined with the "cslb_templates" environment variable. 
9 | */ 10 | 11 | import ( 12 | "context" 13 | "html/template" 14 | "io" 15 | "log" 16 | "net/http" 17 | "os" 18 | "sort" 19 | "strings" 20 | "time" 21 | ) 22 | 23 | const ( 24 | header = ` 25 | CSLB - Client Side Load Balancing - Status 26 | 27 | ` 28 | configStr = `{{define "config"}} 29 |

CSLB Global State

30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
Start Time{{.StartTime.Format "2006-01-02T15:04:05Z07:00"}}
Up time{{.Uptime}}
DialContext Intercepts{{.DialContext}}
Time In Intercept{{.Duration}}
Status Server Addresshttp://{{.StatusServerAddress}}
Executable{{.Executable}}
38 | 39 |

CSLB Config

40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
PrintDialContextPrint entry into cslb.DialContext{{.PrintDialContext}}
PrintHCResultsPrint results of Health Check{{.PrintHCResults}}
PrintInterceptsPrint each domain to Target intercept{{.PrintIntercepts}}
PrintSRVLookupPrint results of SRV Lookups{{.PrintSRVLookup}}
DisableInterceptionTurn off Interception{{.DisableInterception}}
DisableHealthChecksTurn off Health Checks{{.DisableHealthChecks}}
AllowNumericServicesAllow Numeric Service SRV lookups{{.AllowNumericServices}}
HealthCheckTXTPrefixForms part of TXT qName{{.HealthCheckTXTPrefix}}
HealthCheckContentOkstrings.Contains in health check body"{{.HealthCheckContentOk}}"
HealthCheckFrequencyTime between health checks{{.HealthCheckFrequency}}
InterceptTimeoutMaximum time to try targets{{.InterceptTimeout}}
DialVetoDurationIgnore downed targets for this duration{{.DialVetoDuration}}
NotFoundSRVTTLCache lifetime for SRV NXDomain{{.NotFoundSRVTTL}}
FoundSRVTTLCache lifetime for SRV found{{.FoundSRVTTL}}
HealthTTLCache lifetime for SRV Target{{.HealthTTL}}
57 | {{end}} 58 | ` 59 | 60 | cslbStr = `{{define "cslb"}} 61 |

CSLB Global Statistics

62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
Intercepted calls to DialContext{{.DialContext}}
Host or service don't match or interception disabled{{.MissHostService}}
Times SRV lookup returned zero targets{{.NoSRV}}
Calls to bestTarget(){{.BestTarget}}
Times when all targets failed{{.DupesStopped}}
system DialContext returned a good connection{{.GoodDials}}
system DialContext returned an error{{.FailedDials}}
Times intercept deadline expired{{.Deadline}}
72 | {{end}} 73 | ` 74 | 75 | srvStr = `{{define "srv"}} 76 |

SRV DNS Cache

77 | 78 | 79 | 80 | 81 | {{range .Srvs}} 82 | 83 | 84 | 85 | 86 | 87 | 88 | {{end}} 89 |
CNameExpiresLookupsPriorityInternal WeightPortTargetGood DialsFailed DialsIsGood
{{.CName}}{{.Expires}}{{.Lookups}}{{.Priority}}{{.Weight}}{{.Port}}{{.Target}}{{.GoodDials}}{{.FailedDials}}{{.IsGood}}
90 | {{end}} 91 | ` 92 | 93 | healthStr = `{{define "health"}} 94 |

Target Health Cache

95 | 96 | 97 | 98 | 99 | 100 | 101 | {{range .Targets}} 102 | 103 | 104 | 105 | 106 | 107 | 108 | {{end}} 109 |
TargetExpiresGood DialsFailed DialsNext Dial
Attempt
Last Dial
Attempt
isGoodLast Dial
Status
Last Health
Check
Health Check URLLast Health
Status
{{.Key}}{{.Expires}}{{.GoodDials}}{{.FailedDials}}{{.NextDialAttempt}}{{.LastDialAttempt}}{{.IsGood}}{{.LastDialStatus}}{{.LastHealthCheck}}{{.Url}}{{.LastHealthCheckStatus}}
110 | {{end}} 111 | ` 112 | 113 | trailerStr = ` 114 |

Client-Side Load Balancing {{.Version}} released on {{.ReleaseDate}}. Brought to you by 115 | https://github/markdingo/cslb at {{.RunAt}} 116 | 117 | ` 118 | ) 119 | 120 | type statusServer struct { 121 | cslb *cslb 122 | httpServer *http.Server 123 | allTmpl *template.Template 124 | trailerTmpl *template.Template 125 | } 126 | 127 | // newStatusServer creates the base status server ready for starting 128 | func newStatusServer(cslb *cslb) *statusServer { 129 | t := &statusServer{cslb: cslb} 130 | err := t.loadTemplates() 131 | if err != nil { 132 | log.Fatal(err) 133 | } 134 | t.httpServer = &http.Server{Addr: cslb.StatusServerAddress} 135 | mux := http.NewServeMux() 136 | mux.HandleFunc("/", t.generateStatus) 137 | t.httpServer.Handler = mux 138 | 139 | return t 140 | } 141 | 142 | // start is normally called as a separate go-routine since it calls the http listener which blocks. 143 | func (t *statusServer) start() { 144 | err := t.httpServer.ListenAndServe() 145 | if !strings.Contains(err.Error(), "http: Server closed") { // Good return? 146 | log.Fatal(err) 147 | } 148 | } 149 | 150 | // stop shuts down the http listener 151 | func (t *statusServer) stop(ctx context.Context) { 152 | t.httpServer.Shutdown(ctx) 153 | } 154 | 155 | // loadTemplates performs a one-time parse of all the internal templates needed for the status 156 | // page. It also attempts to "glob" load any template files found in the directory identified by the 157 | // "cslb_templates" environment variable. If the glob load fails it only causes a warning as the 158 | // default templates will still function.. 
159 | func (t *statusServer) loadTemplates() error { 160 | t.allTmpl = template.New("") 161 | _, err := t.allTmpl.Parse(configStr) 162 | if err != nil { 163 | return err 164 | } 165 | _, err = t.allTmpl.Parse(cslbStr) 166 | if err != nil { 167 | return err 168 | } 169 | _, err = t.allTmpl.Parse(srvStr) 170 | if err != nil { 171 | return err 172 | } 173 | _, err = t.allTmpl.Parse(healthStr) 174 | if err != nil { 175 | return err 176 | } 177 | t.trailerTmpl, err = template.New("trailer").Parse(trailerStr) 178 | if err != nil { 179 | return err 180 | } 181 | 182 | if len(t.cslb.StatusServerTemplates) > 0 { // If an alternate template glob has been configured 183 | _, err = t.allTmpl.ParseGlob(t.cslb.StatusServerTemplates) 184 | if err != nil { 185 | log.Print("cslb Warning:", err) // Not fatal if replacement templates fail to load 186 | } 187 | } 188 | 189 | return nil 190 | } 191 | 192 | // Aggregate structs are conveniences so we can render derived values in a single template. 193 | 194 | type cslbAggConfig struct { 195 | StartTime time.Time 196 | Uptime time.Duration 197 | Duration time.Duration 198 | DialContext int 199 | Executable string 200 | config 201 | } 202 | 203 | type cslbAggTrailer struct { 204 | Version string 205 | ReleaseDate string 206 | RunAt string 207 | } 208 | 209 | // generateStatus writes the status page out. It's quite extensive because everything is on one page. 
210 | func (t *statusServer) generateStatus(w http.ResponseWriter, req *http.Request) { 211 | var err error 212 | io.WriteString(w, header) 213 | 214 | var cac cslbAggConfig 215 | cac.config = t.cslb.config 216 | cas := t.cslb.cloneStats() // Take a copy to avoid holding a long mutex 217 | 218 | cac.StartTime = cas.StartTime 219 | cac.Uptime = time.Now().Sub(cas.StartTime).Truncate(time.Second) 220 | cac.Duration = cas.Duration.Truncate(time.Second) // Total time in intercepts 221 | cac.DialContext = cas.DialContext 222 | cac.Executable, _ = os.Executable() 223 | 224 | err = t.allTmpl.ExecuteTemplate(w, "config", &cac) 225 | if err != nil { 226 | log.Fatal(err) 227 | } 228 | err = t.allTmpl.ExecuteTemplate(w, "cslb", &cas) 229 | if err != nil { 230 | log.Fatal(err) 231 | } 232 | 233 | srvStats := t.cslb.srvStore.getStats(t.cslb.healthStore) // Clone all ceSRVs and ancillary data 234 | sort.Slice(srvStats.Srvs, func(i, j int) bool { // Sort for a low-flicker re-render 235 | return srvStats.Srvs[i].CName < srvStats.Srvs[j].CName 236 | }) 237 | sort.Slice(srvStats.nxDomains, func(i, j int) bool { // Sort for a low-flicker re-render 238 | return srvStats.nxDomains[i].CName < srvStats.nxDomains[j].CName 239 | }) 240 | 241 | // Place NXDomains at the end to reduce visual clutter. To further reduce clutter, remove 242 | // duplicate data which comes from the SRV cache. 243 | srvStats.Srvs = append(srvStats.Srvs, srvStats.nxDomains...) 
244 | prevCName := "" 245 | for ix := 0; ix < len(srvStats.Srvs); ix++ { 246 | if prevCName == srvStats.Srvs[ix].CName { 247 | srvStats.Srvs[ix].CName = "" 248 | srvStats.Srvs[ix].Expires = "" 249 | srvStats.Srvs[ix].Lookups = "" 250 | } else { 251 | prevCName = srvStats.Srvs[ix].CName 252 | } 253 | } 254 | 255 | err = t.allTmpl.ExecuteTemplate(w, "srv", srvStats) 256 | if err != nil { 257 | log.Fatal(err) 258 | } 259 | 260 | healthStats := t.cslb.healthStore.getStats() // Clone all ceHealth entries 261 | sort.Slice(healthStats.Targets, func(i, j int) bool { // Sort for a low-flicker re-render 262 | return healthStats.Targets[i].Key < healthStats.Targets[j].Key 263 | }) 264 | err = t.allTmpl.ExecuteTemplate(w, "health", healthStats) 265 | if err != nil { 266 | log.Fatal(err) 267 | } 268 | 269 | tv := cslbAggTrailer{Version: Version, ReleaseDate: ReleaseDate, 270 | RunAt: time.Now().Format("2006-01-02T15:04:05Z07:00")} 271 | err = t.trailerTmpl.Execute(w, tv) 272 | if err != nil { 273 | log.Fatal(err) 274 | } 275 | } 276 | -------------------------------------------------------------------------------- /srv_test.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "strings" 8 | "sync" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | type mockResolver struct { 14 | mu sync.Mutex // So go test -race doesn't complain 15 | srvs map[string][]*net.SRV 16 | txts map[string][]string 17 | lastSRV string 18 | lastTXT string 19 | } 20 | 21 | func newMockResolver() *mockResolver { 22 | return &mockResolver{srvs: make(map[string][]*net.SRV), txts: make(map[string][]string)} 23 | } 24 | 25 | // append the target to the srv. Last append is always at the end to tests can rely on position. 
26 | func (t *mockResolver) appendSRV(service, proto, name string, target string, port, priority, weight int) { 27 | t.mu.Lock() 28 | defer t.mu.Unlock() 29 | 30 | cname := "" 31 | if len(service) != 0 || len(proto) != 0 { 32 | cname = "_" + service + "._" + proto + "." 33 | } 34 | cname += name 35 | ar, ok := t.srvs[cname] 36 | if !ok { 37 | ar = make([]*net.SRV, 0) 38 | } 39 | if len(target) > 0 { 40 | ar = append(ar, &net.SRV{Target: target, Port: uint16(port), 41 | Priority: uint16(priority), Weight: uint16(weight)}) 42 | t.srvs[cname] = ar 43 | } 44 | } 45 | 46 | func (t *mockResolver) LookupSRV(ctx context.Context, service, proto, name string) (cname string, srvs []*net.SRV, err error) { 47 | t.mu.Lock() 48 | defer t.mu.Unlock() 49 | 50 | qName := "" 51 | if len(service) != 0 || len(proto) != 0 { 52 | qName = "_" + service + "._" + proto + "." 53 | } 54 | qName += name 55 | t.lastSRV = qName 56 | srvs, ok := t.srvs[qName] 57 | if !ok { 58 | err = fmt.Errorf("mock LookupSRV not found for %s", qName) 59 | return 60 | } 61 | cname = qName 62 | 63 | return 64 | } 65 | 66 | func (t *mockResolver) appendTXT(target string, txts []string) { 67 | t.mu.Lock() 68 | defer t.mu.Unlock() 69 | 70 | t.txts[target] = txts 71 | } 72 | 73 | func (t *mockResolver) LookupTXT(ctx context.Context, qName string) (txts []string, err error) { 74 | t.mu.Lock() 75 | defer t.mu.Unlock() 76 | 77 | txts, ok := t.txts[qName] 78 | t.lastTXT = qName 79 | if !ok { 80 | err = fmt.Errorf("mock LookupTXT not found for %s", qName) 81 | } 82 | 83 | return 84 | } 85 | 86 | func makeMockResolver() *mockResolver { 87 | mr := newMockResolver() 88 | mr.appendSRV("http", "tcp", "example.net", "t1.example.net", 1, 10, 20) // port used to codify bestTarget() order 89 | mr.appendSRV("http", "tcp", "example.net", "t2.example.net", 1, 10, 20) 90 | mr.appendSRV("http", "tcp", "example.net", "t3.example.net", 1, 10, 30) 91 | mr.appendSRV("http", "tcp", "example.net", "t4.example.net", 1, 10, 40) 92 | 
mr.appendSRV("http", "tcp", "example.net", "t5.example.net", 2, 11, 1) 93 | mr.appendSRV("http", "tcp", "example.net", "t6.example.net", 2, 11, 1) 94 | mr.appendSRV("http", "tcp", "example.net", "t7.example.net", 2, 11, 2) 95 | mr.appendSRV("http", "tcp", "example.net", "t8.example.net", 2, 11, 10) 96 | mr.appendSRV("http", "tcp", "example.net", "t9.example.net", 3, 12, 20) 97 | mr.appendSRV("http", "tcp", "example.net", "t10.example.net", 3, 12, 30) 98 | mr.appendSRV("http", "tcp", "example.net", "t11.example.net", 3, 12, 40) 99 | mr.appendSRV("http", "tcp", "example.net", "t12.example.net", 3, 12, 50) 100 | mr.appendSRV("http", "tcp", "example.net", "t13.example.net", 3, 12, 60) 101 | mr.appendSRV("http", "tcp", "example.net", "t14.example.net", 3, 12, 70) 102 | mr.appendSRV("http", "tcp", "example.net", "t15.example.net", 3, 12, 80) 103 | mr.appendSRV("http", "tcp", "example.net", "t16.example.net", 4, 13, 90) 104 | mr.appendSRV("http", "tcp", "example.net", "t17.example.net", 4, 13, 91) 105 | mr.appendSRV("http", "tcp", "example.net", "t18.example.net", 4, 13, 92) 106 | mr.appendSRV("http", "tcp", "example.net", "t19.example.net", 4, 13, 93) 107 | mr.appendSRV("http", "tcp", "example.net", "t20.example.net", 4, 13, 94) 108 | 109 | mr.appendSRV("https", "udp", "example.com", "u1.example.com", 1443, 13, 10) 110 | mr.appendSRV("https", "udp", "example.com", "u2.example.com", 1444, 13, 20) 111 | mr.appendSRV("https", "udp", "example.com", "u3.example.com", 1444, 13, 30) 112 | mr.appendSRV("https", "udp", "example.com", "u4.example.com", 1444, 13, 0) 113 | mr.appendSRV("https", "udp", "example.com", "u5.example.com", 1444, 13, 0) 114 | mr.appendSRV("https", "udp", "example.com", "u6.example.com", 1444, 13, 0) 115 | mr.appendSRV("https", "udp", "example.com", "", 1444, 13, 100) // Should disappear completely 116 | mr.appendSRV("https", "udp", "example.com", "u7.example.com", 1444, 14, 0) 117 | 118 | mr.appendSRV("http", "tcp", "empty.example.org", "", 0, 0, 0) 119 | 
120 | return mr 121 | } 122 | 123 | type srvTestCase struct { 124 | service, proto, domain string 125 | srvCount int 126 | target string 127 | bestCount int // How many calls to bestTarget before target expected to show up 128 | never string // target which should never be returned 129 | } 130 | 131 | var srvTestCases = []srvTestCase{ 132 | {"http", "tcp", "example.net", 20, "t1.example.net", 100, "t20.example.net"}, 133 | {"https", "udp", "example.com", 7, "u4.example.com", 6500, "u7.example.com"}, // 0.1% / 3 is the weight 134 | } 135 | 136 | func TestSRVPopulate(t *testing.T) { 137 | cslb := realInit() 138 | cslb.netResolver = makeMockResolver() 139 | cslb.start() 140 | 141 | for _, tc := range srvTestCases { 142 | t.Run(tc.service+"_"+tc.proto+"_"+tc.domain, func(t *testing.T) { 143 | cesrv := cslb.lookupSRV(context.Background(), time.Now(), tc.service, tc.proto, tc.domain) 144 | if cesrv.uniqueTargets() != tc.srvCount { 145 | t.Error("SRV Count mismatch. Expected", tc.srvCount, "got", cesrv.uniqueTargets()) 146 | } 147 | distrib := make(map[string]int) 148 | for ix := 0; ix < tc.bestCount; ix++ { 149 | srv := cslb.bestTarget(cesrv) 150 | if srv == nil { 151 | t.Fatal("bestTarget() returned nil for", tc.domain) 152 | } 153 | distrib[srv.Target]++ 154 | } 155 | if distrib[tc.target] == 0 { 156 | t.Error("Expected", tc.target, "to have been bestTarget() at least once") 157 | t.Log(cesrv) 158 | for k, v := range distrib { 159 | t.Log(k, v) 160 | } 161 | } 162 | if distrib[tc.never] > 0 { 163 | t.Error("Never expected", tc.never, "to be returned as best, but", distrib[tc.never]) 164 | t.Log(cesrv) 165 | for k, v := range distrib { 166 | t.Log(k, v) 167 | } 168 | } 169 | }) 170 | } 171 | } 172 | 173 | // Test that bestTarget() distributes targets according to their weights. At least roughly 174 | // proportionally within the limits of the PRNG. example.com SRV has u1=10, u2=20, u3=30 and u4-u7=0 175 | // thus u3 > u2 > u1 > (u4-u7). 
176 | func TestSRVWeightDistribution(t *testing.T) { 177 | cslb := realInit() 178 | cslb.netResolver = makeMockResolver() 179 | cslb.start() 180 | 181 | cesrv := cslb.lookupSRV(context.Background(), time.Now(), "https", "udp", "example.com") 182 | distrib := make(map[string]int) 183 | for ix := 0; ix < 1000; ix++ { 184 | srv := cslb.bestTarget(cesrv) 185 | distrib[srv.Target]++ 186 | } 187 | u1 := distrib["u1.example.com"] 188 | u2 := distrib["u2.example.com"] 189 | u3 := distrib["u3.example.com"] 190 | u4 := distrib["u4.example.com"] 191 | u5 := distrib["u5.example.com"] 192 | u6 := distrib["u6.example.com"] 193 | u7 := distrib["u7.example.com"] 194 | if !(u3 > u2) { 195 | t.Error("Expected u3 GT u2", u3, u2) 196 | } 197 | if !(u2 > u1) { 198 | t.Error("Expected u2 GT u1", u2, u1) 199 | } 200 | if u4 > u1 || u5 > u1 || u6 > u1 || u7 > u1 { 201 | t.Error("Expected u1 GT U4-u7", u1, u4, u5, u6, u7) 202 | } 203 | } 204 | 205 | // Test that failed targets are not considered by lookupSRV. 206 | func TestSRVHealth(t *testing.T) { 207 | cslb := realInit() 208 | cslb.netResolver = makeMockResolver() 209 | cslb.randIntn = func(int) int { return 0 } 210 | cslb.start() 211 | 212 | cesrv := cslb.lookupSRV(context.Background(), time.Now(), "https", "udp", "example.com") 213 | srv := cslb.bestTarget(cesrv) // nextRand is zero so first weight should win 214 | if srv == nil { 215 | t.Fatal("Setup error") 216 | } 217 | if srv.Target != "u1.example.com" { 218 | t.Error("bestTarget should be u1, not", srv) 219 | } 220 | fakeNow := time.Now().Add(time.Minute) // Put it sufficiently in the future 221 | nowPlusOne := fakeNow.Add(time.Hour) 222 | cslb.setDialResult(nowPlusOne, "u1.example.com", int(srv.Port), fmt.Errorf("")) 223 | srv = cslb.bestTarget(cesrv) // nextRand is zero but first is down 224 | if srv == nil { 225 | t.Fatal("Setup error") 226 | } 227 | if srv.Target != "u2.example.com" { 228 | t.Error("bestTarget should now be u2, not", srv) 229 | } 230 | 
cslb.setDialResult(nowPlusOne, "u2.example.com", int(srv.Port), fmt.Errorf("")) 231 | srv = cslb.bestTarget(cesrv) 232 | if srv == nil { 233 | t.Fatal("Setup error") 234 | } 235 | if srv.Target != "u3.example.com" { 236 | t.Error("bestTarget should now be u3, not", srv) 237 | } 238 | 239 | cslb.setDialResult(nowPlusOne, "u3.example.com", int(srv.Port), fmt.Errorf("")) // Last of the highest priority targets 240 | srv = cslb.bestTarget(cesrv) 241 | if srv == nil { 242 | t.Fatal("Setup error") 243 | } 244 | if srv.Target != "u4.example.com" { // Should get next priority down as second choice 245 | t.Error("bestTarget should now be u4, not", srv) 246 | } 247 | 248 | cslb.setDialResult(nowPlusOne, "u4.example.com", int(srv.Port), fmt.Errorf("")) 249 | cslb.setDialResult(fakeNow, "u5.example.com", 1444, fmt.Errorf("")) // Closest to now 250 | cslb.setDialResult(nowPlusOne, "u6.example.com", 1444, fmt.Errorf("")) 251 | 252 | srv = cslb.bestTarget(cesrv) 253 | if srv == nil { 254 | t.Fatal("Setup error") 255 | } 256 | if srv.Target != "u7.example.com" { // Last second choice 257 | t.Error("bestTarget should now be u7, not", srv) 258 | } 259 | cslb.setDialResult(nowPlusOne, "u7.example.com", int(srv.Port), fmt.Errorf("")) // Every Target is now in bad health 260 | 261 | srv = cslb.bestTarget(cesrv) // Should now get least-worst 262 | if srv == nil { 263 | t.Fatal("Setup error") 264 | } 265 | if srv.Target != "u5.example.com" { // Closet to now 266 | t.Error("bestTarget should now be u5, not", srv) 267 | } 268 | } 269 | 270 | func TestSRVNoFind(t *testing.T) { 271 | cslb := realInit() 272 | mr := makeMockResolver() 273 | cslb.netResolver = mr 274 | cslb.start() 275 | 276 | cesrv := cslb.lookupSRV(context.Background(), time.Now(), "http", "tcp", "empty.example.org") 277 | srv := cslb.bestTarget(cesrv) // nextRand is zero so first weight should win 278 | if srv != nil { 279 | t.Error("Should have got nil for bestTarget as SRV is empty. 
Got", srv) 280 | } 281 | 282 | // Test Cache hit while we're here 283 | 284 | mr.appendSRV("http", "tcp", "empty.example.org", "e1.example.org", 0, 0, 0) // Now in DNS 285 | cesrv = cslb.lookupSRV(context.Background(), time.Now(), "http", "tcp", "empty.example.org") 286 | srv = cslb.bestTarget(cesrv) // nextRand is zero so first weight should win 287 | if srv != nil { 288 | t.Error("Should have still got nil for bestTarget as SRV is in cache. Got", srv) 289 | } 290 | 291 | } 292 | 293 | func TestSRVString(t *testing.T) { 294 | cslb := realInit() 295 | cslb.netResolver = makeMockResolver() 296 | cslb.start() 297 | cesrv := cslb.lookupSRV(context.Background(), time.Now(), "https", "udp", "example.com") 298 | s := cesrv.String() 299 | c := strings.Count(s, "tarw=") 300 | if c != cesrv.uniqueTargets() { 301 | t.Error("Expected", cesrv.uniqueTargets(), "tarw= patterns, but got", c) 302 | } 303 | } 304 | 305 | // Test that the cache cleaner is expiring entries 306 | func TestSRVcleaner(t *testing.T) { 307 | cslb := realInit() 308 | cslb.netResolver = makeMockResolver() 309 | now := time.Now() 310 | yesterday := now.AddDate(0, 0, -1) // Yesterday 311 | 312 | cslb.lookupSRV(context.Background(), now, "http", "tcp", "keep.expire.example.com") 313 | for ix := 0; ix < 99; ix++ { 314 | cslb.lookupSRV(context.Background(), yesterday, "http", "tcp", 315 | fmt.Sprintf("%d.expire.example.com", ix)) 316 | } 317 | cslb.srvStore.start(time.Second / 2) 318 | defer cslb.srvStore.stop() 319 | 320 | time.Sleep(time.Second) // Give cleaner time to run 321 | cslb.srvStore.RLock() 322 | origLen := len(cslb.srvStore.cache) 323 | cslb.srvStore.RUnlock() 324 | if origLen != 1 { 325 | t.Error("Expected one entry, not", origLen) 326 | } 327 | } 328 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package cslb provides transparent HTTP/HTTPS Client Side Load 
Balancing for Go programs. 3 | 4 | Cslb intercepts "net/http" Dial Requests and re-directs them to a preferred set of target hosts 5 | based on the load balancing configuration expressed in DNS SRV and TXT Resource Records (RRs). 6 | 7 | Only one trivial change is required to client applications to benefit from cslb which is to import 8 | this package and (if needed) enable it for non-default http.Transport instances. Cslb processing 9 | is triggered by the presence of SRV RRs. If no SRVs exist cslb is benign which means you can deploy 10 | your application with cslb and independently activate and deactivate cslb processing for each 11 | service at any time. 12 | 13 | No server-side changes are required at all - apart from possibly dispensing with your server-side 14 | load-balancers! 15 | 16 | # DEFAULT USAGE 17 | 18 | Importing cslb automatically enables interception for http.DefaultTransport. In this program 19 | snippet: 20 | 21 | import ( 22 | "net/http" 23 | _ "github.com/markdingo/cslb" 24 | ) 25 | 26 | func main() { 27 | resp, err := http.Get("http://example.net/resource") 28 | 29 | the Dial Request made by http.Get is intercepted and processed by cslb. 30 | 31 | # NON DEFAULT USAGE 32 | 33 | If the application uses its own http.Transport then cslb processing needs to be activated by calling 34 | the cslb.Enable() function, i.e.: 35 | 36 | import ( 37 | "net/http" 38 | "github.com/markdingo/cslb" 39 | ) 40 | 41 | func main() { 42 | myTransport := &http.Transport{...} 43 | cslb.Enable(myTransport) 44 | client := &http.Client{Transport: myTransport} 45 | resp, err := client.Get("http://mydomain/resource") 46 | ... 47 | 48 | The cslb.Enable() function replaces http.Transport.DialContext with its own intercept function. 49 | 50 | # WHEN TO USE CSLB 51 | 52 | Server-side load-balancers are no panacea. They add deployment and diagnostic complexity, cost, 53 | throughput constraints and become an additional point of possible failure. 
54 | 55 | Cslb can help you achieve good load-balancing and fail-over behaviour without the need for *any* 56 | server-side load-balancers. This is particularly useful in enterprise and micro-service deployments 57 | as well as smaller application deployments where configuring and managing load-balancers is a 58 | significant resource drain. 59 | 60 | Cslb can be used to load-balance across geographically dispersed targets or where "hot stand-by" 61 | systems are purposely deployed on diverse infrastructure. 62 | 63 | # DNS ACTIVATION 64 | 65 | When cslb intercepts a http.Transport Dial Request to port 80 or port 443 it looks up SRV RRs as 66 | prescribed by RFC2782. That is, _http._tcp.$domain and _https._tcp.$domain respectively. Cslb 67 | directs the Dial Request to the highest preference target based on the SRV algorithm. If that Dial 68 | Request fails, it tries the next lower preference target until a successful connection is returned 69 | or all unique targets fail or it runs out of time. 70 | 71 | Cslb caches the SRV RRs (or their non-existence) as well as the result of Dial Requests to the SRV 72 | targets to optimize subsequent intercepted calls and the selection of preferred targets. If no SRV 73 | RRs exist, cslb passes the Dial Request on to net.DialContext. 74 | 75 | # RULES OF INTERCEPTION 76 | 77 | Cslb has specific rules about when interception occurs. It normally only considers intercepting port 78 | 80 and port 443 however if the "cslb_allports" environment variable is set, cslb intercepts 79 | non-standard HTTP ports and maps them to numeric service names. For example http://example.net:8080 80 | gets mapped to _8080._tcp.example.net as the SRV name to resolve. 81 | 82 | # ACTIVE HEALTH CHECKS 83 | 84 | While cslb runs passively by caching the results of previous Dial Requests, it can also run actively 85 | by periodically performing health checks on targets. 
This is useful as an administrator can control 86 | health check behaviour to move a target "in and out of rotation" without changing DNS entries and 87 | waiting for TTLs to age out. Health checks are also likely to make the application a little more 88 | responsive as they are less likely to make a dial attempt to a target that is not working. 89 | 90 | Active health checking is enabled by the presence of a TXT RR in the sub-domain "_$port._cslb" of 91 | the target. E.g. if the SRV target is "s1.example.net:80" then cslb looks for the TXT RR at 92 | "_80._cslb.s1.example.net". If that TXT RR contains a URL then it becomes the health check URL. If 93 | no TXT RR exists or the contents do not form a valid URL then no active health check is performed 94 | for that target. 95 | 96 | The health check URL does not have to be related to the target in any particular way. It could be a 97 | URL to a central monitoring system which performs complicated application level tests and 98 | performance monitoring. Or it could be a URL on the target system itself. 99 | 100 | A health check is considered successful when a GET of the URL returns a 200 status and the content 101 | contains the uppercase text "OK" somewhere in the body (See the "cslb_hc_ok" environment variable 102 | for how this can be modified). Unless both those conditions are met the target is considered 103 | unavailable. 104 | 105 | Active health checks cease once a target becomes idle for too long and health check Dial Requests 106 | are *not* intercepted by cslb. 107 | 108 | # CONVERTING A SITE TO CSLB 109 | 110 | If your current service exists on a single server called "s1.example.net" and you want to spread the 111 | load across additional servers "s2.example.net" and "s3.example.net" and assuming you've added the 112 | "cslb" package to your application then the following DNS changes activate cslb processing: 113 | 114 | Current DNS 115 | 116 | s1.example.net. 
IN A 172.16.254.1 117 | IN AAAA 2001:db8::1 118 | 119 | s2.example.net. IN A 172.16.254.2 120 | IN AAAA 2001:db8::2 121 | 122 | s3.example.net. IN A 172.16.254.3 123 | IN AAAA 2001:db8::3 124 | 125 | Additional DNS 126 | 127 | _http._tcp.s1.example.net. IN SRV 1 70 80 s1.example.net. 128 | IN SRV 1 30 80 s2.example.net. 129 | IN SRV 2 0 8080 s3.example.net. 130 | 131 | _80._cslb.s1.example.net. IN TXT "http://healthchecker.example.com/s1" 132 | _80._cslb.s2.example.net. IN TXT "http://healthchecker.example.com/s2" 133 | _8080._cslb.s3.example.net. IN TXT "http://s3.example.net/ok" 134 | 135 | A number of observations about this DNS setup: 136 | 137 | - "s1" and "s2" are the highest priority 138 | - "s3" is only ever considered if both "s1" and "s2" are not responding 139 | - On average 70 out of 100 requests will be directed to "s1" 140 | - Connections to "s3" are made on port 8080 141 | - The health check for "s3" is on the same system as the service 142 | - The health checks for "s1" and "s2" are on a centralized system 143 | 144 | # CACHE AGEING 145 | 146 | Cslb maintains a cache of SRV lookups and the health status of targets. Cache entries automatically 147 | age out as a form of garbage collection. Removed cache entries stop any associated active health 148 | checks. Unfortunately the cache ageing does not have access to the DNS TTLs associated with the SRV 149 | RRs so it makes a best-guess at reasonable time-to-live values. 150 | 151 | The important point to note is that *all* values get periodically refreshed from the DNS. Nothing 152 | persists internally forever regardless of the level of activity. This means you can be sure that any 153 | changes to your DNS will be noticed by cslb in due course. 154 | 155 | # STATUS WEB PAGE 156 | 157 | Cslb optionally runs a web server which presents internal statistics on its performance and 158 | activity. This web service has *no* access controls so it's best to only run it on a loopback 159 | address. 
Setting the environment variable "cslb_listen" to a listen address activates the status 160 | server. E.g.: 161 | 162 | $ cslb_listen=127.0.0.1:8081 ./myProgram 163 | 164 | # RUN TIME CONTROLS 165 | 166 | On initialization the cslb package examines the "cslb_options" environment variable for single 167 | letter options which have the following meaning: 168 | 169 | 'd' - Debug print dialContext calls 170 | 'h' - Debug print Health Check results 171 | 'i' - Debug print intercepted Dial Requests 172 | 'r' - Debug print system Dial Context results 173 | 's' - Debug print SRV Lookups 174 | 175 | 'C' - Disable all Dial Request interception 176 | 'H' - Disable all health checks 177 | 'N' - Allow numeric service lookups for non-HTTP(S) ports 178 | 179 | An example of how this might be used from a shell: 180 | 181 | $ cslb_options=dh ./yourProgram -options ... 182 | 183 | Many internal configuration values can be over-ridden with environment variables as shown in this 184 | table: 185 | 186 | +----------------+----------------------------------------+---------+---------------+ 187 | | Variable Name | Description | Default | Format | 188 | +----------------+----------------------------------------+---------+---------------+ 189 | | cslb_dial_veto | Target veto period after dial fails | 1m | time.Duration | 190 | | cslb_hc_freq | Frequency of health checks per target | 50s | time.Duration | 191 | | cslb_hc_ok | strings.Contains in health check body | "OK" | String | 192 | | cslb_listen | Listen address for status server | | address:port | 193 | | cslb_nxd_ttl | Cache lifetime for NXDOMAIN SRVs | 20m | time.Duration | 194 | | cslb_srv_ttl | Cache lifetime for found SRVs | 5m | time.Duration | 195 | | cslb_tar_ttl | Cache lifetime for dial Targets | 5m | time.Duration | 196 | | cslb_templates | Alternate status server html/templates | | filepath.Glob | 197 | | cslb_timeout | Default intercept Dial duration | 1m | time.Duration | 198 | 
+----------------+----------------------------------------+---------+---------------+ 199 | 200 | Any values which are invalid or fall outside a reasonable range are ignored. 201 | 202 | # DETECTING A GOOD SERVICE 203 | 204 | Cslb only knows about the results of network connection attempts made by DialContext and the results 205 | of any configured health checks. If a service is accepting network connections but not responding to 206 | HTTP requests - or responding negatively - the client experiences failures but cslb will be unaware 207 | of these failures. The result is that cslb will continue to direct future Dial Requests to that 208 | faulty service in accordance with the SRV priorities. If your service is vulnerable to this 209 | scenario, active health checks are recommended. This could be something as simple as an on-service 210 | health check which responds based on recent "200 OK" responses in the service log file. 211 | Alternatively an on-service monitor which closes the listen socket will also work. 212 | 213 | In general, defining a failing service is a complicated matter that only the application truly 214 | understands. For this reason health checks are used as an intermediary which does understand 215 | application level failures and converts them to simple language which cslb groks. 216 | 217 | # RECOMMENDED SETUP 218 | 219 | While every service is different there are a few general guidelines which apply to most services 220 | when using cslb. First of all, run simple health checks if you can and configure them for use by 221 | cslb. Second, have each target configured with both ipv4 and ipv6 addresses. This affords two 222 | potentially independent network paths to the targets. Furthermore, net.Dialer attempts both ipv4 and 223 | ipv6 connections simultaneously which maximizes responsiveness for the client. 224 | 225 | Third, consider a "canary" target as a low preference (highest numeric value SRV priority) 226 | target. 
If this "canary" target is accessed by cslb clients it tells you they are having trouble 227 | reaching their "real" targets. Being able to run a "canary" service is one of the side-benefits of 228 | cslb and SRVs. 229 | 230 | # CAVEATS 231 | 232 | When analyzing the Status Web Page or watching the Run Time Control output, observers need to be 233 | aware of caching by the http (and possibly other) packages. For example not every call to http.Get() 234 | results in a Dial Request as httpClient tries to re-use connections. 235 | 236 | In a similar vein if you change a DNS entry and don't believe cslb has noticed this change within an 237 | appropriate TTL amount of time, be aware that on some platforms the intervening recursive resolvers 238 | adjust TTLs as they see fit. For example some home-gamer routers are known to increase short TTLs to 239 | values they believe to be more "appropriate" in an attempt to reduce their cache churn. 240 | 241 | Perhaps the biggest caveat of all is that cslb relies on being enabled for all http.Transports in 242 | use by your application. If you are importing a package (either directly or indirectly) which 243 | constructs its own http.Transports then you'll need to modify that package to call cslb.Enable() 244 | otherwise those http requests will not be intercepted. Of course if the package is making requests 245 | incidental to the core functionality of your application then maybe it doesn't matter and you can 246 | leave them be. Something to be aware of. 247 | 248 | ----- 249 | */ 250 | package cslb 251 | -------------------------------------------------------------------------------- /srv.go: -------------------------------------------------------------------------------- 1 | package cslb 2 | 3 | /* 4 | The srv structs cache SRV RRs. They are structured to make life easy for bestTarget() as that is 5 | presumed to be the most heavily used function in this package. 
The relationship between structs is: 6 | ceSRV->cePriority->ceTarget where cePriority contains all matching priorities and ceTarget contains 7 | all targets with that priority. 8 | */ 9 | 10 | import ( 11 | "context" 12 | "fmt" 13 | "net" 14 | "sort" 15 | "strings" 16 | "sync" 17 | "time" 18 | ) 19 | 20 | const ( 21 | smallChanceMultiplier = 1000 // Fraction of weight given to zero weighted targets 22 | ) 23 | 24 | type srvCache struct { 25 | sync.RWMutex // Protects everything within this struct 26 | done chan bool // Shuts down the cache cleaner 27 | cache map[string]*ceSRV // The cache key is ToLower(qName). 28 | } 29 | 30 | type ceSRV struct { 31 | expires time.Time // When this entry expire out of the cache 32 | lookups int // Includes initial lookup that creates the cache entry 33 | priorities []*cePriority // Slice of targets with equal priority 34 | uniqueTargetCount int // Count of all unique targets (host:port) 35 | } 36 | 37 | // String return a printable string of the cached Entry SRV 38 | func (t *ceSRV) String() (s string) { 39 | s = fmt.Sprintf("%s (%d):", t.expires, len(t.priorities)) 40 | for _, cep := range t.priorities { 41 | s += fmt.Sprintf("\n\tp=%d totw=%d (%d):", cep.priority, cep.totalWeight, len(cep.targets)) 42 | for _, cet := range cep.targets { 43 | s += fmt.Sprintf("\n\t\ttarw=%d %s:%d", cet.weight, cet.target, cet.port) 44 | } 45 | } 46 | return 47 | } 48 | 49 | type cePriority struct { 50 | priority int 51 | totalWeight int // Sum of all weights - used as an upper limit for the PRNG 52 | targets []*ceTarget // Slice of all targets within priority 53 | } 54 | 55 | type ceTarget struct { 56 | weight int 57 | port int 58 | target string 59 | } 60 | 61 | // healthStoreKey generates the lookup key for the healthStore. 
It's of the form host:port 62 | func (t *ceTarget) healthStoreKey() string { 63 | return makeHealthStoreKey(t.target, t.port) 64 | } 65 | 66 | func newSrvCache() *srvCache { 67 | return &srvCache{cache: make(map[string]*ceSRV), done: make(chan bool)} 68 | } 69 | 70 | func (t *srvCache) start(cacheInterval time.Duration) { 71 | go t.cleaner(cacheInterval) 72 | } 73 | 74 | func (t *srvCache) stop() { 75 | close(t.done) 76 | } 77 | 78 | // lookupSRV looks up the SRV RR for the domain. First it tries looking in the cache and if not 79 | // there, the DNS is consulted. The qName is of the form ToLower(_http._tcp.$domain) where "http" is 80 | // the service and "tcp" is the proto. The cache is updated with the results of the DNS lookup. 81 | // 82 | // lookupSRV returns a *ceSRV with an array of (possibly zero) net.SRV RRs even with an NXDomain 83 | // response (which comes back as an error). An empty list means the DNS lookup failed; normally this 84 | // means that the domain should not be considered to be under cslb control. 85 | // 86 | // We construct the full key/qName ourselves rather than rely on LookupSRV as we need the formed 87 | // qName for the cache lookup. I suppose we could have a different cache key and let LookupSRV do 88 | // it's thing but that would be a little confusing. Besides, the SRV name construction is well-known 89 | // and simple. 90 | func (t *cslb) lookupSRV(ctx context.Context, now time.Time, service, proto, domain string) *ceSRV { 91 | key := strings.ToLower("_" + service + "._" + proto + "." 
+ domain) // rfc2782 format 92 | t.srvStore.RLock() 93 | cesrv := t.srvStore.cache[key] 94 | if cesrv != nil { // If cache entry exists we're done 95 | cesrv.lookups++ 96 | t.srvStore.RUnlock() 97 | return cesrv 98 | } 99 | t.srvStore.RUnlock() // Don't hold mutex across a possible DNS lookup 100 | 101 | cesrv = &ceSRV{expires: now.Add(t.NotFoundSRVTTL), lookups: 1} // Assume NXDomain 102 | _, srvList, _ := t.netResolver.LookupSRV(ctx, "", "", key) 103 | if len(srvList) > 0 { // Found something so transfer to the new ceSRV 104 | cesrv.expires = now.Add(t.FoundSRVTTL) 105 | cesrv.populate(srvList) 106 | } 107 | 108 | // Insert/over-write the cache entry. It's possible another go-routine snuck in while we 109 | // were off in DNS-land and created an entry with the same key. Oh well. The slowest 110 | // go-routine wins. We don't bother queuing all callers for the same cache key behind each 111 | // other and let one do the resolution. I suppose we could at some point in the future by 112 | // creating a sync.Cond and the resolving go-routine could Broadcast when done. Go makes 113 | // this quite easy but that's bound to be a premature optimization as a single program is 114 | // unlikely to be banging away at a single domain at the same instant in time without an 115 | // entry existing in our cache. 116 | 117 | targetKeys := cesrv.uniqueTargetKeys() 118 | cesrv.uniqueTargetCount = len(targetKeys) 119 | t.srvStore.Lock() 120 | t.srvStore.cache[key] = cesrv // cesrv is now read-only for the rest of its life 121 | t.srvStore.Unlock() 122 | t.populateHealthStore(now, targetKeys) 123 | 124 | return cesrv 125 | } 126 | 127 | // populate transfers the SRV RRs into the ceSRV. This means sorting the SRVs by priority order and 128 | // also calculating total weight so we can conveniently apply the SRV selection algorithm. 
129 | // 130 | // The Golang resolver has pre-sorted the SRV RRs for us in priority/weight order but we don't rely 131 | // on that as there may be replacement resolvers or mock resolvers involved. 132 | // 133 | // RFC2782 says that "weights of 0 should have a very small chance of being selected" without 134 | // defining "very small". They way we achieve this as well as make our selection algorithm simple is 135 | // to give them collectively an effective weight of 0.1% of the total weights so that all zero 136 | // targets will on average get 1/1000th of traffic. That seems like a "very small chance" to me. 137 | func (t *ceSRV) populate(srvs []*net.SRV) { 138 | sort.Slice(srvs, func(i, j int) bool { return srvs[i].Priority < srvs[j].Priority }) 139 | var cep *cePriority // Ptr to current priority or nil if none yet 140 | for _, srv := range srvs { 141 | if len(srv.Target) == 0 { // RFC2782 says to ignore any zero length targets completely 142 | continue 143 | } 144 | if cep == nil || int(srv.Priority) != cep.priority { // Create a new higher priority? 145 | cep = &cePriority{priority: int(srv.Priority)} 146 | t.priorities = append(t.priorities, cep) 147 | } 148 | cet := &ceTarget{weight: int(srv.Weight) * smallChanceMultiplier, port: int(srv.Port), 149 | target: strings.ToLower(srv.Target)} 150 | cep.targets = append(cep.targets, cet) 151 | cep.totalWeight += cet.weight 152 | } 153 | 154 | // Assign a non-zero weight to targets with an SRV weight of zero 155 | 156 | for _, cep := range t.priorities { 157 | zeroWeightEntryCount := 0 158 | for _, cet := range cep.targets { 159 | if cet.weight == 0 { 160 | zeroWeightEntryCount++ 161 | } 162 | } 163 | 164 | if zeroWeightEntryCount == 0 { 165 | continue 166 | } 167 | 168 | verySmall := cep.totalWeight / smallChanceMultiplier 169 | verySmall = (verySmall + zeroWeightEntryCount - 1) / zeroWeightEntryCount 170 | if verySmall == 0 { // This can be true if totalWeight is very small or zero! 
171 | verySmall = 1 172 | } 173 | for _, cet := range cep.targets { 174 | if cet.weight == 0 { 175 | cet.weight = verySmall 176 | cep.totalWeight += verySmall 177 | } 178 | } 179 | } 180 | } 181 | 182 | // bestTarget selects the "best" target to try and connect to based on the SRV selection algorithm 183 | // and the health of the targets. That is, pick the SRV set with the lowest-numerical priority 184 | // first. If all those targets are unavailable then pick the SRV set with the next lowest-numerical 185 | // priority. Within the selected priority weight is used to distribute load. E.g. a weight list of 186 | // a=1, b=2, c=3 would ideally have 12 requests distributed such that 2 go to a, 4 go to b and 6 go 187 | // to c. 188 | // 189 | // If all targets in all priorities are "bad" due to health checks or connection failures, pick the 190 | // the least-worst target which is the target with a nextDialAttempt closest to now. 191 | // 192 | // A synthesized SRV is always returned if there are any targets in the SRV. In the case of the 193 | // least-worst return, maybe the caller will get lucky and the connection will come good this time? 194 | // Or maybe they won't get lucky but at least they get to see a "connection failed" outcome and can 195 | // report it to something or someone. 196 | // 197 | // To summarize, the returned SRV will be one of the following in the order shown: 198 | // 199 | // - Highest Priority in good health in weight range - first choice within priority 200 | // - First target in highest Priority in good health - second choice within priority 201 | // - Same thing for each Priority down if none of the previous priorities are in good health 202 | // - Target with soonest next connection attempt regardless of priority or weight - least worst 203 | // 204 | // The caller should always check for a nil return, the other values in the returned SRV are mostly 205 | // returned as a convenience to the caller. 
They should not presume they are the exact same values 206 | // as retrieved from the DNS but they will be comparable. 207 | func (t *cslb) bestTarget(cesrv *ceSRV) (srv *net.SRV) { 208 | if len(cesrv.priorities) == 0 { // Either an NXDomain or SRV with zero length targets 209 | return nil 210 | } 211 | 212 | srv = &net.SRV{} // We will return something! 213 | now := time.Now() 214 | t.healthStore.RLock() // Apply Read lock across whole search rather than a nickle & dime approach 215 | defer t.healthStore.RUnlock() // whereby we may cycle the lock many times. 216 | 217 | // Search for the in-range weight but also note a target in good health in passing (called 218 | // our secondChoice) as the preferred weight may be in bad health in which case we'll take 219 | // any weight in the same priority as our second choice in preference to a lower priority. 220 | 221 | haveSecondChoice := false 222 | for _, cep := range cesrv.priorities { 223 | wix := t.randIntn(cep.totalWeight) // Select the weight value using a "cheap" RNG 224 | lower := 0 225 | upper := 0 226 | for _, cet := range cep.targets { 227 | ceh := t.healthStore.cache[cet.healthStoreKey()] 228 | upper += cet.weight 229 | if ceh == nil || ceh.isGood(now) { 230 | if wix >= lower && wix < upper { // Is this target in the weight range? 
231 | srv.Target = cet.target 232 | srv.Port = uint16(cet.port) 233 | srv.Priority = uint16(cep.priority) 234 | srv.Weight = uint16(cet.weight) / smallChanceMultiplier 235 | return // This is expected to be the "happy path" 236 | } 237 | if !haveSecondChoice { // If we don't have a second choice yet, use this one 238 | haveSecondChoice = true 239 | srv.Target = cet.target 240 | srv.Port = uint16(cet.port) 241 | srv.Priority = uint16(cep.priority) 242 | srv.Weight = uint16(cet.weight) / smallChanceMultiplier 243 | } 244 | } 245 | lower = upper // Iterate over targets 246 | } 247 | if haveSecondChoice { // Preferred weight range was in bad health but we 248 | return // found a good health target in the preferred priority 249 | } 250 | } 251 | 252 | // Didn't find *any* healthy targets so search over *all* targets for least-worst. We don't 253 | // expect this to occur very often so the search loop is run a second time rather than add 254 | // complexity to the relatively simple search loop above. The least-worst target has the 255 | // soonest nextDialAttempt. Priority and weight are ignored as there is no point in 256 | // considering a higher priority target which has a nextDialAttempt way off into the future 257 | // as that means it's *just* failed whereas one that's a millisecond away from now has had 258 | // the longest time period to "come good". 259 | 260 | var smallestLeastWorst time.Time 261 | for _, cep := range cesrv.priorities { 262 | for _, cet := range cep.targets { 263 | ceh := t.healthStore.cache[cet.healthStoreKey()] 264 | nextAttempt := now 265 | if ceh != nil { // This could have changed underneath us, so be defensive 266 | nextAttempt = ceh.nextDialAttempt 267 | } 268 | 269 | // smallestLeastWorst starts life as IsZero() and nextAttempt is always 270 | // greater than zero so the first time thru this test always comes true and 271 | // sets a least-worst. 
272 | 273 | if smallestLeastWorst.IsZero() || nextAttempt.Before(smallestLeastWorst) { 274 | srv.Target = cet.target 275 | srv.Port = uint16(cet.port) 276 | srv.Priority = uint16(cep.priority) 277 | srv.Weight = uint16(cet.weight) / smallChanceMultiplier 278 | smallestLeastWorst = nextAttempt 279 | } 280 | } 281 | } 282 | 283 | return 284 | } 285 | 286 | // uniqueTargetKeys returns a slice of all unique targets keys in the SRV (a key is host:port). The 287 | // SRV might actually have more targets than this count if some of the targets are identical. This 288 | // shouldn't occur in a single well-constructed SRV arrangement but targets might be shared across 289 | // other SRVs so we've generalized the need to cater for that such that a target is an independent 290 | // beast which just happens to be attached to one or more SRVs. 291 | func (t *ceSRV) uniqueTargetKeys() (tSlice []string) { 292 | dupes := make(map[string]bool) 293 | for _, cep := range t.priorities { 294 | for _, cet := range cep.targets { 295 | dupes[cet.healthStoreKey()] = true 296 | } 297 | } 298 | 299 | for k := range dupes { 300 | tSlice = append(tSlice, k) 301 | } 302 | 303 | return 304 | } 305 | 306 | // uniqueTargets returns the count of uniqueTargetKeys 307 | func (t *ceSRV) uniqueTargets() (count int) { 308 | return t.uniqueTargetCount 309 | } 310 | 311 | // cleaner periodically scans the srvStore for expired entries and deletes them. Normally run as a 312 | // go-routine. 
313 | func (t *srvCache) cleaner(cleanInterval time.Duration) { 314 | ticker := time.NewTicker(cleanInterval) 315 | defer ticker.Stop() 316 | 317 | for { 318 | select { 319 | case <-t.done: 320 | return 321 | case now := <-ticker.C: 322 | t.clean(now) 323 | } 324 | } 325 | } 326 | 327 | func (t *srvCache) clean(now time.Time) { 328 | t.Lock() 329 | defer t.Unlock() 330 | 331 | for key, cesrv := range t.cache { 332 | if cesrv.expires.Before(now) { 333 | delete(t.cache, key) 334 | } 335 | } 336 | } 337 | 338 | // ceSrvAsStats is a clone of ceSRV (and related material) with exported variable for html.Template 339 | type ceSrvAsStats struct { 340 | CName string 341 | Expires string 342 | Lookups string 343 | Priority int 344 | Weight int 345 | Port int 346 | Target string 347 | GoodDials int // From healthStore 348 | FailedDials int 349 | IsGood bool 350 | } 351 | 352 | type srvStats struct { 353 | Srvs []ceSrvAsStats 354 | nxDomains []ceSrvAsStats 355 | } 356 | 357 | // getStats clones all the ceSRV entries into a struct suitable for the status service. This 358 | // shouldn't be too expensive as we don't expect a huge number of SRVs, but who knows? 359 | func (t *srvCache) getStats(hc *healthCache) *srvStats { 360 | now := time.Now() 361 | s := &srvStats{} 362 | t.RLock() 363 | defer t.RUnlock() 364 | 365 | s.Srvs = make([]ceSrvAsStats, 0, len(t.cache)*4) // Just guesses, but better than nothing and 366 | s.nxDomains = make([]ceSrvAsStats, 0, len(t.cache)) // over-sized is probably better than under-sized. 367 | for cname, cesrv := range t.cache { 368 | if len(cesrv.priorities) == 0 { // NXDomain? 
369 | s.nxDomains = append(s.nxDomains, 370 | ceSrvAsStats{CName: cname, 371 | Expires: cesrv.expires.Sub(now).Truncate(time.Second).String(), 372 | Lookups: fmt.Sprintf("%d", cesrv.lookups), 373 | Target: "**NXDomain**"}) 374 | continue 375 | } 376 | for _, cep := range cesrv.priorities { 377 | for _, cet := range cep.targets { 378 | entry := ceSrvAsStats{CName: cname, 379 | Expires: cesrv.expires.Sub(now).Truncate(time.Second).String(), 380 | Lookups: fmt.Sprintf("%d", cesrv.lookups), 381 | Priority: cep.priority, 382 | Weight: cet.weight, 383 | Port: cet.port, 384 | Target: cet.target, 385 | IsGood: true} 386 | hc.RLock() 387 | ceh := hc.cache[cet.healthStoreKey()] 388 | if ceh != nil { 389 | entry.GoodDials = ceh.goodDials 390 | entry.FailedDials = ceh.failedDials 391 | entry.IsGood = ceh.isGood(now) 392 | } 393 | hc.RUnlock() 394 | 395 | s.Srvs = append(s.Srvs, entry) 396 | } 397 | } 398 | } 399 | 400 | return s 401 | } 402 | --------------------------------------------------------------------------------