├── fci ├── docs │ ├── fci_usage │ └── fci_metadata ├── test │ └── fci_test └── fci ├── ag ├── docs │ ├── ag_usage │ └── ag_metadata └── ag ├── LICENSE ├── go └── src │ ├── mssqlcommon │ ├── ocf │ │ ├── lib_test.go │ │ └── lib.go │ ├── lib_test.go │ ├── ag │ │ └── lib.go │ └── lib.go │ ├── fci-helper │ └── main.go │ └── ag-helper │ └── main.go ├── SECURITY.md └── README.md /fci/docs/fci_usage: -------------------------------------------------------------------------------- 1 | usage: $0 {start|stop|monitor|validate-all|meta-data} 2 | 3 | Expects to have a fully populated OCF RA-compliant environment set. 4 | -------------------------------------------------------------------------------- /ag/docs/ag_usage: -------------------------------------------------------------------------------- 1 | usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data} 2 | 3 | Expects to have a fully populated OCF RA-compliant environment set. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017 Microsoft Corporation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /go/src/mssqlcommon/ocf/lib_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | package ocf 4 | 5 | import ( 6 | "os" 7 | "testing" 8 | ) 9 | 10 | func TestImportOcfExitCodes(t *testing.T) { 11 | // Note the values here are intentionally not the real values of these env vars. 12 | // In particular OCF_SUCCESS is not set to its real value of 0 to be able to check if it's correctly initialized 13 | var requiredEnvironmentVariables = map[string]string{ 14 | "OCF_SUCCESS": "1", 15 | "OCF_ERR_ARGS": "2", 16 | "OCF_ERR_CONFIGURED": "3", 17 | "OCF_ERR_GENERIC": "4", 18 | "OCF_ERR_PERM": "5", 19 | "OCF_ERR_UNIMPLEMENTED": "6", 20 | "OCF_FAILED_MASTER": "7", 21 | "OCF_NOT_RUNNING": "8", 22 | "OCF_RUNNING_MASTER": "9", 23 | } 24 | 25 | // All vars should be 0 initially 26 | if OCF_SUCCESS != 0 { 27 | t.Fatal("OCF_SUCCESS is not 0. 
Test is not starting from clean slate.") 28 | } 29 | 30 | for key, value := range requiredEnvironmentVariables { 31 | os.Setenv(key, value) 32 | } 33 | 34 | // All vars set to valid values 35 | err := ImportOcfExitCodes() 36 | if err != nil { 37 | t.Fatalf("Expected ImportOcfExitCodes to succeed but it failed: %s", err) 38 | } 39 | 40 | // One var not set 41 | os.Unsetenv("OCF_SUCCESS") 42 | err = ImportOcfExitCodes() 43 | if err == nil { 44 | t.Fatal("Expected ImportOcfExitCodes to fail but it succeeded") 45 | } 46 | if err.Error() != "OCF_SUCCESS is set to an invalid value []" { 47 | t.Fatalf("ImportOcfExitCodes did not fail with an error about OCF_SUCCESS being unset: %s", err.Error()) 48 | } 49 | 50 | // One var set to invalid value 51 | os.Setenv("OCF_SUCCESS", "A") 52 | err = ImportOcfExitCodes() 53 | if err == nil { 54 | t.Fatal("Expected ImportOcfExitCodes to fail but it succeeded") 55 | } 56 | if err.Error() != "OCF_SUCCESS is set to an invalid value [A]" { 57 | t.Fatalf("ImportOcfExitCodes did not fail with an error about OCF_SUCCESS being set to A: %s", err.Error()) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /go/src/mssqlcommon/lib_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | package mssqlcommon 4 | 5 | import ( 6 | "fmt" 7 | "testing" 8 | ) 9 | 10 | func TestDiagnose(t *testing.T) { 11 | t.Parallel() 12 | 13 | for _, system := range []bool{true, false} { 14 | for _, resource := range []bool{true, false} { 15 | for _, queryProcessing := range []bool{true, false} { 16 | // Local copies of loop variables for the closure to capture 17 | system := system 18 | resource := resource 19 | queryProcessing := queryProcessing 20 | 21 | t.Run(fmt.Sprintf("system = %t, resource = %t, queryProcessing = %t", system, resource, queryProcessing), func(t *testing.T) { 22 | t.Parallel() 23 | 24 | diagnostics := Diagnostics{System: system, Resource: resource, QueryProcessing: queryProcessing} 25 | err := Diagnose(diagnostics) 26 | 27 | if system && resource && queryProcessing { 28 | if err != nil { 29 | t.Fatalf("Expected Diagnose to succeed but it failed: %s", err) 30 | } 31 | } else { 32 | if err == nil { 33 | t.Fatal("Expected Diagnose to fail but it succeeded") 34 | } 35 | 36 | switch serverUnhealthyError := err.(type) { 37 | case *ServerUnhealthyError: 38 | if !system { 39 | if serverUnhealthyError.RawValue != ServerCriticalError { 40 | t.Fatalf("Diagnose did not fail with ServerCriticalError: %d", serverUnhealthyError.RawValue) 41 | } 42 | 43 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates 
system error" { 44 | t.Fatalf("Diagnose did not fail with an error about system error: %s", serverUnhealthyError.Inner.Error()) 45 | } 46 | } else if !resource { 47 | if serverUnhealthyError.RawValue != ServerModerateError { 48 | t.Fatalf("Diagnose did not fail with ServerModerateError: %d", serverUnhealthyError.RawValue) 49 | } 50 | 51 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates resource error" { 52 | t.Fatalf("Diagnose did not fail with an error about resource error: %s", serverUnhealthyError.Inner.Error()) 53 | } 54 | } else if !queryProcessing { 55 | if serverUnhealthyError.RawValue != ServerAnyQualifiedError { 56 | t.Fatalf("Diagnose did not fail with ServerAnyQualifiedError: %d", serverUnhealthyError.RawValue) 57 | } 58 | 59 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates query processing error" { 60 | t.Fatalf("Diagnose did not fail with an error about query processing error: %s", serverUnhealthyError.Inner.Error()) 61 | } 62 | } else { 63 | t.Fatal("Unreachable") 64 | } 65 | 66 | default: 67 | t.Fatal("Diagnose did not return an error of type ServerUnhealthyError") 68 | } 69 | } 70 | }) 71 | } 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains the source code of the Pacemaker resource agents that ship in the mssql-server-ha package. 2 | 3 | This is a snapshot of our SQL Server-internal repository where the actual development takes place. As the commit histories are completely different, we cannot currently accept pull requests to this repository. This snapshot is provided so that users can see the source of the agents, make changes to suit any specific scenarios they have, write agents for other clustering systems following the same protocol, and so on. 
We intend to migrate development to this repository at a future date. 4 | 5 | We're happy to receive bug reports, suggestions and feedback for the code in this repository as GitHub issues. 6 | 7 | Kubernetes agents for monitoring SQL Server instances and Availability Groups are also coming, and will be added to this repository at a future date. 8 | 9 | 10 | # Availability Group resource agent `ocf:mssql:ag` 11 | 12 | This is made up of a golang binary `go/src/ag-helper` and a shell script `ag/ag`. `ag-helper` can be built by running `GOPATH=$PWD/go go install ag-helper` 13 | 14 | The agent can be installed by moving the files to these locations: 15 | 16 | - `ag/ag` to `/usr/lib/ocf/resource.d/mssql/ag` 17 | - `ag/docs/*` to `/usr/lib/ocf/lib/mssql/*` 18 | - `go/bin/ag-helper` to `/usr/lib/ocf/lib/mssql/ag-helper` 19 | 20 | The shell script is the entry point for the resource agent and delegates to the helper binary for most tasks. The helper binary monitors the instance health by running `sp_server_diagnostics` and the AG health by querying `sys.databases`. It also implements the promote and demote actions by running the `ALTER AVAILABILITY GROUP FAILOVER` and `ALTER AVAILABILITY GROUP SET (ROLE = SECONDARY)` DDLs. 21 | 22 | 23 | Major changes since SQL2017: 24 | 25 | - Provide hostname support for ag and fci 26 | - Install External Lease in Pacemaker 27 | - Introduce external write lease handling to Pacemaker AG resource agent 28 | - Do not wait for databases to come online during failover 29 | - Bring secondaries offline in post promote 30 | - Various Pacemaker AG agent fixes for more reliable failovers 31 | 32 | # Failover Cluster Instance resource agent `ocf:mssql:fci` 33 | 34 | This is made up of a golang binary `go/src/fci-helper` and a shell script `fci/fci`.
`fci-helper` can be built by running `GOPATH=$PWD/go go install fci-helper` 35 | 36 | The agent can be installed by moving the files to these locations: 37 | 38 | - `fci/fci` to `/usr/lib/ocf/resource.d/mssql/fci` 39 | - `fci/docs/*` to `/usr/lib/ocf/lib/mssql/*` 40 | - `go/bin/fci-helper` to `/usr/lib/ocf/lib/mssql/fci-helper` 41 | 42 | The shell script is the entry point for the resource agent and handles starting and stopping the `sqlservr` process. The script invokes the `fci-helper` binary to fixup the server name after starting the resource (if necessary), and to monitor the instance health by running `sp_server_diagnostics` 43 | 44 | 45 | Major changes since SQL2017: 46 | 47 | - Ensure ag-helper and fci-helper exit if the resource agent process is killed 48 | - ag helper will reattempt connection if connection times out for monitor action 49 | 50 | 51 | # License 52 | 53 | MIT 54 | 55 | 56 | # Contributing 57 | 58 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 59 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 60 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 61 | -------------------------------------------------------------------------------- /go/src/mssqlcommon/ocf/lib.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | // Package ocf contains items related to SQL Server on Pacemaker. 
// OcfExitCode is an exit status defined by the OCF resource agent API.
type OcfExitCode int

// The OCF exit codes. Each one stays at its zero value until
// ImportOcfExitCodes has read it from the environment variable of the same
// name.
var (
	OCF_ERR_CONFIGURED    OcfExitCode
	OCF_ERR_GENERIC       OcfExitCode
	OCF_ERR_ARGS          OcfExitCode
	OCF_ERR_PERM          OcfExitCode
	OCF_ERR_UNIMPLEMENTED OcfExitCode
	OCF_FAILED_MASTER     OcfExitCode
	OCF_NOT_RUNNING       OcfExitCode
	OCF_RUNNING_MASTER    OcfExitCode
	OCF_SUCCESS           OcfExitCode
)

// ImportOcfExitCodes imports the OCF exit codes from the corresponding
// environment variables.
//
// The variables are read in a fixed order and the import stops at the first
// one that is unset or not an integer; the error names that variable. The
// variable that failed to parse is reset to 0, codes read before it keep their
// newly imported values, and codes after it are left untouched.
func ImportOcfExitCodes() error {
	// Order matters: it decides which variable is reported when several are bad.
	bindings := []struct {
		envName string
		target  *OcfExitCode
	}{
		{"OCF_ERR_CONFIGURED", &OCF_ERR_CONFIGURED},
		{"OCF_ERR_GENERIC", &OCF_ERR_GENERIC},
		{"OCF_ERR_ARGS", &OCF_ERR_ARGS},
		{"OCF_ERR_PERM", &OCF_ERR_PERM},
		{"OCF_ERR_UNIMPLEMENTED", &OCF_ERR_UNIMPLEMENTED},
		{"OCF_FAILED_MASTER", &OCF_FAILED_MASTER},
		{"OCF_NOT_RUNNING", &OCF_NOT_RUNNING},
		{"OCF_RUNNING_MASTER", &OCF_RUNNING_MASTER},
		{"OCF_SUCCESS", &OCF_SUCCESS},
	}

	for _, binding := range bindings {
		code, err := importOcfExitCode(binding.envName)

		// Store before checking err so that a failed import zeroes the code.
		*binding.target = code

		if err != nil {
			return err
		}
	}

	return nil
}

// importOcfExitCode reads the environment variable called name and converts it
// to an OcfExitCode. An unset variable reads as the empty string, which is not
// a valid integer and therefore produces an error as well.
func importOcfExitCode(name string) (OcfExitCode, error) {
	rawValue := os.Getenv(name)

	if parsed, err := strconv.Atoi(rawValue); err == nil {
		return OcfExitCode(parsed), nil
	}

	return 0, fmt.Errorf("%s is set to an invalid value [%s]", name, rawValue)
}

// Exit logs err (when non-nil) through logger and then terminates the process
// with exitCode.
//
// os.Exit does not return, so the trailing return is never reached; the error
// result exists so that callers can write `return Exit(...)`.
func Exit(logger *log.Logger, exitCode int, err error) error {
	var lines []string
	if err != nil {
		// Log line by line so that every line carries the logger prefix.
		lines = strings.Split(err.Error(), "\n")
	}

	for _, line := range lines {
		logger.Println(line)
	}

	os.Exit(exitCode)

	return nil
}

// KillCurrentProcessWhenParentExits uses prctl to request that the current process receive a SIGKILL if its parent process dies.
// This ensures that the helper processes spawned by the resource agent shell script get cleaned up if Pacemaker kills the
// resource agent's shell process (due to op timeout, etc.)
//
// This uses syscall.Syscall instead of unix.Prctl since we don't have golang.org/x/sys/unix
func KillCurrentProcessWhenParentExits() error {
	if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); errno != 0 {
		return fmt.Errorf("prctl failed with errno %d", errno)
	}

	return nil
}

// OcfExit logs err (when non-nil) and terminates the process on behalf of an
// OCF action.
//
// To distinguish OCF exit codes from other exit codes of the process, the
// actual exit code is 10 + the OCF exit code.
func OcfExit(logger *log.Logger, ocfExitCode OcfExitCode, err error) error {
	shiftedExitCode := int(ocfExitCode) + 10

	return Exit(logger, shiftedExitCode, err)
}
The resource agent will first attempt to kill SQL Server politely (using a TERM signal). SQL Server will then checkpoint all databases and exit. If the SQL Server process has not exited by the timeout then the resource agent will attempt to forcibly kill it by sending SIGKILL. 62 | 63 | Timeout for stopping the SQL Server process. 64 | 65 | 66 | 67 | 68 | Credentials for a SQL Server user the resource agent will log in as to execute a stored procedure to monitor instance health. This file should contain two lines, the first with the username and second with the password. 69 | 70 | Location of file containing a SQL Server user credential the resource agent can use. 71 | 72 | 73 | 74 | Port SQL Server listens on. 75 | Port 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /go/src/fci-helper/main.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | package main 4 | 5 | import ( 6 | "database/sql" 7 | "errors" 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "strings" 13 | "time" 14 | 15 | _ "github.com/denisenkom/go-mssqldb" 16 | 17 | "mssqlcommon" 18 | mssqlocf "mssqlcommon/ocf" 19 | ) 20 | 21 | /* 22 | Program to be called from the mssql:fci resource agent to monitor SQL Server health. 
23 | 24 | Determines the health of the specified SQL Server instance based on 25 | 1) whether a connection can be established to the instance, and 26 | 2) the results of the 'sp_server_diagnostics' stored procedure 27 | */ 28 | 29 | func main() { 30 | stdout := log.New(os.Stdout, "", log.LstdFlags) 31 | stderr := log.New(os.Stderr, "ERROR: ", log.LstdFlags) 32 | 33 | err := mssqlocf.KillCurrentProcessWhenParentExits() 34 | if err != nil { 35 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err)) 36 | } 37 | 38 | err = doMain(stdout, stderr) 39 | if err != nil { 40 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err)) 41 | } 42 | } 43 | 44 | func doMain(stdout *log.Logger, stderr *log.Logger) error { 45 | var ( 46 | hostname string 47 | sqlPort uint64 48 | credentialsFile string 49 | applicationName string 50 | rawConnectionTimeout int64 51 | rawHealthThreshold uint 52 | rawMonitorTimeout int64 53 | 54 | action string 55 | 56 | virtualServerName string 57 | ) 58 | 59 | flag.StringVar(&hostname, "hostname", "localhost", "The hostname of the SQL Server instance to connect to. Default: localhost") 60 | flag.Uint64Var(&sqlPort, "port", 0, "The port on which the instance is listening for logins.") 61 | flag.StringVar(&credentialsFile, "credentials-file", "", "The path to the credentials file.") 62 | flag.StringVar(&applicationName, "application-name", "", "The application name to use for the T-SQL connection.") 63 | flag.Int64Var(&rawConnectionTimeout, "connection-timeout", 30, "The connection timeout in seconds. "+ 64 | "The application will retry connecting to the instance until this time elapses. Default: 30") 65 | flag.UintVar(&rawHealthThreshold, "health-threshold", uint(mssqlcommon.ServerCriticalError), "The instance health threshold. Default: 3 (SERVER_CRITICAL_ERROR)") 66 | 67 | flag.StringVar(&action, "action", "", `One of --start, --monitor 68 | start: Start the replica on this node. 
69 | monitor: Monitor the replica on this node.`) 70 | 71 | flag.StringVar(&virtualServerName, "virtual-server-name", "", "The virtual server name that should be set on the SQL Server instance.") 72 | flag.Int64Var(&rawMonitorTimeout, "monitor-interval-timeout", 0, "The monitor interval timeout in seconds. "+ 73 | "For FCI this is expected to be always Default: 0") 74 | 75 | flag.Parse() 76 | 77 | stdout.Printf( 78 | "fci-helper invoked with hostname [%s]; port [%d]; credentials-file [%s]; application-name [%s]; connection-timeout [%d]; health-threshold [%d]; action [%s]\n", 79 | hostname, sqlPort, 80 | credentialsFile, 81 | applicationName, 82 | rawConnectionTimeout, rawHealthThreshold, 83 | action) 84 | 85 | switch action { 86 | case "start": 87 | stdout.Printf( 88 | "fci-helper invoked with virtual-server-name [%s]\n", 89 | virtualServerName) 90 | 91 | case "monitor": 92 | stdout.Printf( 93 | "fci-helper invoked with virtual-server-name [%s]\n", 94 | virtualServerName) 95 | } 96 | 97 | if hostname == "" { 98 | return errors.New("a valid hostname must be specified using --hostname") 99 | } 100 | 101 | if sqlPort == 0 { 102 | return errors.New("a valid port number must be specified using --port") 103 | } 104 | 105 | if credentialsFile == "" { 106 | return errors.New("a valid path to a credentials file must be specified using --credentials-file") 107 | } 108 | 109 | if applicationName == "" { 110 | return errors.New("a valid application name must be specified using --application-name") 111 | } 112 | 113 | if action == "" { 114 | return errors.New("a valid action must be specified using --action") 115 | } 116 | 117 | if action == "start" || action == "monitor" { 118 | if virtualServerName == "" { 119 | return errors.New("a valid virtual server name must be specified using --virtual-server-name") 120 | } 121 | } 122 | 123 | err := mssqlocf.ImportOcfExitCodes() 124 | if err != nil { 125 | return err 126 | } 127 | 128 | connectionTimeout := 
time.Duration(rawConnectionTimeout) * time.Second 129 | monitorTimeout := time.Duration(rawMonitorTimeout) * time.Second 130 | healthThreshold := mssqlcommon.ServerHealth(rawHealthThreshold) 131 | 132 | sqlUsername, sqlPassword, err := mssqlcommon.ReadCredentialsFile(credentialsFile) 133 | if err != nil { 134 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_ARGS, fmt.Errorf("Could not read credentials file: %s", err)) 135 | } 136 | 137 | db, err := mssqlcommon.OpenDBWithHealthCheck( 138 | hostname, sqlPort, 139 | sqlUsername, sqlPassword, 140 | applicationName, 141 | connectionTimeout, connectionTimeout, 142 | monitorTimeout, 143 | stdout) 144 | if err != nil { 145 | switch serverUnhealthyError := err.(type) { 146 | case *mssqlcommon.ServerUnhealthyError: 147 | if serverUnhealthyError.RawValue <= healthThreshold { 148 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_GENERIC, fmt.Errorf( 149 | "Instance health status %d is at or below the threshold value of %d", 150 | serverUnhealthyError.RawValue, healthThreshold)) 151 | } 152 | 153 | stdout.Printf("Instance health status %d is greater than the threshold value of %d\n", serverUnhealthyError.RawValue, healthThreshold) 154 | 155 | default: 156 | return err 157 | } 158 | } 159 | defer db.Close() 160 | 161 | var ocfExitCode mssqlocf.OcfExitCode 162 | 163 | switch action { 164 | case "start": 165 | ocfExitCode, err = start(db, virtualServerName, stdout) 166 | 167 | case "monitor": 168 | ocfExitCode, err = monitor(db, virtualServerName, stdout) 169 | 170 | default: 171 | return fmt.Errorf("unknown value for --action %s", action) 172 | } 173 | 174 | return mssqlocf.OcfExit(stderr, ocfExitCode, err) 175 | } 176 | 177 | // Function: start 178 | // 179 | // Description: 180 | // Implements the OCF "start" action 181 | // 182 | func start(db *sql.DB, virtualServerName string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) { 183 | stdout.Printf("Setting local server name to %s...\n", virtualServerName) 184 | 185 | err 
:= mssqlcommon.SetLocalServerName(db, virtualServerName) 186 | if err != nil { 187 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not set local server name: %s", err) 188 | } 189 | 190 | return monitor(db, virtualServerName, stdout) 191 | } 192 | 193 | // Function: monitor 194 | // 195 | // Description: 196 | // Implements the OCF "monitor" action 197 | // 198 | func monitor(db *sql.DB, virtualServerName string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) { 199 | stdout.Println("Querying local server name...") 200 | 201 | currentServerName, err := mssqlcommon.GetLocalServerName(db) 202 | if err != nil { 203 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query local server name: %s", err) 204 | } 205 | 206 | stdout.Printf("Local server name is %s\n", currentServerName) 207 | 208 | if !strings.EqualFold(currentServerName, virtualServerName) { 209 | return mssqlocf.OCF_ERR_ARGS, fmt.Errorf("Expected local server name to be %s but it was %s", virtualServerName, currentServerName) 210 | } 211 | 212 | return mssqlocf.OCF_SUCCESS, nil 213 | } 214 | -------------------------------------------------------------------------------- /ag/docs/ag_metadata: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 1.0 5 | 6 | Stateful resource agent for a SQL Server Availability Group. 7 | 8 | Availability Group resource agent. 9 | 10 | 11 | 12 | The name of the Availability Group that this resource will represent. 13 | 14 | Name of the AG. 15 | 16 | 17 | 18 | 19 | Login and query execution timeout in seconds. Default: 30 20 | 21 | The value of this parameter should be higher than the longest time it takes for any database in the AG to complete recovery. 22 | For example, if a database in the AG may take up to about 2 minutes to recover, this parameter should be set to 120 or higher. 23 | 24 | Login and query execution timeout. 
25 | 26 | 27 | 28 | 29 | If the primary is unable to commit AG configuration updates with a sufficient number of other SYNCHRONOUS_COMMIT and CONFIGURATION_ONLY replicas in the AG, the agent will disable the primary by not renewing the primary's write lease. If the time in seconds specified by this value has passed since the primary was unable to commit a configuration update, the agent will stop renewing the primary's write lease, causing all pending transactions to fail and the DB to become inaccessible. Default: 60 30 | 31 | How long the primary can block on committing a configuration update before the agent stops renewing its write lease. 32 | 33 | 34 | 35 | This parameter is unused and only kept for backward-compatibility. Set connection_timeout instead. 36 | Unused. 37 | 38 | 39 | 40 | 41 | Monitoring policy options are: 42 | 43 | 1) SERVER_UNRESPONSIVE_OR_DOWN: Fail if the SQL Server instance is unresponsive (unable to establish a connection) or down (the process is not running) 44 | 3) SERVER_CRITICAL_ERROR: Fail if sp_server_diagnostics detects a critical system error 45 | 4) SERVER_MODERATE_ERROR: Fail if sp_server_diagnostics detects a critical system or resource error 46 | 5) SERVER_ANY_QUALIFIED_ERROR: Fail if sp_server_diagnostics detects any qualified error 47 | 48 | Monitoring policy 49 | 50 | 51 | 52 | This parameter is deprecated. Set connection_timeout instead. 53 | Deprecated. 54 | 55 | 56 | 57 | 58 | Path to a file containing the credentials for a SQL Server user. The resource agent will login using these credentials to perform actions against the instance. 59 | 60 | This file should contain two lines separated by LF. The first line should have the username, and the second line should have the password. 61 | 62 | Path to a file containing the credentials for a SQL Server user. 63 | 64 | 65 | 66 | 67 | This parameter is unused. Set the timeouts of the start, monitor and promote actions instead. 
68 | 69 | This parameter used to control how long the resource agent waited for all databases of an AG to be ONLINE on a primary replica with DB_FAILOVER = ON. 70 | 71 | The resource agent has been changed to wait indefinitely, until the corresponding action (start / monitor / promote) times out, so this parameter is no longer used. 72 | 73 | Unused. 74 | 75 | 76 | 77 | The TSQL port that the SQL Server instance listens on. 78 | TSQL port 79 | 80 | 81 | 82 | 83 | If set, the agent will renew the primary's AG write lease to this value in seconds. Otherwise the agent will set it based on the monitor action's interval and timeout. 84 | 85 | It is recommended to set this value greater than the sum of the monitor action's interval and timeout. 86 | 87 | Primary write lease duration. 88 | 89 | 90 | 91 | The name of the SQL Server process. Default: sqlservr 92 | The name of the SQL Server process. 93 | 94 | 95 | 96 | This parameter is deprecated. Set required_synchronized_secondaries_to_commit instead. 97 | Deprecated. 98 | 99 | 100 | 101 | 102 | If set, the agent will set REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT to this value. Otherwise the agent will calculate a value based on the number of SYNCHRONOUS_COMMIT replicas. 103 | 104 | Override for the default REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT value. 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /fci/test/fci_test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C) Microsoft Corporation. 4 | # 5 | # Some simple functional tests for the resource agent. 6 | # Should only be run when nothing else will be starting or stopping SQL Server. 7 | # Any of the default values set in the resource agent can be overridden by exporting an env variable with the same name.
8 | # This even applies to paths of other files the resource agent uses. 9 | # So, for example, setting $FCI_HELPER_BIN allows use of another monitoring script 10 | # 11 | # The --ra-output parameter specifies where resource agent output is piped to, 12 | # --test-output specifies where test output is piped to. 13 | # 14 | # By default, output from the resource agent is suppressed and output from the test goes to the terminal. 15 | # 16 | 17 | usage() { 18 | cat<<-EOF 19 | fci_test [--ra-output --test-output --test-cases "testcase_1 testcase_2"] 20 | 21 | The default behavior is to run all tests. If any specific test cases are specified only those will be run. 22 | Expects resource agent and SQL Server to be installed. To test basic ops, run "fci_test --test-cases run_start_monitor_stop" 23 | EOF 24 | } 25 | 26 | MODE="test" 27 | RA_OUTPUT=/dev/null 28 | TEST_LOGS=/dev/null 29 | TEST_OUTPUT=/dev/tty 30 | ARGS_TEST_CASES="" 31 | 32 | OCF_RESOURCE_INSTANCE='fci1' 33 | OCF_RESKEY_CRM_meta_timeout='30000' 34 | 35 | while [[ $# -gt 1 ]]; do 36 | case "$1" in 37 | --ra-output) 38 | RA_OUTPUT="$2" 39 | ;; 40 | --test-output) 41 | TEST_OUTPUT="$2" 42 | ;; 43 | --test-log) 44 | TEST_LOGS="$2" 45 | ;; 46 | --test-cases) 47 | ARGS_TEST_CASES="$2" 48 | ;; 49 | *) 50 | usage 51 | exit 1 52 | ;; 53 | esac 54 | shift 2 55 | done 56 | 57 | if [[ $# -ne 0 ]]; then 58 | usage 59 | exit 1 60 | fi 61 | 62 | # By default use the resource agent that's installed 63 | # Override the FCI environment variable to use another 64 | # 65 | : ${FCI=/usr/lib/ocf/resource.d/mssql/fci} 66 | . ${FCI} 67 | 68 | 69 | printerror() { 70 | echo "$@" &>> $TEST_OUTPUT 71 | } 72 | run_agent() { 73 | local output rc 74 | "$@" &>> $RA_OUTPUT 75 | return $? 76 | } 77 | error() { 78 | printerror "***Error encountered in ${FUNCNAME[1]} at ${BASH_LINENO[0]}:***" 79 | printerror "$@" 80 | printerror "" 81 | } 82 | log() { 83 | echo "$*" &>> $TEST_LOGS 84 | } 85 | 86 | cleanup() { 87 | if ! 
run_agent mssql_stop; then 88 | error "mssql_stop failed; cannot cleanup." 89 | return 1 90 | fi 91 | } 92 | 93 | # expected cases 94 | # start, monitor, stop, all in order 95 | run_start_monitor_stop() { 96 | local rc 97 | 98 | run_agent mssql_start 99 | rc=$? 100 | if [[ $rc -ne $OCF_SUCCESS ]]; then 101 | error "mssql_start returned with $rc, expected value: $OCF_SUCCESS" 102 | return 1 103 | fi 104 | 105 | if ! pidof $OCF_RESKEY_binary >/dev/null; then 106 | error "Expects $OCF_RESKEY_binary to be running" 107 | return 1 108 | fi 109 | 110 | run_agent mssql_monitor 111 | rc=$? 112 | if [[ $rc -ne $OCF_SUCCESS ]]; then 113 | error "mssql_monitor returned with $rc, expected value: $OCF_SUCCESS" 114 | return 1 115 | fi 116 | 117 | run_agent mssql_stop 118 | rc=$? 119 | if [[ $rc -ne $OCF_SUCCESS ]]; then 120 | error "mssql_stop returned with $rc, expected value: $OCF_SUCCESS" 121 | return 1 122 | fi 123 | } 124 | 125 | # start, monitor, kill SQL Server, monitor again, stop, monitor again 126 | # after killing SQL Server monitor should return OCF_ERR_GENERIC 127 | # stop should succeed 128 | # after running stop monitor should return OCF_NOT_RUNNING 129 | run_start_monitor_kill_monitor_stop() { 130 | local rc pids 131 | 132 | run_agent mssql_start 133 | rc=$? 134 | if [[ $rc -ne $OCF_SUCCESS ]]; then 135 | error "mssql_monitor returned with $rc, expected value: $OCF_SUCCESS" 136 | return 1 137 | fi 138 | 139 | run_agent mssql_monitor 140 | rc=$? 141 | if [[ $rc -ne $OCF_SUCCESS ]]; then 142 | error "mssql_monitor returned with $rc, expected value: $OCF_SUCCESS" 143 | return 1 144 | fi 145 | 146 | # kill SQL 147 | pids=$(pidof "$OCF_RESKEY_binary") 148 | for pid in $pids; do 149 | kill "$pid" 150 | done 151 | 152 | run_agent mssql_monitor 153 | rc=$? 154 | if [[ $rc -ne $OCF_ERR_GENERIC ]]; then 155 | error "mssql_monitor returned $rc, expected $OCF_ERR_GENERIC" 156 | return 1 157 | fi 158 | 159 | # this should succeed 160 | run_agent mssql_stop 161 | rc=$? 
162 | if [[ $rc -ne $OCF_SUCCESS ]]; then 163 | error "mssql_stop returned with $rc, expected value: $OCF_SUCCESS" 164 | return 1 165 | fi 166 | 167 | # now monitor should return not_running 168 | run_agent mssql_monitor 169 | rc=$? 170 | if [[ $rc != $OCF_NOT_RUNNING ]]; then 171 | error "mssql_monitor returned $rc, expected $OCF_NOT_RUNNING" 172 | return 1 173 | fi 174 | } 175 | 176 | # running start twice should succeed both times 177 | run_start_twice() { 178 | run_agent mssql_start 179 | rc=$? 180 | if [[ $rc -ne $OCF_SUCCESS ]]; then 181 | error "mssql_start returned with $rc, expected value: $OCF_SUCCESS" 182 | return 1 183 | fi 184 | 185 | run_agent mssql_start 186 | rc=$? 187 | if [[ $rc -ne $OCF_SUCCESS ]]; then 188 | error "mssql_start returned with $rc, expected value: $OCF_SUCCESS" 189 | return 1 190 | fi 191 | 192 | if ! pidof $OCF_RESKEY_binary >/dev/null; then 193 | error "Expects $OCF_RESKEY_binary to be running" 194 | return 1 195 | fi 196 | } 197 | 198 | # running stop twice should succeed both times 199 | run_stop_twice() { 200 | run_agent mssql_start 201 | rc=$? 202 | if [[ $rc -ne $OCF_SUCCESS ]]; then 203 | error "mssql_start returned with $rc, expected value: $OCF_SUCCESS" 204 | return 1 205 | fi 206 | 207 | run_agent mssql_stop 208 | rc=$? 209 | if [[ $rc -ne $OCF_SUCCESS ]]; then 210 | error "mssql_stop returned with $rc, expected value: $OCF_SUCCESS" 211 | return 1 212 | fi 213 | 214 | run_agent mssql_stop 215 | rc=$? 
216 | if [[ $rc -ne $OCF_SUCCESS ]]; then 217 | error "mssql_stop returned with $rc, expected value: $OCF_SUCCESS" 218 | return 1 219 | fi 220 | 221 | if pidof $OCF_RESKEY_binary; then 222 | error "Expects $OCF_RESKEY_binary not to be running" 223 | return 1 224 | fi 225 | } 226 | 227 | # Stop operation should succeed even in the middle of a start operation 228 | # Start operation will probably exit with an error (although we don't really care) 229 | stop_during_start() { 230 | local start_pid 231 | local seconds="0.2 0.5 1 3 10" 232 | 233 | for i in $seconds; do 234 | run_agent mssql_start & 235 | start_pid=$! 236 | log "mssql_start started with pid: $start_pid" 237 | 238 | sleep "$i" 239 | log "Running mssql_stop" 240 | if ! run_agent mssql_stop; then 241 | error "Expects mssql_stop to succeed $i seconds after running mssql_start" 242 | return 1 243 | fi 244 | 245 | wait $start_pid 246 | rc=$? 247 | log "mssql_start finished with exit code: $rc. Most likely (but not necessarily) will be non-zero." 248 | 249 | if pidof $OCF_RESKEY_binary; then 250 | error "Expects no SQL Server processes to be running" 251 | return 1 252 | fi 253 | done 254 | } 255 | 256 | # Stop operation should succeed in the middle of a monitor op 257 | # Monitor will probably exit with an error (although we don't really care) 258 | stop_during_monitor() { 259 | local seconds="0.2 1 3" 260 | 261 | for i in $seconds; do 262 | run_agent mssql_start 263 | 264 | run_agent mssql_monitor & 265 | start_pid=$! 266 | log "mssql_monitor started with pid: $start_pid" 267 | 268 | sleep $i 269 | log "Running mssql_stop" 270 | if ! run_agent mssql_stop; then 271 | error "Expects mssql_stop to succeed $i seconds after running mssql_start" 272 | return 1 273 | fi 274 | 275 | wait $start_pid 276 | rc=$? 277 | log "mssql_monitor finished with exit code: $rc. Most likely (but not necessarily) will be non-zero." 
278 | 279 | if pidof $OCF_RESKEY_binary; then 280 | error "Expects no SQL Server processes to be running" 281 | return 1 282 | fi 283 | done 284 | } 285 | 286 | # sometimes pacemaker will cancel the monitoring op (by killing the process), and then running "stop" 287 | # the stop should always succeed 288 | cancel_monitor() { 289 | local seconds="0.2 1 3" 290 | 291 | for i in $seconds; do 292 | run_agent mssql_start 293 | 294 | run_agent mssql_monitor & 295 | start_pid=$! 296 | log "mssql_monitor started with pid: $start_pid" 297 | 298 | sleep $i 299 | log "Killing the monitor action:" 300 | kill "$start_pid" 301 | 302 | log "Running mssql_stop" 303 | if ! run_agent mssql_stop; then 304 | error "Expects mssql_stop to succeed $i seconds after running mssql_monitor" 305 | return 1 306 | fi 307 | done 308 | 309 | } 310 | 311 | test_cases=(run_start_monitor_stop 312 | run_start_monitor_kill_monitor_stop 313 | run_start_twice 314 | run_stop_twice 315 | stop_during_start 316 | stop_during_monitor) 317 | 318 | run_test() { 319 | local rc 320 | if ! type "$1" &>/dev/null; then 321 | echo "Test: $1 does not exist" 322 | fi 323 | 324 | echo "Running test: \"$1\"" 325 | cleanup 326 | "$1" 327 | rc=$? 328 | echo "Test: \"$1\" exited with return code: $rc" 329 | } 330 | 331 | if [[ "x$ARGS_TEST_CASES" != "x" ]]; then 332 | for i in $ARGS_TEST_CASES; do 333 | run_test "$i" 334 | done 335 | else 336 | for i in "${test_cases[@]}"; do 337 | run_test "$i" 338 | done 339 | fi 340 | -------------------------------------------------------------------------------- /fci/fci: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C) Microsoft Corporation. 
4 | # 5 | # SQL Server Failover Cluster Instance resource agent 6 | # 7 | # Valid actions are: 8 | # start 9 | # stop 10 | # monitor 11 | # validate-all 12 | # meta-data 13 | # 14 | 15 | # ---------------------------------------------------------------------------------------------------------- 16 | # functions: test_mode 17 | # 18 | # Description: 19 | # If the env variable MODE == "test", then this file can be sourced for debugging 20 | # 21 | test_mode() { 22 | [[ "$MODE" = 'test' ]] 23 | } 24 | 25 | # ---------------------------------------------------------------------------------------------------------- 26 | # 27 | if test_mode; then 28 | : "${OCF_ROOT=/usr/lib/ocf}" 29 | fi 30 | 31 | # Subtract this from the exit code of fci-helper to get the OCF exit code. 32 | # This is the inverse of the operation performed by `mssqlcommon.OcfExit` used by fci-helper. 33 | OCF_EXIT_DIFFERENCE=10 34 | 35 | # ---------------------------------------------------------------------------------------------------------- 36 | # Pacemaker libraries 37 | # 38 | : "${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}" 39 | . 
"$OCF_FUNCTIONS" 40 | : "${__OCF_ACTION=$1}" 41 | 42 | # ---------------------------------------------------------------------------------------------------------- 43 | # Location of other files used by this resource agent 44 | # 45 | : "${FCI_HELPER_BIN=${OCF_ROOT}/lib/mssql/fci-helper}" 46 | : "${METADATA_FILE=${OCF_ROOT}/lib/mssql/fci_metadata}" 47 | : "${MSSQL_RA_USAGE_FILE=${OCF_ROOT}/lib/mssql/fci_usage}" 48 | 49 | # ---------------------------------------------------------------------------------------------------------- 50 | # Defaults values for optional parameters 51 | # 52 | : "${WORKING_DIR_DEFAULT=/var/opt/mssql}" 53 | : "${MONITOR_LEVEL_DEFAULT=3}" 54 | : "${MONITOR_TIMEOUT_DEFAULT=20}" 55 | : "${MONITORING_CREDENTIALS_FILE_DEFAULT=${WORKING_DIR_DEFAULT}/secrets/passwd}" 56 | : "${PORT_DEFAULT=1433}" 57 | : "${INSTANCE_FILE_DEFAULT=${HA_VARRUN%%/}/mssql-${OCF_RESOURCE_INSTANCE}.pid}" 58 | : "${BINARY_DEFAULT=/opt/mssql/bin/sqlservr}" 59 | : "${USER_DEFAULT=mssql}" 60 | : "${STOP_TIMEOUT_DEFAULT=19}" 61 | : "${MSSQL_ARGS_DEFAULT=""}" 62 | 63 | # ---------------------------------------------------------------------------------------------------------- 64 | # function: mssql_meta_data 65 | # 66 | # Description: 67 | # Implements the OCF "meta-data" action. 68 | # 69 | mssql_meta_data() { 70 | cat "$METADATA_FILE" 71 | } 72 | 73 | # ---------------------------------------------------------------------------------------------------------- 74 | # function: usage 75 | # 76 | mssql_usage() { 77 | cat "$MSSQL_RA_USAGE_FILE" 78 | } 79 | 80 | # ---------------------------------------------------------------------------------------------------------- 81 | # function: mssql_start_process 82 | # 83 | # Description: 84 | # Starts SQL Server as user $OCF_RESKEY_user in $OCF_RESKEY_working_dir with args $OCF_RESKEY_mssql_args. 85 | # Prints out the pid of the SQL Server process. 86 | # 87 | # Returns: 88 | # Result code from the su -c command used to start SQL Server. 
89 | # 90 | mssql_start_process() { 91 | local pid rc 92 | 93 | # go to working directory we will run sql from 94 | # pushd prints the directory stack on stdout, which the caller captures, so silence it 95 | pushd "$OCF_RESKEY_working_dir" >/dev/null 96 | 97 | mssql_start_command="$OCF_RESKEY_binary $OCF_RESKEY_mssql_args" 98 | 99 | # su - -s /bin/bash "$OCF_RESKEY_user": run bash as configured user 100 | # -c: execute command and exit bash 101 | # $mssql_start_command >/dev/null 2>&1: run SQL Server in background, discarding stdout and stderr 102 | # echo $!: print out the PID of the new SQL Server process 103 | # 104 | pid="$(su - -s /bin/bash "$OCF_RESKEY_user" -c "$mssql_start_command >/dev/null 2>&1 & echo \$!")" 105 | rc="$?" 106 | 107 | ocf_log info "SQL Server started. PID: $pid; user: $OCF_RESKEY_user; command: $mssql_start_command" 108 | 109 | # leave $OCF_RESKEY_working_dir 110 | # popd also prints the directory stack, so silence it too 111 | popd >/dev/null 112 | echo "$pid" # the PID is this function's only stdout output 113 | return "$rc" 114 | } 115 | 116 | # ---------------------------------------------------------------------------------------------------------- 117 | # function: mssql_start 118 | # 119 | # Description: 120 | # Implements the OCF "start" action. 121 | # Starts the SQL Server process and waits until recovery completes (we can log in and execute a query). 122 | # Creates the empty file: $OCF_RESKEY_status_file_path 123 | # 124 | # Returns: 125 | # OCF_SUCCESS: SQL Server successfully started or was already running 126 | # OCF_ERR_ARGS: User account or working directory don't exist, or starting SQL Server failed 127 | # OCF_ERR_PERM: We were completely unable to start the SQL Server process. 128 | # OCF_ERR_GENERIC: The SQL Server process crashed during startup. 129 | # 130 | mssql_start() { 131 | local rc 132 | ocf_log info 'mssql_start' 133 | 134 | # Check if the SQL Server process is already running. 135 | # 136 | if pid="$(get_processes)"; then 137 | ocf_exit_reason "SQL Server processes already running with pids: $pid." 138 | return "$OCF_SUCCESS" 139 | fi 140 | 141 | # Ensure that the working directory exists 142 | # 143 | if [[ ! 
-d "$OCF_RESKEY_working_dir" ]]; then 144 | ocf_exit_reason "Working directory doesn't exist: $OCF_RESKEY_working_dir" 145 | return "$OCF_ERR_ARGS" 146 | fi 147 | 148 | # Check that the user to run SQL Server as exists 149 | # 150 | if ! id "$OCF_RESKEY_user" >/dev/null; then 151 | ocf_exit_reason "Invalid user: $OCF_RESKEY_user" 152 | return "$OCF_ERR_ARGS" 153 | fi 154 | 155 | # Start the SQL Server process 156 | # 157 | if ! pid="$(mssql_start_process)"; then 158 | # We get here if a problem was encountered while starting the process itself. 159 | # This is probably not a transient error. 160 | return "$OCF_ERR_PERM" 161 | fi 162 | 163 | # Logins will fail until recovery completes. 164 | # Retry in an infinite loop until we get a login. 165 | # 166 | while :; do 167 | sleep 1 168 | 169 | # Recognize that the SQL Server process crashed and exit the loop 170 | # 171 | if ! get_processes; then 172 | ocf_exit_reason 'SQL Server crashed during startup.' 173 | return "$OCF_ERR_GENERIC" 174 | fi 175 | 176 | local command_output 177 | local rc 178 | 179 | command_output="$( 180 | "$FCI_HELPER_BIN" \ 181 | --port "$OCF_RESKEY_port" \ 182 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 183 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-start" \ 184 | --connection-timeout "$OCF_RESKEY_monitor_timeout" \ 185 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 186 | --action start \ 187 | --virtual-server-name "$OCF_RESOURCE_INSTANCE" \ 188 | 2>&1 | 189 | while read -r line; do 190 | ocf_log info "start: $line" 191 | echo "$line" 192 | done 193 | exit "${PIPESTATUS[0]}" 194 | )" 195 | rc="$?" 
196 | 197 | set_exit_reason "$command_output" 198 | 199 | if (( rc < OCF_EXIT_DIFFERENCE )); then 200 | # fci-helper failed in an unexpected way 201 | # 202 | return "$OCF_ERR_GENERIC" 203 | fi 204 | 205 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 206 | 207 | case "$rc" in 208 | "$OCF_SUCCESS") 209 | mark_started 210 | 211 | return "$OCF_SUCCESS" 212 | ;; 213 | "$OCF_ERR_GENERIC") 214 | # Retry 215 | ;; 216 | *) 217 | # Unexpected error 218 | return "$rc" 219 | ;; 220 | esac 221 | 222 | done 223 | } 224 | 225 | # ---------------------------------------------------------------------------------------------------------- 226 | # function: mssql_stop 227 | # 228 | # Description: 229 | # Implements the ocf "stop" action 230 | # Calls "ocf_stop_processes" 231 | # Removes the file: $OCF_RESKEY_status_file 232 | # 233 | # Returns: 234 | # OCF_SUCCESS: SQL Server process exited (or it was already stopped) 235 | # OCF_ERR_GENERIC: Even after SIGKILL, SQL Server refused to die 236 | # 237 | mssql_stop() { 238 | ocf_log info 'mssql_stop' 239 | 240 | # Delete the file we created when starting SQL Server so future invocations of the resource agent know 241 | # that SQL Server was stopped from pacemaker. 242 | # 243 | mark_stopped 244 | 245 | local rc processes stop_timeout 246 | 247 | if ! processes="$(get_processes)"; then 248 | ocf_exit_reason 'SQL Server is not running.' 249 | return "$OCF_SUCCESS" 250 | fi 251 | 252 | # ocf_stop_process will first send a "TERM" signal, wait (stop_timeout / 2) seconds for SQL to exit. 253 | # If it hasn't, then it will send a KILL signal and wait another (stop_timeout) / 2 seconds. 
254 | # 255 | stop_timeout="$OCF_RESKEY_stop_timeout" 256 | processes="$(get_processes)" 257 | ocf_log info "Attempting to stop SQL Server processes with pids: $processes and timeout $stop_timeout" 258 | 259 | ocf_stop_processes 'TERM KILL' "$stop_timeout" "$processes" 260 | 261 | # SQL Server wouldn't die 262 | # 263 | if get_processes; then 264 | ocf_exit_reason 'SQL Server is still running.' 265 | return "$OCF_ERR_GENERIC" 266 | fi 267 | 268 | return "$OCF_SUCCESS" 269 | } 270 | 271 | # ---------------------------------------------------------------------------------------------------------- 272 | # function: mssql_monitor 273 | # 274 | # Description: 275 | # Implements the ocf "monitor" action 276 | # 1. Checks if the SQL Server process is running. 277 | # 2. Calls the "fci-helper" binary to monitor the instance health and determine whether a failover or restart is necessary 278 | # 279 | # Returns: 280 | # OCF_SUCCESS: SQL Server process is running, healthier than the user specified failover threshold 281 | # OCF_ERR_GENERIC: SQL Server is not running and pacemaker didn't stop it, SQL Server is unresponsive, 282 | # or SQL Server is less healthy than user specified failover or restart threshold. 283 | # OCF_NOT_RUNNING: SQL Server process is not running and pacemaker stopped it. 284 | # 285 | mssql_monitor() { 286 | ocf_log info 'mssql_monitor' 287 | 288 | if (( OCF_RESKEY_CRM_meta_timeout / 1000 <= OCF_RESKEY_monitor_timeout )); then 289 | ocf_exit_reason "The monitor action should have a higher timeout than the 'monitor_timeout' resource option" 290 | return "$OCF_ERR_CONFIGURED" 291 | fi 292 | 293 | if ! get_processes; then 294 | # If we did not stop SQL Server (mssql_stop calls mark_stopped, which removes the status file) 295 | # 296 | if is_marked_started; then 297 | ocf_exit_reason 'SQL Server process crashed.' 298 | return "$OCF_ERR_GENERIC" 299 | fi 300 | 301 | return "$OCF_NOT_RUNNING" 302 | fi 303 | 304 | if ! 
is_marked_started; then 305 | mark_started 306 | fi 307 | 308 | # SQL Server is running. Monitor it. 309 | # 310 | local command_output 311 | local rc 312 | 313 | command_output="$( 314 | "$FCI_HELPER_BIN" \ 315 | --port "$OCF_RESKEY_port" \ 316 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 317 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-monitor" \ 318 | --connection-timeout "$OCF_RESKEY_monitor_timeout" \ 319 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 320 | --action monitor \ 321 | --virtual-server-name "$OCF_RESOURCE_INSTANCE" \ 322 | 2>&1 | 323 | while read -r line; do 324 | ocf_log info "monitor: $line" 325 | echo "$line" 326 | done 327 | exit "${PIPESTATUS[0]}" 328 | )" 329 | rc="$?" 330 | 331 | set_exit_reason "$command_output" 332 | 333 | if (( rc < OCF_EXIT_DIFFERENCE )); then 334 | # fci-helper failed in an unexpected way 335 | # 336 | return "$OCF_ERR_GENERIC" 337 | fi 338 | 339 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 340 | 341 | return "$rc" 342 | } 343 | 344 | # ---------------------------------------------------------------------------------------------------------- 345 | # function: mssql_validate 346 | # 347 | # Description: 348 | # Implements the ocf "validate-all" action 349 | # 350 | # Returns: 351 | # OCF_SUCCESS: all required parameters are set, binaries and credentials file are present 352 | # OCF_NOT_INSTALLED: sqlservr or fci-helper binaries are missing 353 | # OCF_ERR_ARGS: credentials file is missing 354 | # 355 | mssql_validate() { 356 | ocf_log info 'mssql_validate' 357 | 358 | # Set default parameters 359 | # 360 | : "${OCF_RESKEY_status_file_path=$INSTANCE_FILE_DEFAULT}" 361 | : "${OCF_RESKEY_binary=$BINARY_DEFAULT}" 362 | : "${OCF_RESKEY_working_dir=$WORKING_DIR_DEFAULT}" 363 | : "${OCF_RESKEY_user=$USER_DEFAULT}" 364 | : "${OCF_RESKEY_monitoring_credentials_file=$MONITORING_CREDENTIALS_FILE_DEFAULT}" 365 | : "${OCF_RESKEY_stop_timeout=$STOP_TIMEOUT_DEFAULT}" 366 | : 
"${OCF_RESKEY_mssql_args=$MSSQL_ARGS_DEFAULT}" 367 | : "${OCF_RESKEY_monitor_policy=$MONITOR_LEVEL_DEFAULT}" 368 | : "${OCF_RESKEY_monitor_timeout=$MONITOR_TIMEOUT_DEFAULT}" 369 | : "${OCF_RESKEY_port=$PORT_DEFAULT}" 370 | 371 | # Check binaries necessary for the resource agent to run exit 372 | # 373 | check_binary "$OCF_RESKEY_binary" 374 | check_binary "$FCI_HELPER_BIN" 375 | 376 | # Check that we have file with username / password for monitoring login 377 | # 378 | if [[ ! -f "$OCF_RESKEY_monitoring_credentials_file" ]]; then 379 | ocf_exit_reason "Expect credentials file at $OCF_RESKEY_monitoring_credentials_file" 380 | if ocf_is_probe; then 381 | # This is a probe. The credentials file might be on shared storage, so don't return a hard error 382 | if get_processes; then 383 | # Credentials file isn't found but sqlservr is running somehow. 384 | return "$OCF_ERR_GENERIC" 385 | else 386 | # Ignore the fact that the credentials file is missing. 387 | return "$OCF_NOT_RUNNING" 388 | fi 389 | else 390 | return "$OCF_ERR_ARGS" 391 | fi 392 | fi 393 | 394 | # check required parameters (there are none for now) 395 | 396 | return "$OCF_SUCCESS" 397 | } 398 | 399 | # ---------------------------------------------------------------------------------------------------------- 400 | # function: get_processes 401 | # 402 | # Description: 403 | # Gets SQL Server processes using "pidof." This is safe since we expected OCF_RESKEY_binary 404 | # is the absolute path to the SQL Server binary and we only have one instance running. 405 | # 406 | # 407 | get_processes() { 408 | pidof "$OCF_RESKEY_binary" 409 | } 410 | 411 | # ---------------------------------------------------------------------------------------------------------- 412 | # functions: mark_started, mark_stopped, is_marked_started 413 | # 414 | # Description: 415 | # mark_started creates an empty file at OCF_RESKEY_status_file_path (which is by default /var/run/mssql-). 
416 | # mark_stopped removes this file 417 | # is_marked_started checks for the existence of this file 418 | # This is used to determine whether SQL Server crashed or was started and stopped by pacemaker 419 | # 420 | mark_started() { 421 | touch "$OCF_RESKEY_status_file_path" 422 | } 423 | 424 | mark_stopped() { 425 | rm -f "$OCF_RESKEY_status_file_path" 426 | } 427 | 428 | is_marked_started() { 429 | [[ -f "$OCF_RESKEY_status_file_path" ]] 430 | } 431 | 432 | # ---------------------------------------------------------------------------------------------------------- 433 | # function: mssql_export_ocf_exit_codes 434 | # 435 | # Description: 436 | # Exports the OCF exit code variables as environment variables for sub-processes. 437 | # 438 | mssql_export_ocf_exit_codes() { 439 | export \ 440 | OCF_ERR_ARGS OCF_ERR_CONFIGURED OCF_ERR_GENERIC OCF_ERR_PERM OCF_ERR_UNIMPLEMENTED \ 441 | OCF_FAILED_MASTER OCF_NOT_RUNNING \ 442 | OCF_RUNNING_MASTER OCF_SUCCESS 443 | } 444 | 445 | # ---------------------------------------------------------------------------------------------------------- 446 | # function: set_exit_reason 447 | # 448 | # Description: 449 | # Extracts the exit reason from the given command output if it exists, and sets it. 450 | # 451 | set_exit_reason() { 452 | local exit_reason="$(echo "$1" | grep -Po '^ERROR: \K.*' | head -n1)" 453 | if [[ -n "$exit_reason" ]]; then 454 | ocf_exit_reason "$exit_reason" 455 | fi 456 | } 457 | 458 | # ---------------------------------------------------------------------------------------------------------- 459 | # 460 | if [[ "$__OCF_ACTION" = 'meta-data' ]]; then 461 | mssql_meta_data 462 | exit "$OCF_SUCCESS" 463 | fi 464 | 465 | mssql_validate 466 | validate_result="$?" 467 | 468 | if ! test_mode; then 469 | ocf_log info "Resource agent invoked with: $__OCF_ACTION" 470 | 471 | # If validation failed, then return that failure unless the action is stop. 472 | # Stop should always try to stop sqlservr. 
473 | if [ "$validate_result" -ne 0 -a "$__OCF_ACTION" != 'stop' ]; then 474 | exit "$validate_result" 475 | fi 476 | fi 477 | 478 | mssql_export_ocf_exit_codes 479 | 480 | case "$__OCF_ACTION" in 481 | 'start') 482 | mssql_start 483 | ;; 484 | 'stop') 485 | mssql_stop 486 | ;; 487 | 'monitor') 488 | mssql_monitor 489 | ;; 490 | 'validate-all') 491 | exit "$validate_result" 492 | ;; 493 | 'usage' | 'help') 494 | mssql_usage 495 | exit "$OCF_SUCCESS" 496 | ;; 497 | *) 498 | # We can source this file for testing, in which case we don't want to exit 499 | # 500 | if ! test_mode; then 501 | mssql_usage 502 | exit "$OCF_ERR_UNIMPLEMENTED" 503 | fi 504 | ;; 505 | esac 506 | rc="$?" 507 | 508 | if ! test_mode; then 509 | ocf_log info "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" 510 | exit "$rc" 511 | fi 512 | -------------------------------------------------------------------------------- /ag/ag: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright (C) Microsoft Corporation. 
4 | # 5 | # SQL Server Always-On Availability Groups resource agent 6 | # 7 | # Valid actions are: 8 | # start 9 | # stop 10 | # promote 11 | # demote 12 | # monitor 13 | # validate-all 14 | # meta-data 15 | # 16 | 17 | # ---------------------------------------------------------------------------------------------------------- 18 | # Location of other files used by this resource agent 19 | # 20 | : "${AG_HELPER_BIN=${OCF_ROOT}/lib/mssql/ag-helper}" 21 | : "${METADATA_FILE=${OCF_ROOT}/lib/mssql/ag_metadata}" 22 | : "${USAGE_FILE=${OCF_ROOT}/lib/mssql/ag_usage}" 23 | 24 | # ---------------------------------------------------------------------------------------------------------- 25 | # Defaults values for optional parameters 26 | # 27 | : "${MONITOR_LEVEL_DEFAULT=3}" 28 | : "${MONITOR_INTERVAL_DEFAULT=10}" 29 | : "${MONITOR_TIMEOUT_DEFAULT=60}" 30 | : "${CONNECTION_TIMEOUT_DEFAULT=30}" 31 | : "${DISABLE_PRIMARY_ON_QUORUM_TIMEOUT_DEFAULT=60}" 32 | : "${MONITORING_CREDENTIALS_FILE_DEFAULT=/var/opt/mssql/secrets/passwd}" 33 | : "${PORT_DEFAULT=1433}" 34 | : "${PROCESS_NAME_DEFAULT=sqlservr}" 35 | : "${PRIMARY_LEASE_DURATION_DEFAULT=$[MONITOR_INTERVAL_DEFAULT+MONITOR_TIMEOUT_DEFAULT+2]}" 36 | : "${REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT_DEFAULT=-1}" # -1 is a sentinel that ag-helper interprets as "unset" 37 | 38 | # Subtract this from the exit code of ag-helper to get the OCF exit code. 39 | # This is the inverse of the operation performed by `mssqlcommon.OcfExit` used by ag-helper. 40 | OCF_EXIT_DIFFERENCE=10 41 | 42 | # ---------------------------------------------------------------------------------------------------------- 43 | # Pacemaker libraries 44 | # 45 | : "${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}" 46 | . 
"$OCF_FUNCTIONS" 47 | : "${__OCF_ACTION=$1}" 48 | 49 | # ---------------------------------------------------------------------------------------------------------- 50 | # function: mssql_meta_data 51 | # 52 | # Description: 53 | # Implements the OCF "meta-data" action. 54 | # 55 | mssql_meta_data() { 56 | cat "$METADATA_FILE" 57 | } 58 | 59 | # ---------------------------------------------------------------------------------------------------------- 60 | # function: usage 61 | # 62 | mssql_usage() { 63 | cat "$USAGE_FILE" 64 | } 65 | 66 | mssql_conf="/var/opt/mssql/mssql.conf" 67 | 68 | if [ -f $mssql_conf ]; then 69 | hostname=$(sed -n -e '/^\s*\[network]\s*/I,/\s*ipaddress\s*=./I {s/^\s*ipaddress\s*=\s*\(.*\)/\1/I p}' $mssql_conf) 70 | fi 71 | 72 | # ---------------------------------------------------------------------------------------------------------- 73 | # function: mssql_start 74 | # 75 | # Description: 76 | # Implements the OCF "start" action. 77 | # 78 | mssql_start() { 79 | ocf_log info 'mssql_start' 80 | 81 | # Fetch sequence numbers of all replicas 82 | # 83 | local sequence_numbers="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -QA)" 84 | 85 | local command_output 86 | local rc 87 | 88 | if ! pidof "$OCF_RESKEY_process_name"; then 89 | # SQL Server crashed or isn't running at all. 90 | ocf_exit_reason "SQL Server isn't running." 91 | return "$OCF_ERR_GENERIC" 92 | fi 93 | 94 | # RHEL9 introduces a breaking change about crm_resource cmd return value from "Master" to "Promoted". 95 | # We need to change grep pip condition accordingly so our agent code supports both old and new pacemaker version. 
96 | local current_master="$(crm_resource -r "$OCF_RESOURCE_INSTANCE" --locate | grep -Po 'resource [^ ]+ is running on: \K(.+)(?= (Master|Promoted)$)')" 97 | ocf_log info "mssql_start current_master: $current_master" 98 | 99 | command_output="$( 100 | "$AG_HELPER_BIN" \ 101 | --hostname "$hostname" \ 102 | --port "$OCF_RESKEY_port" \ 103 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 104 | --ag-name "$OCF_RESKEY_ag_name" \ 105 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-start" \ 106 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 107 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 108 | --action start \ 109 | --sequence-numbers "$sequence_numbers" \ 110 | --required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit" \ 111 | --current-master "$current_master" \ 112 | --disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after" \ 113 | --primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration" \ 114 | 2>&1 | 115 | while read -r line; do 116 | ocf_log info "start: $line" 117 | echo "$line" 118 | done 119 | exit "${PIPESTATUS[0]}" 120 | )" 121 | rc="$?" 122 | 123 | set_exit_reason "$command_output" 124 | 125 | if (( rc < OCF_EXIT_DIFFERENCE )); then 126 | # ag-helper failed in an unexpected way 127 | # 128 | return "$OCF_ERR_GENERIC" 129 | fi 130 | 131 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 132 | 133 | set_promotion_score "$command_output" 134 | 135 | return "$rc" 136 | } 137 | 138 | # ---------------------------------------------------------------------------------------------------------- 139 | # function: mssql_stop 140 | # 141 | # Description: 142 | # Implements the OCF "stop" action. 143 | # 144 | mssql_stop() { 145 | ocf_log info 'mssql_stop' 146 | 147 | local command_output 148 | local rc 149 | 150 | if ! pidof "$OCF_RESKEY_process_name"; then 151 | # SQL Server crashed. This is as good as stopped. 
152 | # Even if it was a PRIMARY before it crashed and automatically restarts, it will come back as RESOLVING. 153 | ocf_exit_reason "SQL Server isn't running." 154 | return "$OCF_SUCCESS" 155 | fi 156 | 157 | # Reserve 5s for killing the SQL Server process if ag-helper fails 158 | # and use the rest to run ag-helper 159 | local stop_timeout="$(( OCF_RESKEY_CRM_meta_timeout / 1000 - 5 ))" 160 | if (( stop_timeout < 5 )); then 161 | # The stop timeout should be atleast 10 seconds so that there's enough time to change the AG replica's role, 162 | # and kill the SQL Server process if that fails. If the user set it shorter, there doesn't seem to be any way to 163 | # tell them that while still having the `stop` action succeed (failure would cause the node to be fenced). 164 | # 165 | # So print an error to cluster log and then pretend the change role failed without actually trying to change the role. 166 | # This will kill the process and hopefully trigger the user to read the cluster log and discover this error. 167 | # 168 | # Note that Pacemaker documentation recommends both that the default timeout specified in a resource's metadata (10s in this case) 169 | # is the minimum that the user should set, and further recommends not setting *any* timeout less than 10 seconds anyway. 170 | # So the user who sets this timeout to less than 10 seconds is going out of their way to do it. 
171 | ocf_log error 'The stop action timeout should be at least 10 seconds' 172 | rc="$OCF_ERR_GENERIC" 173 | else 174 | command_output="$( 175 | timeout "$stop_timeout" "$AG_HELPER_BIN" \ 176 | --hostname "$hostname" \ 177 | --port "$OCF_RESKEY_port" \ 178 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 179 | --ag-name "$OCF_RESKEY_ag_name" \ 180 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-stop" \ 181 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 182 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 183 | --action stop \ 184 | 2>&1 | 185 | while read -r line; do 186 | ocf_log info "stop: $line" 187 | echo "$line" 188 | done 189 | exit "${PIPESTATUS[0]}" 190 | )" 191 | rc="$?" 192 | if (( rc == 124 )); then 193 | # `timeout` exits with 124 on timeout. Map it to $OCF_ERR_GENERIC 194 | ocf_log error 'ag-helper timed out' 195 | rc="$(( OCF_ERR_GENERIC + OCF_EXIT_DIFFERENCE ))" 196 | fi 197 | 198 | set_exit_reason "$command_output" 199 | 200 | if (( rc < OCF_EXIT_DIFFERENCE )); then 201 | # ag-helper failed in an unexpected way 202 | # 203 | return "$OCF_ERR_GENERIC" 204 | fi 205 | 206 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 207 | fi 208 | 209 | case "$rc" in 210 | "$OCF_SUCCESS") 211 | # ag-helper succeeded. Nothing else to do. 212 | # 213 | ;; 214 | "$OCF_ERR_GENERIC") 215 | # ag-helper failed to set the AG replica to SECONDARY Role. 216 | # Kill the instance to ensure that it stops being in PRIMARY role. 217 | # 218 | ocf_log info 'Killing SQL Server process...' 219 | 220 | local mssql_pid="$(pidof "$OCF_RESKEY_process_name")" 221 | ocf_log info "SQL Server process id before kill: $mssql_pid" 222 | 223 | if ocf_stop_processes KILL 9 $mssql_pid; then 224 | # Once the SQL Server instance has been killed, it will not come back as PRIMARY. 225 | # If it was PRIMARY before, it will come back as RESOLVING. 226 | # So by killing the SQL Server, the stop action has succeeded. 
227 | 228 | # loop to make sure KILL 9 success 229 | 230 | ocf_log info 'Start Looping...' 231 | 232 | while : 233 | do 234 | if ! ps -p $mssql_pid > /dev/null; then 235 | ocf_log info 'SQL Server process was killed successfully' 236 | rc="$OCF_SUCCESS" 237 | break 238 | fi 239 | 240 | # Double check process id before resend kill 9 signal. 241 | # If not both old mssql pids are killed, we need to compare them one by one with cur mssql pids 242 | # because it is possible only one of them is killed. 243 | local cur_mssql_pid="$(pidof "$OCF_RESKEY_process_name")" 244 | 245 | local cur_mssql_pid_list=$(echo $cur_mssql_pid | tr " " "\n") 246 | local old_mssql_pid_list=$(echo $mssql_pid | tr " " "\n") 247 | 248 | local dirty_flag="0" 249 | for old_pid in $old_mssql_pid_list 250 | do 251 | for cur_pid in $cur_mssql_pid_list 252 | do 253 | ocf_log info "-------------Compare and do KILL 9---------------" 254 | 255 | ocf_log info "Current SQL Server process id: $cur_pid" 256 | ocf_log info "Old SQL Server process id: $old_pid" 257 | # An old pid that is still listed has survived the kill: flag it and resend SIGKILL. 258 | if [[ $cur_pid == $old_pid ]]; then 259 | dirty_flag="1" 260 | ocf_stop_processes KILL 9 $cur_pid 261 | fi 262 | 263 | ocf_log info "--------------------------------------------------" 264 | done 265 | done 266 | 267 | if [[ $dirty_flag == "0" ]]; then 268 | ocf_log info "Cannot find SQL Server process, it may be killed successfully" 269 | rc="$OCF_SUCCESS" 270 | break 271 | fi 272 | 273 | sleep 5 274 | done 275 | fi 276 | ;; 277 | esac 278 | 279 | return "$rc" 280 | } 281 | 282 | # ---------------------------------------------------------------------------------------------------------- 283 | # function: mssql_monitor 284 | # 285 | # Description: 286 | # Implements the OCF "monitor" action. 287 | # 288 | mssql_monitor() { 289 | ocf_log info 'mssql_monitor' 290 | 291 | local command_output 292 | local rc 293 | 294 | if ! pidof "$OCF_RESKEY_process_name"; then 295 | # SQL Server crashed or isn't running at all. 
296 | ocf_exit_reason "SQL Server isn't running." 297 | return "$OCF_NOT_RUNNING" 298 | fi 299 | 300 | if (( OCF_RESKEY_meta_timeout_sec <= OCF_RESKEY_connection_timeout )); then 301 | ocf_log info "WARNING: Monitor timeout is lower than connection timeout. Connection will not retried if connection timeout occurs" 302 | fi 303 | 304 | # RHEL9 introduces a breaking change about crm_resource cmd return value from "Master" to "Promoted". 305 | # We need to change grep pip condition accordingly so our agent code supports both old and new pacemaker version. 306 | local current_master="$(crm_resource -r "$OCF_RESOURCE_INSTANCE" --locate | grep -Po 'resource [^ ]+ is running on: \K(.+)(?= (Master|Promoted)$)')" 307 | ocf_log info "mssql_monitor current_master: $current_master" 308 | 309 | command_output="$( 310 | "$AG_HELPER_BIN" \ 311 | --hostname "$hostname" \ 312 | --port "$OCF_RESKEY_port" \ 313 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 314 | --ag-name "$OCF_RESKEY_ag_name" \ 315 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-monitor" \ 316 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 317 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 318 | --action monitor \ 319 | --required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit" \ 320 | --current-master "$current_master" \ 321 | --disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after" \ 322 | --primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration" \ 323 | --monitor-interval-timeout "$OCF_RESKEY_meta_timeout_sec" \ 324 | 2>&1 | 325 | while read -r line; do 326 | ocf_log info "monitor: $line" 327 | echo "$line" 328 | done 329 | exit "${PIPESTATUS[0]}" 330 | )" 331 | rc="$?" 
332 | 333 | set_exit_reason "$command_output" 334 | 335 | local lease_expiry="$(echo "$command_output" | grep -Po '^LEASE_EXPIRY: \K.*')" 336 | 337 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -U "$lease_expiry" -p 338 | 339 | local lease="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -QA)" 340 | 341 | ocf_log info "lease_expiry after monitor update: $lease_expiry" 342 | ocf_log info "lease information from all replicas after monitor update: $lease" 343 | 344 | echo "$lease" | systemd-cat 345 | 346 | set_exit_reason "$command_output" 347 | 348 | if (( rc < OCF_EXIT_DIFFERENCE )); then 349 | # ag-helper failed in an unexpected way 350 | # 351 | return "$OCF_ERR_GENERIC" 352 | fi 353 | 354 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 355 | 356 | set_promotion_score "$command_output" 357 | 358 | return "$rc" 359 | } 360 | 361 | # ---------------------------------------------------------------------------------------------------------- 362 | # function: mssql_promote 363 | # 364 | # Description: 365 | # Implements the OCF "promote" action. 366 | # 367 | mssql_promote() { 368 | ocf_log info 'mssql_promote' 369 | 370 | if ! pidof "$OCF_RESKEY_process_name"; then 371 | # SQL Server crashed. 372 | ocf_exit_reason "SQL Server isn't running." 
373 | return "$OCF_NOT_RUNNING" 374 | fi 375 | 376 | local command_output 377 | local rc 378 | 379 | # Fetch sequence numbers of all replicas 380 | # 381 | local sequence_numbers="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -QA)" 382 | 383 | command_output="$( 384 | "$AG_HELPER_BIN" \ 385 | --hostname "$hostname" \ 386 | --port "$OCF_RESKEY_port" \ 387 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 388 | --ag-name "$OCF_RESKEY_ag_name" \ 389 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-promote" \ 390 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 391 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 392 | --action promote \ 393 | --sequence-numbers "$sequence_numbers" \ 394 | --new-master "$OCF_RESKEY_CRM_meta_notify_promote_uname" \ 395 | --required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit" \ 396 | --disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after" \ 397 | --primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration" \ 398 | 2>&1 | 399 | while read -r line; do 400 | ocf_log info "promote: $line" 401 | echo "$line" 402 | done 403 | exit "${PIPESTATUS[0]}" 404 | )" 405 | rc="$?" 
406 | 407 | set_exit_reason "$command_output" 408 | 409 | local lease_expiry="$(echo "$command_output" | grep -Po '^LEASE_EXPIRY: \K.*')" 410 | 411 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -U "$lease_expiry" -p 412 | 413 | local lease="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -QA)" 414 | 415 | echo "$lease" | systemd-cat 416 | 417 | set_exit_reason "$command_output" 418 | 419 | if (( rc < OCF_EXIT_DIFFERENCE )); then 420 | # ag-helper failed in an unexpected way 421 | # 422 | return "$OCF_ERR_GENERIC" 423 | fi 424 | 425 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 426 | 427 | return "$rc" 428 | } 429 | 430 | # ---------------------------------------------------------------------------------------------------------- 431 | # function: mssql_demote 432 | # 433 | # Description: 434 | # Implements the OCF "demote" action. 435 | # 436 | mssql_demote() { 437 | ocf_log info 'mssql_demote' 438 | 439 | if ! pidof "$OCF_RESKEY_process_name"; then 440 | # SQL Server crashed. This is as good as demoted. 441 | # Even if it was a PRIMARY before it crashed and automatically restarts, it will come back as RESOLVING. 442 | ocf_exit_reason "SQL Server isn't running." 
443 | return "$OCF_SUCCESS" 444 | fi 445 | 446 | local lease_expiry="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -QA)" 447 | 448 | echo "$lease_expiry" | systemd-cat 449 | # Poll until the local query of the lease-expiry attribute returns something to pass to ag-helper. 450 | local lease_expiry_value='' 451 | while :; do 452 | lease_expiry_value="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -Q)" 453 | if [ -n "$lease_expiry_value" ]; then 454 | break 455 | fi 456 | sleep 1 # back off between polls instead of respawning attrd_updater in a tight loop 457 | done 458 | 459 | echo "$lease_expiry_value" | systemd-cat 460 | 461 | local command_output 462 | local rc 463 | 464 | command_output="$( 465 | "$AG_HELPER_BIN" \ 466 | --hostname "$hostname" \ 467 | --port "$OCF_RESKEY_port" \ 468 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 469 | --ag-name "$OCF_RESKEY_ag_name" \ 470 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-demote" \ 471 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 472 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 473 | --lease-expiry "$lease_expiry_value" \ 474 | --action demote \ 475 | 2>&1 | 476 | while read -r line; do 477 | ocf_log info "demote: $line" 478 | echo "$line" 479 | done 480 | exit "${PIPESTATUS[0]}" 481 | )" 482 | rc="$?" 
483 | 484 | set_exit_reason "$command_output" 485 | 486 | local lease_expiry="$(echo "$command_output" | grep -Po '^LEASE_EXPIRY: \K.*')" 487 | 488 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -U "$lease_expiry" -p 489 | 490 | local lease="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-lease-expiry" -QA)" 491 | 492 | echo "$lease" | systemd-cat 493 | 494 | set_exit_reason "$command_output" 495 | 496 | if (( rc < OCF_EXIT_DIFFERENCE )); then 497 | # ag-helper failed in an unexpected way 498 | # 499 | return "$OCF_ERR_GENERIC" 500 | fi 501 | 502 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 503 | 504 | return "$rc" 505 | } 506 | 507 | # ---------------------------------------------------------------------------------------------------------- 508 | # function: mssql_notify 509 | # 510 | # Description: 511 | # Implements the OCF "notify" action. 512 | # 513 | mssql_notify() { 514 | ocf_log info "mssql_notify $OCF_RESKEY_CRM_meta_notify_type-$OCF_RESKEY_CRM_meta_notify_operation" 515 | 516 | local command_output 517 | local rc 518 | 519 | case "$OCF_RESKEY_CRM_meta_notify_type-$OCF_RESKEY_CRM_meta_notify_operation" in 520 | 'pre-start') 521 | command_output="$( 522 | "$AG_HELPER_BIN" \ 523 | --hostname "$hostname" \ 524 | --port "$OCF_RESKEY_port" \ 525 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 526 | --ag-name "$OCF_RESKEY_ag_name" \ 527 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-pre-start" \ 528 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 529 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 530 | --action pre-start \ 531 | --required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit" \ 532 | 2>&1 | 533 | while read -r line; do 534 | ocf_log info "notify: $line" 535 | echo "$line" 536 | done 537 | exit "${PIPESTATUS[0]}" 538 | )" 539 | rc="$?" 
540 | ;; 541 | 542 | 'post-promote') 543 | # Reset sequence number attribute so that it doesn't retain old values for subsequent starts or promotes 544 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -D 545 | command_output="$( 546 | "$AG_HELPER_BIN" \ 547 | --hostname "$hostname" \ 548 | --port "$OCF_RESKEY_port" \ 549 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 550 | --ag-name "$OCF_RESKEY_ag_name" \ 551 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-post-promote" \ 552 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 553 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 554 | --action post-promote \ 555 | 2>&1 | 556 | while read -r line; do 557 | ocf_log info "notify: $line" 558 | echo "$line" 559 | done 560 | exit "${PIPESTATUS[0]}" 561 | )" 562 | rc="$?" 563 | ;; 564 | 565 | 'post-start') 566 | # Reset sequence number attribute so that it doesn't retain old values for subsequent starts or promotes 567 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -D 568 | return "$OCF_SUCCESS" 569 | ;; 570 | 571 | 'post-stop') 572 | command_output="$( 573 | "$AG_HELPER_BIN" \ 574 | --hostname "$hostname" \ 575 | --port "$OCF_RESKEY_port" \ 576 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 577 | --ag-name "$OCF_RESKEY_ag_name" \ 578 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-post-stop" \ 579 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 580 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 581 | --action post-stop \ 582 | --required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit" \ 583 | 2>&1 | 584 | while read -r line; do 585 | ocf_log info "notify: $line" 586 | echo "$line" 587 | done 588 | exit "${PIPESTATUS[0]}" 589 | )" 590 | rc="$?" 
591 | ;; 592 | 593 | 'pre-promote') 594 | command_output="$( 595 | "$AG_HELPER_BIN" \ 596 | --hostname "$hostname" \ 597 | --port "$OCF_RESKEY_port" \ 598 | --credentials-file "$OCF_RESKEY_monitoring_credentials_file" \ 599 | --ag-name "$OCF_RESKEY_ag_name" \ 600 | --application-name "monitor-$OCF_RESOURCE_INSTANCE-pre-promote" \ 601 | --connection-timeout "$OCF_RESKEY_connection_timeout" \ 602 | --health-threshold "$OCF_RESKEY_monitor_policy" \ 603 | --action pre-promote \ 604 | 2>&1 | 605 | while read -r line; do 606 | ocf_log info "notify: $line" 607 | echo "$line" 608 | done 609 | exit "${PIPESTATUS[0]}" 610 | )" 611 | rc="$?" 612 | ;; 613 | 614 | *) 615 | return "$OCF_SUCCESS" 616 | ;; 617 | esac 618 | 619 | set_exit_reason "$command_output" 620 | 621 | if (( rc < OCF_EXIT_DIFFERENCE )); then 622 | # ag-helper failed in an unexpected way 623 | # 624 | return "$OCF_ERR_GENERIC" 625 | fi 626 | 627 | rc="$(( rc - OCF_EXIT_DIFFERENCE ))" 628 | 629 | case "$OCF_RESKEY_CRM_meta_notify_type-$OCF_RESKEY_CRM_meta_notify_operation" in 630 | 'pre-start' | 'pre-promote') 631 | if (( rc == OCF_SUCCESS )); then 632 | # Find sequence number in ag-helper's output 633 | # 634 | local sequence_number="$(echo "$command_output" | grep -Po '^SEQUENCE_NUMBER: \K.*')" 635 | 636 | if [ -z "$sequence_number" ]; then 637 | ocf_exit_reason 'Could not find sequence number in ag-helper output.' 
638 | return "$OCF_ERR_GENERIC" 639 | fi 640 | 641 | attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -U "$sequence_number" -p 642 | 643 | # Work around attrd bug https://bugzilla.redhat.com/show_bug.cgi?id=1463033 644 | # attrd_updater can receive ack from attrd for the update before attrd has propagated the value to other nodes 645 | # or even committed it locally 646 | local attribute_value='' 647 | while :; do 648 | attribute_value="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -Q)" 649 | if [ -n "$attribute_value" ]; then 650 | break 651 | fi 652 | 653 | sleep 5 654 | done 655 | 656 | # -Q returns the value when it's committed locally but not necessarily propagated to other nodes, 657 | # so sleep some more to let the update propagate 658 | sleep 5 659 | fi 660 | ;; 661 | esac 662 | 663 | return "$rc" 664 | } 665 | 666 | # ---------------------------------------------------------------------------------------------------------- 667 | # function: mssql_validate 668 | # 669 | # Description: 670 | # Implements the OCF "validate-all" action. 671 | # 672 | # Returns: 673 | # OCF_SUCCESS: The credentials file exists and the ag-helper binary is present. 674 | # OCF_ERR_ARGS: The credentials file does not exist. 675 | # OCF_ERR_CONFIGURED: The ag-helper binary is not present. 
676 | # 677 | mssql_validate() { 678 | ocf_log info 'mssql_validate' 679 | 680 | # Set default parameters 681 | # 682 | : "${OCF_RESKEY_disable_primary_on_quorum_timeout_after=$DISABLE_PRIMARY_ON_QUORUM_TIMEOUT_DEFAULT}" 683 | : "${OCF_RESKEY_monitoring_credentials_file=$MONITORING_CREDENTIALS_FILE_DEFAULT}" 684 | : "${OCF_RESKEY_monitor_policy=$MONITOR_LEVEL_DEFAULT}" 685 | : "${OCF_RESKEY_port=$PORT_DEFAULT}" 686 | 687 | ocf_log info "OCF_RESKEY_CRM_meta_interval value: $OCF_RESKEY_CRM_meta_interval" 688 | ocf_log info "OCF_RESKEY_CRM_meta_timeout value: $OCF_RESKEY_CRM_meta_timeout" 689 | # try to set write_lease_duration value based on monitor meta 690 | : "${OCF_RESKEY_primary_write_lease_duration=$(( OCF_RESKEY_CRM_meta_interval / 1000 + OCF_RESKEY_CRM_meta_timeout / 1000 + 2 ))}" 691 | : "${OCF_RESKEY_process_name=$PROCESS_NAME_DEFAULT}" 692 | 693 | # monitor_timeout is an old alias for connection_timeout 694 | : "${OCF_RESKEY_connection_timeout:=$OCF_RESKEY_monitor_timeout}" 695 | : "${OCF_RESKEY_connection_timeout:=$CONNECTION_TIMEOUT_DEFAULT}" 696 | 697 | # required_copies_to_commit is an old alias for required_synchronized_secondaries_to_commit 698 | : "${OCF_RESKEY_required_synchronized_secondaries_to_commit:=$OCF_RESKEY_required_copies_to_commit}" 699 | : "${OCF_RESKEY_required_synchronized_secondaries_to_commit:=$REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT_DEFAULT}" 700 | 701 | # timeout value in seconds 702 | : "${OCF_RESKEY_meta_timeout_sec=$(( OCF_RESKEY_CRM_meta_timeout / 1000 ))}" 703 | 704 | # Check binaries necessary for the resource agent to run 705 | # 706 | check_binary "$AG_HELPER_BIN" 707 | 708 | # Check credentials file 709 | # 710 | if [[ ! 
-f "$OCF_RESKEY_monitoring_credentials_file" ]]; then 711 | ocf_exit_reason "Credentials file at $OCF_RESKEY_monitoring_credentials_file does not exist" 712 | return "$OCF_ERR_ARGS" 713 | fi 714 | 715 | # Check notify=true 716 | if [[ "$OCF_RESKEY_CRM_meta_notify" != 'true' ]]; then 717 | ocf_exit_reason 'Resource must be configured with notify=true' 718 | return "$OCF_ERR_CONFIGURED" 719 | fi 720 | 721 | return "$OCF_SUCCESS" 722 | } 723 | 724 | # ---------------------------------------------------------------------------------------------------------- 725 | # function: mssql_export_ocf_exit_codes 726 | # 727 | # Description: 728 | # Exports the OCF exit code variables as environment variables for sub-processes. 729 | # 730 | mssql_export_ocf_exit_codes() { 731 | export \ 732 | OCF_ERR_ARGS OCF_ERR_CONFIGURED OCF_ERR_GENERIC OCF_ERR_PERM OCF_ERR_UNIMPLEMENTED \ 733 | OCF_FAILED_MASTER OCF_NOT_RUNNING \ 734 | OCF_RUNNING_MASTER OCF_SUCCESS 735 | } 736 | 737 | # ---------------------------------------------------------------------------------------------------------- 738 | # function: set_exit_reason 739 | # 740 | # Description: 741 | # Extracts the exit reason from the given command output if it exists, and sets it. 742 | # 743 | set_exit_reason() { 744 | local exit_reason="$(echo "$1" | grep -Po '^ERROR: \K.*' | head -n1)" 745 | if [[ -n "$exit_reason" ]]; then 746 | ocf_exit_reason "$exit_reason" 747 | fi 748 | } 749 | 750 | # ---------------------------------------------------------------------------------------------------------- 751 | # function: set_promotion_score 752 | # 753 | # Description: 754 | # Extracts the promotion score value from the given command output and sets it. 
755 | # If no output is found, sets the promotion score to `-INFINITY` 756 | # 757 | set_promotion_score() { 758 | local promotion_score="$(echo "$1" | grep -Po '^PROMOTION_SCORE: \K.*')" 759 | if [ -z "$promotion_score" ]; then 760 | crm_master -v '-INFINITY' -l reboot 761 | else 762 | crm_master -v "$promotion_score" -l reboot 763 | fi 764 | } 765 | 766 | # ---------------------------------------------------------------------------------------------------------- 767 | # 768 | if [[ "$__OCF_ACTION" = 'meta-data' ]]; then 769 | mssql_meta_data 770 | exit "$OCF_SUCCESS" 771 | fi 772 | 773 | mssql_validate 774 | validate_result="$?" 775 | 776 | ocf_log info "Resource agent invoked with: $__OCF_ACTION" 777 | 778 | # Everything else must pass validation 779 | if (( validate_result != 0 )); then 780 | exit "$validate_result" 781 | fi 782 | 783 | mssql_export_ocf_exit_codes 784 | 785 | case "$__OCF_ACTION" in 786 | 'start') 787 | mssql_start 788 | ;; 789 | 'stop') 790 | mssql_stop 791 | ;; 792 | 'monitor') 793 | mssql_monitor 794 | ;; 795 | 'promote') 796 | mssql_promote 797 | ;; 798 | 'demote') 799 | mssql_demote 800 | ;; 801 | 'notify') 802 | mssql_notify 803 | ;; 804 | 'validate-all') 805 | exit "$validate_result" 806 | ;; 807 | 'usage' | 'help') 808 | mssql_usage 809 | exit "$OCF_SUCCESS" 810 | ;; 811 | *) 812 | mssql_usage 813 | exit "$OCF_ERR_UNIMPLEMENTED" 814 | ;; 815 | esac 816 | rc="$?" 817 | 818 | ocf_log info "$OCF_RESOURCE_INSTANCE $__OCF_ACTION : $rc" 819 | exit "$rc" 820 | -------------------------------------------------------------------------------- /go/src/mssqlcommon/ag/lib.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | // Package ag contains items related to SQL Server Availability Groups. 
4 | package ag 5 | 6 | import ( 7 | "database/sql" 8 | "encoding/json" 9 | "fmt" 10 | "log" 11 | "strings" 12 | "time" 13 | 14 | "mssqlcommon" 15 | ) 16 | 17 | // An AvailabilityMode represents an AG replica's availability mode. 18 | // 19 | // See the availability_mode field in https://msdn.microsoft.com/en-us/library/ff877883.aspx for details. 20 | type AvailabilityMode byte 21 | 22 | const ( 23 | // The replica has ASYNCHRONOUS_COMMIT availability mode 24 | AmASYNCHRONOUS_COMMIT AvailabilityMode = 0 25 | 26 | // The replica has SYNCHRONOUS_COMMIT availability mode 27 | AmSYNCHRONOUS_COMMIT AvailabilityMode = 1 28 | 29 | // The replica has CONFIGURATION_ONLY availability mode 30 | AmCONFIGURATION_ONLY AvailabilityMode = 4 31 | ) 32 | 33 | const ( 34 | // AmASYNCHRONOUS_COMMIT_JSON is the JSON string that AmASYNCHRONOUS_COMMIT is serialized to 35 | AmASYNCHRONOUS_COMMIT_JSON = "asynchronousCommit" 36 | 37 | // AmSYNCHRONOUS_COMMIT_JSON is the JSON string that AmSYNCHRONOUS_COMMIT is serialized to 38 | AmSYNCHRONOUS_COMMIT_JSON = "synchronousCommit" 39 | 40 | // AmCONFIGURATION_ONLY_JSON is the JSON string that AmCONFIGURATION_ONLY is serialized to 41 | AmCONFIGURATION_ONLY_JSON = "configurationOnly" 42 | ) 43 | 44 | const ( 45 | // AmASYNCHRONOUS_COMMIT_TSQL is the T-SQL keyword for AmASYNCHRONOUS_COMMIT 46 | AmASYNCHRONOUS_COMMIT_TSQL = "ASYNCHRONOUS_COMMIT" 47 | 48 | // AmSYNCHRONOUS_COMMIT_TSQL is the T-SQL keyword for AmSYNCHRONOUS_COMMIT 49 | AmSYNCHRONOUS_COMMIT_TSQL = "SYNCHRONOUS_COMMIT" 50 | 51 | // AmCONFIGURATION_ONLY_TSQL is the T-SQL keyword for AmCONFIGURATION_ONLY 52 | AmCONFIGURATION_ONLY_TSQL = "CONFIGURATION_ONLY" 53 | ) 54 | 55 | // TSQL returns the T-SQL keyword corresponding to this AvailabilityMode 56 | func (availabilityMode AvailabilityMode) TSQL() (string, error) { 57 | switch availabilityMode { 58 | case AmASYNCHRONOUS_COMMIT: 59 | return AmASYNCHRONOUS_COMMIT_TSQL, nil 60 | case AmSYNCHRONOUS_COMMIT: 61 | return 
AmSYNCHRONOUS_COMMIT_TSQL, nil 62 | case AmCONFIGURATION_ONLY: 63 | return AmCONFIGURATION_ONLY_TSQL, nil 64 | default: 65 | return "", fmt.Errorf("unexpected availability mode %d", availabilityMode) 66 | } 67 | } 68 | 69 | // MarshalJSON serializes this AvailabilityMode as a JSON string; it returns an error for any value other than the three known modes. 70 | func (availabilityMode AvailabilityMode) MarshalJSON() ([]byte, error) { 71 | var availabilityModeString string 72 | 73 | switch availabilityMode { 74 | case AmASYNCHRONOUS_COMMIT: 75 | availabilityModeString = AmASYNCHRONOUS_COMMIT_JSON 76 | case AmSYNCHRONOUS_COMMIT: 77 | availabilityModeString = AmSYNCHRONOUS_COMMIT_JSON 78 | case AmCONFIGURATION_ONLY: 79 | availabilityModeString = AmCONFIGURATION_ONLY_JSON 80 | default: 81 | return nil, fmt.Errorf("unrecognized availability mode %d", availabilityMode) 82 | } 83 | 84 | return json.Marshal(availabilityModeString) 85 | } 86 | 87 | // UnmarshalJSON deserializes this AvailabilityMode from a JSON string; an unknown string is rejected with an error naming the rejected value. 88 | func (availabilityMode *AvailabilityMode) UnmarshalJSON(data []byte) error { 89 | var s string 90 | err := json.Unmarshal(data, &s) 91 | if err != nil { 92 | return err 93 | } 94 | 95 | switch s { 96 | case AmASYNCHRONOUS_COMMIT_JSON: 97 | *availabilityMode = AmASYNCHRONOUS_COMMIT 98 | case AmSYNCHRONOUS_COMMIT_JSON: 99 | *availabilityMode = AmSYNCHRONOUS_COMMIT 100 | case AmCONFIGURATION_ONLY_JSON: 101 | *availabilityMode = AmCONFIGURATION_ONLY 102 | default: 103 | return fmt.Errorf("unrecognized availability mode %q: expected one of %s, %s, %s", s, AmASYNCHRONOUS_COMMIT_JSON, AmCONFIGURATION_ONLY_JSON, AmSYNCHRONOUS_COMMIT_JSON) 104 | } 105 | 106 | return nil 107 | } 108 | 109 | // A DatabaseState represents a database state. 110 | // 111 | // See the state field in https://docs.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-databases-transact-sql for details. 
112 | type DatabaseState byte 113 | 114 | const ( 115 | // The database is in RESTORING state 116 | DatabaseStateRESTORING DatabaseState = 1 117 | 118 | // The database is in RECOVERING state 119 | DatabaseStateRECOVERING DatabaseState = 2 120 | 121 | // The database is in RECOVERY_PENDING state 122 | DatabaseStateRECOVERY_PENDING DatabaseState = 3 123 | 124 | // The database is in OFFLINE state 125 | DatabaseStateOFFLINE DatabaseState = 6 126 | ) 127 | 128 | // A Role represents an AG replica's role. 129 | // 130 | // See the role field in https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-replica-states-transact-sql for details. 131 | type Role byte 132 | 133 | const ( 134 | // The replica is in RESOLVING role. 135 | RoleRESOLVING Role = 0 136 | 137 | // The replica is in PRIMARY role. 138 | RolePRIMARY Role = 1 139 | 140 | // The replica is in SECONDARY role. 141 | RoleSECONDARY Role = 2 142 | ) 143 | 144 | // The seeding mode of an AG replica 145 | // 146 | // See the seeding_mode field in https://msdn.microsoft.com/en-us/library/ff877883.aspx for details 147 | type SeedingMode byte 148 | 149 | const ( 150 | // The replica is in automatic seeding mode 151 | SmAUTOMATIC SeedingMode = 0 152 | 153 | // The replica is in manual seeding mode 154 | SmMANUAL SeedingMode = 1 155 | ) 156 | 157 | // Replica contains the properties of an AG replica. 158 | type Replica struct { 159 | ID string 160 | Name string 161 | EndpointURL string 162 | AvailabilityMode AvailabilityMode 163 | } 164 | 165 | // AddReplica adds the given replica to the given Availability Group. 166 | // 167 | // Params: 168 | // db: A connection to a SQL Server instance hosting a replica of the AG. 169 | // agName: The name of the AG. 170 | // replica: The replica to add. 
171 | // 172 | func AddReplica(db *sql.DB, agName string, replica Replica) error { 173 | alterDdl := ` 174 | ALTER AVAILABILITY GROUP %s ADD REPLICA ON %s WITH ( 175 | ENDPOINT_URL = %s, 176 | AVAILABILITY_MODE = %s` 177 | 178 | availabilityMode, err := replica.AvailabilityMode.TSQL() 179 | if err != nil { 180 | return err 181 | } 182 | 183 | // Only add these options if it is not an AmCONFIGURATION_ONLY replica 184 | var miscOptions string 185 | if replica.AvailabilityMode != AmCONFIGURATION_ONLY { 186 | miscOptions = `, 187 | FAILOVER_MODE = EXTERNAL, 188 | SEEDING_MODE = AUTOMATIC, 189 | SECONDARY_ROLE ( 190 | ALLOW_CONNECTIONS = READ_ONLY 191 | )` 192 | } 193 | 194 | alterDdl = fmt.Sprintf("%s%s)", alterDdl, miscOptions) 195 | 196 | _, err = db.Exec(fmt.Sprintf( 197 | alterDdl, 198 | mssqlcommon.QuoteNameBracket(agName), 199 | mssqlcommon.QuoteNameQuote(replica.Name), 200 | mssqlcommon.QuoteNameQuote(replica.EndpointURL), 201 | availabilityMode, 202 | )) 203 | return err 204 | } 205 | 206 | // CalculateNumRequiredSequenceNumbers calculates the number of sequence numbers required for a safe promotion for the given number of 207 | // SYNCHRONOUS_COMMIT or CONFIGURATION_ONLY replicas. 208 | // 209 | // Params: 210 | // numReplicas: The number of SYNCHRONOUS_COMMIT or CONFIGURATION_ONLY replicas. 211 | // 212 | func CalculateNumRequiredSequenceNumbers(numReplicas uint) uint { 213 | // num replicas which must commit = quorum count = floor(numReplicas / 2) + 1 214 | // num replicas which may not commit = numReplicas - num replicas which must commit = ceil(numReplicas / 2) - 1 = floor((numReplicas + 1) / 2) - 1 215 | // required sequence numbers = num replicas which may not commit + 1 = floor((numReplicas / 2) + 1) 216 | 217 | return (numReplicas / 2) + 1 218 | } 219 | 220 | // CalculateRequiredSynchronizedSecondariesToCommit Calculates the optimal value of REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT for the given number of SYNCHRONOUS_COMMIT replicas. 
221 | // 222 | // Params: 223 | // numReplicas: The number of SYNCHRONOUS_COMMIT replicas. 224 | // 225 | func CalculateRequiredSynchronizedSecondariesToCommit(numReplicas uint) uint { 226 | // quorum count = floor(numReplicas / 2) + 1 227 | // required synchronized secondaries to commit = quorum count - 1 (value doesn't count the primary) 228 | // 229 | // But for two replicas, customers prefer RSSTC = 0 since they don't want unavailablility on the single S to block writes on P 230 | 231 | if numReplicas == 2 { 232 | return 0 233 | } 234 | 235 | return numReplicas / 2 236 | } 237 | 238 | // Create creates an Availability Group with the given name. 239 | // 240 | // Params: 241 | // db: A connection to a SQL Server instance hosting a replica of the AG. 242 | // agName: The name of the AG. 243 | // 244 | func Create(db *sql.DB, agName string, sqlMajorVersion int, externalWriteLeaseValidTime time.Duration, replica Replica) error { 245 | var createDdl string 246 | 247 | if sqlMajorVersion >= 15 { 248 | createDdl = ` 249 | CREATE AVAILABILITY GROUP %s 250 | WITH (DB_FAILOVER = ON, CLUSTER_TYPE = EXTERNAL, WRITE_LEASE_VALIDITY = ` + fmt.Sprintf("%d", externalWriteLeaseValidTime/time.Second) + `) 251 | FOR REPLICA ON %s WITH ( 252 | ENDPOINT_URL = %s, 253 | AVAILABILITY_MODE = %s` 254 | } else { 255 | createDdl = ` 256 | CREATE AVAILABILITY GROUP %s 257 | WITH (DB_FAILOVER = ON, CLUSTER_TYPE = EXTERNAL) 258 | FOR REPLICA ON %s WITH ( 259 | ENDPOINT_URL = %s, 260 | AVAILABILITY_MODE = %s` 261 | } 262 | 263 | availabilityMode, err := replica.AvailabilityMode.TSQL() 264 | if err != nil { 265 | return err 266 | } 267 | 268 | // Only add these options if it is not an AmCONFIGURATION_ONLY replica 269 | var miscOptions string 270 | if replica.AvailabilityMode != AmCONFIGURATION_ONLY { 271 | miscOptions = `, 272 | FAILOVER_MODE = EXTERNAL, 273 | SEEDING_MODE = AUTOMATIC, 274 | SECONDARY_ROLE ( 275 | ALLOW_CONNECTIONS = READ_ONLY 276 | )` 277 | } 278 | 279 | createDdl = 
fmt.Sprintf("%s%s)", createDdl, miscOptions) 280 | 281 | _, err = db.Exec(fmt.Sprintf( 282 | createDdl, 283 | mssqlcommon.QuoteNameBracket(agName), 284 | mssqlcommon.QuoteNameQuote(replica.Name), 285 | mssqlcommon.QuoteNameQuote(replica.EndpointURL), 286 | availabilityMode, 287 | )) 288 | return err 289 | } 290 | 291 | // CreateDbmUser creates a SQL user for the given login if it doesn't already exist, and grants it permissions required for the DBM Kubernetes agent. 292 | func CreateDbmUser(db *sql.DB, username string, loginName string) error { 293 | err := mssqlcommon.CreateUser(db, username, loginName) 294 | if err != nil { 295 | return err 296 | } 297 | _, err = db.Exec(fmt.Sprintf(` 298 | GRANT CREATE CERTIFICATE TO %[1]s; 299 | GRANT CREATE ENDPOINT TO %[2]s; -- CREATE/DROP DBM ENDPOINT 300 | `, mssqlcommon.QuoteNameBracket(username), mssqlcommon.QuoteNameBracket(loginName))) 301 | return err 302 | } 303 | 304 | // DropIfExists drops the given Availability Group if it exists. 305 | // 306 | // Params: 307 | // db: A connection to a SQL Server instance hosting a replica of the AG. 308 | // agName: The name of the AG. 309 | // 310 | func DropIfExists(db *sql.DB, agName string) error { 311 | _, err := db.Exec(fmt.Sprintf(` 312 | IF EXISTS(SELECT * FROM sys.availability_groups WHERE name = ?) 313 | DROP AVAILABILITY GROUP %s 314 | ; 315 | `, mssqlcommon.QuoteNameBracket(agName)), agName) 316 | return err 317 | } 318 | 319 | // Failover performs a failover of the given Availability Group. 320 | // 321 | // Params: 322 | // db: A connection to a SQL Server instance hosting a replica of the AG. 323 | // agName: The name of the AG. 
324 | // 325 | func Failover(db *sql.DB, agName string) error { 326 | _, err := db.Exec(fmt.Sprintf(` 327 | EXEC sp_set_session_context @key = N'external_cluster', @value = N'yes' 328 | ALTER AVAILABILITY GROUP %s FAILOVER 329 | `, mssqlcommon.QuoteNameBracket(agName))) 330 | return err 331 | } 332 | 333 | // FailoverWithDataLoss forces a failover of the given Availability Group, accepting data loss. 334 | // 335 | // Params: 336 | // db: A connection to a SQL Server instance hosting a replica of the AG. 337 | // agName: The name of the AG. 338 | // 339 | func FailoverWithDataLoss(db *sql.DB, agName string) error { 340 | _, err := db.Exec(fmt.Sprintf(` 341 | EXEC sp_set_session_context @key = N'external_cluster', @value = N'yes' 342 | ALTER AVAILABILITY GROUP %s FORCE_FAILOVER_ALLOW_DATA_LOSS 343 | `, mssqlcommon.QuoteNameBracket(agName))) 344 | return err 345 | } 346 | 347 | // GetNumHealthySyncCommitSecondaries gets the number of sync secondaries that are connected and synchronized 348 | func GetNumHealthySyncCommitSecondaries(db *sql.DB, agName string) (uint, error) { 349 | // See https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-replica-states-transact-sql 350 | var healthSyncSecondaries uint 351 | err := db.QueryRow(` 352 | DECLARE @numAgDbs INT 353 | SELECT @numAgDbs = COUNT(*) 354 | FROM 355 | sys.availability_groups ag 356 | INNER JOIN sys.dm_hadr_database_replica_states drs ON drs.group_id = ag.group_id 357 | WHERE ag.name = ? AND drs.is_local = 1 358 | 359 | SELECT COUNT(*) 360 | FROM 361 | sys.availability_groups ag 362 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id 363 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 364 | WHERE ag.name = ? 
AND ars.is_local = 0 AND ar.availability_mode = 1 AND ars.connected_state = 1 AND (synchronization_health = 2 OR @numAgDbs = 0)`, 365 | agName, agName).Scan(&healthSyncSecondaries) 366 | if err != nil { 367 | return 0, err 368 | } 369 | 370 | return healthSyncSecondaries, nil 371 | } 372 | 373 | // GetNumConnectedSyncCommitSecondaries gets the number of sync secondaries that are connected 374 | func GetNumConnectedSyncCommitSecondaries(db *sql.DB, agName string) (uint, error) { 375 | // See https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-replica-states-transact-sql 376 | var connectedSyncSecondaries uint 377 | err := db.QueryRow(` 378 | SELECT COUNT(*) 379 | FROM 380 | sys.availability_groups ag 381 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id 382 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 383 | WHERE ag.name = ? AND ars.is_local = 0 AND ar.availability_mode = 1 AND ars.connected_state = 1`, 384 | agName).Scan(&connectedSyncSecondaries) 385 | if err != nil { 386 | return 0, err 387 | } 388 | 389 | return connectedSyncSecondaries, nil 390 | } 391 | 392 | // PrintReplicaAgState prints the availability group's state 393 | func PrintReplicaAgState(db *sql.DB, agName string, stdout *log.Logger) { 394 | // See https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-replica-states-transact-sql 395 | rows, err := db.Query(` 396 | SELECT ar.replica_server_name, ars.synchronization_health_desc, ars.connected_state_desc 397 | FROM 398 | sys.availability_groups ag 399 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id 400 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 401 | WHERE ag.name = ? 
AND ar.availability_mode = 1`, 402 | agName) 403 | 404 | if err != nil { 405 | panic(err) 406 | } 407 | 408 | defer rows.Close() 409 | result := "AG replica states:\nreplica_server_name, synchronization_health_desc, connected_state_desc\n" 410 | for rows.Next() { 411 | var serverName string 412 | var syncHealth string 413 | var conected string 414 | err = rows.Scan(&serverName, &syncHealth, &conected) 415 | if err != nil { 416 | return 417 | } 418 | 419 | result += fmt.Sprintf("%s, %s, %s\n", serverName, syncHealth, conected) 420 | } 421 | stdout.Print(result) 422 | 423 | var hadDbs int 424 | err = db.QueryRow(` 425 | SELECT COUNT(*) 426 | FROM 427 | sys.availability_groups ag 428 | INNER JOIN sys.dm_hadr_database_replica_states drs ON drs.group_id = ag.group_id 429 | WHERE ag.name = ? AND drs.is_local = 1`, 430 | agName).Scan(&hadDbs) 431 | if err != nil { 432 | panic(err) 433 | } 434 | stdout.Printf("AG has dbs: %v", hadDbs > 0) 435 | } 436 | 437 | // GetReplicaHealthState determines if the instance, replicaName, is connected and healthy, respectively 438 | // return values: 439 | // - bool noting if the replica is connected 440 | // - bool noting if the replica is healthy 441 | // - error if an error occurred 442 | func GetReplicaHealthState(db *sql.DB, replicaName string, agName string) (bool, bool, error) { 443 | var connectedState byte 444 | var synchronizationState byte 445 | err := db.QueryRow(` 446 | DECLARE @numAgDbs INT 447 | SELECT @numAgDbs = COUNT(*) 448 | FROM 449 | sys.availability_groups ag 450 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id 451 | INNER JOIN sys.dm_hadr_database_replica_states drs ON drs.group_id = ag.group_id 452 | WHERE ag.name = ? 
AND drs.is_local = 1 453 | 454 | SELECT ars.connected_state, CASE @numAgDbs WHEN 0 THEN 2 ELSE ars.synchronization_health END AS synchronization_health 455 | FROM 456 | sys.availability_groups ag 457 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id 458 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 459 | WHERE ag.name = ? AND ar.replica_server_name = ?`, agName, agName, replicaName).Scan(&connectedState, &synchronizationState) 460 | 461 | if err != nil { 462 | return false, false, err 463 | } 464 | // See https://docs.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-hadr-availability-replica-states-transact-sql 465 | return connectedState == 1, synchronizationState == 2, nil 466 | } 467 | 468 | // GetAvailabilityMode gets the availability mode of the given Availability Group. 469 | // 470 | // Params: 471 | // db: A connection to a SQL Server instance hosting a replica of the AG. 472 | // agName: The name of the AG. 473 | // 474 | // Returns: 475 | // The numeric value and string name of the availability mode, or an error if the AG was not found. 476 | // 477 | func GetAvailabilityMode(db *sql.DB, agName string) (availabilityMode AvailabilityMode, availabilityModeDesc string, err error) { 478 | err = db.QueryRow(` 479 | SELECT ar.availability_mode, ar.availability_mode_desc 480 | FROM 481 | sys.availability_groups ag 482 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 483 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 484 | WHERE 485 | ag.name = ?`, agName).Scan(&availabilityMode, &availabilityModeDesc) 486 | 487 | return 488 | } 489 | 490 | // GetCurrentConfigurationCommitStartTime gets the start timestamp of the current configuration commit, if any, of the given Availability Group. 
491 | // 492 | // Params: 493 | // db: A connection to a SQL Server instance hosting a replica of the AG. 494 | // agName: The name of the AG. 495 | // 496 | // Returns: 497 | // The start timestamp, or nil if there is no configuration commit in progress. 498 | // 499 | func GetCurrentConfigurationCommitStartTime(db *sql.DB, agName string) (currentConfigurationCommitStartTime *time.Time, err error) { 500 | err = db.QueryRow(` 501 | SELECT ars.current_configuration_commit_start_time_utc 502 | FROM 503 | sys.availability_groups ag 504 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 505 | WHERE 506 | ag.name = ?`, agName).Scan(&currentConfigurationCommitStartTime) 507 | 508 | return 509 | } 510 | 511 | // GetCurrentReplicaID gets the ID of the local replica of the given Availability Group. 512 | // 513 | // Params: 514 | // db: A connection to a SQL Server instance hosting a replica of the AG. 515 | // agName: The name of the AG. 516 | // 517 | func GetCurrentReplicaID(db *sql.DB, agName string) (currentReplicaID string, err error) { 518 | err = db.QueryRow(` 519 | SELECT CAST(ar.replica_id as NCHAR(36)) 520 | FROM 521 | sys.availability_groups ag 522 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 523 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 524 | WHERE 525 | ag.name = ?`, agName).Scan(&currentReplicaID) 526 | 527 | return 528 | } 529 | 530 | // GetCurrentReplicaName gets the name of the local replica of the given Availability Group. 531 | // 532 | // Params: 533 | // db: A connection to a SQL Server instance hosting a replica of the AG. 534 | // agName: The name of the AG. 
535 | // 536 | func GetCurrentReplicaName(db *sql.DB, agName string) (currentReplicaName string, err error) { 537 | err = db.QueryRow(` 538 | SELECT ar.replica_server_name 539 | FROM 540 | sys.availability_groups ag 541 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 542 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 543 | WHERE 544 | ag.name = ?`, agName).Scan(&currentReplicaName) 545 | 546 | return 547 | } 548 | 549 | // GetDatabaseStates returns two messages containing the number of databases that belong to the given Availability Group and are not ONLINE. 550 | // 551 | // The first message contains the number of databases in states that are transient, like RECOVERING. The caller will likely want to 552 | // wait for these databases to come ONLINE on their own. 553 | // 554 | // The second message contains the number of databases in states that are permanent, like SUSPECT. The caller will likely want to fail 555 | // immediately. 556 | // 557 | // Params: 558 | // db: A connection to a SQL Server instance hosting a replica of the AG. 559 | // agName: The name of the AG. 560 | // 561 | func GetDatabaseStates(db *sql.DB, agName string) (transient string, permanent string, err error) { 562 | stmt, err := db.Prepare(` 563 | SELECT drs.database_state, drs.database_state_desc, COUNT(*) FROM 564 | sys.availability_groups ag 565 | INNER JOIN sys.dm_hadr_database_replica_states drs ON drs.group_id = ag.group_id AND drs.is_local = 1 566 | WHERE 567 | ag.name = ? 
AND drs.database_state <> 0 568 | GROUP BY drs.database_state, drs.database_state_desc`) 569 | if err != nil { 570 | return 571 | } 572 | defer stmt.Close() 573 | 574 | rows, err := stmt.Query(agName) 575 | if err != nil { 576 | return 577 | } 578 | defer rows.Close() 579 | 580 | for rows.Next() { 581 | var state DatabaseState 582 | var stateDesc string 583 | var numDatabases int 584 | err = rows.Scan(&state, &stateDesc, &numDatabases) 585 | if err != nil { 586 | return 587 | } 588 | 589 | message := fmt.Sprintf("%d databases are %s, ", numDatabases, stateDesc) 590 | switch state { 591 | case DatabaseStateRESTORING, DatabaseStateRECOVERING, DatabaseStateRECOVERY_PENDING, DatabaseStateOFFLINE: 592 | transient += message 593 | default: 594 | permanent += message 595 | } 596 | } 597 | 598 | transient = strings.TrimSuffix(transient, ", ") 599 | permanent = strings.TrimSuffix(permanent, ", ") 600 | 601 | err = rows.Err() 602 | 603 | return 604 | } 605 | 606 | // GetDBFailoverMode gets the DB_FAILOVER setting of the given Availability Group. 607 | // 608 | // Params: 609 | // db: A connection to a SQL Server instance hosting a replica of the AG. 610 | // agName: The name of the AG. 611 | // 612 | // Returns: 613 | // `true` means ON, `false` means OFF. 614 | // 615 | func GetDBFailoverMode(db *sql.DB, agName string) (dbFailoverMode bool, err error) { 616 | err = db.QueryRow(` 617 | SELECT ag.db_failover 618 | FROM 619 | sys.availability_groups ag 620 | WHERE 621 | ag.name = ?`, agName).Scan(&dbFailoverMode) 622 | 623 | return 624 | } 625 | 626 | // GetNumSyncCommitReplicas gets the number of SYNCHRONOUS_COMMIT replicas in the given Availability Group. 627 | // 628 | // Params: 629 | // db: A connection to a SQL Server instance hosting a replica of the AG. 630 | // agName: The name of the AG. 
631 | // 632 | func GetNumSyncCommitReplicas(db *sql.DB, agName string) (numReplicas uint, err error) { 633 | err = db.QueryRow(` 634 | SELECT COUNT(*) 635 | FROM 636 | sys.availability_replicas ar 637 | INNER JOIN sys.availability_groups ag ON ar.group_id = ag.group_id 638 | WHERE ag.name = ? AND ar.availability_mode = ?`, agName, AmSYNCHRONOUS_COMMIT).Scan(&numReplicas) 639 | 640 | return 641 | } 642 | 643 | // GetNumSyncCommitAndConfigurationOnlyReplicas gets the number of SYNCHRONOUS_COMMIT and CONFIGURATION_ONLY replicas in the given Availability Group. 644 | // 645 | // Params: 646 | // db: A connection to a SQL Server instance hosting a replica of the AG. 647 | // agName: The name of the AG. 648 | // 649 | func GetNumSyncCommitAndConfigurationOnlyReplicas(db *sql.DB, agName string) (numReplicas uint, err error) { 650 | err = db.QueryRow(` 651 | SELECT COUNT(*) 652 | FROM 653 | sys.availability_replicas ar 654 | INNER JOIN sys.availability_groups ag ON ar.group_id = ag.group_id 655 | WHERE ag.name = ? AND ar.availability_mode IN (?, ?)`, agName, AmSYNCHRONOUS_COMMIT, AmCONFIGURATION_ONLY).Scan(&numReplicas) 656 | 657 | return 658 | } 659 | 660 | // GetPrimaryReplicaName gets the name of the primary replica of the given Availability Group. 661 | // 662 | // Params: 663 | // db: A connection to a SQL Server instance hosting a replica of the AG. 664 | // agName: The name of the AG. 665 | // 666 | func GetPrimaryReplicaName(db *sql.DB, agName string) (primaryReplicaName string, err error) { 667 | err = db.QueryRow(` 668 | SELECT ags.primary_replica 669 | FROM 670 | sys.availability_groups ag 671 | INNER JOIN sys.dm_hadr_availability_group_states ags ON ags.group_id = ag.group_id 672 | WHERE 673 | ag.name = ?`, agName).Scan(&primaryReplicaName) 674 | 675 | return 676 | } 677 | 678 | // GetReplicas gets the name and IP of the replicas of the given Availability Group. 
679 | // 680 | // Params: 681 | // db: A connection to a SQL Server instance hosting a replica of the AG. 682 | // agName: The name of the AG. 683 | // 684 | func GetReplicas(db *sql.DB, agName string) ([]Replica, error) { 685 | rows, err := db.Query(` 686 | SELECT CAST(ar.replica_id as NCHAR(36)), ar.replica_server_name, ar.endpoint_url, ar.availability_mode 687 | FROM 688 | sys.availability_replicas ar 689 | INNER JOIN sys.availability_groups ag ON ag.group_id = ar.group_id 690 | WHERE 691 | ag.name = ? 692 | `, agName) 693 | if err != nil { 694 | return nil, err 695 | } 696 | 697 | defer rows.Close() 698 | 699 | result := []Replica{} 700 | for rows.Next() { 701 | var replicaID string 702 | var replicaName string 703 | var endpointURL string 704 | var availabilityMode AvailabilityMode 705 | err = rows.Scan(&replicaID, &replicaName, &endpointURL, &availabilityMode) 706 | if err != nil { 707 | return nil, err 708 | } 709 | 710 | result = append(result, Replica{ 711 | ID: replicaID, 712 | Name: replicaName, 713 | EndpointURL: endpointURL, 714 | AvailabilityMode: availabilityMode, 715 | }) 716 | } 717 | 718 | return result, nil 719 | } 720 | 721 | // GetRole gets the role of the given Availability Group. 722 | // 723 | // Params: 724 | // db: A connection to a SQL Server instance hosting a replica of the AG. 725 | // agName: The name of the AG. 726 | // 727 | // Returns: 728 | // The numeric value and name of the role, or an error if the AG was not found. 
729 | // 730 | func GetRole(db *sql.DB, agName string) (role Role, roleDesc string, err error) { 731 | err = db.QueryRow(` 732 | SELECT ars.role, ars.role_desc 733 | FROM 734 | sys.availability_groups ag 735 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 736 | WHERE 737 | ag.name = ?`, agName).Scan(&role, &roleDesc) 738 | 739 | return 740 | } 741 | 742 | // GetSeedingMode gets the seeding mode of the current replica of the given Availability Group 743 | // 744 | // Params: 745 | // db: A connection to a SQL Server instance hosting a replica of the AG. 746 | // agName: The name of the AG. 747 | // 748 | // Returns: 749 | // The numeric value and string name of the seeding mode, or an error if the AG was not found. 750 | // 751 | func GetSeedingMode(db *sql.DB, agName string) (seedingMode SeedingMode, seedingModeDesc string, err error) { 752 | err = db.QueryRow(` 753 | SELECT ar.seeding_mode, ar.seeding_mode_desc 754 | FROM 755 | sys.availability_groups ag 756 | INNER JOIN sys.dm_hadr_availability_replica_states ars ON ars.group_id = ag.group_id AND ars.is_local = 1 757 | INNER JOIN sys.availability_replicas ar ON ar.replica_id = ars.replica_id 758 | WHERE 759 | ag.name = ?`, agName).Scan(&seedingMode, &seedingModeDesc) 760 | 761 | return 762 | } 763 | 764 | // GetSequenceNumber gets the sequence number of the current replica of the given Availability Group 765 | // 766 | // Params: 767 | // db: A connection to a SQL Server instance hosting a replica of the AG. 768 | // agName: The name of the AG. 769 | // 770 | // Returns: 771 | // The sequence number. 
772 | // 773 | func GetSequenceNumber(db *sql.DB, agName string) (sequenceNumber int64, err error) { 774 | err = db.QueryRow(` 775 | SELECT ag.sequence_number 776 | FROM 777 | sys.availability_groups ag 778 | WHERE 779 | ag.name = ?`, agName).Scan(&sequenceNumber) 780 | 781 | return 782 | } 783 | 784 | // GrantCreateAnyDatabase grants the given Availability Group's replica the permission to create any databases in the AG that aren't present. 785 | // 786 | // Params: 787 | // db: A connection to a SQL Server instance hosting a replica of the AG. 788 | // agName: The name of the AG. 789 | // 790 | func GrantCreateAnyDatabase(db *sql.DB, agName string) (err error) { 791 | _, err = db.Exec(fmt.Sprintf("ALTER AVAILABILITY GROUP %s GRANT CREATE ANY DATABASE", mssqlcommon.QuoteNameBracket(agName))) 792 | return 793 | } 794 | 795 | // Join joins the given Availability Group. 796 | // 797 | // Params: 798 | // db: A connection to a SQL Server instance hosting a replica of the AG. 799 | // agName: The name of the AG. 800 | // 801 | func Join(db *sql.DB, agName string) (err error) { 802 | _, err = db.Exec(fmt.Sprintf("ALTER AVAILABILITY GROUP %s JOIN WITH (CLUSTER_TYPE = EXTERNAL)", mssqlcommon.QuoteNameBracket(agName))) 803 | return 804 | } 805 | 806 | // Offline offlines the local replica of the given Availability Group. 807 | // 808 | // Params: 809 | // db: A connection to a SQL Server instance hosting a replica of the AG. 810 | // agName: The name of the AG. 811 | func Offline(db *sql.DB, agName string) (err error) { 812 | _, err = db.Exec(fmt.Sprintf("ALTER AVAILABILITY GROUP %s OFFLINE", mssqlcommon.QuoteNameBracket(agName))) 813 | return 814 | } 815 | 816 | // RemoveReplica removes the given replica from the given Availability Group. 817 | // 818 | // Params: 819 | // db: A connection to a SQL Server instance hosting a replica of the AG. 820 | // agName: The name of the AG. 821 | // replicaName: The name of the replica to add. 
822 | // 823 | func RemoveReplica(db *sql.DB, agName string, replicaName string) (err error) { 824 | _, err = db.Exec(fmt.Sprintf("ALTER AVAILABILITY GROUP %s REMOVE REPLICA ON %s", mssqlcommon.QuoteNameBracket(agName), mssqlcommon.QuoteNameQuote(replicaName))) 825 | return 826 | } 827 | 828 | // SetRequiredSynchronizedSecondariesToCommit sets the value of REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT on the given Availability Group on the instance. 829 | // 830 | // Params: 831 | // db: A connection to a SQL Server instance hosting a replica of the AG. 832 | // agName: The name of the AG. 833 | // newValue: The new REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT value. 834 | // 835 | func SetRequiredSynchronizedSecondariesToCommit(db *sql.DB, agName string, newValue int32) (err error) { 836 | _, err = db.Exec(fmt.Sprintf(` 837 | DECLARE @num_ags INT; 838 | SELECT @num_ags = COUNT(*) FROM sys.availability_groups WHERE name = ? AND required_synchronized_secondaries_to_commit = ?; 839 | IF @num_ags = 0 840 | ALTER AVAILABILITY GROUP %s SET (REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT = %d) 841 | ; 842 | `, mssqlcommon.QuoteNameBracket(agName), newValue), agName, newValue) 843 | return 844 | } 845 | 846 | // SetRoleToSecondary sets the role of the given Availability Group to SECONDARY. 847 | // 848 | // Params: 849 | // db: A connection to a SQL Server instance hosting a replica of the AG. 850 | // agName: The name of the AG. 
851 | func SetRoleToSecondary(db *sql.DB, agName string) (err error) { 852 | _, err = db.Exec(fmt.Sprintf("ALTER AVAILABILITY GROUP %s SET (ROLE = SECONDARY)", mssqlcommon.QuoteNameBracket(agName))) 853 | return 854 | } 855 | 856 | // UpdateExternalWriteLease updates the valid time of external write lease of the specified availability group to the given value 857 | func UpdateExternalWriteLease(db *sql.DB, agName string, validTime time.Duration) error { 858 | _, err := db.Exec(fmt.Sprintf( 859 | "ALTER AVAILABILITY GROUP %s SET (WRITE_LEASE_VALIDITY = %d)", 860 | mssqlcommon.QuoteNameBracket(agName), validTime/time.Second)) 861 | return err 862 | } 863 | 864 | // UpdateReplica updates the endpoint URL of the given replica in the given Availability Group. 865 | // 866 | // Params: 867 | // db: A connection to a SQL Server instance hosting a replica of the AG. 868 | // agName: The name of the AG. 869 | // replica: The replica to update. 870 | // 871 | func UpdateReplica(db *sql.DB, agName string, replica Replica) (err error) { 872 | _, err = db.Exec(fmt.Sprintf( 873 | "ALTER AVAILABILITY GROUP %s MODIFY REPLICA ON %s WITH (ENDPOINT_URL = %s)", 874 | mssqlcommon.QuoteNameBracket(agName), mssqlcommon.QuoteNameQuote(replica.Name), mssqlcommon.QuoteNameQuote(replica.EndpointURL))) 875 | return 876 | } 877 | -------------------------------------------------------------------------------- /go/src/mssqlcommon/lib.go: -------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | // Package mssqlcommon contains items that are common to all SQL Server golang packages. 
4 | package mssqlcommon 5 | 6 | import ( 7 | "bufio" 8 | "database/sql" 9 | "encoding/hex" 10 | "errors" 11 | "fmt" 12 | "log" 13 | "net/url" 14 | "os" 15 | "strconv" 16 | "strings" 17 | "time" 18 | ) 19 | 20 | // Diagnostics represents whether the SQL Server subsystems are healthy or not 21 | type Diagnostics struct { 22 | System bool 23 | Resource bool 24 | QueryProcessing bool 25 | } 26 | 27 | // ServerHealth represents the various thresholds for server health 28 | type ServerHealth uint 29 | 30 | // SQLConnectionInfo contains properties related to opening a T-SQL connection to a SQL Server instance 31 | type SQLConnectionInfo struct { 32 | Hostname string 33 | Port uint64 34 | Username string 35 | Password string 36 | ApplicationName string 37 | ConnectionTimeout time.Duration 38 | QueryCommandTimeout time.Duration 39 | JoinCommandTimeout time.Duration 40 | } 41 | 42 | const ( 43 | // The instance is down or refusing connections 44 | // 45 | // This library can't distinguish between down or unresponsive, which is why a single health code is used for both, 46 | // and why there is no enum member with a value of `2`. 
47 | ServerDownOrUnresponsive ServerHealth = 1 48 | 49 | // sp_server_diagnostics detected a critical system error 50 | ServerCriticalError ServerHealth = 3 51 | 52 | // sp_server_diagnostics detected a moderate resources error 53 | ServerModerateError ServerHealth = 4 54 | 55 | // sp_server_diagnostics detected an error that's neither moderate nor critical 56 | ServerAnyQualifiedError ServerHealth = 5 57 | ) 58 | 59 | // ServerUnhealthyError is an error that means the SQL Server instance is unhealthy 60 | type ServerUnhealthyError struct { 61 | RawValue ServerHealth 62 | Inner error 63 | } 64 | 65 | func (err *ServerUnhealthyError) Error() string { 66 | switch err.RawValue { 67 | case ServerAnyQualifiedError: 68 | return fmt.Sprintf("AnyQualified %s", err.Inner) 69 | 70 | case ServerModerateError: 71 | return fmt.Sprintf("Moderate %s", err.Inner) 72 | 73 | case ServerCriticalError: 74 | return fmt.Sprintf("Critical %s", err.Inner) 75 | 76 | case ServerDownOrUnresponsive: 77 | return fmt.Sprintf("Unresponsive or down %s", err.Inner) 78 | 79 | default: 80 | return fmt.Sprintf("Unknown (%d) %s", err.RawValue, err.Inner) 81 | } 82 | } 83 | 84 | // DiagnosticsChannelItem represents values returned by a QueryDiagnosticsChannel channel 85 | type DiagnosticsChannelItem struct { 86 | Diagnostics Diagnostics 87 | Error error 88 | } 89 | 90 | const ( 91 | // SQLError_InvalidSyntax corresponds to SQL error 102: 92 | // "Incorrect syntax near ..." 93 | SQLError_InvalidSyntax int32 = 102 94 | 95 | // SQLError_AGCannotFailover_UnsynchronizedDBs corresponds to SQL error 41142: 96 | // "The availability replica ... cannot become the primary replica. One or more databases are not synchronized or have not joined the availability group." 97 | SQLError_AGCannotFailover_UnsynchronizedDBs = 41142 98 | 99 | // SQLError_AlterAGAddRemoveReplica_NotReady corresponds to SQL error 41190: 100 | // "Availability group ... failed to process ... command. 
The local availability replica is not in a state that could process the command." 101 | SQLError_AlterAGAddRemoveReplica_NotReady int32 = 41190 102 | 103 | // SQLError_AGDoesNotAllowExternalLeaseUpdates corresponds to SQL error 47116: 104 | // "The external lease cannot be set on availability group .... External Lease updates are not enabled for this availability group." 105 | SQLError_AGDoesNotAllowExternalLeaseUpdates int32 = 47116 106 | 107 | // SQLError_AGExternalLeaseUpdate_NewExpiryIsOlderThanCurrentExpiry corresponds to SQL error 47119: 108 | // "The current write lease of the availability group ... is still valid. The lease expiration time cannot be set to an earlier time than its current value." 109 | SQLError_AGExternalLeaseUpdate_NewExpiryIsOlderThanCurrentExpiry int32 = 47119 110 | ) 111 | 112 | // GetCertInfo gets the properties of the given SQL Server certificate. 113 | func GetCertInfo(db *sql.DB, certName string, encryptionPassword string) (publicKey []byte, privateKey []byte, thumbprint []byte, err error) { 114 | err = db.QueryRow(` 115 | IF EXISTS (SELECT * FROM sys.certificates WHERE name = ? ) 116 | BEGIN 117 | DECLARE @cert_id INT = CERT_ID(?); 118 | SELECT CERTENCODED(@cert_id) AS public_key, CERTPRIVATEKEY(@cert_id, ?) AS private_key, thumbprint FROM sys.certificates WHERE certificate_id = @cert_id; 119 | END 120 | ELSE 121 | BEGIN 122 | SELECT 0,0,0 123 | END 124 | `, certName, certName, encryptionPassword).Scan(&publicKey, &privateKey, &thumbprint) 125 | return 126 | } 127 | 128 | // CreateCert creates a certificate with the given properties if it doesn't already exist, and returns its properties. 129 | // 130 | // The subject is set to the cert name. 131 | func CreateCert(db *sql.DB, certName string, username string, encryptionPassword string) (publicKey []byte, privateKey []byte, thumbprint []byte, err error) { 132 | err = db.QueryRow(fmt.Sprintf(` 133 | IF NOT EXISTS (SELECT * FROM sys.certificates WHERE name = ?) 
134 | BEGIN 135 | CREATE CERTIFICATE %s 136 | AUTHORIZATION %s 137 | WITH SUBJECT = %s 138 | END 139 | 140 | DECLARE @cert_id INT = CERT_ID(?); 141 | SELECT CERTENCODED(@cert_id) AS public_key, CERTPRIVATEKEY(@cert_id, ?) AS private_key, thumbprint FROM sys.certificates WHERE certificate_id = @cert_id; 142 | `, QuoteNameBracket(certName), QuoteNameBracket(username), QuoteNameQuote(certName)), certName, certName, encryptionPassword).Scan(&publicKey, &privateKey, &thumbprint) 143 | return 144 | } 145 | 146 | // CreateDBMirroringEndpoint creates a DATABASE_MIRRORING endpoint using the given properties. 147 | func CreateDBMirroringEndpoint(db *sql.DB, endpointName string, endpointPort uint64, certName string) error { 148 | _, err := db.Exec(fmt.Sprintf(` 149 | IF NOT EXISTS (SELECT * FROM sys.endpoints WHERE name = ?) 150 | BEGIN 151 | CREATE ENDPOINT %s 152 | STATE = STARTED 153 | AS TCP (LISTENER_IP = (0.0.0.0), LISTENER_PORT = %d) 154 | FOR DATA_MIRRORING ( 155 | ROLE = ALL, 156 | AUTHENTICATION = CERTIFICATE %s, 157 | ENCRYPTION = REQUIRED ALGORITHM AES 158 | ) 159 | END 160 | `, QuoteNameBracket(endpointName), endpointPort, QuoteNameBracket(certName)), endpointName) 161 | return err 162 | } 163 | 164 | // LockSQL creates a T-SQL application lock and holds it 165 | func LockSQL(db *sql.DB, lockName string) (success int, err error) { 166 | err = db.QueryRow(fmt.Sprintf(` 167 | BEGIN TRANSACTION; 168 | DECLARE @result int; 169 | EXEC @result = sp_getapplock @Resource = %s, @LockMode = 'Exclusive', @LockTimeout= '0'; 170 | IF @result < 0 171 | BEGIN 172 | SELECT 0 173 | ROLLBACK TRANSACTION; 174 | END 175 | ELSE 176 | BEGIN 177 | SELECT 1 178 | END; 179 | `, QuoteNameQuote(lockName))).Scan(&success) 180 | return 181 | } 182 | 183 | // CreateOrUpdateLogin creates a login with the given name and password. If the login already exists, updates its password. 
184 | func CreateOrUpdateLogin(db *sql.DB, loginName string, loginPassword string) error { 185 | _, err := db.Exec(fmt.Sprintf(` 186 | IF NOT EXISTS (SELECT * FROM sys.syslogins WHERE name = ?) 187 | BEGIN 188 | CREATE LOGIN %s WITH PASSWORD = %s 189 | END 190 | ELSE 191 | BEGIN 192 | ALTER LOGIN %s WITH PASSWORD = %s 193 | END 194 | `, QuoteNameBracket(loginName), QuoteNameQuote(loginPassword), QuoteNameBracket(loginName), QuoteNameQuote(loginPassword)), loginName) 195 | return err 196 | } 197 | 198 | // CreateMasterKey creates a master key with the given password if it doesn't already exist. 199 | func CreateMasterKey(db *sql.DB, masterKeyPassword string) error { 200 | _, err := db.Exec(fmt.Sprintf(` 201 | IF NOT EXISTS (SELECT * FROM sys.symmetric_keys WHERE name like '%%DatabaseMasterKey%%') 202 | BEGIN 203 | CREATE MASTER KEY ENCRYPTION BY PASSWORD = %s 204 | END 205 | `, QuoteNameQuote(masterKeyPassword))) 206 | return err 207 | } 208 | 209 | // RegenerateMasterKey regenerates the master key with the new password. 210 | func RegenerateMasterKey(db *sql.DB, masterKeyPassword string) error { 211 | _, err := db.Exec(fmt.Sprintf(` 212 | IF EXISTS (SELECT * FROM sys.symmetric_keys WHERE name like '%%DatabaseMasterKey%%') 213 | BEGIN 214 | ALTER MASTER KEY REGENERATE WITH ENCRYPTION BY PASSWORD = %s 215 | END 216 | `, QuoteNameQuote(masterKeyPassword))) 217 | return err 218 | } 219 | 220 | // CreateUser creates a SQL user for the given login if it doesn't already exist. 221 | func CreateUser(db *sql.DB, userName string, loginName string) error { 222 | _, err := db.Exec(fmt.Sprintf(` 223 | IF NOT EXISTS (SELECT * FROM sysusers WHERE name = ?) 
224 | BEGIN 225 | CREATE USER %s FOR LOGIN %s 226 | END 227 | `, QuoteNameBracket(userName), QuoteNameBracket(loginName)), userName) 228 | return err 229 | } 230 | 231 | // Diagnose uses the server health diagnostics object returned by `QueryDiagnostics` to determine server health 232 | func Diagnose(diagnostics Diagnostics) error { 233 | if !diagnostics.System { 234 | return &ServerUnhealthyError{RawValue: ServerCriticalError, Inner: fmt.Errorf("sp_server_diagnostics result indicates system error")} 235 | } 236 | 237 | if !diagnostics.Resource { 238 | return &ServerUnhealthyError{RawValue: ServerModerateError, Inner: fmt.Errorf("sp_server_diagnostics result indicates resource error")} 239 | } 240 | 241 | if !diagnostics.QueryProcessing { 242 | return &ServerUnhealthyError{RawValue: ServerAnyQualifiedError, Inner: fmt.Errorf("sp_server_diagnostics result indicates query processing error")} 243 | } 244 | 245 | return nil 246 | } 247 | 248 | // DropCert drops the specified certificate if it exists 249 | func DropCert(db *sql.DB, certName string) error { 250 | _, err := db.Exec(fmt.Sprintf(` 251 | IF EXISTS (SELECT * FROM sys.certificates WHERE name = ?) 
252 | BEGIN 253 | DROP CERTIFICATE %s 254 | END 255 | `, QuoteNameBracket(certName)), certName) 256 | return err 257 | } 258 | 259 | // DropDBMirroringEndpoint drops the DBMirroring endpoint if it exists 260 | func DropDBMirroringEndpoint(db *sql.DB) error { 261 | var dbMirrorName string 262 | err := db.QueryRow(` 263 | SELECT name FROM sys.database_mirroring_endpoints 264 | `).Scan(&dbMirrorName) 265 | 266 | if err == nil && dbMirrorName != "" { 267 | _, err = db.Exec(fmt.Sprintf(` 268 | DROP ENDPOINT %s 269 | `, QuoteNameBracket(dbMirrorName))) 270 | } else if err == sql.ErrNoRows { 271 | err = nil 272 | } 273 | 274 | return err 275 | } 276 | 277 | // GetEnvDuration gets the environment variable value and attempds to convert it to a duration 278 | func GetEnvDuration(envVarName string, defaultValue time.Duration) time.Duration { 279 | envValue := defaultValue 280 | envValueString := os.Getenv(envVarName) 281 | if envValueString != "" { 282 | envValueFloat, err := strconv.ParseFloat(envValueString, 64) 283 | if err != nil { 284 | panic(fmt.Sprintf("Failed to convert environment variable %s to float, value: %s, %s", envVarName, envValueString, err.Error())) 285 | } 286 | envValue = time.Duration(envValueFloat*1000) * time.Millisecond 287 | } 288 | 289 | return envValue 290 | } 291 | 292 | // GetEnvOrPanic gets value of the specified environment variable, and panics if the variable is not set 293 | func GetEnvOrPanic(envVarName string) string { 294 | envVar := os.Getenv(envVarName) 295 | if envVar == "" { 296 | panic(fmt.Errorf("%s env var does not have valid value", envVarName)) 297 | } 298 | return envVar 299 | } 300 | 301 | // GetLocalServerName gets the server name of the given SQL Server 302 | func GetLocalServerName(db *sql.DB) (serverName string, err error) { 303 | err = db.QueryRow("SELECT @@SERVERNAME").Scan(&serverName) 304 | return 305 | } 306 | 307 | // GetServerInstanceName gets the server instance name of the given SQL Server 308 | func 
GetServerInstanceName(db *sql.DB) (hostname string, err error) { 309 | err = db.QueryRow("SELECT SERVERPROPERTY('ServerName')").Scan(&hostname) 310 | return 311 | } 312 | 313 | // GetServerVersion gets the server version 314 | func GetServerVersion(db *sql.DB) (serverVersion string, err error) { 315 | err = db.QueryRow("SELECT SERVERPROPERTY('ProductVersion')").Scan(&serverVersion) 316 | return 317 | } 318 | 319 | // ParseVersionInfo parses VersionInfo from ProductVersion 320 | // the 0th index stores the major version number, the second index stores the minor version number ect. 321 | func ParseVersionInfo(version string) ([4]int, error) { 322 | var version0, version1, version2, version3 int 323 | _, err := fmt.Sscanf(version, "%d.%d.%d.%d", &version0, &version1, &version2, &version3) 324 | return [4]int{version0, version1, version2, version3}, err 325 | } 326 | 327 | // CompareVersionInfo compares sql version information 328 | // returns -1 if version1 is less than version2 329 | // returns 0 if version1 is equal to version2 330 | // returns 1 if version1 is greater than version2 331 | func CompareVersionInfo(version1 [4]int, version2 [4]int) int { 332 | for i := 0; i < 4; i++ { 333 | if version1[i] > version2[i] { 334 | return 1 335 | } else if version1[i] < version2[i] { 336 | return -1 337 | } 338 | } 339 | 340 | return 0 341 | } 342 | 343 | // GrantConnectOnEndpoint grants CONNECT permission to the specified login on the specified endpoint 344 | func GrantConnectOnEndpoint(db *sql.DB, endpointName string, loginName string) error { 345 | _, err := db.Exec(fmt.Sprintf("GRANT CONNECT ON ENDPOINT::%s TO %s", QuoteNameBracket(endpointName), QuoteNameBracket(loginName))) 346 | return err 347 | } 348 | 349 | // GrantAgControl grants availability group control to a sql login if the ag exists 350 | func GrantAgControl(db *sql.DB, agName string, loginName string) error { 351 | _, err := db.Exec(fmt.Sprintf(` 352 | IF EXISTS(SELECT * FROM sys.availability_groups WHERE 
name = ?) 353 | GRANT CONTROL ON AVAILABILITY GROUP :: %s TO %s 354 | ; 355 | `, QuoteNameBracket(agName), QuoteNameBracket(loginName)), agName) 356 | 357 | return err 358 | } 359 | 360 | // GrantCertificateControl grants endpoint control to a sql login 361 | func GrantCertificateControl(db *sql.DB, certName string, loginName string) error { 362 | _, err := db.Exec(fmt.Sprintf(` 363 | IF EXISTS(SELECT * FROM sys.certificates) 364 | GRANT CONTROL ON CERTIFICATE :: %s TO %s 365 | ; 366 | `, QuoteNameBracket(certName), QuoteNameBracket(loginName))) 367 | 368 | return err 369 | } 370 | 371 | // GrantEndpointCertificates grants loginName control permissions to all of the certificates associated with the endpoint 372 | func GrantEndpointCertificates(db *sql.DB, loginName string) error { 373 | rows, err := db.Query(` 374 | WITH dbm_endpoints 375 | AS ( 376 | SELECT * FROM sys.database_mirroring_endpoints 377 | WHERE connection_auth_desc = 'CERTIFICATE' 378 | ) 379 | SELECT certs.name FROM dbm_endpoints 380 | INNER JOIN sys.certificates certs 381 | ON dbm_endpoints.certificate_id = certs.certificate_id 382 | `) 383 | if err == sql.ErrNoRows { 384 | return nil 385 | } 386 | if err != nil { 387 | return err 388 | } 389 | 390 | defer rows.Close() 391 | 392 | for rows.Next() { 393 | var certName string 394 | err = rows.Scan(&certName) 395 | if err != nil { 396 | return err 397 | } 398 | 399 | // Grant control to all certificates 400 | err = GrantCertificateControl(db, certName, loginName) 401 | if err != nil { 402 | return err 403 | } 404 | } 405 | return err 406 | } 407 | 408 | // GrantEndpointControl grants endpoint control to a sql login 409 | func GrantEndpointControl(db *sql.DB, endpointName string, loginName string) error { 410 | _, err := db.Exec(fmt.Sprintf(` 411 | IF EXISTS(SELECT * FROM sys.database_mirroring_endpoints) 412 | GRANT CONTROL ON ENDPOINT :: %s TO %s 413 | ; 414 | `, QuoteNameBracket(endpointName), QuoteNameBracket(loginName))) 415 | 416 | return err 417 
| } 418 | 419 | // GrantAgentPermissions grants permissions to the AG agent login. 420 | // 421 | // Params: 422 | // username: the username 423 | // loginName: the login name 424 | // 425 | func GrantAgentPermissions(db *sql.DB, username string, loginName string) error { 426 | _, err := db.Exec(fmt.Sprintf(` 427 | GRANT ALTER ANY LOGIN TO %[2]s; -- CREATE/ALTER DBM LOGIN 428 | GRANT ALTER ANY USER TO %[1]s; -- CREATE DBM USER 429 | GRANT CREATE CERTIFICATE TO %[1]s WITH GRANT OPTION; -- Grant CREATE/DROP CERTIFICATE to dbm users 430 | GRANT CREATE ENDPOINT TO %[2]s WITH GRANT OPTION; -- Grant CREATE/DROP DBM ENDPOINT to local dbm user 431 | GRANT CREATE AVAILABILITY GROUP TO %[1]s; -- CREATE AG / ALTER AG JOIN 432 | GRANT ALTER ANY DATABASE TO %[1]s; -- ALTER AG GRANT CREATE ANY DATABASE 433 | GRANT VIEW SERVER STATE TO %[2]s; -- sys.dm_hadr_availability_replica_states 434 | `, QuoteNameBracket(username), QuoteNameBracket(loginName))) 435 | return err 436 | } 437 | 438 | // ImportCert creates a certificate with the given name and keys, if it doesn't already exist 439 | func ImportCert(db *sql.DB, certName string, username string, publicKey []byte, privateKey []byte, decryptionPassword string) error { 440 | _, err := db.Exec(fmt.Sprintf(` 441 | IF NOT EXISTS (SELECT * FROM sys.certificates WHERE name = ?) 442 | BEGIN 443 | CREATE CERTIFICATE %s 444 | AUTHORIZATION %s 445 | FROM BINARY = 0x%s 446 | WITH PRIVATE KEY ( 447 | BINARY = 0x%s, 448 | DECRYPTION BY PASSWORD = %s 449 | ) 450 | END 451 | `, QuoteNameBracket(certName), QuoteNameBracket(username), hex.EncodeToString(publicKey), hex.EncodeToString(privateKey), QuoteNameQuote(decryptionPassword)), certName) 452 | return err 453 | } 454 | 455 | // MonitorDBHealth opens a connection to the SQL Server instance using the given parameters, 456 | // and repeatedly performs health checks. 457 | // 458 | // The returned channel yields: 459 | // 1. errors from opening the connection or running the health check 460 | // 2. 
ServerUnhealthyError values corresponding to the health check results if they violate the given healthThreshold 461 | // 462 | // The channel will be closed after yielding an error of the first category. 463 | func MonitorDBHealth(connectionInfo SQLConnectionInfo, repeatInterval time.Duration, healthThreshold ServerHealth, stdout *log.Logger) <-chan DiagnosticsChannelItem { 464 | result := make(chan DiagnosticsChannelItem) 465 | 466 | // https://msdn.microsoft.com/en-us/library/ff878233.aspx 467 | const stateError = 3 468 | 469 | go func() { 470 | var db *sql.DB 471 | defer close(result) 472 | err := RetryExecute(connectionInfo.ConnectionTimeout, func(i uint) (bool, error) { 473 | stdout.Printf("Attempt %d to connect to the instance at %s:%d and run sp_server_diagnostics", i, connectionInfo.Hostname, connectionInfo.Port) 474 | 475 | var err error 476 | db, err = OpenDB( 477 | connectionInfo.Hostname, 478 | connectionInfo.Port, 479 | connectionInfo.Username, 480 | connectionInfo.Password, 481 | "", 482 | connectionInfo.ApplicationName, 483 | connectionInfo.ConnectionTimeout, 484 | connectionInfo.QueryCommandTimeout, 485 | ) 486 | if err != nil { 487 | stdout.Printf("Attempt %d returned error: %s", i, err) 488 | return false, err 489 | } 490 | 491 | stdout.Printf("Connected to the instance at %s:%d", connectionInfo.Hostname, connectionInfo.Port) 492 | 493 | return true, nil 494 | }) 495 | if err != nil { 496 | result <- DiagnosticsChannelItem{Error: err} 497 | return 498 | } 499 | defer db.Close() 500 | 501 | rows, err := db.Query(fmt.Sprintf("EXEC sp_server_diagnostics %d", int(repeatInterval.Seconds()))) 502 | if err != nil { 503 | result <- DiagnosticsChannelItem{Error: err} 504 | return 505 | } 506 | defer rows.Close() 507 | 508 | for { 509 | var diagnostics Diagnostics 510 | 511 | for rows.Next() { 512 | var creationTime, componentType, componentName, stateDesc, data string 513 | var state int 514 | 515 | err = rows.Scan(&creationTime, &componentType, 
&componentName, &state, &stateDesc, &data) 516 | if err != nil { 517 | break 518 | } 519 | 520 | switch strings.ToLower(componentName) { 521 | case "system": 522 | diagnostics.System = state != stateError 523 | case "resource": 524 | diagnostics.Resource = state != stateError 525 | case "query_processing": 526 | diagnostics.QueryProcessing = state != stateError 527 | } 528 | } 529 | 530 | if err == nil { 531 | err = rows.Err() 532 | } 533 | if err != nil { 534 | result <- DiagnosticsChannelItem{Error: err} 535 | return 536 | } 537 | 538 | result <- DiagnosticsChannelItem{ 539 | Diagnostics: diagnostics, 540 | } 541 | 542 | if !rows.NextResultSet() { 543 | result <- DiagnosticsChannelItem{ 544 | Error: errors.New("unexpected end of resultsets"), 545 | } 546 | return 547 | } 548 | } 549 | }() 550 | 551 | return result 552 | } 553 | 554 | // OpenDB opens a connection to a SQL Server instance using the given parameters. 555 | // 556 | // Params: 557 | // hostname: Hostname of the instance. 558 | // port: Port number for the T-SQL endpoint of the instance. 559 | // username: Username to use to connect to the instance. 560 | // password: Password to use to connect to the instance. 561 | // applicationName: The application name that the connection will use. 562 | // connectionTimeout: Connection timeout. 563 | // commandTimeout: Command timeout. 564 | func OpenDB( 565 | hostname string, 566 | port uint64, 567 | username string, 568 | password string, 569 | databaseName string, 570 | applicationName string, 571 | connectionTimeout time.Duration, 572 | commandTimeout time.Duration, 573 | ) (*sql.DB, error) { 574 | query := url.Values{} 575 | 576 | if databaseName != "" { 577 | query.Add("database", databaseName) 578 | } 579 | 580 | query.Add("app name", applicationName) 581 | // golang calls connection timeout "dial timeout", and go-mssqldb reuses it for the same meaning. 
582 | // SqlClient's CommandTimeout maps to go-mssqldb's "connection timeout", since it gives up on the connection if no data is received for that time. 583 | query.Add("dial timeout", fmt.Sprintf("%d", connectionTimeout/time.Second)) 584 | query.Add("connection timeout", fmt.Sprintf("%d", commandTimeout/time.Second)) 585 | query.Add("encrypt", "true") 586 | query.Add("TrustServerCertificate", "true") // Otherwise TLS will fail because sql is using a self signed certificate 587 | 588 | u := &url.URL{ 589 | Scheme: "sqlserver", 590 | User: url.UserPassword(username, password), 591 | Host: fmt.Sprintf("%s:%d", hostname, port), 592 | RawQuery: query.Encode(), 593 | } 594 | 595 | connectionString := u.String() 596 | 597 | db, err := sql.Open("mssql", connectionString) 598 | if err != nil { 599 | return nil, &ServerUnhealthyError{RawValue: ServerDownOrUnresponsive, Inner: err} 600 | } 601 | 602 | err = db.Ping() 603 | if err != nil { 604 | _ = db.Close() 605 | return nil, &ServerUnhealthyError{RawValue: ServerDownOrUnresponsive, Inner: err} 606 | } 607 | 608 | return db, nil 609 | } 610 | 611 | // OpenDBWithHealthCheck opens a connection to a SQL Server instance using the given parameters, and performs a health check. 612 | // 613 | // Params: 614 | // hostname: Hostname of the instance. 615 | // port: Port number for the T-SQL endpoint of the instance. 616 | // username: Username to use to connect to the instance. 617 | // password: Password to use to connect to the instance. 618 | // connectionTimeout: Connection timeout. 619 | // If connection fails, this function will retry until this time has elapsed. 620 | // If this time elapses, the last error encountered will be returned. 621 | // monitorTimeout: monitor interval timeout. 
This will be 0 from other actions, but greater than 0 for monitor action 622 | func OpenDBWithHealthCheck( 623 | hostname string, 624 | port uint64, 625 | username string, 626 | password string, 627 | applicationName string, 628 | connectionTimeout time.Duration, 629 | commandTimeout time.Duration, 630 | monitorTimeout time.Duration, 631 | stdout *log.Logger, 632 | ) (db *sql.DB, err error) { 633 | 634 | 635 | if monitorTimeout > 0 && monitorTimeout > connectionTimeout { 636 | err = RetryExecuteWithTimeout(connectionTimeout, monitorTimeout, stdout, func(i uint) (bool, error) { 637 | stdout.Printf("From RetryExecuteWithTimeout - Attempt %d to connect to the instance at %s:%d\n", i, hostname, port) 638 | var err error 639 | 640 | db, err = OpenDB(hostname, port, username, password, "", applicationName, connectionTimeout, commandTimeout) 641 | if err != nil { 642 | stdout.Printf("Attempt %d returned error: %s\n", i, FormatErrorString(err)) 643 | 644 | return false, err 645 | } 646 | 647 | stdout.Printf("Connected to the instance at %s:%d\n", hostname, port) 648 | 649 | return true, nil 650 | }) 651 | } else { 652 | err = RetryExecute(connectionTimeout, func(i uint) (bool, error) { 653 | stdout.Printf("From RetryExecute - Attempt %d to connect to the instance at %s:%d\n", i, hostname, port) 654 | var err error 655 | 656 | db, err = OpenDB(hostname, port, username, password, "", applicationName, connectionTimeout, commandTimeout) 657 | if err != nil { 658 | stdout.Printf("Attempt %d returned error: %s\n", i, FormatErrorString(err)) 659 | 660 | return false, err 661 | } 662 | 663 | stdout.Printf("Connected to the instance at %s:%d\n", hostname, port) 664 | 665 | return true, nil 666 | }) 667 | } 668 | 669 | 670 | if err != nil { 671 | return 672 | } 673 | 674 | diagnostics, err := QueryDiagnostics(db) 675 | if err != nil { 676 | return 677 | } 678 | 679 | err = Diagnose(diagnostics) 680 | 681 | return 682 | } 683 | 684 | // QueryDiagnostics gets the server health 
diagnostics of a SQL Server instance 685 | func QueryDiagnostics(db *sql.DB) (result Diagnostics, err error) { 686 | // https://msdn.microsoft.com/en-us/library/ff878233.aspx 687 | const stateError = 3 688 | 689 | rows, err := db.Query("EXEC sp_server_diagnostics") 690 | if err != nil { 691 | return result, err 692 | } 693 | defer rows.Close() 694 | 695 | for rows.Next() { 696 | var creationTime, componentType, componentName, stateDesc, data string 697 | var state int // https://msdn.microsoft.com/en-us/library/ff878233.aspx 698 | 699 | err = rows.Scan(&creationTime, &componentType, &componentName, &state, &stateDesc, &data) 700 | if err != nil { 701 | break 702 | } 703 | 704 | switch strings.ToLower(componentName) { 705 | case "system": 706 | result.System = state != stateError 707 | case "resource": 708 | result.Resource = state != stateError 709 | case "query_processing": 710 | result.QueryProcessing = state != stateError 711 | } 712 | } 713 | 714 | err = rows.Err() 715 | 716 | return 717 | } 718 | 719 | // QuoteNameBracket performs the equivalent operation of QUOTENAME with quote_character = '[' 720 | func QuoteNameBracket(s string) string { 721 | return fmt.Sprintf("[%s]", strings.Replace(s, "]", "]]", -1)) 722 | } 723 | 724 | // QuoteNameQuote performs the equivalent operation of QUOTENAME with quote_character = '\'' 725 | func QuoteNameQuote(s string) string { 726 | return fmt.Sprintf("'%s'", strings.Replace(s, "'", "''", -1)) 727 | } 728 | 729 | // ReadCredentialsFile reads the specified credentials file to extract a SQL username and password. 730 | // 731 | // The first line contains the username. 732 | // The second line contains the password. 733 | // Lines are separated by LF. 734 | // The second line can end with LF or EOF. 
735 | func ReadCredentialsFile(filename string) (username string, password string, err error) { 736 | file, err := os.Open(filename) 737 | if err != nil { 738 | return 739 | } 740 | defer file.Close() 741 | 742 | scanner := bufio.NewScanner(file) 743 | 744 | if !scanner.Scan() { 745 | err = fmt.Errorf("could not read first line to extract username") 746 | return 747 | } 748 | username = scanner.Text() 749 | 750 | if !scanner.Scan() { 751 | err = fmt.Errorf("could not read second line to extract password") 752 | return 753 | } 754 | password = scanner.Text() 755 | 756 | return 757 | } 758 | 759 | // RetryExecute retries the execution of a function until the function returns true or the specified timeout elapses. 760 | // If the timeout elapsed, the last error returned by the function is returned. 761 | // 762 | // The function receives a uint that contains the iteration number (starting from 1). 763 | func RetryExecute( 764 | retryTimeout time.Duration, 765 | retryFn func(uint) (bool, error), 766 | ) (err error) { 767 | successChannel := make(chan struct{}) 768 | errChannel := make(chan error) 769 | timeoutChannel := time.After(retryTimeout) 770 | 771 | go func() { 772 | defer func() { 773 | if r := recover(); r != nil { 774 | err := r.(error) // Panics if r is not of type error, which is desirable 775 | errChannel <- err 776 | } 777 | }() 778 | 779 | for i := uint(1); ; i++ { 780 | success, err := retryFn(i) 781 | 782 | if err == nil { 783 | if success { 784 | successChannel <- struct{}{} 785 | return 786 | } 787 | } else { 788 | errChannel <- err 789 | } 790 | 791 | time.Sleep(1 * time.Second) 792 | } 793 | }() 794 | 795 | // Loop until success or timeout 796 | for { 797 | select { 798 | case _ = <-successChannel: 799 | err = nil 800 | return 801 | 802 | case err = <-errChannel: 803 | // Store the latest error so that it can be returned on timeout 804 | 805 | case _ = <-timeoutChannel: 806 | if err == nil { 807 | // Goroutine timed out without failing even once, 
so construct a timeout error to return to the caller 808 | err = fmt.Errorf("Connection attempt timed out. Either SQL Server is unresponsive or not accepting connection request") 809 | } 810 | 811 | return 812 | } 813 | } 814 | } 815 | 816 | 817 | // RetryExecuteWithTimeout is an extention of RetryExecute. If a connection timeout occurs then we will retry the connection attempt until the monitor timeout elapses. 818 | // If the timeout elapsed, the last error returned by the function is returned. 819 | // 820 | // The function receives a uint that contains the iteration number (starting from 1). 821 | 822 | func RetryExecuteWithTimeout( 823 | retryTimeout time.Duration, 824 | monitorTimeout time.Duration, 825 | stdout *log.Logger, 826 | retryFn func(uint) (bool, error), 827 | ) (err error) { 828 | successChannel := make(chan struct{}) 829 | errChannel := make(chan error) 830 | timeoutChannel := time.After(monitorTimeout) 831 | quit := make(chan bool) 832 | 833 | var reterr error 834 | var finner func() 835 | var fouter func() (err error) 836 | 837 | finner = func() { 838 | retryChannel := time.After(retryTimeout) 839 | defer func() { 840 | if r := recover(); r != nil { 841 | err := r.(error) // Panics if r is not of type error, which is desirable 842 | errChannel <- err 843 | } 844 | }() 845 | 846 | for i := uint(1); ; i++ { 847 | 848 | select { 849 | 850 | case <-retryChannel: 851 | quit <- true 852 | return 853 | 854 | default: 855 | success, err := retryFn(i) 856 | 857 | if err == nil { 858 | if success { 859 | successChannel <- struct{}{} 860 | return 861 | } 862 | } else { 863 | errChannel <- err 864 | } 865 | 866 | time.Sleep(1 * time.Second) 867 | 868 | } 869 | } 870 | } 871 | 872 | fouter = func() (err error) { 873 | go finner() 874 | 875 | // Loop until success or timeout 876 | for { 877 | select { 878 | case _ = <-successChannel: 879 | err = nil 880 | return 881 | 882 | case err = <-errChannel: 883 | // Store the latest error so that it can be returned on 
timeout 884 | 885 | 886 | case _ = <-quit: 887 | err = fmt.Errorf("Attempt retry") 888 | return 889 | 890 | case _ = <-timeoutChannel: 891 | if err == nil { 892 | // Goroutine timed out without failing even once, so construct a timeout error to return to the caller 893 | err = fmt.Errorf("Connection attempts timed out. Either SQL Server is unresponsive or not accepting connection request") 894 | } 895 | 896 | return 897 | } 898 | } 899 | } 900 | 901 | reterr = fouter() 902 | for reterr != nil { 903 | if strings.Index(reterr.Error(), "retry") > 0 { 904 | stdout.Printf("Connection request timed out - attempting retry \n") 905 | reterr = fouter() 906 | } else { 907 | return reterr 908 | } 909 | } 910 | 911 | return reterr 912 | } 913 | 914 | // SetLocalServerName sets the local server name to the given name via sp_dropserver + sp_addserver 915 | func SetLocalServerName(db *sql.DB, serverName string) error { 916 | var currentServerName string 917 | err := db.QueryRow(`SELECT name FROM sys.servers WHERE server_id = 0`).Scan(¤tServerName) 918 | 919 | if err == nil && strings.EqualFold(currentServerName, serverName) { 920 | // Existing sys.servers row already has the specified name 921 | return nil 922 | } 923 | 924 | if err != nil && err != sql.ErrNoRows { 925 | // Unexpected error 926 | return err 927 | } 928 | 929 | if err == nil { 930 | // There is an existing sys.servers row and it has a different name than the specified name. Drop it. 931 | _, err = db.Exec("EXEC sp_dropserver ?", currentServerName) 932 | if err != nil { 933 | return err 934 | } 935 | } 936 | 937 | // At this point there is no sys.servers row for a local server. Add a row with the specified name. 
938 | _, err = db.Exec("EXEC sp_addserver ?, local", serverName) 939 | 940 | return err 941 | } 942 | 943 | // GetReplicaNameRetry gets the replica name, will retry until success 944 | func GetReplicaNameRetry(connectionInfo SQLConnectionInfo, stdout *log.Logger) string { 945 | var replicaName string 946 | // Retry until SQL Server is up 947 | for { 948 | err := WithDbConnection(connectionInfo, stdout, func(db *sql.DB) error { 949 | var err error 950 | stdout.Printf("Getting replica name...") 951 | replicaName, err = GetServerInstanceName(db) 952 | return err 953 | }) 954 | if err != nil { 955 | stdout.Printf("could not connect to local sqlservr: %s", err) 956 | time.Sleep(1 * time.Second) 957 | } else { 958 | return replicaName 959 | } 960 | } 961 | } 962 | 963 | // WithDbConnection connects to a SQL Server instance using the specified connection info, runs the given function against it, and closes the connection. 964 | // 965 | // The QueryCommandTimeout is used for the command timeout. 966 | func WithDbConnection(connectionInfo SQLConnectionInfo, stdout *log.Logger, f func(*sql.DB) error) error { 967 | return WithDB( 968 | connectionInfo.Hostname, 969 | connectionInfo.Port, 970 | connectionInfo.Username, 971 | connectionInfo.Password, 972 | "", 973 | connectionInfo.ApplicationName, 974 | connectionInfo.ConnectionTimeout, 975 | connectionInfo.QueryCommandTimeout, 976 | stdout, 977 | f, 978 | ) 979 | } 980 | 981 | // WithDbJoinConnection connects to a SQL Server instance using the specified connection info, runs the given function against it, and closes the connection. 982 | // 983 | // The JoinCommandTimeout is used for the command timeout. 
984 | func WithDbJoinConnection(connectionInfo SQLConnectionInfo, stdout *log.Logger, f func(*sql.DB) error) error { 985 | return WithDB( 986 | connectionInfo.Hostname, 987 | connectionInfo.Port, 988 | connectionInfo.Username, 989 | connectionInfo.Password, 990 | "", 991 | connectionInfo.ApplicationName, 992 | connectionInfo.ConnectionTimeout, 993 | connectionInfo.JoinCommandTimeout, 994 | stdout, 995 | f, 996 | ) 997 | } 998 | 999 | // WithDB connects to a SQL Server instance, runs the given function against it, and closes the connection. 1000 | func WithDB( 1001 | hostname string, 1002 | port uint64, 1003 | username string, 1004 | password string, 1005 | databaseName string, 1006 | applicationName string, 1007 | connectionTimeout time.Duration, 1008 | commandTimeout time.Duration, 1009 | stdout *log.Logger, 1010 | f func(*sql.DB) error, 1011 | ) error { 1012 | db, err := OpenDB(hostname, port, username, password, databaseName, applicationName, connectionTimeout, commandTimeout) 1013 | if err != nil { 1014 | return err 1015 | } 1016 | 1017 | defer db.Close() 1018 | 1019 | err = f(db) 1020 | 1021 | return err 1022 | } 1023 | 1024 | // Format the error string 1025 | // 1026 | func FormatErrorString(err error) error { 1027 | type SqlErrorWithNumber interface { 1028 | SQLErrorNumber() int32 1029 | SQLErrorState() uint8 1030 | } 1031 | 1032 | if err != nil { 1033 | if sqlErrorWithNumber, ok := err.(SqlErrorWithNumber); ok { 1034 | var errVal int 1035 | var errSt int 1036 | var errNo string 1037 | var errState string 1038 | errSt = int(sqlErrorWithNumber.SQLErrorState()) 1039 | errVal = int(sqlErrorWithNumber.SQLErrorNumber()) 1040 | errNo = strconv.Itoa(errVal) 1041 | errState = strconv.Itoa(errSt) 1042 | return fmt.Errorf("Error %s, State %s: %s", errNo, errState, err) 1043 | } 1044 | return fmt.Errorf("%s", err) 1045 | } 1046 | return nil 1047 | } 1048 | -------------------------------------------------------------------------------- /go/src/ag-helper/main.go: 
-------------------------------------------------------------------------------- 1 | // Copyright (C) Microsoft Corporation. 2 | 3 | package main 4 | 5 | import ( 6 | "database/sql" 7 | "errors" 8 | "flag" 9 | "fmt" 10 | "log" 11 | "math" 12 | "os" 13 | "regexp" 14 | "strconv" 15 | "strings" 16 | "time" 17 | 18 | mssql "github.com/denisenkom/go-mssqldb" 19 | 20 | "mssqlcommon" 21 | mssqlag "mssqlcommon/ag" 22 | mssqlocf "mssqlcommon/ocf" 23 | ) 24 | 25 | const ( 26 | // promotionScoreCurrentMaster is the promotion score set on a replica that's already the master. 27 | // This is the highest value to motivate Pacemaker to keep it the master. 28 | promotionScoreCurrentMaster = "20" 29 | 30 | // promotionScoreCanBePromoted is the promotion score set on a replica that can be promoted if necessary. 31 | // This is lower than the score set on the current master but still greater than 0. 32 | promotionScoreCanBePromoted = "10" 33 | 34 | // promotionScoreShouldNotBePromoted is the promotion score set on a replica that should not be promoted. 
35 | promotionScoreShouldNotBePromoted = "-INFINITY" 36 | ) 37 | 38 | func main() { 39 | stdout := log.New(os.Stdout, "", log.LstdFlags) 40 | stderr := log.New(os.Stderr, "ERROR: ", log.LstdFlags) 41 | promotionScoreOut := log.New(os.Stderr, "PROMOTION_SCORE: ", 0) 42 | sequenceNumberOut := log.New(os.Stderr, "SEQUENCE_NUMBER: ", 0) 43 | leaseExpiryOut := log.New(os.Stderr, "LEASE_EXPIRY: ", 0) 44 | 45 | err := mssqlocf.KillCurrentProcessWhenParentExits() 46 | if err != nil { 47 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err)) 48 | } 49 | 50 | err = doMain(stdout, stderr, promotionScoreOut, sequenceNumberOut, leaseExpiryOut) 51 | if err != nil { 52 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err)) 53 | } 54 | } 55 | 56 | func doMain(stdout *log.Logger, stderr *log.Logger, promotionScoreOut *log.Logger, sequenceNumberOut *log.Logger, leaseExpiryOut *log.Logger) error { 57 | var ( 58 | hostname string 59 | sqlPort uint64 60 | agName string 61 | credentialsFile string 62 | applicationName string 63 | rawConnectionTimeout int64 64 | rawHealthThreshold uint 65 | 66 | action string 67 | 68 | skipPreCheck bool 69 | sequenceNumbers string 70 | newMaster string 71 | requiredSynchronizedSecondariesToCommitArg int 72 | currentMaster string 73 | disablePrimaryOnQuorumTimeoutAfter int64 74 | primaryWriteLeaseDuration int64 75 | rawMonitorTimeout int64 76 | leaseExpiry string 77 | ) 78 | 79 | flag.StringVar(&hostname, "hostname", "localhost", "The hostname of the SQL Server instance to connect to. 
Default: localhost") 80 | flag.Uint64Var(&sqlPort, "port", 0, "The port on which the instance is listening for logins.") 81 | flag.StringVar(&agName, "ag-name", "", "The name of the Availability Group") 82 | flag.StringVar(&credentialsFile, "credentials-file", "", "The path to the credentials file.") 83 | flag.StringVar(&applicationName, "application-name", "", "The application name to use for the T-SQL connection.") 84 | flag.Int64Var(&rawConnectionTimeout, "connection-timeout", 30, "The connection timeout in seconds. "+ 85 | "The application will retry connecting to the instance until this time elapses. Default: 30") 86 | flag.UintVar(&rawHealthThreshold, "health-threshold", uint(mssqlcommon.ServerCriticalError), "The instance health threshold. Default: 3 (SERVER_CRITICAL_ERROR)") 87 | 88 | flag.StringVar(&action, "action", "", `One of --start, --stop, --monitor, --pre-promote, --promote, --demote 89 | start: Start the replica on this node. 90 | stop: Stop the replica on this node. 91 | monitor: Monitor the replica on this node. 92 | pre-start: Before starting a new clone. 93 | post-stop: After stopping an existing clone. 94 | pre-promote: Fetch the sequence number of the replica on this node. 95 | promote: Promote the replica on this node to master. 96 | demote: Demote the replica on this node to slave.`) 97 | 98 | flag.BoolVar(&skipPreCheck, "skip-precheck", false, "Promote the replica on this node to master even if its availability mode is ASYNCHRONOUS_COMMIT.") 99 | flag.StringVar(&sequenceNumbers, "sequence-numbers", "", "The sequence numbers of each replica as stored in the cluster. The value is expected to be in the format returned by attrd_updater -QA") 100 | flag.StringVar(&newMaster, "new-master", "", "The name of the node that is being promoted.") 101 | flag.IntVar(&requiredSynchronizedSecondariesToCommitArg, "required-synchronized-secondaries-to-commit", -1, "Explicit value for REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT. 
If not provided, the value will be derived from the number of SYNCHRONOUS_COMMIT replicas.") 102 | flag.StringVar(¤tMaster, "current-master", "", "The name of the node that is currently the master.") 103 | flag.Int64Var(&disablePrimaryOnQuorumTimeoutAfter, "disable-primary-on-quorum-timeout-after", 60, "How long the primary can block on committing a configuration update before the agent stops renewing its write lease, in seconds. Default: 60") 104 | flag.Int64Var(&primaryWriteLeaseDuration, "primary-write-lease-duration", -1, "Primary write lease duration, in seconds.") 105 | flag.Int64Var(&rawMonitorTimeout, "monitor-interval-timeout", 0, "The monitor interval timeout in seconds. "+ 106 | "The aghelper will attempt new connection request to the instance after connection timeout, this goes until this time elapses. Default: 0") 107 | flag.StringVar(&leaseExpiry, "lease-expiry", "", "The lease expiry time. The value is expected to be in the format returned by attrd_updater -QA") 108 | 109 | flag.Parse() 110 | if hostname == "" { 111 | hostname = "localhost" 112 | } 113 | stdout.Printf( 114 | "ag-helper invoked with hostname [%s]; port [%d]; ag-name [%s]; credentials-file [%s]; application-name [%s]; connection-timeout [%d]; health-threshold [%d]; action [%s]\n", 115 | hostname, 116 | sqlPort, 117 | agName, 118 | credentialsFile, 119 | applicationName, 120 | rawConnectionTimeout, 121 | rawHealthThreshold, 122 | action) 123 | 124 | switch action { 125 | case "start": 126 | stdout.Printf( 127 | "ag-helper invoked with sequence-numbers [...]; required-synchronized-secondaries-to-commit [%d]; current-master [%s]; disable-primary-on-quorum-timeout-after [%d]; primary-write-lease-duration [%d]", 128 | requiredSynchronizedSecondariesToCommitArg, currentMaster, disablePrimaryOnQuorumTimeoutAfter, primaryWriteLeaseDuration, 129 | ) 130 | 131 | case "monitor": 132 | stdout.Printf( 133 | "ag-helper invoked with required-synchronized-secondaries-to-commit [%d]; current-master 
[%s]; disable-primary-on-quorum-timeout-after [%d]; primary-write-lease-duration [%d]; monitor-interval-timeout [%d]", 134 | requiredSynchronizedSecondariesToCommitArg, currentMaster, disablePrimaryOnQuorumTimeoutAfter, primaryWriteLeaseDuration, rawMonitorTimeout, 135 | ) 136 | 137 | case "pre-start": 138 | stdout.Printf( 139 | "ag-helper invoked with required-synchronized-secondaries-to-commit [%d]\n", 140 | requiredSynchronizedSecondariesToCommitArg) 141 | 142 | case "post-stop": 143 | stdout.Printf( 144 | "ag-helper invoked with required-synchronized-secondaries-to-commit [%d]\n", 145 | requiredSynchronizedSecondariesToCommitArg) 146 | 147 | case "promote": 148 | stdout.Printf( 149 | "ag-helper invoked with skip-precheck [%t]; sequence-numbers [...]; new-master [%s]; required-synchronized-secondaries-to-commit [%d]; disable-primary-on-quorum-timeout-after [%d]; primary-write-lease-duration [%d]", 150 | skipPreCheck, newMaster, requiredSynchronizedSecondariesToCommitArg, disablePrimaryOnQuorumTimeoutAfter, primaryWriteLeaseDuration, 151 | ) 152 | } 153 | 154 | if hostname == "" { 155 | return errors.New("a valid hostname must be specified using --hostname") 156 | } 157 | 158 | if sqlPort == 0 { 159 | return errors.New("a valid port number must be specified using --port") 160 | } 161 | 162 | if agName == "" { 163 | return errors.New("a valid AG name must be specified using --ag-name") 164 | } 165 | 166 | if credentialsFile == "" { 167 | return errors.New("a valid path to a credentials file must be specified using --credentials-file") 168 | } 169 | 170 | if applicationName == "" { 171 | return errors.New("a valid application name must be specified using --application-name") 172 | } 173 | 174 | if action == "" { 175 | return errors.New("a valid action must be specified using --action") 176 | } 177 | 178 | if (action == "start" || action == "monitor" || action == "promote") && primaryWriteLeaseDuration < 0 { 179 | return errors.New("a valid value must be specified 
using --primary-write-lease-duration") 180 | } 181 | 182 | if (action == "promote") && newMaster == "" { 183 | return errors.New("a valid hostname must be specified using --new-master") 184 | } 185 | 186 | err := mssqlocf.ImportOcfExitCodes() 187 | if err != nil { 188 | return err 189 | } 190 | 191 | connectionTimeout := time.Duration(rawConnectionTimeout) * time.Second 192 | monitorTimeout := time.Duration(rawMonitorTimeout) * time.Second 193 | healthThreshold := mssqlcommon.ServerHealth(rawHealthThreshold) 194 | 195 | var requiredSynchronizedSecondariesToCommit *uint 196 | if requiredSynchronizedSecondariesToCommitArg != -1 { 197 | if requiredSynchronizedSecondariesToCommitArg < 0 || requiredSynchronizedSecondariesToCommitArg > math.MaxInt32 { 198 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_CONFIGURED, errors.New( 199 | "--required-synchronized-secondaries-to-commit must be set to a valid integer between 0 and one less than the number of SYNCHRONOUS_COMMIT replicas (both inclusive)")) 200 | } 201 | 202 | requiredSynchronizedSecondariesToCommitUint := uint(requiredSynchronizedSecondariesToCommitArg) 203 | requiredSynchronizedSecondariesToCommit = &requiredSynchronizedSecondariesToCommitUint 204 | } 205 | 206 | sqlUsername, sqlPassword, err := mssqlcommon.ReadCredentialsFile(credentialsFile) 207 | if err != nil { 208 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_ARGS, fmt.Errorf("Could not read credentials file: %s", err)) 209 | } 210 | 211 | var db *sql.DB 212 | 213 | switch action { 214 | case "start", "monitor", "promote", "pre-promote": 215 | // Ensure instance is healthy before checking AG health 216 | db, err = mssqlcommon.OpenDBWithHealthCheck( 217 | hostname, 218 | sqlPort, 219 | sqlUsername, 220 | sqlPassword, 221 | applicationName, 222 | connectionTimeout, 223 | connectionTimeout, 224 | monitorTimeout, 225 | stdout, 226 | ) 227 | if err != nil { 228 | switch serverUnhealthyError := err.(type) { 229 | case *mssqlcommon.ServerUnhealthyError: 230 
| if serverUnhealthyError.RawValue <= healthThreshold { 231 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_GENERIC, fmt.Errorf( 232 | "Instance is unhealthy: status %d is at or below monitor policy %d", 233 | serverUnhealthyError.RawValue, healthThreshold)) 234 | } 235 | 236 | stdout.Printf("Instance is healthy: status %d is above monitor policy %d", serverUnhealthyError.RawValue, healthThreshold) 237 | 238 | default: 239 | stdout.Printf("OpenDBWithHealthCheck failed during %s: %s", action, mssqlcommon.FormatErrorString(err)) 240 | return err 241 | } 242 | } 243 | 244 | default: 245 | // Don't check instance health for other actions 246 | 247 | db, err = mssqlcommon.OpenDB( 248 | hostname, 249 | sqlPort, 250 | sqlUsername, 251 | sqlPassword, 252 | "", 253 | applicationName, 254 | connectionTimeout, 255 | connectionTimeout, 256 | ) 257 | if err != nil { 258 | stdout.Printf("OpenDB failed during %s: %s", action, mssqlcommon.FormatErrorString(err)) 259 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not connect to instance: %s", mssqlcommon.FormatErrorString(err))) 260 | } 261 | } 262 | defer db.Close() 263 | 264 | var ocfExitCode mssqlocf.OcfExitCode 265 | 266 | switch action { 267 | case "start": 268 | ocfExitCode, err = 269 | start( 270 | db, 271 | agName, 272 | sequenceNumbers, 273 | requiredSynchronizedSecondariesToCommit, 274 | currentMaster, 275 | disablePrimaryOnQuorumTimeoutAfter, 276 | primaryWriteLeaseDuration, 277 | stdout, 278 | promotionScoreOut, 279 | leaseExpiryOut) 280 | 281 | case "stop": 282 | ocfExitCode, err = stop(db, agName, stdout) 283 | 284 | case "monitor": 285 | ocfExitCode, err = 286 | monitor( 287 | db, 288 | agName, 289 | requiredSynchronizedSecondariesToCommit, 290 | currentMaster, 291 | "monitor", 292 | disablePrimaryOnQuorumTimeoutAfter, 293 | primaryWriteLeaseDuration, 294 | stdout, 295 | promotionScoreOut, 296 | leaseExpiryOut) 297 | 298 | case "pre-start": 299 | ocfExitCode, err = preStart(db, 
agName, requiredSynchronizedSecondariesToCommit, stdout, sequenceNumberOut) 300 | 301 | case "post-stop": 302 | ocfExitCode, err = postStop(db, agName, requiredSynchronizedSecondariesToCommit, stdout) 303 | 304 | case "pre-promote": 305 | ocfExitCode, err = prePromote(db, agName, stdout, sequenceNumberOut) 306 | 307 | case "post-promote": 308 | ocfExitCode, err = postPromote(db, agName, stdout, sequenceNumberOut) 309 | 310 | case "promote": 311 | ocfExitCode, err = 312 | promote( 313 | db, 314 | agName, 315 | sequenceNumbers, 316 | newMaster, 317 | skipPreCheck, 318 | requiredSynchronizedSecondariesToCommit, 319 | disablePrimaryOnQuorumTimeoutAfter, 320 | primaryWriteLeaseDuration, 321 | stdout, 322 | promotionScoreOut, 323 | leaseExpiryOut) 324 | 325 | case "demote": 326 | ocfExitCode, err = demote(db, agName, leaseExpiry, stdout) 327 | 328 | default: 329 | return fmt.Errorf("unknown value for --action %s", action) 330 | } 331 | 332 | if err != nil { 333 | stdout.Printf("Failed action %s: %s", action, mssqlcommon.FormatErrorString(err)) 334 | } 335 | return mssqlocf.OcfExit(stderr, ocfExitCode, err) 336 | } 337 | 338 | // Function: start 339 | // 340 | // Description: 341 | // Implements the OCF "start" action by ensuring the AG replica exists and is in SECONDARY role. 342 | // 343 | // Returns: 344 | // OCF_SUCCESS: AG replica exists and is in SECONDARY role. 
345 | // OCF_ERR_GENERIC: Propagated from `monitor()` 346 | // 347 | func start( 348 | db *sql.DB, agName string, 349 | sequenceNumbers string, 350 | requiredSynchronizedSecondariesToCommit *uint, 351 | currentMaster string, 352 | disablePrimaryOnQuorumTimeoutAfter int64, 353 | primaryWriteLeaseDuration int64, 354 | stdout *log.Logger, 355 | promotionScoreOut *log.Logger, 356 | leaseExpiryOut *log.Logger, 357 | ) (mssqlocf.OcfExitCode, error) { 358 | 359 | instancename, err := mssqlcommon.GetServerInstanceName(db) 360 | if err != nil { 361 | stdout.Printf("[DEBUG] AG Helper Start Role GetServerInstanceName error: %v", err) 362 | instancename = "" 363 | } 364 | stdout.Printf("[DEBUG] AG Helper Start Role info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 365 | 366 | role, err := getRole(db, agName, stdout) 367 | if err == sql.ErrNoRows { 368 | return mssqlocf.OCF_ERR_GENERIC, errors.New("Did not find AG row in sys.availability_groups") 369 | } 370 | if err != nil { 371 | return mssqlocf.OCF_ERR_GENERIC, err 372 | } 373 | 374 | switch role { 375 | case mssqlag.RoleRESOLVING: 376 | if currentMaster == "" { 377 | // There is no master right now. Don't run ALTER AG SET ROLE = SECONDARY because we don't want to wait for recovery. 378 | // Just pretend the replica is healthy so the replica is available for pre-promote. 379 | // 380 | // If this replica gets chosen to be promoted to master, `promote` will run ALTER AG FAILOVER and bring it out of RESOLVING. 381 | // 382 | // If another replica gets chosen to be promoted to master, --current-master will be set, 383 | // so `monitor` will return OCF_NOT_RUNNING and trigger Pacemaker to stop the resource. 384 | // The subsequent `start` will run ALTER AG SET ROLE = SECONDARY and bring it out of RESOLVING. 
385 | } else { 386 | // There is already a master, so run ALTER AG SET ROLE = SECONDARY and wait for DBs to finish recovery 387 | err := setRoleToSecondaryAndWait(db, agName, stdout) 388 | if err != nil { 389 | return mssqlocf.OCF_ERR_GENERIC, err 390 | } 391 | } 392 | 393 | case mssqlag.RolePRIMARY: 394 | // Don't expect to be a primary. Tell Pacemaker so that it stops the resource. 395 | return mssqlocf.OCF_RUNNING_MASTER, nil 396 | 397 | default: 398 | // Do nothing 399 | } 400 | 401 | sequenceNumber, err := mssqlag.GetSequenceNumber(db, agName) 402 | if err != nil { 403 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query sequence number: %s", mssqlcommon.FormatErrorString(err)) 404 | } 405 | stdout.Printf("Sequence number is %s", humanReadableSequenceNumber(sequenceNumber)) 406 | 407 | parsedSequenceNumbers, err := parseSequenceNumbers(sequenceNumbers, nil, stdout) 408 | if err != nil { 409 | return mssqlocf.OCF_ERR_GENERIC, err 410 | } 411 | 412 | maxSequenceNumber := parsedSequenceNumbers.Max 413 | 414 | if sequenceNumber < maxSequenceNumber { 415 | stdout.Printf( 416 | "Replica has sequence number %s but max sequence number is %s, so it cannot be promoted", 417 | humanReadableSequenceNumber(sequenceNumber), humanReadableSequenceNumber(maxSequenceNumber)) 418 | 419 | promotionScoreOut.Println(promotionScoreShouldNotBePromoted) 420 | 421 | promotionScoreOut = nil // Don't let `monitor` set a different promotion score 422 | } 423 | 424 | // Check health to confirm successful startup 425 | return monitor( 426 | db, 427 | agName, 428 | requiredSynchronizedSecondariesToCommit, 429 | currentMaster, 430 | "start", 431 | disablePrimaryOnQuorumTimeoutAfter, 432 | primaryWriteLeaseDuration, 433 | stdout, 434 | promotionScoreOut, 435 | leaseExpiryOut) 436 | } 437 | 438 | // Function: stop 439 | // 440 | // Description: 441 | // Implements the OCF "stop" action by ensuring the AG replica doesn't exist or is in SECONDARY role. 
442 | // 443 | // Returns: 444 | // OCF_SUCCESS: AG replica does not exist, or was successfully set to SECONDARY role (if necessary). 445 | // OCF_ERR_GENERIC: Any error from trying to set the AG replica to SECONDARY role. 446 | // 447 | func stop(db *sql.DB, agName string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) { 448 | instancename, err := mssqlcommon.GetServerInstanceName(db) 449 | if err != nil { 450 | stdout.Printf("[DEBUG] AG Helper Stop Role GetServerInstanceName error: %v", err) 451 | instancename = "" 452 | } 453 | stdout.Printf("[DEBUG] AG Helper Stop Role info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 454 | 455 | role, err := getRole(db, agName, stdout) 456 | if err == sql.ErrNoRows { 457 | return mssqlocf.OCF_SUCCESS, nil 458 | } 459 | if err != nil { 460 | return mssqlocf.OCF_ERR_GENERIC, err 461 | } 462 | 463 | // Only set role to SECONDARY if it's a PRIMARY. 464 | // We don't care to change role if it's RESOLVING. That will be handled by a subsequent `start`, if any. 465 | if role == mssqlag.RolePRIMARY { 466 | err := offlineAndWait(db, agName, stdout) 467 | if err != nil { 468 | return mssqlocf.OCF_ERR_GENERIC, err 469 | } 470 | } 471 | 472 | return mssqlocf.OCF_SUCCESS, nil 473 | } 474 | 475 | // Function: monitor 476 | // 477 | // Description: 478 | // Implements the OCF "monitor" action. 479 | // 480 | // Returns: 481 | // OCF_SUCCESS: AG replica on this instance is in SECONDARY role. 482 | // OCF_RUNNING_MASTER: AG replica on this instance is in PRIMARY role. If DB_FAILOVER is ON for this AG, 483 | // then all databases on this replica are ONLINE. 484 | // OCF_NOT_RUNNING: The AG is not found in sys.availability_groups, or its role is RESOLVING. 485 | // OCF_ERR_GENERIC: One of the above is not true. 
486 | // 487 | func monitor( 488 | db *sql.DB, agName string, 489 | requiredSynchronizedSecondariesToCommit *uint, 490 | currentMaster string, 491 | caller string, 492 | disablePrimaryOnQuorumTimeoutAfter int64, 493 | primaryWriteLeaseDuration int64, 494 | stdout *log.Logger, 495 | promotionScoreOut *log.Logger, 496 | leaseExpiryOut *log.Logger, 497 | ) (mssqlocf.OcfExitCode, error) { 498 | stdout.Printf("Monitor Caller is: %s.", caller) 499 | 500 | instancename, err := mssqlcommon.GetServerInstanceName(db) 501 | if err != nil { 502 | stdout.Printf("[DEBUG] AG Helper Monitor Role GetServerInstanceName error: %v", err) 503 | instancename = "" 504 | } 505 | stdout.Printf("[DEBUG] AG Helper Monitor Role info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 506 | 507 | role, err := getRole(db, agName, stdout) 508 | if err == sql.ErrNoRows { 509 | return mssqlocf.OCF_NOT_RUNNING, nil 510 | } 511 | if err != nil { 512 | return mssqlocf.OCF_ERR_GENERIC, err 513 | } 514 | 515 | // We need to skip monitor logic in primary directly if: 516 | // 1) currentMaster does not have a value yet 517 | // 2) monitor is called from monitor, not start or promote 518 | noMasterFlag := currentMaster == "" || currentMaster == " " 519 | if role == mssqlag.RolePRIMARY && caller == "monitor" && noMasterFlag { 520 | stdout.Printf("Skipping monitor for primary...") 521 | 522 | // These printlns will be used by pengine to decide the following steps after monitor() 523 | // so we need to add them here to avoid unnecessary offline 524 | if leaseExpiryOut == nil { 525 | stdout.Printf("nomaster - Lease Expiry Log is null,return error...") 526 | return mssqlocf.OCF_ERR_GENERIC, err 527 | } 528 | 529 | fmt.Printf("nomaster - timenow: %v \n",time.Now().UTC().Format("20060102150405")) 530 | leaseExpiryOut.Println(time.Now().UTC().Add(time.Duration(primaryWriteLeaseDuration)*time.Second).Format("20060102150405")) 531 | 532 | if promotionScoreOut != nil { 533 | 
promotionScoreOut.Println(promotionScoreCurrentMaster) 534 | } 535 | 536 | return mssqlocf.OCF_RUNNING_MASTER, nil 537 | } 538 | 539 | instancename, err = mssqlcommon.GetServerInstanceName(db) 540 | if err == sql.ErrNoRows { 541 | return mssqlocf.OCF_NOT_RUNNING, nil 542 | } 543 | if err != nil { 544 | return mssqlocf.OCF_ERR_GENERIC, err 545 | } 546 | 547 | if role == mssqlag.RolePRIMARY && !strings.EqualFold(instancename, currentMaster) && caller == "monitor" { 548 | 549 | err := offlineAndWait(db, agName, stdout) 550 | if err != nil { 551 | return mssqlocf.OCF_ERR_GENERIC, err 552 | } 553 | 554 | // We need to manually reset role to resolving if offlineAndWait() does not return any errors. 555 | role = mssqlag.RoleRESOLVING 556 | } 557 | 558 | stdout.Printf("Instance name is %s.", instancename) 559 | stdout.Printf("Current master is %s.", currentMaster) 560 | 561 | if role == mssqlag.RolePRIMARY { 562 | currentConfigurationCommitStartTime, err := mssqlag.GetCurrentConfigurationCommitStartTime(db, agName) 563 | if err != nil { 564 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("could not get last configuration commit time: %s", mssqlcommon.FormatErrorString(err)) 565 | } 566 | 567 | // If there's a current configuration with start time < now() - disablePrimaryOnQuorumTimeoutAfter, then a quorum timeout has occurred 568 | quorumTimeoutOccurred := 569 | currentConfigurationCommitStartTime != nil && 570 | (*currentConfigurationCommitStartTime).Before(time.Now().Add(-time.Duration(disablePrimaryOnQuorumTimeoutAfter)*time.Second)) 571 | if quorumTimeoutOccurred { 572 | stdout.Printf("There is a configuration commit in progress since %s. Not renewing lease.", currentConfigurationCommitStartTime.Local()) 573 | 574 | // We need to offline primay to resolving role if quorum timeout happens. 
575 | err := offlineAndWait(db, agName, stdout) 576 | if err != nil { 577 | return mssqlocf.OCF_ERR_GENERIC, err 578 | } 579 | 580 | } else { 581 | ocfExitCode, err := updateExternalLease(db, agName, primaryWriteLeaseDuration, stdout, leaseExpiryOut) 582 | 583 | if err != nil { 584 | return ocfExitCode, err 585 | } 586 | } 587 | 588 | dbFailoverMode, err := mssqlag.GetDBFailoverMode(db, agName) 589 | if err != nil { 590 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query DB_FAILOVER setting: %s", mssqlcommon.FormatErrorString(err)) 591 | } 592 | 593 | var dbFailoverModeString string 594 | if dbFailoverMode { 595 | dbFailoverModeString = "ON" 596 | } else { 597 | dbFailoverModeString = "OFF" 598 | } 599 | 600 | stdout.Printf("DB_FAILOVER is %s.", dbFailoverModeString) 601 | 602 | // Update REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT if necessary 603 | err = setRequiredSynchronizedSecondariesToCommit(db, agName, requiredSynchronizedSecondariesToCommit, stdout) 604 | if err != nil { 605 | return mssqlocf.OCF_ERR_GENERIC, err 606 | } 607 | 608 | if promotionScoreOut != nil { 609 | promotionScoreOut.Println(promotionScoreCurrentMaster) 610 | } 611 | 612 | return mssqlocf.OCF_RUNNING_MASTER, nil 613 | } 614 | 615 | //We need to return error if currentMaster is not correct. 616 | //It is possible when quorum timeout happens, db replica is already resolving 617 | //But pacemaker resource still thinks current replica is primary. 618 | if strings.EqualFold(instancename, currentMaster) && caller == "monitor" { 619 | stdout.Printf("test - Role is %s.", role) 620 | stdout.Printf("test - Instance name is %s.", instancename) 621 | stdout.Printf("test - Current master is %s.", currentMaster) 622 | return mssqlocf.OCF_ERR_GENERIC, nil 623 | } 624 | 625 | // Ideally we would check if this is a SYNCHRONOUS_COMMIT replica and all DBs are SYNCHRONIZED, otherwise set promotion score 626 | // to `-INFINITY`. 
This doesn't work since if the PRIMARY is down, all DB replicas report themselves as NOT SYNCHRONIZING in 627 | // sys.dm_hadr_database_replica_states even if their copy of the AG configuration indicates they were synchronized before the 628 | // PRIMARY went down. Even if sys.dm_hadr_database_replica_states did tell the truth, it wouldn't know about databases that 629 | // don't even exist on the local replica, ie databases that were never seeded from the PRIMARY to the local replica for any reason. 630 | // 631 | // The FAILOVER DDL has access to this information, which is how it's able to fail with 41142, but the DMVs don't expose it. 632 | 633 | if role == mssqlag.RoleRESOLVING && caller == "monitor" { 634 | // AG is neither PRIMARY nor SECONDARY, which means it's waiting to be explicitly set to one or the other via start / promote. 635 | 636 | if currentMaster == "" { 637 | // There is no master right now. Don't report this as a failure since we don't want the replica to stop and restart. 
638 | } else { 639 | // There is already a master, so run ALTER AG SET ROLE = SECONDARY and wait for DBs to finish recovery 640 | 641 | stdout.Printf("Setting the role to Secondary.") 642 | err := setRoleToSecondaryAndWait(db, agName, stdout) 643 | if err != nil { 644 | return mssqlocf.OCF_ERR_GENERIC, err 645 | } 646 | 647 | if promotionScoreOut != nil { 648 | promotionScoreOut.Println(promotionScoreShouldNotBePromoted) 649 | } 650 | 651 | return mssqlocf.OCF_SUCCESS, nil 652 | } 653 | } 654 | 655 | if promotionScoreOut != nil { 656 | availabilityMode, _, err := mssqlag.GetAvailabilityMode(db, agName) 657 | if err != nil { 658 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query availability mode: %s", mssqlcommon.FormatErrorString(err)) 659 | } 660 | 661 | if availabilityMode == mssqlag.AmSYNCHRONOUS_COMMIT { 662 | promotionScoreOut.Println(promotionScoreCanBePromoted) 663 | } else { 664 | promotionScoreOut.Println(promotionScoreShouldNotBePromoted) 665 | } 666 | } 667 | 668 | return mssqlocf.OCF_SUCCESS, nil 669 | } 670 | 671 | // Function: preStart 672 | // 673 | // Description: 674 | // Invoked to handle pre-start notifications from the OCF "notify" action. 675 | // 676 | // Returns: 677 | // OCF_SUCCESS 678 | // OCF_ERR_GENERIC 679 | // 680 | func preStart( 681 | db *sql.DB, agName string, 682 | requiredSynchronizedSecondariesToCommit *uint, 683 | stdout *log.Logger, 684 | sequenceNumberOut *log.Logger, 685 | ) (mssqlocf.OcfExitCode, error) { 686 | isPrimary, err := isPrimary(db, agName, stdout) 687 | if err != nil { 688 | return mssqlocf.OCF_ERR_GENERIC, err 689 | } 690 | 691 | if isPrimary { 692 | // A replica is going to start. If it's starting because a new replica was added to the AG, then we need to update REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT. 
693 | err := setRequiredSynchronizedSecondariesToCommit(db, agName, requiredSynchronizedSecondariesToCommit, stdout) 694 | if err != nil { 695 | return mssqlocf.OCF_ERR_GENERIC, err 696 | } 697 | } 698 | 699 | // Write out local replica's sequence number to attrd so that the replica being started can set its promotion score 700 | sequenceNumber, err := getSequenceNumberAdjustedForAvailabilityMode(db, agName, stdout) 701 | if err != nil { 702 | return mssqlocf.OCF_ERR_GENERIC, err 703 | } 704 | sequenceNumberOut.Println(sequenceNumber) 705 | 706 | return mssqlocf.OCF_SUCCESS, nil 707 | } 708 | 709 | // Function: postStop 710 | // 711 | // Description: 712 | // Invoked to handle post-stop notifications from the OCF "notify" action. 713 | // 714 | // Returns: 715 | // OCF_SUCCESS 716 | // OCF_ERR_GENERIC 717 | // 718 | func postStop( 719 | db *sql.DB, agName string, 720 | requiredSynchronizedSecondariesToCommit *uint, 721 | stdout *log.Logger) (mssqlocf.OcfExitCode, error) { 722 | 723 | isPrimary, err := isPrimary(db, agName, stdout) 724 | if err != nil { 725 | return mssqlocf.OCF_ERR_GENERIC, err 726 | } 727 | 728 | if isPrimary { 729 | // A replica has stopped. If it stopped because a replica was removed from the AG, then we need to update REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT. 730 | err := setRequiredSynchronizedSecondariesToCommit(db, agName, requiredSynchronizedSecondariesToCommit, stdout) 731 | if err != nil { 732 | return mssqlocf.OCF_ERR_GENERIC, err 733 | } 734 | } 735 | 736 | return mssqlocf.OCF_SUCCESS, nil 737 | } 738 | 739 | // Function: postPromote 740 | // 741 | // Description: 742 | // Invoked to handle post-promote notifications from the OCF "notify" action. 
743 | // 744 | // Returns: 745 | // OCF_SUCCESS 746 | // OCF_ERR_GENERIC 747 | // 748 | func postPromote( 749 | db *sql.DB, agName string, 750 | stdout *log.Logger, sequenceNumberOut *log.Logger) (mssqlocf.OcfExitCode, error) { 751 | 752 | instancename, err := mssqlcommon.GetServerInstanceName(db) 753 | if err != nil { 754 | stdout.Printf("[DEBUG] AG Helper PostPromote Role GetServerInstanceName error: %v", err) 755 | instancename = "" 756 | } 757 | stdout.Printf("[DEBUG] AG Helper PostPromote Role info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 758 | 759 | role, err := getRole(db, agName, stdout) 760 | if err == sql.ErrNoRows { 761 | return mssqlocf.OCF_SUCCESS, nil 762 | } 763 | if err != nil { 764 | return mssqlocf.OCF_ERR_GENERIC, err 765 | } 766 | 767 | // If role is secondary we will bring it offline in post-promote. 768 | if role == mssqlag.RoleSECONDARY { 769 | stdout.Println("Setting secondary to offline") 770 | err := offlineAndWait(db, agName, stdout) 771 | if err != nil { 772 | return mssqlocf.OCF_ERR_GENERIC, err 773 | } 774 | } 775 | return mssqlocf.OCF_SUCCESS, nil 776 | } 777 | 778 | // Function: prePromote 779 | // 780 | // Description: 781 | // Invoked to handle pre-promote notifications from the OCF "notify" action. 782 | // 783 | // Returns: 784 | // OCF_SUCCESS: Sequence number was fetched successfully. 785 | // OCF_ERR_GENERIC: Could not query sequence number of the AG replica. 
786 | // 787 | func prePromote( 788 | db *sql.DB, agName string, 789 | stdout *log.Logger, sequenceNumberOut *log.Logger) (mssqlocf.OcfExitCode, error) { 790 | 791 | sequenceNumber, err := getSequenceNumberAdjustedForAvailabilityMode(db, agName, stdout) 792 | if err != nil { 793 | return mssqlocf.OCF_ERR_GENERIC, err 794 | } 795 | sequenceNumberOut.Println(sequenceNumber) 796 | 797 | return mssqlocf.OCF_SUCCESS, nil 798 | } 799 | 800 | // Function: promote 801 | // 802 | // Description: 803 | // Implements the OCF "promote" action by failing over the AG replica to PRIMARY role. 804 | // 805 | // Returns: 806 | // OCF_SUCCESS: AG replica is already in PRIMARY role or was successfully failed over to PRIMARY role. 807 | // OCF_FAILED_MASTER: AG replica could not be failed over to PRIMARY role and is now in unknown state. 808 | // OCF_ERR_GENERIC: Could not determine initial role of AG replica, or --skip-precheck was not passed and the availability mode is 809 | // ASYNCHRONOUS_COMMIT or could not be successfully retrieved, or the sequence number of the AG replica is lower than the 810 | // sequence number of some other replica. 
811 | // 812 | func promote( 813 | db *sql.DB, agName string, 814 | sequenceNumbers string, 815 | newMaster string, 816 | skipPreCheck bool, 817 | requiredSynchronizedSecondariesToCommit *uint, 818 | disablePrimaryOnQuorumTimeoutAfter int64, 819 | primaryWriteLeaseDuration int64, 820 | stdout *log.Logger, 821 | promotionScoreOut *log.Logger, 822 | leaseExpiryOut *log.Logger, 823 | ) (mssqlocf.OcfExitCode, error) { 824 | 825 | isPrimary, err := isPrimary(db, agName, stdout) 826 | if err != nil { 827 | return mssqlocf.OCF_ERR_GENERIC, err 828 | } 829 | if isPrimary { 830 | return mssqlocf.OCF_SUCCESS, nil 831 | } 832 | 833 | if skipPreCheck { 834 | stdout.Println("Skipping pre-check since --skip-precheck was specified.") 835 | } else { 836 | availabilityMode, availabilityModeDesc, err := mssqlag.GetAvailabilityMode(db, agName) 837 | if err != nil { 838 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query availability mode: %s", mssqlcommon.FormatErrorString(err)) 839 | } 840 | 841 | if availabilityMode != mssqlag.AmSYNCHRONOUS_COMMIT { 842 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf( 843 | "%s (%d) replica cannot be promoted to PRIMARY", 844 | availabilityModeDesc, availabilityMode) 845 | } 846 | } 847 | 848 | parsedSequenceNumbers, err := parseSequenceNumbers(sequenceNumbers, &newMaster, stdout) 849 | if err != nil { 850 | return mssqlocf.OCF_ERR_GENERIC, err 851 | } 852 | 853 | numSequenceNumbers := parsedSequenceNumbers.Count 854 | maxSequenceNumber := parsedSequenceNumbers.Max 855 | newMasterSequenceNumber := parsedSequenceNumbers.NewMaster 856 | 857 | if newMasterSequenceNumber < maxSequenceNumber { 858 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf( 859 | "Replica has sequence number %s but max sequence number is %s, so it cannot be promoted", 860 | humanReadableSequenceNumber(newMasterSequenceNumber), humanReadableSequenceNumber(maxSequenceNumber)) 861 | } 862 | 863 | if newMasterSequenceNumber == 0 { 864 | return mssqlocf.OCF_ERR_GENERIC, 
fmt.Errorf("Replica has sequence number %s, so it cannot be promoted", humanReadableSequenceNumber(newMasterSequenceNumber)) 865 | } 866 | 867 | numSyncCommitAndConfigurationOnlyReplicas, err := mssqlag.GetNumSyncCommitAndConfigurationOnlyReplicas(db, agName) 868 | if err != nil { 869 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query number of SYNCHRONOUS_COMMIT or CONFIGURATION_ONLY replicas: %s", mssqlcommon.FormatErrorString(err)) 870 | } 871 | 872 | stdout.Printf("AG has %d SYNCHRONOUS_COMMIT or CONFIGURATION_ONLY replicas.", numSyncCommitAndConfigurationOnlyReplicas) 873 | 874 | requiredNumSequenceNumbers := mssqlag.CalculateNumRequiredSequenceNumbers(numSyncCommitAndConfigurationOnlyReplicas) 875 | if numSequenceNumbers < requiredNumSequenceNumbers { 876 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf( 877 | "Not enough replicas are online to safely promote this replica: need %d but have %d", 878 | requiredNumSequenceNumbers, numSequenceNumbers) 879 | } 880 | 881 | // We need to renew lease before and after failover so db will not out of sync 882 | // renew lease after failover will be done in monitor() function 883 | ocfExitCode, err := updateExternalLease(db, agName, primaryWriteLeaseDuration, stdout, leaseExpiryOut) 884 | 885 | if err != nil { 886 | return ocfExitCode, err 887 | } 888 | 889 | instancename, err := mssqlcommon.GetServerInstanceName(db) 890 | if err != nil { 891 | stdout.Printf("[DEBUG] AG Helper Promote Role GetServerInstanceName error: %v", err) 892 | instancename = "" 893 | } 894 | stdout.Printf("[DEBUG] Promote info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 895 | 896 | stdout.Println("Promoting replica to PRIMARY role...") 897 | 898 | err = mssqlag.Failover(db, agName) 899 | if err != nil { 900 | if e, ok := err.(mssql.Error); ok && e.Number == mssqlcommon.SQLError_AGCannotFailover_UnsynchronizedDBs { 901 | // Write a shorter error message prefix so that it's readable when truncated by `pcs resource status` 
or `crm_mon` 902 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("One or more DBs are unsynchronized or not joined to the AG: %s", mssqlcommon.FormatErrorString(err)) 903 | } 904 | 905 | return mssqlocf.OCF_FAILED_MASTER, fmt.Errorf("Could not promote replica to PRIMARY role: %s", mssqlcommon.FormatErrorString(err)) 906 | } 907 | 908 | // `FAILOVER` DDL returns before role change finishes, so wait till it completes. 909 | err = waitUntilRoleSatisfies(db, agName, stdout, func(role mssqlag.Role) bool { return role == mssqlag.RolePRIMARY }) 910 | if err != nil { 911 | return mssqlocf.OCF_FAILED_MASTER, fmt.Errorf("Failed while waiting for replica to be in PRIMARY role: %s", mssqlcommon.FormatErrorString(err)) 912 | } 913 | 914 | stdout.Println("Replica is now PRIMARY") 915 | 916 | err = setRequiredSynchronizedSecondariesToCommit(db, agName, requiredSynchronizedSecondariesToCommit, stdout) 917 | if err != nil { 918 | return mssqlocf.OCF_ERR_GENERIC, err 919 | } 920 | 921 | // Wait for databases to be healthy before considering `promote` complete 922 | ocfExitCode, err = 923 | monitor( 924 | db, 925 | agName, 926 | requiredSynchronizedSecondariesToCommit, 927 | newMaster, 928 | "promote", 929 | disablePrimaryOnQuorumTimeoutAfter, 930 | primaryWriteLeaseDuration, 931 | stdout, 932 | promotionScoreOut, 933 | leaseExpiryOut) 934 | if err != nil { 935 | return ocfExitCode, err 936 | } 937 | 938 | if ocfExitCode == mssqlocf.OCF_RUNNING_MASTER { 939 | // `promote` should return OCF_SUCCESS since Pacemaker treats OCF_RUNNING_MASTER as an error 940 | ocfExitCode = mssqlocf.OCF_SUCCESS 941 | } 942 | 943 | return mssqlocf.OCF_SUCCESS, nil 944 | } 945 | 946 | // Function: demote 947 | // 948 | // Description: 949 | // Implements the OCF "demote" action by setting the AG replica to SECONDARY role. 950 | // 951 | // Returns: 952 | // OCF_SUCCESS: AG replica was successfully set to SECONDARY role. 953 | // OCF_ERR_GENERIC: Could not set AG replica to SECONDARY role. 
954 | // 955 | func demote(db *sql.DB, agName string, leaseExpiry string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) { 956 | instancename, err := mssqlcommon.GetServerInstanceName(db) 957 | if err != nil { 958 | stdout.Printf("[DEBUG] AG Helper Demote Role GetServerInstanceName error: %v", err) 959 | instancename = "" 960 | } 961 | stdout.Printf("[DEBUG] AG Helper Demote Role info: AVAILABILITY GROUP %s on instance %s", agName, instancename) 962 | 963 | role, err := getRole(db, agName, stdout) 964 | if err == sql.ErrNoRows { 965 | return mssqlocf.OCF_NOT_RUNNING, nil 966 | } 967 | if err != nil { 968 | return mssqlocf.OCF_ERR_GENERIC, err 969 | } 970 | 971 | if role == mssqlag.RolePRIMARY { 972 | // We don't want to ALTER AG SET ROLE = SECONDARY since that will start recovery of AG DBs, causing subsequent 973 | // ALTER AG FAILOVER DDL to block. Offlining the replica is better. 974 | err := offlineAndWait(db, agName, stdout) 975 | if err != nil { 976 | leaseExpiryTime, err := parseLeaseExpiryTime(leaseExpiry, stdout) 977 | if err != nil { 978 | return mssqlocf.OCF_ERR_GENERIC, err 979 | } 980 | 981 | stdout.Printf("Lease Expiry %s", leaseExpiryTime) 982 | 983 | //Here leaseExpiryTime means the TIME when lease expires 984 | if len(leaseExpiryTime) > 0 { 985 | layout := "20060102150405" 986 | t, err := time.Parse(layout, leaseExpiryTime) 987 | if err != nil { 988 | return mssqlocf.OCF_ERR_GENERIC, err 989 | } 990 | 991 | timenow := time.Now().UTC() 992 | 993 | // Based on doc, we sleep 1 more sec. 
But in real life, sleep 5 more sec is safer 994 | diff := t.Sub(timenow.Add(time.Second * (-5))) 995 | fmt.Printf("Offline of AG didn't succeed, waiting for %v so that the lease expires \n", diff) 996 | 997 | if diff > 0{ 998 | time.Sleep(diff) 999 | } 1000 | 1001 | } 1002 | return mssqlocf.OCF_SUCCESS, nil 1003 | } 1004 | } 1005 | 1006 | return mssqlocf.OCF_SUCCESS, nil 1007 | } 1008 | 1009 | // Function: waitForDatabasesToBeOnline 1010 | // 1011 | // Description: 1012 | // Waits for all databases in the AG to be ONLINE. 1013 | // Periodically prints a message detailing the number of databases that are not ONLINE. 1014 | // 1015 | func waitForDatabasesToBeOnline( 1016 | db *sql.DB, agName string, 1017 | stdout *log.Logger, 1018 | ) error { 1019 | for { 1020 | transientNonOnlineDatabasesMessage, permanentNonOnlineDatabasesMessage, err := mssqlag.GetDatabaseStates(db, agName) 1021 | if err != nil { 1022 | return fmt.Errorf("Failed while waiting for databases to be online: %s", mssqlcommon.FormatErrorString(err)) 1023 | } 1024 | 1025 | if len(permanentNonOnlineDatabasesMessage) > 0 { 1026 | return errors.New(permanentNonOnlineDatabasesMessage) 1027 | } 1028 | 1029 | if len(transientNonOnlineDatabasesMessage) > 0 { 1030 | stdout.Println(transientNonOnlineDatabasesMessage) 1031 | time.Sleep(1 * time.Second) 1032 | continue 1033 | } 1034 | 1035 | // All ready 1036 | stdout.Println("All databases are ONLINE.") 1037 | return nil 1038 | } 1039 | } 1040 | 1041 | func isPrimary(db *sql.DB, agName string, stdout *log.Logger) (bool, error) { 1042 | role, err := getRole(db, agName, stdout) 1043 | if err != nil { 1044 | return false, err 1045 | } 1046 | 1047 | return (role == mssqlag.RolePRIMARY), nil 1048 | } 1049 | 1050 | func getRole(db *sql.DB, agName string, stdout *log.Logger) (mssqlag.Role, error) { 1051 | role, roleDesc, err := mssqlag.GetRole(db, agName) 1052 | if err == sql.ErrNoRows { 1053 | stdout.Println("Did not find AG row in sys.availability_groups") 1054 | 
stdout.Println("Either the AG replica does not exist on the instance, or the SQL user does not have ALTER, CONTROL and VIEW DEFINITION permissions on the AG.") 1055 | return 0, err 1056 | } 1057 | if err != nil { 1058 | return 0, fmt.Errorf("Could not query replica role: %s", mssqlcommon.FormatErrorString(err)) 1059 | } 1060 | 1061 | stdout.Printf("Replica is %s (%d)", roleDesc, role) 1062 | 1063 | return role, nil 1064 | } 1065 | 1066 | func updateExternalLease(db *sql.DB, agName string, primaryWriteLeaseDuration int64, stdout *log.Logger, leaseExpiryOut *log.Logger) (mssqlocf.OcfExitCode, error) { 1067 | stdout.Printf("[DEBUG] Lease is in the process of being renewed for AVAILABILITY GROUP %s for %d seconds", agName, primaryWriteLeaseDuration) 1068 | err := mssqlag.UpdateExternalWriteLease(db, agName, time.Duration(primaryWriteLeaseDuration)*time.Second) 1069 | if err != nil { 1070 | if e, ok := err.(mssql.Error); ok { 1071 | if e.Number == mssqlcommon.SQLError_AGDoesNotAllowExternalLeaseUpdates { 1072 | stdout.Println( 1073 | "WARNING (Ignore this if External Lease is not used) : The AG does not allow updating external write lease. Not updating the external write lease. 
" + 1074 | "Recreate the AG with the WRITE_LEASE_VALIDITY option in the CREATE AVAILABILITY GROUP DDL.") 1075 | } else if e.Number == mssqlcommon.SQLError_AGExternalLeaseUpdate_NewExpiryIsOlderThanCurrentExpiry { 1076 | stdout.Printf("Cannot renew the external write lease to %ds because it's already valid for a longer time.", primaryWriteLeaseDuration) 1077 | } else { 1078 | stdout.Printf("[DEBUG] Update external lease failed with mssql error: %v", err) 1079 | return mssqlocf.OCF_ERR_GENERIC, err 1080 | } 1081 | } else { 1082 | stdout.Printf("[DEBUG] Update external lease failed with non-mssql error: %v", err) 1083 | return mssqlocf.OCF_ERR_GENERIC, err 1084 | } 1085 | } else { 1086 | if leaseExpiryOut == nil { 1087 | stdout.Printf("Lease Expiry Log is null,return error...") 1088 | return mssqlocf.OCF_ERR_GENERIC, err 1089 | } 1090 | 1091 | stdout.Printf("[DEBUG] Lease update success.") 1092 | 1093 | //We may not need to save "start lease time" and "primaryWriteLeaseDuration(Monitor_Interval)" by global variables. 
1094 | //Since we already have "leaseExpiryTime" as time formation in demote() 1095 | fmt.Printf("timenow: %v \n",time.Now().UTC().Format("20060102150405")) 1096 | leaseExpiryOut.Println(time.Now().UTC().Add(time.Duration(primaryWriteLeaseDuration)*time.Second).Format("20060102150405")) 1097 | } 1098 | 1099 | return mssqlocf.OCF_SUCCESS, nil 1100 | } 1101 | 1102 | func getSequenceNumberAdjustedForAvailabilityMode(db *sql.DB, agName string, stdout *log.Logger) (int64, error) { 1103 | availabilityMode, availabilityModeDesc, err := mssqlag.GetAvailabilityMode(db, agName) 1104 | if err != nil { 1105 | return 0, fmt.Errorf("Could not query availability mode: %s", mssqlcommon.FormatErrorString(err)) 1106 | } 1107 | 1108 | stdout.Printf("Replica is %s (%d).", availabilityModeDesc, availabilityMode) 1109 | 1110 | var sequenceNumber int64 1111 | if availabilityMode == mssqlag.AmSYNCHRONOUS_COMMIT || availabilityMode == mssqlag.AmCONFIGURATION_ONLY { 1112 | sequenceNumber, err = mssqlag.GetSequenceNumber(db, agName) 1113 | if err != nil { 1114 | return 0, fmt.Errorf("Could not query sequence number: %s", mssqlcommon.FormatErrorString(err)) 1115 | } 1116 | } else { 1117 | sequenceNumber = 0 1118 | } 1119 | 1120 | stdout.Printf("Sequence number is %s", humanReadableSequenceNumber(sequenceNumber)) 1121 | 1122 | return sequenceNumber, nil 1123 | } 1124 | 1125 | func humanReadableSequenceNumber(sequenceNumber int64) string { 1126 | majorNumber := (sequenceNumber >> 32) & 0xFFFFFFFF 1127 | minorNumber := sequenceNumber & 0xFFFFFFFF 1128 | return fmt.Sprintf("%X:%X (%d)", majorNumber, minorNumber, sequenceNumber) 1129 | } 1130 | 1131 | // Function: offlineAndWait 1132 | // 1133 | // Description: 1134 | // Runs ALTER AG OFFLINE DDL and waits for the role to change to RESOLVING 1135 | // 1136 | func offlineAndWait(db *sql.DB, agName string, stdout *log.Logger) error { 1137 | stdout.Println("Offlining replica...") 1138 | 1139 | err := mssqlag.Offline(db, agName) 1140 | if err != nil { 
1141 | return fmt.Errorf("Could not offline replica: %s", mssqlcommon.FormatErrorString(err)) 1142 | } 1143 | 1144 | // Ensure role is RESOLVING before continuing. 1145 | err = waitUntilRoleSatisfies(db, agName, stdout, func(role mssqlag.Role) bool { return role == mssqlag.RoleRESOLVING }) 1146 | if err != nil { 1147 | return fmt.Errorf("Failed while waiting for replica to be in RESOLVING role: %s", mssqlcommon.FormatErrorString(err)) 1148 | } 1149 | 1150 | return nil 1151 | } 1152 | 1153 | type parsedSequenceNumbers struct { 1154 | Count uint 1155 | Max int64 1156 | NewMaster int64 1157 | } 1158 | 1159 | func parseSequenceNumbers( 1160 | sequenceNumbers string, 1161 | newMaster *string, 1162 | stdout *log.Logger, 1163 | ) (parsedSequenceNumbers, error) { 1164 | stdout.Println("Verifying replica's sequence number vs all sequence numbers...") 1165 | 1166 | var result parsedSequenceNumbers 1167 | 1168 | lineRegex := regexp.MustCompile(`^name="[^"]+" host="([^"]+)" value="(\d+)"$`) 1169 | 1170 | for _, line := range strings.Split(sequenceNumbers, "\n") { 1171 | stdout.Printf("Sequence number line [%s]", line) 1172 | 1173 | match := lineRegex.FindStringSubmatch(line) 1174 | if match == nil { 1175 | stdout.Println("Line does not match expected syntax. 
Ignoring.") 1176 | continue 1177 | } 1178 | 1179 | host := match[1] 1180 | value, err := strconv.ParseInt(match[2], 10, 64) 1181 | if err != nil { 1182 | return result, fmt.Errorf("Could not parse sequence number line: %s", mssqlcommon.FormatErrorString(err)) 1183 | } 1184 | 1185 | if newMaster != nil && host == *newMaster { 1186 | result.NewMaster = value 1187 | } 1188 | 1189 | if value > result.Max { 1190 | result.Max = value 1191 | } 1192 | 1193 | if value > 0 { 1194 | result.Count++ 1195 | } 1196 | } 1197 | 1198 | stdout.Printf("%d sequence numbers were found", result.Count) 1199 | stdout.Printf("Max sequence number is %s", humanReadableSequenceNumber(result.Max)) 1200 | if newMaster != nil { 1201 | stdout.Printf("Sequence number of %s is %s", *newMaster, humanReadableSequenceNumber(result.NewMaster)) 1202 | } 1203 | 1204 | return result, nil 1205 | } 1206 | 1207 | func parseLeaseExpiryTime( 1208 | leaseExpiry string, 1209 | stdout *log.Logger, 1210 | ) (string, error) { 1211 | stdout.Println("Verifying replica's lease expiry time..") 1212 | 1213 | var leaseExpiryTime string 1214 | 1215 | leaseExpiryTime = "" 1216 | 1217 | lineRegex := regexp.MustCompile(`^name="[^"]+" host="([^"]+)" value="(\d+)"$`) 1218 | 1219 | for _, line := range strings.Split(leaseExpiry, "\n") { 1220 | stdout.Printf("Lease Expiry line [%s]", line) 1221 | 1222 | match := lineRegex.FindStringSubmatch(line) 1223 | if match == nil { 1224 | stdout.Println("Line does not match expected syntax. 
Ignoring.") 1225 | continue 1226 | } 1227 | if len(match)>2 { 1228 | leaseExpiryTime = match[2] 1229 | } 1230 | 1231 | } 1232 | 1233 | if len(leaseExpiryTime)<1 { 1234 | return leaseExpiryTime, fmt.Errorf("Lease expiry time not present.") 1235 | } 1236 | 1237 | return leaseExpiryTime, nil 1238 | } 1239 | 1240 | func setRequiredSynchronizedSecondariesToCommit( 1241 | db *sql.DB, agName string, 1242 | override *uint, 1243 | stdout *log.Logger, 1244 | ) error { 1245 | var requiredSynchronizedSecondariesToCommit uint 1246 | 1247 | if override == nil { 1248 | numSyncCommitReplicas, err := mssqlag.GetNumSyncCommitReplicas(db, agName) 1249 | if err != nil { 1250 | return fmt.Errorf("Could not query number of SYNCHRONOUS_COMMIT replicas: %s", mssqlcommon.FormatErrorString(err)) 1251 | } 1252 | 1253 | stdout.Printf("AG has %d SYNCHRONOUS_COMMIT replicas.", numSyncCommitReplicas) 1254 | 1255 | requiredSynchronizedSecondariesToCommit = mssqlag.CalculateRequiredSynchronizedSecondariesToCommit(numSyncCommitReplicas) 1256 | } else { 1257 | requiredSynchronizedSecondariesToCommit = *override 1258 | } 1259 | 1260 | stdout.Printf("Setting REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT to %d...", requiredSynchronizedSecondariesToCommit) 1261 | 1262 | err := mssqlag.SetRequiredSynchronizedSecondariesToCommit(db, agName, int32(requiredSynchronizedSecondariesToCommit)) 1263 | if err != nil { 1264 | return fmt.Errorf("Could not set REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT: %s", mssqlcommon.FormatErrorString(err)) 1265 | } 1266 | 1267 | return nil 1268 | } 1269 | 1270 | // Function: setRoleToSecondaryAndWait 1271 | // 1272 | // Description: 1273 | // Runs ALTER AG SET ROLE = SECONDARY DDL and waits for the role to change to SECONDARY 1274 | // 1275 | func setRoleToSecondaryAndWait(db *sql.DB, agName string, stdout *log.Logger) error { 1276 | stdout.Println("Setting replica to SECONDARY role...") 1277 | 1278 | err := mssqlag.SetRoleToSecondary(db, agName) 1279 | 1280 | type 
ErrorWithNumber interface { 1281 | SQLErrorNumber() int32 1282 | } 1283 | 1284 | if err != nil { 1285 | if errorWithNumber, ok := err.(ErrorWithNumber); ok { 1286 | if errorWithNumber.SQLErrorNumber() == 41104 { 1287 | stdout.Println("Could not set replica to SECONDARY role. Failover Failed.") 1288 | return nil 1289 | } 1290 | } 1291 | 1292 | return fmt.Errorf("Could not set replica to SECONDARY role: %s", mssqlcommon.FormatErrorString(err)) 1293 | } 1294 | 1295 | // `SET (ROLE = SECONDARY)` DDL returns before role change finishes, so wait till it completes. 1296 | err = waitUntilRoleSatisfies(db, agName, stdout, func(role mssqlag.Role) bool { return role == mssqlag.RoleSECONDARY }) 1297 | if err != nil { 1298 | return fmt.Errorf("Failed while waiting for replica to be in SECONDARY role: %s", mssqlcommon.FormatErrorString(err)) 1299 | } 1300 | 1301 | return nil 1302 | } 1303 | 1304 | func waitUntilRoleSatisfies(db *sql.DB, agName string, stdout *log.Logger, predicate func(mssqlag.Role) bool) error { 1305 | for { 1306 | role, err := getRole(db, agName, stdout) 1307 | if err != nil { 1308 | return err 1309 | } 1310 | 1311 | if predicate(role) { 1312 | return nil 1313 | } 1314 | 1315 | time.Sleep(1 * time.Second) 1316 | } 1317 | } 1318 | --------------------------------------------------------------------------------