├── fci
├── docs
│ ├── fci_usage
│ └── fci_metadata
├── test
│ └── fci_test
└── fci
├── ag
├── docs
│ ├── ag_usage
│ └── ag_metadata
└── ag
├── LICENSE
├── go
└── src
│ ├── mssqlcommon
│ ├── ocf
│ │ ├── lib_test.go
│ │ └── lib.go
│ ├── lib_test.go
│ ├── ag
│ │ └── lib.go
│ └── lib.go
│ ├── fci-helper
│ └── main.go
│ └── ag-helper
│ └── main.go
├── SECURITY.md
└── README.md
/fci/docs/fci_usage:
--------------------------------------------------------------------------------
1 | usage: $0 {start|stop|monitor|validate-all|meta-data}
2 |
3 | Expects to have a fully populated OCF RA-compliant environment set.
4 |
--------------------------------------------------------------------------------
/ag/docs/ag_usage:
--------------------------------------------------------------------------------
1 | usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data}
2 |
3 | Expects to have a fully populated OCF RA-compliant environment set.
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2017 Microsoft Corporation
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/go/src/mssqlcommon/ocf/lib_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) Microsoft Corporation.
2 |
3 | package ocf
4 |
5 | import (
6 | "os"
7 | "testing"
8 | )
9 |
10 | func TestImportOcfExitCodes(t *testing.T) {
11 | // Note the values here are intentionally not the real values of these env vars.
12 | // In particular OCF_SUCCESS is not set to its real value of 0 to be able to check if it's correctly initialized
13 | var requiredEnvironmentVariables = map[string]string{
14 | "OCF_SUCCESS": "1",
15 | "OCF_ERR_ARGS": "2",
16 | "OCF_ERR_CONFIGURED": "3",
17 | "OCF_ERR_GENERIC": "4",
18 | "OCF_ERR_PERM": "5",
19 | "OCF_ERR_UNIMPLEMENTED": "6",
20 | "OCF_FAILED_MASTER": "7",
21 | "OCF_NOT_RUNNING": "8",
22 | "OCF_RUNNING_MASTER": "9",
23 | }
24 |
25 | // All vars should be 0 initially
26 | if OCF_SUCCESS != 0 {
27 | t.Fatal("OCF_SUCCESS is not 0. Test is not starting from clean slate.")
28 | }
29 |
30 | for key, value := range requiredEnvironmentVariables {
31 | os.Setenv(key, value)
32 | }
33 |
34 | // All vars set to valid values
35 | err := ImportOcfExitCodes()
36 | if err != nil {
37 | t.Fatalf("Expected ImportOcfExitCodes to succeed but it failed: %s", err)
38 | }
39 |
40 | // One var not set
41 | os.Unsetenv("OCF_SUCCESS")
42 | err = ImportOcfExitCodes()
43 | if err == nil {
44 | t.Fatal("Expected ImportOcfExitCodes to fail but it succeeded")
45 | }
46 | if err.Error() != "OCF_SUCCESS is set to an invalid value []" {
47 | t.Fatalf("ImportOcfExitCodes did not fail with an error about OCF_SUCCESS being unset: %s", err.Error())
48 | }
49 |
50 | // One var set to invalid value
51 | os.Setenv("OCF_SUCCESS", "A")
52 | err = ImportOcfExitCodes()
53 | if err == nil {
54 | t.Fatal("Expected ImportOcfExitCodes to fail but it succeeded")
55 | }
56 | if err.Error() != "OCF_SUCCESS is set to an invalid value [A]" {
57 | t.Fatalf("ImportOcfExitCodes did not fail with an error about OCF_SUCCESS being set to A: %s", err.Error())
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/go/src/mssqlcommon/lib_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) Microsoft Corporation.
2 |
3 | package mssqlcommon
4 |
5 | import (
6 | "fmt"
7 | "testing"
8 | )
9 |
10 | func TestDiagnose(t *testing.T) {
11 | t.Parallel()
12 |
13 | for _, system := range []bool{true, false} {
14 | for _, resource := range []bool{true, false} {
15 | for _, queryProcessing := range []bool{true, false} {
16 | // Local copies of loop variables for the closure to capture
17 | system := system
18 | resource := resource
19 | queryProcessing := queryProcessing
20 |
21 | t.Run(fmt.Sprintf("system = %t, resource = %t, queryProcessing = %t", system, resource, queryProcessing), func(t *testing.T) {
22 | t.Parallel()
23 |
24 | diagnostics := Diagnostics{System: system, Resource: resource, QueryProcessing: queryProcessing}
25 | err := Diagnose(diagnostics)
26 |
27 | if system && resource && queryProcessing {
28 | if err != nil {
29 | t.Fatalf("Expected Diagnose to succeed but it failed: %s", err)
30 | }
31 | } else {
32 | if err == nil {
33 | t.Fatal("Expected Diagnose to fail but it succeeded")
34 | }
35 |
36 | switch serverUnhealthyError := err.(type) {
37 | case *ServerUnhealthyError:
38 | if !system {
39 | if serverUnhealthyError.RawValue != ServerCriticalError {
40 | t.Fatalf("Diagnose did not fail with ServerCriticalError: %d", serverUnhealthyError.RawValue)
41 | }
42 |
43 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates system error" {
44 | t.Fatalf("Diagnose did not fail with an error about system error: %s", serverUnhealthyError.Inner.Error())
45 | }
46 | } else if !resource {
47 | if serverUnhealthyError.RawValue != ServerModerateError {
48 | t.Fatalf("Diagnose did not fail with ServerModerateError: %d", serverUnhealthyError.RawValue)
49 | }
50 |
51 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates resource error" {
52 | t.Fatalf("Diagnose did not fail with an error about resource error: %s", serverUnhealthyError.Inner.Error())
53 | }
54 | } else if !queryProcessing {
55 | if serverUnhealthyError.RawValue != ServerAnyQualifiedError {
56 | t.Fatalf("Diagnose did not fail with ServerAnyQualifiedError: %d", serverUnhealthyError.RawValue)
57 | }
58 |
59 | if serverUnhealthyError.Inner.Error() != "sp_server_diagnostics result indicates query processing error" {
60 | t.Fatalf("Diagnose did not fail with an error about query processing error: %s", serverUnhealthyError.Inner.Error())
61 | }
62 | } else {
63 | t.Fatal("Unreachable")
64 | }
65 |
66 | default:
67 | t.Fatal("Diagnose did not return an error of type ServerUnhealthyError")
68 | }
69 | }
70 | })
71 | }
72 | }
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains the source code of the Pacemaker resource agents that ship in the mssql-server-ha package.
2 |
3 | This is a snapshot of our SQL Server-internal repository where the actual development takes place. As the commit histories are completely different, we cannot currently accept pull requests to this repository. This snapshot is provided so that users can see the source of the agents, make changes to suit any specific scenarios they have, write agents for other clustering systems following the same protocol, and so on. We intend to migrate development to this repository at a future date.
4 |
5 | We're happy to receive bug reports, suggestions and feedback for the code in this repository as GitHub issues.
6 |
7 | Kubernetes agents for monitoring SQL Server instances and Availability Groups are also coming, and will be added to this repository at a future date.
8 |
9 |
10 | # Availability Group resource agent `ocf:mssql:ag`
11 |
12 | This is made up of a golang binary `go/src/ag-helper` and a shell script `ag/ag`. `ag-helper` can be built by running `GOPATH=$PWD/go go install ag-helper`
13 |
14 | The agent can be installed by moving the files to these locations:
15 |
16 | - `ag/ag` to `/usr/lib/ocf/resource.d/mssql/ag`
17 | - `ag/docs/*` to `/usr/lib/ocf/lib/mssql/*`
18 | - `go/bin/ag-helper` to `/usr/lib/ocf/lib/mssql/ag-helper`
19 |
20 | The shell script is the entry point for the resource agent and delegates to the helper binary for most tasks. The helper binary monitors the instance health by running `sp_server_diagnostics` and the AG health by querying `sys.databases`. It also implements the promote and demote actions by running the `ALTER AVAILABILITY GROUP FAILOVER` and `ALTER AVAILABILITY GROUP SET (ROLE = SECONDARY)` DDLs.
21 |
22 |
23 | Major changes since SQL2017:
24 |
25 | - Provide hostname support for ag and fci
26 | - Install External Lease in Pacemaker
27 | - Introduce external write lease handling to Pacemaker AG resource agent
28 | - Do not wait for databases to come online during failover
29 | - Bring secondaries offline in post promote
30 | - Various Pacemaker AG agent fixes for more reliable failovers
31 |
32 | # Failover Cluster Instance resource agent `ocf:mssql:fci`
33 |
34 | This is made up of a golang binary `go/src/fci-helper` and a shell script `fci/fci`. `fci-helper` can be built by running `GOPATH=$PWD/go go install fci-helper`
35 |
36 | The agent can be installed by moving the files to these locations:
37 |
38 | - `fci/fci` to `/usr/lib/ocf/resource.d/mssql/fci`
39 | - `fci/docs/*` to `/usr/lib/ocf/lib/mssql/*`
40 | - `go/bin/fci-helper` to `/usr/lib/ocf/lib/mssql/fci-helper`
41 |
42 | The shell script is the entry point for the resource agent and handles starting and stopping the `sqlservr` process. The script invokes the `fci-helper` binary to fix up the server name after starting the resource (if necessary), and to monitor the instance health by running `sp_server_diagnostics`.
43 |
44 |
45 | Major changes since SQL2017:
46 |
47 | - Ensure ag-helper and fci-helper exit if the resource agent process is killed
48 | - ag-helper will reattempt the connection if it times out during the monitor action
49 |
50 |
51 | # License
52 |
53 | MIT
54 |
55 |
56 | # Contributing
57 |
58 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
59 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
60 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
61 |
--------------------------------------------------------------------------------
/go/src/mssqlcommon/ocf/lib.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) Microsoft Corporation.
2 |
3 | // Package ocf contains items related to SQL Server on Pacemaker.
4 | package ocf
5 |
6 | import (
7 | "fmt"
8 | "log"
9 | "os"
10 | "strconv"
11 | "strings"
12 | "syscall"
13 | )
14 |
// OcfExitCode is the integer type of the OCF_* exit code variables.
type OcfExitCode int

// The OCF exit codes. Each one holds the zero value until ImportOcfExitCodes stores the value of
// the environment variable with the same name in it.
var (
	// Error codes
	OCF_ERR_ARGS,
	OCF_ERR_CONFIGURED,
	OCF_ERR_GENERIC,
	OCF_ERR_PERM,
	OCF_ERR_UNIMPLEMENTED OcfExitCode

	// Remaining codes
	OCF_FAILED_MASTER,
	OCF_NOT_RUNNING,
	OCF_RUNNING_MASTER,
	OCF_SUCCESS OcfExitCode
)
28 |
29 | // --------------------------------------------------------------------------------------
30 | // Function: ImportOcfExitCodes
31 | //
32 | // Description:
33 | // Imports the OCF exit codes from corresponding environment variables.
34 | //
35 | func ImportOcfExitCodes() error {
36 | var err error
37 |
38 | OCF_ERR_CONFIGURED, err = importOcfExitCode("OCF_ERR_CONFIGURED")
39 | if err != nil {
40 | return err
41 | }
42 |
43 | OCF_ERR_GENERIC, err = importOcfExitCode("OCF_ERR_GENERIC")
44 | if err != nil {
45 | return err
46 | }
47 |
48 | OCF_ERR_ARGS, err = importOcfExitCode("OCF_ERR_ARGS")
49 | if err != nil {
50 | return err
51 | }
52 |
53 | OCF_ERR_PERM, err = importOcfExitCode("OCF_ERR_PERM")
54 | if err != nil {
55 | return err
56 | }
57 |
58 | OCF_ERR_UNIMPLEMENTED, err = importOcfExitCode("OCF_ERR_UNIMPLEMENTED")
59 | if err != nil {
60 | return err
61 | }
62 |
63 | OCF_FAILED_MASTER, err = importOcfExitCode("OCF_FAILED_MASTER")
64 | if err != nil {
65 | return err
66 | }
67 |
68 | OCF_NOT_RUNNING, err = importOcfExitCode("OCF_NOT_RUNNING")
69 | if err != nil {
70 | return err
71 | }
72 |
73 | OCF_RUNNING_MASTER, err = importOcfExitCode("OCF_RUNNING_MASTER")
74 | if err != nil {
75 | return err
76 | }
77 |
78 | OCF_SUCCESS, err = importOcfExitCode("OCF_SUCCESS")
79 | if err != nil {
80 | return err
81 | }
82 |
83 | return nil
84 | }
85 |
86 | func importOcfExitCode(name string) (OcfExitCode, error) {
87 | stringValue := os.Getenv(name)
88 | intValue, err := strconv.Atoi(stringValue)
89 | if err != nil {
90 | return 0, fmt.Errorf("%s is set to an invalid value [%s]", name, stringValue)
91 | }
92 |
93 | return OcfExitCode(intValue), nil
94 | }
95 |
// Exit logs err, if it is not nil, to logger and then terminates the process with exitCode.
//
// The process is terminated with os.Exit, so this function never actually returns. It is declared
// with an error result so that it can be used in a return statement, as OcfExit does.
func Exit(logger *log.Logger, exitCode int, err error) error {
	if err != nil {
		// Log line by line so that every line of a multi-line error gets the logger prefix
		lines := strings.Split(err.Error(), "\n")
		for i := range lines {
			logger.Println(lines[i])
		}
	}

	os.Exit(exitCode)

	// Not reached; only here to satisfy the compiler
	return nil
}
113 |
// KillCurrentProcessWhenParentExits uses prctl to request that the current process receive a SIGKILL if its parent process dies.
// This ensures that the helper processes spawned by the resource agent shell script get cleaned up if Pacemaker kills the
// resource agent's shell process (due to op timeout, etc.)
//
// The call goes through syscall.Syscall instead of unix.Prctl since golang.org/x/sys/unix is not available here.
//
// Returns nil on success, or an error carrying the errno reported for the prctl call.
func KillCurrentProcessWhenParentExits() error {
	const (
		option = uintptr(syscall.PR_SET_PDEATHSIG)
		signal = uintptr(syscall.SIGKILL)
	)

	if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, option, signal, 0); errno != 0 {
		return fmt.Errorf("prctl failed with errno %d", errno)
	}

	return nil
}
127 |
128 | // Function: OcfExit
129 | //
130 | // Description:
131 | // Helper to exit with the given OCF exit code and error.
132 | //
133 | // To distinguish OCF exit codes from other exit codes (like 1 for panics),
134 | // the actual exit code is 10 + the OCF exit code.
135 | //
136 | func OcfExit(logger *log.Logger, ocfExitCode OcfExitCode, err error) error {
137 | return Exit(logger, int(ocfExitCode)+10, err)
138 | }
139 |
--------------------------------------------------------------------------------
/fci/docs/fci_metadata:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 1.0
5 |
6 | Stateless resource agent for a SQL Server Failover Cluster Instance.
7 |
8 | Failover Cluster Instance resource agent.
9 |
10 |
11 |
12 | User account under which SQL Server will run.
13 |
14 | SQL Server's user account.
15 |
16 |
17 |
18 | Path to the SQL Server binary.
19 | SQL Server binary
20 |
21 |
22 |
23 | Command line arguments for SQL Server.
24 | SQL Server arguments
25 |
26 |
27 |
28 | Working directory for SQL Server.
29 | Working directory
30 |
31 |
32 |
33 | Status file location. The status is used to determine whether SQL Server crashed or was properly stopped by us.
34 | Status file
35 |
36 |
37 |
38 |
39 | Monitoring policy options are:
40 |
41 | 1) SERVER_DOWN: only restart or failover if SQL Server is down (the process is not running)
42 | 2) SERVER_UNRESPONSIVE: restart or failover if SQL Server is unresponsive (unable to establish a connection)
43 | 3) SERVER_CRITICAL_ERROR: restart or failover when sp_server_diagnostics detects a critical system error
44 | 4) SERVER_MODERATE_ERROR: restart or failover when sp_server_diagnostics detects a critical system or resource error
45 | 5) SERVER_ANY_QUALIFIED_ERROR: restart or failover when sp_server_diagnostics detects any qualified error
46 | Monitoring policy
47 |
48 |
49 |
50 | Login and query execution timeout for monitoring in seconds.
51 | Login and query execution timeout for monitoring.
52 |
53 |
54 |
55 | This parameter is unused and only kept for backward-compatibility. Set monitor_timeout instead.
56 | Unused.
57 |
58 |
59 |
60 |
61 | Timeout for stopping SQL Server. The resource agent will first attempt to kill SQL Server politely (using a TERM signal). SQL Server will then checkpoint all databases and exit. If the SQL Server process has not exited by the timeout then the resource agent will attempt to forcibly kill it by sending SIGKILL.
62 |
63 | Timeout for stopping the SQL Server process.
64 |
65 |
66 |
67 |
68 | Credentials for a SQL Server user the resource agent will log in as to execute a stored procedure to monitor instance health. This file should contain two lines, the first with the username and second with the password.
69 |
70 | Location of file containing a SQL Server user credential the resource agent can use.
71 |
72 |
73 |
74 | Port SQL Server listens on.
75 | Port
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/go/src/fci-helper/main.go:
--------------------------------------------------------------------------------
1 | // Copyright (C) Microsoft Corporation.
2 |
3 | package main
4 |
5 | import (
6 | "database/sql"
7 | "errors"
8 | "flag"
9 | "fmt"
10 | "log"
11 | "os"
12 | "strings"
13 | "time"
14 |
15 | _ "github.com/denisenkom/go-mssqldb"
16 |
17 | "mssqlcommon"
18 | mssqlocf "mssqlcommon/ocf"
19 | )
20 |
21 | /*
22 | Program to be called from the mssql:fci resource agent to monitor SQL Server health.
23 |
24 | Determines the health of the specified SQL Server instance based on
25 | 1) whether a connection can be established to the instance, and
26 | 2) the results of the 'sp_server_diagnostics' stored procedure
27 | */
28 |
29 | func main() {
30 | stdout := log.New(os.Stdout, "", log.LstdFlags)
31 | stderr := log.New(os.Stderr, "ERROR: ", log.LstdFlags)
32 |
33 | err := mssqlocf.KillCurrentProcessWhenParentExits()
34 | if err != nil {
35 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err))
36 | }
37 |
38 | err = doMain(stdout, stderr)
39 | if err != nil {
40 | mssqlocf.Exit(stderr, 1, fmt.Errorf("Unexpected error: %s", err))
41 | }
42 | }
43 |
44 | func doMain(stdout *log.Logger, stderr *log.Logger) error {
45 | var (
46 | hostname string
47 | sqlPort uint64
48 | credentialsFile string
49 | applicationName string
50 | rawConnectionTimeout int64
51 | rawHealthThreshold uint
52 | rawMonitorTimeout int64
53 |
54 | action string
55 |
56 | virtualServerName string
57 | )
58 |
59 | flag.StringVar(&hostname, "hostname", "localhost", "The hostname of the SQL Server instance to connect to. Default: localhost")
60 | flag.Uint64Var(&sqlPort, "port", 0, "The port on which the instance is listening for logins.")
61 | flag.StringVar(&credentialsFile, "credentials-file", "", "The path to the credentials file.")
62 | flag.StringVar(&applicationName, "application-name", "", "The application name to use for the T-SQL connection.")
63 | flag.Int64Var(&rawConnectionTimeout, "connection-timeout", 30, "The connection timeout in seconds. "+
64 | "The application will retry connecting to the instance until this time elapses. Default: 30")
65 | flag.UintVar(&rawHealthThreshold, "health-threshold", uint(mssqlcommon.ServerCriticalError), "The instance health threshold. Default: 3 (SERVER_CRITICAL_ERROR)")
66 |
67 | flag.StringVar(&action, "action", "", `One of --start, --monitor
68 | start: Start the replica on this node.
69 | monitor: Monitor the replica on this node.`)
70 |
71 | flag.StringVar(&virtualServerName, "virtual-server-name", "", "The virtual server name that should be set on the SQL Server instance.")
72 | flag.Int64Var(&rawMonitorTimeout, "monitor-interval-timeout", 0, "The monitor interval timeout in seconds. "+
73 | "For FCI this is expected to be always Default: 0")
74 |
75 | flag.Parse()
76 |
77 | stdout.Printf(
78 | "fci-helper invoked with hostname [%s]; port [%d]; credentials-file [%s]; application-name [%s]; connection-timeout [%d]; health-threshold [%d]; action [%s]\n",
79 | hostname, sqlPort,
80 | credentialsFile,
81 | applicationName,
82 | rawConnectionTimeout, rawHealthThreshold,
83 | action)
84 |
85 | switch action {
86 | case "start":
87 | stdout.Printf(
88 | "fci-helper invoked with virtual-server-name [%s]\n",
89 | virtualServerName)
90 |
91 | case "monitor":
92 | stdout.Printf(
93 | "fci-helper invoked with virtual-server-name [%s]\n",
94 | virtualServerName)
95 | }
96 |
97 | if hostname == "" {
98 | return errors.New("a valid hostname must be specified using --hostname")
99 | }
100 |
101 | if sqlPort == 0 {
102 | return errors.New("a valid port number must be specified using --port")
103 | }
104 |
105 | if credentialsFile == "" {
106 | return errors.New("a valid path to a credentials file must be specified using --credentials-file")
107 | }
108 |
109 | if applicationName == "" {
110 | return errors.New("a valid application name must be specified using --application-name")
111 | }
112 |
113 | if action == "" {
114 | return errors.New("a valid action must be specified using --action")
115 | }
116 |
117 | if action == "start" || action == "monitor" {
118 | if virtualServerName == "" {
119 | return errors.New("a valid virtual server name must be specified using --virtual-server-name")
120 | }
121 | }
122 |
123 | err := mssqlocf.ImportOcfExitCodes()
124 | if err != nil {
125 | return err
126 | }
127 |
128 | connectionTimeout := time.Duration(rawConnectionTimeout) * time.Second
129 | monitorTimeout := time.Duration(rawMonitorTimeout) * time.Second
130 | healthThreshold := mssqlcommon.ServerHealth(rawHealthThreshold)
131 |
132 | sqlUsername, sqlPassword, err := mssqlcommon.ReadCredentialsFile(credentialsFile)
133 | if err != nil {
134 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_ARGS, fmt.Errorf("Could not read credentials file: %s", err))
135 | }
136 |
137 | db, err := mssqlcommon.OpenDBWithHealthCheck(
138 | hostname, sqlPort,
139 | sqlUsername, sqlPassword,
140 | applicationName,
141 | connectionTimeout, connectionTimeout,
142 | monitorTimeout,
143 | stdout)
144 | if err != nil {
145 | switch serverUnhealthyError := err.(type) {
146 | case *mssqlcommon.ServerUnhealthyError:
147 | if serverUnhealthyError.RawValue <= healthThreshold {
148 | return mssqlocf.OcfExit(stderr, mssqlocf.OCF_ERR_GENERIC, fmt.Errorf(
149 | "Instance health status %d is at or below the threshold value of %d",
150 | serverUnhealthyError.RawValue, healthThreshold))
151 | }
152 |
153 | stdout.Printf("Instance health status %d is greater than the threshold value of %d\n", serverUnhealthyError.RawValue, healthThreshold)
154 |
155 | default:
156 | return err
157 | }
158 | }
159 | defer db.Close()
160 |
161 | var ocfExitCode mssqlocf.OcfExitCode
162 |
163 | switch action {
164 | case "start":
165 | ocfExitCode, err = start(db, virtualServerName, stdout)
166 |
167 | case "monitor":
168 | ocfExitCode, err = monitor(db, virtualServerName, stdout)
169 |
170 | default:
171 | return fmt.Errorf("unknown value for --action %s", action)
172 | }
173 |
174 | return mssqlocf.OcfExit(stderr, ocfExitCode, err)
175 | }
176 |
177 | // Function: start
178 | //
179 | // Description:
180 | // Implements the OCF "start" action
181 | //
182 | func start(db *sql.DB, virtualServerName string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) {
183 | stdout.Printf("Setting local server name to %s...\n", virtualServerName)
184 |
185 | err := mssqlcommon.SetLocalServerName(db, virtualServerName)
186 | if err != nil {
187 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not set local server name: %s", err)
188 | }
189 |
190 | return monitor(db, virtualServerName, stdout)
191 | }
192 |
193 | // Function: monitor
194 | //
195 | // Description:
196 | // Implements the OCF "monitor" action
197 | //
198 | func monitor(db *sql.DB, virtualServerName string, stdout *log.Logger) (mssqlocf.OcfExitCode, error) {
199 | stdout.Println("Querying local server name...")
200 |
201 | currentServerName, err := mssqlcommon.GetLocalServerName(db)
202 | if err != nil {
203 | return mssqlocf.OCF_ERR_GENERIC, fmt.Errorf("Could not query local server name: %s", err)
204 | }
205 |
206 | stdout.Printf("Local server name is %s\n", currentServerName)
207 |
208 | if !strings.EqualFold(currentServerName, virtualServerName) {
209 | return mssqlocf.OCF_ERR_ARGS, fmt.Errorf("Expected local server name to be %s but it was %s", virtualServerName, currentServerName)
210 | }
211 |
212 | return mssqlocf.OCF_SUCCESS, nil
213 | }
214 |
--------------------------------------------------------------------------------
/ag/docs/ag_metadata:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 1.0
5 |
6 | Stateful resource agent for a SQL Server Availability Group.
7 |
8 | Availability Group resource agent.
9 |
10 |
11 |
12 | The name of the Availability Group that this resource will represent.
13 |
14 | Name of the AG.
15 |
16 |
17 |
18 |
19 | Login and query execution timeout in seconds. Default: 30
20 |
21 | The value of this parameter should be higher than the longest time it takes for any database in the AG to complete recovery.
22 | For example, if a database in the AG may take up to about 2 minutes to recover, this parameter should be set to 120 or higher.
23 |
24 | Login and query execution timeout.
25 |
26 |
27 |
28 |
29 | If the primary is unable to commit AG configuration updates with a sufficient number of other SYNCHRONOUS_COMMIT and CONFIGURATION_ONLY replicas in the AG, the agent will disable the primary by not renewing the primary's write lease. If the time in seconds specified by this value has passed since the primary was unable to commit a configuration update, the agent will stop renewing the primary's write lease, causing all pending transactions to fail and the DB to become inaccessible. Default: 60
30 |
31 | How long the primary can block on committing a configuration update before the agent stops renewing its write lease.
32 |
33 |
34 |
35 | This parameter is unused and only kept for backward-compatibility. Set connection_timeout instead.
36 | Unused.
37 |
38 |
39 |
40 |
41 | Monitoring policy options are:
42 |
43 | 1) SERVER_UNRESPONSIVE_OR_DOWN: Fail if the SQL Server instance is unresponsive (unable to establish a connection) or down (the process is not running)
44 | 3) SERVER_CRITICAL_ERROR: Fail if sp_server_diagnostics detects a critical system error
45 | 4) SERVER_MODERATE_ERROR: Fail if sp_server_diagnostics detects a critical system or resource error
46 | 5) SERVER_ANY_QUALIFIED_ERROR: Fail if sp_server_diagnostics detects any qualified error
47 |
48 | Monitoring policy
49 |
50 |
51 |
52 | This parameter is deprecated. Set connection_timeout instead.
53 | Deprecated.
54 |
55 |
56 |
57 |
58 | Path to a file containing the credentials for a SQL Server user. The resource agent will login using these credentials to perform actions against the instance.
59 |
60 | This file should contain two lines separated by LF. The first line should have the username, and the second line should have the password.
61 |
62 | Path to a file containing the credentials for a SQL Server user.
63 |
64 |
65 |
66 |
67 | This parameter is unused. Set the timeouts of the start, monitor and promote actions instead.
68 |
69 | This parameter used to control how long the resource agent waited for all databases of an AG to be ONLINE on a primary replica with DB_FAILOVER = ON.
70 |
71 | The resource agent has been changed to wait indefinitely, until the corresponding action (start / monitor / promote) times out, so this parameter is no longer used.
72 |
73 | Unused.
74 |
75 |
76 |
77 | The TSQL port that the SQL Server instance listens on.
78 | TSQL port
79 |
80 |
81 |
82 |
83 | If set, the agent will renew the primary's AG write lease to this value in seconds. Otherwise the agent will set it based on the monitor action's interval and timeout.
84 |
85 | It is recommended to set this value greater than the sum of the monitor action's interval and timeout.
86 |
87 | Primary write lease duration.
88 |
89 |
90 |
91 | The name of the SQL Server process. Default: sqlservr
92 | The name of the SQL Server process.
93 |
94 |
95 |
96 | This parameter is deprecated. Set required_synchronized_secondaries_to_commit instead.
97 | Deprecated.
98 |
99 |
100 |
101 |
102 | If set, the agent will set REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT to this value. Otherwise the agent will calculate a value based on the number of SYNCHRONOUS_COMMIT replicas.
103 |
104 | Override for the default REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT value.
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/fci/test/fci_test:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright (C) Microsoft Corporation.
4 | #
5 | # Some simple functional tests for the resource agent.
6 | # Should only be run when nothing else will be starting or stopping SQL Server.
7 | # Any of the default values set in the resource agent can be overridden by exporting an env variable with the same name.
8 | # This even applies to paths of other files the resource agent uses.
9 | # So, for example, setting $FCI_HELPER_BIN allows use of another monitoring script
10 | #
11 | # The --ra-output parameter specifies where resource agent output is piped to,
12 | # --test-output specifies where test output is piped to.
13 | #
14 | # By default, output from the resource agent is suppressed and output from the test goes to the terminal.
15 | #
16 |
17 | usage() {
18 | cat<<-EOF
19 | fci_test [--ra-output