├── integration-examples
├── lambda-vpc-connection-sample
│ ├── __init__.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── unit
│ │ │ ├── __init__.py
│ │ │ └── test_handler.py
│ │ ├── integration
│ │ │ ├── __init__.py
│ │ │ └── test_api_gateway.py
│ │ └── requirements.txt
│ ├── hello_world
│ │ ├── __init__.py
│ │ ├── requirements.txt
│ │ └── app.py
│ └── events
│ │ └── event.json
├── splunk-otel-dotnet-docker
│ ├── .gitignore
│ ├── Program.cs
│ ├── MultiStageDocker
│ │ ├── entrypoint.sh
│ │ ├── MultiStageDocker.csproj
│ │ └── Dockerfile
│ └── MultiStageDockerNuGetOption
│ │ ├── MultiStageDocker.csproj
│ │ └── Dockerfile
├── usage-reports-scripts
│ ├── requirements.txt
│ ├── images
│ │ └── custom-metric-report.png
│ ├── README.md
│ └── custom-metric-report-parser.py
├── terraform-jumpstart
│ ├── export_script
│ │ ├── .gitignore
│ │ └── requirements.txt
│ ├── terraform.tfvars.template
│ ├── modules
│ │ ├── dashboards
│ │ │ ├── parent
│ │ │ │ ├── variables.tf
│ │ │ │ ├── versions.tf
│ │ │ │ └── main.tf
│ │ │ ├── usage
│ │ │ │ ├── variables.tf
│ │ │ │ ├── versions.tf
│ │ │ │ └── main.tf
│ │ │ ├── executive-dashboards
│ │ │ │ ├── variables.tf
│ │ │ │ ├── versions.tf
│ │ │ │ └── main.tf
│ │ │ └── rum_and_synthetics
│ │ │ │ ├── variables.tf
│ │ │ │ ├── versions.tf
│ │ │ │ └── main.tf
│ │ ├── aws
│ │ │ ├── versions.tf
│ │ │ ├── variables.tf
│ │ │ ├── ecs.tf
│ │ │ ├── elb.tf
│ │ │ └── lambda.tf
│ │ ├── azure
│ │ │ ├── versions.tf
│ │ │ ├── vm.tf
│ │ │ └── variables.tf
│ │ ├── docker
│ │ │ ├── versions.tf
│ │ │ ├── variables.tf
│ │ │ └── container.tf
│ │ ├── gcp
│ │ │ ├── versions.tf
│ │ │ ├── compute.tf
│ │ │ ├── variables.tf
│ │ │ └── storage.tf
│ │ ├── host
│ │ │ ├── versions.tf
│ │ │ ├── variables.tf
│ │ │ ├── disk.tf
│ │ │ ├── mem.tf
│ │ │ └── cpu.tf
│ │ ├── kafka
│ │ │ ├── versions.tf
│ │ │ └── variables.tf
│ │ ├── pivotal
│ │ │ ├── versions.tf
│ │ │ ├── system.tf
│ │ │ ├── variables.tf
│ │ │ ├── RouteEmitter.tf
│ │ │ └── gorouter.tf
│ │ └── kubernetes
│ │ │ ├── versions.tf
│ │ │ ├── variables.tf
│ │ │ └── node.tf
│ ├── versions.tf
│ ├── .gitignore
│ ├── variables.tf
│ ├── main.tf
│ └── README.md
├── apiScripts
│ ├── requirements.txt
│ ├── token.yaml
│ ├── README.md
│ ├── getMetricsForHost.py
│ └── muteAllAutoDetectors.py
├── active_detectors
│ ├── requirements.txt
│ ├── images
│ │ └── screenshot.png
│ └── README.md
├── ci-webhook-serverless
│ ├── ci-webhook-handler
│ │ ├── requirements.txt
│ │ └── serverless.yml
│ └── generate-test-events.py
├── synthetics-examples
│ ├── API
│ │ ├── status-page-to-metrics-api
│ │ │ ├── image.png
│ │ │ ├── synthetic-variables.png
│ │ │ └── README.md
│ │ ├── token-expiration-to-metrics-api
│ │ │ ├── synthetic-variables.png
│ │ │ ├── token-expire-chart.png
│ │ │ └── README.md
│ │ └── status-to-splunk-hec
│ │ │ ├── README.md
│ │ │ └── synthetics_status_to_splunk_hec_api_check.tf
│ └── Browser
│ │ └── hipstershop-complete-order-test-browser
│ │ └── README.md
├── jenkins-apm
│ └── README.md
├── system-scanner
│ ├── validators.py
│ ├── utils.py
│ ├── os_info.py
│ └── health.py
├── azure-devops
│ └── README.md
├── get-rum-urls
│ └── README.md
├── README.md
└── splunk-otel-databricks
│ └── splunk-start-up.sh
├── dashboards-and-dashboard-groups
├── otel-receiver-dashboard-generator
│ ├── requirements.txt
│ ├── otel-receiver-yaml
│ │ ├── gitproviderreceiver_metadata.yaml
│ │ ├── httpcheckreceiver_metadata.yaml
│ │ ├── nginxreceiver_metadata.yaml
│ │ ├── filestatsreceiver_metadata.yaml
│ │ ├── sshcheckreceiver_metadata.yaml
│ │ ├── riakreceiver_metadata.yaml
│ │ ├── rabbitmqreceiver_metadata.yaml
│ │ ├── chronyreceiver_metadata.yaml
│ │ └── couchdbreceiver_metadata.yaml
│ ├── observability-tf-configs
│ │ ├── httpcheckreceiver_metadata.yaml.tf
│ │ ├── gitproviderreceiver_metadata.yaml.tf
│ │ ├── filestatsreceiver_metadata.yaml.tf
│ │ └── nginxreceiver_metadata.yaml.tf
│ └── pull-otel-yaml.py
├── oracle-cloud
│ ├── occ-instance.png
│ ├── occ-instances.png
│ └── README.md
├── inferred-services-dg
│ ├── Inferred-services-DashboardGroup.png
│ └── README.md
├── SLO-Error-Budget
│ └── main.tf
├── RUM-Real-User-Monitoring
│ └── README.md
├── snowflakedb
│ └── Configuration
│ │ ├── splunk-otel-collector.conf
│ │ ├── snowflake-receiver
│ │ │ └── splunk-otel-collector.conf
│ │ └── README.md
├── executive-dashboards
│ └── main.tf
├── SC4SNMP
│ └── README.md
└── metricshub
│ └── README.md
├── detectors
├── inferred-services-detectors
│ ├── detectors-1.png
│ ├── detectors-errors.png
│ ├── detectors-latency.png
│ └── README.md
├── metricshub
│ ├── README.md
│ ├── Hardware - High number of errors.json
│ ├── Hardware - Missing device.json
│ ├── Hardware - Critically low fan speed (%).json
│ ├── Hardware - Predicted failure.json
│ ├── Hardware - Device status failed.json
│ ├── Hardware - Critical LUN pathing issue.json
│ ├── Hardware - Device status degraded.json
│ ├── Hardware - LUN multi-pathing issue.json
│ ├── Hardware - Network errors.json
│ ├── Hardware - Physical intrusion.json
│ ├── Hardware - Networking link down.json
│ ├── Hardware - Tape drive needs cleaning.json
│ ├── Hardware - Low fan speed (%).json
│ ├── Hardware - Connector failed.json
│ └── Hardware - Low battery.json
├── snowflakedb
│ ├── README.md
│ ├── Snowflake - Blocked Queries.json
│ ├── Snowflake - Overloaded Queries.json
│ └── Snowflake - No Queries in Last 3 Hours.json
└── README.md
├── import-dashboards-terraform
├── Dockerfile
├── go.mod
└── Makefile
├── .gitignore
├── README.md
├── CONTRIBUTING.md
└── .github
└── workflows
└── CLA.yaml
/integration-examples/lambda-vpc-connection-sample/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 |
--------------------------------------------------------------------------------
/integration-examples/usage-reports-scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | rich
2 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/hello_world/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/unit/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/export_script/.gitignore:
--------------------------------------------------------------------------------
1 | *.tf
2 | venv
--------------------------------------------------------------------------------
/integration-examples/apiScripts/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonpath_ng
2 | pyyaml
3 | requests
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/integration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/hello_world/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
--------------------------------------------------------------------------------
/integration-examples/active_detectors/requirements.txt:
--------------------------------------------------------------------------------
1 | rich
2 | background
3 | requests
4 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/export_script/requirements.txt:
--------------------------------------------------------------------------------
1 | signalfx >= 1.1.7
2 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | boto3
3 | requests
4 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/requirements.txt:
--------------------------------------------------------------------------------
1 | PyYAML==6.*
2 | Requests==2.*
3 |
--------------------------------------------------------------------------------
/integration-examples/ci-webhook-serverless/ci-webhook-handler/requirements.txt:
--------------------------------------------------------------------------------
1 | # dependencies for current version
2 | signalfx
3 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/terraform.tfvars.template:
--------------------------------------------------------------------------------
1 | api_token="1234xxx5678yyyy"
2 | realm="eu0"
3 | o11y_prefix="[Splunk]"
4 |
--------------------------------------------------------------------------------
/detectors/inferred-services-detectors/detectors-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/detectors/inferred-services-detectors/detectors-1.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/parent/variables.tf:
--------------------------------------------------------------------------------
1 | variable "o11y_prefix" {
2 | type = string
3 | description = "Dashboard Prefix"
4 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/usage/variables.tf:
--------------------------------------------------------------------------------
1 | variable "o11y_prefix" {
2 | type = string
3 | description = "Dashboard Prefix"
4 | }
--------------------------------------------------------------------------------
/detectors/inferred-services-detectors/detectors-errors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/detectors/inferred-services-detectors/detectors-errors.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/variables.tf:
--------------------------------------------------------------------------------
1 | variable "o11y_prefix" {
2 | type = string
3 | description = "Detector Prefix"
4 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/variables.tf:
--------------------------------------------------------------------------------
1 | variable "o11y_prefix" {
2 | type = string
3 | description = "Dashboard Prefix"
4 | }
--------------------------------------------------------------------------------
/detectors/inferred-services-detectors/detectors-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/detectors/inferred-services-detectors/detectors-latency.png
--------------------------------------------------------------------------------
/integration-examples/active_detectors/images/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/active_detectors/images/screenshot.png
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/oracle-cloud/occ-instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/dashboards-and-dashboard-groups/oracle-cloud/occ-instance.png
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/oracle-cloud/occ-instances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/dashboards-and-dashboard-groups/oracle-cloud/occ-instances.png
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/Program.cs:
--------------------------------------------------------------------------------
1 | var builder = WebApplication.CreateBuilder(args);
2 | var app = builder.Build();
3 |
4 | app.MapGet("/", () => "Hello World!");
5 |
6 | app.Run();
7 |
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Read in the file of environment settings
3 | . /$HOME/.splunk-otel-dotnet/instrument.sh
4 |
5 | # Then run the CMD
6 | exec "$@"
--------------------------------------------------------------------------------
/integration-examples/usage-reports-scripts/images/custom-metric-report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/usage-reports-scripts/images/custom-metric-report.png
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/status-page-to-metrics-api/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/synthetics-examples/API/status-page-to-metrics-api/image.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/aws/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/azure/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/docker/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/gcp/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/host/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/kafka/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/pivotal/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/versions.tf:
--------------------------------------------------------------------------------
1 |
2 | terraform {
3 | required_version = ">= 0.13"
4 | required_providers {
5 | signalfx = {
6 | source = "splunk-terraform/signalfx"
7 | }
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/kubernetes/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/parent/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/usage/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/dashboards-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | version = ">=6.13.1"
6 | }
7 | }
8 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/.gitignore:
--------------------------------------------------------------------------------
1 | terraform.tfstate
2 | terraform.tfstate.d/*
3 | terraform.tfstate.backup
4 | .terraform.lock.hcl
5 | .terraform
6 | secret.tfvars
7 | terraform.tfvars
8 | .DS_Store
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/status-page-to-metrics-api/synthetic-variables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/synthetics-examples/API/status-page-to-metrics-api/synthetic-variables.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | }
6 | }
7 | required_version = ">= 0.13"
8 | }
9 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/usage/main.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_dashboard_group" "usageoverview" {
2 | name = "${var.o11y_prefix} Usage Overview (Terraform)"
3 | description = "Host Based Model, MTS and Events Usage"
4 | }
5 |
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/synthetic-variables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/synthetic-variables.png
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/token-expire-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/splunk/observability-content-contrib/HEAD/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/token-expire-chart.png
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/main.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_dashboard_group" "rumandsynthetics" {
2 | name = "${var.o11y_prefix} RUM and Synthetics (Terraform)"
3 | description = "RUM and Synthetics Dashboards"
4 | }
5 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/parent/main.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_dashboard_group" "parentchildoverview" {
2 | name = "${var.o11y_prefix} Parent/Child Overview (Terraform)"
3 | description = "Parent/Child Overview/Usage Dashboards"
4 | }
5 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/variables.tf:
--------------------------------------------------------------------------------
1 | variable "api_token" {
2 | description = "Splunk API Token"
3 | }
4 |
5 | variable "realm" {
6 | description = "Splunk Realm"
7 | }
8 |
9 | variable "o11y_prefix" {
10 | type = string
11 | description = "Detector Prefix"
12 | default = "[Splunk]"
13 | }
14 |
--------------------------------------------------------------------------------
/import-dashboards-terraform/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.24-alpine AS build
2 |
3 | RUN apk add --no-cache git
4 | WORKDIR /app
5 | COPY go.mod go.sum ./
6 | RUN go mod download
7 | COPY import.go .
8 | RUN go build -o main .
9 |
10 | FROM hashicorp/terraform:1.13
11 |
12 | WORKDIR /app
13 | COPY --from=build /app/main .
14 | ENTRYPOINT ["/app/main"]
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/hello_world/app.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 |
4 |
5 | def lambda_handler(event, context):
6 | x = requests.get('https://w3schools.com/python/demopage.htm')
7 | print(x.text)
8 |
9 | return {
10 | 'statusCode': 200,
11 | 'body': json.dumps(x.text)
12 | }
13 |
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/MultiStageDocker.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk.Web">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>net8.0</TargetFramework>
5 |     <Nullable>enable</Nullable>
6 |     <ImplicitUsings>enable</ImplicitUsings>
7 |     <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
8 |   </PropertyGroup>
9 |
10 |   <!-- The XML markup of this project file was stripped during extraction; the element
11 |        names above are a standard ASP.NET Core reconstruction of the surviving values. -->
12 |
13 | </Project>
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/MultiStageDockerNuGetOption/MultiStageDocker.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk.Web">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>net8.0</TargetFramework>
5 |     <Nullable>enable</Nullable>
6 |     <ImplicitUsings>enable</ImplicitUsings>
7 |     <DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
8 |   </PropertyGroup>
9 |
10 |   <!-- The XML markup of this project file was stripped during extraction; the element
11 |        names above are a standard ASP.NET Core reconstruction of the surviving values. -->
12 |
13 | </Project>
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/SLO-Error-Budget/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | version = ">=6.13.1"
6 | }
7 | }
8 | }
9 |
10 | variable "signalfx_auth_token" {
11 | type=string
12 | }
13 |
14 | provider "signalfx" {
15 | auth_token = "${var.signalfx_auth_token}"
16 | # If your organization uses a different realm
17 | # api_url = "https://api.us2.signalfx.com"
18 | # If your organization uses a custom URL
19 | # custom_app_url = "https://myorg.signalfx.com"
20 | }
21 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/main.tf:
--------------------------------------------------------------------------------
1 | ### Create a Dashboard Group for our Dashboards
2 | resource "signalfx_dashboard_group" "exec_dashboard_group" {
3 | name = "${var.o11y_prefix} Exec Level Dashboards"
4 | description = "Executive Level Dashboards"
5 |
6 | ### Note that if you use these features, you must use a user's
7 | ### admin key to authenticate the provider, lest Terraform not be able
8 | ### to modify the dashboard group in the future!
9 | #authorized_writer_teams = [signalfx_team.mycoolteam.id]
10 | #authorized_writer_users = ["abc123"]
11 | }
12 |
--------------------------------------------------------------------------------
/integration-examples/apiScripts/token.yaml:
--------------------------------------------------------------------------------
1 | # these values are used by the apiScripts
2 | access_token: # required. User API access token from your Splunk Observability Cloud organization
3 | realm: # required. Realm for your Splunk Observability Cloud organization (e.g. us1)
4 | emailAddress: # required. Email address that you would like to add to the detector(s)
5 | detectorName: # optional. If not included, the script will add the email address to all detectors
6 | limit: 50 # optional; defaults to 50. Number of results to return from the list of detectors that match your search criteria
7 | offset: 0 # optional; defaults to 0. Index, in the list of detectors that match your search criteria, at which to start downloading results
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/RUM-Real-User-Monitoring/README.md:
--------------------------------------------------------------------------------
1 | # Real User Monitoring Detailed Dashboards
2 |
3 | This directory contains detail-oriented dashboards and the required chart definitions in Terraform for:
4 | - RUM Apps
5 | - RUM Browsers
6 | - RUM Synthetics
7 |
8 | Each of these dashboards is meant as a place to look at the details of the specific metrics RUM provides, split by Browser, App, or Rigor Synthetics.
9 |
10 | To use:
11 |
12 | ```
13 | terraform init --upgrade
14 | terraform plan -var="access_token=" -var="realm="
15 | terraform apply -auto-approve -var="access_token=" -var="realm="
16 | ```
17 |
18 | And to remove:
19 |
20 | ```
21 | terraform destroy -auto-approve -var="access_token=" -var="realm="
22 | ```
23 |
24 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/snowflakedb/Configuration/splunk-otel-collector.conf:
--------------------------------------------------------------------------------
1 | SPLUNK_CONFIG=/etc/otel/collector/agent_config.yaml
2 | SPLUNK_ACCESS_TOKEN=TH1S_1SN7_4_R34L_T0K3N
3 | SPLUNK_REALM=us0
4 | SPLUNK_API_URL=https://api.signalfx.com
5 | SPLUNK_LOGOBSERVER_URL=https://ingest.signalfx.com/v1/log
6 | SPLUNK_INGEST_URL=https://ingest.signalfx.com
7 | SPLUNK_TRACE_URL=https://ingest.signalfx.com/v2/trace
8 | SPLUNK_HEC_URL=https://ingest.signalfx.com/v1/log
9 | SPLUNK_HEC_TOKEN=TH1S_1SN7_4_R34L_T0K3N
10 | SPLUNK_MEMORY_TOTAL_MIB=512
11 | SPLUNK_BUNDLE_DIR=/usr/lib/splunk-otel-collector/agent-bundle
12 | SPLUNK_COLLECTD_DIR=/usr/lib/splunk-otel-collector/agent-bundle/run/collectd
13 | SNOWFLAKE_USER=ADD_YOUR_SNOWFLAKE_USERNAME_WITH_ACCOUNT_ADMIN_PRIVS
14 | SNOWFLAKE_PASS=ADD_YOUR_SNOWFLAKE_PASSWORD
15 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/snowflakedb/Configuration/snowflake-receiver/splunk-otel-collector.conf:
--------------------------------------------------------------------------------
1 | SPLUNK_CONFIG=/etc/otel/collector/agent_config.yaml
2 | SPLUNK_ACCESS_TOKEN=TH1S_1SN7_4_R34L_T0K3N
3 | SPLUNK_REALM=us0
4 | SPLUNK_API_URL=https://api.signalfx.com
5 | SPLUNK_LOGOBSERVER_URL=https://ingest.signalfx.com/v1/log
6 | SPLUNK_INGEST_URL=https://ingest.signalfx.com
7 | SPLUNK_TRACE_URL=https://ingest.signalfx.com/v2/trace
8 | SPLUNK_HEC_URL=https://ingest.signalfx.com/v1/log
9 | SPLUNK_HEC_TOKEN=TH1S_1SN7_4_R34L_T0K3N
10 | SPLUNK_MEMORY_TOTAL_MIB=512
11 | SPLUNK_BUNDLE_DIR=/usr/lib/splunk-otel-collector/agent-bundle
12 | SPLUNK_COLLECTD_DIR=/usr/lib/splunk-otel-collector/agent-bundle/run/collectd
13 | SNOWFLAKE_USER=ADD_YOUR_SNOWFLAKE_USERNAME_WITH_ACCOUNT_ADMIN_PRIVS
14 | SNOWFLAKE_PASS=ADD_YOUR_SNOWFLAKE_PASSWORD
15 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/oracle-cloud/README.md:
--------------------------------------------------------------------------------
1 | # Oracle Cloud Infrastructure (OCI) Example Content
2 |
3 | NOTE: The example content included here presumes you have connected to Oracle
4 | Cloud using the metrics forwarding integration available [here](https://github.com/splunk/oracle-cloud-examples-splunk-observability/tree/master/samples/oci-monitoring-metrics-to-splunk-observability-python).
5 | You may also want to collect log events using the integration available [here](https://github.com/splunk/oracle-cloud-examples-splunk-observability/tree/master/samples/oci-logs-splunk-hec).
6 |
7 | You should be able to import these examples directly using the UI: click the "+"
8 | (upper right corner) and select Import -> Dashboard Group.
9 |
10 | The content will provide:
11 |
12 | 1. Aggregate View:
13 |
14 | ![Aggregate view](./occ-instances.png)
15 |
16 | 2. Instance View:
17 |
18 | ![Instance view](./occ-instance.png)
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # VSCode
2 | .vscode/*
3 | !.vscode/settings.json
4 | !.vscode/tasks.json
5 | !.vscode/launch.json
6 | !.vscode/extensions.json
7 | *.code-workspace
8 |
9 | # Local History for Visual Studio Code
10 | .history/
11 |
12 | # Common credential files
13 | **/credentials.json
14 | **/client_secrets.json
15 | **/client_secret.json
16 | *creds*
17 | *.dat
18 | *password*
19 | *.httr-oauth*
20 |
21 |
22 | # Mac/OSX
23 | .DS_Store
24 |
25 | # Distribution / packaging
26 | .Python
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | wheels/
39 | share/python-wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Environments
50 | .env
51 | .venv
52 | env/
53 | venv/
54 | ENV/
55 | env.bak/
56 | venv.bak/
57 |
58 | # Terraform
59 | .terraform.lock.hcl
60 | .terraform/
61 | terraform.tfstate
62 | terraform.tfstate.backup
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/gcp/compute.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "gcp_cpu_historical_norm" {
2 | name = "${var.o11y_prefix} GCP Compute Engine CPU % greater than historical norm"
3 | description = "Alerts when CPU usage for this host for the last 10 minutes was significantly higher than normal, as compared to the last 24 hours"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_recent import against_recent
6 | A = data('instance/cpu/utilization').publish(label='A', enable=False)
7 | against_recent.detector_mean_std(stream=A, current_window='10m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU utilization is significantly greater than normal, and increasing')
8 | EOF
9 | rule {
10 | detect_label = "CPU utilization is significantly greater than normal, and increasing"
11 | severity = "Warning"
12 | parameterized_body = var.message_body
13 | }
14 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/azure/vm.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "azure_cpu_historical_norm" {
2 | name = "${var.o11y_prefix} Azure VM CPU % greater than historical norm"
3 | description = "Alerts when CPU usage for this host for the last 10 minutes was significantly higher than normal, as compared to the last 24 hours"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_recent import against_recent
6 | A = data('Percentage CPU', filter=(filter('primary_aggregation_type', 'true'))).publish(label='A', enable=False)
7 | against_recent.detector_mean_std(stream=A, current_window='10m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU % is significantly greater than normal, and increasing')
8 | EOF
9 | rule {
10 | detect_label = "CPU % is significantly greater than normal, and increasing"
11 | severity = "Warning"
12 | parameterized_body = var.message_body
13 | }
14 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/aws/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/gcp/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/host/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/pivotal/system.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "pivotal_cloudfoundry_system_errors" {
2 | name = "${var.o11y_prefix} Pivotal CloudFoundry system errors"
3 | description = "Alerts for various Pivotal CloudFoundry system related error scenarios"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_periods import against_periods
6 | from signalfx.detectors.against_recent import against_recent
7 | system_healthy = data('system.healthy', filter=filter('metric_source', 'cloudfoundry'), rollup='average').mean(over='5m').publish(label='system_healthy', enable=False)
8 | detect(when(system_healthy > 1)).publish('Pivotal Cloudfoundry - The value of system.healthy - Mean(5m) is above 1.')
9 | EOF
10 | rule {
11 | detect_label = "Pivotal Cloudfoundry - The value of system.healthy - Mean(5m) is above 1."
12 | severity = "Minor"
13 | tip = "Investigate CF logs for the unhealthy component(s)."
14 | parameterized_body = var.message_body
15 | }
16 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/azure/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/docker/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/kafka/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/pivotal/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/integration-examples/jenkins-apm/README.md:
--------------------------------------------------------------------------------
1 | # APM Tracing for Jenkins
2 | With Splunk APM and Splunk Log Observer you can gain a deeper understanding of your Jenkins usage!
3 | - Get Waterfall trace charts of your entire pipeline run
4 | - Identify long-running steps in your Jenkins jobs
5 | - Easily mark deployment successes and failures on Splunk Observability dashboards
6 | - And much more!
7 |
8 | For a complete integration guide, including example configuration files and dashboard exports, check out the repository at [https://github.com/splunk/splunk-jenkins-otel](https://github.com/splunk/splunk-jenkins-otel)
9 |
10 | ## For more details:
11 | - Splunk Blog: [Jenkins, OpenTelemetry, Observability](https://www.splunk.com/en_us/blog/devops/jenkins-opentelemetry-observability.html)
12 | - Jenkins OpenTelemetry Plugin: [Jenkins OTEL plugin](https://plugins.jenkins.io/opentelemetry/#getting-started) (by Cyrille Le Clerc) can be used with an [OTEL collector](https://github.com/signalfx/splunk-otel-collector) to send to Splunk Observability Cloud (formerly SignalFx) APM
13 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/kubernetes/variables.tf:
--------------------------------------------------------------------------------
1 | variable "message_body" {
2 | type = string
3 |
4 | default = <<-EOF
5 | {{#if anomalous}}
6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}.
7 | {{else}}
8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}.
9 | {{/if}}
10 |
11 | {{#if anomalous}}
12 | Triggering condition: {{{readableRule}}}
13 | {{/if}}
14 |
15 | {{#if anomalous}}
16 | Signal value: {{inputs.A.value}}
17 | {{else}}
18 | Current signal value: {{inputs.A.value}}
19 | {{/if}}
20 |
21 | {{#notEmpty dimensions}}
22 | Signal details: {{{dimensions}}}
23 | {{/notEmpty}}
24 |
25 | {{#if anomalous}}
26 | {{#if runbookUrl}}
27 | Runbook: {{{runbookUrl}}}
28 | {{/if}}
29 | {{#if tip}}
30 | Tip: {{{tip}}}
31 | {{/if}}
32 | {{/if}}
33 | EOF
34 | }
35 |
36 | variable "o11y_prefix" {
37 | type = string
38 | description = "Detector Prefix"
39 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Contribution repository for Splunk Observability Content
2 |
3 | This repository exists to enable sharing of content. No formal testing is
4 | required, so it just might work. Some examples of content that would fit here:
5 |
6 | * Dashboards
7 | * Detectors
8 | * API Examples
9 | * Usage Reports
10 | * OTel Example Configurations
11 | * Links to Other Relevant Projects
12 |
13 | ## Contributions
14 | Contributions are welcome and encouraged!
15 |
16 | Please see [CONTRIBUTING.md](./CONTRIBUTING.md) for details on contributing to this repository.
17 |
18 | All contributors must sign the CLA and Code of Conduct. You will be prompted by the [cla-assistant](https://github.com/marketplace/actions/cla-assistant-lite) workflow action during your Pull Request for your agreement.
19 |
20 | To agree to the CLA and COC please comment these in **separate individual messages** on your PR:
21 |
22 | CLA:
23 | ```
24 | I have read the CLA Document and I hereby sign the CLA
25 | ```
26 |
27 | Code of Conduct:
28 | ```
29 | I have read the Code of Conduct and I hereby accept the Terms
30 | ```
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/inferred-services-dg/README.md:
--------------------------------------------------------------------------------
1 | # Inferred Services - assets to help you observe them
2 |
3 | 1. [Dashboard Group - Inferred Services](./Dashboard_Group_Inferred%20Services.json)
4 |
5 | Feel free to also use
6 |
7 | 2. [Sample Detectors: Latency Spike (>3s for 90% of 5min); Error Rate (>50%, sudden change)](../../detectors/inferred-services-detectors/README.md)
8 |
9 | Learn more about Inferred Services:
10 | - [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html)
11 | - [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions)
12 |
13 | ## Inferred Services - Dashboard Group
14 |
15 | 1. Import Dashboard Group
16 | *From UI:*
17 | Click on '+' on the top right and select Import->Dashboard Group.
18 |
19 | 2. Find your dashboard group `Inferred Services` and use it as a starting point for creating charts.
20 |
21 | Screenshot:
22 | ![Inferred Services dashboard group](./Inferred-services-DashboardGroup.png)
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/executive-dashboards/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_providers {
3 | signalfx = {
4 | source = "splunk-terraform/signalfx"
5 | version = ">=6.13.1"
6 | }
7 | }
8 | }
9 |
10 | variable "signalfx_auth_token" {
11 | type=string
12 | }
13 |
14 | provider "signalfx" {
15 | auth_token = "${var.signalfx_auth_token}"
16 | # If your organization uses a different realm
17 | # api_url = "https://api.us2.signalfx.com"
18 | # If your organization uses a custom URL
19 | # custom_app_url = "https://myorg.signalfx.com"
20 | }
21 |
22 |
23 | ### Create a Dashboard Group for our Dashboards
24 | resource "signalfx_dashboard_group" "exec_dashboard_group" {
25 | name = "Exec Level Dashboards"
26 | description = "Executive Level Dashboards"
27 |
28 | ### Note that if you use these features, you must use a user's
29 | ### admin key to authenticate the provider, lest Terraform not be able
30 | ### to modify the dashboard group in the future!
31 | #authorized_writer_teams = [signalfx_team.mycoolteam.id]
32 | #authorized_writer_users = ["abc123"]
33 | }
34 |
--------------------------------------------------------------------------------
/import-dashboards-terraform/go.mod:
--------------------------------------------------------------------------------
1 | module main
2 |
3 | go 1.24.0
4 |
5 | toolchain go1.24.3
6 |
7 | require (
8 | github.com/hashicorp/go-version v1.7.0
9 | github.com/hashicorp/hcl/v2 v2.24.0
10 | github.com/hashicorp/terraform-exec v0.22.0
11 | github.com/signalfx/signalfx-go v1.52.0
12 | github.com/zclconf/go-cty v1.16.3
13 | )
14 |
15 | require (
16 | github.com/agext/levenshtein v1.2.1 // indirect
17 | github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect
18 | github.com/google/go-cmp v0.6.0 // indirect
19 | github.com/hashicorp/terraform-json v0.22.1 // indirect
20 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect
21 | golang.org/x/mod v0.17.0 // indirect
22 | golang.org/x/net v0.33.0 // indirect
23 | golang.org/x/oauth2 v0.27.0 // indirect
24 | golang.org/x/sync v0.14.0 // indirect
25 | golang.org/x/text v0.25.0 // indirect
26 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
27 | )
28 |
29 | // https://github.com/hashicorp/terraform-exec/pull/446 can be removed after the PR is merged
30 | replace github.com/hashicorp/terraform-exec v0.22.0 => github.com/hrmsk66/terraform-exec v0.21.0
31 |
--------------------------------------------------------------------------------
/integration-examples/system-scanner/validators.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | from typing import Optional
4 |
5 |
6 | def sanitize_command_output(output: str) -> str:
7 | """Sanitize command output to prevent injection"""
8 | if not isinstance(output, str):
9 | return ""
10 | return re.sub(r"[^\w\s.()-]", "", output) # Added () to the allowed characters
11 |
12 |
13 | def validate_path(path: str) -> Optional[str]:
14 | """Validate and normalize file path"""
15 | if not path or not isinstance(path, str):
16 | return None
17 | try:
18 | normalized_path = os.path.normpath(path)
19 | return normalized_path if os.path.exists(normalized_path) else None
20 | except Exception:
21 | return None
22 |
23 |
24 | def validate_version_string(version: str) -> str:
25 | """Validate and clean version string"""
26 | if not version or not isinstance(version, str):
27 | return "Unknown"
28 | # Remove any potentially harmful characters, keeping only valid version characters
29 | cleaned = re.sub(r"[^\w\s.-]", "", version)
30 | return cleaned if cleaned else "Unknown"
31 |
--------------------------------------------------------------------------------
/import-dashboards-terraform/Makefile:
--------------------------------------------------------------------------------
1 | IMAGE_NAME ?= import-tf-script
2 | SIGNALFX_API_URL ?= https://app.us0.signalfx.com
3 | RELATIVE_DIR_PATH ?= "generated-dashboards"
4 |
5 |
6 | ifneq (${MAKEFILE_DIR_LOCATION},${WORKING_DIR})
7 |
8 | %:
9 | $(MAKE) -C ${MAKEFILE_DIR_LOCATION} $@
10 |
11 | .PHONY: %
12 |
13 | else
14 |
15 | .PHONY: tidy
16 | tidy:
17 | go mod tidy
18 |
19 | .PHONY: fmt
20 | fmt:
21 | go fmt ./...
22 |
23 | .PHONY: build
24 | build:
25 | docker buildx build -t $(IMAGE_NAME) .
26 |
27 | .PHONY: import-dashboard-group
28 | import-dashboard-group: clean build
29 | @docker run --volume="$(shell pwd):/terraform-state" \
30 | -it $(IMAGE_NAME) --api-url $(SIGNALFX_API_URL) \
31 | --api-token=$(SIGNALFX_AUTH_TOKEN) \
32 | --groups $(GROUP_IDS) \
33 | --dir /terraform-state/$(RELATIVE_DIR_PATH) \
34 | --add-var-file \
35 | --add-versions-file \
36 | --allow-chart-name-conflict
37 |
38 | .PHONY: clean
39 | clean:
40 | @echo "Cleaning up containers and image for $(IMAGE_NAME)"
41 | @docker ps -a -q --filter ancestor=$(IMAGE_NAME) | xargs -r docker rm -f 2>/dev/null || true
42 | @docker rmi $(IMAGE_NAME) -f 2>/dev/null || true
43 | endif
--------------------------------------------------------------------------------
/integration-examples/apiScripts/README.md:
--------------------------------------------------------------------------------
1 | # Contribution repository for Splunk Observability Content apiScripts
2 |
3 | This repository exists to enable sharing of content.
4 |
5 | This directory contains sample API Scripts that you can use to call the Splunk Observability Cloud API.
6 |
7 | # addEmailToDetectors.py
8 | This script allows users to add an email address to the notifications for one or more detectors. It works with the token.yaml file, which contains the optional and required values needed to run the script. Please refer to the comments in the token.yaml file for more details.
9 |
10 | # getMetricsForHost.py
11 | This script is used to find all the metrics for a given host.
12 |
13 | Usage:
14 |
15 | ```
16 | python3 getMetricsForHost.py -h <hostname> -r <realm> -t <api_token>
17 | ```
18 |
19 | # muteAllAutoDetectors.py
20 | This script mutes all auto-detectors. It can also be used to re-enable (unmute) all detectors. (NOTE: unmuting does not distinguish between detectors muted by the script and those muted manually.)
21 |
22 | Usage:
23 | ```
24 | python3 muteAllAutoDetectors.py (to mute all)
25 | python3 muteAllAutoDetectors.py -e (to enable all)
26 | ```
27 |
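28 | For illustration, here is a minimal sketch (not one of the scripts in this directory) of how the token.yaml values might be used to list matching detectors via the Splunk Observability Cloud API:
29 |
30 | ```
31 | # illustrative sketch only: load token.yaml and list detectors that match detectorName
32 | import requests
33 | import yaml
34 |
35 | with open("token.yaml") as f:
36 |     cfg = yaml.safe_load(f)
37 |
38 | url = f"https://api.{cfg['realm']}.signalfx.com/v2/detector"
39 | params = {"limit": cfg.get("limit", 50), "offset": cfg.get("offset", 0)}
40 | if cfg.get("detectorName"):
41 |     params["name"] = cfg["detectorName"]
42 |
43 | resp = requests.get(url, headers={"X-SF-TOKEN": cfg["access_token"]}, params=params)
44 | resp.raise_for_status()
45 | for detector in resp.json().get("results", []):
46 |     print(detector["id"], detector["name"])
47 | ```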
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/aws/ecs.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "aws_ecs_smartagent_cpu" {
2 | name = "${var.o11y_prefix} ECS Cluster High CPU 5m (SFX) - SmartAgent"
3 | description = "Alert when an ECS Cluster has sustained high CPU levels for 5 minutes"
4 | program_text = <<-EOF
5 | A = data('cpu.usage.total', filter=filter('ecs_task_group', '*'), rollup='rate').publish(label='A', enable=False)
6 | B = data('cpu.usage.system', filter=filter('ecs_task_group', '*'), rollup='rate').publish(label='B', enable=False)
7 | C = ((A/B)*100).publish(label='C', enable=False)
8 | E = (C).min().publish(label='E', enable=False)
9 | G = (C).percentile(pct=10).publish(label='G', enable=False)
10 | F = (C).percentile(pct=50).publish(label='F', enable=False)
11 | H = (C).percentile(pct=95).publish(label='H', enable=False)
12 | D = (C).max().publish(label='D', enable=False)
13 | detect(when(D > 90, lasting='5m')).publish('AWS/ECS Cluster High CPU 5m')
14 | EOF
15 | rule {
16 | detect_label = "AWS/ECS Cluster High CPU 5m"
17 | severity = "Major"
18 | parameterized_body = var.message_body
19 | }
20 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/docker/container.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "container_cpu_utilization" {
2 | name = "${var.o11y_prefix} Container CPU utilization % high"
3 | description = "Alerts when CPU Utilization % is between 70% & 80% for 10mins and > 80% for 5mins"
4 | program_text = <<-EOF
5 | A = data('cpu.usage.total', filter=filter('plugin', 'docker')).publish(label='A', enable=False)
6 | B = data('cpu.usage.system', filter=filter('plugin', 'docker')).publish(label='B', enable=False)
7 | C = (A/B*100).publish(label='Container CPU')
8 | detect(when(C > 80, lasting='5m')).publish('Container CPU utilization % is above 80 for 5m')
9 | detect(when(not (C > 80) and not (C < 70), lasting='10m')).publish('Container CPU utilization % is within 70 and 80 for 10m')
10 | EOF
11 | rule {
12 | detect_label = "Container CPU utilization % is within 70 and 80 for 10m"
13 | severity = "Warning"
14 | parameterized_body = var.message_body
15 | }
16 | rule {
17 | detect_label = "Container CPU utilization % is above 80 for 5m"
18 | severity = "Major"
19 | parameterized_body = var.message_body
20 | }
21 | }
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/MultiStageDockerNuGetOption/Dockerfile:
--------------------------------------------------------------------------------
1 | #See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.
2 |
3 | FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base
4 | USER app
5 | WORKDIR /app
6 | EXPOSE 8080
7 |
8 | FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
9 | ARG BUILD_CONFIGURATION=Release
10 | WORKDIR /src
11 | COPY ["MultiStageDocker/MultiStageDocker.csproj", "MultiStageDocker/"]
12 | RUN dotnet restore "./MultiStageDocker/./MultiStageDocker.csproj"
13 | WORKDIR "/src/MultiStageDocker"
14 | COPY . .
15 |
16 | RUN dotnet add "./MultiStageDocker.csproj" package Splunk.OpenTelemetry.AutoInstrumentation --prerelease
17 |
18 | RUN dotnet build "./MultiStageDocker.csproj" -r linux-x64 -c $BUILD_CONFIGURATION -o /app/build
19 |
20 | FROM build AS publish
21 | ARG BUILD_CONFIGURATION=Release
22 | RUN dotnet publish "./MultiStageDocker.csproj" -r linux-x64 -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
23 |
24 | FROM base AS final
25 |
26 | WORKDIR /app
27 | COPY --from=publish /app/publish .
28 |
29 | ENTRYPOINT ["./splunk-launch.sh", "dotnet", "MultiStageDocker.dll"]
--------------------------------------------------------------------------------
/detectors/inferred-services-detectors/README.md:
--------------------------------------------------------------------------------
1 | # Inferred Services - assets to help observe inferred services
2 |
3 | 1. [Detector: Latency Spike (>3s for 90% of 5min)](./POST_Detector_latency_spike.sh)
4 |
5 | 2. [Detector: Error Rate (>50%, sudden change)](./POST_Detector_error_rate.sh)
6 |
7 | Feel free to also use:
8 |
9 | 3. [Dashboard Group - Inferred Services](../../dashboards-and-dashboard-groups/inferred-services-dg/README.md)
10 |
11 | Learn more about Inferred Services:
12 | - [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html)
13 | - [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions)
14 |
15 | ## Inferred Services - Sample Detectors
16 | 
17 |
18 | Use the curl commands in the scripts above to post the detectors (replace `Token` and `Realm` as required).
19 |
20 | These can be used as a starting point to customise signals, thresholds, messaging etc.
21 |
22 | Screenshots:
23 | 
24 | 
25 |
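26 | As a minimal sketch of the pattern those scripts follow (hypothetical detector JSON file name; replace `{REALM}` and `{TOKEN}` with your own values), the POST looks like:
27 | 
28 | ```
29 | curl -X POST "https://api.{REALM}.signalfx.com/v2/detector" \
30 |   -H "Content-Type: application/json" \
31 |   -H "X-SF-TOKEN: {TOKEN}" \
32 |   -d @"./inferred_service_detector.json"
33 | ```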
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/SC4SNMP/README.md:
--------------------------------------------------------------------------------
1 | # SC4SNMP (Splunk Connect for SNMP) Dashboard
2 |
3 | This folder contains a dashboard, `SNMP Agents` (WIP), to get insights out of [SC4SNMP](https://splunk.github.io/splunk-connect-for-snmp/main) metrics. This dashboard was built for `snmpd` running on Linux. Metric names may be different for other devices, which would require the dashboard to be updated.
4 |
5 | There is also a dashboard group, `Network Devices (SNMP)`, which provides basic device, interface and protocol metrics
6 | for devices based on the MIB-2 metrics (which should be available in basically any network device). This dashboard group is
7 | recommended as an initial dashboard for edge network device monitoring.
8 |
9 | ## Setup
10 |
11 | This dashboard requires [SC4SNMP](https://splunk.github.io/splunk-connect-for-snmp/main) to be set up:
12 |
13 | - [SC4SNMP official documentation](https://splunk.github.io/splunk-connect-for-snmp/main)
14 | - [Walkthrough of SC4SNMP setup with Linux agents running `snmpd`](https://smathur-splunk.github.io/workshops/snmp_intro)
15 |
16 | Follow these links to set up and configure SC4SNMP to send data to O11y Cloud. Set up SNMP agents as described in the second link and configure them for polling by SC4SNMP.
17 |
18 | The dashboard should automatically populate, but metric names may need changing as they may vary from agent to agent.
19 |
--------------------------------------------------------------------------------
/integration-examples/active_detectors/README.md:
--------------------------------------------------------------------------------
1 | # Active Detector Report
2 |
3 | This script will fetch the events for detectors (max. 1,000) in an Org.
4 |
5 | The output will show detectors with no events, detectors with events and how many events have fired within the number of days specified on the command line.
6 |
7 | The table supports hyperlinks on the detector ID if your terminal supports it.
8 |
9 | Also, a CSV report will be created in the same directory as you run the script.
10 |
11 | ## Using the Active Detector Report script
12 |
13 | 
14 |
15 | 1. Install the required packages with `pip3 install -r requirements.txt`
16 | 2. Obtain an Org Access Token (with API permissions) and note the Realm your Org is in e.g. us1, eu0, jp0 etc.
17 | 3. Run the script e.g. `python3 active_detectors.py -t <TOKEN> -r <REALM> -d <DAYS>`
18 |
19 | ### Full CLI options
20 |
21 | ``` bash
22 | $ python3 active_detectors.py -h
23 |
24 | usage: active_detectors.py [-h] -t TOKEN -r REALM -d DAYS
25 |
26 | Splunk O11y Cloud - Active Detectors
27 |
28 | options:
29 | -h, --help show this help message and exit
30 | -t TOKEN, --token TOKEN
31 | Access Token
32 | -r REALM, --realm REALM
33 | us0, us1, us2, eu0 or jp0
34 | -d DAYS, --days DAYS No. of days ago
35 | ```
36 |
--------------------------------------------------------------------------------
/detectors/metricshub/README.md:
--------------------------------------------------------------------------------
1 | # MetricsHub Detectors
2 |
3 | This folder contains detectors that may be used to trigger events based on the metrics collected by MetricsHub.
4 |
5 | Please note that you may want or need different thresholds than those provided here.
6 | You may also want to create your own detectors.
7 |
8 | ## Importing Detectors
9 | Two options exist:
10 | 1. Edit and send the Detector JSON [via API](https://dev.splunk.com/observability/reference/api/detectors/latest#endpoint-create-single-detector)
11 | ```
12 | curl -X POST "https://api.{REALM}.signalfx.com/v2/detector" \
13 | -H "Content-Type: application/json" \
14 | -H "X-SF-TOKEN: {TOKEN}" \
15 | -d @"/path/to/detector/detector_name.json"
16 | ```
17 | 2. Copy the detector's JSON and paste it into your own Detector [via the UI](https://docs.splunk.com/Observability/alerts-detectors-notifications/create-detectors-for-alerts.html#nav-Create-detectors-to-trigger-alerts)
18 |
19 | # Support
20 |
21 | Subscribers to **MetricsHub** gain access to the **MetricsHub Support Desk**, which provides:
22 |
23 | - Technical support
24 | - Patches and updates
25 | - Knowledge base access
26 |
27 | For more information, visit the [MetricsHub](https://metricshub.com/) website.
28 |
29 | Splunk does not provide support for these detectors and users should contact Sentry Software's support with any support requests.
30 |
31 |
--------------------------------------------------------------------------------
/integration-examples/usage-reports-scripts/README.md:
--------------------------------------------------------------------------------
1 | # Usage Reporting Scripts
2 |
3 | ## Custom Metric Reports
4 | This script parses unstructured Custom Metric Reports and outputs an easy-to-read table of the values, sorted from highest to lowest.
5 |
6 | Special thanks to Robert Castley!
7 |
8 | ### Using The Custom Metric Report Script
9 | **Pre-Requisite:** Download your Custom Metric Report from Splunk Observability like so:
10 | 
11 |
12 | 1. Pull down this script
13 | 2. Install any required packages with `pip install -r requirements.txt`
14 | 3. Run the script on your downloaded Custom Metrics Report.
15 | - E.G. `python custom-metric-report-parser.py -r ~/Downloads/2022-07-22_mts-by-metric.txt `
16 |
17 |
18 | #### Full CLI Options
19 | ```
20 | # python custom-metric-report-parser.py -h
21 |
22 | usage: custom-metric-report-parser.py [-h] [-c CATEGORY] [-l LIMIT] -r REPORT
23 |
24 | Splunk Observability Cloud - Custom Metrics Report Parser
25 |
26 | optional arguments:
27 | -h, --help show this help message and exit
28 | -c CATEGORY, --category CATEGORY
29 | 1 (Host), 2 (Container), 3 (Custom), 4 (Hi-Res), 5 (Bundled)
30 | -l LIMIT, --limit LIMIT
31 | Limit no. of metrics displayed in table
32 | -r REPORT, --report REPORT
33 | Custom Metric Report
34 | ```
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/gcp/storage.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "gcp_cloud_storage_errors" {
2 | name = "${var.o11y_prefix} GCP Cloud Storage Requests High Error Rate"
3 | description = "Alerts when there is a high 4xx or 5xx error rate"
4 | program_text = <<-EOF
5 | A = data('api/request_count', filter=filter('response_code', '4*'), rollup='latest').sum(by=['bucket_name']).publish(label='4xx error', enable=False)
6 | B = data('api/request_count', rollup='latest').sum(by=['bucket_name']).publish(label='total', enable=False)
7 | detect(when(((A/B)*100) >= 10, lasting='5m')).publish('GCP Cloud Storage 10% of requests were 4xx for 5m')
8 | C = data('api/request_count', filter=filter('response_code', '5*'), rollup='latest').sum(by=['bucket_name']).publish(label='5xx error', enable=False)
9 | D = data('api/request_count', rollup='latest').sum(by=['bucket_name']).publish(label='total', enable=False)
10 | detect(when(((C/D)*100) >= 10, lasting='5m')).publish('GCP Cloud Storage 10% of requests were 5xx for 5m')
11 | EOF
12 | rule {
13 | detect_label = "GCP Cloud Storage 10% of requests were 4xx for 5m"
14 | severity = "Major"
15 | parameterized_body = var.message_body
16 | }
17 | rule {
18 | detect_label = "GCP Cloud Storage 10% of requests were 5xx for 5m"
19 | severity = "Major"
20 | parameterized_body = var.message_body
21 | }
22 | }
--------------------------------------------------------------------------------
/integration-examples/azure-devops/README.md:
--------------------------------------------------------------------------------
1 | # Azure DevOps integrations for Splunk Observability
2 |
3 | 1. Azure DevOps - [Splunk Observability Cloud Events](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-events)
4 | - Send Azure DevOps Deployment events and alerts to Splunk Observability
5 | - Visualize these events overlaid on Splunk Observability Dashboards
6 | 2. Azure DevOps - [Splunk Observability Alert Gates](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-alert)
7 | - Gate Azure DevOps Deployments based on status of Alerts in Splunk Observability
8 | - Deploy your software more safely by checking that the coast is clear in Splunk Observability.
9 |
10 | ## For more details:
11 | - Splunk Blog Post: [Azure DevOps: Fun with Observability Events and Alerts!](https://www.splunk.com/en_us/blog/devops/azure-devops-fun-with-observability-events-and-alerts.html)
12 | - Describes use cases and benefits of the above integrations for Azure DevOps users.
13 | - Microsoft Marketplace Links:
14 | - [Splunk Observability Cloud Events](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-events)
15 | - [Splunk Observability Alert Gates](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-alert)
16 | - GitHub Open Source Repositories and detailed setup instructions:
17 | - [azure-devops-splunk-events](https://github.com/splunk/azure-devops-splunk-events)
18 | - [azure-devops-splunk-alert-gate](https://github.com/splunk/azure-devops-splunk-alert-gate)
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/main.tf:
--------------------------------------------------------------------------------
1 | provider "signalfx" {
2 | auth_token = var.api_token
3 | api_url = "https://api.${var.realm}.signalfx.com"
4 | }
5 |
6 | module "aws" {
7 | source = "./modules/aws"
8 | o11y_prefix = var.o11y_prefix
9 | }
10 |
11 | module "host" {
12 | source = "./modules/host"
13 | o11y_prefix = var.o11y_prefix
14 |
15 | }
16 |
17 | module "kafka" {
18 | source = "./modules/kafka"
19 | o11y_prefix = var.o11y_prefix
20 |
21 | }
22 |
23 | module "azure" {
24 | source = "./modules/azure"
25 | o11y_prefix = var.o11y_prefix
26 | }
27 |
28 | module "docker" {
29 | source = "./modules/docker"
30 | o11y_prefix = var.o11y_prefix
31 | }
32 |
33 | module "gcp" {
34 | source = "./modules/gcp"
35 | o11y_prefix = var.o11y_prefix
36 | }
37 |
38 | module "kubernetes" {
39 | source = "./modules/kubernetes"
40 | o11y_prefix = var.o11y_prefix
41 | }
42 |
43 | module "pivotal" {
44 | source = "./modules/pivotal"
45 | o11y_prefix = var.o11y_prefix
46 | }
47 |
48 | module "usage_dashboard" {
49 | source = "./modules/dashboards/usage"
50 | o11y_prefix = var.o11y_prefix
51 | }
52 |
53 | module "parent_child_dashboard" {
54 | source = "./modules/dashboards/parent"
55 | o11y_prefix = var.o11y_prefix
56 | }
57 |
58 | module "rum_and_synthetics_dashboard" {
59 | source = "./modules/dashboards/rum_and_synthetics"
60 | o11y_prefix = var.o11y_prefix
61 | }
62 |
63 | module "executive-dashboards" {
64 | source = "./modules/dashboards/executive-dashboards"
65 | o11y_prefix = var.o11y_prefix
66 | }
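67 | 
68 | # Example variable values (hypothetical; normally supplied via a terraform.tfvars file,
69 | # e.g. by copying the provided terraform.tfvars.template):
70 | #
71 | #   api_token   = "<your Splunk Observability org access token>"
72 | #   realm       = "us1"
73 | #   o11y_prefix = "[Acme]"
74 | #
75 | # With those values in place, run `terraform init` followed by `terraform apply`.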
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/status-to-splunk-hec/README.md:
--------------------------------------------------------------------------------
1 | # Third-party Status Page API Check to Splunk HEC
2 | This example API test calls the OpenAI status endpoint and collects data on ongoing incidents and updates.
3 | This test creates and sends a log event containing that incident data to a Splunk HEC endpoint.
4 | The test and its configuration are included in this directory:
5 | - [`synthetics_status_to_splunk_hec_api_check.tf`](./synthetics_status_to_splunk_hec_api_check.tf)
6 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs)
7 |
8 | ## Synthetic API Test
9 | The synthetic API test will call the OpenAI status page and report any current and ongoing incidents to a Splunk HEC endpoint of your choice. This example mostly illustrates how to ingest arbitrary data into Splunk. The test also serves a second purpose: it provides external monitoring of the HEC endpoint in question in addition to ingesting useful incident data.
10 |
11 |
12 | ### Required Splunk Synthetic Global Variables
13 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test.
14 | - `splunk_hec_url`: The URL of your HEC raw ingest endpoint (E.G. `https://hec-inputs-for-my-service.mysplunkinstance.com:443/services/collector/raw`)
15 | - **Terraform apply will fail if this global variable does not exist in your environment!**
16 | - `hec_token`: A provisioned HEC token for basic auth (E.G. `Splunk 123412-3123-1234-abcd-1234123412abc`)
17 |
18 |
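19 | For reference, what the test sends is roughly equivalent to posting raw event data to the HEC endpoint yourself. A minimal sketch reusing the example URL and token formats above (the actual body built by the test will differ):
20 | 
21 | ```
22 | curl "https://hec-inputs-for-my-service.mysplunkinstance.com:443/services/collector/raw" \
23 |   -H "Authorization: Splunk 123412-3123-1234-abcd-1234123412abc" \
24 |   -d '{"incident": "example incident pulled from the status page", "status": "investigating"}'
25 | ```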
--------------------------------------------------------------------------------
/integration-examples/system-scanner/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import contextlib
3 | from datetime import datetime
4 | import os
5 |
6 |
7 | class ContextLogger:
8 | def __init__(self, logger_name: str, log_file: str = "system_scanner.log"):
9 | self.logger = logging.getLogger(logger_name)
10 |
11 | # Create logs directory if it doesn't exist
12 | log_dir = "logs"
13 | if not os.path.exists(log_dir):
14 | os.makedirs(log_dir)
15 |
16 | log_path = os.path.join(log_dir, log_file)
17 |
18 | # Configure logging
19 | formatter = logging.Formatter(
20 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
21 | )
22 | file_handler = logging.FileHandler(log_path)
23 | file_handler.setFormatter(formatter)
24 |
25 | # Add handlers if they don't exist
26 | if not self.logger.handlers:
27 | self.logger.addHandler(file_handler)
28 | self.logger.setLevel(logging.INFO)
29 |
30 | self.start_time = None
31 |
32 | @contextlib.contextmanager
33 | def operation_context(self, operation: str):
34 | self.start_time = datetime.now()
35 | self.logger.info(f"Starting operation: {operation}")
36 | try:
37 | yield
38 | except Exception as e:
39 | self.logger.error(f"Error during {operation}: {e}")
40 | raise
41 | finally:
42 | duration = datetime.now() - self.start_time
43 | self.logger.info(
44 | f"{operation} completed in {duration.total_seconds():.2f}s"
45 | )
46 |
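47 | # Example usage (hypothetical caller):
48 | #
49 | #   logger = ContextLogger("os_info")
50 | #   with logger.operation_context("collect OS information"):
51 | #       ...  # work done here; start, errors and duration are written to logs/system_scanner.log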
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/Dockerfile:
--------------------------------------------------------------------------------
1 | #See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.
2 |
3 | FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base
4 | USER app
5 | WORKDIR /app
6 | EXPOSE 8080
7 |
8 | FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
9 | ARG BUILD_CONFIGURATION=Release
10 | WORKDIR /src
11 | COPY ["MultiStageDocker/MultiStageDocker.csproj", "MultiStageDocker/"]
12 | RUN dotnet restore "./MultiStageDocker/./MultiStageDocker.csproj"
13 | WORKDIR "/src/MultiStageDocker"
14 | COPY . .
15 | RUN dotnet build "./MultiStageDocker.csproj" -c $BUILD_CONFIGURATION -o /app/build
16 |
17 | # Add dependencies for splunk-otel-dotnet-install.sh
18 | RUN apt-get update && \
19 | apt-get install -y unzip
20 |
21 | # Download Splunk OTel .NET installer
22 | RUN curl -sSfL https://github.com/signalfx/splunk-otel-dotnet/releases/latest/download/splunk-otel-dotnet-install.sh -O
23 |
24 | # Install the distribution
25 | RUN sh ./splunk-otel-dotnet-install.sh
26 |
27 | FROM build AS publish
28 | ARG BUILD_CONFIGURATION=Release
29 | RUN dotnet publish "./MultiStageDocker.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
30 |
31 | FROM base AS final
32 |
33 | # Copy instrumentation file tree
34 | WORKDIR "/home/app/.splunk-otel-dotnet"
35 | COPY --from=build /root/.splunk-otel-dotnet/ .
36 |
37 | WORKDIR /app
38 | COPY --from=publish /app/publish .
39 | COPY MultiStageDocker/entrypoint.sh .
40 |
41 | ENTRYPOINT ["sh", "entrypoint.sh"]
42 | CMD ["dotnet", "MultiStageDocker.dll"]
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/status-page-to-metrics-api/README.md:
--------------------------------------------------------------------------------
1 | # Third-party Status Page API Check to Metric
2 | This example API test shows how to call multiple APIs, collect data, turn that data into a usable JSON payload, and send it off to another API.
3 | This test creates metrics using a Splunk Synthetics API test.
4 | The test and its configuration are included in this directory:
5 | - [`synthetics_thirdparty_status_api_check.tf`](./synthetics_thirdparty_status_api_check.tf)
6 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs)
7 |
8 | For a detailed description of this test and how it functions check out the [Splunk Lantern Article: Constructing an API test JSON payload](https://lantern.splunk.com/Observability/Product_Tips/Synthetic_Monitoring/Constructing_an_API_test_JSON_payload_for_alerting_on_external_dependencies)
9 |
10 | ## Synthetic API Test
11 | The synthetic API test will call the CloudFlare and GitHub status pages and report a metric with a value of 1 (status is impacted) or 0 (status is normal) for each:
12 | - `cloudflare.status`
13 | - `github.status`
14 |
15 | These metrics include dimensions for description of any impact to status and an indicator (none, minor, major, or critical).
16 | 
17 |
18 | ### Required Splunk Synthetic Global Variables
19 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test.
20 | - `org_ingest_token`: A provisioned INGEST token
21 | 
22 |
23 |
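24 | The payload the test constructs is sent to the Splunk Observability ingest API using `org_ingest_token`. As a rough sketch of an equivalent manual call (hypothetical realm and dimension values; the exact payload built by the test may differ):
25 | 
26 | ```
27 | curl -X POST "https://ingest.us1.signalfx.com/v2/datapoint" \
28 |   -H "Content-Type: application/json" \
29 |   -H "X-SF-Token: <org_ingest_token>" \
30 |   -d '{"gauge": [{"metric": "github.status", "value": 0, "dimensions": {"indicator": "none"}}]}'
31 | ```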
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/gitproviderreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: gitprovider
2 |
3 | sem_conv_version: 1.9.0
4 |
5 | status:
6 | class: receiver
7 | stability:
8 | development: [metrics]
9 | distributions: [liatrio]
10 | codeowners:
11 | active: [adrielp, astencel-sumo]
12 |
13 | # this might need to be unique per sub receiver implementation
14 | resource_attributes:
15 | organization.name:
16 | enabled: true
17 | description: Git Organization or Project Name
18 | type: string
19 | git.vendor.name:
20 | enabled: true
21 | # github, gitlab, bitbucket, gittea
22 | description: The name of the Git vendor/provider (ie. GitHub / GitLab)
23 | type: string
24 |
25 | ## Attributes that will be uncommented when the rest of the metrics are added
26 | attributes:
27 | repository.name:
28 | description: The full name of the Git repository
29 | type: string
30 |
31 | metrics:
32 | git.repository.count:
33 | enabled: true
34 | description: Number of repositories in an organization
35 | unit: 1
36 | gauge:
37 | value_type: int
38 | attributes: []
39 | git.repository.branch.count:
40 | enabled: true
41 | description: Number of branches in the repository
42 | unit: 1
43 | gauge:
44 | value_type: int
45 | attributes: [repository.name]
46 | git.repository.contributor.count:
47 | enabled: false
48 | description: Total number of unique contributors to this repository
49 | unit: 1
50 | gauge:
51 | value_type: int
52 | attributes: [repository.name]
53 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/host/disk.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "disk_space_low" {
2 | name = "${var.o11y_prefix} Low Disk Space"
3 | description = "Alerts when a partition is filling up or total disk space will fill up within 24hrs"
4 |
5 | program_text = <<-EOF
6 | A = data('disk.utilization', filter=(not filter('plugin_instance', 'snap*'))).publish(label='Disk Utilization', enable=False)
7 | detect(when(A >= 80 and A < 90)).publish('Disk space has filled up to greater than 80% but less than 90%')
8 | detect(when(A >= 90)).publish('Disk space has filled up to or is greater than 90%')
9 | from signalfx.detectors.countdown import countdown
10 | B = data('disk.summary_utilization').publish(label='Disk Summary Utilization', enable=False)
11 | countdown.hours_left_stream_incr_detector(stream=B, maximum_capacity=100, lower_threshold=24, fire_lasting=lasting('15m', 1), clear_threshold=36, clear_lasting=lasting('15m', 1), use_double_ewma=False).publish('Disk space utilization is projected to reach 100% within 24 hours')
12 | EOF
13 |
14 | rule {
15 | detect_label = "Disk space has filled up to greater than 80% but less than 90%"
16 | severity = "Major"
17 | parameterized_body = var.message_body
18 | }
19 | rule {
20 | detect_label = "Disk space has filled up to or is greater than 90%"
21 | severity = "Critical"
22 | parameterized_body = var.message_body
23 |
24 | }
25 | rule {
26 | detect_label = "Disk space utilization is projected to reach 100% within 24 hours"
27 | severity = "Critical"
28 | parameterized_body = var.message_body
29 | }
30 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/httpcheckreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: httpcheck
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | development: [metrics]
7 | distributions: [contrib, sumo]
8 | warnings: []
9 | codeowners:
10 | active: [codeboten]
11 |
12 | resource_attributes:
13 |
14 | attributes:
15 | http.url:
16 | description: Full HTTP request URL.
17 | type: string
18 | http.status_code:
19 | description: HTTP response status code
20 | type: int
21 | http.method:
22 | description: HTTP request method
23 | type: string
24 | http.status_class:
25 | description: HTTP response status class
26 | type: string
27 | error.message:
28 | description: Error message recorded during check
29 | type: string
30 |
31 | metrics:
32 | httpcheck.status:
33 | description: 1 if the check resulted in status_code matching the status_class, otherwise 0.
34 | enabled: true
35 | sum:
36 | value_type: int
37 | aggregation_temporality: cumulative
38 | monotonic: false
39 | unit: 1
40 | attributes: [http.url, http.status_code, http.method, http.status_class]
41 | httpcheck.duration:
42 | description: Measures the duration of the HTTP check.
43 | enabled: true
44 | gauge:
45 | value_type: int
46 | unit: ms
47 | attributes: [http.url]
48 | httpcheck.error:
49 | description: Records errors occurring during HTTP check.
50 | enabled: true
51 | sum:
52 | value_type: int
53 | aggregation_temporality: cumulative
54 | monotonic: false
55 | unit: "{error}"
56 | attributes: [http.url, error.message]
57 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/integration/test_api_gateway.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import boto3
4 | import pytest
5 | import requests
6 |
7 | """
8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test.
9 | """
10 |
11 |
12 | class TestApiGateway:
13 |
14 | @pytest.fixture()
15 | def api_gateway_url(self):
16 | """ Get the API Gateway URL from Cloudformation Stack outputs """
17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME")
18 |
19 | if stack_name is None:
20 | raise ValueError('Please set the AWS_SAM_STACK_NAME environment variable to the name of your stack')
21 |
22 | client = boto3.client("cloudformation")
23 |
24 | try:
25 | response = client.describe_stacks(StackName=stack_name)
26 | except Exception as e:
27 | raise Exception(
28 | f"Cannot find stack {stack_name} \n" f'Please make sure a stack with the name "{stack_name}" exists'
29 | ) from e
30 |
31 | stacks = response["Stacks"]
32 | stack_outputs = stacks[0]["Outputs"]
33 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"]
34 |
35 | if not api_outputs:
36 | raise KeyError(f"HelloWorldAPI not found in stack {stack_name}")
37 |
38 | return api_outputs[0]["OutputValue"] # Extract url from stack outputs
39 |
40 | def test_api_gateway(self, api_gateway_url):
41 | """ Call the API Gateway endpoint and check the response """
42 | response = requests.get(api_gateway_url)
43 |
44 | assert response.status_code == 200
45 | assert response.json() == {"message": "hello world"}
46 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/nginxreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: nginx
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | beta: [metrics]
7 | distributions: [contrib, observiq, sumo]
8 | codeowners:
9 | active: [djaglowski]
10 |
11 | attributes:
12 | state:
13 | description: The state of a connection
14 | type: string
15 | enum:
16 | - active
17 | - reading
18 | - writing
19 | - waiting
20 |
21 | metrics:
22 | nginx.requests:
23 | enabled: true
24 | description: Total number of requests made to the server since it started
25 | unit: requests
26 | sum:
27 | value_type: int
28 | monotonic: true
29 | aggregation_temporality: cumulative
30 | attributes: []
31 | nginx.connections_accepted:
32 | enabled: true
33 | description: The total number of accepted client connections
34 | unit: connections
35 | sum:
36 | value_type: int
37 | monotonic: true
38 | aggregation_temporality: cumulative
39 | attributes: []
40 | nginx.connections_handled:
41 | enabled: true
42 | description: The total number of handled connections. Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit).
43 | unit: connections
44 | sum:
45 | value_type: int
46 | monotonic: true
47 | aggregation_temporality: cumulative
48 | attributes: []
49 | nginx.connections_current:
50 | enabled: true
51 | description: The current number of nginx connections by state
52 | unit: connections
53 | sum:
54 | value_type: int
55 | monotonic: false
56 | aggregation_temporality: cumulative
57 | attributes: [state]
58 |
--------------------------------------------------------------------------------
/integration-examples/get-rum-urls/README.md:
--------------------------------------------------------------------------------
1 | # Get RUM URLs
2 |
3 | This script will return all RUM URLs present in Splunk Observability Cloud.
4 | The script expects you to provide both the application and environment you want to get the URLs for. These settings are required.
5 | Use this as input to create or fine-tune URL grouping rules.
6 |
7 | ## Prerequisites
8 | This script expects both `curl` and `jq` to be installed.
9 | This script has only been tested on macOS.
10 |
11 | ## Environment Variables
12 | This script relies on environment variables.
13 | Set the following:
14 |
15 | ```
16 | export REALM=
17 | export APP=
18 | export ENVIRONMENT=
19 | export TOKEN=
20 | ```
21 |
22 | And run the script:
23 |
24 | ```
25 | $ ./get_rum_urls.sh
26 | REALM is set to: eu0
27 | TOKEN is set.
28 | APP is set.
29 | ENVIRONMENT is set.
30 | Script continues with REALM=eu0, APP=online-boutique-eu-store, ENVIRONMENT=online-boutique-eu (TOKEN value hidden).
31 | https://online-boutique-eu.splunko11y.com/
32 | https://online-boutique-eu.splunko11y.com/cart
33 | https://online-boutique-eu.splunko11y.com/cart/?>
34 | https://online-boutique-eu.splunko11y.com/cart/checkout
35 | https://online-boutique-eu.splunko11y.com/product/?>
36 | ```
37 | By default, the script shows some information before the urls are printed.
38 |
39 | If you just want the list of URLs, redirect stderr like this:
40 | ```
41 | $ ./get_rum_urls.sh 2> /dev/null
42 | https://online-boutique-eu.splunko11y.com/
43 | https://online-boutique-eu.splunko11y.com/cart
44 | https://online-boutique-eu.splunko11y.com/cart/?>
45 | https://online-boutique-eu.splunko11y.com/cart/checkout
46 | https://online-boutique-eu.splunko11y.com/product/?>
47 | ```
48 |
--------------------------------------------------------------------------------
/integration-examples/system-scanner/os_info.py:
--------------------------------------------------------------------------------
1 | """
2 | SystemScanner: OS Information Module
3 |
4 | This module provides functionality to retrieve detailed information
5 | about the operating system on which the script is running.
6 |
7 | It includes functions to get the system name, release version,
8 | and architecture.
9 | """
10 |
11 | import platform
12 | import sys
13 |
14 |
15 | def get_os_info():
16 | system = platform.system()
17 | release = platform.release()
18 | architecture = platform.machine()
19 | os_flavor = ""
20 |
21 | # Add OS flavor information focusing on the [0] output of platform
22 | if system == "Darwin": # macOS
23 | os_flavor = f"macOS {platform.mac_ver()[0]}"
24 | elif system == "Linux":
25 | try:
26 |             # Use platform.freedesktop_os_release() if available (Python 3.10+)
27 |             if sys.version_info >= (3, 10):
28 | os_info = platform.freedesktop_os_release()
29 | if os_info.get("PRETTY_NAME"):
30 | os_flavor = f"Linux {os_info['PRETTY_NAME']}"
31 | elif os_info.get("NAME"):
32 | os_flavor = f"Linux {os_info['NAME']}"
33 | else:
34 | os_flavor = "Linux"
35 | else:
36 |                 # Fallback to reading /etc/os-release (for Python < 3.10)
37 | with open("/etc/os-release") as f:
38 | for line in f:
39 | if line.startswith("PRETTY_NAME="):
40 | os_flavor = line.split("=")[1].strip().strip('"')
41 | break
42 |         except Exception:
43 | os_flavor = "Linux"
44 | elif system == "Windows":
45 | os_flavor = f"Windows {platform.win32_ver()[0]}"
46 |
47 | return system, release, architecture, os_flavor
48 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/filestatsreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: filestats
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | beta: [metrics]
7 | distributions: [contrib, sumo]
8 | codeowners:
9 | active: [atoulme]
10 |
11 |
12 | resource_attributes:
13 | file.name:
14 | description: The name of the file
15 | enabled: true
16 | type: string
17 | file.path:
18 | description: The absolute path of the file
19 | enabled: false
20 | type: string
21 |
22 | attributes:
23 | file.permissions:
24 | description: the permissions associated with the file, using an octal format.
25 | type: string
26 |
27 | metrics:
28 | file.mtime:
29 | description: Elapsed time since the last modification of the file or folder, in seconds since Epoch.
30 | enabled: true
31 | sum:
32 | monotonic: false
33 | aggregation_temporality: cumulative
34 | value_type: int
35 | unit: "s"
36 | file.ctime:
37 | description: Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file.
38 | enabled: false
39 | sum:
40 | monotonic: false
41 | aggregation_temporality: cumulative
42 | value_type: int
43 | unit: "s"
44 | attributes:
45 | - file.permissions
46 | file.atime:
47 | description: Elapsed time since last access of the file or folder, in seconds since Epoch.
48 | enabled: false
49 | sum:
50 | monotonic: false
51 | aggregation_temporality: cumulative
52 | value_type: int
53 | unit: "s"
54 | file.size:
55 | description: The size of the file or folder, in bytes.
56 | enabled: true
57 | gauge:
58 | value_type: int
59 | unit: "b"
60 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/httpcheckreceiver_metadata.yaml.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "signalfx_dashboard" "httpcheckdashboard" {
3 | name = "httpcheck"
4 | dashboard_group = signalfx_dashboard_group.httpcheckdashboardgroup0.id
5 | time_range = "-1h"
6 |
7 | grid {
8 | chart_ids = [
9 | signalfx_time_chart.httpcheck_status.id, signalfx_time_chart.httpcheck_duration.id, signalfx_time_chart.httpcheck_error.id
10 | ]
11 | width = 4
12 | height = 1
13 | }
14 | }
15 |
16 | resource "signalfx_dashboard_group" "httpcheckdashboardgroup0" {
17 | name = "httpcheck generated OTel dashboard group"
18 | description = "httpcheck generated OTel dashboard group"
19 | }
20 |
21 | resource "signalfx_time_chart" "httpcheck_status" {
22 | name = "1 if the check resulted in status_code matching the status_class, otherwise 0."
23 |
24 | program_text = <<-EOF
25 | data("httpcheck.status").publish(label="1 if the check resulted in status_code matching the status_class, otherwise 0.")
26 | EOF
27 |
28 | time_range = 14400
29 |
30 | plot_type = "LineChart"
31 | show_data_markers = true
32 | }
33 |
34 |
35 | resource "signalfx_time_chart" "httpcheck_duration" {
36 | name = "Measures the duration of the HTTP check."
37 |
38 | program_text = <<-EOF
39 | data("httpcheck.duration").publish(label="Measures the duration of the HTTP check.")
40 | EOF
41 |
42 | time_range = 14400
43 |
44 | plot_type = "LineChart"
45 | show_data_markers = true
46 | }
47 |
48 |
49 | resource "signalfx_time_chart" "httpcheck_error" {
50 | name = "Records errors occurring during HTTP check."
51 |
52 | program_text = <<-EOF
53 | data("httpcheck.error").publish(label="Records errors occurring during HTTP check.")
54 | EOF
55 |
56 | time_range = 14400
57 |
58 | plot_type = "LineChart"
59 | show_data_markers = true
60 | }
61 |
--------------------------------------------------------------------------------
/integration-examples/README.md:
--------------------------------------------------------------------------------
1 | # General Recommendations for Integration and API Example Content for Reuse
2 |
3 | Integration examples can be anything from example code for interacting with APIs, to configurations for OpenTelemetry, to code for getting custom metrics into Observability, and more.
4 |
5 | 1. **Noun-centric Organization** : Integrations and API interactions are generally composed of common software components/platforms.
6 |
7 | Please organize folders and submissions to group similar software and platforms together.
8 |
9 | Integrations should be oriented towards specific software/platforms with a focus on reusable patterns wherever possible for easily adjusting to a specific user's needs.
10 |
11 | 2. **Integrations** : Loosely, integrations cover collections of code, scripts, documentation, etc. which will aid others in setting up functionality with Splunk Observability. This could include Getting Data In (GDI), tips and reusable SignalFlow patterns, webhook setup information for a vendor, serverless code for performing checks, etc.
12 |
13 | Include a `README.md` within your submission directory documenting and detailing the process of using your submission. If metrics are produced, include a list of those metrics and any associated dimensions in your `README.md`.
14 |
15 | 3. **OpenTelemetry Configurations** : Integrations using OpenTelemetry should include OpenTelemetry config files along with a `README.md` that briefly describes any novel pipelines and the receivers, processors, and exporters used.
16 |
17 | 4. **API Scripts and Interactions** : API Scripts and Interactions should include a `README.md` file that explains what the script does. If it emits metrics and dimensions for those metrics they should be noted in the `README.md` file.
18 |
19 | Double check and verify that you have not accidentally added your API tokens or secrets with your code. Wherever possible use environment variables to pass these secrets to the script.
20 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/sshcheckreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: sshcheck
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | alpha: [metrics]
7 | distributions: [contrib, sumo]
8 | codeowners:
9 | active: [nslaughter, codeboten]
10 |
11 | resource_attributes:
12 | ssh.endpoint:
13 | description: Full SSH endpoint
14 | type: string
15 |
16 | attributes:
17 | error.message:
18 | description: Error message recorded during check
19 | type: string
20 |
21 | metrics:
22 | sshcheck.status:
23 | description: 1 if the SSH client successfully connected, otherwise 0.
24 | enabled: true
25 | sum:
26 | value_type: int
27 | aggregation_temporality: cumulative
28 | monotonic: false
29 | unit: 1
30 | sshcheck.duration:
31 | description: Measures the duration of SSH connection.
32 | enabled: true
33 | gauge:
34 | value_type: int
35 | unit: ms
36 | sshcheck.error:
37 | description: Records errors occurring during SSH check.
38 | enabled: true
39 | sum:
40 | value_type: int
41 | aggregation_temporality: cumulative
42 | monotonic: false
43 | unit: "{error}"
44 | attributes: [error.message]
45 | sshcheck.sftp_status:
46 | description: 1 if the SFTP server replied to request, otherwise 0.
47 | enabled: false
48 | sum:
49 | value_type: int
50 | aggregation_temporality: cumulative
51 | monotonic: false
52 | unit: 1
53 | sshcheck.sftp_duration:
54 | description: Measures SFTP request duration.
55 | enabled: false
56 | gauge:
57 | value_type: int
58 | unit: ms
59 | sshcheck.sftp_error:
60 | description: Records errors occurring during SFTP check.
61 | enabled: false
62 | sum:
63 | value_type: int
64 | aggregation_temporality: cumulative
65 | monotonic: false
66 | unit: "{error}"
67 | attributes: [error.message]
68 |
--------------------------------------------------------------------------------
/detectors/snowflakedb/README.md:
--------------------------------------------------------------------------------
1 | # Snowflake Detectors
2 |
3 | This folder contains detectors that may be useful when working with Snowflake.
4 |
5 | Please note that you may want or need different thresholds than those provided here.
6 |
7 | ## Snowflake Metrics Configuration
8 | Please see [configuration examples](../../dashboards-and-dashboard-groups/snowflakedb/Configuration/) for help getting metrics from Snowflake into Splunk Observability.
9 | ## Importing Detectors
10 | Two options exist:
11 | 1. Edit and send the Detector JSON [via API](https://dev.splunk.com/observability/reference/api/detectors/latest#endpoint-create-single-detector)
12 | ```
13 | curl -X POST "https://api.{REALM}.signalfx.com/v2/detector" \
14 | -H "Content-Type: application/json" \
15 | -H "X-SF-TOKEN: {TOKEN}" \
16 | -d @"/path/to/detector/detector_name_is_amazing.json"
17 | ```
18 | 2. Copy the SignalFlow out of the detector JSON and paste into your own Detector [via the UI](https://docs.splunk.com/Observability/alerts-detectors-notifications/create-detectors-for-alerts.html#nav-Create-detectors-to-trigger-alerts)
19 |
20 | ## Available Detectors
21 | Provided alerts follow the 4 Golden Signals of Latency, Errors, Traffic, and Saturation (L.E.T.S.) along with Billing.
22 | ### Latency:
23 | - Queries in Small / X-Small Warehouses longer than 5 minutes (I.E. 300000 ms)
24 | - Queries taking more than 15 minutes (900 seconds)
25 |
26 | ### Errors
27 | - Database Errors by Warehouse (Arbitrary threshold of 100 errors)
28 | - Database Error Rate by Warehouse (Arbitrary threshold of 15%)
29 | - Login Failures by User (Threshold of 15 per hour)
30 |
31 | ### Traffic
32 | - Blocked Queries by Warehouse
33 | - No Queries in last 3 hours
34 |
35 | ### Saturation
36 | - Overloaded Queries by Warehouse
37 | - Queries Queued longer than 5 minutes (I.E. 300000 ms)
38 |
39 | ### Billing
40 | - Credits used by Warehouse (Anomaly detection)
41 | - % of spend for Cloud Service greater than 15% by Warehouse
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/gitproviderreceiver_metadata.yaml.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "signalfx_dashboard" "gitproviderdashboard" {
3 | name = "gitprovider"
4 | dashboard_group = signalfx_dashboard_group.gitproviderdashboardgroup0.id
5 | time_range = "-1h"
6 |
7 | grid {
8 | chart_ids = [
9 | signalfx_time_chart.git_repository_count.id, signalfx_time_chart.git_repository_branch_count.id, signalfx_time_chart.git_repository_contributor_count.id
10 | ]
11 | width = 4
12 | height = 1
13 | }
14 | }
15 |
16 | resource "signalfx_dashboard_group" "gitproviderdashboardgroup0" {
17 | name = "gitprovider generated OTel dashboard group"
18 | description = "gitprovider generated OTel dashboard group"
19 | }
20 |
21 | resource "signalfx_time_chart" "git_repository_count" {
22 | name = "Number of repositories in an organization"
23 |
24 | program_text = <<-EOF
25 | data("git.repository.count").publish(label="Number of repositories in an organization")
26 | EOF
27 |
28 | time_range = 14400
29 |
30 | plot_type = "LineChart"
31 | show_data_markers = true
32 | }
33 |
34 |
35 | resource "signalfx_time_chart" "git_repository_branch_count" {
36 | name = "Number of branches in the repository"
37 |
38 | program_text = <<-EOF
39 | data("git.repository.branch.count").publish(label="Number of branches in the repository")
40 | EOF
41 |
42 | time_range = 14400
43 |
44 | plot_type = "LineChart"
45 | show_data_markers = true
46 | }
47 |
48 |
49 | resource "signalfx_time_chart" "git_repository_contributor_count" {
50 | name = "Total number of unique contributors to this repository"
51 |
52 | program_text = <<-EOF
53 | data("git.repository.contributor.count").publish(label="Total number of unique contributors to this repository")
54 | EOF
55 |
56 | time_range = 14400
57 |
58 | plot_type = "LineChart"
59 | show_data_markers = true
60 | }
61 |
--------------------------------------------------------------------------------
/detectors/README.md:
--------------------------------------------------------------------------------
1 | # General Recommendations for Detector Content for Reuse
2 |
3 | There are many use cases for detectors, and any detector which provides
4 | insight may be useful for other users. There are some patterns we have found which
5 | work well in Splunk Observability and encourage content reuse.
6 |
7 | 1. **Noun-centric** : While each environment is different, frequently their
8 | applications are composed of common software components/platforms. Detectors
9 | which are oriented to understanding those common software components tend to be
10 | more reusable than detectors which are related to processes (which tend to vary
11 | from environment to environment).
12 |
13 | 2. **Instances and Aggregates** : Users typically need to see the "forest" and
14 | the "trees". The way we typically implement this is to define instance and
15 | aggregate views. In addition to being generally useful to users, these
16 | views are used when promoting a detector set to Navigator Views.
17 |
18 | * **Aggregate views** focus on enabling users to identify which particular
19 | instances are outliers. Frequently the information presented in these
20 | detectors is aggregated to the instance as there is instance-level detail to
21 | enable users to further isolate the problem. An example would be showing the
22 | maximum utilization of all filesystems for a host in the aggregate view.
23 | Knowing that a host has a filesystem at 97% utilization would be enough
24 | information for a user to identify that host as an outlier and then further
25 | investigate which specific filesystem was approaching its limit.
26 |
27 | * **Instance views** focus on enabling users to identify the specific
28 | problem related to the instance. So for instance breaking out the filesystem
29 | utilization metrics by filesystem so that the user knows exactly what
30 | resource is approaching exhaustion (see the SignalFlow sketch at the end of this README).
31 |
32 | 3. **External KPIs** :
33 |
34 | 4. **Internal KPIs** :
35 |
36 | 5. **Detector Variables** :
37 |
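38 | As a minimal SignalFlow sketch of the "Instances and Aggregates" pattern above (assuming the collectd `disk.utilization` metric used elsewhere in this repo; the host name is hypothetical):
39 | 
40 | ```
41 | # Aggregate view: one value per host - the fullest filesystem on each host
42 | A = data('disk.utilization').max(by=['host']).publish(label='Max disk utilization per host')
43 | 
44 | # Instance view: per-filesystem detail for a single host, to isolate the exact resource
45 | B = data('disk.utilization', filter=filter('host', 'my-host')).publish(label='Disk utilization by filesystem')
46 | ```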
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/snowflakedb/Configuration/README.md:
--------------------------------------------------------------------------------
1 | # Configuration Examples
2 | **NOTE:** These are only examples. Your configuration will likely be slightly different.
3 |
4 | These examples assume you are using the [`splunk-otel-collector`](https://github.com/signalfx/splunk-otel-collector), but they will also work with any other OTel configuration.
5 |
6 | 1. [`agent_config.yaml`](./agent_config.yaml) Contains receiver, exporter, pipeline configuration
7 | The receiver entries for Snowflake can be found under `smartagent/sql`
8 | 1. **NOTE:** You MUST add your Snowflake `account` to this config, where `account` is taken from this format: `.snowflakecomputing.com`
9 | 2. If you plan to use a custom `role` rather than `ACCOUNTADMIN` you will need to add your `role` to this config
10 | - **NOTE:** Resolution of `3600` seconds (1 hour) is recommended due to the latency between actions happening and then showing up in the `SNOWFLAKE/ACCOUNT_USAGE` db view. It is possible to collect at a higher interval but is not recommended.
11 | 2. [`splunk-otel-collector.conf`](./splunk-otel-collector.conf) Contains referenced variables like snowflake username / password, and Splunk Observability token, etc
12 | 1. Add your Splunk Observability token in `SPLUNK_ACCESS_TOKEN`
13 | 2. Add your Snowflake User to `SNOWFLAKE_USER` (the user MUST have a role that allows access to the `SNOWFLAKE/ACCOUNT_USAGE` db view)
14 | 3. Add the password for your Snowflake user account to `SNOWFLAKE_PASS`
15 | 3. [`snowflake-metrics.yaml`](./snowflake-metrics.yaml) Contains SQL queries and mappings for our Splunk Observability metrics and dimensions
16 | - [`snowflake-other-metrics.yaml`](./snowflake-other-metrics.yaml) file contains SQL queries for:
17 | - detailed and *high cardinality* DB query metrics including the `query_id` dimension which is a GUID
18 | - When using these metrics replace the `DB Metrics` in `snowflake-metrics.yaml`
19 | - Billing usage in USD
--------------------------------------------------------------------------------
/integration-examples/usage-reports-scripts/custom-metric-report-parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import csv
5 | from rich.console import Console
6 | from rich.table import Table
7 |
8 | parser = argparse.ArgumentParser(
9 | description="Splunk Observability Cloud - Custom Metrics Report Parser"
10 | )
11 | parser.add_argument(
12 | "-c",
13 | "--category",
14 | help="1 (Host), 2 (Container), 3 (Custom), 4 (Hi-Res), 5 (Bundled)",
15 | default="3",
16 | )
17 | parser.add_argument(
18 | "-l", "--limit", help="Limit no. of metrics displayed in table", default=10000
19 | )
20 | parser.add_argument("-r", "--report", help="Custom Metric Report", required=True)
21 | args = vars(parser.parse_args())
22 |
23 | if args["category"] == "1":
24 | type = "No. Host MTS"
25 | elif args["category"] == "2":
26 | type = "No. Container MTS"
27 | elif args["category"] == "3":
28 | type = "No. Custom MTS"
29 | elif args["category"] == "4":
30 | type = "No. High Resolution MTS"
31 | elif args["category"] == "5":
32 | type = "No. Bundled MTS"
33 |
34 | console = Console()
35 |
36 | metrics_list = {}
37 |
38 | table = Table(
39 | title="Splunk - Custom Metrics Report Parser",
40 | style="bright_magenta",
41 | title_style="bold italic",
42 | )
43 |
44 | table.add_column("Metric Name", justify="left", style="cyan", no_wrap=True, width=80)
45 | table.add_column("MTS", justify="right", style="green")
46 |
47 |
48 | with open(args["report"]) as f:
49 | reader = csv.DictReader(f, delimiter="\t")
50 | for row in reader:
51 | if int(row[type]) != 0:
52 | metrics_list[row["Metric Name"]] = int(row[type])
53 |
54 | total = 0
55 |
56 | res = sorted(metrics_list.items(), key=lambda v: v[1], reverse=True)
57 | for r in res[: int(args["limit"])]:
58 | mts = "{:,}".format(r[1])
59 | table.add_row(r[0], mts)
60 | total = total + int(r[1])
61 |
62 | total = "{:,}".format(total)
63 | table.add_row("Total MTS", total, style="bold white", end_section=True)
64 | console.print(table)
65 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/host/mem.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "mem_historical_norm" {
2 | name = "${var.o11y_prefix} Mem utilization % greater than historical norm"
3 | description = "Alerts when Mem usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_recent import against_recent
6 | A = data('memory.utilization').publish(label='A', enable=True)
7 | against_recent.detector_mean_std(stream=A, current_window='30m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('Memory utilization is significantly greater than normal, and increasing')
8 | EOF
9 | rule {
10 | detect_label = "Memory utilization is significantly greater than normal, and increasing"
11 | severity = "Warning"
12 | parameterized_body = var.message_body
13 | }
14 | }
15 |
16 | resource "signalfx_detector" "mem_historical_cyclical_norm" {
17 | name = "${var.o11y_prefix} Memory utilization % greater than 3.5 std dev compared to the same time window over the last 3 days"
18 | description = "Alerts when Memory usage for this host for the last 30 minutes was significantly higher than normal, as compared to the same time window over the previous 3 days"
19 | program_text = <<-EOF
20 | from signalfx.detectors.against_periods import against_periods
21 | A = data('memory.utilization').publish(label='A', enable=True)
22 | against_periods.detector_mean_std(stream=A, window_to_compare='30m', space_between_windows='24h', num_windows=3, fire_num_stddev=3.5, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('Memory Utilization is greater than normal for the same time window compared to the last 3 days')
23 | EOF
24 | rule {
25 | detect_label = "Memory Utilization is greater than normal for the same time window compared to the last 3 days"
26 | severity = "Warning"
27 | parameterized_body = var.message_body
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/integration-examples/ci-webhook-serverless/generate-test-events.py:
--------------------------------------------------------------------------------
1 | import urllib3
2 | import random
3 | import threading
4 | import sys
5 | import time
6 | import json
7 |
8 | build_delay_min = 60
9 | build_delay_max = 300
10 |
11 | build_stepdelay_min = 10
12 | build_stepdelay_max = 50
13 |
14 | build_nsteps_min = 2
15 | build_nsteps_max = 5
16 |
17 | # failure rate 1 out of every n
18 | step_failure_rate_1_per = 20
19 |
20 | global gwebhookurl
21 | global genvironment
22 | global gpoolmgr
23 |
24 | gpoolmgr = None
25 | gwebhookurl = None
26 | genvironment = None
27 |
28 | def fake_build():
29 | bsteps = random.randint(build_nsteps_min, build_nsteps_max)
30 | buildId = 'build' + str(random.randint(0,65536))
31 | for step in range(bsteps):
32 | buildStep = 'step' + str(step)
33 | body = {}
34 | body['environment'] = genvironment
35 | body['buildId'] = buildId
36 | body['buildStep'] = buildStep
37 | body['status'] = 'success'
38 | if 1 == random.randint(1, step_failure_rate_1_per):
39 | body['status'] = 'failed'
40 | if step == 0:
41 | body['eventType'] = 'start_build'
42 | elif step == (bsteps - 1):
43 | body['eventType'] = 'build_complete'
44 | else:
45 | body['eventType'] = 'build_step'
46 | bodyJson = json.dumps(body)
47 | print('sending data to url %s:' % (gwebhookurl))
48 | print(' %s' % (bodyJson))
49 | resp = gpoolmgr.request("POST", gwebhookurl, timeout=30,
50 | headers={'Content-Type': 'application/json'},
51 | body=bodyJson)
52 | print("resp = %s" % (resp.data.decode()))
53 | if body['status'] == 'failed':
54 | return
55 | time.sleep(random.randint(build_stepdelay_min,build_stepdelay_max))
56 |
57 |
58 | if __name__ == '__main__':
59 | if len(sys.argv) != 3:
60 | sys.exit('Usage: %s <webhook_url> <environment>' % sys.argv[0])
61 |
62 | gwebhookurl = sys.argv[1]
63 | genvironment = sys.argv[2]
64 | gpoolmgr = urllib3.PoolManager()
65 |
66 | while True:
67 | x = threading.Thread(target=fake_build)
68 | x.start()
69 | time.sleep(random.randint(build_delay_min,build_delay_max))
70 |
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/events/event.json:
--------------------------------------------------------------------------------
1 | {
2 | "body": "{\"message\": \"hello world\"}",
3 | "resource": "/hello",
4 | "path": "/hello",
5 | "httpMethod": "GET",
6 | "isBase64Encoded": false,
7 | "queryStringParameters": {
8 | "foo": "bar"
9 | },
10 | "pathParameters": {
11 | "proxy": "/path/to/resource"
12 | },
13 | "stageVariables": {
14 | "baz": "qux"
15 | },
16 | "headers": {
17 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
18 | "Accept-Encoding": "gzip, deflate, sdch",
19 | "Accept-Language": "en-US,en;q=0.8",
20 | "Cache-Control": "max-age=0",
21 | "CloudFront-Forwarded-Proto": "https",
22 | "CloudFront-Is-Desktop-Viewer": "true",
23 | "CloudFront-Is-Mobile-Viewer": "false",
24 | "CloudFront-Is-SmartTV-Viewer": "false",
25 | "CloudFront-Is-Tablet-Viewer": "false",
26 | "CloudFront-Viewer-Country": "US",
27 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com",
28 | "Upgrade-Insecure-Requests": "1",
29 | "User-Agent": "Custom User Agent String",
30 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)",
31 | "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==",
32 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2",
33 | "X-Forwarded-Port": "443",
34 | "X-Forwarded-Proto": "https"
35 | },
36 | "requestContext": {
37 | "accountId": "123456789012",
38 | "resourceId": "123456",
39 | "stage": "prod",
40 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef",
41 | "requestTime": "09/Apr/2015:12:34:56 +0000",
42 | "requestTimeEpoch": 1428582896000,
43 | "identity": {
44 | "cognitoIdentityPoolId": null,
45 | "accountId": null,
46 | "cognitoIdentityId": null,
47 | "caller": null,
48 | "accessKey": null,
49 | "sourceIp": "127.0.0.1",
50 | "cognitoAuthenticationType": null,
51 | "cognitoAuthenticationProvider": null,
52 | "userArn": null,
53 | "userAgent": "Custom User Agent String",
54 | "user": null
55 | },
56 | "path": "/prod/hello",
57 | "resourcePath": "/hello",
58 | "httpMethod": "POST",
59 | "apiId": "1234567890",
60 | "protocol": "HTTP/1.1"
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | Contributions are encouraged and greatly appreciated! Every
4 | little bit helps, and credit will always be given.
5 |
6 | You can contribute in many ways:
7 |
8 | ## Types of Contributions
9 |
10 | ### Report Bugs/Issues
11 |
12 | If you are reporting a bug or an issue, please include:
13 |
14 | - Operating system name and version.
15 | - Any details about your local setup that might be helpful
16 | in troubleshooting (e.g. Python version if you're using a Python script, Terraform version if you're using a Terraform script).
17 | - Detailed steps to reproduce the bug.
18 |
19 | ### Fix Bugs
20 |
21 | Check the Issues for this repo on GitHub. Anything tagged with
22 | a "bug" ticket type is open to whoever wants to implement it.
23 |
24 | ### Implement Features
25 |
26 | If you have a great set of dashboards, detectors, API scripts for sending metrics, or any other content
27 | you believe will be of use to others, please contribute it!
28 |
29 | Or check the Issues for this repo on GitHub. Anything tagged with "enhancement"
30 | and "help wanted" is open to whoever wants to implement it.
31 |
32 | ### Write Documentation
33 |
34 | Submissions and their `README.md` files can always use more documentation. Clarity is always welcome: in the official docs, in script docstrings, in config comments, or anywhere else a little explanation would help.
35 |
36 | ### Submit Feedback
37 |
38 | If you are proposing a feature:
39 |
40 | - Explain in detail how it would work.
41 | - Keep the scope as narrow as possible, to make it easier
42 | to implement.
43 | - Remember that this is a volunteer-driven project, and that
44 | contributions are welcome :)
45 |
46 | ## Pull Request Guidelines
47 |
48 | Before you submit a pull request, check that it meets these guidelines:
49 |
50 | 1. The pull request should include a `README.md` for any new submission.
51 | 2. If the pull request adds functionality, the `README.md` docs for that component or submission should be updated.
52 | Put your new functionality into a function with a docstring, and add
53 | the feature to the list in README.md.
54 | 3. Terraform submissions should work with the most current version of the included Terraform provider (see the sketch after this list).
55 | 4. Python submissions should work with Python 3.
56 |
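57 | For guideline 3, a minimal sketch of pinning the provider in a `versions.tf` might look like this (the source address matches the SignalFx provider on the Terraform Registry; the version constraint is illustrative only, not a repository requirement):
58 |
59 | ```
60 | terraform {
61 |   required_providers {
62 |     signalfx = {
63 |       source  = "splunk-terraform/signalfx"
64 |       version = ">= 6.0.0"
65 |     }
66 |   }
67 | }
68 | ```
69 |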
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/aws/elb.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "httpcode_elb_5xx" {
2 | name = "${var.o11y_prefix} AWS/ELB has high 5XX response ratio"
3 | description = "Alerts when 10% of requests were 5XX for last 5m"
4 |
5 | program_text = <<-EOF
6 | A = data('HTTPCode_ELB_5XX', filter=(filter('namespace', 'AWS/ELB') and filter('stat', 'count') and filter('LoadBalancerName', '*'))).publish(label='HTTPCode_ELB_5XX', enable=False)
7 | B = data('RequestCount', filter=(filter('namespace', 'AWS/ELB') and filter('stat', 'count') and filter('LoadBalancerName', '*'))).publish(label='RequestCount', enable=False)
8 | detect(when(((A/B)*100) >= 10, lasting='5m')).publish('AWS/ELB 10% of requests were 5XX for last 5m')
9 | EOF
10 |
11 | rule {
12 | detect_label = "AWS/ELB 10% of requests were 5XX for last 5m"
13 | severity = "Critical"
14 | parameterized_body = var.message_body
15 | }
16 | }
17 |
18 | resource "signalfx_detector" "surgequeuelength_elb" {
19 | name = "${var.o11y_prefix} AWS/ELB has high Surge Queue Length (>= 90%)"
20 | description = "Alerts when Surge Queue Length is >= 90%"
21 |
22 | program_text = <<-EOF
23 | A = data('SurgeQueueLength', filter=filter('stat', 'upper') and (not filter('AvailabilityZone', '*'))).publish(label='A')
24 | detect(when((A/1024)*100 >= 90, lasting='5m')).publish('AWS/ELB SurgeQueueLength is close to capacity')
25 | EOF
26 |
27 | rule {
28 | detect_label = "AWS/ELB SurgeQueueLength is close to capacity"
29 | severity = "Critical"
30 | parameterized_body = var.message_body
31 | }
32 | }
33 |
34 | resource "signalfx_detector" "spillover_elb" {
35 | name = "${var.o11y_prefix} AWS/ELB has spillover"
36 | description = "Alerts when ELB Spillover is detected (generates 503 for users)"
37 |
38 | program_text = <<-EOF
39 | A = data('SpilloverCount', filter=filter('stat', 'sum') and filter('namespace', 'AWS/ELB') and (not filter('AvailabilityZone', '*'))).publish(label='A')
40 | detect(when(A > 0)).publish('AWS/ELB Spillover detected')
41 | EOF
42 |
43 | rule {
44 | detect_label = "AWS/ELB Spillover detected"
45 | severity = "Critical"
46 | parameterized_body = var.message_body
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/integration-examples/ci-webhook-serverless/ci-webhook-handler/serverless.yml:
--------------------------------------------------------------------------------
1 | service: ci-webhook-handler
2 | frameworkVersion: '2'
3 |
4 | plugins:
5 | - serverless-python-requirements
6 |
7 | provider:
8 | name: aws
9 | runtime: python3.8
10 | lambdaHashingVersion: '20201221'
11 | iam:
12 | role:
13 | statements: # permissions for all of your functions can be set here
14 | - Effect: Allow
15 | Action: # Gives permission to specific DynamoDB tables in all regions
16 | - dynamodb:DescribeTable
17 | - dynamodb:Query
18 | - dynamodb:Scan
19 | - dynamodb:GetItem
20 | - dynamodb:BatchGetItem
21 | - dynamodb:PutItem
22 | - dynamodb:BatchWriteItem
23 | - dynamodb:UpdateItem
24 | - dynamodb:DeleteItem
25 | Resource: 'arn:aws:dynamodb:*:*:table/webhookEventsTable'
26 | - Effect: Allow
27 | Action: # Gives permission to read secret via SecretsManager in all regions
28 | - secretsmanager:ListSecretVersionIds
29 | - secretsmanager:DescribeSecret
30 | - secretsmanager:GetResourcePolicy
31 | - secretsmanager:GetSecretValue
32 | Resource: 'arn:aws:secretsmanager:*:*:secret:SignalFx/Ingest-*'
33 |
34 | functions:
35 | ciwebhook:
36 | handler: handler.ciwebhook
37 | events:
38 | - httpApi:
39 | path: /
40 | method: post
41 |
42 | resources:
43 | Resources:
44 | eventsTable:
45 | Type: AWS::DynamoDB::Table
46 | Properties:
47 | TableName: webhookEventsTable
48 | AttributeDefinitions:
49 | - AttributeName: buildId
50 | AttributeType: S
51 | - AttributeName: buildStep
52 | AttributeType: S
53 | KeySchema:
54 | - AttributeName: buildId
55 | KeyType: HASH
56 | - AttributeName: buildStep
57 | KeyType: RANGE
58 | ProvisionedThroughput:
59 | ReadCapacityUnits: 1
60 | WriteCapacityUnits: 1
61 | authToken:
62 | Type: AWS::SecretsManager::Secret
63 | Properties:
64 | Description: SignalFx Endpoint and Token Info
65 | Name: SignalFx/Ingest
66 |
67 | custom:
68 | pythonRequirements:
69 | dockerizePip: non-linux
70 |
--------------------------------------------------------------------------------
/integration-examples/system-scanner/health.py:
--------------------------------------------------------------------------------
1 | import os
2 | import socket
3 | import shutil
4 | from typing import Dict
5 | import logging
6 |
7 |
8 | class HealthCheck:
9 | def __init__(self):
10 | self.logger = logging.getLogger(__name__)
11 |
12 | def check_system_resources(self) -> Dict[str, bool]:
13 | checks = {
14 | "disk_space": self._check_disk_space(),
15 | "network": self._check_network(),
16 | "file_permissions": self._check_file_permissions(),
17 | }
18 | self._log_health_status(checks)
19 | return checks
20 |
21 | def _check_disk_space(self, min_space_gb: float = 1.0) -> bool:
22 | """Check if there's sufficient disk space using standard lib shutil"""
23 | try:
24 | total, used, free = shutil.disk_usage("/")
25 | free_gb = free // (2**30) # Convert bytes to GB
26 | return free_gb >= min_space_gb
27 | except Exception as e:
28 | self.logger.error(f"Error checking disk space: {e}")
29 | return False
30 |
31 | def _check_network(self) -> bool:
32 | """Basic network connectivity check using standard socket library"""
33 | try:
34 | # Try to connect to Google's DNS server
35 | socket.create_connection(("8.8.8.8", 53), timeout=3)
36 | return True
37 | except Exception as e:
38 | self.logger.error(f"Error checking network: {e}")
39 | return False
40 |
41 | def _check_file_permissions(self) -> bool:
42 | """Check if the program has necessary file permissions"""
43 | try:
44 | # Try to create a temporary file
45 | test_file = "permission_test.tmp"
46 | with open(test_file, "w") as f:
47 | f.write("test")
48 | os.remove(test_file)
49 | return True
50 | except Exception as e:
51 | self.logger.error(f"Error checking file permissions: {e}")
52 | return False
53 |
54 | def _log_health_status(self, checks: Dict[str, bool]):
55 | for check, status in checks.items():
56 | if not status:
57 | self.logger.warning(f"Health check failed for: {check}")
58 | else:
59 | self.logger.info(f"Health check passed for: {check}")
60 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/aws/lambda.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "lambda_errors" {
2 | name = "${var.o11y_prefix} AWS/Lambda Errors"
3 | description = "AWS/Lambda Function Error Rates"
4 | program_text = <<-EOF
5 | function_errors = data('Errors', filter=(filter('namespace', 'AWS/Lambda') and filter('FunctionName', '*') and filter('Resource', '*') and filter('stat', 'sum'))).publish(label='function_errors', enable=False)
6 | detect((when(function_errors > 10, lasting='5m'))).publish('AWS/Lambda function error rate is greater than 10 for the last 5m')
7 | from signalfx.detectors.against_periods import against_periods
8 | hist_duration_errors = data('Duration', filter=filter('namespace', 'AWS/Lambda')).mean().publish(label='hist_duration_errors', enable=False)
9 | against_periods.detector_mean_std(stream=hist_duration_errors, window_to_compare='15m', space_between_windows='60m', num_windows=4, fire_num_stddev=3, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('AWS/Lambda duration has been greater than historical norm during the past 15 minutes')
10 | from signalfx.detectors.against_periods import against_periods
11 | cold_start_errors = data('function.cold_starts',filter=filter('namespace', 'AWS/Lambda')).publish(label='cold_start_errors', enable=False)
12 | against_periods.detector_mean_std(stream=cold_start_errors, window_to_compare='10m', space_between_windows='24h', num_windows=4, fire_num_stddev=3, clear_num_stddev=2.5, discard_historical_outliers=True, orientation='above').publish('AWS/Lambda Wrapper coldstart count has been greater than historical norm during the past 10 minutes')
13 | EOF
14 | rule {
15 | detect_label = "AWS/Lambda function error rate is greater than 10 for the last 5m"
16 | severity = "Major"
17 | parameterized_body = var.message_body
18 | }
19 | rule {
20 | detect_label = "AWS/Lambda Lambda duration has been greater then historical norm during the past 15 minutes"
21 | severity = "Minor"
22 | parameterized_body = var.message_body
23 | }
24 | rule {
25 | detect_label = "AWS/Lambda Wrapper coldstart count has been greater then historical norm during the past 10 minutes"
26 | severity = "Warning"
27 | parameterized_body = var.message_body
28 | }
29 | }
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/Browser/hipstershop-complete-order-test-browser/README.md:
--------------------------------------------------------------------------------
1 | # Synthetic Browser Check - Purchase Checkout Example
2 | This synthetic browser check provides an example test for complex user flows (in this case checkout) on an e-commerce website ([HipsterShop](https://github.com/signalfx/microservices-demo/)). It simulates the user journey from browsing products to completing an order, ensuring critical functionalities are working correctly.
3 | The test and its configuration are included in this directory:
4 | - [`synthetics_hipstershop_order_completion_browser_check.tf`](./synthetics_hipstershop_order_completion_browser_check.tf)
5 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs)
6 |
7 | ## Synthetic Browser Test
8 | The configuration leverages Terraform's synthetics browser check resource to automate interactions such as navigating URLs, selecting products, adding them to the cart, and placing orders. This example can be adapted for testing similar flows in your own applications.
9 |
10 | - For more information on selectors and how to find the correct ones when building on this example, check out this [Splunk Lantern article](https://lantern.splunk.com/Observability/UCE/Proactive_response/Improve_User_Experiences/Running_Synthetics_browser_tests/Selectors_for_multi-step_browser_tests)!
11 |
12 | ## Required Setup
13 |
14 | 1. **Replace the hipstershop URL in the test with your URL**: Modify the placeholder value in this test from `https://my-hipstershop-demo-url-should-go-here.com/` to the URL of your hipstershop instance.
15 |
16 | ## Transaction Step Details
17 |
18 | **Home Transaction:**
19 | Uses the go_to_url action to navigate to the Hipstershop demo site's URL.
20 |
21 | **Shop Transaction:**
22 | Executes JavaScript to select a random product from a predefined list and open the product's page.
23 |
24 | **Cart Transaction:**
25 | Clicks the "Add to Cart" button using an `xpath` selector to locate the button.
26 |
27 | **Place Order Transaction:**
28 | Step 1: Clicks the "Place order" button using an `xpath` selector.
29 | Step 2: Waits for 20 seconds to allow the backend to process the order.
30 | Step 3: Asserts that the text "Order Confirmation ID" is present on the page.
31 |
32 | **Keep Browsing Transaction:**
33 | Clicks a button to navigate away from the order confirmation page.
34 |
35 |
36 |
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/README.md:
--------------------------------------------------------------------------------
1 | # Token Expiration using Splunk Synthetics API check
2 | This test queries the `/organization` endpoint of a Splunk Observability organization, retrieves any tokens expiring within the next 7 or 30 days, and sends metrics for that data to Splunk Observability.
3 | - [`synthetics_token_expiration_api_check.tf`](./synthetics_token_expiration_api_check.tf)
4 | A detector that relies on the metrics created by this test is also included in this directory; both the test and the detector are provided as Terraform `.tf` files.
5 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs)
6 | - [`detector_token_expiration.tf`](detector_token_expiration.tf)
7 | - Uses the [Signalfx Terraform Provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs)
8 |
9 | ## Synthetic API Test
10 | The synthetic API test will call the [`/organization` endpoint](https://dev.splunk.com/observability/reference/api/organizations/latest#endpoint-retrieve-organization) for your Splunk Observability organization and collect the list of tokens expiring in the next 7 and 30 days. Those token names will be added as dimension attributes to two new metrics named:
11 | - `tokens.expiring.7days`
12 | - `tokens.expiring.30days`
13 |
14 | These metrics and dimensions will be sent to your organization's ingest endpoint and will power your detector.
15 |
16 | ### Required Splunk Synthetic Global Variables
17 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test.
18 | - `org_api_token`: A provisioned API token (Read-only is fine)
19 | - `org_ingest_token`: A provisioned INGEST token
20 | 
21 |
22 |
23 | ## Token Expiration Metrics and Detection
24 | Both `tokens.expiring.7days` and `tokens.expiring.30days` can be charted as you normally would with any other metric.
25 | 
26 |
27 | The [included alert](./detector_token_expiration.tf) sets custom thresholds for both of these metrics and fires when either signal is greater than 0. If you prefer, it can easily be split into two alerts with different severities. A minimal sketch of such a detector is shown below.
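28 |
29 | As a hedged sketch only (resource name, labels, and thresholds are illustrative and not the exact contents of [`detector_token_expiration.tf`](detector_token_expiration.tf)), a detector over these metrics built with the SignalFx Terraform provider could look like this:
30 |
31 | ```
32 | resource "signalfx_detector" "token_expiration_example" {
33 |   name         = "Tokens expiring soon"
34 |   description  = "Alerts when any token is reported as expiring within 7 or 30 days"
35 |   program_text = <<-EOF
36 |   A = data('tokens.expiring.7days').publish(label='A')
37 |   B = data('tokens.expiring.30days').publish(label='B')
38 |   detect(when(A > 0)).publish('Token expiring within 7 days')
39 |   detect(when(B > 0)).publish('Token expiring within 30 days')
40 |   EOF
41 |   rule {
42 |     detect_label = "Token expiring within 7 days"
43 |     severity     = "Critical"
44 |   }
45 |   rule {
46 |     detect_label = "Token expiring within 30 days"
47 |     severity     = "Warning"
48 |   }
49 | }
50 | ```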
--------------------------------------------------------------------------------
/integration-examples/synthetics-examples/API/status-to-splunk-hec/synthetics_status_to_splunk_hec_api_check.tf:
--------------------------------------------------------------------------------
1 | resource "synthetics_create_api_check_v2" "synthetics_status_to_splunk_hec_api_check" {
2 | test {
3 | active = true
4 | automatic_retries = 0
5 | device_id = 34
6 | frequency = 60
7 | location_ids = ["aws-us-east-1", "aws-us-west-1"]
8 | name = "OpenAI Status - To Splunk HEC"
9 | scheduling_strategy = "round_robin"
10 | requests {
11 | configuration {
12 | body = null
13 | headers = {}
14 | name = "Get OpenAI status"
15 | request_method = "GET"
16 | url = "https://status.openai.com/proxy/status.openai.com"
17 | }
18 | validations {
19 | actual = "{{response.code}}"
20 | code = null
21 | comparator = "is_less_than"
22 | expected = "300"
23 | extractor = null
24 | name = "Assert response code is less than 300"
25 | source = null
26 | type = "assert_numeric"
27 | value = null
28 | variable = null
29 | }
30 | validations {
31 | actual = null
32 | code = null
33 | comparator = null
34 | expected = null
35 | extractor = "$.summary.ongoing_incidents[*].updates"
36 | name = "Extract from response body"
37 | source = "{{response.body}}"
38 | type = "extract_json"
39 | value = null
40 | variable = "openai_ongoing_incidents"
41 | }
42 | }
43 | requests {
44 | configuration {
45 | body = "{{custom.openai_ongoing_incidents}}"
46 | headers = {
47 | Authorization = "{{env.hec_token}}"
48 | }
49 | name = "Send to Splunk HEC Ingest"
50 | request_method = "POST"
51 | url = "{{env.splunk_hec_url}}"
52 | }
53 | validations {
54 | actual = "{{response.code}}"
55 | code = null
56 | comparator = "is_less_than"
57 | expected = "300"
58 | extractor = null
59 | name = "Assert response code is less than 300"
60 | source = null
61 | type = "assert_numeric"
62 | value = null
63 | variable = null
64 | }
65 | }
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/kubernetes/node.tf:
--------------------------------------------------------------------------------
1 | /*
2 | resource "signalfx_detector" "k8s_node_cpu_imbalance" {
3 | name = "${var.o11y_prefix} K8S Cluster CPU balance"
4 | description = "Alerts when cluster CPU usage is imbalanced"
5 | program_text = <<-EOF
6 | A = data('container_cpu_utilization', filter=filter('k8s.cluster.name', '*') and filter('k8s.node.name', '*'), rollup='rate').sum(by=['k8s.node.name', 'k8s.cluster.name']).publish(label='A', enable=False)
7 | B = data('container_cpu_utilization', filter=filter('k8s.cluster.name', '*') and filter('k8s.node.name', '*')).sum(by=['k8s.node.name']).mean(by=['k8s.cluster.name']).publish(label='B', enable=False)
8 | C = ((A-B)/B).stddev(by=['k8s.cluster.name']).publish(label='C', enable=False)
9 | D = data('kube_node_info', filter=filter('k8s.cluster.name', '*'), rollup='count').count(by=['k8s.cluster.name']).publish(label='D', enable=False)
10 | E = (C*D).publish(label='K8S Cluster CPU usage is imbalanced')
11 | EOF
12 | rule {
13 | detect_label = "K8S Cluster CPU usage is imbalanced"
14 | severity = "Critical"
15 | disabled = true
16 | parameterized_body = var.message_body
17 | }
18 | }
19 | */
20 |
21 | resource "signalfx_detector" "k8s_node_not_ready" {
22 | name = "${var.o11y_prefix} K8S Nodes are not ready"
23 | description = "Alerts when K8s Node is not a ready state"
24 | program_text = <<-EOF
25 | A = data('k8s.node.condition_ready').sum(by=['k8s.cluster.name', 'k8s.node.name']).publish(label='A')
26 | detect(when(A < threshold(1), lasting='30s')).publish('K8s Node is not in a ready state')
27 | EOF
28 | rule {
29 | detect_label = "K8s Node is not in a ready state"
30 | severity = "Critical"
31 | parameterized_body = var.message_body
32 | }
33 | }
34 |
35 |
36 | resource "signalfx_detector" "k8s_node_high_memory" {
37 | name = "${var.o11y_prefix} K8S Node Memory > 90%"
38 | description = "Alerts when K8s Node is using memory > 90% for 5m"
39 | program_text = <<-EOF
40 | A = data('memory.utilization', filter=filter('k8s.cluster.name', '*')).sum(by=['host', 'k8s.cluster.name']).publish(label='A')
41 | detect(when(A > threshold(90), lasting='5m')).publish('K8s Node Memory is higher than 90% for 5m')
42 | EOF
43 | rule {
44 | detect_label = "K8s Node Memory is higher than 90% for 5m"
45 | severity = "Major"
46 | parameterized_body = var.message_body
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - High number of errors.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727103358641,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYKN4UQAABo",
12 | "labelResolutions": {
13 | "Hardware - High number of errors": 1000
14 | },
15 | "lastUpdated": 1730894451575,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - High number of errors",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.errors', filter=filter('hw.type', 'physical_disk', 'memory')).publish(label='A')\ndetect(when(A > threshold(1))).publish('Hardware - High number of errors')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.errors is above 1.",
26 | "detectLabel": "Hardware - High number of errors",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n## Errors\n**{{dimensions.[name]}}** encountered internal error(s) on {{dimensions.[host.name]}} in **{{dimensions.site}}**\n\n{{else}}\n{{dimensions.[hw.type]}} {{dimensions.[name]}} is no longer reporting errors.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Hardware - Errors {{dimensions.[hw.type]}} {{dimensions.[name]}}",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.errors"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.errors",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 900000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/riakreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: riak
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | beta: [metrics]
7 | distributions: [contrib, observiq, sumo]
8 | codeowners:
9 | active: [djaglowski, armstrmi]
10 |
11 | resource_attributes:
12 | riak.node.name:
13 | description: The name this node uses to identify itself.
14 | enabled: true
15 | type: string
16 |
17 | attributes:
18 | request:
19 | description: The request operation type.
20 | type: string
21 | enum:
22 | - put
23 | - get
24 | operation:
25 | description: The operation type for index operations.
26 | type: string
27 | enum:
28 | - read
29 | - write
30 | - delete
31 |
32 | metrics:
33 | riak.node.operation.count:
34 | description: The number of operations performed by the node.
35 | unit: "{operation}"
36 | sum:
37 | monotonic: true
38 | aggregation_temporality: cumulative
39 | value_type: int
40 | enabled: true
41 | attributes: [request]
42 | riak.node.operation.time.mean:
43 | description: The mean time between request and response for operations performed by the node over the last minute.
44 | unit: us
45 | gauge:
46 | value_type: int
47 | enabled: true
48 | attributes: [request]
49 | riak.node.read_repair.count:
50 | description: The number of read repairs performed by the node.
51 | unit: "{read_repair}"
52 | sum:
53 | monotonic: true
54 | aggregation_temporality: cumulative
55 | value_type: int
56 | enabled: true
57 | riak.memory.limit:
58 | description: The amount of memory allocated to the node.
59 | unit: By
60 | sum:
61 | monotonic: false
62 | aggregation_temporality: cumulative
63 | value_type: int
64 | enabled: true
65 | riak.vnode.operation.count:
66 | description: The number of operations performed by vnodes on the node.
67 | unit: "{operation}"
68 | sum:
69 | monotonic: true
70 | aggregation_temporality: cumulative
71 | value_type: int
72 | enabled: true
73 | attributes: [request]
74 | riak.vnode.index.operation.count:
75 | description: The number of index operations performed by vnodes on the node.
76 | unit: "{operation}"
77 | sum:
78 | monotonic: false
79 | aggregation_temporality: cumulative
80 | value_type: int
81 | attributes: [operation]
82 | enabled: true
83 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/README.md:
--------------------------------------------------------------------------------
1 | # Observability Cloud Jumpstart
2 |
3 | **Note:** Requires Terraform (minimum) v1.3.x
4 |
5 | ## Introduction
6 |
7 | This repository provides detectors, dashboard groups, and dashboards that can easily be deployed in a Splunk Observability Cloud org using Terraform.
8 |
9 | This can be useful for the assets themselves, but also as a construct for how you can easily share assets across multiple parent/child orgs. Also included is an [export script](./export_script) which can be used to easily export dashboards, dashboard groups, and detectors.
10 |
11 | These are complementary to the out-of-the-box content provided by Splunk. This repository and its assets are provided "as-is" and are not supported by Splunk.
12 |
13 | ## Clone the repository
14 |
15 | `git clone https://github.com/splunk/observability-content-contrib.git`
16 |
17 | ## Change into JumpStart directory
18 |
19 | `cd observability-content-contrib/integration-examples/terraform-jumpstart`
20 |
21 | ## Initialise Terraform
22 |
23 | ``` text
24 | terraform init --upgrade
25 | ```
26 |
27 | ## Create a workspace (optional)
28 |
29 | ``` text
30 | terraform workspace new my_workspace
31 | ```
32 |
33 | Where `my_workspace` is the name of the workspace you want to create.
34 |
35 | ## Terraform variables description
36 |
37 | - `api_token`: Observability API Token
38 | - `splunk_realm`: Observability Realm (`eu0`, `us0`, `us1`, `us2`, `jp0`, `au0`)
39 | - `o11y_prefix`: Text that will prefix all the detectors, dashboard groups, and dashboards
40 |
41 | ## Create a `terraform.tfvars` file
42 |
43 | Copy the template file `terraform.tfvars.template` to `terraform.tfvars` and fill in the values e.g.
44 |
45 | ``` text
46 | api_token="1234xxx5678yyyy"
47 | realm="eu0"
48 | o11y_prefix="[Splunk]"
49 | ```
50 |
51 | ## Review the execution plan
52 |
53 | ``` text
54 | terraform plan
55 | ```
56 |
57 | ## Apply the changes
58 |
59 | ``` text
60 | terraform apply
61 | ```
62 |
63 | ## Destroy everything
64 |
65 | If you created a workspace, you will first need to ensure you are in the correct workspace, e.g.
66 |
67 | ``` text
68 | terraform workspace select my_workspace
69 | ```
70 |
71 | Where `my_workspace` is the name of the workspace you want to be in. Then run the destroy command:
72 |
73 | ``` text
74 | terraform destroy
75 | ```
76 |
77 | ## Deploying a module
78 |
79 | ``` text
80 | terraform apply -target=module.aws
81 | terraform apply -target=module.dashboards
82 | terraform apply -target=module.gcp
83 | ```
84 |
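85 | ## Provider configuration (reference)
86 |
87 | The API token and realm values are consumed by the SignalFx provider in the root module. As a hedged illustration only (the variable names and wiring in this repository's `main.tf` may differ), a provider block typically looks like:
88 |
89 | ``` text
90 | provider "signalfx" {
91 |   auth_token = var.api_token
92 |   api_url    = "https://api.eu0.signalfx.com" # substitute your realm for eu0
93 | }
94 | ```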
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/rabbitmqreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: rabbitmq
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | beta: [metrics]
7 | distributions: [contrib, observiq, sumo]
8 | codeowners:
9 | active: [djaglowski, cpheps]
10 |
11 | resource_attributes:
12 | rabbitmq.queue.name:
13 | description: The name of the RabbitMQ queue.
14 | enabled: true
15 | type: string
16 | rabbitmq.node.name:
17 | description: The name of the RabbitMQ node.
18 | enabled: true
19 | type: string
20 | rabbitmq.vhost.name:
21 | description: The name of the RabbitMQ vHost.
22 | enabled: true
23 | type: string
24 |
25 | attributes:
26 | message.state:
27 | name_override: state
28 | description: The state of messages in a queue.
29 | type: string
30 | enum:
31 | - ready
32 | - unacknowledged
33 | metrics:
34 | rabbitmq.consumer.count:
35 | description: The number of consumers currently reading from the queue.
36 | unit: "{consumers}"
37 | sum:
38 | monotonic: false
39 | aggregation_temporality: cumulative
40 | value_type: int
41 | enabled: true
42 | rabbitmq.message.delivered:
43 | description: The number of messages delivered to consumers.
44 | unit: "{messages}"
45 | sum:
46 | monotonic: true
47 | aggregation_temporality: cumulative
48 | value_type: int
49 | enabled: true
50 | rabbitmq.message.published:
51 | description: The number of messages published to a queue.
52 | unit: "{messages}"
53 | sum:
54 | monotonic: true
55 | aggregation_temporality: cumulative
56 | value_type: int
57 | enabled: true
58 | rabbitmq.message.acknowledged:
59 | description: The number of messages acknowledged by consumers.
60 | unit: "{messages}"
61 | sum:
62 | monotonic: true
63 | aggregation_temporality: cumulative
64 | value_type: int
65 | enabled: true
66 | rabbitmq.message.dropped:
67 | description: The number of messages dropped as unroutable.
68 | unit: "{messages}"
69 | sum:
70 | monotonic: true
71 | aggregation_temporality: cumulative
72 | value_type: int
73 | enabled: true
74 | rabbitmq.message.current:
75 | description: The total number of messages currently in the queue.
76 | unit: "{messages}"
77 | sum:
78 | monotonic: false
79 | aggregation_temporality: cumulative
80 | value_type: int
81 | attributes: [message.state]
82 | enabled: true
83 |
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/pivotal/RouteEmitter.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "pivotal_cloudfoundry_DREM_errors" {
2 | name = "${var.o11y_prefix} Pivotal CloudFoundry Diego Route Emitter Metrics errors"
3 | description = "Alerts for various Pivotal CloudFoundry Route Emitter Metrics related error scenarios"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_periods import against_periods
6 | from signalfx.detectors.against_recent import against_recent
7 | from signalfx.detectors.not_reporting import not_reporting
8 | from signalfx.detectors.countdown import countdown
9 | TMP1 = data('route_emitter.RouteEmitterSyncDuration', filter=filter('metric_source', 'cloudfoundry'), rollup='max').max(over='15m').publish(label='TMP1', enable=False)
10 | RouteEmitterSyncDuration = (TMP1/1000000000).publish(label='C', enable=False)
11 | detect(when((RouteEmitterSyncDuration >= 5) and (RouteEmitterSyncDuration < 10))).publish('Pivotal Cloudfoundry - RouteEmitterSyncDuration between 5 and 10 seconds.')
12 | detect(when(RouteEmitterSyncDuration >= 10)).publish('Pivotal Cloudfoundry - RouteEmitterSyncDuration greater than or equal to 10 seconds.')
13 | EOF
14 | rule {
15 | detect_label = "Pivotal Cloudfoundry - RouteEmitterSyncDuration between 5 and 10 seconds."
16 | severity = "Minor"
17 | tip = "If all or many jobs showing as impacted, there is likely an issue with Diego.\n 1 - Investigate the Route Emitter and Diego BBS logs for errors.\n2 - Verify that app routes are functional by making a request to an app, pushing an app and pinging it, or if applicable, checking that your smoke tests have passed.\nIf one or a few jobs showing as impacted, there is likely a connectivity issue and the impacted job should be investigated further."
18 | parameterized_body = var.message_body
19 | }
20 |
21 | rule {
22 | detect_label = "Pivotal Cloudfoundry - RouteEmitterSyncDuration greater or eaqual to 10 seconds."
23 | severity = "Minor"
24 | tip = "If all or many jobs showing as impacted, there is likely an issue with Diego.\n 1 - Investigate the Route Emitter and Diego BBS logs for errors.\n2 - Verify that app routes are functional by making a request to an app, pushing an app and pinging it, or if applicable, checking that your smoke tests have passed.\nIf one or a few jobs showing as impacted, there is likely a connectivity issue and the impacted job should be investigated further."
25 | parameterized_body = var.message_body
26 | }
27 |
28 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Missing device.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1726836391594,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GX6ZxEYAAAA",
12 | "labelResolutions": {
13 | "Hardware - Missing device": 5000
14 | },
15 | "lastUpdated": 1727976755429,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Missing device",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "AB = alerts(detector_name='Hardware - Missing Device').publish(label='AB')\nA = data('hw.status', filter=filter('state', 'present'), rollup='min').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Missing device')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is below 1.",
26 | "detectLabel": "Hardware - Missing device",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Missing {{dimensions.[hw.type]}}\n\n**{{dimensions.name}}** is no longer detected on on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Recommended action\nCheck whether the device was intentionally removed from the system or if it is not responding. \n{{else}}\nThe device has recovered. \n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Missing {{dimensions.[hw.type]}} on **{{dimensions.[host.name]}}**",
31 | "severity": "Major"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.status"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.status",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 86400000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/filestatsreceiver_metadata.yaml.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "signalfx_dashboard" "filestatsdashboard" {
3 | name = "filestats"
4 | dashboard_group = signalfx_dashboard_group.filestatsdashboardgroup0.id
5 | time_range = "-1h"
6 |
7 | grid {
8 | chart_ids = [
9 | signalfx_time_chart.file_mtime.id, signalfx_time_chart.file_ctime.id, signalfx_time_chart.file_atime.id, signalfx_time_chart.file_size.id
10 | ]
11 | width = 4
12 | height = 1
13 | }
14 | }
15 |
16 | resource "signalfx_dashboard_group" "filestatsdashboardgroup0" {
17 | name = "filestats generated OTel dashboard group"
18 | description = "filestats generated OTel dashboard group"
19 | }
20 |
21 | resource "signalfx_time_chart" "file_mtime" {
22 | name = "Elapsed time since the last modification of the file or folder, in seconds since Epoch."
23 |
24 | program_text = <<-EOF
25 | data("file.mtime").publish(label="Elapsed time since the last modification of the file or folder, in seconds since Epoch.")
26 | EOF
27 |
28 | time_range = 14400
29 |
30 | plot_type = "LineChart"
31 | show_data_markers = true
32 | }
33 |
34 |
35 | resource "signalfx_time_chart" "file_ctime" {
36 | name = "Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file."
37 |
38 | program_text = <<-EOF
39 | data("file.ctime").publish(label="Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file.")
40 | EOF
41 |
42 | time_range = 14400
43 |
44 | plot_type = "LineChart"
45 | show_data_markers = true
46 | }
47 |
48 |
49 | resource "signalfx_time_chart" "file_atime" {
50 | name = "Elapsed time since last access of the file or folder, in seconds since Epoch."
51 |
52 | program_text = <<-EOF
53 | data("file.atime").publish(label="Elapsed time since last access of the file or folder, in seconds since Epoch.")
54 | EOF
55 |
56 | time_range = 14400
57 |
58 | plot_type = "LineChart"
59 | show_data_markers = true
60 | }
61 |
62 |
63 | resource "signalfx_time_chart" "file_size" {
64 | name = "The size of the file or folder, in bytes."
65 |
66 | program_text = <<-EOF
67 | data("file.size").publish(label="The size of the file or folder, in bytes.")
68 | EOF
69 |
70 | time_range = 14400
71 |
72 | plot_type = "LineChart"
73 | show_data_markers = true
74 | }
75 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/pull-otel-yaml.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | import yaml
4 |
5 | # GitHub repository information
6 | repo_owner = "open-telemetry"
7 | repo_name = "opentelemetry-collector-contrib"
8 | repo_path = "receiver"
9 | github_token = os.environ.get('GITHUB_PAT_TOKEN')
10 | headers = {}
11 | api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{repo_path}"
12 |
13 | # Check for our PAT and make auth headers so we don't get rate limited
14 | if github_token is not None:
15 | headers["Authorization"] = "Bearer " + github_token
16 | else:
17 | print("no $GITHUB_PAT_TOKEN environment variable found. Expect rate limiting.")
18 |
19 | # Make a request to GitHub API
20 | response = requests.get(api_url, headers=headers)
21 | contents = response.json()
22 | if response.status_code != 200:
23 | print("Received " + str(response.status_code) + " STATUS CODE. \n" + response.text)
24 | exit()
25 |
26 | # Iterate through contents and find subdirectories
27 | directories = [content["name"] for content in contents if content["type"] == "dir"]
28 |
29 | # Iterate through subdirectories and extract metadata.yaml with 'metrics' section
30 | for sub in directories:
31 | subdir_api_url = f"{api_url}/{sub}"
32 | subdir_response = requests.get(subdir_api_url, headers=headers)
33 | if subdir_response.status_code != 200:
34 | print("Received " + str(subdir_response.status_code) + " STATUS CODE. \n" + response.text)
35 | exit()
36 | subdir_contents = subdir_response.json()
37 |
38 | # Check if metadata.yaml exists in the subdirectory
39 | metadata_content = None
40 | for content in subdir_contents:
41 | if content["name"] == "metadata.yaml":
42 | metadata_url = content["download_url"]
43 | metadata_response = requests.get(metadata_url, headers=headers)
44 | if metadata_response.status_code != 200:
45 | print("Received " + str(metadata_response.status_code) + " STATUS CODE. \n" + response.text)
46 | exit()
47 | metadata_content = metadata_response.text
48 | break
49 |
50 | if metadata_content:
51 | # Parse YAML content
52 | metadata_data = yaml.safe_load(metadata_content)
53 |
54 | # Check if 'metrics' section exists in metadata.yaml then save
55 | if "metrics" in metadata_data:
56 | filename = f"./otel-receiver-yaml/{sub}_metadata.yaml"
57 | with open(filename, "w") as file:
58 | file.write(metadata_content)
59 | print(f"Metadata.yaml with 'metrics' section extracted from {sub}")
60 |
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Critically low fan speed (%).json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727095467806,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYKAGp3AEAs",
12 | "labelResolutions": {
13 | "Hardware - Critically low fan speed (%)": 1000
14 | },
15 | "lastUpdated": 1729906605545,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Critically low fan speed (%)",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.fan.speed').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Critically low fan speed (%)')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.fan.speed is below 1.",
26 | "detectLabel": "Hardware - Critically low fan speed (%)",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Low fan speed\nFan speed for **{{dimensions.[name]}}** is critically low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe temperature of the chip, component or device that was cooled down by this fan, may rise rapidly. This could lead to severe hardware damage and system crashes.\n\n###Recommended action\nCheck if the fan no longer cools down the system. If so, replace the fan.\n{{else}}\nRecovered fan speed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Critically low fan speed (%)",
31 | "severity": "Minor"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.fan.speed"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.fan.speed",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 86400000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Predicted failure.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1726843486243,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GX6tR8qAEBQ",
12 | "labelResolutions": {
13 | "Hardware - Predicted Failure": 180000
14 | },
15 | "lastUpdated": 1729905705828,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Predicted failure",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "AB = alerts(detector_name='Hardware - Predicted Failure').publish(label='AB')\nA = data('hw.status', filter=filter('state', 'predicted_failure'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Predicted Failure')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is above 0.",
26 | "detectLabel": "Hardware - Predicted Failure",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Predicted {{dimensions.[hw.type]}} failure\n\n**{{dimensions.name}}** is predicted to fail soon on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n{{else}}\nFailure is no longer predicted for {{dimensions.[hw.type]}} **{{dimensions.name}}** since{{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Predicted failure for {{dimensions.[hw.type]}} {{dimensions.name}}",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.status"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.status",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 86400000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/detectors/snowflakedb/Snowflake - Blocked Queries.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1660060502884,
7 | "customProperties": {},
8 | "description": "Blocked Queries by Warehouse",
9 | "detectorOrigin": "Standard",
10 | "labelResolutions": {
11 | "Snowflake - Blocked Queries": 3600000
12 | },
13 | "lastUpdated": 1660233347529,
14 | "lastUpdatedBy": "E0jpLZIAYAA",
15 | "maxDelay": 0,
16 | "minDelay": 0,
17 | "name": "Snowflake - Blocked Queries",
18 | "overMTSLimit": false,
19 | "packageSpecifications": "",
20 | "parentDetectorId": null,
21 | "programText": "A = data('snowflake.query.blocked').sum(by=['WAREHOUSE_NAME']).publish(label='A')\ndetect(when(A > threshold(0), lasting='2h')).publish('Snowflake - Blocked Queries')",
22 | "rules": [
23 | {
24 | "description": "The value of Blocked Queries by Warehouse is above 0.",
25 | "detectLabel": "Snowflake - Blocked Queries",
26 | "disabled": false,
27 | "notifications": [],
28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}} (%)\n{{/if}}\n\n{{#if anomalous}}Snowflake Queries Blocked per Warehouse in breaching state: \nQueries Blocked for Warehouse({{ dimensions.WAREHOUSE_NAME }}) value: {{inputs.A.value}}\n{{else}}Current signal value(s):\nQueries Blocked for Warehouse({{ dimensions.WAREHOUSE_NAME }}): {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
29 | "parameterizedSubject": null,
30 | "runbookUrl": null,
31 | "severity": "Critical",
32 | "tip": null
33 | }
34 | ],
35 | "sf_metricsInObjectProgramText": null,
36 | "tags": [],
37 | "teams": [],
38 | "timezone": "",
39 | "visualizationOptions": {
40 | "disableSampling": false,
41 | "publishLabelOptions": [
42 | {
43 | "displayName": "Blocked Queries by Warehouse",
44 | "label": "A",
45 | "paletteIndex": null,
46 | "valuePrefix": null,
47 | "valueSuffix": null,
48 | "valueUnit": null
49 | }
50 | ],
51 | "showDataMarkers": true,
52 | "showEventLines": false,
53 | "time": {
54 | "range": 86400000,
55 | "rangeEnd": 0,
56 | "type": "relative"
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/nginxreceiver_metadata.yaml.tf:
--------------------------------------------------------------------------------
1 |
2 | resource "signalfx_dashboard" "nginxdashboard" {
3 | name = "nginx"
4 | dashboard_group = signalfx_dashboard_group.nginxdashboardgroup0.id
5 | time_range = "-1h"
6 |
7 | grid {
8 | chart_ids = [
9 | signalfx_time_chart.nginx_requests.id, signalfx_time_chart.nginx_connections_accepted.id, signalfx_time_chart.nginx_connections_handled.id, signalfx_time_chart.nginx_connections_current.id
10 | ]
11 | width = 4
12 | height = 1
13 | }
14 | }
15 |
16 | resource "signalfx_dashboard_group" "nginxdashboardgroup0" {
17 | name = "nginx generated OTel dashboard group"
18 | description = "nginx generated OTel dashboard group"
19 | }
20 |
21 | resource "signalfx_time_chart" "nginx_requests" {
22 | name = "Total number of requests made to the server since it started"
23 |
24 | program_text = <<-EOF
25 | data("nginx.requests").publish(label="Total number of requests made to the server since it started")
26 | EOF
27 |
28 | time_range = 14400
29 |
30 | plot_type = "LineChart"
31 | show_data_markers = true
32 | }
33 |
34 |
35 | resource "signalfx_time_chart" "nginx_connections_accepted" {
36 | name = "The total number of accepted client connections"
37 |
38 | program_text = <<-EOF
39 | data("nginx.connections_accepted").publish(label="The total number of accepted client connections")
40 | EOF
41 |
42 | time_range = 14400
43 |
44 | plot_type = "LineChart"
45 | show_data_markers = true
46 | }
47 |
48 |
49 | resource "signalfx_time_chart" "nginx_connections_handled" {
50 | name = "The total number of handled connections. Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit)."
51 |
52 | program_text = <<-EOF
53 | data("nginx.connections_handled").publish(label="The total number of handled connections. Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit).")
54 | EOF
55 |
56 | time_range = 14400
57 |
58 | plot_type = "LineChart"
59 | show_data_markers = true
60 | }
61 |
62 |
63 | resource "signalfx_time_chart" "nginx_connections_current" {
64 | name = "The current number of nginx connections by state"
65 |
66 | program_text = <<-EOF
67 | data("nginx.connections_current").publish(label="The current number of nginx connections by state")
68 | EOF
69 |
70 | time_range = 14400
71 |
72 | plot_type = "LineChart"
73 | show_data_markers = true
74 | }
75 |
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/chronyreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | type: chrony
3 |
4 | status:
5 | class: receiver
6 | stability:
7 | alpha: [metrics]
8 | distributions: [contrib]
9 | codeowners:
10 | active: [MovieStoreGuy, jamesmoessis]
11 |
12 | attributes:
13 | leap.status:
14 | description: how the chrony is handling leap seconds
15 | type: string
16 | enum:
17 | - normal
18 | - insert_second
19 | - delete_second
20 | - unsynchronised
21 |
22 | metrics:
23 | ntp.frequency.offset:
24 | enabled: false
25 | description: The frequency is the rate by which the system's clock would be wrong if chronyd was not correcting it.
26 | extended_documentation: "It is expressed in ppm (parts per million). For example, a value of 1 ppm would mean that when the system’s clock thinks it has advanced 1 second, it has actually advanced by 1.000001 seconds relative to true time."
27 | unit: "ppm"
28 | gauge:
29 | value_type: double
30 | attributes:
31 | - leap.status
32 | ntp.skew:
33 | enabled: true
34 | description: This is the estimated error bound on the frequency.
35 | unit: "ppm"
36 | gauge:
37 | value_type: double
38 | ntp.stratum:
39 | enabled: false
40 | description: The number of hops away from the reference system keeping the reference time
41 | extended_documentation: To read further, refer to https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/system_administrators_guide/ch-configuring_ntp_using_the_chrony_suite#sect-Checking_chrony_tracking
42 | unit: "{count}"
43 | gauge:
44 | value_type: int
45 | ntp.time.correction:
46 | enabled: true
47 | description: The number of seconds difference between the system's clock and the reference clock
48 | unit: seconds
49 | gauge:
50 | value_type: double
51 | attributes:
52 | - leap.status
53 | ntp.time.last_offset:
54 | enabled: true
55 | description: The estimated local offset on the last clock update
56 | unit: seconds
57 | gauge:
58 | value_type: double
59 | attributes:
60 | - leap.status
61 | ntp.time.rms_offset:
62 | enabled: false
63 | description: the long term average of the offset value
64 | unit: seconds
65 | gauge:
66 | value_type: double
67 | attributes:
68 | - leap.status
69 | ntp.time.root_delay:
70 | enabled: false
71 | description: This is the total of the network path delays to the stratum-1 system from which the system is ultimately synchronised.
72 | unit: seconds
73 | gauge:
74 | value_type: double
75 | attributes:
76 | - leap.status
77 |
--------------------------------------------------------------------------------
/detectors/snowflakedb/Snowflake - Overloaded Queries.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1660073252422,
7 | "customProperties": {},
8 | "description": "Overloaded Queries in Snowflake",
9 | "detectorOrigin": "Standard",
10 | "labelResolutions": {
11 | "Snowflake - Overloaded Queries": 3600000
12 | },
13 | "lastUpdated": 1660233607397,
14 | "lastUpdatedBy": "E0jpLZIAYAA",
15 | "maxDelay": 0,
16 | "minDelay": 0,
17 | "name": "Snowflake - Overloaded Queries",
18 | "overMTSLimit": false,
19 | "packageSpecifications": "",
20 | "parentDetectorId": null,
21 | "programText": "A = data('snowflake.query.queued_overload').sum(by=['WAREHOUSE_NAME']).publish(label='A')\ndetect(when(A > threshold(0), lasting='1h')).publish('Snowflake - Overloaded Queries')",
22 | "rules": [
23 | {
24 | "description": "The value of Overloaded Queries is above 0.",
25 | "detectLabel": "Snowflake - Overloaded Queries",
26 | "disabled": false,
27 | "notifications": [],
28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}} (%)\n{{/if}}\n\n{{#if anomalous}}Snowflake Overloaded Queries per Warehouse in breaching state: \nQueries Overloaded for Warehouse({{ dimensions.WAREHOUSE_NAME }}) value: {{inputs.A.value}}\n{{else}}Current signal value(s):\nQueries Overloaded for Warehouse({{ dimensions.WAREHOUSE_NAME }}): {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
29 | "parameterizedSubject": null,
30 | "runbookUrl": null,
31 | "severity": "Critical",
32 | "tip": null
33 | }
34 | ],
35 | "sf_metricsInObjectProgramText": null,
36 | "tags": [],
37 | "teams": [],
38 | "timezone": "",
39 | "visualizationOptions": {
40 | "disableSampling": false,
41 | "publishLabelOptions": [
42 | {
43 | "displayName": "Overloaded Queries",
44 | "label": "A",
45 | "paletteIndex": null,
46 | "valuePrefix": null,
47 | "valueSuffix": null,
48 | "valueUnit": null
49 | }
50 | ],
51 | "showDataMarkers": true,
52 | "showEventLines": false,
53 | "time": {
54 | "range": 86400000,
55 | "rangeEnd": 0,
56 | "type": "relative"
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Device status failed.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1726834590194,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GX6UimfAAEk",
12 | "labelResolutions": {
13 | "Hardware - Device status failed": 120000
14 | },
15 | "lastUpdated": 1730972332658,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Device status failed",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.status', filter=filter('state', 'failed'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Device status failed')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is above 0.",
26 | "detectLabel": "Hardware - Device status failed",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Failed {{dimensions.[hw.type]}}.\n\n**{{dimensions.name}}** has failed on **{{dimensions.[host.name]}}** in **{{dimensions.site}}** at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\nRecovered {{dimensions.[hw.type]}} **{{dimensions.name}}** from **failed** status at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "[Hardware] Status failed for {{dimensions.[hw.type]}} of {{dimensions.[host.name]}} in {{dimensions.site}}",
31 | "runbookUrl": "",
32 | "severity": "Major",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.status"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.status",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 43200000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/integration-examples/apiScripts/getMetricsForHost.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # This script will get all unique metrics for a given host.
4 | #
5 | # Edit token.yaml to contain valid
6 | # - Access Token (access_token)
7 | # - Realm (realm)
8 | #   (an example token.yaml layout is shown at the end of this script)
9 | # Syntax: python3 getMetricsForHost.py -n <HOST_NAME>
10 | # or
11 | # python3 getMetricsForHost.py -n <HOST_NAME> -r <REALM> -t <TOKEN>
12 | #
13 | # HOST_NAME should be an exact match
14 |
15 | import argparse
16 | import yaml
17 | import requests
18 | import json
19 |
20 | def run(hostname, realm, token):
21 | limit = 5000
22 | url = "https://api.{}.signalfx.com/v2/metrictimeseries?limit={}&query=host.name:{}".format(realm, limit, hostname)
23 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": "{}".format(token) }
24 | response = requests.get(url, headers=headers)
25 | responseJSON = json.loads(response.text)
26 |
27 | # If the result count is > limit, say so and exit
28 | try:
29 | cnt = responseJSON["count"]
30 | except:
31 | print("ERROR: Check your token, that's the most likely issue.")
32 | return
33 |
34 | if (cnt == 0):
35 | # Let's try using host instead of host.name (SmartAgent)
36 | print("--> No results for host.name, trying host")
37 | url = "https://api.{}.signalfx.com/v2/metrictimeseries?limit={}&query=host:{}".format(realm, limit, hostname)
38 | response = requests.get(url, headers=headers)
39 | responseJSON = json.loads(response.text)
40 | try:
41 | cnt = responseJSON["count"]
42 | except:
43 | print("ERROR: Unusual to fail here, probably an issue with the script.")
44 | return
45 |
46 | if (cnt > limit):
47 | print("Need to increase limit, this host has > {} mts's.".format(limit))
48 | return
49 |
50 | # Add metrics to a list
51 | arr = []
52 | for result in responseJSON['results']:
53 | arr.append(result['metric'])
54 |
55 | totalCount = len(arr)
56 | arr = list(set(arr)) # Remove Duplicates
57 | arr.sort()
58 | print(*arr, sep = "\n") # Print one per line
59 | print("--> {} metrics; {} mts".format(len(arr), totalCount))
60 |
61 | if __name__ == '__main__':
62 | with open('token.yaml', 'r') as ymlfile:
63 | cfg = yaml.safe_load(ymlfile)
64 |
65 | parser = argparse.ArgumentParser(description='Splunk - Get Host Metrics')
66 | parser.add_argument('-n', '--hostName', help='HostName', required=True)
67 | parser.add_argument('-r', '--realm', help='Realm', required=False)
68 | parser.add_argument('-t', '--token', help='Token', required=False)
69 | args = parser.parse_args()
70 |
71 | if (args.token is None):
72 | run(args.hostName, cfg['realm'], cfg['access_token'])
73 | else:
74 | run(args.hostName, args.realm, args.token)
75 |
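76 | # Example token.yaml layout (values are placeholders; the keys match those read above):
77 | #
78 | #   realm: us1
79 | #   access_token: <YOUR_ORG_ACCESS_TOKEN>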
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Critical LUN pathing issue.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727101400682,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYKRRPGAAAA",
12 | "labelResolutions": {
13 | "Hardware - Critical LUN pathing issue": 180000
14 | },
15 | "lastUpdated": 1730952952374,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": null,
18 | "minDelay": null,
19 | "name": "Hardware - Critical LUN pathing issue",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.lun.paths').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Critical LUN pathing issue')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.lun.paths is below 1.",
26 | "detectLabel": "Hardware - Critical LUN pathing issue",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n## Lost data access\nLUN **{{dimensions.[name]}}** is no longer available on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**\n\n## Consequence\nOne or more filesystems are no longer available (possible data loss).\n\n## Recommended action\nVerify the status of the underlying HBA and its connectivity. Verify the reachability of the storage system and whether any configuration change has been made to the corresponding storage volume.\n{{else}}\nRecovered available LUN paths.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Critical LUN pathing issue",
31 | "runbookUrl": "",
32 | "severity": "Major",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.lun.paths"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.lun.paths",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/metricshub/README.md:
--------------------------------------------------------------------------------
1 | # MetricsHub
2 |
3 | ## Overview
4 |
5 | **MetricsHub** is a universal metrics collection agent designed for monitoring hardware components, system performance, and sustainability KPIs. It collects data from servers, storage systems, and network devices and pushes it to OpenTelemetry back-ends such as the Splunk Observability Cloud.
6 |
7 | ### Key Features
8 |
9 | - **Remote Monitoring**: MetricsHub supports the monitoring of thousands of systems remotely through protocols such as REST APIs, SNMP, WBEM, WMI, SSH, IPMI, and more.
10 | - **OpenTelemetry Integration**: MetricsHub acts as an OpenTelemetry agent, following its standards for easy integration with various observability platforms.
11 | - **Sustainability Metrics**: Track and report on energy usage and carbon footprint to optimize infrastructure efficiency.
12 | - **250+ Connectors**: Ready-to-use connectors for monitoring a wide variety of platforms. The MetricsHub agent is vendor-neutral, providing consistent coverage across manufacturers (e.g., Cisco, Dell EMC, Huawei, HP, IBM, Lenovo, Pure, and more).
13 |
14 | ### Dashboards
15 |
16 | MetricsHub comes with pre-configured dashboards that visualize hardware metrics as well as sustainability KPIs:
17 |
18 | | Dashboard | Description |
19 | | --- | --- |
20 | | **Hardware - Main** | Overview of all monitored systems, focusing on key hardware and sustainability metrics. |
21 | | **Hardware - Site** | Metrics specific to a particular site (a data center or a server room) and its monitored hosts. |
22 | | **Hardware - Host** | Metrics associated with one *host* and its internal devices. |
23 |
24 | ## Setup
25 |
26 | 1. Follow the [installation instructions](https://metricshub.com/docs/latest/installation/index.html)
27 | 2. Configure the OpenTelemetry Collector to export metrics to Splunk by editing `otel-config.yaml`:
28 |
29 | ```yaml
30 | exporters:
31 | signalfx:
32 | # Access token to send data to SignalFx.
33 | access_token:
34 | # SignalFx realm where the data will be received.
35 | realm:
36 | ```
37 |
38 | For more information, see the [SignalFx Metrics Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/signalfxexporter) documentation. A more complete configuration sketch is included at the end of this README.
39 |
40 | ## Support
41 |
42 | Subscribers to **MetricsHub** gain access to the **MetricsHub Support Desk**, which provides:
43 |
44 | - Technical support
45 | - Patches and updates
46 | - Knowledge base access
47 |
48 | Splunk does not provide support for these dashboards; please contact Sentry Software's support desk with any requests.
49 |
50 | ### Further Reading
51 |
52 | For more information, visit the [MetricsHub](https://metricshub.com/) website.
53 |
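54 | ## Example configuration
55 | 
56 | The snippet below is a minimal sketch of a complete `otel-config.yaml` wiring; it assumes an `otlp` receiver is already defined elsewhere in the file (as in the default MetricsHub collector configuration) and uses placeholder values for the access token and realm.
57 | 
58 | ```yaml
59 | exporters:
60 |   signalfx:
61 |     # Access token to send data to Splunk Observability Cloud.
62 |     access_token: <YOUR_ACCESS_TOKEN>
63 |     # Realm where the data will be received (for example, us0).
64 |     realm: <YOUR_REALM>
65 | 
66 | service:
67 |   pipelines:
68 |     metrics:
69 |       receivers: [otlp]
70 |       exporters: [signalfx]
71 | ```
72 | 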
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Device status degraded.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1726835785724,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GX6QUA7AAAA",
12 | "labelResolutions": {
13 | "Hardware - Device status degraded": 180000
14 | },
15 | "lastUpdated": 1730114326958,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Device status degraded",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.status', filter=filter('state', 'degraded'), rollup='min').publish(label='A')\ndetect(when(A > threshold(0), lasting='5m'), auto_resolve_after='15m').publish('Hardware - Device status degraded')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is above 0.",
26 | "detectLabel": "Hardware - Device status degraded",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Degraded {{dimensions.[hw.type]}}.\n\n**{{dimensions.name}}** is degraded on **{{dimensions.[host.name]}}** in **{{dimensions.site}}** at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\nRecovered {{dimensions.[hw.type]}} **{{dimensions.name}}** from **degraded** status at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "[Hardware] Status degraded for {{dimensions.[hw.type]}} of {{dimensions.[host.name]}} in {{dimensions.site}}",
31 | "runbookUrl": "",
32 | "severity": "Warning",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.status"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.status",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/host/cpu.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "cpu_historical_norm" {
2 | name = "${var.o11y_prefix} CPU utilization % greater than historical norm"
3 | description = "Alerts when CPU usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_recent import against_recent
6 | A = data('cpu.utilization').publish(label='A', enable=True)
7 | against_recent.detector_mean_std(stream=A, current_window='30m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU utilization is significantly greater than normal, and increasing')
8 | EOF
9 | rule {
10 | detect_label = "CPU utilization is significantly greater than normal, and increasing"
11 | severity = "Warning"
12 | parameterized_body = var.message_body
13 | }
14 | }
15 |
16 | resource "signalfx_detector" "cpu_historical_cyclical_norm" {
17 | name = "${var.o11y_prefix} CPU utilization % greater than 3.5 std dev compared to the same time window over the last 3 days"
18 | description = "Alerts when CPU usage for this host for the last 30 minutes was significantly higher than normal, as compared to the same time window over the last 3 days"
19 | program_text = <<-EOF
20 | from signalfx.detectors.against_periods import against_periods
21 | A = data('cpu.utilization').publish(label='A', enable=True)
22 | against_periods.detector_mean_std(stream=A, window_to_compare='30m', space_between_windows='24h', num_windows=3, fire_num_stddev=3.5, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('CPU Utilization is greater than normal for the same time window compared to the last 3 days')
23 | EOF
24 | rule {
25 | detect_label = "CPU Utilization is greater than normal for the same time window compared to the last 3 days"
26 | severity = "Warning"
27 | parameterized_body = var.message_body
28 | }
29 | }
30 |
31 | resource "signalfx_detector" "cpu_not_reporting" {
32 | name = "${var.o11y_prefix} Host has stopped reporting data for at least 1 minute"
33 | description = "Alerts when Host has stopped reporting data for at least a minute"
34 | program_text = <<-EOF
35 | from signalfx.detectors.not_reporting import not_reporting
36 | A = data('cpu.utilization').publish(label='A', enable=True)
37 | not_reporting.detector(stream=A, resource_identifier=None, duration='1m').publish('Host Not Reporting')
38 | EOF
39 | rule {
40 | detect_label = "Host Not Reporting"
41 | severity = "Critical"
42 | parameterized_body = var.message_body
43 | }
44 | }
--------------------------------------------------------------------------------
/integration-examples/splunk-otel-databricks/splunk-start-up.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # on error, run away, exit, don't continue, etc...
4 | set -e
5 |
6 | # output a script to the file system so it can be executed later...
7 | mkdir -p /tmp/splunk && cat <<EOF >>/tmp/splunk/otel-script.sh
8 | #!/bin/bash
9 | set -e
10 |
11 | if [ \$DB_IS_DRIVER ]; then
12 | # Set default environment variables for the installation scripts to use. ##
13 | # 1. splunkObservability.realm=
14 | # 2. splunkObservability.accessToken=
15 | # 3. clusterName=
16 | # OTEL Service Information ##
17 | # OTEL_SERVICE_NAME = "Splunk-Databricks-OTEL"
18 | # OTEL_TRACES_EXPORTER = "jaeger-thrift-splunk"
19 | # OTEL_EXPORTER_JAEGER_ENDPOINT = "https://ingest.<REALM>.signalfx.com/v2/trace"
20 |
21 | # Validate Secrets: Check to see if there is a secret in the secret store before executing the script to install the OpenTelemetry Collector.
22 | echo "Running OpenTelemetry collector installation script"
23 | echo "Pre-Installation: Validation: Secret Key(s)"
24 | echo "SPLUNK_ACCESS_TOKEN must be stored in the Databricks "
25 |
26 | if [ -z "\$SPLUNK_ACCESS_TOKEN" ]; then
27 | echo 'Please set the secret for the SPLUNK_ACCESS_TOKEN in the databricks environment secret store.'
28 | exit 1;
29 | fi
30 |
31 | # Validation of parameters installation of the Splunk OpenTelemetry Collector Script
32 | echo "Pre-Installation: Validation environmental parameters"
33 | echo "SPLUNK_REALM: us0 (default), Actual: "\$SPLUNK_REALM
34 | echo "SPLUNK_MEMORY_TOTAL_MIB: 512 MIB (default), Actual: "\$SPLUNK_MEMORY_TOTAL_MIB
35 |
36 | SPLUNK_ACCESS_TOKEN="\$SPLUNK_ACCESS_TOKEN" bash -c "\$(curl -sSL https://dl.signalfx.com/splunk-otel-collector.sh > /tmp/splunk-otel-collector.sh;)"
37 | SPLUNK_ACCESS_TOKEN="\$SPLUNK_ACCESS_TOKEN" bash -c "\$(sudo sh /tmp/splunk-otel-collector.sh --realm \$SPLUNK_REALM --memory \$SPLUNK_MEMORY_TOTAL_MIB \
38 | -- \$SPLUNK_ACCESS_TOKEN)"
39 | fi
40 | EOF
41 | 
42 | # Determine where the script is being executed and run logic, set parameters etc (https://docs.databricks.com/clusters/init-scripts.html): ##
43 | # if: Driver: do driver stuff, else if Worker: do worker stuff, else Driver and Worker: do stuff ##
44 | 
45 | echo $DB_IS_DRIVER
46 | if [[ $DB_IS_DRIVER = "TRUE" ]]; then
47 | : # Logic for the Driver would go here ##
48 | else
49 | : # Logic for the Worker would go here ##
50 | fi
51 | # Shared Logic for the Driver and Worker ##
52 | 
53 | # Modify the permissions of the script so it can be executed.
54 | chmod a+x /tmp/splunk/otel-script.sh
55 | # Run the installation script and output logs to: /tmp/splunk/otel-script.log
56 | /tmp/splunk/otel-script.sh >> /tmp/splunk/otel-script.log 2>&1 & disown
--------------------------------------------------------------------------------
/detectors/snowflakedb/Snowflake - No Queries in Last 3 Hours.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1660074725727,
7 | "customProperties": {},
8 | "description": "No Queries in last 3 Hours",
9 | "detectorOrigin": "Standard",
10 | "labelResolutions": {
11 | "Snowflake - No Queries in Last 3 Hours": 3600000
12 | },
13 | "lastUpdated": 1660233574385,
14 | "lastUpdatedBy": "E0jpLZIAYAA",
15 | "maxDelay": 0,
16 | "minDelay": 0,
17 | "name": "Snowflake - No Queries in Last 3 Hours",
18 | "overMTSLimit": false,
19 | "packageSpecifications": "",
20 | "parentDetectorId": null,
21 | "programText": "A = data('snowflake.database.query.count', extrapolation='zero').sum().publish(label='A')\ndetect(when(A < threshold(1), lasting='3h')).publish('Snowflake - No Queries in Last 3 Hours')",
22 | "rules": [
23 | {
24 | "description": "The value of Total Queries is below 1.",
25 | "detectLabel": "Snowflake - No Queries in Last 3 Hours",
26 | "disabled": false,
27 | "notifications": [],
28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}}\n{{/if}}\n\n{{#if anomalous}}Snowflake Traffic is below 1 for 3 hours: \nNumber of queries: {{inputs.A.value}}\n{{else}}Current signal value(s):\nNumber of queries: {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}",
29 | "parameterizedSubject": null,
30 | "runbookUrl": null,
31 | "severity": "Critical",
32 | "tip": "This alert looks at last 3 hours due to the possible latency of data existing in Snowflake internal ACCOUNT_USAGE https://docs.snowflake.com/en/sql-reference/account-usage.html#account-usage-views"
33 | }
34 | ],
35 | "sf_metricsInObjectProgramText": null,
36 | "tags": [],
37 | "teams": [],
38 | "timezone": "",
39 | "visualizationOptions": {
40 | "disableSampling": false,
41 | "publishLabelOptions": [
42 | {
43 | "displayName": "Total Queries",
44 | "label": "A",
45 | "paletteIndex": null,
46 | "valuePrefix": null,
47 | "valueSuffix": null,
48 | "valueUnit": null
49 | }
50 | ],
51 | "showDataMarkers": true,
52 | "showEventLines": false,
53 | "time": {
54 | "range": 75600000,
55 | "rangeEnd": 0,
56 | "type": "relative"
57 | }
58 | }
59 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - LUN multi-pathing issue.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727101189016,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYKMWWkAAAA",
12 | "labelResolutions": {
13 | "Hardware - LUN multi-pathing issue": 180000
14 | },
15 | "lastUpdated": 1730986249546,
16 | "lastUpdatedBy": "GRtepaIAICg",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - LUN multi-pathing issue",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.lun.paths').publish(label='A')\ndetect(when(A < threshold(2))).publish('Hardware - LUN multi-pathing issue')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.lun.paths is below 2.",
26 | "detectLabel": "Hardware - LUN multi-pathing issue",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n## Lost redundancy\nOnly 1 remaining path in multipathing configuration for LUN **{{dimensions.[name]}}** on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**\n\n## Consequence\nThe performance of the system may be affected and the risk of losing access to data is high.\n\n## Recommended action\nVerify on the SAN switches which links are broken (link down, or zone exclusion, etc.). Check the mapping and masking configuration of the corresponding storage volume in the storage system.\n{{else}}\nRecovered available LUN paths.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Hardware - LUN multi-pathing issue",
31 | "runbookUrl": "",
32 | "severity": "Warning",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.lun.paths"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.lun.paths",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Network errors.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727083108813,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYJPupTAAAA",
12 | "labelResolutions": {
13 | "Hardware - Network errors": 1000
14 | },
15 | "lastUpdated": 1727083147113,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Network errors",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.errors', filter=filter('hw.type', 'network')).publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Network errors')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.errors is above 0.",
26 | "detectLabel": "Hardware - Network errors",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Network errors\n\nInterface **{{dimensions.name}}** is encountering or generating a high number of errors of received or transmitted packets) on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThis strongly impacts the network performance.\n\n###Recommended action\nCheck the network cable, the driver settings, the speed and duplex mode of the link. If everything seems normal, you may have to replace this network adapter. \n{{else}}\nThe network card no longer encounters or generates errors. \n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Physical Address:** {{dimensions.physical_address}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Hardware - Network errors on {{dimensions.[host.name]}}",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.errors"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.errors",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 900000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Physical intrusion.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1726844670799,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GX64jrUAEB0",
12 | "labelResolutions": {
13 | "Hardware - Physical intrusion": 120000
14 | },
15 | "lastUpdated": 1730969153413,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": null,
18 | "minDelay": null,
19 | "name": "Hardware - Physical intrusion",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.status', filter=filter('state', 'open'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Physical intrusion')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is above 0.",
26 | "detectLabel": "Hardware - Physical intrusion",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Intrusion\nEnclosure {{dimensions.[name]}} is open ({{dimensions.[host.name]}} in {{dimensions.site}}).\n\n###Consequence\nThis could mean that somebody is accessing the hardware components in the enclosure, including the harddisks which may contain private information.\n\n###Recommended action\nMake sure the enclosure has been opened by authorized personnel only and close it as soon as possible.\n{{else}}\nEnclosure is now closed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Intrusion in {{dimensions.[host.name]}}'s enclosure",
31 | "runbookUrl": "",
32 | "severity": "Major",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.status"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.status",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Networking link down.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727082785650,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYJAKjwAEAA",
12 | "labelResolutions": {
13 | "Hardware - Networking link down": 180000
14 | },
15 | "lastUpdated": 1728507098084,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Networking link down",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.network.up', rollup='min').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Networking link down')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.network.up is below 1.",
26 | "detectLabel": "Hardware - Networking link down",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Link down\n\nInterface **{{dimensions.name}}** is disconnected on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe network traffic (if any) that was processed by this adapter is no longer being handled, or is overloading another network adapter.\n\n###Recommended action\nCheck that the network cable (if any) is not unplugged or broken/cut, and that it is properly plugged into the network card. Ensure that the network hub/switch/router is working properly.\n{{else}}\nLink restored for {{dimensions.name}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Physical Address:** {{dimensions.physical_address}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Network link down for **{{dimensions.[host.name]}}**",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.network.up"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.network.up",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | }
52 | ],
53 | "showDataMarkers": true,
54 | "showEventLines": false,
55 | "time": {
56 | "range": 86400000,
57 | "rangeEnd": 0,
58 | "type": "relative"
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/integration-examples/terraform-jumpstart/modules/pivotal/gorouter.tf:
--------------------------------------------------------------------------------
1 | resource "signalfx_detector" "pivotal_cloudfoundry_gorouter_errors" {
2 | name = "${var.o11y_prefix} Pivotal cloudFoundry gorouter errors"
3 | description = "Alerts for various Pivotal CloudFoundry gorouter related error scenarios"
4 | program_text = <<-EOF
5 | from signalfx.detectors.against_periods import against_periods
6 | from signalfx.detectors.against_recent import against_recent
7 | total_requests = data('gorouter.total_requests', filter=filter('metric_source', 'cloudfoundry'), rollup='average').delta().mean(over='5m').publish(label='total_requests', enable=True)
8 | latency = data('gorouter.latency', filter=filter('metric_source', 'cloudfoundry'), rollup='average').mean(over='30m').publish(label='latency', enable=True)
9 | detect(when(total_requests >= 0.5) and when(total_requests < 1)).publish('Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is between .5 and 1.')
10 | detect(when(total_requests >=1)).publish('Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is greater or equal to 1.')
11 | detect(when(latency > 100)).publish('Pivotal Cloudfoundry - gorouter latency above 100 ms')
12 | EOF
13 | rule {
14 | detect_label = "Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is between .5 and 1."
15 | severity = "Minor"
16 | tip = "To increase throughput and maintain low latency, scale the Gorouters either horizontally or vertically and watch that the system.cpu.user metric for the Gorouter stays in the suggested range of 60-70% CPU Utilization."
17 | parameterized_body = var.message_body
18 | }
19 | rule {
20 | detect_label = "Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is greater or equal to 1."
21 | severity = "Critical"
22 | tip = "To increase throughput and maintain low latency, scale the Gorouters either horizontally or vertically and watch that the system.cpu.user metric for the Gorouter stays in the suggested range of 60-70% CPU Utilization."
23 | parameterized_body = var.message_body
24 | }
25 |
26 | rule {
27 | detect_label = "Pivotal Cloudfoundry - gorouter latency above 100 ms"
28 | severity = "Warning"
29 | tip = "First inspect logs for network issues and indications of misbehaving backends.\nIf it appears that the Gorouter needs to scale due to ongoing traffic congestion, do not scale on the latency metric alone. You should also look at the CPU utilization of the Gorouter VMs and keep it within a maximum 60-70% range.\nResolve high utilization by scaling the Gorouter."
30 | parameterized_body = var.message_body
31 | }
32 |
33 | }
--------------------------------------------------------------------------------
/integration-examples/lambda-vpc-connection-sample/tests/unit/test_handler.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pytest
4 |
5 | from hello_world import app
6 |
7 |
8 | @pytest.fixture()
9 | def apigw_event():
10 | """ Generates API GW Event"""
11 |
12 | return {
13 | "body": '{ "test": "body"}',
14 | "resource": "/{proxy+}",
15 | "requestContext": {
16 | "resourceId": "123456",
17 | "apiId": "1234567890",
18 | "resourcePath": "/{proxy+}",
19 | "httpMethod": "POST",
20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef",
21 | "accountId": "123456789012",
22 | "identity": {
23 | "apiKey": "",
24 | "userArn": "",
25 | "cognitoAuthenticationType": "",
26 | "caller": "",
27 | "userAgent": "Custom User Agent String",
28 | "user": "",
29 | "cognitoIdentityPoolId": "",
30 | "cognitoIdentityId": "",
31 | "cognitoAuthenticationProvider": "",
32 | "sourceIp": "127.0.0.1",
33 | "accountId": "",
34 | },
35 | "stage": "prod",
36 | },
37 | "queryStringParameters": {"foo": "bar"},
38 | "headers": {
39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)",
40 | "Accept-Language": "en-US,en;q=0.8",
41 | "CloudFront-Is-Desktop-Viewer": "true",
42 | "CloudFront-Is-SmartTV-Viewer": "false",
43 | "CloudFront-Is-Mobile-Viewer": "false",
44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2",
45 | "CloudFront-Viewer-Country": "US",
46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
47 | "Upgrade-Insecure-Requests": "1",
48 | "X-Forwarded-Port": "443",
49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com",
50 | "X-Forwarded-Proto": "https",
51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==",
52 | "CloudFront-Is-Tablet-Viewer": "false",
53 | "Cache-Control": "max-age=0",
54 | "User-Agent": "Custom User Agent String",
55 | "CloudFront-Forwarded-Proto": "https",
56 | "Accept-Encoding": "gzip, deflate, sdch",
57 | },
58 | "pathParameters": {"proxy": "/examplepath"},
59 | "httpMethod": "POST",
60 | "stageVariables": {"baz": "qux"},
61 | "path": "/examplepath",
62 | }
63 |
64 |
65 | def test_lambda_handler(apigw_event):
66 |
67 | ret = app.lambda_handler(apigw_event, "")
68 | data = json.loads(ret["body"])
69 |
70 | assert ret["statusCode"] == 200
71 | assert "message" in ret["body"]
72 | assert data["message"] == "hello world"
73 |
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Tape drive needs cleaning.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727093437101,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYJpVTXAEAA",
12 | "labelResolutions": {
13 | "Hardware - Tape drive needs cleaning": 1000
14 | },
15 | "lastUpdated": 1727093437495,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": null,
18 | "minDelay": null,
19 | "name": "Hardware - Tape drive needs cleaning",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.status', filter=filter('state', 'needs_cleaning')).publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Tape drive needs cleaning')",
23 | "rules": [
24 | {
25 | "description": "The value of hw.status is above 0.",
26 | "detectLabel": "Hardware - Tape drive needs cleaning",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Cleaning needed\nTape drive **{{dimensions.[name]}}** needs cleaning on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nRegular tape drive cleaning helps in long-term reliability, prevents read/write errors and should be conducted on a scheduled cycle as well as when requested by the drive.\n\n###Recommended action\nWait for any running operation to finish, eject the tape and clean the drive.\n{{else}}\nTape drive no longer needs cleaning. \n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "ape drive {{dimensions.[name]}} needs cleaning",
31 | "runbookUrl": "",
32 | "severity": "Warning",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "hw.status"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "hw.status",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/integration-examples/apiScripts/muteAllAutoDetectors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # This script will mute all auto-detectors.
4 | #
5 | # Edit token.yaml to contain valid
6 | # - Access Token (access_token)
7 | # - Realm (realm)
8 | #
9 | # Syntax: python3 muteAllAutoDetectors.py
10 | # to disable all auto detectors
11 | #
12 | # python3 muteAllAutoDetectors.py -e
13 | # to re-enable all auto detectors
14 |
15 | import argparse
16 | import yaml
17 | import requests
18 | import json
19 |
20 | def muteDetectors(realm, token, enableDisable, responseJSON):
21 | arrDetectors = []
22 | for result in responseJSON['results']:
23 | id = result['id']
24 | name = result['name']
25 | type = result['detectorOrigin']
26 | if type == "AutoDetect":
27 | url = f"https://api.{realm}.signalfx.com/v2/detector/{id}/{enableDisable}"
28 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": f"{token}" }
29 | response = requests.put(url, headers=headers)
30 | if response.status_code == 204:
31 | print(f"SUCCESS: {name} muting {enableDisable}d.")
32 | arrDetectors.append(name)
33 | else:
34 | print(f"ERROR: {name} muting change failed.")
35 | return arrDetectors
36 |
37 | def callAPI(realm, token, bDisable):
38 | arrDetectors = []
39 | limit = 10000
40 | offset = 0
41 |
42 | if bDisable:
43 | enableDisable = "disable"
44 | else:
45 | enableDisable = "enable"
46 |
47 | url = f"https://api.{realm}.signalfx.com/v2/detector?limit={limit}"
48 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": f"{token}" }
49 | response = requests.get(url, headers=headers)
50 | responseJSON = json.loads(response.text)
51 | try:
52 | cnt = responseJSON["count"]
53 | except:
54 | print("ERROR: Check your token, that's the most likely issue.")
55 | print(response.text)
56 | return
57 |
58 | if (cnt > 10000):
59 | print(f'You have more than 10,000 detectors ({cnt} found).')
60 | print('Presenting the results for the first 10,000.')
61 | #break
62 |
63 | arrDetectors = muteDetectors(realm, token, enableDisable, responseJSON)
64 | #print(arrDetectors)
65 |
66 | if __name__ == '__main__':
67 | with open('token.yaml', 'r') as ymlfile:
68 | cfg = yaml.safe_load(ymlfile)
69 |
70 | parser = argparse.ArgumentParser(description='Splunk - Mute All Auto-Detectors')
71 | parser.add_argument('-r', '--realm', help='Realm', required=False)
72 | parser.add_argument('-t', '--token', help='Token', required=False)
73 | parser.add_argument('-e', '--enable', action=argparse.BooleanOptionalAction)
74 | args = parser.parse_args()
75 |
76 | bDisable = True
77 | if args.enable is not None:
78 | bDisable = False
79 |
80 | realm = cfg['realm'] if args.realm is None else args.realm
81 | token = cfg['access_token'] if args.token is None else args.token
82 |
83 | callAPI(realm, token, bDisable)
84 |
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Low fan speed (%).json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727095352339,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYJxNpHAIAA",
12 | "labelResolutions": {
13 | "Hardware - Low fan speed (%)": 1000
14 | },
15 | "lastUpdated": 1727096537448,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Low fan speed (%)",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.fan.speed_ratio').publish(label='A', enable=False)\nB = data('A*100').publish(label='B')\ndetect(when(B < threshold(10))).publish('Hardware - Low fan speed (%)')",
23 | "rules": [
24 | {
25 | "description": "The value of A*100 is below 10.",
26 | "detectLabel": "Hardware - Low fan speed (%)",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Low fan speed\nFan speed for **{{dimensions.[name]}}** is abnormally low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe temperature of the chip, component or device that was cooled down by this fan, may rise rapidly. This could lead to severe hardware damage and system crashes.\n\n###Recommended action\nCheck if the fan no longer cools down the system. If so, replace the fan.\n{{else}}\nRecovered fan speed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Low fan speed",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "A*100"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.fan.speed_ratio",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | },
52 | {
53 | "displayName": "A*100",
54 | "label": "B",
55 | "paletteIndex": null,
56 | "valuePrefix": null,
57 | "valueSuffix": null,
58 | "valueUnit": null
59 | }
60 | ],
61 | "showDataMarkers": true,
62 | "showEventLines": false,
63 | "time": {
64 | "range": 86400000,
65 | "rangeEnd": 0,
66 | "type": "relative"
67 | }
68 | }
69 | }
--------------------------------------------------------------------------------
/.github/workflows/CLA.yaml:
--------------------------------------------------------------------------------
1 | name: "CLA Assistant"
2 | on:
3 | issue_comment:
4 | types: [created]
5 | pull_request_target:
6 | types: [opened, closed, synchronize]
7 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
8 | permissions:
9 | actions: write
10 | contents: write
11 | pull-requests: write
12 | statuses: write
13 | jobs:
14 | CLAAssistant:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - name: "CLA Assistant"
18 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
19 | uses: contributor-assistant/github-action@v2.3.0
20 | env:
21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
22 | # the below token should have repo scope and must be manually added by you in the repository's secret
23 | # This token is required only if you have configured to store the signatures in a remote repository/organization
24 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
25 | with:
26 | path-to-signatures: 'signatures/version1/cla.json'
27 | path-to-document: 'https://github.com/splunk/cla-agreement/blob/main/CLA.md' # e.g. a CLA or a DCO document
28 | # branch should not be protected
29 | branch: 'main'
30 | allowlist: dependabot[bot]
31 | remote-organization-name: splunk
32 | remote-repository-name: cla-agreement
33 | CodeOfConduct:
34 | runs-on: ubuntu-latest
35 | steps:
36 | - name: "COC Assistant"
37 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the Code of Conduct and I hereby accept the Terms') || github.event_name == 'pull_request_target'
38 | uses: cla-assistant/github-action@v2.3.0
39 | env:
40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
41 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
42 | with:
43 | path-to-signatures: "signatures/version1/coc.json"
44 | path-to-document: "https://github.com/splunk/cla-agreement/blob/main/CODE_OF_CONDUCT.md" # e.g. a COC or a DCO document
45 | branch: "main"
46 | allowlist: dependabot[bot]
47 | remote-organization-name: splunk
48 | remote-repository-name: cla-agreement
49 | custom-pr-sign-comment: "I have read the Code of Conduct and I hereby accept the Terms"
50 | create-file-commit-message: "Creating file for storing COC Signatures"
51 | signed-commit-message: "$contributorName has signed the COC in #$pullRequestNo"
52 | custom-notsigned-prcomment: "All contributors have NOT signed the COC Document"
53 | custom-allsigned-prcomment: "****CLA Assistant Lite bot**** All contributors have signed the COC ✍️ ✅"
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Connector failed.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727160940219,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYN3h5fAIBw",
12 | "labelResolutions": {
13 | "Hardware - Connector failed": 1000
14 | },
15 | "lastUpdated": 1727215761634,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": null,
18 | "minDelay": null,
19 | "name": "Hardware - Connector failed",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('metricshub.connector.status', filter=filter('state', 'failed'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Connector failed')",
23 | "rules": [
24 | {
25 | "description": "The value of metricshub.connector.status is above 0.",
26 | "detectLabel": "Hardware - Connector failed",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n## Failed connector\nAgent **{{dimensions.[agent.host.name]}}** is failing to use **{{dimensions.[name]}}** to monitor **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n## Consequence\nAll of the components that were monitored through this connector can no longer be monitored.\n\n## Recommended action\nMake sure {{dimensions.[agent.host.name]}} can communicate with {{dimensions.[host.name]}} with the protocol used by {{dimensions.[name]}} and that the specified credentials in Metrics Hub's configuration are valid.\n{{else}}\nRecovered monitoring with {{dimensions.[name]}} connector.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Hardware - Failed connector on {{dimensions.[host.name]}}",
31 | "runbookUrl": "",
32 | "severity": "Major",
33 | "tip": ""
34 | }
35 | ],
36 | "sf_metricsInObjectProgramText": [
37 | "metricshub.connector.status"
38 | ],
39 | "status": "ACTIVE",
40 | "tags": [],
41 | "teams": [],
42 | "timezone": "",
43 | "visualizationOptions": {
44 | "disableSampling": false,
45 | "publishLabelOptions": [
46 | {
47 | "displayName": "metricshub.connector.status",
48 | "label": "A",
49 | "paletteIndex": null,
50 | "valuePrefix": null,
51 | "valueSuffix": null,
52 | "valueUnit": null
53 | }
54 | ],
55 | "showDataMarkers": true,
56 | "showEventLines": false,
57 | "time": {
58 | "range": 86400000,
59 | "rangeEnd": 0,
60 | "type": "relative"
61 | }
62 | }
63 | }
--------------------------------------------------------------------------------
/detectors/metricshub/Hardware - Low battery.json:
--------------------------------------------------------------------------------
1 | {
2 | "authorizedWriters": {
3 | "teams": [],
4 | "users": []
5 | },
6 | "created": 1727098749573,
7 | "creator": "GRtepaIAICg",
8 | "customProperties": null,
9 | "description": "",
10 | "detectorOrigin": "Standard",
11 | "id": "GYKAzuVAIAU",
12 | "labelResolutions": {
13 | "Hardware - Low battery": 240000
14 | },
15 | "lastUpdated": 1727178856192,
16 | "lastUpdatedBy": "AAAAAAAAAAA",
17 | "maxDelay": 0,
18 | "minDelay": 0,
19 | "name": "Hardware - Low battery",
20 | "overMTSLimit": false,
21 | "packageSpecifications": "",
22 | "programText": "A = data('hw.battery.charge').publish(label='A', enable=False)\nB = (A*100).publish(label='B')\ndetect(when(B < threshold(50))).publish('Hardware - Low battery')",
23 | "rules": [
24 | {
25 | "description": "The value of A*100 is below 50.",
26 | "detectLabel": "Hardware - Low battery",
27 | "disabled": false,
28 | "notifications": [],
29 | "parameterizedBody": "{{#if anomalous}}\n###Low battery\nBattery **{{dimensions.[name]}}** charge is abnormally low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nA low charge battery may lead to data loss in case of a power outage.\n\n###Recommended action\nCheck why the battery is not fully charged (it may be due to a power outage or an unplugged power cable) and fully recharge the battery when possible.\n{{else}}\nThe battery charge is back within the normal operational range.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}",
30 | "parameterizedSubject": "Low battery",
31 | "severity": "Warning"
32 | }
33 | ],
34 | "sf_metricsInObjectProgramText": [
35 | "hw.battery.charge"
36 | ],
37 | "status": "ACTIVE",
38 | "tags": [],
39 | "teams": [],
40 | "timezone": "",
41 | "visualizationOptions": {
42 | "disableSampling": false,
43 | "publishLabelOptions": [
44 | {
45 | "displayName": "hw.battery.charge",
46 | "label": "A",
47 | "paletteIndex": null,
48 | "valuePrefix": null,
49 | "valueSuffix": null,
50 | "valueUnit": null
51 | },
52 | {
53 | "displayName": "A*100",
54 | "label": "B",
55 | "paletteIndex": null,
56 | "valuePrefix": null,
57 | "valueSuffix": null,
58 | "valueUnit": null
59 | }
60 | ],
61 | "showDataMarkers": true,
62 | "showEventLines": false,
63 | "time": {
64 | "range": 900000,
65 | "rangeEnd": 0,
66 | "type": "relative"
67 | }
68 | }
69 | }
--------------------------------------------------------------------------------
/dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/couchdbreceiver_metadata.yaml:
--------------------------------------------------------------------------------
1 | type: couchdb
2 |
3 | status:
4 | class: receiver
5 | stability:
6 | beta: [metrics]
7 | distributions: [contrib,observiq, sumo]
8 | codeowners:
9 | active: [djaglowski]
10 |
11 | resource_attributes:
12 | couchdb.node.name:
13 | description: The name of the node.
14 | type: string
15 | enabled: true
16 |
17 | attributes:
18 | http.method:
19 | description: An HTTP request method.
20 | type: string
21 | enum: [ COPY, DELETE, GET, HEAD, OPTIONS, POST, PUT ]
22 | http.status_code:
23 | description: An HTTP status code.
24 | type: string
25 | view:
26 | description: The view type.
27 | type: string
28 | enum: [ temporary_view_reads, view_reads ]
29 | operation:
30 | description: The operation type.
31 | type: string
32 | enum: [ writes, reads ]
33 |
34 | metrics:
35 | couchdb.average_request_time:
36 | enabled: true
37 | description: The average duration of a served request.
38 | unit: ms
39 | gauge:
40 | value_type: double
41 | couchdb.httpd.bulk_requests:
42 | enabled: true
43 | description: The number of bulk requests.
44 | unit: "{requests}"
45 | sum:
46 | value_type: int
47 | monotonic: true
48 | aggregation_temporality: cumulative
49 | couchdb.httpd.requests:
50 | enabled: true
51 | description: The number of HTTP requests by method.
52 | unit: "{requests}"
53 | sum:
54 | value_type: int
55 | monotonic: true
56 | aggregation_temporality: cumulative
57 | attributes: [ http.method ]
58 | couchdb.httpd.responses:
59 | enabled: true
60 | description: The number of each HTTP status code.
61 | unit: "{responses}"
62 | sum:
63 | value_type: int
64 | monotonic: true
65 | aggregation_temporality: cumulative
66 | attributes: [ http.status_code ]
67 | couchdb.httpd.views:
68 | enabled: true
69 | description: The number of views read.
70 | unit: "{views}"
71 | sum:
72 | value_type: int
73 | monotonic: true
74 | aggregation_temporality: cumulative
75 | attributes: [ view ]
76 | couchdb.database.open:
77 | enabled: true
78 | description: The number of open databases.
79 | unit: "{databases}"
80 | sum:
81 | value_type: int
82 | monotonic: false
83 | aggregation_temporality: cumulative
84 | couchdb.file_descriptor.open:
85 | enabled: true
86 | description: The number of open file descriptors.
87 | unit: "{files}"
88 | sum:
89 | value_type: int
90 | monotonic: false
91 | aggregation_temporality: cumulative
92 | couchdb.database.operations:
93 | enabled: true
94 | description: The number of database operations.
95 | unit: "{operations}"
96 | sum:
97 | value_type: int
98 | monotonic: true
99 | aggregation_temporality: cumulative
100 | attributes: [ operation ]
101 |
--------------------------------------------------------------------------------