├── .github └── workflows │ └── CLA.yaml ├── .gitignore ├── CLA.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── dashboards-and-dashboard-groups ├── README.md ├── RUM-Real-User-Monitoring │ ├── README.md │ ├── rum-app-charts.tf │ ├── rum-browser-charts.tf │ ├── rum-dashboards.tf │ └── rum-synthetic-charts.tf ├── SC4SNMP │ ├── Network Devices (SNMP).json │ ├── README.md │ └── dashboard_SNMP Agents.json ├── SLO-Error-Budget │ ├── README.md │ ├── SLO-SLx.tf │ └── main.tf ├── executive-dashboards │ ├── APM_IMM-Exec.tf │ ├── Exec-Level-Dashboard-Group.json │ ├── Logs-Exec.tf │ ├── README.md │ ├── RUM-Exec.tf │ ├── Token-Usage-Exec.tf │ ├── Usage-Overview-Exec.tf │ └── main.tf ├── inferred-services-dg │ ├── Dashboard_Group_Inferred Services.json │ ├── Inferred-services-DashboardGroup.png │ └── README.md ├── metricshub │ ├── MetricsHub.json │ └── README.md ├── oracle-cloud │ ├── README.md │ ├── dashboard_group_OCC.json │ ├── occ-instance.png │ └── occ-instances.png ├── otel-receiver-dashboard-generator │ ├── .gitignore │ ├── README.md │ ├── observability-tf-configs │ │ ├── activedirectorydsreceiver_metadata.yaml.tf │ │ ├── aerospikereceiver_metadata.yaml.tf │ │ ├── apachereceiver_metadata.yaml.tf │ │ ├── apachesparkreceiver_metadata.yaml.tf │ │ ├── bigipreceiver_metadata.yaml.tf │ │ ├── chronyreceiver_metadata.yaml.tf │ │ ├── couchdbreceiver_metadata.yaml.tf │ │ ├── dockerstatsreceiver_metadata.yaml.tf │ │ ├── elasticsearchreceiver_metadata.yaml.tf │ │ ├── expvarreceiver_metadata.yaml.tf │ │ ├── filestatsreceiver_metadata.yaml.tf │ │ ├── flinkmetricsreceiver_metadata.yaml.tf │ │ ├── gitproviderreceiver_metadata.yaml.tf │ │ ├── haproxyreceiver_metadata.yaml.tf │ │ ├── httpcheckreceiver_metadata.yaml.tf │ │ ├── iisreceiver_metadata.yaml.tf │ │ ├── k8sclusterreceiver_metadata.yaml.tf │ │ ├── kafkametricsreceiver_metadata.yaml.tf │ │ ├── kubeletstatsreceiver_metadata.yaml.tf │ │ ├── memcachedreceiver_metadata.yaml.tf │ │ ├── mongodbatlasreceiver_metadata.yaml.tf │ │ ├── mongodbreceiver_metadata.yaml.tf │ │ ├── mysqlreceiver_metadata.yaml.tf │ │ ├── nginxreceiver_metadata.yaml.tf │ │ ├── nsxtreceiver_metadata.yaml.tf │ │ ├── oracledbreceiver_metadata.yaml.tf │ │ ├── postgresqlreceiver_metadata.yaml.tf │ │ ├── rabbitmqreceiver_metadata.yaml.tf │ │ ├── redisreceiver_metadata.yaml.tf │ │ ├── riakreceiver_metadata.yaml.tf │ │ ├── saphanareceiver_metadata.yaml.tf │ │ ├── snowflakereceiver_metadata.yaml.tf │ │ ├── splunkenterprisereceiver_metadata.yaml.tf │ │ ├── sqlserverreceiver_metadata.yaml.tf │ │ ├── sshcheckreceiver_metadata.yaml.tf │ │ ├── vcenterreceiver_metadata.yaml.tf │ │ └── zookeeperreceiver_metadata.yaml.tf │ ├── otel-dashboard-o11y.py │ ├── otel-receiver-yaml │ │ ├── activedirectorydsreceiver_metadata.yaml │ │ ├── aerospikereceiver_metadata.yaml │ │ ├── apachereceiver_metadata.yaml │ │ ├── apachesparkreceiver_metadata.yaml │ │ ├── bigipreceiver_metadata.yaml │ │ ├── chronyreceiver_metadata.yaml │ │ ├── couchdbreceiver_metadata.yaml │ │ ├── dockerstatsreceiver_metadata.yaml │ │ ├── elasticsearchreceiver_metadata.yaml │ │ ├── expvarreceiver_metadata.yaml │ │ ├── filestatsreceiver_metadata.yaml │ │ ├── flinkmetricsreceiver_metadata.yaml │ │ ├── gitproviderreceiver_metadata.yaml │ │ ├── haproxyreceiver_metadata.yaml │ │ ├── httpcheckreceiver_metadata.yaml │ │ ├── iisreceiver_metadata.yaml │ │ ├── k8sclusterreceiver_metadata.yaml │ │ ├── kafkametricsreceiver_metadata.yaml │ │ ├── kubeletstatsreceiver_metadata.yaml │ │ ├── memcachedreceiver_metadata.yaml │ │ ├── 
mongodbatlasreceiver_metadata.yaml │ │ ├── mongodbreceiver_metadata.yaml │ │ ├── mysqlreceiver_metadata.yaml │ │ ├── nginxreceiver_metadata.yaml │ │ ├── nsxtreceiver_metadata.yaml │ │ ├── oracledbreceiver_metadata.yaml │ │ ├── postgresqlreceiver_metadata.yaml │ │ ├── rabbitmqreceiver_metadata.yaml │ │ ├── redisreceiver_metadata.yaml │ │ ├── riakreceiver_metadata.yaml │ │ ├── saphanareceiver_metadata.yaml │ │ ├── snowflakereceiver_metadata.yaml │ │ ├── splunkenterprisereceiver_metadata.yaml │ │ ├── sqlserverreceiver_metadata.yaml │ │ ├── sshcheckreceiver_metadata.yaml │ │ ├── vcenterreceiver_metadata.yaml │ │ └── zookeeperreceiver_metadata.yaml │ ├── pull-otel-yaml.py │ └── requirements.txt └── snowflakedb │ ├── Configuration │ ├── README.md │ ├── agent_config.yaml │ ├── snowflake-metrics.yaml │ ├── snowflake-other-metrics.yaml │ ├── snowflake-receiver │ │ ├── agent_config.yaml │ │ └── splunk-otel-collector.conf │ └── splunk-otel-collector.conf │ ├── Dashboards │ ├── Individual Dashboards │ │ ├── dashboard_Snowflake_opentelemetry_Cost.json │ │ ├── dashboard_Snowflake_opentelemetry_Database.json │ │ ├── dashboard_Snowflake_opentelemetry_Home.json │ │ ├── dashboard_Snowflake_opentelemetry_Queries.json │ │ ├── dashboard_Snowflake_opentelemetry_Schema.json │ │ ├── dashboard_Snowflake_opentelemetry_Security.json │ │ ├── dashboard_Snowflake_opentelemetry_Warehouse.json │ │ ├── dashboard_Snowflake_smartagent_Cost.json │ │ ├── dashboard_Snowflake_smartagent_Database.json │ │ ├── dashboard_Snowflake_smartagent_Home.json │ │ ├── dashboard_Snowflake_smartagent_Queries.json │ │ ├── dashboard_Snowflake_smartagent_Query_Details.json │ │ ├── dashboard_Snowflake_smartagent_Schema.json │ │ ├── dashboard_Snowflake_smartagent_Security.json │ │ └── dashboard_Snowflake_smartagent_Warehouse.json │ ├── Snowflake_DashboardGroup_opentelemetry.json │ └── Snowflake_DashboardGroup_smartagent.json │ └── README.md ├── detectors ├── README.md ├── inferred-services-detectors │ ├── POST_Detector_error_rate.sh │ ├── POST_Detector_latency_spike.sh │ ├── README.md │ ├── detectors-1.png │ ├── detectors-errors.png │ └── detectors-latency.png ├── metricshub │ ├── Hardware - Connector failed.json │ ├── Hardware - Critical LUN pathing issue.json │ ├── Hardware - Critically high temperature.json │ ├── Hardware - Critically low battery.json │ ├── Hardware - Critically low fan speed (%).json │ ├── Hardware - Critically low fan speed.json │ ├── Hardware - Device status degraded.json │ ├── Hardware - Device status failed.json │ ├── Hardware - High number of errors.json │ ├── Hardware - High power usage.json │ ├── Hardware - High temperature.json │ ├── Hardware - High voltage.json │ ├── Hardware - LUN multi-pathing issue.json │ ├── Hardware - Low battery.json │ ├── Hardware - Low fan speed (%).json │ ├── Hardware - Low fan speed.json │ ├── Hardware - Low voltage.json │ ├── Hardware - Missing device.json │ ├── Hardware - Network errors.json │ ├── Hardware - Networking link down.json │ ├── Hardware - Physical intrusion.json │ ├── Hardware - Power capacity.json │ ├── Hardware - Predicted failure.json │ ├── Hardware - Tape drive needs cleaning.json │ └── README.md └── snowflakedb │ ├── README.md │ ├── Snowflake - % of spend for Cloud Service costs.json │ ├── Snowflake - Blocked Queries.json │ ├── Snowflake - Credits used per Warehouse (Anomaly).json │ ├── Snowflake - DB Error Rate.json │ ├── Snowflake - Database Errors.json │ ├── Snowflake - Login Failures By User.json │ ├── Snowflake - Long Queries in Small X-Small warehouses.json │ ├── 
Snowflake - Long Queries over 15m.json │ ├── Snowflake - No Queries in Last 3 Hours.json │ ├── Snowflake - Overloaded Queries.json │ └── Snowflake - Queued Queries longer than 5 minutes.json └── integration-examples ├── README.md ├── active_detectors ├── README.md ├── active_detectors.py ├── images │ └── screenshot.png └── requirements.txt ├── apiScripts ├── README.md ├── addEmailToDetectors.py ├── getMetricsForHost.py ├── muteAllAutoDetectors.py ├── requirements.txt └── token.yaml ├── azure-devops └── README.md ├── ci-webhook-serverless ├── README.md ├── ci-webhook-handler │ ├── README.md │ ├── handler.py │ ├── requirements.txt │ └── serverless.yml ├── demo-dashbaord-group.json └── generate-test-events.py ├── jenkins-apm └── README.md ├── lambda-vpc-connection-sample ├── .gitignore ├── ORIGINAL_README.md ├── README.md ├── __init__.py ├── events │ └── event.json ├── hello_world │ ├── __init__.py │ ├── app.py │ └── requirements.txt ├── template.yaml └── tests │ ├── __init__.py │ ├── integration │ ├── __init__.py │ └── test_api_gateway.py │ ├── requirements.txt │ └── unit │ ├── __init__.py │ └── test_handler.py ├── observability-microservices-jumpstart └── README.md ├── splunk-otel-databricks ├── README.md └── splunk-start-up.sh ├── splunk-otel-dotnet-docker ├── .gitignore ├── MultiStageDocker │ ├── Dockerfile │ ├── MultiStageDocker.csproj │ └── entrypoint.sh ├── MultiStageDockerNuGetOption │ ├── Dockerfile │ └── MultiStageDocker.csproj ├── Program.cs └── README.md ├── synthetics-examples ├── API │ ├── graphql-api │ │ ├── README.md │ │ └── synthetics_example_graphql_api_check.tf │ ├── status-page-to-metrics-api │ │ ├── README.md │ │ ├── image.png │ │ ├── synthetic-variables.png │ │ └── synthetics_thirdparty_status_api_check.tf │ ├── status-to-splunk-hec │ │ ├── README.md │ │ └── synthetics_status_to_splunk_hec_api_check.tf │ └── token-expiration-to-metrics-api │ │ ├── README.md │ │ ├── detector_token_expiration.tf │ │ ├── synthetic-variables.png │ │ ├── synthetics_token_expiration_api_check.tf │ │ └── token-expire-chart.png └── Browser │ ├── hipstershop-complete-order-test-browser │ ├── README.md │ └── synthetics_hipstershop_order_completion_browser_check.tf │ └── o11y-login-apm-loading-browser │ ├── README.md │ └── synthetics_o11y_login_apm_loading_browser_check.tf ├── system-scanner ├── README.md ├── dotnet_framework.py ├── health.py ├── main.py ├── os_info.py ├── runtime_versions.py ├── utils.py └── validators.py ├── terraform-jumpstart ├── .gitignore ├── README.md ├── export_script │ ├── .gitignore │ ├── README.md │ ├── export.py │ └── requirements.txt ├── main.tf ├── modules │ ├── aws │ │ ├── ecs.tf │ │ ├── elb.tf │ │ ├── lambda.tf │ │ ├── rds.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── azure │ │ ├── sql.tf │ │ ├── variables.tf │ │ ├── versions.tf │ │ └── vm.tf │ ├── dashboards │ │ ├── executive-dashboards │ │ │ ├── APM_IMM-Exec.tf │ │ │ ├── Logs-Exec.tf │ │ │ ├── RUM-Exec.tf │ │ │ ├── Token-Usage-Exec.tf │ │ │ ├── Usage-Overview-Exec.tf │ │ │ ├── main.tf │ │ │ ├── variables.tf │ │ │ └── versions.tf │ │ ├── parent │ │ │ ├── children-charts.tf │ │ │ ├── main.tf │ │ │ ├── variables.tf │ │ │ └── versions.tf │ │ ├── rum_and_synthetics │ │ │ ├── main.tf │ │ │ ├── rum_and_synthetics.tf │ │ │ ├── synthetics.tf │ │ │ ├── synthetics_with_trends.tf │ │ │ ├── variables.tf │ │ │ └── versions.tf │ │ └── usage │ │ │ ├── host-model-dashboard.tf │ │ │ ├── main.tf │ │ │ ├── mts-events-dashboard.tf │ │ │ ├── variables.tf │ │ │ └── versions.tf │ ├── docker │ │ ├── container.tf │ │ ├── variables.tf │ 
│ └── versions.tf │ ├── gcp │ │ ├── compute.tf │ │ ├── storage.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── host │ │ ├── cpu.tf │ │ ├── disk.tf │ │ ├── mem.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── kafka │ │ ├── kafka-dashboards.tf │ │ ├── kafka.tf │ │ ├── variables.tf │ │ └── versions.tf │ ├── kubernetes │ │ ├── cluster.tf │ │ ├── container.tf │ │ ├── node.tf │ │ ├── pod.tf │ │ ├── variables.tf │ │ └── versions.tf │ └── pivotal │ │ ├── CellMetrics.tf │ │ ├── RouteEmitter.tf │ │ ├── auctioneer.tf │ │ ├── diego.tf │ │ ├── gorouter.tf │ │ ├── logs.tf │ │ ├── system.tf │ │ ├── variables.tf │ │ └── versions.tf ├── terraform.tfvars.template ├── variables.tf └── versions.tf ├── token-expiration-monitor ├── README.md └── splunk_o11y_token_health.py └── usage-reports-scripts ├── README.md ├── custom-metric-report-parser.py ├── images └── custom-metric-report.png └── requirements.txt /.github/workflows/CLA.yaml: -------------------------------------------------------------------------------- 1 | name: "CLA Assistant" 2 | on: 3 | issue_comment: 4 | types: [created] 5 | pull_request_target: 6 | types: [opened, closed, synchronize] 7 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings 8 | permissions: 9 | actions: write 10 | contents: write 11 | pull-requests: write 12 | statuses: write 13 | jobs: 14 | CLAAssistant: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: "CLA Assistant" 18 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' 19 | uses: contributor-assistant/github-action@v2.3.0 20 | env: 21 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 22 | # the below token should have repo scope and must be manually added by you in the repository's secret 23 | # This token is required only if you have configured to store the signatures in a remote repository/organization 24 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 25 | with: 26 | path-to-signatures: 'signatures/version1/cla.json' 27 | path-to-document: 'https://github.com/splunk/cla-agreement/blob/main/CLA.md' # e.g. a CLA or a DCO document 28 | # branch should not be protected 29 | branch: 'main' 30 | allowlist: dependabot[bot] 31 | remote-organization-name: splunk 32 | remote-repository-name: cla-agreement 33 | CodeOfConduct: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: "COC Assistant" 37 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the Code of Conduct and I hereby accept the Terms') || github.event_name == 'pull_request_target' 38 | uses: cla-assistant/github-action@v2.3.0 39 | env: 40 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 41 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 42 | with: 43 | path-to-signatures: "signatures/version1/coc.json" 44 | path-to-document: "https://github.com/splunk/cla-agreement/blob/main/CODE_OF_CONDUCT.md" # e.g. 
a COC or a DCO document 45 | branch: "main" 46 | allowlist: dependabot[bot] 47 | remote-organization-name: splunk 48 | remote-repository-name: cla-agreement 49 | custom-pr-sign-comment: "I have read the Code of Conduct and I hereby accept the Terms" 50 | create-file-commit-message: "For example: Creating file for storing COC Signatures" 51 | signed-commit-message: "$contributorName has signed the COC in #$pullRequestNo" 52 | custom-notsigned-prcomment: "All contributors have NOT signed the COC Document" 53 | custom-allsigned-prcomment: "****CLA Assistant Lite bot**** All contributors have signed the COC ✍️ ✅" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # VSCode 2 | .vscode/* 3 | !.vscode/settings.json 4 | !.vscode/tasks.json 5 | !.vscode/launch.json 6 | !.vscode/extensions.json 7 | *.code-workspace 8 | 9 | # Local History for Visual Studio Code 10 | .history/ 11 | 12 | # Common credential files 13 | **/credentials.json 14 | **/client_secrets.json 15 | **/client_secret.json 16 | *creds* 17 | *.dat 18 | *password* 19 | *.httr-oauth* 20 | 21 | 22 | # Mac/OSX 23 | .DS_Store 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Environments 50 | .env 51 | .venv 52 | env/ 53 | venv/ 54 | ENV/ 55 | env.bak/ 56 | venv.bak/ 57 | 58 | # Terraform 59 | .terraform.lock.hcl 60 | .terraform/ 61 | terraform.tfstate 62 | terraform.tfstate.backup -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are encouraged and greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs/Issues 11 | 12 | If you are reporting a bug or issue, please include: 13 | 14 | - Operating system name and version. 15 | - Any details about your local setup that might be helpful 16 | in troubleshooting (e.g. Python version if you're using a Python script, Terraform version if you're using a Terraform script). 17 | - Detailed steps to reproduce the bug. 18 | 19 | ### Fix Bugs 20 | 21 | Check the Issues for this repo on GitHub. Anything tagged with 22 | a "bug" ticket type is open to whoever wants to fix it. 23 | 24 | ### Implement Features 25 | 26 | If you have a great set of dashboards, detectors, API scripts for sending metrics, or any other content 27 | you believe will be of use to others, please contribute it! 28 | 29 | Or check the Issues for this repo on GitHub. Anything tagged with "enhancement" 30 | and "help wanted" is open to whoever wants to implement it. 31 | 32 | ### Write Documentation 33 | 34 | Submissions and `README.md` files could always use more documentation. Documentation can always use an update or tweak, whether in the official docs, in docstrings of scripts, in comments in configs, or anywhere a bit of clarity may be useful. 35 | 36 | ### Submit Feedback 37 | 38 | If you are proposing a feature: 39 | 40 | - Explain in detail how it would work. 
41 | - Keep the scope as narrow as possible, to make it easier 42 | to implement. 43 | - Remember that this is a volunteer-driven project, and that 44 | contributions are welcome :) 45 | 46 | ## Pull Request Guidelines 47 | 48 | Before you submit a pull request, check that it meets these guidelines: 49 | 50 | 1. The pull request should include a `README.md` for any new submission. 51 | 2. If the pull request adds functionality, the `README.md` docs for that component or submission should be updated. 52 | Put your new functionality into a function with a docstring, and add 53 | the feature to the list in README.md. 54 | 3. Terraform submissions should work with the most current version of the included Terraform Provider. 55 | 4. Python submissions should work with Python 3. 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contribution repository for Splunk Observability Content 2 | 3 | This repository exists to enable sharing of content. No formal testing is 4 | required, so it just might work. Some examples of content which would fit here: 5 | 6 | * Dashboards 7 | * Detectors 8 | * API Examples 9 | * Usage Reports 10 | * OTel Example Configurations 11 | * Links to Other Relevant Projects 12 | 13 | ## Contributions 14 | Contributions are welcome and encouraged! 15 | 16 | Please see [CONTRIBUTING.md](./CONTRIBUTING.md) for details on contributing to this repository. 17 | 18 | All contributors must sign the CLA and Code of Conduct. You will be prompted by the [cla-assistant](https://github.com/marketplace/actions/cla-assistant-lite) workflow action during your Pull Request for your agreement. 19 | 20 | To agree to the CLA and COC please comment these in **separate individual messages** on your PR: 21 | 22 | CLA: 23 | ``` 24 | I have read the CLA Document and I hereby sign the CLA 25 | ``` 26 | 27 | Code of Conduct: 28 | ``` 29 | I have read the Code of Conduct and I hereby accept the Terms 30 | ``` -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/RUM-Real-User-Monitoring/README.md: -------------------------------------------------------------------------------- 1 | # Real User Monitoring Detailed Dashboards 2 | 3 | This directory contains detail-oriented dashboards and required chart definitions in Terraform for: 4 | - RUM Apps 5 | - RUM Browsers 6 | - RUM Synthetics 7 | 8 | Each of these dashboards is meant as a place to look at details of the specific metrics RUM provides, split by Browser, App, or Rigor Synthetics. 9 | 10 | To use: 11 | 12 | ``` 13 | terraform init --upgrade 14 | terraform plan -var="access_token=" -var="realm=" 15 | terraform apply -auto-approve -var="access_token=" -var="realm=" 16 | ``` 17 | 18 | And to remove: 19 | 20 | ``` 21 | terraform destroy -auto-approve -var="access_token=" -var="realm=" 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/SC4SNMP/README.md: -------------------------------------------------------------------------------- 1 | # SC4SNMP (Splunk Connect for SNMP) Dashboard 2 | 3 | This folder contains a dashboard, `SNMP Agents` (WIP), to get insights out of [SC4SNMP](https://splunk.github.io/splunk-connect-for-snmp/main) metrics. This dashboard was built for `snmpd` running on Linux. Metric names may be different for other devices, which would require the dashboard to be updated. 
4 | 5 | There is also a dashboard group, `Network Devices (SNMP)`, which provides basic device, interface, and protocol metrics 6 | for devices based on the MIB-2 metrics (which should be available in basically any network device). This dashboard group is 7 | recommended as an initial dashboard for edge network device monitoring. 8 | 9 | ## Setup 10 | 11 | This dashboard requires [SC4SNMP](https://splunk.github.io/splunk-connect-for-snmp/main) to be set up: 12 | 13 | - [SC4SNMP official documentation](https://splunk.github.io/splunk-connect-for-snmp/main) 14 | - [Walkthrough of SC4SNMP setup with Linux agents running `snmpd`](https://smathur-splunk.github.io/workshops/snmp_intro) 15 | 16 | Follow these links to set up and configure SC4SNMP to send data to O11y Cloud. Set up SNMP agents as described in the second link and configure them for polling by SC4SNMP. 17 | 18 | The dashboard should populate automatically, but metric names may need changing as they can vary from agent to agent. 19 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/SLO-Error-Budget/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | version = ">=6.13.1" 6 | } 7 | } 8 | } 9 | 10 | variable "signalfx_auth_token" { 11 | type=string 12 | } 13 | 14 | provider "signalfx" { 15 | auth_token = "${var.signalfx_auth_token}" 16 | # If your organization uses a different realm 17 | # api_url = "https://api.us2.signalfx.com" 18 | # If your organization uses a custom URL 19 | # custom_app_url = "https://myorg.signalfx.com" 20 | } 21 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/executive-dashboards/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | version = ">=6.13.1" 6 | } 7 | } 8 | } 9 | 10 | variable "signalfx_auth_token" { 11 | type=string 12 | } 13 | 14 | provider "signalfx" { 15 | auth_token = "${var.signalfx_auth_token}" 16 | # If your organization uses a different realm 17 | # api_url = "https://api.us2.signalfx.com" 18 | # If your organization uses a custom URL 19 | # custom_app_url = "https://myorg.signalfx.com" 20 | } 21 | 22 | 23 | ### Create a Dashboard Group for our Dashboards 24 | resource "signalfx_dashboard_group" "exec_dashboard_group" { 25 | name = "Exec Level Dashboards" 26 | description = "Executive Level Dashboards" 27 | 28 | ### Note that if you use these features, you must use a user's 29 | ### admin key to authenticate the provider, lest Terraform not be able 30 | ### to modify the dashboard group in the future! 
31 | #authorized_writer_teams = [signalfx_team.mycoolteam.id] 32 | #authorized_writer_users = ["abc123"] 33 | } 34 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/dashboards-and-dashboard-groups/inferred-services-dg/Inferred-services-DashboardGroup.png -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/inferred-services-dg/README.md: -------------------------------------------------------------------------------- 1 | # Inferred Services - assets to help observing 2 | 3 | 1. [Dashboard Group - Inferred Services](./Dashboard_Group_Inferred%20Services.json) 4 | 5 | Feel free to also use 6 | 7 | 2. [Sample Detectors: Latency Spike (>3s for 90% of 5min); Error Rate (>50%, sudden change)](../../detectors/inferred-services-detectors/README.md) 8 | 9 | Learn more about Inferred Services: 10 | - [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html) 11 | - [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions) 12 | 13 | ## Inferred Services - Dashboard Group 14 | 15 | 1. Import Dashboard Group 16 | *From UI:* 17 | Click on '+' on the top right and select Import->Dashboard Group. 18 | 19 | 2. Find your dashboard group `Inferred Services` and use as a starting point to create charts. 20 | 21 | Screenshot: 22 | ![Dashboard Group 'Inferred Services'](./Inferred-services-DashboardGroup.png) -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/metricshub/README.md: -------------------------------------------------------------------------------- 1 | # MetricsHub 2 | 3 | ## Overview 4 | 5 | **MetricsHub** is a universal metrics collection agent designed for monitoring hardware components, system performance, and sustainability KPIs. It collects data from servers, storage systems, and network devices and pushes it to OpenTelemetry back-ends such as the Splunk Observability Cloud. 6 | 7 | ### Key Features 8 | 9 | - **Remote Monitoring**: MetricsHub supports the monitoring of thousands of systems remotely through protocols such as REST APIs, SNMP, WBEM, WMI, SSH, IPMI, and more. 10 | - **OpenTelemetry Integration**: MetricsHub acts as an OpenTelemetry agent, following its standards for easy integration with various observability platforms. 11 | - **Sustainability Metrics**: Track and report on energy usage and carbon footprint to optimize infrastructure efficiency. 12 | - **250+ Connectors**: Ready-to-use connectors for monitoring a wide variety of platforms. MetricsHub agent is truly vendor-neutral, providing consistent coverage for all manufacturers (e.g., Cisco, Dell EMC, Huawei, HP, IBM, Lenovo, Pure, and more). 13 | 14 | ### Dashboards 15 | 16 | MetricsHub comes with pre-configured dashboards that visualize hardware, as well as sustainability KPIs: 17 | 18 | | Dashboard | Description | 19 | | --- | --- | 20 | | **Hardware - Main** | Overview of all monitored systems, focusing on key hardware and sustainability metrics. 
| 21 | | **Hardware - Site** | Metrics specific to a particular site (a data center or a server room) and its monitored hosts. | 22 | | **Hardware - Host** | Metrics associated with one *host* and its internal devices. | 23 | 24 | ## Setup 25 | 26 | 1. Follow the [installation instructions](https://metricshub.com/docs/latest/installation/index.html) 27 | 2. Configure the OpenTelemetry Collector to export metrics to Splunk by editing `otel-config.yaml`: 28 | 29 | ```yaml 30 | exporters: 31 | signalfx: 32 | # Access token to send data to SignalFx. 33 | access_token: 34 | # SignalFx realm where the data will be received. 35 | realm: 36 | ``` 37 | 38 | Get more information about the [SignalFx Metrics Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/signalfxexporter). 39 | 40 | ## Support 41 | 42 | Subscribers to **MetricsHub** gain access to the **MetricsHub Support Desk**, which provides: 43 | 44 | - Technical support 45 | - Patches and updates 46 | - Knowledge base access 47 | 48 | Splunk does not provide support for these dashboards and users should contact Sentry Software's support with any support requests. 49 | 50 | ### Further Reading 51 | 52 | For more information, visit the [MetricsHub](https://metricshub.com/) website. 53 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/oracle-cloud/README.md: -------------------------------------------------------------------------------- 1 | # Oracle Cloud Infrastructure (OCI) Example Content 2 | 3 | NOTE: The example content included here presumes you have connected to Oracle 4 | Cloud using the metrics forwarding integration available [here](https://github.com/splunk/oracle-cloud-examples-splunk-observability/tree/master/samples/oci-monitoring-metrics-to-splunk-observability-python). 5 | You may also want to collect log events using the integration available [here](https://github.com/splunk/oracle-cloud-examples-splunk-observability/tree/master/samples/oci-logs-splunk-hec) 6 | 7 | You should be able to import these examples directly using the UI. The option is 8 | available using the "+"(upper right corner) -> Import -> Dashboard Group option. 9 | 10 | The content will provide: 11 | 12 | 1. Aggregate View: 13 | 14 | ![Aggregate View Screenshot](./occ-instances.png) 15 | 16 | 2. 
Instance View: 17 | 18 | ![Instance View Screenshot](./occ-instance.png) 19 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/oracle-cloud/occ-instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/dashboards-and-dashboard-groups/oracle-cloud/occ-instance.png -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/oracle-cloud/occ-instances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/dashboards-and-dashboard-groups/oracle-cloud/occ-instances.png -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/filestatsreceiver_metadata.yaml.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "signalfx_dashboard" "filestatsdashboard" { 3 | name = "filestats" 4 | dashboard_group = signalfx_dashboard_group.filestatsdashboardgroup0.id 5 | time_range = "-1h" 6 | 7 | grid { 8 | chart_ids = [ 9 | signalfx_time_chart.file_mtime.id, signalfx_time_chart.file_ctime.id, signalfx_time_chart.file_atime.id, signalfx_time_chart.file_size.id 10 | ] 11 | width = 4 12 | height = 1 13 | } 14 | } 15 | 16 | resource "signalfx_dashboard_group" "filestatsdashboardgroup0" { 17 | name = "filestats generated OTel dashboard group" 18 | description = "filestats generated OTel dashboard group" 19 | } 20 | 21 | resource "signalfx_time_chart" "file_mtime" { 22 | name = "Elapsed time since the last modification of the file or folder, in seconds since Epoch." 23 | 24 | program_text = <<-EOF 25 | data("file.mtime").publish(label="Elapsed time since the last modification of the file or folder, in seconds since Epoch.") 26 | EOF 27 | 28 | time_range = 14400 29 | 30 | plot_type = "LineChart" 31 | show_data_markers = true 32 | } 33 | 34 | 35 | resource "signalfx_time_chart" "file_ctime" { 36 | name = "Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file." 37 | 38 | program_text = <<-EOF 39 | data("file.ctime").publish(label="Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file.") 40 | EOF 41 | 42 | time_range = 14400 43 | 44 | plot_type = "LineChart" 45 | show_data_markers = true 46 | } 47 | 48 | 49 | resource "signalfx_time_chart" "file_atime" { 50 | name = "Elapsed time since last access of the file or folder, in seconds since Epoch." 51 | 52 | program_text = <<-EOF 53 | data("file.atime").publish(label="Elapsed time since last access of the file or folder, in seconds since Epoch.") 54 | EOF 55 | 56 | time_range = 14400 57 | 58 | plot_type = "LineChart" 59 | show_data_markers = true 60 | } 61 | 62 | 63 | resource "signalfx_time_chart" "file_size" { 64 | name = "The size of the file or folder, in bytes." 
65 | 66 | program_text = <<-EOF 67 | data("file.size").publish(label="The size of the file or folder, in bytes.") 68 | EOF 69 | 70 | time_range = 14400 71 | 72 | plot_type = "LineChart" 73 | show_data_markers = true 74 | } 75 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/gitproviderreceiver_metadata.yaml.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "signalfx_dashboard" "gitproviderdashboard" { 3 | name = "gitprovider" 4 | dashboard_group = signalfx_dashboard_group.gitproviderdashboardgroup0.id 5 | time_range = "-1h" 6 | 7 | grid { 8 | chart_ids = [ 9 | signalfx_time_chart.git_repository_count.id, signalfx_time_chart.git_repository_branch_count.id, signalfx_time_chart.git_repository_contributor_count.id 10 | ] 11 | width = 4 12 | height = 1 13 | } 14 | } 15 | 16 | resource "signalfx_dashboard_group" "gitproviderdashboardgroup0" { 17 | name = "gitprovider generated OTel dashboard group" 18 | description = "gitprovider generated OTel dashboard group" 19 | } 20 | 21 | resource "signalfx_time_chart" "git_repository_count" { 22 | name = "Number of repositories in an organization" 23 | 24 | program_text = <<-EOF 25 | data("git.repository.count").publish(label="Number of repositories in an organization") 26 | EOF 27 | 28 | time_range = 14400 29 | 30 | plot_type = "LineChart" 31 | show_data_markers = true 32 | } 33 | 34 | 35 | resource "signalfx_time_chart" "git_repository_branch_count" { 36 | name = "Number of branches in the repository" 37 | 38 | program_text = <<-EOF 39 | data("git.repository.branch.count").publish(label="Number of branches in the repository") 40 | EOF 41 | 42 | time_range = 14400 43 | 44 | plot_type = "LineChart" 45 | show_data_markers = true 46 | } 47 | 48 | 49 | resource "signalfx_time_chart" "git_repository_contributor_count" { 50 | name = "Total number of unique contributors to this repository" 51 | 52 | program_text = <<-EOF 53 | data("git.repository.contributor.count").publish(label="Total number of unique contributors to this repository") 54 | EOF 55 | 56 | time_range = 14400 57 | 58 | plot_type = "LineChart" 59 | show_data_markers = true 60 | } 61 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/httpcheckreceiver_metadata.yaml.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "signalfx_dashboard" "httpcheckdashboard" { 3 | name = "httpcheck" 4 | dashboard_group = signalfx_dashboard_group.httpcheckdashboardgroup0.id 5 | time_range = "-1h" 6 | 7 | grid { 8 | chart_ids = [ 9 | signalfx_time_chart.httpcheck_status.id, signalfx_time_chart.httpcheck_duration.id, signalfx_time_chart.httpcheck_error.id 10 | ] 11 | width = 4 12 | height = 1 13 | } 14 | } 15 | 16 | resource "signalfx_dashboard_group" "httpcheckdashboardgroup0" { 17 | name = "httpcheck generated OTel dashboard group" 18 | description = "httpcheck generated OTel dashboard group" 19 | } 20 | 21 | resource "signalfx_time_chart" "httpcheck_status" { 22 | name = "1 if the check resulted in status_code matching the status_class, otherwise 0." 
23 | 24 | program_text = <<-EOF 25 | data("httpcheck.status").publish(label="1 if the check resulted in status_code matching the status_class, otherwise 0.") 26 | EOF 27 | 28 | time_range = 14400 29 | 30 | plot_type = "LineChart" 31 | show_data_markers = true 32 | } 33 | 34 | 35 | resource "signalfx_time_chart" "httpcheck_duration" { 36 | name = "Measures the duration of the HTTP check." 37 | 38 | program_text = <<-EOF 39 | data("httpcheck.duration").publish(label="Measures the duration of the HTTP check.") 40 | EOF 41 | 42 | time_range = 14400 43 | 44 | plot_type = "LineChart" 45 | show_data_markers = true 46 | } 47 | 48 | 49 | resource "signalfx_time_chart" "httpcheck_error" { 50 | name = "Records errors occurring during HTTP check." 51 | 52 | program_text = <<-EOF 53 | data("httpcheck.error").publish(label="Records errors occurring during HTTP check.") 54 | EOF 55 | 56 | time_range = 14400 57 | 58 | plot_type = "LineChart" 59 | show_data_markers = true 60 | } 61 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/observability-tf-configs/nginxreceiver_metadata.yaml.tf: -------------------------------------------------------------------------------- 1 | 2 | resource "signalfx_dashboard" "nginxdashboard" { 3 | name = "nginx" 4 | dashboard_group = signalfx_dashboard_group.nginxdashboardgroup0.id 5 | time_range = "-1h" 6 | 7 | grid { 8 | chart_ids = [ 9 | signalfx_time_chart.nginx_requests.id, signalfx_time_chart.nginx_connections_accepted.id, signalfx_time_chart.nginx_connections_handled.id, signalfx_time_chart.nginx_connections_current.id 10 | ] 11 | width = 4 12 | height = 1 13 | } 14 | } 15 | 16 | resource "signalfx_dashboard_group" "nginxdashboardgroup0" { 17 | name = "nginx generated OTel dashboard group" 18 | description = "nginx generated OTel dashboard group" 19 | } 20 | 21 | resource "signalfx_time_chart" "nginx_requests" { 22 | name = "Total number of requests made to the server since it started" 23 | 24 | program_text = <<-EOF 25 | data("nginx.requests").publish(label="Total number of requests made to the server since it started") 26 | EOF 27 | 28 | time_range = 14400 29 | 30 | plot_type = "LineChart" 31 | show_data_markers = true 32 | } 33 | 34 | 35 | resource "signalfx_time_chart" "nginx_connections_accepted" { 36 | name = "The total number of accepted client connections" 37 | 38 | program_text = <<-EOF 39 | data("nginx.connections_accepted").publish(label="The total number of accepted client connections") 40 | EOF 41 | 42 | time_range = 14400 43 | 44 | plot_type = "LineChart" 45 | show_data_markers = true 46 | } 47 | 48 | 49 | resource "signalfx_time_chart" "nginx_connections_handled" { 50 | name = "The total number of handled connections. Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit)." 51 | 52 | program_text = <<-EOF 53 | data("nginx.connections_handled").publish(label="The total number of handled connections. 
Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit).") 54 | EOF 55 | 56 | time_range = 14400 57 | 58 | plot_type = "LineChart" 59 | show_data_markers = true 60 | } 61 | 62 | 63 | resource "signalfx_time_chart" "nginx_connections_current" { 64 | name = "The current number of nginx connections by state" 65 | 66 | program_text = <<-EOF 67 | data("nginx.connections_current").publish(label="The current number of nginx connections by state") 68 | EOF 69 | 70 | time_range = 14400 71 | 72 | plot_type = "LineChart" 73 | show_data_markers = true 74 | } 75 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/chronyreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | type: chrony 3 | 4 | status: 5 | class: receiver 6 | stability: 7 | alpha: [metrics] 8 | distributions: [contrib] 9 | codeowners: 10 | active: [MovieStoreGuy, jamesmoessis] 11 | 12 | attributes: 13 | leap.status: 14 | description: how the chrony is handling leap seconds 15 | type: string 16 | enum: 17 | - normal 18 | - insert_second 19 | - delete_second 20 | - unsynchronised 21 | 22 | metrics: 23 | ntp.frequency.offset: 24 | enabled: false 25 | description: The frequency is the rate by which the system s clock would be wrong if chronyd was not correcting it. 26 | extended_documentation: "It is expressed in ppm (parts per million). For example, a value of 1 ppm would mean that when the system’s clock thinks it has advanced 1 second, it has actually advanced by 1.000001 seconds relative to true time." 27 | unit: "ppm" 28 | gauge: 29 | value_type: double 30 | attributes: 31 | - leap.status 32 | ntp.skew: 33 | enabled: true 34 | description: This is the estimated error bound on the frequency. 35 | unit: "ppm" 36 | gauge: 37 | value_type: double 38 | ntp.stratum: 39 | enabled: false 40 | description: The number of hops away from the reference system keeping the reference time 41 | extended_documentation: To read further, refer to https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/system_administrators_guide/ch-configuring_ntp_using_the_chrony_suite#sect-Checking_chrony_tracking 42 | unit: "{count}" 43 | gauge: 44 | value_type: int 45 | ntp.time.correction: 46 | enabled: true 47 | description: The number of seconds difference between the system's clock and the reference clock 48 | unit: seconds 49 | gauge: 50 | value_type: double 51 | attributes: 52 | - leap.status 53 | ntp.time.last_offset: 54 | enabled: true 55 | description: The estimated local offset on the last clock update 56 | unit: seconds 57 | gauge: 58 | value_type: double 59 | attributes: 60 | - leap.status 61 | ntp.time.rms_offset: 62 | enabled: false 63 | description: the long term average of the offset value 64 | unit: seconds 65 | gauge: 66 | value_type: double 67 | attributes: 68 | - leap.status 69 | ntp.time.root_delay: 70 | enabled: false 71 | description: This is the total of the network path delays to the stratum-1 system from which the system is ultimately synchronised. 
72 | unit: seconds 73 | gauge: 74 | value_type: double 75 | attributes: 76 | - leap.status 77 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/couchdbreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: couchdb 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | beta: [metrics] 7 | distributions: [contrib,observiq, sumo] 8 | codeowners: 9 | active: [djaglowski] 10 | 11 | resource_attributes: 12 | couchdb.node.name: 13 | description: The name of the node. 14 | type: string 15 | enabled: true 16 | 17 | attributes: 18 | http.method: 19 | description: An HTTP request method. 20 | type: string 21 | enum: [ COPY, DELETE, GET, HEAD, OPTIONS, POST, PUT ] 22 | http.status_code: 23 | description: An HTTP status code. 24 | type: string 25 | view: 26 | description: The view type. 27 | type: string 28 | enum: [ temporary_view_reads, view_reads ] 29 | operation: 30 | description: The operation type. 31 | type: string 32 | enum: [ writes, reads ] 33 | 34 | metrics: 35 | couchdb.average_request_time: 36 | enabled: true 37 | description: The average duration of a served request. 38 | unit: ms 39 | gauge: 40 | value_type: double 41 | couchdb.httpd.bulk_requests: 42 | enabled: true 43 | description: The number of bulk requests. 44 | unit: "{requests}" 45 | sum: 46 | value_type: int 47 | monotonic: true 48 | aggregation_temporality: cumulative 49 | couchdb.httpd.requests: 50 | enabled: true 51 | description: The number of HTTP requests by method. 52 | unit: "{requests}" 53 | sum: 54 | value_type: int 55 | monotonic: true 56 | aggregation_temporality: cumulative 57 | attributes: [ http.method ] 58 | couchdb.httpd.responses: 59 | enabled: true 60 | description: The number of each HTTP status code. 61 | unit: "{responses}" 62 | sum: 63 | value_type: int 64 | monotonic: true 65 | aggregation_temporality: cumulative 66 | attributes: [ http.status_code ] 67 | couchdb.httpd.views: 68 | enabled: true 69 | description: The number of views read. 70 | unit: "{views}" 71 | sum: 72 | value_type: int 73 | monotonic: true 74 | aggregation_temporality: cumulative 75 | attributes: [ view ] 76 | couchdb.database.open: 77 | enabled: true 78 | description: The number of open databases. 79 | unit: "{databases}" 80 | sum: 81 | value_type: int 82 | monotonic: false 83 | aggregation_temporality: cumulative 84 | couchdb.file_descriptor.open: 85 | enabled: true 86 | description: The number of open file descriptors. 87 | unit: "{files}" 88 | sum: 89 | value_type: int 90 | monotonic: false 91 | aggregation_temporality: cumulative 92 | couchdb.database.operations: 93 | enabled: true 94 | description: The number of database operations. 
95 | unit: "{operations}" 96 | sum: 97 | value_type: int 98 | monotonic: true 99 | aggregation_temporality: cumulative 100 | attributes: [ operation ] 101 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/filestatsreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: filestats 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | beta: [metrics] 7 | distributions: [contrib, sumo] 8 | codeowners: 9 | active: [atoulme] 10 | 11 | 12 | resource_attributes: 13 | file.name: 14 | description: The name of the file 15 | enabled: true 16 | type: string 17 | file.path: 18 | description: The absolute path of the file 19 | enabled: false 20 | type: string 21 | 22 | attributes: 23 | file.permissions: 24 | description: the permissions associated with the file, using an octal format. 25 | type: string 26 | 27 | metrics: 28 | file.mtime: 29 | description: Elapsed time since the last modification of the file or folder, in seconds since Epoch. 30 | enabled: true 31 | sum: 32 | monotonic: false 33 | aggregation_temporality: cumulative 34 | value_type: int 35 | unit: "s" 36 | file.ctime: 37 | description: Elapsed time since the last change of the file or folder, in seconds since Epoch. In addition to `file.mtime`, this metric tracks metadata changes such as permissions or renaming the file. 38 | enabled: false 39 | sum: 40 | monotonic: false 41 | aggregation_temporality: cumulative 42 | value_type: int 43 | unit: "s" 44 | attributes: 45 | - file.permissions 46 | file.atime: 47 | description: Elapsed time since last access of the file or folder, in seconds since Epoch. 48 | enabled: false 49 | sum: 50 | monotonic: false 51 | aggregation_temporality: cumulative 52 | value_type: int 53 | unit: "s" 54 | file.size: 55 | description: The size of the file or folder, in bytes. 56 | enabled: true 57 | gauge: 58 | value_type: int 59 | unit: "b" 60 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/gitproviderreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: gitprovider 2 | 3 | sem_conv_version: 1.9.0 4 | 5 | status: 6 | class: receiver 7 | stability: 8 | development: [metrics] 9 | distributions: [liatrio] 10 | codeowners: 11 | active: [adrielp, astencel-sumo] 12 | 13 | # this might need to be unique per sub receiver implementation 14 | resource_attributes: 15 | organization.name: 16 | enabled: true 17 | description: Git Organization or Project Name 18 | type: string 19 | git.vendor.name: 20 | enabled: true 21 | # github, gitlab, bitbucket, gittea 22 | description: The name of the Git vendor/provider (ie. 
GitHub / GitLab) 23 | type: string 24 | 25 | ## Attritbutes that will be uncommented when the rest of the metrics are added 26 | attributes: 27 | repository.name: 28 | description: The full name of the Git repository 29 | type: string 30 | 31 | metrics: 32 | git.repository.count: 33 | enabled: true 34 | description: Number of repositories in an organization 35 | unit: 1 36 | gauge: 37 | value_type: int 38 | attributes: [] 39 | git.repository.branch.count: 40 | enabled: true 41 | description: Number of branches in the repository 42 | unit: 1 43 | gauge: 44 | value_type: int 45 | attributes: [repository.name] 46 | git.repository.contributor.count: 47 | enabled: false 48 | description: Total number of unique contributors to this repository 49 | unit: 1 50 | gauge: 51 | value_type: int 52 | attributes: [repository.name] 53 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/httpcheckreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: httpcheck 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | development: [metrics] 7 | distributions: [contrib, sumo] 8 | warnings: [] 9 | codeowners: 10 | active: [codeboten] 11 | 12 | resource_attributes: 13 | 14 | attributes: 15 | http.url: 16 | description: Full HTTP request URL. 17 | type: string 18 | http.status_code: 19 | description: HTTP response status code 20 | type: int 21 | http.method: 22 | description: HTTP request method 23 | type: string 24 | http.status_class: 25 | description: HTTP response status class 26 | type: string 27 | error.message: 28 | description: Error message recorded during check 29 | type: string 30 | 31 | metrics: 32 | httpcheck.status: 33 | description: 1 if the check resulted in status_code matching the status_class, otherwise 0. 34 | enabled: true 35 | sum: 36 | value_type: int 37 | aggregation_temporality: cumulative 38 | monotonic: false 39 | unit: 1 40 | attributes: [http.url, http.status_code, http.method, http.status_class] 41 | httpcheck.duration: 42 | description: Measures the duration of the HTTP check. 43 | enabled: true 44 | gauge: 45 | value_type: int 46 | unit: ms 47 | attributes: [http.url] 48 | httpcheck.error: 49 | description: Records errors occurring during HTTP check. 
50 | enabled: true 51 | sum: 52 | value_type: int 53 | aggregation_temporality: cumulative 54 | monotonic: false 55 | unit: "{error}" 56 | attributes: [http.url, error.message] 57 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/nginxreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: nginx 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | beta: [metrics] 7 | distributions: [contrib, observiq, sumo] 8 | codeowners: 9 | active: [djaglowski] 10 | 11 | attributes: 12 | state: 13 | description: The state of a connection 14 | type: string 15 | enum: 16 | - active 17 | - reading 18 | - writing 19 | - waiting 20 | 21 | metrics: 22 | nginx.requests: 23 | enabled: true 24 | description: Total number of requests made to the server since it started 25 | unit: requests 26 | sum: 27 | value_type: int 28 | monotonic: true 29 | aggregation_temporality: cumulative 30 | attributes: [] 31 | nginx.connections_accepted: 32 | enabled: true 33 | description: The total number of accepted client connections 34 | unit: connections 35 | sum: 36 | value_type: int 37 | monotonic: true 38 | aggregation_temporality: cumulative 39 | attributes: [] 40 | nginx.connections_handled: 41 | enabled: true 42 | description: The total number of handled connections. Generally, the parameter value is the same as nginx.connections_accepted unless some resource limits have been reached (for example, the worker_connections limit). 43 | unit: connections 44 | sum: 45 | value_type: int 46 | monotonic: true 47 | aggregation_temporality: cumulative 48 | attributes: [] 49 | nginx.connections_current: 50 | enabled: true 51 | description: The current number of nginx connections by state 52 | unit: connections 53 | sum: 54 | value_type: int 55 | monotonic: false 56 | aggregation_temporality: cumulative 57 | attributes: [state] 58 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/rabbitmqreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: rabbitmq 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | beta: [metrics] 7 | distributions: [contrib, observiq, sumo] 8 | codeowners: 9 | active: [djaglowski, cpheps] 10 | 11 | resource_attributes: 12 | rabbitmq.queue.name: 13 | description: The name of the RabbitMQ queue. 14 | enabled: true 15 | type: string 16 | rabbitmq.node.name: 17 | description: The name of the RabbitMQ node. 18 | enabled: true 19 | type: string 20 | rabbitmq.vhost.name: 21 | description: The name of the RabbitMQ vHost. 22 | enabled: true 23 | type: string 24 | 25 | attributes: 26 | message.state: 27 | name_override: state 28 | description: The state of messages in a queue. 29 | type: string 30 | enum: 31 | - ready 32 | - unacknowledged 33 | metrics: 34 | rabbitmq.consumer.count: 35 | description: The number of consumers currently reading from the queue. 36 | unit: "{consumers}" 37 | sum: 38 | monotonic: false 39 | aggregation_temporality: cumulative 40 | value_type: int 41 | enabled: true 42 | rabbitmq.message.delivered: 43 | description: The number of messages delivered to consumers. 
44 | unit: "{messages}" 45 | sum: 46 | monotonic: true 47 | aggregation_temporality: cumulative 48 | value_type: int 49 | enabled: true 50 | rabbitmq.message.published: 51 | description: The number of messages published to a queue. 52 | unit: "{messages}" 53 | sum: 54 | monotonic: true 55 | aggregation_temporality: cumulative 56 | value_type: int 57 | enabled: true 58 | rabbitmq.message.acknowledged: 59 | description: The number of messages acknowledged by consumers. 60 | unit: "{messages}" 61 | sum: 62 | monotonic: true 63 | aggregation_temporality: cumulative 64 | value_type: int 65 | enabled: true 66 | rabbitmq.message.dropped: 67 | description: The number of messages dropped as unroutable. 68 | unit: "{messages}" 69 | sum: 70 | monotonic: true 71 | aggregation_temporality: cumulative 72 | value_type: int 73 | enabled: true 74 | rabbitmq.message.current: 75 | description: The total number of messages currently in the queue. 76 | unit: "{messages}" 77 | sum: 78 | monotonic: false 79 | aggregation_temporality: cumulative 80 | value_type: int 81 | attributes: [message.state] 82 | enabled: true 83 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/riakreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: riak 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | beta: [metrics] 7 | distributions: [contrib, observiq, sumo] 8 | codeowners: 9 | active: [djaglowski, armstrmi] 10 | 11 | resource_attributes: 12 | riak.node.name: 13 | description: The name this node uses to identify itself. 14 | enabled: true 15 | type: string 16 | 17 | attributes: 18 | request: 19 | description: The request operation type. 20 | type: string 21 | enum: 22 | - put 23 | - get 24 | operation: 25 | description: The operation type for index operations. 26 | type: string 27 | enum: 28 | - read 29 | - write 30 | - delete 31 | 32 | metrics: 33 | riak.node.operation.count: 34 | description: The number of operations performed by the node. 35 | unit: "{operation}" 36 | sum: 37 | monotonic: true 38 | aggregation_temporality: cumulative 39 | value_type: int 40 | enabled: true 41 | attributes: [request] 42 | riak.node.operation.time.mean: 43 | description: The mean time between request and response for operations performed by the node over the last minute. 44 | unit: us 45 | gauge: 46 | value_type: int 47 | enabled: true 48 | attributes: [request] 49 | riak.node.read_repair.count: 50 | description: The number of read repairs performed by the node. 51 | unit: "{read_repair}" 52 | sum: 53 | monotonic: true 54 | aggregation_temporality: cumulative 55 | value_type: int 56 | enabled: true 57 | riak.memory.limit: 58 | description: The amount of memory allocated to the node. 59 | unit: By 60 | sum: 61 | monotonic: false 62 | aggregation_temporality: cumulative 63 | value_type: int 64 | enabled: true 65 | riak.vnode.operation.count: 66 | description: The number of operations performed by vnodes on the node. 67 | unit: "{operation}" 68 | sum: 69 | monotonic: true 70 | aggregation_temporality: cumulative 71 | value_type: int 72 | enabled: true 73 | attributes: [request] 74 | riak.vnode.index.operation.count: 75 | description: The number of index operations performed by vnodes on the node. 
76 | unit: "{operation}" 77 | sum: 78 | monotonic: false 79 | aggregation_temporality: cumulative 80 | value_type: int 81 | attributes: [operation] 82 | enabled: true 83 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/otel-receiver-yaml/sshcheckreceiver_metadata.yaml: -------------------------------------------------------------------------------- 1 | type: sshcheck 2 | 3 | status: 4 | class: receiver 5 | stability: 6 | alpha: [metrics] 7 | distributions: [contrib, sumo] 8 | codeowners: 9 | active: [nslaughter, codeboten] 10 | 11 | resource_attributes: 12 | ssh.endpoint: 13 | description: Full SSH endpoint 14 | type: string 15 | 16 | attributes: 17 | error.message: 18 | description: Error message recorded during check 19 | type: string 20 | 21 | metrics: 22 | sshcheck.status: 23 | description: 1 if the SSH client successfully connected, otherwise 0. 24 | enabled: true 25 | sum: 26 | value_type: int 27 | aggregation_temporality: cumulative 28 | monotonic: false 29 | unit: 1 30 | sshcheck.duration: 31 | description: Measures the duration of SSH connection. 32 | enabled: true 33 | gauge: 34 | value_type: int 35 | unit: ms 36 | sshcheck.error: 37 | description: Records errors occurring during SSH check. 38 | enabled: true 39 | sum: 40 | value_type: int 41 | aggregation_temporality: cumulative 42 | monotonic: false 43 | unit: "{error}" 44 | attributes: [error.message] 45 | sshcheck.sftp_status: 46 | description: 1 if the SFTP server replied to request, otherwise 0. 47 | enabled: false 48 | sum: 49 | value_type: int 50 | aggregation_temporality: cumulative 51 | monotonic: false 52 | unit: 1 53 | sshcheck.sftp_duration: 54 | description: Measures SFTP request duration. 55 | enabled: false 56 | gauge: 57 | value_type: int 58 | unit: ms 59 | sshcheck.sftp_error: 60 | description: Records errors occurring during SFTP check. 61 | enabled: false 62 | sum: 63 | value_type: int 64 | aggregation_temporality: cumulative 65 | monotonic: false 66 | unit: "{error}" 67 | attributes: [error.message] 68 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/pull-otel-yaml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import yaml 4 | 5 | # GitHub repository information 6 | repo_owner = "open-telemetry" 7 | repo_name = "opentelemetry-collector-contrib" 8 | repo_path = "receiver" 9 | github_token = os.environ.get('GITHUB_PAT_TOKEN') 10 | headers = {} 11 | api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{repo_path}" 12 | 13 | # Check for our PAT and make auth headers so we don't get rate limited 14 | if github_token is not None: 15 | headers["Authorization"] = "Bearer " + github_token 16 | else: 17 | print("no $GITHUB_PAT_TOKEN environment variable found. Expect rate limiting.") 18 | 19 | # Make a request to GitHub API 20 | response = requests.get(api_url, headers=headers) 21 | contents = response.json() 22 | if response.status_code != 200: 23 | print("Received " + str(response.status_code) + " STATUS CODE. 
\n" + response.text) 24 | exit() 25 | 26 | # Iterate through contents and find subdirectories 27 | directories = [content["name"] for content in contents if content["type"] == "dir"] 28 | 29 | # Iterate through subdirectories and extract metadata.yaml with 'metrics' section 30 | for sub in directories: 31 | subdir_api_url = f"{api_url}/{sub}" 32 | subdir_response = requests.get(subdir_api_url, headers=headers) 33 | if subdir_response.status_code != 200: 34 | print("Received " + str(subdir_response.status_code) + " STATUS CODE. \n" + subdir_response.text) 35 | exit() 36 | subdir_contents = subdir_response.json() 37 | 38 | # Check if metadata.yaml exists in the subdirectory 39 | metadata_content = None 40 | for content in subdir_contents: 41 | if content["name"] == "metadata.yaml": 42 | metadata_url = content["download_url"] 43 | metadata_response = requests.get(metadata_url, headers=headers) 44 | if metadata_response.status_code != 200: 45 | print("Received " + str(metadata_response.status_code) + " STATUS CODE. \n" + metadata_response.text) 46 | exit() 47 | metadata_content = metadata_response.text 48 | break 49 | 50 | if metadata_content: 51 | # Parse YAML content 52 | metadata_data = yaml.safe_load(metadata_content) 53 | 54 | # Check if 'metrics' section exists in metadata.yaml then save 55 | if "metrics" in metadata_data: 56 | filename = f"./otel-receiver-yaml/{sub}_metadata.yaml" 57 | with open(filename, "w") as file: 58 | file.write(metadata_content) 59 | print(f"Metadata.yaml with 'metrics' section extracted from {sub}") 60 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/otel-receiver-dashboard-generator/requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==6.* 2 | Requests==2.* 3 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/snowflakedb/Configuration/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Examples 2 | **NOTE:** These are only examples. Your configuration will likely be slightly different. 3 | 4 | These examples assume you are using the [`splunk-otel-collector`](https://github.com/signalfx/splunk-otel-collector), but they will also work with any other OTEL configuration. 5 | 6 | 1. [`agent_config.yaml`](./agent_config.yaml) Contains the receiver, exporter, and pipeline configuration 7 | The receiver entries for Snowflake can be found under `smartagent/sql` 8 | 1. **NOTE:** You MUST add your Snowflake `account` to this config, where `account` is taken from this format `.snowflakecomputing.com` 9 | 2. If you plan to use a custom `role` rather than `ACCOUNTADMIN` you will need to add your `role` to this config 10 | - **NOTE:** Resolution of `3600` seconds (1 hour) is recommended due to the latency between actions happening and then showing up in the `SNOWFLAKE/ACCOUNT_USAGE` db view. It is possible to collect at a higher interval but it is not recommended. 11 | 2. [`splunk-otel-collector.conf`](./splunk-otel-collector.conf) Contains referenced variables like the Snowflake username / password, the Splunk Observability token, etc. 12 | 1. Add your Splunk Observability token in `SPLUNK_ACCESS_TOKEN` 13 | 2. Add your Snowflake User to `SNOWFLAKE_USER` (the user MUST have a role that allows access to the `SNOWFLAKE/ACCOUNT_USAGE` db view) 14 | 3. Add the password for your Snowflake user account to `SNOWFLAKE_PASS` 15 | 3.
[`snowflake-metrics.yaml`](./snowflake-metrics.yaml) Contains SQL queries and mappings for our Splunk Observability metrics and dimensions 16 | - [`snowflake-other-metrics.yaml`](./snowflake-other-metrics.yaml) file contains SQL queries for: 17 | - detailed and *high cardinality* DB query metrics including the `query_id` dimension which is a GUID 18 | - When using these metrics replace the `DB Metrics` in `snowflake-metrics.yaml` 19 | - Billing usage in USD -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/snowflakedb/Configuration/snowflake-receiver/splunk-otel-collector.conf: -------------------------------------------------------------------------------- 1 | SPLUNK_CONFIG=/etc/otel/collector/agent_config.yaml 2 | SPLUNK_ACCESS_TOKEN=TH1S_1SN7_4_R34L_T0K3N 3 | SPLUNK_REALM=us0 4 | SPLUNK_API_URL=https://api.signalfx.com 5 | SPLUNK_LOGOBSERVER_URL=https://ingest.signalfx.com/v1/log 6 | SPLUNK_INGEST_URL=https://ingest.signalfx.com 7 | SPLUNK_TRACE_URL=https://ingest.signalfx.com/v2/trace 8 | SPLUNK_HEC_URL=https://ingest.signalfx.com/v1/log 9 | SPLUNK_HEC_TOKEN=TH1S_1SN7_4_R34L_T0K3N 10 | SPLUNK_MEMORY_TOTAL_MIB=512 11 | SPLUNK_BUNDLE_DIR=/usr/lib/splunk-otel-collector/agent-bundle 12 | SPLUNK_COLLECTD_DIR=/usr/lib/splunk-otel-collector/agent-bundle/run/collectd 13 | SNOWFLAKE_USER=ADD_YOUR_SNOWFLAKE_USERNAME_WITH_ACCOUNT_ADMIN_PRIVS 14 | SNOWFLAKE_PASS=ADD_YOUR_SNOWFLAKE_PASSWORD 15 | -------------------------------------------------------------------------------- /dashboards-and-dashboard-groups/snowflakedb/Configuration/splunk-otel-collector.conf: -------------------------------------------------------------------------------- 1 | SPLUNK_CONFIG=/etc/otel/collector/agent_config.yaml 2 | SPLUNK_ACCESS_TOKEN=TH1S_1SN7_4_R34L_T0K3N 3 | SPLUNK_REALM=us0 4 | SPLUNK_API_URL=https://api.signalfx.com 5 | SPLUNK_LOGOBSERVER_URL=https://ingest.signalfx.com/v1/log 6 | SPLUNK_INGEST_URL=https://ingest.signalfx.com 7 | SPLUNK_TRACE_URL=https://ingest.signalfx.com/v2/trace 8 | SPLUNK_HEC_URL=https://ingest.signalfx.com/v1/log 9 | SPLUNK_HEC_TOKEN=TH1S_1SN7_4_R34L_T0K3N 10 | SPLUNK_MEMORY_TOTAL_MIB=512 11 | SPLUNK_BUNDLE_DIR=/usr/lib/splunk-otel-collector/agent-bundle 12 | SPLUNK_COLLECTD_DIR=/usr/lib/splunk-otel-collector/agent-bundle/run/collectd 13 | SNOWFLAKE_USER=ADD_YOUR_SNOWFLAKE_USERNAME_WITH_ACCOUNT_ADMIN_PRIVS 14 | SNOWFLAKE_PASS=ADD_YOUR_SNOWFLAKE_PASSWORD 15 | -------------------------------------------------------------------------------- /detectors/README.md: -------------------------------------------------------------------------------- 1 | # General Recommendations for Detector Content for Reuse 2 | 3 | There are many use cases for detectors, and any detector which provides 4 | insight may be useful for other users. There are some patterns we have found which 5 | work well in Splunk Observability and encourage content reuse. 6 | 7 | 1. **Noun-centric** : While each environment is different, frequently their 8 | applications are composed of common software components/platforms. detectors 9 | which are oriented to understanding those common software components tend to be 10 | more reusable than detectors which are related to processes (which tend to vary 11 | from environment to environment). 12 | 13 | 1. **Instances and Aggregates** : Users typically need to see the "forest" and 14 | the "trees". The way we typically implement this is to define instance and 15 | aggregate views. 
In addition to being generally useful to users, these 16 | views are used when promoting a detector set to Navigator Views. 17 | 18 | * **Aggregate views** focus on enabling users to identify which particular 19 | instances are outliers. Frequently the information presented in these 20 | detectors is aggregated to the instance, as there is instance-level detail to 21 | enable users to further isolate the problem. An example would be showing the 22 | maximum utilization of all filesystems for a host in the aggregate view. 23 | Knowing that a host has a filesystem at 97% utilization would be enough 24 | information for a user to identify that host as an outlier and then further 25 | investigate which specific filesystem was approaching its limit. 26 | 27 | * **Instance views** focus on enabling users to identify the specific 28 | problem related to the instance. For instance, breaking out the filesystem 29 | utilization metrics by filesystem so that the user knows exactly which 30 | resource is approaching exhaustion. 31 | 32 | 3. **External KPIs** : 33 | 34 | 4. **Internal KPIs** : 35 | 36 | 5. **Detector Variables** : 37 | -------------------------------------------------------------------------------- /detectors/inferred-services-detectors/README.md: -------------------------------------------------------------------------------- 1 | # Inferred Services - assets to help observe them 2 | 3 | 1. [Detector: Latency Spike (>3s for 90% of 5min)](./POST_Detector_latency_spike.sh) 4 | 5 | 2. [Detector: Error Rate (>50%, sudden change)](./POST_Detector_error_rate.sh) 6 | 7 | Feel free to also use: 8 | 9 | 3. [Dashboard Group - Inferred Services](../../dashboards-and-dashboard-groups/inferred-services-dg/README.md) 10 | 11 | Learn more about Inferred Services: 12 | - [What are Inferred Services](https://docs.splunk.com/observability/en/apm/apm-spans-traces/inferred-services.html) 13 | - [Metrics available for Inferred Services](https://docs.splunk.com/observability/en/apm/span-tags/metricsets.html#available-default-mms-metrics-and-dimensions) 14 | 15 | ## Inferred Services - Sample Detectors 16 | ![Sample Detectors for Latency and Error rate of Inferred Services](../../detectors/inferred-services-detectors/detectors-1.png) 17 | 18 | Use the curl command to post the detector (replace `Token` and `Realm` as required). 19 | 20 | These can be used as a starting point to customise signals, thresholds, messaging, etc.; a sample curl call is shown below.
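For illustration, here is a minimal sketch of such a POST. The full detector definitions (SignalFlow, rules, and messages) live in the `POST_Detector_*.sh` scripts linked above; `REALM`, `TOKEN`, and the JSON file name below are placeholders you would replace with your own values.

```bash
# Minimal sketch: post a detector payload to the Splunk Observability detectors API.
# REALM, TOKEN and the payload file are placeholders - adjust them for your org.
REALM="us0"                      # your Splunk Observability realm
TOKEN="YOUR_ORG_ACCESS_TOKEN"    # an org access token with write permissions

curl -X POST "https://api.${REALM}.signalfx.com/v2/detector" \
  -H "Content-Type: application/json" \
  -H "X-SF-TOKEN: ${TOKEN}" \
  -d @"./my-inferred-services-detector.json"
```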
21 | 22 | Screeshots: 23 | ![Error Rate Detector](../../detectors/inferred-services-detectors/detectors-errors.png) 24 | ![Latency Spike Detector](../../detectors/inferred-services-detectors/detectors-latency.png) 25 | -------------------------------------------------------------------------------- /detectors/inferred-services-detectors/detectors-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/detectors/inferred-services-detectors/detectors-1.png -------------------------------------------------------------------------------- /detectors/inferred-services-detectors/detectors-errors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/detectors/inferred-services-detectors/detectors-errors.png -------------------------------------------------------------------------------- /detectors/inferred-services-detectors/detectors-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/detectors/inferred-services-detectors/detectors-latency.png -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Connector failed.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727160940219, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYN3h5fAIBw", 12 | "labelResolutions": { 13 | "Hardware - Connector failed": 1000 14 | }, 15 | "lastUpdated": 1727215761634, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": null, 18 | "minDelay": null, 19 | "name": "Hardware - Connector failed", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('metricshub.connector.status', filter=filter('state', 'failed'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Connector failed')", 23 | "rules": [ 24 | { 25 | "description": "The value of metricshub.connector.status is above 0.", 26 | "detectLabel": "Hardware - Connector failed", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n## Failed connector\nAgent **{{dimensions.[agent.host.name]}}** is failing to use **{{dimensions.[name]}}** to monitor **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n## Consequence\nAll of the components that were monitored through this connector can no longer be monitored.\n\n## Recommended action\nMake sure {{dimensions.[agent.host.name]}} can communicate with {{dimensions.[host.name]}} with the protocol used by {{dimensions.[name]}} and that the specified credentials in Metrics Hub's configuration are valid.\n{{else}}\nRecovered monitoring with {{dimensions.[name]}} connector.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Hardware - Failed connector on {{dimensions.[host.name]}}", 31 | "runbookUrl": "", 32 | "severity": "Major", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | 
"metricshub.connector.status" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "metricshub.connector.status", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Critical LUN pathing issue.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727101400682, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKRRPGAAAA", 12 | "labelResolutions": { 13 | "Hardware - Critical LUN pathing issue": 180000 14 | }, 15 | "lastUpdated": 1730952952374, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": null, 18 | "minDelay": null, 19 | "name": "Hardware - Critical LUN pathing issue", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.lun.paths').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Critical LUN pathing issue')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.lun.paths is below 1.", 26 | "detectLabel": "Hardware - Critical LUN pathing issue", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n## Lost data access\nLUN **{{dimensions.[name]}}** is no longer available on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**\n\n## Consequence\nOne or more filesystems are no longer available (possible data loss).\n\n## Recommended action\nVerify the status of the underlying HBA and its connectivity. 
Verify the reachability of the storage system and whether any configuration change has been made to the corresponding storage volume.\n{{else}}\nRecovered available LUN paths.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Critical LUN pathing issue", 31 | "runbookUrl": "", 32 | "severity": "Major", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.lun.paths" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.lun.paths", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Critically low battery.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727098905351, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKHAnMAAAI", 12 | "labelResolutions": { 13 | "Hardware - Critically low battery": 240000 14 | }, 15 | "lastUpdated": 1727178855678, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": null, 18 | "minDelay": null, 19 | "name": "Hardware - Critically low battery", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.battery.charge').publish(label='A', enable=False)\nB = (A*100).publish(label='B')\ndetect(when(B < threshold(30))).publish('Hardware - Critically low battery')", 23 | "rules": [ 24 | { 25 | "description": "The value of A*100 is below 30.", 26 | "detectLabel": "Hardware - Critically low battery", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Low battery\nBattery **{{dimensions.[name]}}** charge is critically low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nA low charge battery may lead to data loss in case of a power outage.\n\n###Recommended action\nCheck why the battery is not fully charged (it may be due to a power outage or an unplugged power cable) and if necessary, replace the battery.\n{{else}}\nThe battery charge is back within the normal operational range.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}", 30 | "runbookUrl": "", 31 | "severity": "Major", 32 | "tip": "" 33 | } 34 | ], 35 | "sf_metricsInObjectProgramText": [ 36 | "hw.battery.charge" 37 | ], 38 | "status": "ACTIVE", 39 | "tags": [], 40 | "teams": [], 41 | "timezone": "", 42 | "visualizationOptions": { 43 | "disableSampling": false, 44 | "publishLabelOptions": [ 45 | { 46 | "displayName": "hw.battery.charge", 47 | "label": "A", 48 | "paletteIndex": null, 49 | "valuePrefix": null, 50 | "valueSuffix": null, 51 | "valueUnit": null 52 | }, 53 | { 54 | "displayName": "A*100", 55 | "label": "B", 56 | "paletteIndex": null, 57 | 
"valuePrefix": null, 58 | "valueSuffix": null, 59 | "valueUnit": null 60 | } 61 | ], 62 | "showDataMarkers": true, 63 | "showEventLines": false, 64 | "time": { 65 | "range": 86400000, 66 | "rangeEnd": 0, 67 | "type": "relative" 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Critically low fan speed (%).json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727095467806, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKAGp3AEAs", 12 | "labelResolutions": { 13 | "Hardware - Critically low fan speed (%)": 1000 14 | }, 15 | "lastUpdated": 1729906605545, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Critically low fan speed (%)", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.fan.speed').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Critically low fan speed (%)')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.fan.speed is below 1.", 26 | "detectLabel": "Hardware - Critically low fan speed (%)", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Low fan speed\nFan speed for **{{dimensions.[name]}}** is critically low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe temperature of the chip, component or device that was cooled down by this fan, may rise rapidly. This could lead to severe hardware damage and system crashes.\n\n###Recommended action\nCheck if the fan no longer cools down the system. 
If so, replace the fan.\n{{else}}\nRecovered fan speed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Critically low fan speed (%)", 31 | "severity": "Minor" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.fan.speed" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.fan.speed", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 86400000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Device status degraded.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1726835785724, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GX6QUA7AAAA", 12 | "labelResolutions": { 13 | "Hardware - Device status degraded": 180000 14 | }, 15 | "lastUpdated": 1730114326958, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Device status degraded", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.status', filter=filter('state', 'degraded'), rollup='min').publish(label='A')\ndetect(when(A > threshold(0), lasting='5m'), auto_resolve_after='15m').publish('Hardware - Device status degraded')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.status is above 0.", 26 | "detectLabel": "Hardware - Device status degraded", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Degraded {{dimensions.[hw.type]}}.\n\n**{{dimensions.name}}** is degraded on **{{dimensions.[host.name]}}** in **{{dimensions.site}}** at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\nRecovered {{dimensions.[hw.type]}} **{{dimensions.name}}** from **degraded** status at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "[Hardware] Status degraded for {{dimensions.[hw.type]}} of {{dimensions.[host.name]}} in {{dimensions.site}}", 31 | "runbookUrl": "", 32 | "severity": "Warning", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.status" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.status", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Device status failed.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1726834590194, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GX6UimfAAEk", 12 | "labelResolutions": { 13 | "Hardware - Device status failed": 120000 14 | }, 15 | "lastUpdated": 1730972332658, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Device status failed", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.status', filter=filter('state', 'failed'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Device status failed')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.status is above 0.", 26 | "detectLabel": "Hardware - Device status failed", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Failed {{dimensions.[hw.type]}}.\n\n**{{dimensions.name}}** has failed on **{{dimensions.[host.name]}}** in **{{dimensions.site}}** at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\nRecovered {{dimensions.[hw.type]}} **{{dimensions.name}}** from **failed** status at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "[Hardware] Status failed for {{dimensions.[hw.type]}} of {{dimensions.[host.name]}} in {{dimensions.site}}", 31 | "runbookUrl": "", 32 | "severity": "Major", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.status" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.status", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 43200000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - High number of errors.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727103358641, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKN4UQAABo", 12 | "labelResolutions": { 13 | "Hardware - High number of errors": 1000 14 | }, 15 | "lastUpdated": 1730894451575, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - High number of errors", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.errors', filter=filter('hw.type', 'physical_disk', 'memory')).publish(label='A')\ndetect(when(A > threshold(1))).publish('Hardware - High number of errors')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.errors is above 1.", 26 | "detectLabel": "Hardware - High number of errors", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n## 
Errors\n**{{dimensions.[name]}}** encountered internal error(s) on {{dimensions.[host.name]}} in **{{dimensions.site}}**\n\n{{else}}\n{{dimensions.[hw.type]}} {{dimensions.[name]}} is no longer reporting errors.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Hardware - Errors {{dimensions.[hw.type]}} {{dimensions.[name]}}", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.errors" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.errors", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 900000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - LUN multi-pathing issue.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727101189016, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKMWWkAAAA", 12 | "labelResolutions": { 13 | "Hardware - LUN multi-pathing issue": 180000 14 | }, 15 | "lastUpdated": 1730986249546, 16 | "lastUpdatedBy": "GRtepaIAICg", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - LUN multi-pathing issue", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.lun.paths').publish(label='A')\ndetect(when(A < threshold(2))).publish('Hardware - LUN multi-pathing issue')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.lun.paths is below 2.", 26 | "detectLabel": "Hardware - LUN multi-pathing issue", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n## Lost redundancy\nOnly 1 remaining path in multipathing configuration for LUN **{{dimensions.[name]}}** on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**\n\n## Consequence\nThe performance of the system may be affected and the risk of losing access to data is high.\n\n## Recommended action\nVerify on the SAN switches which links are broken (link down, or zone exclusion, etc.). 
Check the mapping and masking configuration of the corresponding storage volume in the storage system.\n{{else}}\nRecovered available LUN paths.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Hardware - LUN multi-pathing issue", 31 | "runbookUrl": "", 32 | "severity": "Warning", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.lun.paths" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.lun.paths", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Low battery.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727098749573, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYKAzuVAIAU", 12 | "labelResolutions": { 13 | "Hardware - Low battery": 240000 14 | }, 15 | "lastUpdated": 1727178856192, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Low battery", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.battery.charge').publish(label='A', enable=False)\nB = (A*100).publish(label='B')\ndetect(when(B < threshold(50))).publish('Hardware - Low battery')", 23 | "rules": [ 24 | { 25 | "description": "The value of A*100 is below 50.", 26 | "detectLabel": "Hardware - Low battery", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Low battery\nBattery **{{dimensions.[name]}}** charge is abnormally low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nA low charge battery may lead to data loss in case of a power outage.\n\n###Recommended action\nCheck why the battery is not fully charged (it may be due to a power outage or an unplugged power cable) and fully recharge the battery when possible.\n{{else}}\nThe battery charge is back within the normal operational range.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Low battery", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.battery.charge" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.battery.charge", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | }, 52 | { 53 | "displayName": "A*100", 54 | "label": "B", 55 | "paletteIndex": null, 56 | "valuePrefix": null, 57 | "valueSuffix": null, 58 | "valueUnit": null 59 
| } 60 | ], 61 | "showDataMarkers": true, 62 | "showEventLines": false, 63 | "time": { 64 | "range": 900000, 65 | "rangeEnd": 0, 66 | "type": "relative" 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Low fan speed (%).json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727095352339, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYJxNpHAIAA", 12 | "labelResolutions": { 13 | "Hardware - Low fan speed (%)": 1000 14 | }, 15 | "lastUpdated": 1727096537448, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Low fan speed (%)", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.fan.speed_ratio').publish(label='A', enable=False)\nB = data('A*100').publish(label='B')\ndetect(when(B < threshold(10))).publish('Hardware - Low fan speed (%)')", 23 | "rules": [ 24 | { 25 | "description": "The value of A*100 is below 10.", 26 | "detectLabel": "Hardware - Low fan speed (%)", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Low fan speed\nFan speed for **{{dimensions.[name]}}** is abnormally low on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe temperature of the chip, component or device that was cooled down by this fan, may rise rapidly. This could lead to severe hardware damage and system crashes.\n\n###Recommended action\nCheck if the fan no longer cools down the system. If so, replace the fan.\n{{else}}\nRecovered fan speed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Low fan speed", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "A*100" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.fan.speed_ratio", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | }, 52 | { 53 | "displayName": "A*100", 54 | "label": "B", 55 | "paletteIndex": null, 56 | "valuePrefix": null, 57 | "valueSuffix": null, 58 | "valueUnit": null 59 | } 60 | ], 61 | "showDataMarkers": true, 62 | "showEventLines": false, 63 | "time": { 64 | "range": 86400000, 65 | "rangeEnd": 0, 66 | "type": "relative" 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Missing device.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1726836391594, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GX6ZxEYAAAA", 12 | "labelResolutions": { 13 | "Hardware - Missing device": 5000 14 | }, 15 | "lastUpdated": 1727976755429, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Missing device", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": 
"AB = alerts(detector_name='Hardware - Missing Device').publish(label='AB')\nA = data('hw.status', filter=filter('state', 'present'), rollup='min').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Missing device')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.status is below 1.", 26 | "detectLabel": "Hardware - Missing device", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Missing {{dimensions.[hw.type]}}\n\n**{{dimensions.name}}** is no longer detected on on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Recommended action\nCheck whether the device was intentionally removed from the system or if it is not responding. \n{{else}}\nThe device has recovered. \n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Missing {{dimensions.[hw.type]}} on **{{dimensions.[host.name]}}**", 31 | "severity": "Major" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.status" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.status", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 86400000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Network errors.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727083108813, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYJPupTAAAA", 12 | "labelResolutions": { 13 | "Hardware - Network errors": 1000 14 | }, 15 | "lastUpdated": 1727083147113, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Network errors", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.errors', filter=filter('hw.type', 'network')).publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Network errors')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.errors is above 0.", 26 | "detectLabel": "Hardware - Network errors", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Network errors\n\nInterface **{{dimensions.name}}** is encountering or generating a high number of errors of received or transmitted packets) on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThis strongly impacts the network performance.\n\n###Recommended action\nCheck the network cable, the driver settings, the speed and duplex mode of the link. If everything seems normal, you may have to replace this network adapter. \n{{else}}\nThe network card no longer encounters or generates errors. 
\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Physical Address:** {{dimensions.physical_address}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Hardware - Network errors on {{dimensions.[host.name]}}", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.errors" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.errors", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 900000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Networking link down.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727082785650, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYJAKjwAEAA", 12 | "labelResolutions": { 13 | "Hardware - Networking link down": 180000 14 | }, 15 | "lastUpdated": 1728507098084, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Networking link down", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.network.up', rollup='min').publish(label='A')\ndetect(when(A < threshold(1))).publish('Hardware - Networking link down')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.network.up is below 1.", 26 | "detectLabel": "Hardware - Networking link down", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Link down\n\nInterface **{{dimensions.name}}** is disconnected on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nThe network traffic (if any) that was processed by this adapter is no longer being handled, or is overloading another network adapter.\n\n###Recommended action\nCheck that the network cable (if any) is not unplugged or broken/cut, and that it is properly plugged into the network card. 
Ensure that the network hub/switch/router is working properly.\n{{else}}\nLink restored for {{dimensions.name}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Physical Address:** {{dimensions.physical_address}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Network link down for **{{dimensions.[host.name]}}**", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.network.up" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.network.up", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 86400000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Physical intrusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1726844670799, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GX64jrUAEB0", 12 | "labelResolutions": { 13 | "Hardware - Physical intrusion": 120000 14 | }, 15 | "lastUpdated": 1730969153413, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": null, 18 | "minDelay": null, 19 | "name": "Hardware - Physical intrusion", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.status', filter=filter('state', 'open'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Physical intrusion')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.status is above 0.", 26 | "detectLabel": "Hardware - Physical intrusion", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Intrusion\nEnclosure {{dimensions.[name]}} is open ({{dimensions.[host.name]}} in {{dimensions.site}}).\n\n###Consequence\nThis could mean that somebody is accessing the hardware components in the enclosure, including the harddisks which may contain private information.\n\n###Recommended action\nMake sure the enclosure has been opened by authorized personnel only and close it as soon as possible.\n{{else}}\nEnclosure is now closed.\n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Intrusion in {{dimensions.[host.name]}}'s enclosure", 31 | "runbookUrl": "", 32 | "severity": "Major", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.status" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.status", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": 
false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Predicted failure.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1726843486243, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GX6tR8qAEBQ", 12 | "labelResolutions": { 13 | "Hardware - Predicted Failure": 180000 14 | }, 15 | "lastUpdated": 1729905705828, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": 0, 18 | "minDelay": 0, 19 | "name": "Hardware - Predicted failure", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "AB = alerts(detector_name='Hardware - Predicted Failure').publish(label='AB')\nA = data('hw.status', filter=filter('state', 'predicted_failure'), rollup='max').publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Predicted Failure')", 23 | "rules": [ 24 | { 25 | "description": "The value of hw.status is above 0.", 26 | "detectLabel": "Hardware - Predicted Failure", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Predicted {{dimensions.[hw.type]}} failure\n\n**{{dimensions.name}}** is predicted to fail soon on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n{{else}}\nFailure is no longer predicted for {{dimensions.[hw.type]}} **{{dimensions.name}}** since{{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n###Device Details\n**Name:** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Type:** {{dimensions.[hw.type]}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "Predicted failure for {{dimensions.[hw.type]}} {{dimensions.name}}", 31 | "severity": "Warning" 32 | } 33 | ], 34 | "sf_metricsInObjectProgramText": [ 35 | "hw.status" 36 | ], 37 | "status": "ACTIVE", 38 | "tags": [], 39 | "teams": [], 40 | "timezone": "", 41 | "visualizationOptions": { 42 | "disableSampling": false, 43 | "publishLabelOptions": [ 44 | { 45 | "displayName": "hw.status", 46 | "label": "A", 47 | "paletteIndex": null, 48 | "valuePrefix": null, 49 | "valueSuffix": null, 50 | "valueUnit": null 51 | } 52 | ], 53 | "showDataMarkers": true, 54 | "showEventLines": false, 55 | "time": { 56 | "range": 86400000, 57 | "rangeEnd": 0, 58 | "type": "relative" 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /detectors/metricshub/Hardware - Tape drive needs cleaning.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1727093437101, 7 | "creator": "GRtepaIAICg", 8 | "customProperties": null, 9 | "description": "", 10 | "detectorOrigin": "Standard", 11 | "id": "GYJpVTXAEAA", 12 | "labelResolutions": { 13 | "Hardware - Tape drive needs cleaning": 1000 14 | }, 15 | "lastUpdated": 1727093437495, 16 | "lastUpdatedBy": "AAAAAAAAAAA", 17 | "maxDelay": null, 18 | "minDelay": null, 19 | "name": "Hardware - Tape drive needs cleaning", 20 | "overMTSLimit": false, 21 | "packageSpecifications": "", 22 | "programText": "A = data('hw.status', filter=filter('state', 'needs_cleaning')).publish(label='A')\ndetect(when(A > threshold(0))).publish('Hardware - Tape drive needs cleaning')", 23 | 
"rules": [ 24 | { 25 | "description": "The value of hw.status is above 0.", 26 | "detectLabel": "Hardware - Tape drive needs cleaning", 27 | "disabled": false, 28 | "notifications": [], 29 | "parameterizedBody": "{{#if anomalous}}\n###Cleaning needed\nTape drive **{{dimensions.[name]}}** needs cleaning on **{{dimensions.[host.name]}}** in **{{dimensions.site}}**.\n\n###Consequence\nRegular tape drive cleaning helps in long-term reliability, prevents read/write errors and should be conducted on a scheduled cycle as well as when requested by the drive.\n\n###Recommended action\nWait for any running operation to finish, eject the tape and clean the drive.\n{{else}}\nTape drive no longer needs cleaning. \n{{/if}}\n\n###Device Details\n**Name: ** {{dimensions.[name]}}\n**ID:** {{dimensions.id}}\n**Vendor:** {{dimensions.vendor}}\n**Model:** {{dimensions.model}}\n**Serial Number:** {{dimensions.serial_number}}\n**Information:** {{dimensions.info}}", 30 | "parameterizedSubject": "ape drive {{dimensions.[name]}} needs cleaning", 31 | "runbookUrl": "", 32 | "severity": "Warning", 33 | "tip": "" 34 | } 35 | ], 36 | "sf_metricsInObjectProgramText": [ 37 | "hw.status" 38 | ], 39 | "status": "ACTIVE", 40 | "tags": [], 41 | "teams": [], 42 | "timezone": "", 43 | "visualizationOptions": { 44 | "disableSampling": false, 45 | "publishLabelOptions": [ 46 | { 47 | "displayName": "hw.status", 48 | "label": "A", 49 | "paletteIndex": null, 50 | "valuePrefix": null, 51 | "valueSuffix": null, 52 | "valueUnit": null 53 | } 54 | ], 55 | "showDataMarkers": true, 56 | "showEventLines": false, 57 | "time": { 58 | "range": 86400000, 59 | "rangeEnd": 0, 60 | "type": "relative" 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /detectors/metricshub/README.md: -------------------------------------------------------------------------------- 1 | # MetricsHub Detectors 2 | 3 | This folder contains detectors that may be used to trigger events based on the metrics collected by MetricsHub. 4 | 5 | Please note that you may want or need different thresholds than those provided here. 6 | You may also want to create your own detectors. 7 | 8 | ## Importing Detectors 9 | Two options exist: 10 | 1. Edit and send the Detector JSON [via API](https://dev.splunk.com/observability/reference/api/detectors/latest#endpoint-create-single-detector) 11 | ``` 12 | curl -X POST "https://api.{REALM}.signalfx.com/v2/detector" \ 13 | -H "Content-Type: application/json" \ 14 | -H "X-SF-TOKEN: " \ 15 | -d @"/path/to/detector/detector_name.json" 16 | ``` 17 | 2. Copy the detector's JSON and paste it into your own Detector [via the UI](https://docs.splunk.com/Observability/alerts-detectors-notifications/create-detectors-for-alerts.html#nav-Create-detectors-to-trigger-alerts) 18 | 19 | # Support 20 | 21 | Subscribers to **MetricsHub** gain access to the **MetricsHub Support Desk**, which provides: 22 | 23 | - Technical support 24 | - Patches and updates 25 | - Knowledge base access 26 | 27 | For more information, visit the [MetricsHub](https://metricshub.com/) website. 28 | 29 | Splunk does not provide support for these detectors and users should contact Sentry Software's support with any support requests. 
30 | 31 | -------------------------------------------------------------------------------- /detectors/snowflakedb/README.md: -------------------------------------------------------------------------------- 1 | # Snowflake Detectors 2 | 3 | This folder contains detectors that may be useful when working with Snowflake. 4 | 5 | Please note that you may want or need different thresholds than those provided here. 6 | 7 | ## Snowflake Metrics Configuration 8 | Please see the [configuration examples](../../dashboards-and-dashboard-groups/snowflakedb/Configuration/) for help getting metrics from Snowflake into Splunk Observability. 9 | ## Importing Detectors 10 | Two options exist: 11 | 1. Edit and send the Detector JSON [via API](https://dev.splunk.com/observability/reference/api/detectors/latest#endpoint-create-single-detector) 12 | ``` 13 | curl -X POST "https://api.{REALM}.signalfx.com/v2/detector" \ 14 | -H "Content-Type: application/json" \ 15 | -H "X-SF-TOKEN: " \ 16 | -d @"/path/to/detector/detector_name_is_amazing.json" 17 | ``` 18 | 2. Copy the SignalFlow out of the detector JSON and paste it into your own Detector [via the UI](https://docs.splunk.com/Observability/alerts-detectors-notifications/create-detectors-for-alerts.html#nav-Create-detectors-to-trigger-alerts) 19 | 20 | ## Available Detectors 21 | The provided alerts follow the 4 Golden Signals of Latency, Errors, Traffic, and Saturation (L.E.T.S.), along with Billing. 22 | ### Latency 23 | - Queries in Small / X-Small Warehouses longer than 5 minutes (i.e. 300000 ms) 24 | - Queries taking more than 15 minutes (900 seconds) 25 | 26 | ### Errors 27 | - Database Errors by Warehouse (Arbitrary threshold of 100 errors) 28 | - Database Error Rate by Warehouse (Arbitrary threshold of 15%) 29 | - Login Failures by User (Threshold of 15 per hour) 30 | 31 | ### Traffic 32 | - Blocked Queries by Warehouse 33 | - No Queries in last 3 hours 34 | 35 | ### Saturation 36 | - Overloaded Queries by Warehouse 37 | - Queries Queued longer than 5 minutes (i.e. 
300000 ms) 38 | 39 | ### Billing 40 | - Credits used by Warehouse (Anomaly detection) 41 | - % of spend for Cloud Service greater than 15% by Warehouse -------------------------------------------------------------------------------- /detectors/snowflakedb/Snowflake - Blocked Queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1660060502884, 7 | "customProperties": {}, 8 | "description": "Blocked Queries by Warehouse", 9 | "detectorOrigin": "Standard", 10 | "labelResolutions": { 11 | "Snowflake - Blocked Queries": 3600000 12 | }, 13 | "lastUpdated": 1660233347529, 14 | "lastUpdatedBy": "E0jpLZIAYAA", 15 | "maxDelay": 0, 16 | "minDelay": 0, 17 | "name": "Snowflake - Blocked Queries", 18 | "overMTSLimit": false, 19 | "packageSpecifications": "", 20 | "parentDetectorId": null, 21 | "programText": "A = data('snowflake.query.blocked').sum(by=['WAREHOUSE_NAME']).publish(label='A')\ndetect(when(A > threshold(0), lasting='2h')).publish('Snowflake - Blocked Queries')", 22 | "rules": [ 23 | { 24 | "description": "The value of Blocked Queries by Warehouse is above 0.", 25 | "detectLabel": "Snowflake - Blocked Queries", 26 | "disabled": false, 27 | "notifications": [], 28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}} (%)\n{{/if}}\n\n{{#if anomalous}}Snowflake Queries Blocked per Warehouse in breaching state: \nQueries Blocked for Warehouse({{ dimensions.WAREHOUSE_NAME }}) value: {{inputs.A.value}}\n{{else}}Current signal value(s):\nQueries Blocked for Warehouse({{ dimensions.WAREHOUSE_NAME }}): {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}", 29 | "parameterizedSubject": null, 30 | "runbookUrl": null, 31 | "severity": "Critical", 32 | "tip": null 33 | } 34 | ], 35 | "sf_metricsInObjectProgramText": null, 36 | "tags": [], 37 | "teams": [], 38 | "timezone": "", 39 | "visualizationOptions": { 40 | "disableSampling": false, 41 | "publishLabelOptions": [ 42 | { 43 | "displayName": "Blocked Queries by Warehouse", 44 | "label": "A", 45 | "paletteIndex": null, 46 | "valuePrefix": null, 47 | "valueSuffix": null, 48 | "valueUnit": null 49 | } 50 | ], 51 | "showDataMarkers": true, 52 | "showEventLines": false, 53 | "time": { 54 | "range": 86400000, 55 | "rangeEnd": 0, 56 | "type": "relative" 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /detectors/snowflakedb/Snowflake - No Queries in Last 3 Hours.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1660074725727, 7 | "customProperties": {}, 8 | "description": "No Queries in last 3 Hours", 9 | "detectorOrigin": "Standard", 10 | "labelResolutions": { 11 | "Snowflake - No Queries in Last 3 Hours": 3600000 12 | }, 13 | "lastUpdated": 1660233574385, 14 | "lastUpdatedBy": "E0jpLZIAYAA", 15 | "maxDelay": 0, 16 | "minDelay": 0, 17 | "name": "Snowflake - No Queries in Last 3 Hours", 
18 | "overMTSLimit": false, 19 | "packageSpecifications": "", 20 | "parentDetectorId": null, 21 | "programText": "A = data('snowflake.database.query.count', extrapolation='zero').sum().publish(label='A')\ndetect(when(A < threshold(1), lasting='3h')).publish('Snowflake - No Queries in Last 3 Hours')", 22 | "rules": [ 23 | { 24 | "description": "The value of Total Queries is below 1.", 25 | "detectLabel": "Snowflake - No Queries in Last 3 Hours", 26 | "disabled": false, 27 | "notifications": [], 28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}}\n{{/if}}\n\n{{#if anomalous}}Snowflake Traffic is below 1 for 3 hours: \nNumber of queries: {{inputs.A.value}}\n{{else}}Current signal value(s):\nNumber of queries: {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}", 29 | "parameterizedSubject": null, 30 | "runbookUrl": null, 31 | "severity": "Critical", 32 | "tip": "This alert looks at last 3 hours due to the possible latency of data existing in Snowflake internal ACCOUNT_USAGE https://docs.snowflake.com/en/sql-reference/account-usage.html#account-usage-views" 33 | } 34 | ], 35 | "sf_metricsInObjectProgramText": null, 36 | "tags": [], 37 | "teams": [], 38 | "timezone": "", 39 | "visualizationOptions": { 40 | "disableSampling": false, 41 | "publishLabelOptions": [ 42 | { 43 | "displayName": "Total Queries", 44 | "label": "A", 45 | "paletteIndex": null, 46 | "valuePrefix": null, 47 | "valueSuffix": null, 48 | "valueUnit": null 49 | } 50 | ], 51 | "showDataMarkers": true, 52 | "showEventLines": false, 53 | "time": { 54 | "range": 75600000, 55 | "rangeEnd": 0, 56 | "type": "relative" 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /detectors/snowflakedb/Snowflake - Overloaded Queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "authorizedWriters": { 3 | "teams": [], 4 | "users": [] 5 | }, 6 | "created": 1660073252422, 7 | "customProperties": {}, 8 | "description": "Overloaded Queries in Snowflake", 9 | "detectorOrigin": "Standard", 10 | "labelResolutions": { 11 | "Snowflake - Overloaded Queries": 3600000 12 | }, 13 | "lastUpdated": 1660233607397, 14 | "lastUpdatedBy": "E0jpLZIAYAA", 15 | "maxDelay": 0, 16 | "minDelay": 0, 17 | "name": "Snowflake - Overloaded Queries", 18 | "overMTSLimit": false, 19 | "packageSpecifications": "", 20 | "parentDetectorId": null, 21 | "programText": "A = data('snowflake.query.queued_overload').sum(by=['WAREHOUSE_NAME']).publish(label='A')\ndetect(when(A > threshold(0), lasting='1h')).publish('Snowflake - Overloaded Queries')", 22 | "rules": [ 23 | { 24 | "description": "The value of Overloaded Queries is above 0.", 25 | "detectLabel": "Snowflake - Overloaded Queries", 26 | "disabled": false, 27 | "notifications": [], 28 | "parameterizedBody": "{{#if anomalous}}\n\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" triggered at {{dateTimeFormat timestamp format=\"full\"}}.\n{{else}}\n Rule \"{{{ruleName}}}\" in detector \"{{{detectorName}}}\" cleared at {{dateTimeFormat timestamp 
format=\"full\"}}.\n{{/if}}\n\n{{#if anomalous}}\nTriggering condition: {{{readableRule}}} (%)\n{{/if}}\n\n{{#if anomalous}}Snowflake Overloaded Queries per Warehouse in breaching state: \nQueries Overloaded for Warehouse({{ dimensions.WAREHOUSE_NAME }}) value: {{inputs.A.value}}\n{{else}}Current signal value(s):\nQueries Overloaded for Warehouse({{ dimensions.WAREHOUSE_NAME }}): {{inputs.A.value}}\n{{/if}}\n\n{{#notEmpty dimensions}}\nSignal details:\n{{{dimensions}}}\n{{/notEmpty}}\n\n{{#if anomalous}}\n{{#if runbookUrl}}Runbook: {{{runbookUrl}}}{{/if}}\n{{#if tip}}Tip: {{{tip}}}{{/if}}\n{{/if}}", 29 | "parameterizedSubject": null, 30 | "runbookUrl": null, 31 | "severity": "Critical", 32 | "tip": null 33 | } 34 | ], 35 | "sf_metricsInObjectProgramText": null, 36 | "tags": [], 37 | "teams": [], 38 | "timezone": "", 39 | "visualizationOptions": { 40 | "disableSampling": false, 41 | "publishLabelOptions": [ 42 | { 43 | "displayName": "Overloaded Queries", 44 | "label": "A", 45 | "paletteIndex": null, 46 | "valuePrefix": null, 47 | "valueSuffix": null, 48 | "valueUnit": null 49 | } 50 | ], 51 | "showDataMarkers": true, 52 | "showEventLines": false, 53 | "time": { 54 | "range": 86400000, 55 | "rangeEnd": 0, 56 | "type": "relative" 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /integration-examples/README.md: -------------------------------------------------------------------------------- 1 | # General Recommendations for Integration and API Example Content for Reuse 2 | 3 | Integration Examples can be everything from example code for interacting with APIs, Configurations for Open Telemetry, code for getting custom metrics into Observability, and more. 4 | 5 | 1. **Noun-centric Organization** : Integrations and API interactions are generally composed of common software components/platforms. 6 | 7 | Please organize folders and submissions to group similar software and platforms together. 8 | 9 | Integrations should be oriented towards specific software/platforms with a focus on reusable patterns wherever possible for easily adjusting to a specific user's needs. 10 | 11 | 2. **Integrations** : Loosely integrations covers collections of code, scripts, documentation, etc which will aide others in setting up functionality with Splunk Observability. This could include Getting Data In (GDI), Tips and reusable SignalFlow patterns, Webhook setup information for a vendor, serverless code for performing checks, etc 12 | 13 | Include a `README.md` within your submission directory documenting and detailing the process of using your Submisson. If metrics are produced, include a list of those metrics and any associated dimensions in your `README.md`. 14 | 15 | 3. **OpenTelemetry Configurations** : Integrations using OpenTelemetry should include OpenTelemetry config files along with a `README.md` that briefly describes any novel pipelines and the receivers, processors, and exporters used. 16 | 17 | 4. **API Scripts and Interactions** : API Scripts and Interactions should include a `README.md` file that explains what the script does. If it emits metrics and dimensions for those metrics they should be noted in the `README.md` file. 18 | 19 | Double check and verify that you have not accidentally added your API tokens or secrets with your code. Wherever possible use environment variables to pass these secrets to the script. 
20 | -------------------------------------------------------------------------------- /integration-examples/active_detectors/README.md: -------------------------------------------------------------------------------- 1 | # Active Detector Report 2 | 3 | This script will fetch the events for detectors (max. 1,000) in an Org. 4 | 5 | The output will show detectors with no events, detectors with events, and how many events have fired within the number of days specified on the command line. 6 | 7 | The table supports hyperlinks on the detector ID if your terminal supports it. 8 | 9 | Also, a CSV report will be created in the same directory in which you run the script. 10 | 11 | ## Using the Active Detector Report script 12 | 13 | ![Active Detectors Report](./images/screenshot.png) 14 | 15 | 1. Install the required packages with `pip3 install -r requirements.txt` 16 | 2. Obtain an Org Access Token (with API permissions) and note the Realm your Org is in, e.g. us1, eu0, jp0, etc. 17 | 3. Run the script, e.g. `python3 active_detectors.py -t <TOKEN> -r <REALM> -d <DAYS>` 18 | 19 | ### Full CLI options 20 | 21 | ``` bash 22 | $ python3 active_detectors.py -h 23 | 24 | usage: active_detectors.py [-h] -t TOKEN -r REALM -d DAYS 25 | 26 | Splunk O11y Cloud - Active Detectors 27 | 28 | options: 29 | -h, --help show this help message and exit 30 | -t TOKEN, --token TOKEN 31 | Access Token 32 | -r REALM, --realm REALM 33 | us0, us1, us2, eu0 or jp0 34 | -d DAYS, --days DAYS No. of days ago 35 | ``` 36 | -------------------------------------------------------------------------------- /integration-examples/active_detectors/images/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/active_detectors/images/screenshot.png -------------------------------------------------------------------------------- /integration-examples/active_detectors/requirements.txt: -------------------------------------------------------------------------------- 1 | rich 2 | background 3 | requests 4 | -------------------------------------------------------------------------------- /integration-examples/apiScripts/README.md: -------------------------------------------------------------------------------- 1 | # Contribution repository for Splunk Observability Content apiScripts 2 | 3 | This repository exists to enable sharing of content. 4 | 5 | This directory contains sample API Scripts that you can use to call the Splunk Observability Cloud API. 6 | 7 | # addEmailToDetectors.py 8 | This script will allow users to insert an email address into the notifications for one or more detectors. It works with the token.yaml file, which contains optional and required values needed to run the script. Please refer to the comments in the token.yaml file for more details. 9 | 10 | # getMetricsForHost.py 11 | This script is used to find all the metrics for a given host. 12 | 13 | Usage: 14 | 15 | ``` 16 | python3 getMetricsForHost.py -n <HOST_NAME> -r <REALM> -t <API_TOKEN> 17 | ``` 18 | 19 | # muteAllAutoDetectors.py 20 | This script will mute all auto-detectors. It can also be used to re-enable (unmute) all detectors. (NOTE: Unmuting won't distinguish between detectors muted by the script and those muted manually.) 
21 | 22 | Usage: 23 | ``` 24 | python3 muteAllAutoDetectors.py (to mute all) 25 | python3 muteAllAutoDetectors.py -e (to enable all) 26 | ``` 27 | -------------------------------------------------------------------------------- /integration-examples/apiScripts/getMetricsForHost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # This script will get all unique metrics for a given host. 4 | # 5 | # Edit token.yaml to contain valid 6 | # - Access Token (access_token) 7 | # - Realm (realm) 8 | # 9 | # Syntax: python3 getMetricsForHost.py -n <HOST_NAME> 10 | # or 11 | # python3 getMetricsForHost.py -n <HOST_NAME> -r <REALM> -t <API_TOKEN> 12 | # 13 | # HOST_NAME should be an exact match 14 | 15 | import argparse 16 | import yaml 17 | import requests 18 | import json 19 | 20 | def run(hostname, realm, token): 21 | limit = 5000 22 | url = "https://api.{}.signalfx.com/v2/metrictimeseries?limit={}&query=host.name:{}".format(realm, limit, hostname) 23 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": "{}".format(token) } 24 | response = requests.get(url, headers=headers) 25 | responseJSON = json.loads(response.text) 26 | 27 | # If the result count is > limit, say so and exit 28 | try: 29 | cnt = responseJSON["count"] 30 | except: 31 | print("ERROR: Check your token, that's the most likely issue.") 32 | return 33 | 34 | if (cnt == 0): 35 | # Let's try using host instead of host.name (SmartAgent) 36 | print("--> No results for host.name, trying host") 37 | url = "https://api.{}.signalfx.com/v2/metrictimeseries?limit={}&query=host:{}".format(realm, limit, hostname) 38 | response = requests.get(url, headers=headers) 39 | responseJSON = json.loads(response.text) 40 | try: 41 | cnt = responseJSON["count"] 42 | except: 43 | print("ERROR: Unusual to fail here, probably an issue with the script.") 44 | return 45 | 46 | if (cnt > limit): 47 | print("Need to increase limit, this host has > {} mts's.".format(limit)) 48 | return 49 | 50 | # Add metrics to a list 51 | arr = [] 52 | for result in responseJSON['results']: 53 | arr.append(result['metric']) 54 | 55 | totalCount = len(arr) 56 | arr = list(set(arr)) # Remove Duplicates 57 | arr.sort() 58 | print(*arr, sep = "\n") # Print one per line 59 | print("--> {} metrics; {} mts".format(len(arr), totalCount)) 60 | 61 | if __name__ == '__main__': 62 | with open('token.yaml', 'r') as ymlfile: 63 | cfg = yaml.safe_load(ymlfile) 64 | 65 | parser = argparse.ArgumentParser(description='Splunk - Get Host Metrics') 66 | parser.add_argument('-n', '--hostName', help='HostName', required=True) 67 | parser.add_argument('-r', '--realm', help='Realm', required=False) 68 | parser.add_argument('-t', '--token', help='Token', required=False) 69 | args = parser.parse_args() 70 | 71 | if (args.token is None): 72 | run(args.hostName, cfg['realm'], cfg['access_token']) 73 | else: 74 | run(args.hostName, args.realm, args.token) 75 | -------------------------------------------------------------------------------- /integration-examples/apiScripts/muteAllAutoDetectors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # This script will mute all auto-detectors. 
4 | # 5 | # Edit token.yaml to contain valid 6 | # - Access Token (access_token) 7 | # - Realm (realm) 8 | # 9 | # Syntax: python3 muteAllAutoDetectors.py 10 | # to disable all auto detectors 11 | # 12 | # python3 muteAllAutoDetectors.py -e 13 | # to re-enable all auto detectors 14 | 15 | import argparse 16 | import yaml 17 | import requests 18 | import json 19 | 20 | def muteDetectors(realm, token, enableDisable, responseJSON): 21 | arrDetectors = [] 22 | for result in responseJSON['results']: 23 | id = result['id'] 24 | name = result['name'] 25 | type = result['detectorOrigin'] 26 | if type == "AutoDetect": 27 | url = f"https://api.{realm}.signalfx.com/v2/detector/{id}/{enableDisable}" 28 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": f"{token}" } 29 | response = requests.put(url, headers=headers) 30 | if response.status_code == 204: 31 | print(f"SUCCESS: {name} muting {enableDisable}d.") 32 | arrDetectors.append(name) 33 | else: 34 | print(f"ERROR: {name} muting change failed.") 35 | return arrDetectors 36 | 37 | def callAPI(realm, token, bDisable): 38 | arrDetectors = [] 39 | limit = 10000 40 | offset = 0 41 | 42 | if bDisable: 43 | enableDisable = "disable" 44 | else: 45 | enableDisable = "enable" 46 | 47 | url = f"https://api.{realm}.signalfx.com/v2/detector?limit={limit}" 48 | headers = {"Content-Type": "application/json", "X-SF-TOKEN": f"{token}" } 49 | response = requests.get(url, headers=headers) 50 | responseJSON = json.loads(response.text) 51 | try: 52 | cnt = responseJSON["count"] 53 | except: 54 | print("ERROR: Check your token, that's the most likely issue.") 55 | print(response.text) 56 | return 57 | 58 | if (cnt > 10000): 59 | print(f'You have more than 10,000 detectors ({cnt} found).') 60 | print('Presenting the results for the first 10,000.') 61 | #break 62 | 63 | arrDetectors = muteDetectors(realm, token, enableDisable, responseJSON) 64 | #print(arrDetectors) 65 | 66 | if __name__ == '__main__': 67 | with open('token.yaml', 'r') as ymlfile: 68 | cfg = yaml.safe_load(ymlfile) 69 | 70 | parser = argparse.ArgumentParser(description='Splunk - Mute All Auto-Detectors') 71 | parser.add_argument('-r', '--realm', help='Realm', required=False) 72 | parser.add_argument('-t', '--token', help='Token', required=False) 73 | parser.add_argument('-e', '--enable', action=argparse.BooleanOptionalAction) 74 | args = parser.parse_args() 75 | 76 | bDisable = True 77 | if args.enable is not None: 78 | bDisable = False 79 | 80 | realm = cfg['realm'] if args.realm is None else args.realm 81 | token = cfg['access_token'] if args.token is None else args.token 82 | 83 | callAPI(realm, token, bDisable) 84 | -------------------------------------------------------------------------------- /integration-examples/apiScripts/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonpath_ng 2 | pyyaml 3 | requests -------------------------------------------------------------------------------- /integration-examples/apiScripts/token.yaml: -------------------------------------------------------------------------------- 1 | # these values are used by the apiScripts 2 | access_token: #required. User API Access Token from your Splunk Observability Cloud Organization 3 | realm: #required. Realm for your Splunk Observability Cloud Organization (ex: us1) 4 | emailAddress: # required. Email address that you would like to add to the detector(s) 5 | detectorName: # optional. 
If not included, script will add email address to all detectors 6 | limit: 50 # optional. Will default to 50; Number of results to return from the list of detectors that match your search criteria. 7 | offset: 0 # optional; Will default to 0; Index, in the list of detectors that match your search criteria, at which you want to start downloading results. -------------------------------------------------------------------------------- /integration-examples/azure-devops/README.md: -------------------------------------------------------------------------------- 1 | # Azure DevOps integrations for Splunk Observability 2 | 3 | 1. Azure DevOps - [Splunk Observability Cloud Events](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-events) 4 | - Send Azure DevOps Deployment events and alerts to Splunk Observability 5 | - Visualize these events overlaid on Splunk Observability Dashboards 6 | 2. Azure DevOps - [Splunk Observability Alert Gates](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-alert) 7 | - Gate Azure DevOps Deployments based on status of Alerts in Splunk Observability 8 | - Deploy your software more safely by checking that the coast is clear in Splunk Observability. 9 | 10 | ## For more details: 11 | - Splunk Blog Post: [Azure DevOps: Fun with Observability Events and Alerts!](https://www.splunk.com/en_us/blog/devops/azure-devops-fun-with-observability-events-and-alerts.html) 12 | - Describes use cases and benefits of the above integrations for Azure DevOps users. 13 | - Microsoft Marketplace Links: 14 | - [Splunk Observability Cloud Events](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-events) 15 | - [Splunk Observability Alert Gates](https://marketplace.visualstudio.com/items?itemName=jeremyh-splunk.splunk-alert) 16 | - GitHub Open Source Repositories and detailed setup instructions: 17 | - [azure-devops-splunk-events](https://github.com/splunk/azure-devops-splunk-events) 18 | - [azure-devops-splunk-alert-gate](https://github.com/splunk/azure-devops-splunk-alert-gate) -------------------------------------------------------------------------------- /integration-examples/ci-webhook-serverless/ci-webhook-handler/requirements.txt: -------------------------------------------------------------------------------- 1 | # dependencies for current version 2 | signalfx 3 | -------------------------------------------------------------------------------- /integration-examples/ci-webhook-serverless/ci-webhook-handler/serverless.yml: -------------------------------------------------------------------------------- 1 | service: ci-webhook-handler 2 | frameworkVersion: '2' 3 | 4 | plugins: 5 | - serverless-python-requirements 6 | 7 | provider: 8 | name: aws 9 | runtime: python3.8 10 | lambdaHashingVersion: '20201221' 11 | iam: 12 | role: 13 | statements: # permissions for all of your functions can be set here 14 | - Effect: Allow 15 | Action: # Gives permission to specific DynamoDB tables in all regions 16 | - dynamodb:DescribeTable 17 | - dynamodb:Query 18 | - dynamodb:Scan 19 | - dynamodb:GetItem 20 | - dynamodb:BatchGetItem 21 | - dynamodb:PutItem 22 | - dynamodb:BatchWriteItem 23 | - dynamodb:UpdateItem 24 | - dynamodb:DeleteItem 25 | Resource: 'arn:aws:dynamodb:*:*:table/webhookEventsTable' 26 | - Effect: Allow 27 | Action: # Gives permission to read secret via SecretsManager in all regions 28 | - secretsmanager:ListSecretVersionIds 29 | - secretsmanager:DescribeSecret 30 | - secretsmanager:GetResourcePolicy 31 | - 
secretsmanager:GetSecretValue 32 | Resource: 'arn:aws:secretsmanager:*:*:secret:SignalFx/Ingest-*' 33 | 34 | functions: 35 | ciwebhook: 36 | handler: handler.ciwebhook 37 | events: 38 | - httpApi: 39 | path: / 40 | method: post 41 | 42 | resources: 43 | Resources: 44 | eventsTable: 45 | Type: AWS::DynamoDB::Table 46 | Properties: 47 | TableName: webhookEventsTable 48 | AttributeDefinitions: 49 | - AttributeName: buildId 50 | AttributeType: S 51 | - AttributeName: buildStep 52 | AttributeType: S 53 | KeySchema: 54 | - AttributeName: buildId 55 | KeyType: HASH 56 | - AttributeName: buildStep 57 | KeyType: RANGE 58 | ProvisionedThroughput: 59 | ReadCapacityUnits: 1 60 | WriteCapacityUnits: 1 61 | authToken: 62 | Type: AWS::SecretsManager::Secret 63 | Properties: 64 | Description: SignalFx Endpoint and Token Info 65 | Name: SignalFx/Ingest 66 | 67 | custom: 68 | pythonRequirements: 69 | dockerizePip: non-linux 70 | -------------------------------------------------------------------------------- /integration-examples/ci-webhook-serverless/generate-test-events.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | import random 3 | import threading 4 | import sys 5 | import time 6 | import json 7 | 8 | build_delay_min = 60 9 | build_delay_max = 300 10 | 11 | build_stepdelay_min = 10 12 | build_stepdelay_max = 50 13 | 14 | build_nsteps_min = 2 15 | build_nsteps_max = 5 16 | 17 | # failure rate 1 out of every n 18 | step_failure_rate_1_per = 20 19 | 20 | global gwebhookurl 21 | global genvironment 22 | global gpoolmgr 23 | 24 | gpoolmgr = None 25 | gwebhookurl = None 26 | genvironment = None 27 | 28 | def fake_build(): 29 | bsteps = random.randint(build_nsteps_min, build_nsteps_max) 30 | buildId = 'build' + str(random.randint(0,65536)) 31 | for step in range(bsteps): 32 | buildStep = 'step' + str(step) 33 | body = {} 34 | body['environment'] = genvironment 35 | body['buildId'] = buildId 36 | body['buildStep'] = buildStep 37 | body['status'] = 'success' 38 | if 1 == random.randint(1, step_failure_rate_1_per): 39 | body['status'] = 'failed' 40 | if step == 0: 41 | body['eventType'] = 'start_build' 42 | elif step == (bsteps - 1): 43 | body['eventType'] = 'build_complete' 44 | else: 45 | body['eventType'] = 'build_step' 46 | bodyJson = json.dumps(body) 47 | print('sending data to url %s:' % (gwebhookurl)) 48 | print(' %s' % (bodyJson)) 49 | resp = gpoolmgr.request("POST", gwebhookurl, timeout=30, 50 | headers={'Content-Type': 'application/json'}, 51 | body=bodyJson) 52 | print("resp = %s" % (resp.data.decode())) 53 | if body['status'] == 'failed': 54 | return 55 | time.sleep(random.randint(build_stepdelay_min,build_stepdelay_max)) 56 | 57 | 58 | if __name__ == '__main__': 59 | if len(sys.argv) != 3: 60 | sys.exit('Usage %s ') 61 | 62 | gwebhookurl = sys.argv[1] 63 | genvironment = sys.argv[2] 64 | gpoolmgr = urllib3.PoolManager() 65 | 66 | while True: 67 | x = threading.Thread(target=fake_build) 68 | x.start() 69 | time.sleep(random.randint(build_delay_min,build_delay_max)) 70 | -------------------------------------------------------------------------------- /integration-examples/jenkins-apm/README.md: -------------------------------------------------------------------------------- 1 | # APM Tracing for Jenkins 2 | With Splunk APM and Splunk Log Observer you can gain a deeper understanding of your Jenkins usage! 
3 | - Get Waterfall trace charts of your entire pipeline run 4 | - Identify long running steps in your jenkins jobs 5 | - Easily mark deployment successes and failures on Splunk Observability dashboards 6 | - And much more! 7 | 8 | For a complete integration guide including example configuration files and dashboard exports check out the repository at [https://github.com/splunk/splunk-jenkins-otel](https://github.com/splunk/splunk-jenkins-otel) 9 | 10 | ## For more details: 11 | - Splunk Blog: [Jenkins, OpenTelemetry, Observability](https://www.splunk.com/en_us/blog/devops/jenkins-opentelemetry-observability.html) 12 | - Jenkins OpenTelemetry Plugin: [Jenkins OTEL plugin](https://plugins.jenkins.io/opentelemetry/#getting-started) (by Cyrille Le Clerc) can be used with an [OTEL collector](https://github.com/signalfx/splunk-otel-collector) to send to Splunk Observability Cloud (formerly SignalFx) APM 13 | -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/lambda-vpc-connection-sample/__init__.py -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "body": "{\"message\": \"hello world\"}", 3 | "resource": "/hello", 4 | "path": "/hello", 5 | "httpMethod": "GET", 6 | "isBase64Encoded": false, 7 | "queryStringParameters": { 8 | "foo": "bar" 9 | }, 10 | "pathParameters": { 11 | "proxy": "/path/to/resource" 12 | }, 13 | "stageVariables": { 14 | "baz": "qux" 15 | }, 16 | "headers": { 17 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 18 | "Accept-Encoding": "gzip, deflate, sdch", 19 | "Accept-Language": "en-US,en;q=0.8", 20 | "Cache-Control": "max-age=0", 21 | "CloudFront-Forwarded-Proto": "https", 22 | "CloudFront-Is-Desktop-Viewer": "true", 23 | "CloudFront-Is-Mobile-Viewer": "false", 24 | "CloudFront-Is-SmartTV-Viewer": "false", 25 | "CloudFront-Is-Tablet-Viewer": "false", 26 | "CloudFront-Viewer-Country": "US", 27 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 28 | "Upgrade-Insecure-Requests": "1", 29 | "User-Agent": "Custom User Agent String", 30 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 31 | "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==", 32 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 33 | "X-Forwarded-Port": "443", 34 | "X-Forwarded-Proto": "https" 35 | }, 36 | "requestContext": { 37 | "accountId": "123456789012", 38 | "resourceId": "123456", 39 | "stage": "prod", 40 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 41 | "requestTime": "09/Apr/2015:12:34:56 +0000", 42 | "requestTimeEpoch": 1428582896000, 43 | "identity": { 44 | "cognitoIdentityPoolId": null, 45 | "accountId": null, 46 | "cognitoIdentityId": null, 47 | "caller": null, 48 | "accessKey": null, 49 | "sourceIp": "127.0.0.1", 50 | "cognitoAuthenticationType": null, 51 | "cognitoAuthenticationProvider": null, 52 | "userArn": null, 53 | "userAgent": "Custom User Agent String", 54 | "user": null 55 | }, 56 | "path": "/prod/hello", 57 | "resourcePath": "/hello", 58 | "httpMethod": "POST", 59 | "apiId": "1234567890", 60 | 
"protocol": "HTTP/1.1" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/hello_world/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/lambda-vpc-connection-sample/hello_world/__init__.py -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/hello_world/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | 4 | 5 | def lambda_handler(event, context): 6 | x = requests.get('https://w3schools.com/python/demopage.htm') 7 | print(x.text) 8 | 9 | return { 10 | 'statusCode': 200, 11 | 'body': json.dumps(x.text) 12 | } 13 | -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/hello_world/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/lambda-vpc-connection-sample/tests/__init__.py -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/lambda-vpc-connection-sample/tests/integration/__init__.py -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/integration/test_api_gateway.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import boto3 4 | import pytest 5 | import requests 6 | 7 | """ 8 | Make sure env variable AWS_SAM_STACK_NAME exists with the name of the stack we are going to test. 
9 | """ 10 | 11 | 12 | class TestApiGateway: 13 | 14 | @pytest.fixture() 15 | def api_gateway_url(self): 16 | """ Get the API Gateway URL from Cloudformation Stack outputs """ 17 | stack_name = os.environ.get("AWS_SAM_STACK_NAME") 18 | 19 | if stack_name is None: 20 | raise ValueError('Please set the AWS_SAM_STACK_NAME environment variable to the name of your stack') 21 | 22 | client = boto3.client("cloudformation") 23 | 24 | try: 25 | response = client.describe_stacks(StackName=stack_name) 26 | except Exception as e: 27 | raise Exception( 28 | f"Cannot find stack {stack_name} \n" f'Please make sure a stack with the name "{stack_name}" exists' 29 | ) from e 30 | 31 | stacks = response["Stacks"] 32 | stack_outputs = stacks[0]["Outputs"] 33 | api_outputs = [output for output in stack_outputs if output["OutputKey"] == "HelloWorldApi"] 34 | 35 | if not api_outputs: 36 | raise KeyError(f"HelloWorldAPI not found in stack {stack_name}") 37 | 38 | return api_outputs[0]["OutputValue"] # Extract url from stack outputs 39 | 40 | def test_api_gateway(self, api_gateway_url): 41 | """ Call the API Gateway endpoint and check the response """ 42 | response = requests.get(api_gateway_url) 43 | 44 | assert response.status_code == 200 45 | assert response.json() == {"message": "hello world"} 46 | -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | boto3 3 | requests 4 | -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/lambda-vpc-connection-sample/tests/unit/__init__.py -------------------------------------------------------------------------------- /integration-examples/lambda-vpc-connection-sample/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from hello_world import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": "123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "hello world" 73 | -------------------------------------------------------------------------------- /integration-examples/splunk-otel-databricks/splunk-start-up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # on error, run away, exit, don't continue, etc... 4 | set -e 5 | 6 | # output a script to the file system so it can be executed later... 7 | cat <>/tmp/splunk/otel-script.sh 8 | #!/bin/bash 9 | set -e 10 | 11 | if [ \$DB_IS_DRIVER ]; then 12 | # Set default environment variables for the installation scripts to use. ## 13 | # 1. splunkObservability.realm= 14 | # 2. splunkObservability.accessToken= 15 | # 3. clusterName= 16 | # OTEL Service Information ## 17 | # OTEL_SERVICE_NAME = "Splunk-Databricks-OTEL" 18 | # OTEL_TRACES_EXPORTER = "jaeger-thrift-splunk" 19 | # OTEL_EXPORTER_JAEGER_ENDPOINT = "https://ingest..signalfx.com/v2/trace" 20 | 21 | # Validate Secrets: Check to see if there is a secret in the secret store before executing the script to install the OpenTelemetry Collector. 22 | echo "Running OpenTelemetry collector installation script" 23 | echo "Pre-Installation: Validation: Secret Key(s)" 24 | echo "SPLUNK_ACCESS_TOKEN must be stored in the Databricks " 25 | 26 | if [ -z "\$SPLUNK_ACCESS_TOKEN" ]; then 27 | echo 'Please set the secret for the SPLUNK_ACCESS_TOKEN in the databricks environment secret store.' 
28 | exit 1; 29 | fi 30 | 31 | # Validation of parameters installation of the Splunk OpenTelemetry Collector Script 32 | echo "Pre-Installation: Validation environmental parameters" 33 | echo "SPLUNK_REALM: us0 (default), Actual: "\$SPLUNK_REALM 34 | echo "SPLUNK_MEMORY_TOTAL_MIB: 512 MIB (default), Actual: "\$SPLUNK_MEMORY_TOTAL_MIB 35 | 36 | SPLUNK_ACCESS_TOKEN="\$SPLUNK_ACCESS_TOKEN" bash -c "\$(curl -sSL https://dl.signalfx.com/splunk-otel-collector.sh > /tmp/splunk-otel-collector.sh;)" 37 | SPLUNK_ACCESS_TOKEN="\$SPLUNK_ACCESS_TOKEN" bash -c "\$(sudo sh /tmp/splunk-otel-collector.sh --realm \$SPLUNK_REALM --memory \$SPLUNK_MEMORY_TOTAL_MIB \ 38 | -- \$SPLUNK_ACCESS_TOKEN)" 39 | EOF 40 | 41 | # Determine where the script is being executed and run logic, set parameters etc (https://docs.databricks.com/clusters/init-scripts.html): ## 42 | # if: Driver: do driver stuff, else if Worker: do worker stuff, else Driver and Worker: do stuff ## 43 | 44 | echo $DB_IS_DRIVER 45 | if [[ $DB_IS_DRIVER = "TRUE" ]]; then 46 | # Logic for the Driver would go here ## 47 | 48 | else 49 | # Logic for the Worker would go here ## 50 | fi 51 | # Shared Logic for the Driver and Worker ## 52 | 53 | # Modify the permissions of the script so it can be executed. 54 | chmod a+x /tmp/splunk/otel-script.sh 55 | # Run the installation script and output logs to: /tmp/splunk/otel-script.log 56 | /tmp/splunk/otel-script.sh >> /tmp/splunk/otel-script.log 2>&1 & disown -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/Dockerfile: -------------------------------------------------------------------------------- 1 | #See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging. 2 | 3 | FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base 4 | USER app 5 | WORKDIR /app 6 | EXPOSE 8080 7 | 8 | FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build 9 | ARG BUILD_CONFIGURATION=Release 10 | WORKDIR /src 11 | COPY ["MultiStageDocker/MultiStageDocker.csproj", "MultiStageDocker/"] 12 | RUN dotnet restore "./MultiStageDocker/./MultiStageDocker.csproj" 13 | WORKDIR "/src/MultiStageDocker" 14 | COPY . . 15 | RUN dotnet build "./MultiStageDocker.csproj" -c $BUILD_CONFIGURATION -o /app/build 16 | 17 | # Add dependencies for splunk-otel-dotnet-install.sh 18 | RUN apt-get update && \ 19 | apt-get install -y unzip 20 | 21 | # Download Splunk OTel .NET installer 22 | RUN curl -sSfL https://github.com/signalfx/splunk-otel-dotnet/releases/latest/download/splunk-otel-dotnet-install.sh -O 23 | 24 | # Install the distribution 25 | RUN sh ./splunk-otel-dotnet-install.sh 26 | 27 | FROM build AS publish 28 | ARG BUILD_CONFIGURATION=Release 29 | RUN dotnet publish "./MultiStageDocker.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false 30 | 31 | FROM base AS final 32 | 33 | # Copy instrumentation file tree 34 | WORKDIR "//home/app/.splunk-otel-dotnet" 35 | COPY --from=build /root/.splunk-otel-dotnet/ . 36 | 37 | WORKDIR /app 38 | COPY --from=publish /app/publish . 39 | COPY MultiStageDocker/entrypoint.sh . 
40 | 41 | ENTRYPOINT ["sh", "entrypoint.sh"] 42 | CMD ["dotnet", "MultiStageDocker.dll"] -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/MultiStageDocker.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | Linux 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/MultiStageDocker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Read in the file of environment settings 3 | . /$HOME/.splunk-otel-dotnet/instrument.sh 4 | 5 | # Then run the CMD 6 | exec "$@" -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/MultiStageDockerNuGetOption/Dockerfile: -------------------------------------------------------------------------------- 1 | #See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging. 2 | 3 | FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base 4 | USER app 5 | WORKDIR /app 6 | EXPOSE 8080 7 | 8 | FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build 9 | ARG BUILD_CONFIGURATION=Release 10 | WORKDIR /src 11 | COPY ["MultiStageDocker/MultiStageDocker.csproj", "MultiStageDocker/"] 12 | RUN dotnet restore "./MultiStageDocker/./MultiStageDocker.csproj" 13 | WORKDIR "/src/MultiStageDocker" 14 | COPY . . 15 | 16 | RUN dotnet add "./MultiStageDocker.csproj" package Splunk.OpenTelemetry.AutoInstrumentation --prerelease 17 | 18 | RUN dotnet build "./MultiStageDocker.csproj" -r linux-x64 -c $BUILD_CONFIGURATION -o /app/build 19 | 20 | FROM build AS publish 21 | ARG BUILD_CONFIGURATION=Release 22 | RUN dotnet publish "./MultiStageDocker.csproj" -r linux-x64 -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false 23 | 24 | FROM base AS final 25 | 26 | WORKDIR /app 27 | COPY --from=publish /app/publish . 28 | 29 | ENTRYPOINT ["./splunk-launch.sh", "dotnet", "MultiStageDocker.dll"] -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/MultiStageDockerNuGetOption/MultiStageDocker.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | Linux 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /integration-examples/splunk-otel-dotnet-docker/Program.cs: -------------------------------------------------------------------------------- 1 | var builder = WebApplication.CreateBuilder(args); 2 | var app = builder.Build(); 3 | 4 | app.MapGet("/", () => "Hello World!"); 5 | 6 | app.Run(); 7 | -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/status-page-to-metrics-api/README.md: -------------------------------------------------------------------------------- 1 | # Third-party Status Page API Check to Metric 2 | This example API test shows how to call multiple APIs, collect data, turn that data into a usable JSON payload, and send it off to another API. 3 | This test creates metrics using a Splunk Synthetics API test. 
4 | The test and its configuration are included in this directory: 5 | - [`synthetics_thirdparty_status_api_check.tf`](./synthetics_thirdparty_status_api_check.tf) 6 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs) 7 | 8 | For a detailed description of this test and how it functions, check out the [Splunk Lantern Article: Constructing an API test JSON payload](https://lantern.splunk.com/Observability/Product_Tips/Synthetic_Monitoring/Constructing_an_API_test_JSON_payload_for_alerting_on_external_dependencies) 9 | 10 | ## Synthetic API Test 11 | The synthetic API test will call the CloudFlare and GitHub status pages and report a metric with a value of 1 (status is impacted) or 0 (status is normal) for each: 12 | - `cloudflare.status` 13 | - `github.status` 14 | 15 | These metrics include dimensions for a description of any impact to status and an indicator (none, minor, major, or critical). 16 | ![alt text](image.png) 17 | 18 | ### Required Splunk Synthetic Global Variables 19 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test. 20 | - `org_ingest_token`: A provisioned INGEST token 21 | ![required synthetic variables](synthetic-variables.png) 22 | 23 | -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/status-page-to-metrics-api/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/synthetics-examples/API/status-page-to-metrics-api/image.png -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/status-page-to-metrics-api/synthetic-variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/synthetics-examples/API/status-page-to-metrics-api/synthetic-variables.png -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/status-to-splunk-hec/README.md: -------------------------------------------------------------------------------- 1 | # Third-party Status Page API Check to Splunk HEC 2 | This example API test calls the OpenAI status endpoint and collects data on ongoing incidents and updates. 3 | This test creates and sends a log event containing that incident data to a Splunk HEC endpoint. 4 | The test and its configuration are included in this directory: 5 | - [`synthetics_status_to_splunk_hec_api_check.tf`](./synthetics_status_to_splunk_hec_api_check.tf) 6 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs) 7 | 8 | ## Synthetic API Test 9 | The synthetic API test will call the OpenAI status page and report any current and ongoing incidents to a Splunk HEC endpoint of your choice. This example mostly illustrates how to ingest arbitrary data into Splunk; the test serves a double purpose by providing external monitoring of the HEC endpoint in question in addition to ingesting useful incident data. 
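For reference, a roughly equivalent HEC call made outside of Splunk Synthetics might look like the sketch below. The URL, token, and payload shown are placeholders standing in for the `splunk_hec_url` and `hec_token` global variables and the extracted incident updates.

```python
import json
import os

import requests

# Placeholders standing in for the synthetic test's global variables
# {{env.splunk_hec_url}} and {{env.hec_token}}.
hec_url = os.environ.get(
    "SPLUNK_HEC_URL", "https://hec-inputs.example.com:443/services/collector/raw"
)
hec_token = os.environ.get(
    "SPLUNK_HEC_TOKEN", "Splunk 00000000-0000-0000-0000-000000000000"
)

# Example payload shaped like the extracted ongoing-incident updates.
incident_updates = [
    {"status": "identified", "body": "Elevated error rates affecting the API"}
]

response = requests.post(
    hec_url,
    headers={"Authorization": hec_token},
    data=json.dumps(incident_updates),
    timeout=10,
)
print(response.status_code, response.text)
```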
10 | 11 | 12 | ### Required Splunk Synthetic Global Variables 13 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test. 14 | - `splunk_hec_url`: The url to your hec raw ingest (E.G. `https://hec-inputs-for-my-service.mysplunkinstance.com:443/services/collector/raw`) 15 | - **Terraform apply will fail if this global variable does not exist in your environment!** 16 | - `hec_token`: A provisioned hec token for basic auth (E.G. `Splunk 123412-3123-1234-abcd-1234123412abc`) 17 | 18 | -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/status-to-splunk-hec/synthetics_status_to_splunk_hec_api_check.tf: -------------------------------------------------------------------------------- 1 | resource "synthetics_create_api_check_v2" "synthetics_status_to_splunk_hec_api_check" { 2 | test { 3 | active = true 4 | automatic_retries = 0 5 | device_id = 34 6 | frequency = 60 7 | location_ids = ["aws-us-east-1", "aws-us-west-1"] 8 | name = "OpenAI Status - To Splunk HEC" 9 | scheduling_strategy = "round_robin" 10 | requests { 11 | configuration { 12 | body = null 13 | headers = {} 14 | name = "Get OpenAI status" 15 | request_method = "GET" 16 | url = "https://status.openai.com/proxy/status.openai.com" 17 | } 18 | validations { 19 | actual = "{{response.code}}" 20 | code = null 21 | comparator = "is_less_than" 22 | expected = "300" 23 | extractor = null 24 | name = "Assert response code is less than 300" 25 | source = null 26 | type = "assert_numeric" 27 | value = null 28 | variable = null 29 | } 30 | validations { 31 | actual = null 32 | code = null 33 | comparator = null 34 | expected = null 35 | extractor = "$.summary.ongoing_incidents[*].updates" 36 | name = "Extract from response body" 37 | source = "{{response.body}}" 38 | type = "extract_json" 39 | value = null 40 | variable = "openai_ongoing_incidents" 41 | } 42 | } 43 | requests { 44 | configuration { 45 | body = "{{custom.openai_ongoing_incidents}}" 46 | headers = { 47 | Authorization = "{{env.hec_token}}" 48 | } 49 | name = "Send to Splunk HEC Ingest" 50 | request_method = "POST" 51 | url = "{{env.splunk_hec_url}}" 52 | } 53 | validations { 54 | actual = "{{response.code}}" 55 | code = null 56 | comparator = "is_less_than" 57 | expected = "300" 58 | extractor = null 59 | name = "Assert response code is less than 300" 60 | source = null 61 | type = "assert_numeric" 62 | value = null 63 | variable = null 64 | } 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/README.md: -------------------------------------------------------------------------------- 1 | # Token Expiration using Splunk Synthetics API check 2 | This Test queries the `/organization` endpoint of a Splunk Observability organization and retrieves the values of any tokens expiring within the next 30 days or next 7 days and sends metrics for that data to Splunk Observability. 3 | - [`synthetics_token_expiration_api_check.tf`](./synthetics_token_expiration_api_check.tf) 4 | This API test includes a detector which relies on metrics created by this test. That test and it's configuration are also included in this directory along with the detector as Terraform `.tf` files. 
5 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs) 6 | - [`detector_token_expiration.tf`](detector_token_expiration.tf) 7 | - Uses the [Signalfx Terraform Provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) 8 | 9 | ## Synthetic API Test 10 | The synthetic API test will call the [`/organization` endpoint](https://dev.splunk.com/observability/reference/api/organizations/latest#endpoint-retrieve-organization) for your Splunk Observability organization and collect the list of tokens expiring in the next 7 and 30 days. Those token names will be added as dimension attributes to two new metrics named: 11 | - `tokens.expiring.7days` 12 | - `tokens.expiring.30days` 13 | 14 | These metrics and dimensions will be sent to your organization's ingest endpoint and will power your detector. 15 | 16 | ### Required Splunk Synthetic Global Variables 17 | The following [global variables](https://docs.splunk.com/observability/en/synthetics/test-config/global-variables.html) are **REQUIRED** to run the included API test. 18 | - `org_api_token`: A provisioned API token (Read-only is fine) 19 | - `org_ingest_token`: A provisioned INGEST token 20 | ![required synthetic variables](synthetic-variables.png) 21 | 22 | 23 | ## Token Expiration Metrics and Detection 24 | Both `tokens.expiring.7days` and `tokens.expiring.30days` can be charted as you normally would with any other metric. 25 | ![chart of token expiration metrics](token-expire-chart.png) 26 | 27 | The [included alert](./detector_token_expiration.tf) includes custom thresholds for both of the included metrics. If you'd prefer these can easily be split into two alerts of different severities. Simply alert when either of the signals is greater than 0. -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/synthetic-variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/synthetic-variables.png -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/token-expire-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/synthetics-examples/API/token-expiration-to-metrics-api/token-expire-chart.png -------------------------------------------------------------------------------- /integration-examples/synthetics-examples/Browser/hipstershop-complete-order-test-browser/README.md: -------------------------------------------------------------------------------- 1 | # Synthetic Browser Check - Purchase Checkout Example 2 | This synthetic browser check provides an example test for complex user flows (in this case checkout) on an e-commerce website ([HipsterShop](https://github.com/signalfx/microservices-demo/)). It simulates the user journey from browsing products to completing an order, ensuring critical functionalities are working correctly. 
3 | The test and its configuration are included in this directory: 4 | - [`synthetics_hipstershop_order_completion_browser_check.tf`](./synthetics_hipstershop_order_completion_browser_check.tf) 5 | - Uses the [Splunk Synthetics Terraform provider](https://registry.terraform.io/providers/splunk/synthetics/latest/docs) 6 | 7 | ## Synthetic Browser Test 8 | The configuration leverages Terraform's synthetics browser check resource to automate interactions such as navigating URLs, selecting products, adding them to the cart, and placing orders. This example can be adapted for testing similar flows in your own applications. 9 | 10 | - For more information on selectors and how to find the correct ones when building off this example, check out this [Splunk Lantern article](https://lantern.splunk.com/Observability/UCE/Proactive_response/Improve_User_Experiences/Running_Synthetics_browser_tests/Selectors_for_multi-step_browser_tests)! 11 | 12 | ## Required Setup 13 | 14 | 1. **Replace the hipstershop URL in the test with your URL**: Modify the placeholder value in this test from `https://my-hipstershop-demo-url-should-go-here.com/` to the URL of your HipsterShop instance 15 | 16 | ## Transaction Steps Details: 17 | 18 | **Home Transaction:** 19 | Uses the go_to_url action to navigate to the Hipstershop demo site's URL. 20 | 21 | **Shop Transaction:** 22 | Executes JavaScript to select a random product from a predefined list and open the product's page. 23 | 24 | **Cart Transaction:** 25 | Clicks the "Add to Cart" button using an `xpath` selector to locate the button. 26 | 27 | **Place Order Transaction:** 28 | Step 1: Clicks the "Place order" button using an `xpath` selector. 29 | Step 2: Waits for 20 seconds to allow for the backend to process the order. 30 | Step 3: Asserts that the text "Order Confirmation ID" is present on the page. 
31 | 32 | **Keep Browsing Transaction:** 33 | Clicks a button to navigate away from the order confirmation page 34 | 35 | 36 | -------------------------------------------------------------------------------- /integration-examples/system-scanner/health.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import shutil 4 | from typing import Dict 5 | import logging 6 | 7 | 8 | class HealthCheck: 9 | def __init__(self): 10 | self.logger = logging.getLogger(__name__) 11 | 12 | def check_system_resources(self) -> Dict[str, bool]: 13 | checks = { 14 | "disk_space": self._check_disk_space(), 15 | "network": self._check_network(), 16 | "file_permissions": self._check_file_permissions(), 17 | } 18 | self._log_health_status(checks) 19 | return checks 20 | 21 | def _check_disk_space(self, min_space_gb: float = 1.0) -> bool: 22 | """Check if there's sufficient disk space using standard lib shutil""" 23 | try: 24 | total, used, free = shutil.disk_usage("/") 25 | free_gb = free // (2**30) # Convert bytes to GB 26 | return free_gb >= min_space_gb 27 | except Exception as e: 28 | self.logger.error(f"Error checking disk space: {e}") 29 | return False 30 | 31 | def _check_network(self) -> bool: 32 | """Basic network connectivity check using standard socket library""" 33 | try: 34 | # Try to connect to Google's DNS server 35 | socket.create_connection(("8.8.8.8", 53), timeout=3) 36 | return True 37 | except Exception as e: 38 | self.logger.error(f"Error checking network: {e}") 39 | return False 40 | 41 | def _check_file_permissions(self) -> bool: 42 | """Check if the program has necessary file permissions""" 43 | try: 44 | # Try to create a temporary file 45 | test_file = "permission_test.tmp" 46 | with open(test_file, "w") as f: 47 | f.write("test") 48 | os.remove(test_file) 49 | return True 50 | except Exception as e: 51 | self.logger.error(f"Error checking file permissions: {e}") 52 | return False 53 | 54 | def _log_health_status(self, checks: Dict[str, bool]): 55 | for check, status in checks.items(): 56 | if not status: 57 | self.logger.warning(f"Health check failed for: {check}") 58 | else: 59 | self.logger.info(f"Health check passed for: {check}") 60 | -------------------------------------------------------------------------------- /integration-examples/system-scanner/os_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | SystemScanner: OS Information Module 3 | 4 | This module provides functionality to retrieve detailed information 5 | about the operating system on which the script is running. 6 | 7 | It includes functions to get the system name, release version, 8 | and architecture. 
9 | """ 10 | 11 | import platform 12 | import sys 13 | 14 | 15 | def get_os_info(): 16 | system = platform.system() 17 | release = platform.release() 18 | architecture = platform.machine() 19 | os_flavor = "" 20 | 21 | # Add OS flavor information focusing on the [0] output of platform 22 | if system == "Darwin": # macOS 23 | os_flavor = f"macOS {platform.mac_ver()[0]}" 24 | elif system == "Linux": 25 | try: 26 | # Use platform.freedesktop_os_release() if available (Python 3.8+) 27 | if sys.version_info >= (3, 8): 28 | os_info = platform.freedesktop_os_release() 29 | if os_info.get("PRETTY_NAME"): 30 | os_flavor = f"Linux {os_info['PRETTY_NAME']}" 31 | elif os_info.get("NAME"): 32 | os_flavor = f"Linux {os_info['NAME']}" 33 | else: 34 | os_flavor = "Linux" 35 | else: 36 | # Fallback to reading /etc/os-release (for Python < 3.8) 37 | with open("/etc/os-release") as f: 38 | for line in f: 39 | if line.startswith("PRETTY_NAME="): 40 | os_flavor = line.split("=")[1].strip().strip('"') 41 | break 42 | except: 43 | os_flavor = "Linux" 44 | elif system == "Windows": 45 | os_flavor = f"Windows {platform.win32_ver()[0]}" 46 | 47 | return system, release, architecture, os_flavor 48 | -------------------------------------------------------------------------------- /integration-examples/system-scanner/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import contextlib 3 | from datetime import datetime 4 | import os 5 | 6 | 7 | class ContextLogger: 8 | def __init__(self, logger_name: str, log_file: str = "system_scanner.log"): 9 | self.logger = logging.getLogger(logger_name) 10 | 11 | # Create logs directory if it doesn't exist 12 | log_dir = "logs" 13 | if not os.path.exists(log_dir): 14 | os.makedirs(log_dir) 15 | 16 | log_path = os.path.join(log_dir, log_file) 17 | 18 | # Configure logging 19 | formatter = logging.Formatter( 20 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 21 | ) 22 | file_handler = logging.FileHandler(log_path) 23 | file_handler.setFormatter(formatter) 24 | 25 | # Add handlers if they don't exist 26 | if not self.logger.handlers: 27 | self.logger.addHandler(file_handler) 28 | self.logger.setLevel(logging.INFO) 29 | 30 | self.start_time = None 31 | 32 | @contextlib.contextmanager 33 | def operation_context(self, operation: str): 34 | self.start_time = datetime.now() 35 | self.logger.info(f"Starting operation: {operation}") 36 | try: 37 | yield 38 | except Exception as e: 39 | self.logger.error(f"Error during {operation}: {e}") 40 | raise 41 | finally: 42 | duration = datetime.now() - self.start_time 43 | self.logger.info( 44 | f"{operation} completed in {duration.total_seconds():.2f}s" 45 | ) 46 | -------------------------------------------------------------------------------- /integration-examples/system-scanner/validators.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from typing import Optional 4 | 5 | 6 | def sanitize_command_output(output: str) -> str: 7 | """Sanitize command output to prevent injection""" 8 | if not isinstance(output, str): 9 | return "" 10 | return re.sub(r"[^\w\s.()-]", "", output) # Added () to the allowed characters 11 | 12 | 13 | def validate_path(path: str) -> Optional[str]: 14 | """Validate and normalize file path""" 15 | if not path or not isinstance(path, str): 16 | return None 17 | try: 18 | normalized_path = os.path.normpath(path) 19 | return normalized_path if os.path.exists(normalized_path) else None 20 | 
except Exception: 21 | return None 22 | 23 | 24 | def validate_version_string(version: str) -> str: 25 | """Validate and clean version string""" 26 | if not version or not isinstance(version, str): 27 | return "Unknown" 28 | # Remove any potentially harmful characters, keeping only valid version characters 29 | cleaned = re.sub(r"[^\w\s.-]", "", version) 30 | return cleaned if cleaned else "Unknown" 31 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/.gitignore: -------------------------------------------------------------------------------- 1 | terraform.tfstate 2 | terraform.tfstate.d/* 3 | terraform.tfstate.backup 4 | .terraform.lock.hcl 5 | .terraform 6 | secret.tfvars 7 | terraform.tfvars 8 | terraform.tfvars 9 | .DS_Store 10 | 11 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/README.md: -------------------------------------------------------------------------------- 1 | # Observability Cloud Jumpstart 2 | 3 | **Note:** Requires Terraform (minimum) v1.3.x 4 | 5 | ## Introduction 6 | 7 | This repository provides detectors, dashboard groups, and dashboards that can easily be deployed in a Splunk Observability Cloud org using Terraform. 8 | 9 | This can be useful for the assets themselves, but also as a construct for how you can easily share assets across multiple parent/child orgs. Also included is an [export script](./export_script) which can be used to easily export dashboards, dashboard groups, and detectors. 10 | 11 | These are complementary to the out-of-the-box content provided by Splunk. This repository and its assets are provided "as-is" and are not supported by Splunk. 12 | 13 | ## Clone the repository 14 | 15 | `git clone https://github.com/splunk/observability-content-contrib.git` 16 | 17 | ## Change into the JumpStart directory 18 | 19 | `cd observability-content-contrib/integration-examples/terraform-jumpstart` 20 | 21 | ## Initialise Terraform 22 | 23 | ``` text 24 | terraform init --upgrade 25 | ``` 26 | 27 | ## Create a workspace (optional) 28 | 29 | ``` text 30 | terraform workspace new my_workspace 31 | ``` 32 | 33 | Where `my_workspace` is the name of the workspace you want to create. 34 | 35 | ## Terraform variables description 36 | 37 | - `api_token`: Observability API Token 38 | - `realm`: Observability Realm (`eu0`, `us0`, `us1`, `us2`, `jp0`, `au0`) 39 | - `o11y_prefix`: Text that will prefix all the detectors, dashboard groups, and dashboards 40 | 41 | ## Create a `terraform.tfvars` file 42 | 43 | Copy the template file `terraform.tfvars.template` to `terraform.tfvars` and fill in the values, e.g. 44 | 45 | ``` text 46 | api_token="1234xxx5678yyyy" 47 | realm="eu0" 48 | o11y_prefix="[Splunk]" 49 | ``` 50 | 51 | ## Review the execution plan 52 | 53 | ``` text 54 | terraform plan 55 | ``` 56 | 57 | ## Apply the changes 58 | 59 | ``` text 60 | terraform apply 61 | ``` 62 | 63 | ## Destroy everything 64 | 65 | If you created a workspace, you will first need to ensure you are in the correct workspace, e.g. 66 | 67 | ``` text 68 | terraform workspace select my_workspace 69 | ``` 70 | 71 | Where `my_workspace` is the name of the workspace you want to be in.
Then run the destroy command: 72 | 73 | ``` text 74 | terraform destroy 75 | ``` 76 | 77 | ## Deploying a module 78 | 79 | ``` text 80 | terraform apply -target=module.aws 81 | terraform apply -target=module.dashboards 82 | terraform apply -target=module.gcp 83 | ``` 84 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/export_script/.gitignore: -------------------------------------------------------------------------------- 1 | *.tf 2 | venv -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/export_script/requirements.txt: -------------------------------------------------------------------------------- 1 | signalfx >= 1.1.7 2 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/main.tf: -------------------------------------------------------------------------------- 1 | provider "signalfx" { 2 | auth_token = var.api_token 3 | api_url = "https://api.${var.realm}.signalfx.com" 4 | } 5 | 6 | module "aws" { 7 | source = "./modules/aws" 8 | o11y_prefix = var.o11y_prefix 9 | } 10 | 11 | module "host" { 12 | source = "./modules/host" 13 | o11y_prefix = var.o11y_prefix 14 | 15 | } 16 | 17 | module "kafka" { 18 | source = "./modules/kafka" 19 | o11y_prefix = var.o11y_prefix 20 | 21 | } 22 | 23 | module "azure" { 24 | source = "./modules/azure" 25 | o11y_prefix = var.o11y_prefix 26 | } 27 | 28 | module "docker" { 29 | source = "./modules/docker" 30 | o11y_prefix = var.o11y_prefix 31 | } 32 | 33 | module "gcp" { 34 | source = "./modules/gcp" 35 | o11y_prefix = var.o11y_prefix 36 | } 37 | 38 | module "kubernetes" { 39 | source = "./modules/kubernetes" 40 | o11y_prefix = var.o11y_prefix 41 | } 42 | 43 | module "pivotal" { 44 | source = "./modules/pivotal" 45 | o11y_prefix = var.o11y_prefix 46 | } 47 | 48 | module "usage_dashboard" { 49 | source = "./modules/dashboards/usage" 50 | o11y_prefix = var.o11y_prefix 51 | } 52 | 53 | module "parent_child_dashboard" { 54 | source = "./modules/dashboards/parent" 55 | o11y_prefix = var.o11y_prefix 56 | } 57 | 58 | module "rum_and_synthetics_dashboard" { 59 | source = "./modules/dashboards/rum_and_synthetics" 60 | o11y_prefix = var.o11y_prefix 61 | } 62 | 63 | module "executive-dashboards" { 64 | source = "./modules/dashboards/executive-dashboards" 65 | o11y_prefix = var.o11y_prefix 66 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/aws/ecs.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "aws_ecs_smartagent_cpu" { 2 | name = "${var.o11y_prefix} ECS Cluster High CPU 5m (SFX) - SmartAgent" 3 | description = "Alert when an ECS Cluster has sustained high CPU levels for 5 minutes" 4 | program_text = <<-EOF 5 | A = data('cpu.usage.total', filter=filter('ecs_task_group', '*'), rollup='rate').publish(label='A', enable=False) 6 | B = data('cpu.usage.system', filter=filter('ecs_task_group', '*'), rollup='rate').publish(label='B', enable=False) 7 | C = ((A/B)*100).publish(label='C', enable=False) 8 | E = (C).min().publish(label='E', enable=False) 9 | G = (C).percentile(pct=10).publish(label='G', enable=False) 10 | F = (C).percentile(pct=50).publish(label='F', enable=False) 11 | H = (C).percentile(pct=95).publish(label='H', enable=False) 12 | D = (C).max().publish(label='D', enable=False) 13 | detect(when(D 
> 90, lasting='5m')).publish('AWS/ECS Cluster High CPU 5m') 14 | EOF 15 | rule { 16 | detect_label = "AWS/ECS Cluster High CPU 5m" 17 | severity = "Major" 18 | parameterized_body = var.message_body 19 | } 20 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/aws/elb.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "httpcode_elb_5xx" { 2 | name = "${var.o11y_prefix} AWS/ELB has high 5XX response ratio" 3 | description = "Alerts when 10% of requests were 5XX for last 5m" 4 | 5 | program_text = <<-EOF 6 | A = data('HTTPCode_ELB_5XX', filter=(filter('namespace', 'AWS/ELB') and filter('stat', 'count') and filter('LoadBalancerName', '*'))).publish(label='HTTPCode_ELB_5XX', enable=False) 7 | B = data('RequestCount', filter=(filter('namespace', 'AWS/ELB') and filter('stat', 'count') and filter('LoadBalancerName', '*'))).publish(label='RequestCount', enable=False) 8 | detect(when(((A/B)*100) >= 10, lasting='5m')).publish('AWS/ELB 10% of requests were 5XX for last 5m') 9 | EOF 10 | 11 | rule { 12 | detect_label = "AWS/ELB 10% of requests were 5XX for last 5m" 13 | severity = "Critical" 14 | parameterized_body = var.message_body 15 | } 16 | } 17 | 18 | resource "signalfx_detector" "surgequeuelength_elb" { 19 | name = "${var.o11y_prefix} AWS/ELB has high Surge Queue Length (>= 90%)" 20 | description = "Alerts when Surge Queue Length is >= 90%" 21 | 22 | program_text = <<-EOF 23 | A = data('SurgeQueueLength', filter=filter('stat', 'upper') and (not filter('AvailabilityZone', '*'))).publish(label='A') 24 | detect(when((A/1024)*100 >= 90, lasting='5m')).publish('AWS/ELB SurgeQueueLength is close to capacity') 25 | EOF 26 | 27 | rule { 28 | detect_label = "AWS/ELB SurgeQueueLength is close to capacity" 29 | severity = "Critical" 30 | parameterized_body = var.message_body 31 | } 32 | } 33 | 34 | resource "signalfx_detector" "spillover_elb" { 35 | name = "${var.o11y_prefix} AWS/ELB has spillover" 36 | description = "Alerts when ELB Spillover is detected (generates 503 for users)" 37 | 38 | program_text = <<-EOF 39 | A = data('SpilloverCount', filter=filter('stat', 'sum') and filter('namespace', 'AWS/ELB') and (not filter('AvailabilityZone', '*'))).publish(label='A') 40 | detect(when(A > 0)).publish('AWS/ELB Spillover detected') 41 | EOF 42 | 43 | rule { 44 | detect_label = "AWS/ELB Spillover detected" 45 | severity = "Critical" 46 | parameterized_body = var.message_body 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/aws/lambda.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "lambda_errors" { 2 | name = "${var.o11y_prefix} AWS/Lambda Errors" 3 | description = "AWS/Lambda Function Error Rates" 4 | program_text = <<-EOF 5 | function_errors = data('Errors', filter=(filter('namespace', 'AWS/Lambda') and filter('FunctionName', '*') and filter('Resource', '*') and filter('stat', 'sum'))).publish(label='function_errors', enable=False) 6 | detect((when(function_errors > 10, lasting='5m'))).publish('AWS/Lambda function error rate is greater than 10 for the last 5m') 7 | from signalfx.detectors.against_periods import against_periods 8 | hist_duration_errors = data('Duration', filter=filter('namespace', 'AWS/Lambda')).mean().publish(label='hist_duration_errors', enable=False) 9 | 
against_periods.detector_mean_std(stream=hist_duration_errors, window_to_compare='15m', space_between_windows='60m', num_windows=4, fire_num_stddev=3, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('AWS/Lambda Lambda duration has been greater then historical norm during the past 15 minutes') 10 | from signalfx.detectors.against_periods import against_periods 11 | cold_start_errors = data('function.cold_starts',filter=filter('namespace', 'AWS/Lambda')).publish(label='cold_start_errors', enable=False) 12 | against_periods.detector_mean_std(stream=cold_start_errors, window_to_compare='10m', space_between_windows='24h', num_windows=4, fire_num_stddev=3, clear_num_stddev=2.5, discard_historical_outliers=True, orientation='above').publish('AWS/Lambda Wrapper coldstart count has been greater then historical norm during the past 10 minutes') 13 | EOF 14 | rule { 15 | detect_label = "AWS/Lambda function error rate is greater than 10 for the last 5m" 16 | severity = "Major" 17 | parameterized_body = var.message_body 18 | } 19 | rule { 20 | detect_label = "AWS/Lambda Lambda duration has been greater then historical norm during the past 15 minutes" 21 | severity = "Minor" 22 | parameterized_body = var.message_body 23 | } 24 | rule { 25 | detect_label = "AWS/Lambda Wrapper coldstart count has been greater then historical norm during the past 10 minutes" 26 | severity = "Warning" 27 | parameterized_body = var.message_body 28 | } 29 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/aws/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/aws/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/azure/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 
9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/azure/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/azure/vm.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "azure_cpu_historical_norm" { 2 | name = "${var.o11y_prefix} Azure VM CPU % greater than historical norm" 3 | description = "Alerts when CPU usage for this host for the last 10 minutes was significantly higher than normal, as compared to the last 24 hours" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_recent import against_recent 6 | A = data('Percentage CPU', filter=(filter('primary_aggregation_type', 'true'))).publish(label='A', enable=False) 7 | against_recent.detector_mean_std(stream=A, current_window='10m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU % is significantly greater than normal, and increasing') 8 | EOF 9 | rule { 10 | detect_label = "CPU % is significantly greater than normal, and increasing" 11 | severity = "Warning" 12 | parameterized_body = var.message_body 13 | } 14 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/main.tf: -------------------------------------------------------------------------------- 1 | ### Create a Dashboard Group for our Dashboards 2 | resource "signalfx_dashboard_group" "exec_dashboard_group" { 3 | name = "${var.o11y_prefix} Exec Level Dashboards" 4 | description = "Executive Level Dashboards" 5 | 6 | ### Note that if you use these features, you must use a user's 7 | ### admin key to authenticate the provider, lest Terraform not be able 8 | ### to modify the dashboard group in the future! 
9 | #authorized_writer_teams = [signalfx_team.mycoolteam.id] 10 | #authorized_writer_users = ["abc123"] 11 | } 12 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/variables.tf: -------------------------------------------------------------------------------- 1 | variable "o11y_prefix" { 2 | type = string 3 | description = "Detector Prefix" 4 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/executive-dashboards/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | version = ">=6.13.1" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/parent/main.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_dashboard_group" "parentchildoverview" { 2 | name = "${var.o11y_prefix} Parent/Child Overview (Terraform)" 3 | description = "Parent/Child Overview/Usage Dashboards" 4 | } 5 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/parent/variables.tf: -------------------------------------------------------------------------------- 1 | variable "o11y_prefix" { 2 | type = string 3 | description = "Dashboard Prefix" 4 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/parent/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/main.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_dashboard_group" "rumandsynthetics" { 2 | name = "${var.o11y_prefix} RUM and Synthetics (Terraform)" 3 | description = "RUM and SYnthetics Dashboard" 4 | } 5 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/variables.tf: -------------------------------------------------------------------------------- 1 | variable "o11y_prefix" { 2 | type = string 3 | description = "Dashboard Prefix" 4 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/rum_and_synthetics/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/usage/main.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_dashboard_group" "usageoverview" { 2 | name = "${var.o11y_prefix} Usage 
Overview (Terraform)" 3 | description = "Host Based Model, MTS and Events Usage" 4 | } 5 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/usage/variables.tf: -------------------------------------------------------------------------------- 1 | variable "o11y_prefix" { 2 | type = string 3 | description = "Dashboard Prefix" 4 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/dashboards/usage/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/docker/container.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "container_cpu_utilization" { 2 | name = "${var.o11y_prefix} Container CPU utilization % high" 3 | description = "Alerts when CPU Utilization % is between 70% & 80% for 10mins and > 80% for 5mins" 4 | program_text = <<-EOF 5 | A = data('cpu.usage.total', filter=filter('plugin', 'docker')).publish(label='A', enable=False) 6 | B = data('cpu.usage.system', filter=filter('plugin', 'docker')).publish(label='B', enable=False) 7 | C = (A/B*100).publish(label='Container CPU') 8 | detect(when(C > 80, lasting='5m')).publish('Container CPU utilization % is above 80 for 5m') 9 | detect(when(not (C > 80) and not (C < 70), lasting='10m')).publish('Container CPU utilization % is within 70 and 80 for 10m') 10 | EOF 11 | rule { 12 | detect_label = "Container CPU utilization % is within 70 and 80 for 10m" 13 | severity = "Warning" 14 | parameterized_body = var.message_body 15 | } 16 | rule { 17 | detect_label = "Container CPU utilization % is above 80 for 5m" 18 | severity = "Major" 19 | parameterized_body = var.message_body 20 | } 21 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/docker/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 
9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/docker/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/gcp/compute.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "gcp_cpu_historical_norm" { 2 | name = "${var.o11y_prefix} GCP Compute Engine CPU % greater than historical norm" 3 | description = "Alerts when CPU usage for this host for the last 10 minutes was significantly higher than normal, as compared to the last 24 hours" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_recent import against_recent 6 | A = data('instance/cpu/utilization').publish(label='A', enable=False) 7 | against_recent.detector_mean_std(stream=A, current_window='10m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU utilization is significantly greater than normal, and increasing') 8 | EOF 9 | rule { 10 | detect_label = "CPU utilization is significantly greater than normal, and increasing" 11 | severity = "Warning" 12 | parameterized_body = var.message_body 13 | } 14 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/gcp/storage.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "gcp_cloud_storage_errors" { 2 | name = "${var.o11y_prefix} GCP Cloud Storage Requests High Error Rate" 3 | description = "Alerts when there is a high 4xx or 5xx error rate" 4 | program_text = <<-EOF 5 | A = data('api/request_count', filter=filter('response_code', '4*'), rollup='latest').sum(by=['bucket_name']).publish(label='4xx error', enable=False) 6 | B = data('api/request_count', rollup='latest').sum(by=['bucket_name']).publish(label='total', enable=False) 7 | detect(when(((A/B)*100) >= 10, lasting='5m')).publish('GCP Cloud Storage 10% of requests were 4xx for 5m') 8 | C = data('api/request_count', filter=filter('response_code', '5*'), rollup='latest').sum(by=['bucket_name']).publish(label='5xx error', enable=False) 9 | D = data('api/request_count', rollup='latest').sum(by=['bucket_name']).publish(label='total', enable=False) 10 | detect(when(((C/D)*100) >= 10, lasting='5m')).publish('GCP Cloud Storage 10% of requests were 5xx for 5m') 11 | EOF 12 | rule { 13 | detect_label = "GCP Cloud Storage 10% of requests were 4xx for 5m" 14 | severity = "Major" 15 | parameterized_body = var.message_body 16 | } 17 | rule { 
18 | detect_label = "GCP Cloud Storage 10% of requests were 5xx for 5m" 19 | severity = "Major" 20 | parameterized_body = var.message_body 21 | } 22 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/gcp/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/gcp/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/host/cpu.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "cpu_historical_norm" { 2 | name = "${var.o11y_prefix} CPU utilization % greater than historical norm" 3 | description = "Alerts when CPU usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_recent import against_recent 6 | A = data('cpu.utilization').publish(label='A', enable=True) 7 | against_recent.detector_mean_std(stream=A, current_window='30m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, calculation_mode='vanilla').publish('CPU utilization is significantly greater than normal, and increasing') 8 | EOF 9 | rule { 10 | detect_label = "CPU utilization is significantly greater than normal, and increasing" 11 | severity = "Warning" 12 | parameterized_body = var.message_body 13 | } 14 | } 15 | 16 | resource "signalfx_detector" "cpu_historical_cyclical_norm" { 17 | name = "${var.o11y_prefix} CPU utilization % greater than 3.5 std dev compared to the same time window over the last 3 days" 18 | description = "Alerts when CPU usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours" 19 | program_text = <<-EOF 20 | from signalfx.detectors.against_periods import against_periods 21 | A = data('cpu.utilization').publish(label='A', enable=True) 22 | against_periods.detector_mean_std(stream=A, window_to_compare='30m', space_between_windows='24h', num_windows=3, fire_num_stddev=3.5, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('CPU Utilization is 
greater than normal for the same time window compared to the last 3 days') 23 | EOF 24 | rule { 25 | detect_label = "CPU Utilization is greater than normal for the same time window compared to the last 3 days" 26 | severity = "Warning" 27 | parameterized_body = var.message_body 28 | } 29 | } 30 | 31 | resource "signalfx_detector" "cpu_not_reporting" { 32 | name = "${var.o11y_prefix} Host has stopped reporting data for atleast 1 minute" 33 | description = "Alerts when Host has stopped reporting data for atleast a minute" 34 | program_text = <<-EOF 35 | from signalfx.detectors.not_reporting import not_reporting 36 | A = data('cpu.utilization').publish(label='A', enable=True) 37 | not_reporting.detector(stream=A, resource_identifier=None, duration='1m').publish('Host Not Reporting') 38 | EOF 39 | rule { 40 | detect_label = "Host Not Reporting" 41 | severity = "Critical" 42 | parameterized_body = var.message_body 43 | } 44 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/host/disk.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "disk_space_low" { 2 | name = "${var.o11y_prefix} Low Disk Space" 3 | description = "Alerts when a partition is filling up or total disk space will fill up within 24hrs" 4 | 5 | program_text = <<-EOF 6 | A = data('disk.utilization', filter=(not filter('plugin_instance', 'snap*'))).publish(label='Disk Utilization', enable=False) 7 | detect(when(A >= 80 and A < 90)).publish('Disk space has filled upto greater than 80% but less than 90%') 8 | detect(when(A >= 90)).publish('Disk space has filled upto or is greater than 90%') 9 | from signalfx.detectors.countdown import countdown 10 | B = data('disk.summary_utilization').publish(label='Disk Summary Utilization', enable=False) 11 | countdown.hours_left_stream_incr_detector(stream=B, maximum_capacity=100, lower_threshold=24, fire_lasting=lasting('15m', 1), clear_threshold=36, clear_lasting=lasting('15m', 1), use_double_ewma=False).publish('Disk space utilization is projected to reach 100% within 24 hours') 12 | EOF 13 | 14 | rule { 15 | detect_label = "Disk space has filled upto greater than 80% but less than 90%" 16 | severity = "Major" 17 | parameterized_body = var.message_body 18 | } 19 | rule { 20 | detect_label = "Disk space has filled upto or is greater than 90%" 21 | severity = "Critical" 22 | parameterized_body = var.message_body 23 | 24 | } 25 | rule { 26 | detect_label = "Disk space utilization is projected to reach 100% within 24 hours" 27 | severity = "Critical" 28 | parameterized_body = var.message_body 29 | } 30 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/host/mem.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "mem_historical_norm" { 2 | name = "${var.o11y_prefix} Mem utilization % greater than historical norm" 3 | description = "Alerts when Mem usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_recent import against_recent 6 | A = data('memory.utilization').publish(label='A', enable=True) 7 | against_recent.detector_mean_std(stream=A, current_window='30m', historical_window='24h', fire_num_stddev=3, clear_num_stddev=2.5, orientation='above', ignore_extremes=True, 
calculation_mode='vanilla').publish('Memory utilization is significantly greater than normal, and increasing') 8 | EOF 9 | rule { 10 | detect_label = "Memory utilization is significantly greater than normal, and increasing" 11 | severity = "Warning" 12 | parameterized_body = var.message_body 13 | } 14 | } 15 | 16 | resource "signalfx_detector" "mem_historical_cyclical_norm" { 17 | name = "${var.o11y_prefix} Memory utilization % greater than 3.5 std dev compared to the same time window over the last 3 days" 18 | description = "Alerts when Memory usage for this host for the last 30 minutes was significantly higher than normal, as compared to the last 24 hours" 19 | program_text = <<-EOF 20 | from signalfx.detectors.against_periods import against_periods 21 | A = data('memory.utilization').publish(label='A', enable=True) 22 | against_periods.detector_mean_std(stream=A, window_to_compare='30m', space_between_windows='24h', num_windows=3, fire_num_stddev=3.5, clear_num_stddev=2, discard_historical_outliers=True, orientation='above').publish('Memory Utilization is greater than normal for the same time window compared to the last 3 days') 23 | EOF 24 | rule { 25 | detect_label = "Memory Utilization is greater than normal for the same time window compared to the last 3 days" 26 | severity = "Warning" 27 | parameterized_body = var.message_body 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/host/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/host/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/kafka/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 
9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/kafka/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/kubernetes/node.tf: -------------------------------------------------------------------------------- 1 | /* 2 | resource "signalfx_detector" "k8s_node_cpu_imbalance" { 3 | name = "${var.o11y_prefix} K8S Cluster CPU balance" 4 | description = "Alerts when cluster CPU usage is imbalanced" 5 | program_text = <<-EOF 6 | A = data('container_cpu_utilization', filter=filter('k8s.cluster.name', '*') and filter('k8s.node.name', '*'), rollup='rate').sum(by=['k8s.node.name', 'k8s.cluster.name']).publish(label='A', enable=False) 7 | B = data('container_cpu_utilization', filter=filter('k8s.cluster.name', '*') and filter('k8s.node.name', '*')).sum(by=['k8s.node.name']).mean(by=['k8s.cluster.name']).publish(label='B', enable=False) 8 | C = ((A-B)/B).stddev(by=['k8s.cluster.name']).publish(label='C', enable=False) 9 | D = data('kube_node_info', filter=filter('k8s.cluster.name', '*'), rollup='count').count(by=['k8s.cluster.name']).publish(label='D', enable=False) 10 | E = (C*D).publish(label='K8S Cluster CPU usage is imbalanced') 11 | EOF 12 | rule { 13 | detect_label = "K8S Cluster CPU usage is imbalanced" 14 | severity = "Critical" 15 | disabled = true 16 | parameterized_body = var.message_body 17 | } 18 | } 19 | */ 20 | 21 | resource "signalfx_detector" "k8s_node_not_ready" { 22 | name = "${var.o11y_prefix} K8S Nodes are not ready" 23 | description = "Alerts when K8s Node is not a ready state" 24 | program_text = <<-EOF 25 | A = data('k8s.node.condition_ready').sum(by=['k8s.cluster.name', 'k8s.node.name']).publish(label='A') 26 | detect(when(A < threshold(1), lasting='30s')).publish('K8s Node is not in a ready state') 27 | EOF 28 | rule { 29 | detect_label = "K8s Node is not in a ready state" 30 | severity = "Critical" 31 | parameterized_body = var.message_body 32 | } 33 | } 34 | 35 | 36 | resource "signalfx_detector" "k8s_node_high_memory" { 37 | name = "${var.o11y_prefix} K8S Node Memory > 90%" 38 | description = "Alerts when K8s Node is using memory > 90% for 5m" 39 | program_text = <<-EOF 40 | A = data('memory.utilization', filter=filter('k8s.cluster.name', '*')).sum(by=['host', 'k8s.cluster.name']).publish(label='A') 41 | detect(when(A > threshold(90), lasting='5m')).publish('K8s Node Memory is higher than 90% for 5m') 42 | EOF 43 | rule { 44 | detect_label = "K8s Node Memory is higher than 90% for 5m" 45 | severity = "Major" 46 | parameterized_body = var.message_body 47 | } 48 | } 49 | 
-------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/kubernetes/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/kubernetes/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/pivotal/RouteEmitter.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "pivotal_cloudfoundry_DREM_errors" { 2 | name = "${var.o11y_prefix} Pivotal CloudFoundry Diego Route Emitter Metrics errors" 3 | description = "Alerts for various Pivotal CloudFoundry Route Emitter Metrics related error scenarios" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_periods import against_periods 6 | from signalfx.detectors.against_recent import against_recent 7 | from signalfx.detectors.not_reporting import not_reporting 8 | from signalfx.detectors.countdown import countdown 9 | TMP1 = data('route_emitter.RouteEmitterSyncDuration', filter=filter('metric_source', 'cloudfoundry'), rollup='max').max(over='15m').publish(label='TMP1', enable=False) 10 | RouteEmitterSyncDuration = (TMP1/1000000000).publish(label='C', enable=False) 11 | detect(when((RouteEmitterSyncDuration >= 5) and (RouteEmitterSyncDuration < 10))).publish('Pivotal Cloudfoundry - RouteEmitterSyncDuration between 5 and 10 seconds.') 12 | detect(when(RouteEmitterSyncDuration >= 10)).publish('Pivotal Cloudfoundry - RouteEmitterSyncDuration greater or eaqual to 10 seconds.') 13 | EOF 14 | rule { 15 | detect_label = "Pivotal Cloudfoundry - RouteEmitterSyncDuration between 5 and 10 seconds." 16 | severity = "Minor" 17 | tip = "If all or many jobs showing as impacted, there is likely an issue with Diego.\n 1 - Investigate the Route Emitter and Diego BBS logs for errors.\n2 - Verify that app routes are functional by making a request to an app, pushing an app and pinging it, or if applicable, checking that your smoke tests have passed.\nIf one or a few jobs showing as impacted, there is likely a connectivity issue and the impacted job should be investigated further." 
18 | parameterized_body = var.message_body 19 | } 20 | 21 | rule { 22 | detect_label = "Pivotal Cloudfoundry - RouteEmitterSyncDuration greater or eaqual to 10 seconds." 23 | severity = "Minor" 24 | tip = "If all or many jobs showing as impacted, there is likely an issue with Diego.\n 1 - Investigate the Route Emitter and Diego BBS logs for errors.\n2 - Verify that app routes are functional by making a request to an app, pushing an app and pinging it, or if applicable, checking that your smoke tests have passed.\nIf one or a few jobs showing as impacted, there is likely a connectivity issue and the impacted job should be investigated further." 25 | parameterized_body = var.message_body 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/pivotal/gorouter.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "pivotal_cloudfoundry_gorouter_errors" { 2 | name = "${var.o11y_prefix} Pivotal cloudFoundry gorouter errors" 3 | description = "Alerts for various Pivotal CloudFoundry gorouter related error scenarios" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_periods import against_periods 6 | from signalfx.detectors.against_recent import against_recent 7 | total_requests = data('gorouter.total_requests', filter=filter('metric_source', 'cloudfoundry'), rollup='average').delta().mean(over='5m').publish(label='total_requests', enable=True) 8 | latency = data('gorouter.latency', filter=filter('metric_source', 'cloudfoundry'), rollup='average').mean(over='30m').publish(label='latency', enable=True) 9 | detect((when(total_requests >= 0.5) and (total_requests < 1))).publish('Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cellis between .5 and 1.') 10 | detect(when(total_requests >=1)).publish('Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is greater or equal to 1.') 11 | detect(when(latency > 100)).publish('Pivotal Cloudfoundry - gorouter latency above 100 ms') 12 | EOF 13 | rule { 14 | detect_label = "Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cellis between .5 and 1." 15 | severity = "Minor" 16 | tip = "To increase throughput and maintain low latency, scale the Gorouters either horizontally or vertically and watch that the system.cpu.user metric for the Gorouter stays in the suggested range of 60-70% CPU Utilization." 17 | parameterized_body = var.message_body 18 | } 19 | rule { 20 | detect_label = "Pivotal Cloudfoundry - The number of Tasks that the auctioneer failed to place on Diego cell is greater or equal to 1." 21 | severity = "Critical" 22 | tip = "To increase throughput and maintain low latency, scale the Gorouters either horizontally or vertically and watch that the system.cpu.user metric for the Gorouter stays in the suggested range of 60-70% CPU Utilization." 23 | parameterized_body = var.message_body 24 | } 25 | 26 | rule { 27 | detect_label = "Pivotal Cloudfoundry - gorouter latency above 100 ms" 28 | severity = "Warning" 29 | tip = "First inspect logs for network issues and indications of misbehaving backends./nIf it appears that the Gorouter needs to scale due to ongoing traffic congestion, do not scale on the latency metric alone. 
You should also look at the CPU utilization of the Gorouter VMs and keep it within a maximum 60-70% range./nResolve high utilization by scaling the Gorouter." 30 | parameterized_body = var.message_body 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/pivotal/system.tf: -------------------------------------------------------------------------------- 1 | resource "signalfx_detector" "pivotal_cloudfoundry_system_errors" { 2 | name = "${var.o11y_prefix} Pivotal cloudFoundry system errors" 3 | description = "Alerts for various Pivotal CloudFoundry system related error scenarios" 4 | program_text = <<-EOF 5 | from signalfx.detectors.against_periods import against_periods 6 | from signalfx.detectors.against_recent import against_recent 7 | system_healthy = data('system.healthy', filter=filter('metric_source', 'cloudfoundry'), rollup='average').mean(over='5m').publish(label='system_healthy', enable=False) 8 | detect(when(system_healthy > 1)).publish('Pivotal Cloudfoundry - The value of system.healthy - Mean(5m) is above 1.') 9 | EOF 10 | rule { 11 | detect_label = "Pivotal Cloudfoundry - The value of system.healthy - Mean(5m) is above 1." 12 | severity = "Minor" 13 | tip = "Investigate CF logs for the unhealthy component(s)." 14 | parameterized_body = var.message_body 15 | } 16 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/pivotal/variables.tf: -------------------------------------------------------------------------------- 1 | variable "message_body" { 2 | type = string 3 | 4 | default = <<-EOF 5 | {{#if anomalous}} 6 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" triggered at {{timestamp}}. 7 | {{else}} 8 | Rule "{{{ruleName}}}" in detector "{{{detectorName}}}" cleared at {{timestamp}}. 
9 | {{/if}} 10 | 11 | {{#if anomalous}} 12 | Triggering condition: {{{readableRule}}} 13 | {{/if}} 14 | 15 | {{#if anomalous}} 16 | Signal value: {{inputs.A.value}} 17 | {{else}} 18 | Current signal value: {{inputs.A.value}} 19 | {{/if}} 20 | 21 | {{#notEmpty dimensions}} 22 | Signal details: {{{dimensions}}} 23 | {{/notEmpty}} 24 | 25 | {{#if anomalous}} 26 | {{#if runbookUrl}} 27 | Runbook: {{{runbookUrl}}} 28 | {{/if}} 29 | {{#if tip}} 30 | Tip: {{{tip}}} 31 | {{/if}} 32 | {{/if}} 33 | EOF 34 | } 35 | 36 | variable "o11y_prefix" { 37 | type = string 38 | description = "Detector Prefix" 39 | } -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/modules/pivotal/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | signalfx = { 4 | source = "splunk-terraform/signalfx" 5 | } 6 | } 7 | required_version = ">= 0.13" 8 | } 9 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/terraform.tfvars.template: -------------------------------------------------------------------------------- 1 | api_token="1234xxx5678yyyy" 2 | realm="eu0" 3 | o11y_prefix="[Splunk]" 4 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/variables.tf: -------------------------------------------------------------------------------- 1 | variable "api_token" { 2 | description = "Splunk API Token" 3 | } 4 | 5 | variable "realm" { 6 | description = "Splunk Realm" 7 | } 8 | 9 | variable "o11y_prefix" { 10 | type = string 11 | description = "Detector Prefix" 12 | default = "[Splunk]" 13 | } 14 | -------------------------------------------------------------------------------- /integration-examples/terraform-jumpstart/versions.tf: -------------------------------------------------------------------------------- 1 | 2 | terraform { 3 | required_version = ">= 0.13" 4 | required_providers { 5 | signalfx = { 6 | source = "splunk-terraform/signalfx" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /integration-examples/usage-reports-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Usage Reporting Scripts 2 | 3 | ## Custom Metric Reports 4 | This script parses unstructured Custom Metric Reports and outputs an easy-to-read table of the values from highest to lowest. 5 | 6 | Special thanks to Robert Castley! 7 | 8 | ### Using The Custom Metric Report Script 9 | **Pre-Requisite:** Download your Custom Metric Report from Splunk Observability like so: 10 | ![Custom Metric Report](./images/custom-metric-report.png) 11 | 12 | 1. Pull down this script 13 | 2. Install any required packages with `pip install -r requirements.txt` 14 | 3. Run the script on your downloaded Custom Metrics Report. 15 | - E.G.
--------------------------------------------------------------------------------
/integration-examples/usage-reports-scripts/README.md:
--------------------------------------------------------------------------------
# Usage Reporting Scripts

## Custom Metric Reports
This script parses unstructured Custom Metric Reports and outputs an easy-to-read table of the values, from highest to lowest.

Special thanks to Robert Castley!

### Using The Custom Metric Report Script
**Pre-Requisite:** Download your Custom Metric Report from Splunk Observability like so:
![Custom Metric Report](./images/custom-metric-report.png)

1. Pull down this script
2. Install any required packages with `pip install -r requirements.txt`
3. Run the script on your downloaded Custom Metric Report
   - e.g. `python custom-metric-report-parser.py -r ~/Downloads/2022-07-22_mts-by-metric.txt`

#### Full CLI Options
```
# python custom-metric-report-parser.py -h

usage: custom-metric-report-parser.py [-h] [-c CATEGORY] [-l LIMIT] -r REPORT

Splunk Observability Cloud - Custom Metrics Report Parser

optional arguments:
  -h, --help            show this help message and exit
  -c CATEGORY, --category CATEGORY
                        1 (Host), 2 (Container), 3 (Custom), 4 (Hi-Res), 5 (Bundled)
  -l LIMIT, --limit LIMIT
                        Limit no. of metrics displayed in table
  -r REPORT, --report REPORT
                        Custom Metric Report
```
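The parser below expects the downloaded report to be tab-separated, with a `Metric Name` column plus one MTS-count column per category. If you want to try it without a real report, a tiny made-up file can be generated like this (hypothetical metric names and counts; only the column headers the script actually reads are guaranteed to match a real report):

```
# Build a toy report just to exercise custom-metric-report-parser.py;
# the column names mirror those referenced in the script, the rows are invented.
headers = ["Metric Name", "No. Host MTS", "No. Container MTS", "No. Custom MTS",
           "No. High Resolution MTS", "No. Bundled MTS"]
rows = [
    ["my.service.latency", "0", "0", "1250", "0", "0"],
    ["my.service.errors", "0", "0", "430", "0", "0"],
]

with open("sample-report.txt", "w") as f:
    f.write("\t".join(headers) + "\n")
    for row in rows:
        f.write("\t".join(row) + "\n")

# Then: python custom-metric-report-parser.py -r sample-report.txt -c 3
```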
Bundled MTS" 33 | 34 | console = Console() 35 | 36 | metrics_list = {} 37 | 38 | table = Table( 39 | title="Splunk - Custom Metrics Report Parser", 40 | style="bright_magenta", 41 | title_style="bold italic", 42 | ) 43 | 44 | table.add_column("Metric Name", justify="left", style="cyan", no_wrap=True, width=80) 45 | table.add_column("MTS", justify="right", style="green") 46 | 47 | 48 | with open(args["report"]) as f: 49 | reader = csv.DictReader(f, delimiter="\t") 50 | for row in reader: 51 | if int(row[type]) != 0: 52 | metrics_list[row["Metric Name"]] = int(row[type]) 53 | 54 | total = 0 55 | 56 | res = sorted(metrics_list.items(), key=lambda v: v[1], reverse=True) 57 | for r in res[: int(args["limit"])]: 58 | mts = "{:,}".format(r[1]) 59 | table.add_row(r[0], mts) 60 | total = total + int(r[1]) 61 | 62 | total = "{:,}".format(total) 63 | table.add_row("Total MTS", total, style="bold white", end_section=True) 64 | console.print(table) 65 | -------------------------------------------------------------------------------- /integration-examples/usage-reports-scripts/images/custom-metric-report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/splunk/observability-content-contrib/0083468e0127aeda6097b8e41eb5e2a31cef4308/integration-examples/usage-reports-scripts/images/custom-metric-report.png -------------------------------------------------------------------------------- /integration-examples/usage-reports-scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | rich 2 | --------------------------------------------------------------------------------