├── .gitignore ├── docs └── architecture.png ├── datasources ├── team-1 │ ├── cloudwatch-example-1.yaml │ └── clickhouse-example-2.yaml ├── team-4 │ ├── clickhouse-example-1.yaml │ └── influxdb-example-1.yaml ├── team-2 │ ├── prometheus-example-1.yml │ └── mysql-example-1.yaml └── team-3 │ ├── victoria-metrics-example-1.yaml │ └── elasticsearch-example-1.yml ├── .editorconfig ├── aws-ssm-documents ├── runbooks │ └── GF-GenerateDataSources │ │ ├── role │ │ ├── trusted-policy.json │ │ └── permission-policy.json │ │ ├── README.md │ │ └── GF-UpdateDataSources.yaml └── commands │ └── GH-CloneRepository │ ├── README.md │ └── GH-CloneRepository.yaml ├── README.md └── scripts ├── generate-datasource.py └── get-secrets.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | -------------------------------------------------------------------------------- /docs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/observability-studio/grafana-ops/HEAD/docs/architecture.png -------------------------------------------------------------------------------- /datasources/team-1/cloudwatch-example-1.yaml: -------------------------------------------------------------------------------- 1 | name: CloudWatch Example 1 2 | type: cloudwatch 3 | access: proxy 4 | jsonData: 5 | authType: keys 6 | defaultRegion: ap-northeast-1 7 | timeField: "@timestamp" 8 | -------------------------------------------------------------------------------- /datasources/team-1/clickhouse-example-2.yaml: -------------------------------------------------------------------------------- 1 | name: Clickhouse Example 2 2 | type: grafana-clickhouse-datasource 3 | access: proxy 4 | url: http://clickhouse-2.example.com 5 | user: "@team-1/clickhouse-example-2:username" 6 | password: "@team-1/clickhouse-example-2:password" 7 | basicAuth: true 8 | 
-------------------------------------------------------------------------------- /datasources/team-4/clickhouse-example-1.yaml: -------------------------------------------------------------------------------- 1 | name: Clickhouse Example 1 2 | type: vertamedia-clickhouse-datasource 3 | access: proxy 4 | url: http://clickhouse.example.com 5 | user: "@team-4/clickhouse-example-1:username" 6 | password: "@team-4/clickhouse-example-1:password" 7 | basicAuth: true 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [**/*.md] 12 | trim_trailing_whitespace = false 13 | 14 | [**/*.py] 15 | indent_size = 4 16 | -------------------------------------------------------------------------------- /datasources/team-4/influxdb-example-1.yaml: -------------------------------------------------------------------------------- 1 | name: InfluxDB Example 1 2 | type: influxdb 3 | access: direct 4 | url: https://influxdb.example.com 5 | user: "@team-4/influxdb-example-1:username" 6 | password: "@team-4/influxdb-example-1:password" 7 | database: automation 8 | jsonData: 9 | httpMode: POST 10 | -------------------------------------------------------------------------------- /datasources/team-2/prometheus-example-1.yml: -------------------------------------------------------------------------------- 1 | name: Prometheus Example 1 2 | type: prometheus 3 | access: proxy 4 | url: http://10.20.30.40:9090 5 | user: "@team-2/prometheus-example-1:username" 6 | password: "@team-2/prometheus-example-1:password" 7 | basicAuth: "false" 8 | jsonData: 9 | httpMethod: POST 10 | -------------------------------------------------------------------------------- 
/datasources/team-3/victoria-metrics-example-1.yaml: -------------------------------------------------------------------------------- 1 | name: Victoria Metrics Example 1 2 | type: prometheus 3 | access: proxy 4 | url: http://vmauth:8427 5 | isDefault: true 6 | jsonData: 7 | httpHeaderName1: Authorization 8 | secureJsonData: 9 | httpHeaderValue1: Bearer @team-3/victoria-metrics-example-1:authorizationToken 10 | -------------------------------------------------------------------------------- /aws-ssm-documents/runbooks/GF-GenerateDataSources/role/trusted-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "", 6 | "Effect": "Allow", 7 | "Principal": { 8 | "Service": "ssm.amazonaws.com" 9 | }, 10 | "Action": "sts:AssumeRole" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /datasources/team-2/mysql-example-1.yaml: -------------------------------------------------------------------------------- 1 | name: MySQL Example 1 2 | type: mysql 3 | access: proxy 4 | url: mysql-example.cluster-123abc.ap-southeast-1.rds.amazonaws.com 5 | user: "@team-2/mysql-example-1:username" 6 | password: "@team-2/mysql-example-1:password" 7 | database: monitoring 8 | jsonData: 9 | connMaxLifetime: 30 10 | maxOpenConns: 10 11 | tlsAuth: "false" 12 | tlsAuthWithCACert: "false" 13 | -------------------------------------------------------------------------------- /datasources/team-3/elasticsearch-example-1.yml: -------------------------------------------------------------------------------- 1 | name: Elasticsearch Example 1 2 | type: elasticsearch 3 | access: proxy 4 | url: http://elasticsearch.example.com:9200 5 | user: "@team-3/elasticsearch-example-1:username" 6 | password: "@team-3/elasticsearch-example-1:password" 7 | # index 8 | database: logs-index 9 | basicAuth: true 10 | jsonData: 11 | esVersion: 7.7.0 12 | 
includeFrozen: false 13 | logLevelField: "" 14 | logMessageField: "" 15 | maxConcurrentShardRequests: 5 16 | timeField: "@timestamp" 17 | -------------------------------------------------------------------------------- /aws-ssm-documents/commands/GH-CloneRepository/README.md: -------------------------------------------------------------------------------- 1 | # GH-CloneRepository 2 | 3 | ## What does this document do? 4 | 5 | Clone a GitHub repository. If the repository is already cloned, it will be pulled instead 6 | 7 | ## Input Parameters 8 | 9 | - **RepositoryUrl**: (Required) HTTP URL of the repository. Including OAuth token in the URL. E.g. https://username:personal-access-token@github.com/username/repo.git 10 | - **Branch**: Branch that will be checked out. Default `master` 11 | - **WorkingDirectory**: Working directory that the given repository will be cloned into. Default `/opt` 12 | -------------------------------------------------------------------------------- /aws-ssm-documents/runbooks/GF-GenerateDataSources/README.md: -------------------------------------------------------------------------------- 1 | # GF-GenerateDataSources 2 | 3 | ## What does this runbook do? 4 | 5 | Generate Grafana data sources configuration file 6 | 7 | ## Input Parameters 8 | 9 | - **AssumeRole**: (Required) IAM Role that the runbook will assume to perform AWS calls on your behalf 10 | - **InstanceIds**: (Required) List of EC2 instances that the runbook will be applied to 11 | - **RepositoryUrl**: (Required) HTTP URL of the repository. Including OAuth token in the URL. E.g. https://username:personal-access-token@github.com/username/repo.git 12 | - **Env**: (Required) Which environment that runbook will be applied to. E.g. 
dev, test, staging, prod 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grafana Ops 2 | 3 | ## Motivation 4 | 5 | Check out this post [A GitOps Way To Manage Grafana Data Sources At Scale](https://dev.to/aws-builders/a-gitops-way-to-manage-grafana-data-sources-at-scale-59la) 6 | 7 | ## High-level architecture 8 | 9 | ![architecture](docs/architecture.png) 10 | 11 | ## Directory structure 12 | 13 | ```bash 14 | . 15 | ├── README.md 16 | ├── aws-ssm-documents 17 | │ ├── commands 18 | │ │ ├── GH-CloneRepository 19 | │ │ │ ├── GH-CloneRepository.yaml 20 | │ │ │ └── README.md 21 | │ │ └── ... 22 | │ └── runbooks 23 | │ └── GF-GenerateDataSources 24 | │ ├── GF-UpdateDataSources.yaml 25 | │ ├── README.md 26 | │ └── role 27 | │ ├── permission-policy.json 28 | │ └── trusted-policy.json 29 | ├── datasources 30 | │ ├── team-1 31 | │ │ ├── clickhouse-example-2.yaml 32 | │ │ └── cloudwatch-example-1.yaml 33 | │ ├── team-2 34 | │ │ ├── ... 35 | │ ├── ... 36 | └── scripts 37 | ├── generate-datasource.py 38 | └── get-secrets.py 39 | └── ... 40 | ``` 41 | -------------------------------------------------------------------------------- /aws-ssm-documents/commands/GH-CloneRepository/GH-CloneRepository.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | schemaVersion: "2.2" 3 | description: | 4 | # GH-CloneRepository 5 | 6 | ## What does this document do? 7 | 8 | Clone Github repository. If the repository is already cloned, repository will be pulled instead 9 | 10 | ## Input Parameters 11 | 12 | - **RepositoryUrl**: (Required) HTTP URL of the repository. Including OAuth token in the URL. E.g. https://username:personal-access-token@github.com/username/repo.git 13 | - **Branch**: Branch that will be checked out. 
def _secret_replacements(team: str, datasource_key: str, datasource_secret: dict) -> list:
    """Build ``(placeholder, value)`` pairs for one datasource's secret."""
    pairs = []
    # Longest keys first: otherwise the placeholder for ``user`` would also
    # match the leading part of the placeholder for ``username``.
    for key in sorted(datasource_secret, key=len, reverse=True):
        value = datasource_secret[key]
        if not isinstance(value, str):
            # str.replace() only accepts strings; render numbers/booleans as JSON.
            value = json.dumps(value)
        pairs.append((f'@{team}/{datasource_key}:{key}', value))
    return pairs


def _substitute(text: str, replacements: list) -> str:
    """Replace every placeholder occurring in *text* with its secret value."""
    for placeholder, value in replacements:
        text = text.replace(placeholder, value)
    return text


def _resolve_placeholders(node, replacements: list):
    """Return a copy of parsed YAML *node* with placeholders replaced in all strings."""
    if isinstance(node, str):
        return _substitute(node, replacements)
    if isinstance(node, dict):
        return {
            _resolve_placeholders(key, replacements): _resolve_placeholders(value, replacements)
            for key, value in node.items()
        }
    if isinstance(node, list):
        return [_resolve_placeholders(item, replacements) for item in node]
    return node


def generate_datasource(team_dir: str, provision_dir: str, secret: dict) -> None:
    """Write ``<provision_dir>/<team>.yaml`` from the datasource files of one team.

    Every regular file directly inside *team_dir* is read as a YAML datasource
    definition. Placeholders of the form ``@<team>/<file stem>:<key>`` are
    replaced with ``secret[<file stem>][<key>]``; placeholders without a
    matching secret are left untouched.

    Args:
        team_dir: Directory holding one team's datasource files; its last path
            component is used as the team name.
        provision_dir: Directory the generated ``<team>.yaml`` is written to.
        secret: Mapping of datasource key (file name without its last suffix)
            to a mapping of secret key to secret value.

    Raises:
        yaml.YAMLError: If a datasource file is not valid YAML even after
            placeholder substitution.
    """
    # normpath() drops a trailing separator so "datasources/team-1/" still
    # yields "team-1" instead of an empty team name.
    team = os.path.basename(os.path.normpath(team_dir))

    # os.scandir() order is arbitrary; sort so the generated file is stable.
    datasource_files = sorted(f.path for f in os.scandir(team_dir) if f.is_file())

    datasources = []
    for file in datasource_files:
        datasource_key = Path(file).stem
        datasource_secret = secret.get(datasource_key) or {}
        if not isinstance(datasource_secret, dict):
            datasource_secret = {}
        replacements = _secret_replacements(team, datasource_key, datasource_secret)

        with open(file, encoding='utf-8') as stream:
            raw_data = stream.read()
        if not raw_data:
            continue

        try:
            # Parse first, substitute afterwards: a secret containing quotes,
            # '#', ': ' or newlines can then no longer break or alter the YAML.
            document = _resolve_placeholders(yaml.safe_load(raw_data), replacements)
        except yaml.YAMLError:
            # Files that only become valid YAML once the placeholder is gone
            # (e.g. an unquoted scalar starting with '@') keep the previous
            # substitute-then-parse behaviour.
            document = yaml.safe_load(_substitute(raw_data, replacements))

        # A datasource entry has to be a mapping; comment-only or otherwise
        # non-mapping files would put an invalid entry into the output.
        if not isinstance(document, dict):
            continue
        datasources.append(document)

    dest_file = os.path.join(provision_dir, f'{team}.yaml')
    with open(dest_file, 'w', encoding='utf-8') as f:
        yaml.dump({'apiVersion': 1, 'datasources': datasources}, f)


if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.exit(f'usage: {sys.argv[0]} <datasource_dir> <secret_json> <provision_dir>')

    # ./datasources
    datasource_dir = sys.argv[1]
    # secret data as json string
    secret = json.loads(sys.argv[2])
    # /var/lib/grafana/provisioning/datasources
    gf_provision_dir = sys.argv[3]

    team_dirs = sorted(d.path for d in os.scandir(datasource_dir) if d.is_dir())
    for team_path in team_dirs:
        team_name = os.path.basename(os.path.normpath(team_path))
        team_secret = secret.get(team_name) or {}
        generate_datasource(team_dir=team_path, provision_dir=gf_provision_dir, secret=team_secret)
def merge_dicts(*from_dicts: Dict, no_copy: bool = False) -> Dict:
    """Deep-merge any number of dicts without using recursion.

    Dicts are merged left to right. When both sides hold a dict under the same
    key the two are merged key by key; in every other case the value from the
    later dict replaces the earlier one.

    Args:
        *from_dicts: Dicts to merge, in order of increasing precedence.
        no_copy: When False (default) the first dict and every merged-in value
            are deep-copied, so no input is modified. When True everything is
            merged into the first dict in place and values are shared by
            reference, which is only appropriate when the inputs are discarded
            right after the merge.

    Returns:
        The merged dict: a fresh dict by default, the first argument itself
        when ``no_copy`` is True, or an empty dict when called with no dicts.
    """
    if not from_dicts:
        return {}

    if no_copy:
        def xerox(x): return x
    else:
        xerox = deepcopy

    result = xerox(from_dicts[0])

    for layer in from_dicts[1:]:
        # Work list of (target, source) dict pairs still to be merged; walked
        # by index because it grows while nested collisions are discovered.
        pending = [(result, layer)]
        position = 0
        while position < len(pending):
            target, source = pending[position]
            position += 1
            for key, value in source.items():
                existing = target.get(key)
                if isinstance(existing, dict) and isinstance(value, dict):
                    # Key collision and both sides are dicts: merge them later.
                    pending.append((existing, value))
                else:
                    target[key] = xerox(value)

    return result
"cloudformation:UpdateStack", 33 | "cloudformation:DeleteStack" 34 | ], 35 | "Resource": [ 36 | "*" 37 | ] 38 | }, 39 | { 40 | "Effect": "Allow", 41 | "Action": [ 42 | "ssm:*" 43 | ], 44 | "Resource": [ 45 | "*" 46 | ] 47 | }, 48 | { 49 | "Effect": "Allow", 50 | "Action": [ 51 | "sns:Publish" 52 | ], 53 | "Resource": [ 54 | "arn:aws:sns:*:*:Automation*" 55 | ] 56 | }, 57 | { 58 | "Effect": "Allow", 59 | "Action": [ 60 | "kms:ListKeys", 61 | "kms:ListAliases", 62 | "kms:DescribeKey" 63 | ], 64 | "Resource": [ 65 | "arn:aws:kms:*::alias/aws/secretsmanager" 66 | ] 67 | }, 68 | { 69 | "Effect": "Allow", 70 | "Action": [ 71 | "secretsmanager:GetSecretValue" 72 | ], 73 | "Resource": [ 74 | "arn:aws:secretsmanager:*::secret:/grafana/datasource/*" 75 | ] 76 | }, 77 | { 78 | "Effect": "Allow", 79 | "Action": [ 80 | "secretsmanager:ListSecrets" 81 | ], 82 | "Resource": "*" 83 | }, 84 | { 85 | "Effect": "Allow", 86 | "Action": "iam:PassRole", 87 | "Resource": "arn:aws:iam:::role/" 88 | } 89 | ] 90 | } 91 | -------------------------------------------------------------------------------- /aws-ssm-documents/runbooks/GF-GenerateDataSources/GF-UpdateDataSources.yaml: -------------------------------------------------------------------------------- 1 | schemaVersion: '0.3' 2 | description: |- 3 | # GF-UpdateDataSources 4 | 5 | ## What does this runbook do? 6 | 7 | Generate and update Grafana data sources configuration file 8 | 9 | ## Input Parameters 10 | 11 | - **AssumeRole**: (Required) IAM Role that runbook will assume to perform AWS call on your behalf 12 | - **InstanceIds**: (Required) List of EC2 instances that runbook will be applied 13 | - **RepositoryUrl**: (Required) HTTP URL of the repository. Including OAuth token in the URL. E.g. https://oauth-key@github.com/username/repo.git 14 | - **Env**: (Required) Which environment that runbook will be applied to. E.g. 
def script_handler(events, context):
    """Collect all secrets tagged for one environment and secret type.

    Args:
        events: Input payload; must contain ``secretType`` and ``env``.
        context: Execution context supplied by the runtime (unused).

    Returns:
        ``{'secret': <json string>}`` where the JSON string is the merged
        ``{team: {datasource_name: secret}}`` mapping of every matching secret.
    """
    secret_type = events['secretType']
    env = events['env']
    wanted_tags = {'env': env, 'secret-type': secret_type}

    # ListSecrets returns results page by page; a single call silently drops
    # everything after the first page, so walk all pages.
    pages = sm.get_paginator('list_secrets').paginate(Filters=[
        {'Key': 'tag-key', 'Values': ['env']},
        {'Key': 'tag-value', 'Values': [env]},
        {'Key': 'tag-key', 'Values': ['secret-type']},
        {'Key': 'tag-value', 'Values': [secret_type]},
    ])

    secret_data = {}
    for page in pages:
        for secret in page['SecretList']:
            # The filters above are prefix matches and do not pair a tag key
            # with its value, so confirm the exact tags before fetching.
            tags = {tag.get('Key'): tag.get('Value') for tag in secret.get('Tags', [])}
            if any(tags.get(key) != value for key, value in wanted_tags.items()):
                continue
            merge_dicts(secret_data, get_secret(secret['Name']), no_copy=True)

    return {'secret': json.dumps(secret_data)}
def merge_dicts(*from_dicts: Dict, no_copy: bool = False) -> Dict:
    """Deep-merge any number of dicts without using recursion.

    Dicts are merged left to right. When both sides hold a dict under the same
    key the two are merged key by key; in every other case the value from the
    later dict replaces the earlier one.

    Args:
        *from_dicts: Dicts to merge, in order of increasing precedence.
        no_copy: When False (default) the first dict and every merged-in value
            are deep-copied, so no input is modified. When True everything is
            merged into the first dict in place and values are shared by
            reference, which is only appropriate when the inputs are discarded
            right after the merge.

    Returns:
        The merged dict: a fresh dict by default, the first argument itself
        when ``no_copy`` is True, or an empty dict when called with no dicts.
    """
    if not from_dicts:
        return {}

    if no_copy:
        def xerox(x): return x
    else:
        xerox = deepcopy

    result = xerox(from_dicts[0])

    for layer in from_dicts[1:]:
        # Work list of (target, source) dict pairs still to be merged; walked
        # by index because it grows while nested collisions are discovered.
        pending = [(result, layer)]
        position = 0
        while position < len(pending):
            target, source = pending[position]
            position += 1
            for key, value in source.items():
                existing = target.get(key)
                if isinstance(existing, dict) and isinstance(value, dict):
                    # Key collision and both sides are dicts: merge them later.
                    pending.append((existing, value))
                else:
                    target[key] = xerox(value)

    return result