├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── cmd
└── main.go
├── deploy-adot
├── README.md
├── adotTaskDefinition.json.template
├── cleanup-cloudmap.sh
├── cleanup-ecs.sh
├── cleanup-iam.sh
├── cleanup-ssm.sh
├── cloudmap.sh
├── ecs-cluster.yaml
├── env.sh
├── iam.sh
├── otel-collector-config-cloudwatch.yaml.template
├── otel-collector-config.yaml.template
├── otel-config.sh
├── services.sh
├── task-definitions.sh
└── webappTaskDefinition.json.template
├── deploy-prometheus
├── README.md
├── amp.sh
├── cleanup-cloudmap.sh
├── cleanup-ecs.sh
├── cleanup-iam.sh
├── cloudmap.sh
├── ecs-cluster.yaml
├── env.sh
├── iam.sh
├── nodeExporterTaskDefinition.json.template
├── prometheus.yaml.template
├── prometheusTaskDefinition.json.template
├── services.sh
├── task-definitions.sh
└── webappTaskDefinition.json.template
├── go.mod
├── go.sum
├── images
├── Deployment-Architecture-ADOT.png
└── Deployment-Architecture-Prometheus.png
└── pkg
└── aws
├── cloudmap.go
├── session.go
└── ssm.go
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # --- Build stage: compile the config-reloader binary from vendored modules ---
2 | FROM golang:1.15 AS builder
3 | WORKDIR /src
4 | COPY go.* /src/
5 | RUN go env -w GOPROXY=direct
6 | RUN go mod download
7 | RUN go mod vendor
8 | COPY . .
9 | ARG TARGETOS
10 | ARG TARGETARCH
11 | # Static cross-compiled build (CGO disabled, netgo tag) for the requested platform.
12 | RUN CGO_ENABLED=0 GO111MODULE=on GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -a -mod=vendor -tags=netgo -o config-reloader cmd/main.go
13 |
14 |
15 | # --- Final stage: minimal runtime image carrying only the binary ---
16 | FROM alpine:latest AS final
17 | WORKDIR /home/prometheus-for-ecs
18 | COPY --from=builder /src/config-reloader .
19 | ENV GO111MODULE=on
20 | ENTRYPOINT ["./config-reloader"]
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Prometheus metrics collection from Amazon ECS
2 |
3 | This Git repository contains software artifacts related to two different approaches for collecting Prometheus metrics from applications deployed to an Amazon ECS cluster. Both approaches employ the same custom service discovery mechanism that leverages the integration between Amazon ECS and AWS Cloud Map to dynamically discover scraping targets for Prometheus. The Golang code in the repository pertains to that of a sidecar container which implements the custom service discovery mechanism.
4 |
5 | The two sub-directories, namely [deploy-prometheus](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-prometheus) and [deploy-adot](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-adot) contain the artifacts required to deploy the respective solutions to an Amazon ECS cluster.
6 |
7 | - The first approach employs a single instance of [Prometheus server deployed to an ECS cluster](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-prometheus/README.md).
8 |
9 | - The second approach employs a single instance of [AWS Distro for OpenTelemetry Collector deployed to an ECS cluster](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-adot/README.md). Prometheus metrics are collected by the Prometheus Receiver that runs in the ADOT Collector pipeline.
10 |
11 |
12 | ## Security
13 |
14 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
15 |
16 | ## License
17 |
18 | This library is licensed under the MIT-0 License. See the LICENSE file.
19 |
20 |
--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "io/ioutil"
7 | "log"
8 | "net/http"
9 | "os"
10 | "strconv"
11 | "strings"
12 | "time"
13 |
14 | "github.com/aws-samples/prometheus-for-ecs/pkg/aws"
15 | )
16 |
17 | const (
18 | TARGETS = "/prometheus-targets"
19 | PORT = 9001
20 | )
21 |
22 | var present bool
23 | var configFileDir, configReloadFrequency string
24 | var prometheusConfigFilePath, scrapeConfigFilePath string
25 |
26 | func main() { // Entry point: selects the service discovery mode from the environment and dispatches.
27 | aws.InitializeAWSSession() // Set up the shared AWS session used by the aws package helpers.
28 | sdMode, present := os.LookupEnv("SERVICE_DISCOVERY_MODE") // Expected values: FILE_BASED or HTTP_BASED.
29 | if !present {
30 | sdMode = "FILE_BASED" // Default mode when the variable is unset.
31 | }
32 | if sdMode == "FILE_BASED" {
33 | fileBasedSD()
34 | } else if sdMode == "HTTP_BASED" {
35 | httpBasedSD()
36 | } else {
37 | log.Printf("Invalid service discovery mode %s", sdMode)
38 | }
39 | }
40 |
41 | // httpBasedSD runs the service discovery application as an HTTP server that
42 | // serves the Prometheus scrape configuration (HTTP SD) at the TARGETS path.
43 | // It blocks until the server goroutine reports termination.
44 | func httpBasedSD() {
45 | log.Println("Service discovery application started in HTTP-based mode")
46 | serveMux := http.NewServeMux()
47 | serveMux.HandleFunc(TARGETS, getScrapeConfig)
48 |
49 | stopChannel := make(chan string)
50 | defer close(stopChannel)
51 |
52 | go func(doneChannel chan string) {
53 | addr := fmt.Sprintf(":%d", PORT)
54 | fmt.Printf("Started HTTP server at %s\n", addr)
55 |
56 | server := &http.Server{
57 | Addr: addr,
58 | Handler: serveMux,
59 | ReadTimeout: 10 * time.Second,
60 | WriteTimeout: 10 * time.Second,
61 | MaxHeaderBytes: 1 << 20,
62 | }
63 | server.SetKeepAlivesEnabled(true)
64 | // BUG FIX: the original wrapped ListenAndServe in log.Fatal, which
65 | // terminates the process immediately and made the channel send below
66 | // unreachable. Log the error and notify the main goroutine instead.
67 | err := server.ListenAndServe()
68 | log.Println(err)
69 | doneChannel <- "HTTP server terminated abnormally"
70 | }(stopChannel)
71 |
72 | // Block until the server goroutine signals termination. The original
73 | // for/select with a bare break never actually exited the loop.
74 | status := <-stopChannel
75 | fmt.Println(status)
76 | }
76 |
77 | // fileBasedSD runs the service discovery application in file-based mode: it
78 | // writes the Prometheus configuration and an initial scrape target file, then
79 | // rewrites the scrape target file on a periodic ticker.
80 | func fileBasedSD() {
81 | log.Println("Service discovery application started in file-based mode")
82 | // NOTE: the AWS session is already initialized in main(); the original
83 | // second call to aws.InitializeAWSSession here was redundant and removed.
84 |
85 | configFileDir, present = os.LookupEnv("CONFIG_FILE_DIR")
86 | if !present {
87 | configFileDir = "/etc/config/"
88 | }
89 | configReloadFrequency, present = os.LookupEnv("CONFIG_RELOAD_FREQUENCY")
90 | if !present {
91 | configReloadFrequency = "30"
92 | }
93 |
94 | loadPrometheusConfig()
95 | initScrapeTargetConfig()
96 |
97 | go func() {
98 | // BUG FIX: the original ignored the Atoi error, so a malformed
99 | // CONFIG_RELOAD_FREQUENCY produced 0 and time.NewTicker panics on a
100 | // non-positive duration. Fall back to the 30s default instead.
101 | reloadFrequency, err := strconv.Atoi(configReloadFrequency)
102 | if err != nil || reloadFrequency <= 0 {
103 | log.Printf("Invalid reload frequency %q; defaulting to 30 seconds", configReloadFrequency)
104 | reloadFrequency = 30
105 | }
106 | ticker := time.NewTicker(time.Duration(reloadFrequency) * time.Second)
107 | for range ticker.C {
108 | // The ticker delivers one tick per reload interval.
109 | reloadScrapeConfig()
110 | }
111 | }()
112 | log.Println("Periodic reloads under progress...")
113 |
114 | // Block the main goroutine indefinitely; the reload goroutine does the work.
115 | select {}
116 | }
122 |
123 | func loadPrometheusConfig() {
124 | // When deployed with OTel Collector, Prometheus runs as a Receiver in the OTel Pipeline
125 | // Hence, Prometheus configuration is part of the OTel Pipeline configuration which is loaded from SSM
126 | deploymentModeParameter, present := os.LookupEnv("DEPLOYMENT_MODE")
127 | if present && (deploymentModeParameter == "OTEL" || deploymentModeParameter == "otel") {
128 | log.Println("Running in OTel mode")
129 | return
130 | }
131 |
132 | prometheusConfigParameter, present := os.LookupEnv("PROMETHEUS_CONFIG_PARAMETER_NAME") // Name of the SSM parameter holding the Prometheus config.
133 | if !present {
134 | prometheusConfigParameter = "ECS-Prometheus-Configuration"
135 | }
136 | prometheusConfig := aws.GetParameter(prometheusConfigParameter) // Fetch config text from SSM Parameter Store.
137 |
138 | prometheusConfigFilePath = strings.Join([]string{configFileDir, "prometheus.yaml"}, "/") // NOTE(review): default configFileDir ends in "/" so this yields "//" in the path — harmless but worth confirming.
139 | err := ioutil.WriteFile(prometheusConfigFilePath, []byte(*prometheusConfig), 0644) // World-readable config file.
140 | if err != nil {
141 | log.Println(err)
142 | }
143 | log.Println("Loaded Prometheus configuration file")
144 |
145 | }
146 |
147 | func initScrapeTargetConfig() { // Seed the scrape target file so Prometheus can start before discovery runs.
148 | scrapeConfigFilePath = strings.Join([]string{configFileDir, "ecs-services.json"}, "/")
149 | err := ioutil.WriteFile(scrapeConfigFilePath, []byte("[]"), 0644) // "[]" = empty JSON target list.
150 | if err != nil {
151 | log.Println(err)
152 | }
153 | log.Println("Created initial scrape target configuration file")
154 | }
155 |
156 | func reloadScrapeConfig() { // Rebuild the scrape config from Cloud Map and overwrite the target file.
157 | scrapConfig := buildSrapeConfig()
158 | err := ioutil.WriteFile(scrapeConfigFilePath, []byte(*scrapConfig), 0644) // Prometheus file_sd re-reads this file on its own schedule.
159 | if err != nil {
160 | log.Println(err)
161 | }
162 | }
163 |
164 | func getScrapeConfig(w http.ResponseWriter, r *http.Request) { // HTTP SD handler: serves the current scrape config as JSON; the request itself is unused.
165 | scrapConfig := buildSrapeConfig()
166 | w.Header().Set("Content-Type", "application/json")
167 | io.WriteString(w, *scrapConfig)
168 | }
169 |
170 | func buildSrapeConfig() *string { // Builds the Prometheus scrape config from Cloud Map namespaces. (sic: "Srape" — name kept for compatibility with existing callers.)
171 | discoveryNamespacesParameter, present := os.LookupEnv("DISCOVERY_NAMESPACES_PARAMETER_NAME")
172 | if !present {
173 | discoveryNamespacesParameter = "ECS-ServiceDiscovery-Namespaces"
174 | }
175 | namespaceList := aws.GetParameter(discoveryNamespacesParameter) // SSM parameter value: comma-separated namespace names.
176 | namespaces := strings.Split(*namespaceList, ",") // NOTE(review): no whitespace trimming — values with spaces after commas would not match; confirm parameter format.
177 | scrapConfig := aws.GetPrometheusScrapeConfig(namespaces)
178 | return scrapConfig
179 | }
180 |
--------------------------------------------------------------------------------
/deploy-adot/README.md:
--------------------------------------------------------------------------------
1 | ## Metrics and Traces Collection from Amazon ECS using AWS Distro for OpenTelemetry
2 |
3 | This directory contains software artifacts to deploy [ADOT](https://aws-otel.github.io/docs/introductions) Collector to an Amazon ECS cluster and collect Prometheus metrics and X-Ray traces from applications, using AWS Cloud Map for dynamic service discovery. Please refer to this [blog](https://aws.amazon.com/blogs/containers/metrics-and-traces-collection-from-amazon-ecs-using-aws-distro-for-opentelemetry-with-dynamic-service-discovery/) for implementations details about this solution architecture.
4 |
5 |
6 |
7 | ### Solution architecture overview
8 |
9 | At a high level, we will be following the steps outlined below for this solution:
10 |
11 |
12 | -
13 | Setup AWS Cloud Map for service discovery
14 |
15 | -
16 | Deploy application services to an Amazon ECS and register them with AWS Cloud Map
17 |
18 | -
19 | Deploy ADOT Collector to Amazon ECS, configure HTTP-based service discovery
20 |
21 | -
22 | Setup a metrics pipeline in the collector to scrape Prometheus metrics from workloads and send them to a workspace in Amazon Managed Service for Prometheus
23 |
24 | -
25 | Visualize metrics data using Amazon Managed Grafana
26 |
27 | -
28 | Setup a traces pipeline in the collector to collect X-Ray trace segments from workloads and send them to AWS X-Ray
29 |
30 | -
31 | Deploy application services instrumented with X-Ray SDK, send trace data to the ADOT Collector and visualize them in AWS X-Ray Service Map.
32 |
33 |
34 |
35 | ### Deploy
36 |
37 | Make sure you have the latest version of AWS CLI that provides support for Amazon Managed Prometheus. The deployment requires an ECS cluster. All deployment artifacts are under the [deploy-adot](https://github.com/aws-samples/prometheus-for-ecs/tree/main/deploy-adot) directory.
38 |
39 | The deployment comprises the following components:
40 | - An ECS task comprising the ADOT Collector and the service discovery application containers. The collector has a *metrics* pipeline configured with a [Prometheus Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver), and an [AWS Prometheus Remote Write Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/awsprometheusremotewriteexporter) as shown in the figure above. This enables it to collect Prometheus metrics from workloads and send them to an Amazon Managed Prometheus workspace. It also has a *traces* pipeline which comprises an instance of [AWS X-Ray Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/awsxrayreceiver) and [AWS X-Ray Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/awsxrayexporter) which enables it to collect the trace segments and send them to AWS X-Ray. The [service discovery application](https://github.com/aws-samples/prometheus-for-ecs/tree/main/cmd) helps discover the services registered in AWS Cloud Map and dynamically updates the scrape configurations used by the Prometheus Receiver.
41 |
42 | - An ECS task comprising a stateless web application and the [ECS Exporter](https://github.com/prometheus-community/ecs_exporter) containers. The web application is instrumented with [Prometheus Go client library](https://github.com/prometheus/client_golang) and exposes an HTTP endpoint */work*. The application has an internal load generator that sends client requests to the HTTP endpoint. The application exposes two custom metrics, namely, a [Counter](https://prometheus.io/docs/concepts/metric_types/#counter) named *http_requests_total* and a [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) named *request_duration_milliseconds*. The ECS container agent injects an environment variable named ECS_CONTAINER_METADATA_URI_V4 into each container, referred to as the *task metadata endpoint* which provides various task metadata and [Docker stats](https://docs.docker.com/engine/api/v1.30/#operation/ContainerStats) to the container. The ECS Exporter container reads this data and exports them as Prometheus metrics on port 9779.
43 |
44 | The deployment scripts assume that the underlying ECS cluster was created using the [ecs-cluster.yaml](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-adot/ecs-cluster.yaml) CloudFormation template.
45 | Create the cluster with the following command:
46 | ```
47 | VPC_STACK_NAME=ecs-stack
48 | VPC_TEMPLATE=ecs-cluster.yaml
49 | aws cloudformation deploy --stack-name $VPC_STACK_NAME --template-file $VPC_TEMPLATE --capabilities CAPABILITY_IAM
50 | ```
51 |
52 | Before proceeding further, export a set of environment variables that are required by scripts used in subsequent steps. Modify the **ACCOUNT_ID** and **AWS_REGION** variables in the *env.sh* script before running the command below.
53 | ```
54 | source env.sh
55 | ```
56 |
57 | Create the ECS task role, task execution roles and the relevant IAM policies.
58 | ```
59 | source iam.sh
60 | ```
61 |
62 | Create a service discovery namespace and service registries under AWS Cloud Map. The ECS tasks that you will deploy will register themselves in these service registries upon launch.
63 | ```
64 | source cloudmap.sh
65 | ```
66 |
67 | Setup ADOT Collector pipeline configurations and an Amazon Managed Prometheus workspace for ingesting Prometheus metrics scraped from ECS services.
68 | ```
69 | source otel-config.sh
70 | ```
71 | The above command creates an Amazon Managed Prometheus workspace named **adot-prometheus-for-ecs** and the ADOT Collector pipeline configuration, with that workspace set as the destination for AWS Remote Write Exporter in the pipeline. It also creates two parameters in the AWS SSM Parameter Store as follows:
72 | - parameter named **otel-collector-config** and of type *String* which stores the pipeline configuration. The ADOT Collector task will read its pipeline configuration from this parameter.
73 | - parameter named **ECS-Namespaces** and of type *String* with its value set to **ecs-services**. This is the AWS Cloud Map namespace which will be used by the service discovery sidecar in the ADOT Collector task to discover scraping targets.
74 |
75 | Next, register task definitions with ECS
76 | ```
77 | source task-definitions.sh
78 | ```
79 |
80 | Launch the ECS services using the task definitions created above.
81 | ```
82 | source services.sh
83 | ```
84 |
85 | ### Metrics
86 |
87 | Once the services are all up and running, the workspace in Amazon Managed Service for Prometheus will start ingesting metrics collected by the ADOT Collector from the web application. Use Amazon Managed Grafana to query and visualize the metrics. You may use the following PromQL queries to visualize the metrics collected from the web application and Prometheus Node Exporter
88 | - HTTP request rate: *sum(rate(http_requests_total[5m]))*
89 | - Average response latency: *(sum(rate(request_duration_milliseconds_sum[5m])) by (path,taskid))/(sum(rate(request_duration_milliseconds_count[5m])) by (path,taskid))*
90 | - Response latency: *sum(rate(request_duration_milliseconds_bucket{le="THRESHOLD"}[5m])) / sum(rate(request_duration_milliseconds_count[5m])) * 100*. The following are the thresholds captured: 500, 1000, 2500, 5000
91 | - Average CPU usage: *ecs_cpu_percent{container="webapp"}*
92 |
93 | ### Traces
94 |
95 | The AWS X-Ray receiver in the collector pipeline listens for traffic on UDP port 2000. Upon launch, the ADOT service is registered in the Cloud Map service registry identified by the private DNS name *adot-collector-svc.ecs-service*. With this setup, application services in the cluster that are instrumented with X-Ray SDK can now be configured with the environment variable **AWS_XRAY_DAEMON_ADDRESS** set to *adot-collector-svc.ecs-service:2000* and send traces data to the collector which are subsequently sent to AWS X-Ray by the exporter in the collector pipeline.
96 |
97 | ### Cleanup
98 |
99 | When you are done, cleanup the resources you created above with the following set of commands.
100 | ```
101 | source cleanup-ecs.sh
102 | source cleanup-cloudmap.sh
103 | source cleanup-iam.sh
104 | source cleanup-ssm.sh
105 | aws cloudformation delete-stack --stack-name $VPC_STACK_NAME
106 | ```
107 |
108 | ## Security
109 |
110 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
111 |
112 | ## License
113 |
114 | This library is licensed under the MIT-0 License. See the LICENSE file.
115 |
116 |
--------------------------------------------------------------------------------
/deploy-adot/adotTaskDefinition.json.template:
--------------------------------------------------------------------------------
1 | {
2 | "family":"AdotTask",
3 | "taskRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-ADOT-Task-Role",
4 | "executionRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Task-Execution-Role",
5 | "networkMode":"awsvpc",
6 | "containerDefinitions":[
7 | {
8 | "name":"config-reloader",
9 | "image":"public.ecr.aws/awsvijisarathy/adot-http-sdconfig:2.0",
10 | "cpu":128,
11 | "memory":128,
12 | "environment":[
13 | {
14 | "name":"SERVICE_DISCOVERY_MODE",
15 | "value":"HTTP_BASED"
16 | },
17 | {
18 | "name":"DISCOVERY_NAMESPACES_PARAMETER_NAME",
19 | "value":"ECS-Namespaces"
20 | }
21 | ],
22 | "logConfiguration":{
23 | "logDriver":"awslogs",
24 | "options":{
25 | "awslogs-group":"/ecs/ADOT",
26 | "awslogs-create-group":"true",
27 | "awslogs-region":"REGION",
28 | "awslogs-stream-prefix":"config-reloader"
29 | }
30 | },
31 | "essential":true
32 | },
33 | {
34 | "name":"aws-otel-collector",
35 | "image":"amazon/aws-otel-collector:v0.15.1",
36 | "cpu":256,
37 | "memory":512,
38 | "secrets":[
39 | {
40 | "name":"AOT_CONFIG_CONTENT",
41 | "valueFrom":"arn:aws:ssm:REGION:ACCOUNT:parameter/otel-collector-config"
42 | }
43 | ],
44 | "logConfiguration":{
45 | "logDriver":"awslogs",
46 | "options":{
47 | "awslogs-group":"/ecs/ADOT",
48 | "awslogs-create-group":"true",
49 | "awslogs-region":"REGION",
50 | "awslogs-stream-prefix":"collector"
51 | }
52 | },
53 | "portMappings":[
54 | {
55 | "containerPort":2000,
56 | "protocol":"udp"
57 | }
58 | ],
59 | "dependsOn":[
60 | {
61 | "containerName":"config-reloader",
62 | "condition":"START"
63 | }
64 | ],
65 | "essential":true
66 | }
67 | ],
68 | "requiresCompatibilities":[
69 | "EC2"
70 | ],
71 | "cpu":"1000",
72 | "memory":"1024"
73 | }
--------------------------------------------------------------------------------
/deploy-adot/cleanup-cloudmap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # Delete CloudMap service registries and namespace
5 | # (BUG FIX: shebang was "##!/bin/bash", a plain comment, not a shebang.)
6 | #
7 | aws servicediscovery delete-service --id $CLOUDMAP_WEBAPP_SERVICE_ID
8 | aws servicediscovery delete-service --id $CLOUDMAP_ADOT_COLLECTOR_SERVICE_ID
9 | aws servicediscovery delete-namespace --id $CLOUDMAP_NAMESPACE_ID
10 |
--------------------------------------------------------------------------------
/deploy-adot/cleanup-ecs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # Delete the ECS services
5 | # (BUG FIX: shebang was "##!/bin/bash", a plain comment, not a shebang.)
6 | #
7 | SERVICE_NAME=WebAppService
8 | aws ecs update-service --cluster $CLUSTER_NAME --service $SERVICE_NAME --desired-count 0
9 | aws ecs delete-service --cluster $CLUSTER_NAME --service $SERVICE_NAME
10 |
11 | SERVICE_NAME=ADOTService
12 | aws ecs update-service --cluster $CLUSTER_NAME --service $SERVICE_NAME --desired-count 0
13 | aws ecs delete-service --cluster $CLUSTER_NAME --service $SERVICE_NAME
14 |
15 | #
16 | # Deregister task definitions
17 | #
18 | aws ecs deregister-task-definition --task-definition $WEBAPP_TASK_DEFINITION
19 | aws ecs deregister-task-definition --task-definition $ADOT_TASK_DEFINITION
20 |
--------------------------------------------------------------------------------
/deploy-adot/cleanup-iam.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Delete IAM roles and policies
4 | # (BUG FIX: shebang was "##!/bin/bash", a plain comment, not a shebang.)
5 | #
6 | aws iam detach-role-policy --role-name $ECS_GENERIC_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
7 |
8 | aws iam detach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
9 | aws iam detach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_TASK_EXECUTION_POLICY_ARN
10 | aws iam detach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_SSM_TASK_EXECUTION_POLICY_ARN
11 |
12 | aws iam detach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $XRAY_DAEMON_POLICY_ARN
13 | aws iam detach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
14 | aws iam detach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $ECS_ADOT_TASK_POLICY_ARN
15 |
16 | # Policies must be detached from all roles before deletion.
17 | aws iam delete-policy --policy-arn $ECS_SSM_TASK_EXECUTION_POLICY_ARN
18 | aws iam delete-policy --policy-arn $ECS_ADOT_TASK_POLICY_ARN
19 |
20 | aws iam delete-role --role-name $ECS_GENERIC_TASK_ROLE
21 | aws iam delete-role --role-name $ECS_ADOT_TASK_ROLE
22 | aws iam delete-role --role-name $ECS_TASK_EXECUTION_ROLE
23 |
24 |
--------------------------------------------------------------------------------
/deploy-adot/cleanup-ssm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Delete AMP workspace
4 | # (BUG FIX: shebang was "##!/bin/bash", a plain comment, not a shebang.)
5 | #
6 | aws amp delete-workspace --workspace-id $WORKSPACE_ID
7 | #
8 | # Delete SSM parameters
9 | #
10 | aws ssm delete-parameter --name otel-collector-config
11 | aws ssm delete-parameter --name ECS-Namespaces
12 |
13 |
14 |
--------------------------------------------------------------------------------
/deploy-adot/cloudmap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # Create a Service Discovery namespace
5 | # (BUG FIX: shebang was "##!/bin/bash", a plain comment, not a shebang.)
6 | #
7 | SERVICE_DISCOVERY_NAMESPACE=ecs-services
8 | OPERATION_ID=$(aws servicediscovery create-private-dns-namespace \
9 | --vpc $VPC_ID \
10 | --name $SERVICE_DISCOVERY_NAMESPACE \
11 | --query "OperationId" --output text)
12 |
13 | # Poll helper: returns the current status of the async namespace operation.
14 | operationStatus() {
15 | aws servicediscovery get-operation --operation-id $OPERATION_ID --query "Operation.Status" --output text
16 | }
17 |
18 | until [ $(operationStatus) != "PENDING" ]; do
19 | echo "Namespace $SERVICE_DISCOVERY_NAMESPACE is creating ..."
20 | sleep 10s
21 | if [ $(operationStatus) = "SUCCESS" ]; then
22 | echo "Namespace $SERVICE_DISCOVERY_NAMESPACE created"
23 | break
24 | fi
25 | done
26 |
27 | CLOUDMAP_NAMESPACE_ID=$(aws servicediscovery get-operation \
28 | --operation-id $OPERATION_ID \
29 | --query "Operation.Targets.NAMESPACE" --output text)
30 |
31 | #
32 | # Create a Service Discovery service in the above namespace
33 | # When create a Service Discovery service with either private or public DNS, there are different options available for DNS record type.
34 | # When doing a DNS query on the service name:
35 | # 1. "A" records return a set of IP addresses that correspond to your tasks.
36 | # 2. "SRV" records return a set of IP addresses and ports per task.
37 | #
38 | METRICS_PATH=/metrics
39 | METRICS_PORT=3000
40 | ECS_METRICS_PATH=/metrics
41 | ECS_METRICS_PORT=9779
42 | SERVICE_REGISTRY_NAME="webapp-svc"
43 | SERVICE_REGISTRY_DESCRIPTION="Service registry for Webapp ECS service"
44 | CLOUDMAP_WEBAPP_SERVICE_ID=$(aws servicediscovery create-service \
45 | --name $SERVICE_REGISTRY_NAME \
46 | --description "$SERVICE_REGISTRY_DESCRIPTION" \
47 | --namespace-id $CLOUDMAP_NAMESPACE_ID \
48 | --dns-config "NamespaceId=$CLOUDMAP_NAMESPACE_ID,RoutingPolicy=WEIGHTED,DnsRecords=[{Type=A,TTL=10}]" \
49 | --region $AWS_REGION \
50 | --tags Key=METRICS_PATH,Value=$METRICS_PATH Key=METRICS_PORT,Value=$METRICS_PORT Key=ECS_METRICS_PATH,Value=$ECS_METRICS_PATH Key=ECS_METRICS_PORT,Value=$ECS_METRICS_PORT \
51 | --query "Service.Id" --output text)
52 | CLOUDMAP_WEBAPP_SERVICE_ARN=$(aws servicediscovery get-service \
53 | --id $CLOUDMAP_WEBAPP_SERVICE_ID \
54 | --query "Service.Arn" --output text)
55 | echo "Service registry $SERVICE_REGISTRY_NAME created"
56 |
57 | SERVICE_REGISTRY_NAME="adot-collector-svc"
58 | SERVICE_REGISTRY_DESCRIPTION="Service registry for ADOT Collector ECS service"
59 | CLOUDMAP_ADOT_COLLECTOR_SERVICE_ID=$(aws servicediscovery create-service \
60 | --name $SERVICE_REGISTRY_NAME \
61 | --description "$SERVICE_REGISTRY_DESCRIPTION" \
62 | --namespace-id $CLOUDMAP_NAMESPACE_ID \
63 | --dns-config "NamespaceId=$CLOUDMAP_NAMESPACE_ID,RoutingPolicy=WEIGHTED,DnsRecords=[{Type=A,TTL=10}]" \
64 | --region $AWS_REGION \
65 | --query "Service.Id" --output text)
66 | CLOUDMAP_ADOT_COLLECTOR_SERVICE_ARN=$(aws servicediscovery get-service \
67 | --id $CLOUDMAP_ADOT_COLLECTOR_SERVICE_ID \
68 | --query "Service.Arn" --output text)
69 | echo "Service registry $SERVICE_REGISTRY_NAME created"
70 |
71 | # Export IDs/ARNs for use by the task-definition and cleanup scripts.
72 | export CLOUDMAP_NAMESPACE_ID
73 | export CLOUDMAP_WEBAPP_SERVICE_ARN
74 | export CLOUDMAP_WEBAPP_SERVICE_ID
75 | export CLOUDMAP_ADOT_COLLECTOR_SERVICE_ARN
76 | export CLOUDMAP_ADOT_COLLECTOR_SERVICE_ID
/deploy-adot/ecs-cluster.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: '2010-09-09'
2 | Description: EC2 ECS cluster running containers in a private subnet. Supports
3 | public facing load balancers, private internal load balancers, and
4 | both internal and external service discovery namespaces.
5 | Parameters:
6 | VpcName:
7 | Type: String
8 | Default: ECS-VPC
9 | Description: Unique name for the VPC
10 | EnvironmentName:
11 | Type: String
12 | Default: ecs-prometheus-cluster
13 | Description: "A friendly environment name that will be used for namespacing all cluster resources. Example: staging, qa, or production"
  InstanceType:
    Type: String
    Default: c5.large
    Description: Class of EC2 instance used to host containers. Choose t2 for testing, m5 for general purpose, c5 for CPU intensive services, and r5 for memory intensive services
19 | AllowedValues: [ t2.micro, t2.small, t2.medium, t2.large, t2.xlarge, t2.2xlarge,
    m5.large, m5.xlarge, m5.2xlarge, m5.4xlarge, m5.12xlarge, m5.24xlarge,
21 | c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge, c5.18xlarge,
22 | r5.large, r5.xlarge, r5.2xlarge, r5.4xlarge, r5.12xlarge, r5.24xlarge ]
23 | ConstraintDescription: Please choose a valid instance type.
24 | DesiredCapacity:
25 | Type: Number
26 | Default: '2'
27 | Description: Number of EC2 instances to launch in your ECS cluster.
28 | MaxSize:
29 | Type: Number
30 | Default: '4'
31 | Description: Maximum number of EC2 instances that can be launched in your ECS cluster.
  ECSAMI:
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
    Default: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id
    Description: The Amazon Machine Image ID used for the cluster, leave it as the default value to get the latest AMI
37 | Subnet01Name:
38 | Type: String
39 | Default: "PUBLIC-WORKER-1"
40 | Description: Name for public subnet 01
41 | Subnet02Name:
42 | Type: String
43 | Default: "PUBLIC-WORKER-2"
44 | Description: Name for public subnet 02
45 | Subnet01PrivateName:
46 | Type: String
47 | Default: "PRIVATE-WORKER-1"
48 | Description: Name for private subnet 01
49 | Subnet02PrivateName:
50 | Type: String
51 | Default: "PRIVATE-WORKER-2"
52 | Description: Name for private subnet 02
53 | Mappings:
54 | SubnetConfig:
55 | VPC:
56 | CIDR: '10.10.0.0/16'
57 | PublicOne:
58 | CIDR: '10.10.0.0/24'
59 | PublicTwo:
60 | CIDR: '10.10.1.0/24'
61 | PrivateOne:
62 | CIDR: '10.10.100.0/24'
63 | PrivateTwo:
64 | CIDR: '10.10.101.0/24'
65 | Resources:
66 | VPC:
67 | Type: AWS::EC2::VPC
68 | Properties:
69 | EnableDnsSupport: true
70 | EnableDnsHostnames: true
71 | CidrBlock: !FindInMap ['SubnetConfig', 'VPC', 'CIDR']
72 | Tags:
73 | - Key: Name
74 | Value: !Ref VpcName
75 |
76 | #
77 | # Two public subnets, where containers can have public IP addresses
78 | #
79 | PublicSubnetOne:
80 | Type: AWS::EC2::Subnet
81 | Properties:
82 | AvailabilityZone: !Select
83 | - 0
84 | - Fn::GetAZs: !Ref 'AWS::Region'
85 | VpcId: !Ref 'VPC'
86 | Tags:
87 | - Key: Name
88 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet01Name] ]
89 | - Key: ecs.io/role/elb
90 | Value: 1
91 | CidrBlock: !FindInMap ['SubnetConfig', 'PublicOne', 'CIDR']
92 | MapPublicIpOnLaunch: true
93 | PublicSubnetTwo:
94 | Type: AWS::EC2::Subnet
95 | Properties:
96 | AvailabilityZone: !Select
97 | - 1
98 | - Fn::GetAZs: !Ref 'AWS::Region'
99 | VpcId: !Ref 'VPC'
100 | Tags:
101 | - Key: Name
102 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet02Name] ]
103 | - Key: ecs.io/role/elb
104 | Value: 1
105 | CidrBlock: !FindInMap ['SubnetConfig', 'PublicTwo', 'CIDR']
106 | MapPublicIpOnLaunch: true
107 |
108 | #
109 | # Two private subnets where containers will only have private IP addresses
110 | #
111 | PrivateSubnetOne:
112 | Type: AWS::EC2::Subnet
113 | Properties:
114 | AvailabilityZone: !Select
115 | - 0
116 | - Fn::GetAZs: !Ref 'AWS::Region'
117 | VpcId: !Ref 'VPC'
118 | Tags:
119 | - Key: Name
120 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet01PrivateName] ]
121 | - Key: ecs.io/role/internal-elb
122 | Value: 1
123 | CidrBlock: !FindInMap ['SubnetConfig', 'PrivateOne', 'CIDR']
124 | PrivateSubnetTwo:
125 | Type: AWS::EC2::Subnet
126 | Properties:
127 | AvailabilityZone: !Select
128 | - 1
129 | - Fn::GetAZs: !Ref 'AWS::Region'
130 | VpcId: !Ref 'VPC'
131 | Tags:
132 | - Key: Name
133 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet02PrivateName] ]
134 | - Key: ecs.io/role/internal-elb
135 | Value: 1
136 | CidrBlock: !FindInMap ['SubnetConfig', 'PrivateTwo', 'CIDR']
137 |
138 | #
139 | # Setup networking resources for the public subnets. Containers
140 | # in the public subnets have public IP addresses and the routing table
141 | # sends network traffic via the internet gateway.
142 | #
143 | InternetGateway:
144 | Type: AWS::EC2::InternetGateway
145 | GatewayAttachment:
146 | Type: AWS::EC2::VPCGatewayAttachment
147 | Properties:
148 | VpcId: !Ref 'VPC'
149 | InternetGatewayId: !Ref 'InternetGateway'
150 | PublicRouteTable:
151 | Type: AWS::EC2::RouteTable
152 | Properties:
153 | VpcId: !Ref 'VPC'
154 | Tags:
155 | - Key: Name
156 | Value: !Join [ '-', [ !Ref VpcName, 'PUBLIC-ROUTE-TABLE'] ]
157 | - Key: Network
158 | Value: Public
159 | PublicRoute:
160 | Type: AWS::EC2::Route
161 | DependsOn: GatewayAttachment
162 | Properties:
163 | RouteTableId: !Ref 'PublicRouteTable'
164 | DestinationCidrBlock: '0.0.0.0/0'
165 | GatewayId: !Ref 'InternetGateway'
166 | PublicSubnetOneRouteTableAssociation:
167 | Type: AWS::EC2::SubnetRouteTableAssociation
168 | Properties:
169 | SubnetId: !Ref PublicSubnetOne
170 | RouteTableId: !Ref PublicRouteTable
171 | PublicSubnetTwoRouteTableAssociation:
172 | Type: AWS::EC2::SubnetRouteTableAssociation
173 | Properties:
174 | SubnetId: !Ref PublicSubnetTwo
175 | RouteTableId: !Ref PublicRouteTable
176 |
177 | #
178 | # Setup networking resources for the private subnets. Containers
179 | # in these subnets have only private IP addresses, and must use a NAT
180 | # gateway to talk to the internet. We launch two NAT gateways, one for
181 | # each private subnet.
182 | #
183 | NatGatewayOneAttachment:
184 | Type: AWS::EC2::EIP
185 | DependsOn: GatewayAttachment
186 | Properties:
187 | Domain: vpc
188 | NatGatewayTwoAttachment:
189 | Type: AWS::EC2::EIP
190 | DependsOn: GatewayAttachment
191 | Properties:
192 | Domain: vpc
193 | NatGatewayOne:
194 | Type: AWS::EC2::NatGateway
195 | Properties:
196 | AllocationId: !GetAtt NatGatewayOneAttachment.AllocationId
197 | SubnetId: !Ref PublicSubnetOne
198 | NatGatewayTwo:
199 | Type: AWS::EC2::NatGateway
200 | Properties:
201 | AllocationId: !GetAtt NatGatewayTwoAttachment.AllocationId
202 | SubnetId: !Ref PublicSubnetTwo
203 | PrivateRouteTableOne:
204 | Type: AWS::EC2::RouteTable
205 | Properties:
206 | VpcId: !Ref 'VPC'
207 | Tags:
208 | - Key: Name
209 | Value: !Join [ '-', [ !Ref VpcName, 'PRIVATE-ROUTE-TABLE-01'] ]
210 | - Key: Network
211 | Value: Private
212 | PrivateRouteOne:
213 | Type: AWS::EC2::Route
214 | Properties:
215 | RouteTableId: !Ref PrivateRouteTableOne
216 | DestinationCidrBlock: 0.0.0.0/0
217 | NatGatewayId: !Ref NatGatewayOne
218 | PrivateRouteTableOneAssociation:
219 | Type: AWS::EC2::SubnetRouteTableAssociation
220 | Properties:
221 | RouteTableId: !Ref PrivateRouteTableOne
222 | SubnetId: !Ref PrivateSubnetOne
223 | PrivateRouteTableTwo:
224 | Type: AWS::EC2::RouteTable
225 | Properties:
226 | VpcId: !Ref 'VPC'
227 | Tags:
228 | - Key: Name
229 | Value: !Join [ '-', [ !Ref VpcName, 'PRIVATE-ROUTE-TABLE-02'] ]
230 | - Key: Network
231 | Value: Private
232 | PrivateRouteTwo:
233 | Type: AWS::EC2::Route
234 | Properties:
235 | RouteTableId: !Ref PrivateRouteTableTwo
236 | DestinationCidrBlock: 0.0.0.0/0
237 | NatGatewayId: !Ref NatGatewayTwo
238 | PrivateRouteTableTwoAssociation:
239 | Type: AWS::EC2::SubnetRouteTableAssociation
240 | Properties:
241 | RouteTableId: !Ref PrivateRouteTableTwo
242 | SubnetId: !Ref PrivateSubnetTwo
243 |
244 | #
245 | # ECS Cluster
246 | #
247 | ECSCluster:
248 | Type: AWS::ECS::Cluster
249 | Properties:
250 | ClusterName: !Ref EnvironmentName
251 |
252 | #
253 | # A security group for the containers we will run in ECS.
254 | # Rules are added to this security group based on what ingress you
255 | # add for the cluster.
256 | #
257 | ContainerSecurityGroup:
258 | Type: AWS::EC2::SecurityGroup
259 | Properties:
260 | GroupDescription: Access to the ECS hosts that run containers
261 | VpcId: !Ref 'VPC'
262 | Tags:
263 | - Key: Name
264 | Value: ECS-ContainerInstance-Security-Group
265 |
266 | ContainerSecurityGroupMemberIngress:
267 | Type: AWS::EC2::SecurityGroupIngress
268 | DependsOn: ContainerSecurityGroup
269 | Properties:
270 | Description: Allow nodes in this security group to communicate with each other
271 | GroupId: !Ref ContainerSecurityGroup
272 | SourceSecurityGroupId: !Ref ContainerSecurityGroup
273 | IpProtocol: '-1'
274 | FromPort: 0
275 | ToPort: 65535
276 |
277 | ContainerSecurityGroupHttpIngress:
278 | Type: AWS::EC2::SecurityGroupIngress
279 | DependsOn: ContainerSecurityGroup
280 | Properties:
      Description: Allow inbound HTTP traffic on port 80 from anywhere
282 | GroupId: !Ref ContainerSecurityGroup
283 | IpProtocol: tcp
284 | CidrIp: 0.0.0.0/0
285 | FromPort: 80
286 | ToPort: 80
287 |
288 | #
289 | # Autoscaling group. This launches the actual EC2 instances that will register
290 | # themselves as members of the cluster, and run the docker containers.
291 | #
292 | ECSAutoScalingGroup:
293 | Type: AWS::AutoScaling::AutoScalingGroup
294 | Properties:
295 | VPCZoneIdentifier:
296 | - !Ref PrivateSubnetOne
297 | - !Ref PrivateSubnetTwo
298 | LaunchConfigurationName: !Ref 'ContainerInstances'
299 | MinSize: '1'
300 | MaxSize: !Ref 'MaxSize'
301 | DesiredCapacity: !Ref 'DesiredCapacity'
302 | Tags:
303 | - Key: Name
304 | Value: !Sub "${EnvironmentName}-Container-Instance"
305 | PropagateAtLaunch: 'true'
306 | CreationPolicy:
307 | ResourceSignal:
308 | Timeout: PT5M
309 | UpdatePolicy:
310 | AutoScalingReplacingUpdate:
311 | WillReplace: 'true'
312 |
313 | ContainerInstances:
314 | Type: AWS::AutoScaling::LaunchConfiguration
315 | Properties:
316 | ImageId: !Ref 'ECSAMI'
317 | SecurityGroups: [!Ref 'ContainerSecurityGroup']
318 | InstanceType: !Ref 'InstanceType'
319 | IamInstanceProfile: !Ref 'EC2InstanceProfile'
320 | UserData:
321 | Fn::Base64: !Sub |
322 | #!/bin/bash -xe
323 | echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
324 | echo ECS_IMAGE_PULL_BEHAVIOR=always >> /etc/ecs/ecs.config
325 | echo ECS_ENABLE_CONTAINER_METADATA=true >> /etc/ecs/ecs.config
326 | echo ECS_ENABLE_SPOT_INSTANCE_DRAINING=true >> /etc/ecs/ecs.config
327 | yum install -y aws-cfn-bootstrap
328 | /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region}
329 |
330 | EC2InstanceProfile:
331 | Type: AWS::IAM::InstanceProfile
332 | Properties:
333 | Path: /
334 | Roles: [!Ref 'EC2InstanceRole']
335 |
336 | EC2InstanceRole:
337 | Type: AWS::IAM::Role
338 | Properties:
339 | AssumeRolePolicyDocument:
340 | Statement:
341 | - Effect: Allow
342 | Principal:
343 | Service: [ec2.amazonaws.com]
344 | Action: ['sts:AssumeRole']
345 | Path: /
346 | ManagedPolicyArns:
347 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role
348 |
349 | Outputs:
350 | ClusterName:
351 | Description: The name of the ECS cluster
352 | Value: !Ref 'ECSCluster'
353 | Export:
354 | Name: !Sub ${AWS::StackName}-ClusterName
355 | EC2InstanceRole:
356 | Description: The ARN of the EC2 Instance role
357 | Value: !GetAtt 'EC2InstanceRole.Arn'
358 | Export:
359 | Name: !Sub ${AWS::StackName}-ECSRole
360 | VpcId:
361 | Description: The ID of the VPC that this stack is deployed in
362 | Value: !Ref 'VPC'
363 | Export:
364 | Name: !Sub ${AWS::StackName}-VpcId
365 | PublicSubnetOne:
366 | Description: Public subnet one
367 | Value: !Ref 'PublicSubnetOne'
368 | Export:
369 | Name: !Sub ${AWS::StackName}-PublicSubnetOne
370 | PublicSubnetTwo:
371 | Description: Public subnet two
372 | Value: !Ref 'PublicSubnetTwo'
373 | Export:
374 | Name: !Sub ${AWS::StackName}-PublicSubnetTwo
375 | PrivateSubnetOne:
376 | Description: Private subnet one
377 | Value: !Ref 'PrivateSubnetOne'
378 | Export:
379 | Name: !Sub ${AWS::StackName}-PrivateSubnetOne
380 | PrivateSubnetTwo:
381 | Description: Private subnet two
382 | Value: !Ref 'PrivateSubnetTwo'
383 | Export:
384 | Name: !Sub ${AWS::StackName}-PrivateSubnetTwo
385 | ECSAutoScalingGroup:
386 | Description: Autoscaling group for EC2 instances
387 | Value: !Ref ECSAutoScalingGroup
388 | Export:
389 | Name: !Sub ${AWS::StackName}-ECSAutoScalingGroup
390 | ECSLaunchConfiguration:
391 | Description: Launch configuration for EC2 instances
392 | Value: !Ref ContainerInstances
393 | Export:
394 | Name: !Sub ${AWS::StackName}-ECSLaunchConfiguration
395 | ContainerSecurityGroup:
396 | Description: A security group used to allow containers to receive traffic
397 | Value: !Ref 'ContainerSecurityGroup'
398 | Export:
399 | Name: !Sub ${AWS::StackName}-ContainerSecurityGroup
400 |
--------------------------------------------------------------------------------
/deploy-adot/env.sh:
--------------------------------------------------------------------------------

##!/bin/bash
# Deployment-wide settings. Update ACCOUNT_ID and AWS_REGION to match your
# environment before sourcing this script (see README).
export AWS_REGION=us-east-1
export ACCOUNT_ID=123456789012
export STACK_NAME=ecs-stack

# Resolve cluster and network identifiers from the outputs of the
# CloudFormation stack created from ecs-cluster.yaml.
export CLUSTER_NAME=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`ClusterName`].OutputValue' --output text)
export VPC_ID=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`VpcId`].OutputValue' --output text)
# Subnets are located via the ecs.io/role/* tags applied in ecs-cluster.yaml.
export PUBLIC_SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" "Name=tag-key,Values=ecs.io/role/elb" --query "Subnets[].SubnetId" --output text)
# NOTE(review): private subnet IDs are emitted as JSON while the public ones
# use text output — presumably because the JSON list form is spliced into the
# awsvpcConfiguration shorthand in services.sh; confirm before changing.
export PRIVATE_SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" "Name=tag-key,Values=ecs.io/role/internal-elb" --query "Subnets[].SubnetId" --output json)
export SECURITY_GROUP_ID=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`ContainerSecurityGroup`].OutputValue' --output text)

--------------------------------------------------------------------------------
/deploy-adot/iam.sh:
--------------------------------------------------------------------------------
##!/bin/bash

#
# Create a trust policy for ECS task and task execution roles.
# It allows the ECS task scheduler (ecs-tasks.amazonaws.com) to assume
# the roles created below.
#
# NOTE: the heredoc redirections below were broken ("cat < file" read the
# file instead of writing it); restored to "cat <<EOF > file".
#
cat <<EOF > TrustPolicy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "ecs-tasks.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF

#
# Create a permission policy for the Task Execution Role
# This allows ECS to retrieve parameters from SSM Parameter Store defined in the Task Definitions
#
cat <<EOF > TaskExecutionPermissionPolicy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "ssm:GetParameter",
        "ssm:GetParameters"
      ],
      "Resource": "*"
    }
  ]
}
EOF


#
# Create a permission policy for the Task role associated with the ADOT task
# This allows the ADOT Collector to send metrics to a workspace in AMP, access SSM Parameter Store and read service registries in Cloud Map
#
cat <<EOF > AdotTaskPermissionPolicy.json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "aps:RemoteWrite",
        "aps:GetSeries",
        "aps:GetLabels",
        "aps:GetMetricMetadata"
      ],
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "ssm:GetParameter",
        "ssm:GetParameters"
      ],
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": [
        "servicediscovery:*"
      ],
      "Resource": "*"
    }
  ]
}
EOF

# AWS-managed policies attached to the roles below
XRAY_DAEMON_POLICY_ARN=arn:aws:iam::aws:policy/AWSXRayDaemonWriteAccess
CLOUDWATCH_LOGS_POLICY_ARN=arn:aws:iam::aws:policy/CloudWatchLogsFullAccess
ECS_TASK_EXECUTION_POLICY_ARN=arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy


# Task execution role: used by the ECS agent to pull images, write logs
# and fetch the SSM parameters referenced in the task definitions.
ECS_TASK_EXECUTION_ROLE="ECS-Task-Execution-Role"
ECS_TASK_EXECUTION_ROLE_ARN=$(aws iam create-role \
  --role-name $ECS_TASK_EXECUTION_ROLE \
  --assume-role-policy-document file://TrustPolicy.json \
  --query "Role.Arn" --output text)

ECS_SSM_TASK_EXECUTION_POLICY="ECSSSMTaskExecutionPolicy"
ECS_SSM_TASK_EXECUTION_POLICY_ARN=$(aws iam create-policy --policy-name $ECS_SSM_TASK_EXECUTION_POLICY \
  --policy-document file://TaskExecutionPermissionPolicy.json \
  --query 'Policy.Arn' --output text)

aws iam attach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
aws iam attach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_TASK_EXECUTION_POLICY_ARN
aws iam attach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_SSM_TASK_EXECUTION_POLICY_ARN

# Generic task role for application containers (CloudWatch Logs access only)
ECS_GENERIC_TASK_ROLE="ECS-Generic-Task-Role"
ECS_GENERIC_TASK_ROLE_ARN=$(aws iam create-role \
  --role-name $ECS_GENERIC_TASK_ROLE \
  --assume-role-policy-document file://TrustPolicy.json \
  --query "Role.Arn" --output text)
aws iam attach-role-policy --role-name $ECS_GENERIC_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN

# Task role for the ADOT Collector task (AMP remote write, SSM, Cloud Map,
# plus the X-Ray and CloudWatch Logs managed policies)
ECS_ADOT_TASK_ROLE="ECS-ADOT-Task-Role"
ECS_ADOT_TASK_ROLE_ARN=$(aws iam create-role \
  --role-name $ECS_ADOT_TASK_ROLE \
  --assume-role-policy-document file://TrustPolicy.json \
  --query "Role.Arn" --output text)

ECS_ADOT_TASK_POLICY="ECSAdotTaskPolicy"
ECS_ADOT_TASK_POLICY_ARN=$(aws iam create-policy --policy-name $ECS_ADOT_TASK_POLICY \
  --policy-document file://AdotTaskPermissionPolicy.json \
  --query 'Policy.Arn' --output text)

aws iam attach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $XRAY_DAEMON_POLICY_ARN
aws iam attach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
aws iam attach-role-policy --role-name $ECS_ADOT_TASK_ROLE --policy-arn $ECS_ADOT_TASK_POLICY_ARN

# Role names and policy ARNs are consumed by task-definitions.sh and the
# cleanup scripts (this file is sourced, so the exports persist).
export ECS_GENERIC_TASK_ROLE
export ECS_TASK_EXECUTION_ROLE
export ECS_ADOT_TASK_ROLE

export XRAY_DAEMON_POLICY_ARN
export CLOUDWATCH_LOGS_POLICY_ARN
export ECS_TASK_EXECUTION_POLICY_ARN
export ECS_SSM_TASK_EXECUTION_POLICY_ARN
export ECS_ADOT_TASK_POLICY_ARN
--------------------------------------------------------------------------------
/deploy-adot/otel-collector-config-cloudwatch.yaml.template:
--------------------------------------------------------------------------------
1 | receivers:
2 | awsxray:
3 | prometheus:
4 | config:
5 | global:
6 | scrape_interval: 15s
7 | scrape_timeout: 10s
8 | scrape_configs:
9 | - job_name: ecs_services
10 | http_sd_configs:
11 | - url: http://localhost:9001/prometheus-targets
12 | refresh_interval: 30s
13 |
14 | processors:
15 | batch/metrics:
16 | timeout: 60s
17 | metricstransform/labelling:
18 | transforms:
19 | - include: .*
20 | match_type: regexp
21 | action: update
22 | operations:
23 | - action: update_label
24 | label: cluster
25 | new_label: ClusterName
26 | - action: update_label
27 | label: service
28 | new_label: SdServiceName
29 | - action: update_label
30 | label: taskid
31 | new_label: SdTaskID
32 | - action: update_label
33 | label: namespace
34 | new_label: SdNamespaceName
35 | filter/include:
36 | metrics:
37 | include:
38 | match_type: regexp
39 | metric_names:
40 | - ^http_requests_total$
41 |
42 | exporters:
43 | awsxray:
44 | awsemf:
45 | namespace: ECS/ContainerInsights
46 | log_group_name: '/aws/ecs/containerinsights/{ClusterName}/prometheus'
47 | dimension_rollup_option: NoDimensionRollup
48 | metric_declarations:
49 | - dimensions: [[ClusterName, SdNamespaceName, SdServiceName, SdTaskID]]
50 | metric_name_selectors:
51 | - http_requests_total
52 |
53 | extensions:
54 | health_check: null
55 | pprof:
56 | endpoint: ':1888'
57 | zpages:
58 | endpoint: ':55679'
59 | service:
60 | extensions:
61 | - pprof
62 | - zpages
63 | - health_check
64 | pipelines:
65 | metrics:
66 | receivers: [prometheus]
67 | processors: [filter/include,batch/metrics,metricstransform/labelling]
68 | exporters: [awsemf]
69 | traces:
70 | receivers: [awsxray]
71 | exporters: [awsxray]
--------------------------------------------------------------------------------
/deploy-adot/otel-collector-config.yaml.template:
--------------------------------------------------------------------------------
1 | receivers:
2 | awsxray:
3 | prometheus:
4 | config:
5 | global:
6 | scrape_interval: 15s
7 | scrape_timeout: 10s
8 | scrape_configs:
9 | - job_name: ecs_services
10 | http_sd_configs:
11 | - url: http://localhost:9001/prometheus-targets
12 | refresh_interval: 30s
13 | exporters:
14 | awsxray:
15 | awsprometheusremotewrite:
16 | endpoint: https://aps-workspaces.REGION.amazonaws.com/workspaces/WORKSPACE/api/v1/remote_write
17 | aws_auth:
18 | region: REGION
19 | service: aps
20 | extensions:
21 | health_check: null
22 | pprof:
23 | endpoint: ':1888'
24 | zpages:
25 | endpoint: ':55679'
26 | service:
27 | extensions:
28 | - pprof
29 | - zpages
30 | - health_check
31 | pipelines:
32 | metrics:
33 | receivers: [prometheus]
34 | exporters: [awsprometheusremotewrite]
35 | traces:
36 | receivers: [awsxray]
37 | exporters: [awsxray]
--------------------------------------------------------------------------------
/deploy-adot/otel-config.sh:
--------------------------------------------------------------------------------
##!/bin/bash

# Create an AMP workspace to receive the metrics scraped by the ADOT Collector.
WORKSPACE_ID=$(aws amp create-workspace --alias adot-prometheus-for-ecs --query "workspaceId" --output text)

# Render the collector configuration from its template, substituting the new
# workspace ID and the current region into the remote-write endpoint.
sed -e "s/WORKSPACE/$WORKSPACE_ID/g" -e "s/REGION/$AWS_REGION/g" otel-collector-config.yaml.template > otel-collector-config.yaml

# Store the rendered configuration and the service discovery namespace list
# in SSM Parameter Store, where the ECS tasks read them.
aws ssm put-parameter --name otel-collector-config --value file://otel-collector-config.yaml --type String
aws ssm put-parameter --name ECS-Namespaces --value ecs-services --type StringList

export WORKSPACE_ID
--------------------------------------------------------------------------------
/deploy-adot/services.sh:
--------------------------------------------------------------------------------
##!/bin/bash

#
# Launches one ECS service on the EC2 launch type, registered with Cloud Map.
# Reads SERVICE_NAME, TASK_DEFINITION and CLOUDMAP_SERVICE_ARN, plus
# CLUSTER_NAME, PRIVATE_SUBNET_IDS and SECURITY_GROUP_ID from env.sh.
#
launch_ecs_service() {
    aws ecs create-service --service-name $SERVICE_NAME \
        --cluster $CLUSTER_NAME \
        --task-definition $TASK_DEFINITION \
        --desired-count 1 \
        --enable-execute-command \
        --service-registries "registryArn=$CLOUDMAP_SERVICE_ARN" \
        --network-configuration "awsvpcConfiguration={subnets=$PRIVATE_SUBNET_IDS,securityGroups=[$SECURITY_GROUP_ID],assignPublicIp=DISABLED}" \
        --scheduling-strategy REPLICA \
        --launch-type EC2
}

#
# WebApp Service
#
SERVICE_NAME=WebAppService
TASK_DEFINITION=$WEBAPP_TASK_DEFINITION
CLOUDMAP_SERVICE_ARN=$CLOUDMAP_WEBAPP_SERVICE_ARN
launch_ecs_service

#
# ADOT Collector Service
#
SERVICE_NAME=ADOTService
TASK_DEFINITION=$ADOT_TASK_DEFINITION
CLOUDMAP_SERVICE_ARN=$CLOUDMAP_ADOT_COLLECTOR_SERVICE_ARN
launch_ecs_service
--------------------------------------------------------------------------------
/deploy-adot/task-definitions.sh:
--------------------------------------------------------------------------------
##!/bin/bash

#
# Renders a task definition from its .template file — substituting the
# current account ID and region — registers it with ECS, and prints the
# resulting task definition ARN.
#
register_task_definition() {
    local template=$1
    local rendered=${template%.template}
    sed -e "s/ACCOUNT/$ACCOUNT_ID/g" -e "s/REGION/$AWS_REGION/g" < "$template" > "$rendered"
    aws ecs register-task-definition \
        --cli-input-json "file://$rendered" \
        --region $AWS_REGION \
        --query "taskDefinition.taskDefinitionArn" --output text
}

WEBAPP_TASK_DEFINITION=$(register_task_definition webappTaskDefinition.json.template)
ADOT_TASK_DEFINITION=$(register_task_definition adotTaskDefinition.json.template)

# Consumed by services.sh and cleanup-ecs.sh (this file is sourced).
export WEBAPP_TASK_DEFINITION
export ADOT_TASK_DEFINITION
--------------------------------------------------------------------------------
/deploy-adot/webappTaskDefinition.json.template:
--------------------------------------------------------------------------------
1 | {
2 | "family":"WebAppTask",
3 | "taskRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Generic-Task-Role",
4 | "executionRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Task-Execution-Role",
5 | "networkMode":"awsvpc",
6 | "containerDefinitions":[
7 | {
8 | "name":"webapp",
9 | "image":"public.ecr.aws/awsvijisarathy/generic-webapp:6.0",
10 | "portMappings" : [
11 | {
12 | "containerPort": 3000,
13 | "protocol": "tcp"
14 | }
15 | ],
16 | "logConfiguration":{
17 | "logDriver":"awslogs",
18 | "options":{
19 | "awslogs-group":"/ecs/webapp",
20 | "awslogs-create-group":"true",
21 | "awslogs-region":"REGION"
22 | }
23 | },
24 | "essential":true
25 | },
26 | {
27 | "name":"ecs-exporter",
28 | "image":"public.ecr.aws/awsvijisarathy/ecs-exporter:1.2",
29 | "portMappings" : [
30 | {
31 | "containerPort": 9779,
32 | "protocol": "tcp"
33 | }
34 | ],
35 | "logConfiguration":{
36 | "logDriver":"awslogs",
37 | "options":{
38 | "awslogs-group":"/ecs/ecs-exporter",
39 | "awslogs-create-group":"true",
40 | "awslogs-region":"REGION"
41 | }
42 | },
43 | "essential":true
44 | }
45 | ],
46 | "requiresCompatibilities":[
47 | "EC2"
48 | ],
49 | "cpu":"256",
50 | "memory":"256"
51 | }
52 |
--------------------------------------------------------------------------------
/deploy-prometheus/README.md:
--------------------------------------------------------------------------------
1 | ## Metrics collection using Prometheus on Amazon ECS
2 |
3 | This directory contains software artifacts to deploy [Prometheus](https://prometheus.io/docs/introduction/overview/#what-is-prometheus) server and [Prometheus Node Exporter](https://prometheus.io/docs/guides/node-exporter) to an Amazon ECS cluster and collect Prometheus metrics from applications, using AWS Cloud Map for dynamic service discovery. Please refer to this [blog](https://aws.amazon.com/blogs/opensource/metrics-collection-from-amazon-ecs-using-amazon-managed-service-for-prometheus/) for implementations details about this solution architecture.
4 |
5 |
6 |
7 | ### Solution overview
8 |
9 | At a high level, we will be following the steps outlined below for this solution:
10 |
11 |
1. Setup AWS Cloud Map for service discovery

2. Deploy application services to an Amazon ECS cluster and register them with AWS Cloud Map

3. Deploy Prometheus server to Amazon ECS, configure service discovery and send metrics data to Amazon Managed Service for Prometheus (AMP)

4. Visualize metrics data using Amazon Managed Service for Grafana (AMG)
23 |
24 |
25 |
26 | ### Deploy
27 |
28 | Make sure you have the latest version of AWS CLI that provides support for AMP. The deployment requires an ECS cluster. For deploying the Prometheus Node Exporter, a cluster with EC2 instances is required. All deployment artifacts are under the [deploy](https://github.com/aws-samples/prometheus-for-ecs/tree/main/deploy-prometheus) directory. The deployment comprises the following components:
29 | - An ECS task comprising the Prometheus server, AWS Sig4 proxy and the [service discovery application](https://github.com/aws-samples/prometheus-for-ecs/tree/main/cmd) containers
30 |
31 | - A sample web application that is instrumented with [Prometheus Go client library](https://github.com/prometheus/client_golang) and exposes an HTTP endpoint */work*. The application has an internal load generator that sends client requests to the HTTP endpoint. The service exposes a [Counter](https://prometheus.io/docs/concepts/metric_types/#counter) named *http_requests_total* and a [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram) named *request_duration_milliseconds*
32 |
33 | - Prometheus Node Exporter to monitor system metrics from every container instance in the cluster. This service is deployed using [host networking mode](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_definition_parameters.html#network_mode) and with the daemon scheduling strategy. Note that we can’t deploy the Node Exporter on AWS Fargate because it does not support the daemon scheduling strategy.
34 |
35 |
The deployment scripts assume that the underlying ECS cluster was created using the [ecs-cluster.yaml](https://github.com/aws-samples/prometheus-for-ecs/blob/main/deploy-prometheus/ecs-cluster.yaml) CloudFormation template.
37 | Create the cluster with the following command:
38 | ```
39 | VPC_STACK_NAME=ecs-stack
40 | VPC_TEMPLATE=ecs-cluster.yaml
41 | aws cloudformation deploy --stack-name $VPC_STACK_NAME --template-file $VPC_TEMPLATE --capabilities CAPABILITY_IAM
42 | ```
43 |
44 | Before proceeding further, export a set of environment variables that are required by scripts used in subsequent steps. Modify the **ACCOUNT_ID** and **AWS_REGION** variables in the *env.sh* script before running the command below.
45 | ```
46 | source env.sh
47 | ```
48 |
49 | Create the ECS task role, task execution roles and the relevant IAM policies.
50 | ```
51 | source iam.sh
52 | ```
53 |
54 | Create a service discovery namespace and service registries under AWS Cloud Map. The ECS tasks that you will deploy will register themselves in these service registries upon launch.
55 | ```
56 | source cloudmap.sh
57 | ```
58 |
59 | Create a workspace under AMP for ingesting Prometheus metrics scraped from ECS services.
60 | ```
61 | source amp.sh
62 | ```
The above command generates the initial configuration file *prometheus.yaml* for the Prometheus server, with the AMP workspace as the remote write destination.
64 | Create two parameters in the AWS SSM Parameter Store as follows:
65 | - parameter named **ECS-Prometheus-Configuration** and of type *String* using the contents of the *prometheus.yaml* file
66 | - parameter named **ECS-ServiceDiscovery-Namespaces** and of type *String* with its value set to **ecs-services**
67 |
68 | Next, register task definitions with ECS
69 | ```
70 | source task-definitions.sh
71 | ```
72 |
73 | Launch the ECS services using the task definitions created above.
74 | ```
75 | source services.sh
76 | ```
77 |
78 | Once the services are all up and running, the AMP workspace will start ingesting metrics collected by the Prometheus server from the web application. Use AMG to query and visualize the metrics ingested into AMP. You may use the following PromQL queries to visualize the metrics collected from the web application and Prometheus Node Exporter
79 | - HTTP request rate: *sum(rate(http_requests_total[5m]))*
80 | - Average response latency: *sum(rate(request_duration_milliseconds_sum[5m])) / sum(rate(request_duration_milliseconds_count[5m]))*
81 | - Average CPU usage: *100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100)*
82 |
83 | ### Cleanup
84 |
When you are done, clean up the resources you created above with the following set of commands.
86 | ```
87 | source cleanup-ecs.sh
88 | source cleanup-cloudmap.sh
89 | source cleanup-iam.sh
90 | aws cloudformation delete-stack --stack-name $VPC_STACK_NAME
91 | ```
92 |
93 | ## Security
94 |
95 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
96 |
97 | ## License
98 |
99 | This library is licensed under the MIT-0 License. See the LICENSE file.
100 |
101 |
--------------------------------------------------------------------------------
/deploy-prometheus/amp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Create an AMP workspace for ingesting Prometheus metrics and generate the
4 | # initial Prometheus configuration file with that workspace as the
5 | # remote-write destination.
6 | #
7 | WORKSPACE_ID=$(aws amp create-workspace --alias prometheus-for-ecs --query "workspaceId" --output text)
8 | 
9 | # Substitute the workspace ID into the remote_write URL of the template.
10 | sed -e s/WORKSPACE/$WORKSPACE_ID/g \
11 |     < prometheus.yaml.template \
12 |     > prometheus.yaml
13 | 
--------------------------------------------------------------------------------
/deploy-prometheus/cleanup-cloudmap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #
4 | # Delete the Cloud Map service registries, then the namespace that contains
5 | # them (a namespace cannot be deleted while it still holds services).
6 | # Relies on the CLOUDMAP_* variables exported by cloudmap.sh.
7 | #
8 | aws servicediscovery delete-service --id $CLOUDMAP_WEBAPP_SERVICE_ID
9 | aws servicediscovery delete-service --id $CLOUDMAP_NODE_EXPORTER_SERVICE_ID
10 | aws servicediscovery delete-namespace --id $CLOUDMAP_NAMESPACE_ID
11 | 
--------------------------------------------------------------------------------
/deploy-prometheus/cleanup-ecs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #
4 | # Delete the ECS services. REPLICA services are scaled down to zero tasks
5 | # before deletion. Relies on the variables exported by env.sh,
6 | # task-definitions.sh and services.sh.
7 | #
8 | SERVICE_NAME=WebAppService
9 | aws ecs update-service --cluster $CLUSTER_NAME --service $SERVICE_NAME --desired-count 0
10 | aws ecs delete-service --cluster $CLUSTER_NAME --service $SERVICE_NAME
11 | 
12 | SERVICE_NAME=PrometheusService
13 | aws ecs update-service --cluster $CLUSTER_NAME --service $SERVICE_NAME --desired-count 0
14 | aws ecs delete-service --cluster $CLUSTER_NAME --service $SERVICE_NAME
15 | 
16 | # NodeExporterService uses the DAEMON scheduling strategy (see services.sh),
17 | # which does not support desired-count updates; --force deletes the service
18 | # even though its tasks are still running.
19 | SERVICE_NAME=NodeExporterService
20 | aws ecs delete-service --cluster $CLUSTER_NAME --service $SERVICE_NAME --force
21 | 
22 | #
23 | # Deregister task definitions
24 | #
25 | aws ecs deregister-task-definition --task-definition $WEBAPP_TASK_DEFINITION
26 | aws ecs deregister-task-definition --task-definition $PROMETHEUS_TASK_DEFINITION
27 | aws ecs deregister-task-definition --task-definition $NODEEXPORTER_TASK_DEFINITION
28 | 
--------------------------------------------------------------------------------
/deploy-prometheus/cleanup-iam.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Delete the IAM roles and policies created by iam.sh.
4 | # Managed policies must be detached from every role before the customer
5 | # managed policy and the roles themselves can be deleted.
6 | #
7 | aws iam detach-role-policy --role-name $ECS_GENERIC_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
8 | aws iam detach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
9 | aws iam detach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_TASK_EXECUTION_POLICY_ARN
10 | aws iam detach-role-policy --role-name $ECS_PROMETHEUS_TASK_ROLE --policy-arn $ECS_PROMETHEUS_TASK_POLICY_ARN
11 | 
12 | aws iam delete-policy --policy-arn $ECS_PROMETHEUS_TASK_POLICY_ARN
13 | 
14 | aws iam delete-role --role-name $ECS_GENERIC_TASK_ROLE
15 | aws iam delete-role --role-name $ECS_TASK_EXECUTION_ROLE
16 | aws iam delete-role --role-name $ECS_PROMETHEUS_TASK_ROLE
17 | 
--------------------------------------------------------------------------------
/deploy-prometheus/cloudmap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #
4 | # Create a Cloud Map private DNS service discovery namespace
5 | #
6 | SERVICE_DISCOVERY_NAMESPACE=ecs-services
7 | OPERATION_ID=$(aws servicediscovery create-private-dns-namespace \
8 |     --vpc $VPC_ID \
9 |     --name $SERVICE_DISCOVERY_NAMESPACE \
10 |     --query "OperationId" --output text)
11 | 
12 | # Returns the current status of the namespace-creation operation.
13 | operationStatus() {
14 |     aws servicediscovery get-operation --operation-id $OPERATION_ID --query "Operation.Status" --output text
15 | }
16 | 
17 | # Namespace creation is asynchronous; poll until the operation leaves PENDING.
18 | until [ $(operationStatus) != "PENDING" ]; do
19 |     echo "Namespace $SERVICE_DISCOVERY_NAMESPACE is creating ..."
20 |     sleep 10s
21 |     if [ $(operationStatus) == "SUCCESS" ]; then
22 |         echo "Namespace $SERVICE_DISCOVERY_NAMESPACE created"
23 |         break
24 |     fi
25 | done
26 | 
27 | CLOUDMAP_NAMESPACE_ID=$(aws servicediscovery get-operation \
28 |     --operation-id $OPERATION_ID \
29 |     --query "Operation.Targets.NAMESPACE" --output text)
30 | 
31 | #
32 | # Create a Service Discovery service in the above namespace.
33 | # When creating a Service Discovery service with either private or public DNS,
34 | # different DNS record types are available. On a DNS query for the service name:
35 | #   1. "A" records return a set of IP addresses that correspond to your tasks.
36 | #   2. "SRV" records return a set of IP addresses and ports per task.
37 | #
38 | METRICS_PATH=/metrics
39 | METRICS_PORT=3000
40 | SERVICE_REGISTRY_NAME="webapp-svc"
41 | SERVICE_REGISTRY_DESCRIPTION="Service registry for Webapp ECS service"
42 | CLOUDMAP_WEBAPP_SERVICE_ID=$(aws servicediscovery create-service \
43 |     --name $SERVICE_REGISTRY_NAME \
44 |     --description "$SERVICE_REGISTRY_DESCRIPTION" \
45 |     --namespace-id $CLOUDMAP_NAMESPACE_ID \
46 |     --dns-config "NamespaceId=$CLOUDMAP_NAMESPACE_ID,RoutingPolicy=WEIGHTED,DnsRecords=[{Type=A,TTL=10}]" \
47 |     --region $AWS_REGION \
48 |     --tags Key=METRICS_PATH,Value=$METRICS_PATH Key=METRICS_PORT,Value=$METRICS_PORT \
49 |     --query "Service.Id" --output text)
50 | CLOUDMAP_WEBAPP_SERVICE_ARN=$(aws servicediscovery get-service \
51 |     --id $CLOUDMAP_WEBAPP_SERVICE_ID \
52 |     --query "Service.Arn" --output text)
53 | echo "Service registry $SERVICE_REGISTRY_NAME created"
54 | 
55 | # Node Exporter runs with host networking, so SRV records (IP + port per task)
56 | # are used for its registry.
57 | METRICS_PATH=/metrics
58 | METRICS_PORT=9100
59 | SERVICE_REGISTRY_NAME="node-exporter-svc"
60 | SERVICE_REGISTRY_DESCRIPTION="Service registry for Node Exporter ECS service"
61 | CLOUDMAP_NODE_EXPORTER_SERVICE_ID=$(aws servicediscovery create-service \
62 |     --name $SERVICE_REGISTRY_NAME \
63 |     --description "$SERVICE_REGISTRY_DESCRIPTION" \
64 |     --namespace-id $CLOUDMAP_NAMESPACE_ID \
65 |     --dns-config "NamespaceId=$CLOUDMAP_NAMESPACE_ID,RoutingPolicy=WEIGHTED,DnsRecords=[{Type=SRV,TTL=10}]" \
66 |     --region $AWS_REGION \
67 |     --tags Key=METRICS_PATH,Value=$METRICS_PATH Key=METRICS_PORT,Value=$METRICS_PORT \
68 |     --query "Service.Id" --output text)
69 | CLOUDMAP_NODE_EXPORTER_SERVICE_ARN=$(aws servicediscovery get-service \
70 |     --id $CLOUDMAP_NODE_EXPORTER_SERVICE_ID \
71 |     --query "Service.Arn" --output text)
72 | echo "Service registry $SERVICE_REGISTRY_NAME created"
73 | 
74 | export CLOUDMAP_NAMESPACE_ID
75 | export CLOUDMAP_NODE_EXPORTER_SERVICE_ARN
76 | export CLOUDMAP_NODE_EXPORTER_SERVICE_ID
77 | export CLOUDMAP_WEBAPP_SERVICE_ARN
78 | export CLOUDMAP_WEBAPP_SERVICE_ID
79 | 
--------------------------------------------------------------------------------
/deploy-prometheus/ecs-cluster.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: '2010-09-09'
2 | Description: EC2 ECS cluster running containers in a private subnet. Supports
3 | public facing load balancers, private internal load balancers, and
4 | both internal and external service discovery namespaces.
5 | Parameters:
6 | VpcName:
7 | Type: String
8 | Default: ECS-VPC
9 | Description: Unique name for the VPC
10 | EnvironmentName:
11 | Type: String
12 | Default: ecs-prometheus-cluster
13 | Description: "A friendly environment name that will be used for namespacing all cluster resources. Example: staging, qa, or production"
14 |   InstanceType:
15 |     # EC2 instance type for the ECS container instances.
16 |     Type: String
17 |     Default: c5.large
18 |     Description: Class of EC2 instance used to host containers. Choose t2 for testing, m5 for general purpose, c5 for CPU intensive services, and r5 for memory intensive services
19 |     AllowedValues: [ t2.micro, t2.small, t2.medium, t2.large, t2.xlarge, t2.2xlarge,
20 |                      m5.large, m5.xlarge, m5.2xlarge, m5.4xlarge, m5.12xlarge, m5.24xlarge,
21 |                      c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge, c5.18xlarge,
22 |                      r5.large, r5.xlarge, r5.2xlarge, r5.4xlarge, r5.12xlarge, r5.24xlarge ]
23 | ConstraintDescription: Please choose a valid instance type.
24 | DesiredCapacity:
25 | Type: Number
26 | Default: '2'
27 | Description: Number of EC2 instances to launch in your ECS cluster.
28 | MaxSize:
29 | Type: Number
30 | Default: '4'
31 | Description: Maximum number of EC2 instances that can be launched in your ECS cluster.
32 |   ECSAMI:
33 |     # Resolved through SSM at deploy time to the latest ECS-optimized AMI.
34 |     Type: AWS::SSM::Parameter::Value<String>
35 |     Default: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id
36 |     Description: The Amazon Machine Image ID used for the cluster, leave it as the default value to get the latest AMI
37 | Subnet01Name:
38 | Type: String
39 | Default: "PUBLIC-WORKER-1"
40 | Description: Name for public subnet 01
41 | Subnet02Name:
42 | Type: String
43 | Default: "PUBLIC-WORKER-2"
44 | Description: Name for public subnet 02
45 | Subnet01PrivateName:
46 | Type: String
47 | Default: "PRIVATE-WORKER-1"
48 | Description: Name for private subnet 01
49 | Subnet02PrivateName:
50 | Type: String
51 | Default: "PRIVATE-WORKER-2"
52 | Description: Name for private subnet 02
53 | Mappings:
54 | SubnetConfig:
55 | VPC:
56 | CIDR: '10.10.0.0/16'
57 | PublicOne:
58 | CIDR: '10.10.0.0/24'
59 | PublicTwo:
60 | CIDR: '10.10.1.0/24'
61 | PrivateOne:
62 | CIDR: '10.10.100.0/24'
63 | PrivateTwo:
64 | CIDR: '10.10.101.0/24'
65 | Resources:
66 | VPC:
67 | Type: AWS::EC2::VPC
68 | Properties:
69 | EnableDnsSupport: true
70 | EnableDnsHostnames: true
71 | CidrBlock: !FindInMap ['SubnetConfig', 'VPC', 'CIDR']
72 | Tags:
73 | - Key: Name
74 | Value: !Ref VpcName
75 |
76 | #
77 | # Two public subnets, where containers can have public IP addresses
78 | #
79 | PublicSubnetOne:
80 | Type: AWS::EC2::Subnet
81 | Properties:
82 | AvailabilityZone: !Select
83 | - 0
84 | - Fn::GetAZs: !Ref 'AWS::Region'
85 | VpcId: !Ref 'VPC'
86 | Tags:
87 | - Key: Name
88 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet01Name] ]
89 | - Key: ecs.io/role/elb
90 | Value: 1
91 | CidrBlock: !FindInMap ['SubnetConfig', 'PublicOne', 'CIDR']
92 | MapPublicIpOnLaunch: true
93 | PublicSubnetTwo:
94 | Type: AWS::EC2::Subnet
95 | Properties:
96 | AvailabilityZone: !Select
97 | - 1
98 | - Fn::GetAZs: !Ref 'AWS::Region'
99 | VpcId: !Ref 'VPC'
100 | Tags:
101 | - Key: Name
102 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet02Name] ]
103 | - Key: ecs.io/role/elb
104 | Value: 1
105 | CidrBlock: !FindInMap ['SubnetConfig', 'PublicTwo', 'CIDR']
106 | MapPublicIpOnLaunch: true
107 |
108 | #
109 | # Two private subnets where containers will only have private IP addresses
110 | #
111 | PrivateSubnetOne:
112 | Type: AWS::EC2::Subnet
113 | Properties:
114 | AvailabilityZone: !Select
115 | - 0
116 | - Fn::GetAZs: !Ref 'AWS::Region'
117 | VpcId: !Ref 'VPC'
118 | Tags:
119 | - Key: Name
120 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet01PrivateName] ]
121 | - Key: ecs.io/role/internal-elb
122 | Value: 1
123 | CidrBlock: !FindInMap ['SubnetConfig', 'PrivateOne', 'CIDR']
124 | PrivateSubnetTwo:
125 | Type: AWS::EC2::Subnet
126 | Properties:
127 | AvailabilityZone: !Select
128 | - 1
129 | - Fn::GetAZs: !Ref 'AWS::Region'
130 | VpcId: !Ref 'VPC'
131 | Tags:
132 | - Key: Name
133 | Value: !Join [ '-', [ !Ref VpcName, !Ref Subnet02PrivateName] ]
134 | - Key: ecs.io/role/internal-elb
135 | Value: 1
136 | CidrBlock: !FindInMap ['SubnetConfig', 'PrivateTwo', 'CIDR']
137 |
138 | #
139 | # Setup networking resources for the public subnets. Containers
140 | # in the public subnets have public IP addresses and the routing table
141 | # sends network traffic via the internet gateway.
142 | #
143 | InternetGateway:
144 | Type: AWS::EC2::InternetGateway
145 | GatewayAttachment:
146 | Type: AWS::EC2::VPCGatewayAttachment
147 | Properties:
148 | VpcId: !Ref 'VPC'
149 | InternetGatewayId: !Ref 'InternetGateway'
150 | PublicRouteTable:
151 | Type: AWS::EC2::RouteTable
152 | Properties:
153 | VpcId: !Ref 'VPC'
154 | Tags:
155 | - Key: Name
156 | Value: !Join [ '-', [ !Ref VpcName, 'PUBLIC-ROUTE-TABLE'] ]
157 | - Key: Network
158 | Value: Public
159 | PublicRoute:
160 | Type: AWS::EC2::Route
161 | DependsOn: GatewayAttachment
162 | Properties:
163 | RouteTableId: !Ref 'PublicRouteTable'
164 | DestinationCidrBlock: '0.0.0.0/0'
165 | GatewayId: !Ref 'InternetGateway'
166 | PublicSubnetOneRouteTableAssociation:
167 | Type: AWS::EC2::SubnetRouteTableAssociation
168 | Properties:
169 | SubnetId: !Ref PublicSubnetOne
170 | RouteTableId: !Ref PublicRouteTable
171 | PublicSubnetTwoRouteTableAssociation:
172 | Type: AWS::EC2::SubnetRouteTableAssociation
173 | Properties:
174 | SubnetId: !Ref PublicSubnetTwo
175 | RouteTableId: !Ref PublicRouteTable
176 |
177 | #
178 | # Setup networking resources for the private subnets. Containers
179 | # in these subnets have only private IP addresses, and must use a NAT
180 | # gateway to talk to the internet. We launch two NAT gateways, one for
181 | # each private subnet.
182 | #
183 | NatGatewayOneAttachment:
184 | Type: AWS::EC2::EIP
185 | DependsOn: GatewayAttachment
186 | Properties:
187 | Domain: vpc
188 | NatGatewayTwoAttachment:
189 | Type: AWS::EC2::EIP
190 | DependsOn: GatewayAttachment
191 | Properties:
192 | Domain: vpc
193 | NatGatewayOne:
194 | Type: AWS::EC2::NatGateway
195 | Properties:
196 | AllocationId: !GetAtt NatGatewayOneAttachment.AllocationId
197 | SubnetId: !Ref PublicSubnetOne
198 | NatGatewayTwo:
199 | Type: AWS::EC2::NatGateway
200 | Properties:
201 | AllocationId: !GetAtt NatGatewayTwoAttachment.AllocationId
202 | SubnetId: !Ref PublicSubnetTwo
203 | PrivateRouteTableOne:
204 | Type: AWS::EC2::RouteTable
205 | Properties:
206 | VpcId: !Ref 'VPC'
207 | Tags:
208 | - Key: Name
209 | Value: !Join [ '-', [ !Ref VpcName, 'PRIVATE-ROUTE-TABLE-01'] ]
210 | - Key: Network
211 | Value: Private
212 | PrivateRouteOne:
213 | Type: AWS::EC2::Route
214 | Properties:
215 | RouteTableId: !Ref PrivateRouteTableOne
216 | DestinationCidrBlock: 0.0.0.0/0
217 | NatGatewayId: !Ref NatGatewayOne
218 | PrivateRouteTableOneAssociation:
219 | Type: AWS::EC2::SubnetRouteTableAssociation
220 | Properties:
221 | RouteTableId: !Ref PrivateRouteTableOne
222 | SubnetId: !Ref PrivateSubnetOne
223 | PrivateRouteTableTwo:
224 | Type: AWS::EC2::RouteTable
225 | Properties:
226 | VpcId: !Ref 'VPC'
227 | Tags:
228 | - Key: Name
229 | Value: !Join [ '-', [ !Ref VpcName, 'PRIVATE-ROUTE-TABLE-02'] ]
230 | - Key: Network
231 | Value: Private
232 | PrivateRouteTwo:
233 | Type: AWS::EC2::Route
234 | Properties:
235 | RouteTableId: !Ref PrivateRouteTableTwo
236 | DestinationCidrBlock: 0.0.0.0/0
237 | NatGatewayId: !Ref NatGatewayTwo
238 | PrivateRouteTableTwoAssociation:
239 | Type: AWS::EC2::SubnetRouteTableAssociation
240 | Properties:
241 | RouteTableId: !Ref PrivateRouteTableTwo
242 | SubnetId: !Ref PrivateSubnetTwo
243 |
244 | #
245 | # ECS Cluster
246 | #
247 | ECSCluster:
248 | Type: AWS::ECS::Cluster
249 | Properties:
250 | ClusterName: !Ref EnvironmentName
251 |
252 | #
253 | # A security group for the containers we will run in ECS.
254 | # Rules are added to this security group based on what ingress you
255 | # add for the cluster.
256 | #
257 | ContainerSecurityGroup:
258 | Type: AWS::EC2::SecurityGroup
259 | Properties:
260 | GroupDescription: Access to the ECS hosts that run containers
261 | VpcId: !Ref 'VPC'
262 | Tags:
263 | - Key: Name
264 | Value: ECS-ContainerInstance-Security-Group
265 |
266 | ContainerSecurityGroupMemberIngress:
267 | Type: AWS::EC2::SecurityGroupIngress
268 | DependsOn: ContainerSecurityGroup
269 | Properties:
270 | Description: Allow nodes in this security group to communicate with each other
271 | GroupId: !Ref ContainerSecurityGroup
272 | SourceSecurityGroupId: !Ref ContainerSecurityGroup
273 | IpProtocol: '-1'
274 | FromPort: 0
275 | ToPort: 65535
276 |
277 | ContainerSecurityGroupHttpIngress:
278 | Type: AWS::EC2::SecurityGroupIngress
279 | DependsOn: ContainerSecurityGroup
280 | Properties:
281 | Description: Allow nodes in this security group to communicate with each other
282 | GroupId: !Ref ContainerSecurityGroup
283 | IpProtocol: tcp
284 | CidrIp: 0.0.0.0/0
285 | FromPort: 80
286 | ToPort: 80
287 |
288 | #
289 | # Autoscaling group. This launches the actual EC2 instances that will register
290 | # themselves as members of the cluster, and run the docker containers.
291 | #
292 | ECSAutoScalingGroup:
293 | Type: AWS::AutoScaling::AutoScalingGroup
294 | Properties:
295 | VPCZoneIdentifier:
296 | - !Ref PrivateSubnetOne
297 | - !Ref PrivateSubnetTwo
298 | LaunchConfigurationName: !Ref 'ContainerInstances'
299 | MinSize: '1'
300 | MaxSize: !Ref 'MaxSize'
301 | DesiredCapacity: !Ref 'DesiredCapacity'
302 | Tags:
303 | - Key: Name
304 | Value: !Sub "${EnvironmentName}-Container-Instance"
305 | PropagateAtLaunch: 'true'
306 | CreationPolicy:
307 | ResourceSignal:
308 | Timeout: PT5M
309 | UpdatePolicy:
310 | AutoScalingReplacingUpdate:
311 | WillReplace: 'true'
312 |
313 | ContainerInstances:
314 | Type: AWS::AutoScaling::LaunchConfiguration
315 | Properties:
316 | ImageId: !Ref 'ECSAMI'
317 | SecurityGroups: [!Ref 'ContainerSecurityGroup']
318 | InstanceType: !Ref 'InstanceType'
319 | IamInstanceProfile: !Ref 'EC2InstanceProfile'
320 | UserData:
321 | Fn::Base64: !Sub |
322 | #!/bin/bash -xe
323 | echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
324 | echo ECS_IMAGE_PULL_BEHAVIOR=always >> /etc/ecs/ecs.config
325 | echo ECS_ENABLE_CONTAINER_METADATA=true >> /etc/ecs/ecs.config
326 | echo ECS_ENABLE_SPOT_INSTANCE_DRAINING=true >> /etc/ecs/ecs.config
327 | yum install -y aws-cfn-bootstrap
328 | /opt/aws/bin/cfn-signal -e $? --stack ${AWS::StackName} --resource ECSAutoScalingGroup --region ${AWS::Region}
329 |
330 | EC2InstanceProfile:
331 | Type: AWS::IAM::InstanceProfile
332 | Properties:
333 | Path: /
334 | Roles: [!Ref 'EC2InstanceRole']
335 |
336 | EC2InstanceRole:
337 | Type: AWS::IAM::Role
338 | Properties:
339 | AssumeRolePolicyDocument:
340 | Statement:
341 | - Effect: Allow
342 | Principal:
343 | Service: [ec2.amazonaws.com]
344 | Action: ['sts:AssumeRole']
345 | Path: /
346 | ManagedPolicyArns:
347 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role
348 |
349 | Outputs:
350 | ClusterName:
351 | Description: The name of the ECS cluster
352 | Value: !Ref 'ECSCluster'
353 | Export:
354 | Name: !Sub ${AWS::StackName}-ClusterName
355 | EC2InstanceRole:
356 | Description: The ARN of the EC2 Instance role
357 | Value: !GetAtt 'EC2InstanceRole.Arn'
358 | Export:
359 | Name: !Sub ${AWS::StackName}-ECSRole
360 | VpcId:
361 | Description: The ID of the VPC that this stack is deployed in
362 | Value: !Ref 'VPC'
363 | Export:
364 | Name: !Sub ${AWS::StackName}-VpcId
365 | PublicSubnetOne:
366 | Description: Public subnet one
367 | Value: !Ref 'PublicSubnetOne'
368 | Export:
369 | Name: !Sub ${AWS::StackName}-PublicSubnetOne
370 | PublicSubnetTwo:
371 | Description: Public subnet two
372 | Value: !Ref 'PublicSubnetTwo'
373 | Export:
374 | Name: !Sub ${AWS::StackName}-PublicSubnetTwo
375 | PrivateSubnetOne:
376 | Description: Private subnet one
377 | Value: !Ref 'PrivateSubnetOne'
378 | Export:
379 | Name: !Sub ${AWS::StackName}-PrivateSubnetOne
380 | PrivateSubnetTwo:
381 | Description: Private subnet two
382 | Value: !Ref 'PrivateSubnetTwo'
383 | Export:
384 | Name: !Sub ${AWS::StackName}-PrivateSubnetTwo
385 | ECSAutoScalingGroup:
386 | Description: Autoscaling group for EC2 instances
387 | Value: !Ref ECSAutoScalingGroup
388 | Export:
389 | Name: !Sub ${AWS::StackName}-ECSAutoScalingGroup
390 | ECSLaunchConfiguration:
391 | Description: Launch configuration for EC2 instances
392 | Value: !Ref ContainerInstances
393 | Export:
394 | Name: !Sub ${AWS::StackName}-ECSLaunchConfiguration
395 | ContainerSecurityGroup:
396 | Description: A security group used to allow containers to receive traffic
397 | Value: !Ref 'ContainerSecurityGroup'
398 | Export:
399 | Name: !Sub ${AWS::StackName}-ContainerSecurityGroup
400 |
--------------------------------------------------------------------------------
/deploy-prometheus/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Environment settings shared by all deployment scripts; source this first.
4 | # Set ACCOUNT_ID to your AWS account and STACK_NAME to the CloudFormation
5 | # stack created from ecs-cluster.yaml.
6 | #
7 | export AWS_REGION=us-east-1
8 | export ACCOUNT_ID=123456789012
9 | export STACK_NAME=ecs-stack
10 | 
11 | export CLUSTER_NAME=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`ClusterName`].OutputValue' --output text)
12 | export VPC_ID=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`VpcId`].OutputValue' --output text)
13 | export PUBLIC_SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" "Name=tag-key,Values=ecs.io/role/elb" --query "Subnets[].SubnetId" --output text)
14 | # NOTE(review): private subnet IDs are captured as JSON (unlike the public ones,
15 | # which use text) — presumably because they are spliced into the
16 | # awsvpcConfiguration shorthand in services.sh; confirm before changing.
17 | export PRIVATE_SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" "Name=tag-key,Values=ecs.io/role/internal-elb" --query "Subnets[].SubnetId" --output json)
18 | export SECURITY_GROUP_ID=$(aws cloudformation describe-stacks --stack-name $STACK_NAME --query 'Stacks[0].Outputs[?OutputKey==`ContainerSecurityGroup`].OutputValue' --output text)
19 | 
--------------------------------------------------------------------------------
/deploy-prometheus/iam.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #
4 | # Create a trust policy for ECS task and task execution roles.
5 | # NOTE: this must be a heredoc redirected INTO the file; the previous
6 | # `cat < TrustPolicy.json` read from a nonexistent file and left the JSON
7 | # lines to be executed as shell commands.
8 | #
9 | cat <<EOF > TrustPolicy.json
10 | {
11 |   "Version": "2012-10-17",
12 |   "Statement": [
13 |     {
14 |       "Sid": "",
15 |       "Effect": "Allow",
16 |       "Principal": {
17 |         "Service": "ecs-tasks.amazonaws.com"
18 |       },
19 |       "Action": "sts:AssumeRole"
20 |     }
21 |   ]
22 | }
23 | EOF
24 | 
25 | #
26 | # Create a permission policy for the IAM role used by the Prometheus task:
27 | # AMP remote write/read, SSM parameter reads, and Cloud Map discovery.
28 | #
29 | cat <<EOF > PermissionPolicyIngest.json
30 | {
31 |   "Version": "2012-10-17",
32 |   "Statement": [
33 |     {
34 |       "Effect": "Allow",
35 |       "Action": [
36 |         "aps:RemoteWrite",
37 |         "aps:GetSeries",
38 |         "aps:GetLabels",
39 |         "aps:GetMetricMetadata"
40 |       ],
41 |       "Resource": "*"
42 |     },
43 |     {
44 |       "Effect": "Allow",
45 |       "Action": [
46 |         "ssm:GetParameter"
47 |       ],
48 |       "Resource": "*"
49 |     },
50 |     {
51 |       "Effect": "Allow",
52 |       "Action": [
53 |         "servicediscovery:*"
54 |       ],
55 |       "Resource": "*"
56 |     }
57 |   ]
58 | }
59 | EOF
60 | 
61 | # AWS managed policies attached to the roles below.
62 | CLOUDWATCH_LOGS_POLICY_ARN=arn:aws:iam::aws:policy/CloudWatchLogsFullAccess
63 | ECS_TASK_EXECUTION_POLICY_ARN=arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy
64 | 
65 | # Task role shared by the web application and Node Exporter tasks.
66 | ECS_GENERIC_TASK_ROLE="ECS-Generic-Task-Role"
67 | ECS_GENERIC_TASK_ROLE_ARN=$(aws iam create-role \
68 |     --role-name $ECS_GENERIC_TASK_ROLE \
69 |     --assume-role-policy-document file://TrustPolicy.json \
70 |     --query "Role.Arn" --output text)
71 | aws iam attach-role-policy --role-name $ECS_GENERIC_TASK_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
72 | 
73 | # Execution role used by the ECS agent to pull images and write logs.
74 | ECS_TASK_EXECUTION_ROLE="ECS-Task-Execution-Role"
75 | ECS_TASK_EXECUTION_ROLE_ARN=$(aws iam create-role \
76 |     --role-name $ECS_TASK_EXECUTION_ROLE \
77 |     --assume-role-policy-document file://TrustPolicy.json \
78 |     --query "Role.Arn" --output text)
79 | aws iam attach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $CLOUDWATCH_LOGS_POLICY_ARN
80 | aws iam attach-role-policy --role-name $ECS_TASK_EXECUTION_ROLE --policy-arn $ECS_TASK_EXECUTION_POLICY_ARN
81 | 
82 | # Task role for the Prometheus server task.
83 | ECS_PROMETHEUS_TASK_ROLE="ECS-Prometheus-Task-Role"
84 | ECS_PROMETHEUS_TASK_ROLE_ARN=$(aws iam create-role \
85 |     --role-name $ECS_PROMETHEUS_TASK_ROLE \
86 |     --assume-role-policy-document file://TrustPolicy.json \
87 |     --query "Role.Arn" --output text)
88 | 
89 | ECS_PROMETHEUS_TASK_POLICY="ECSPrometheusTaskPolicy"
90 | ECS_PROMETHEUS_TASK_POLICY_ARN=$(aws iam create-policy --policy-name $ECS_PROMETHEUS_TASK_POLICY \
91 |     --policy-document file://PermissionPolicyIngest.json \
92 |     --query 'Policy.Arn' --output text)
93 | 
94 | aws iam attach-role-policy --role-name $ECS_PROMETHEUS_TASK_ROLE --policy-arn $ECS_PROMETHEUS_TASK_POLICY_ARN
95 | 
96 | # Exported for the cleanup scripts.
97 | export ECS_GENERIC_TASK_ROLE
98 | export ECS_TASK_EXECUTION_ROLE
99 | export ECS_PROMETHEUS_TASK_ROLE
100 | 
101 | export CLOUDWATCH_LOGS_POLICY_ARN
102 | export ECS_TASK_EXECUTION_POLICY_ARN
103 | export ECS_PROMETHEUS_TASK_POLICY_ARN
104 | 
--------------------------------------------------------------------------------
/deploy-prometheus/nodeExporterTaskDefinition.json.template:
--------------------------------------------------------------------------------
1 | {
2 | "family":"NodeExporterTask",
3 | "taskRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Generic-Task-Role",
4 | "executionRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Task-Execution-Role",
5 | "networkMode":"host",
6 | "containerDefinitions":[
7 | {
8 | "name":"prometheus-node-exporter",
9 | "image":"quay.io/prometheus/node-exporter:v1.0.1",
10 | "user":"root",
11 | "portMappings":[
12 | {
13 | "hostPort":9100,
14 | "containerPort":9100,
15 | "protocol":"tcp"
16 | }
17 | ],
18 | "command":[
19 | "--path.procfs=/host/proc",
20 | "--path.sysfs=/host/sys",
21 | "--path.rootfs=/host/root",
22 | "--web.listen-address=:9100"
23 | ],
24 | "logConfiguration":{
25 | "logDriver":"awslogs",
26 | "options":{
27 | "awslogs-group":"/ecs/Prometheus",
28 | "awslogs-create-group":"true",
29 | "awslogs-region":"REGION",
30 | "awslogs-stream-prefix":"node-exporter"
31 | }
32 | },
33 | "mountPoints":[
34 | {
35 | "sourceVolume":"rootVolume",
36 | "containerPath":"/host/root",
37 | "readOnly":true
38 | },
39 | {
40 | "sourceVolume":"procVolume",
41 | "containerPath":"/host/proc",
42 | "readOnly":true
43 | },
44 | {
45 | "sourceVolume":"sysVolume",
46 | "containerPath":"/host/sys",
47 | "readOnly":true
48 | }
49 | ],
50 | "essential":true
51 | }
52 | ],
53 | "volumes":[
54 | {
55 | "name":"rootVolume",
56 | "host":{
57 | "sourcePath":"/"
58 | }
59 | },
60 | {
61 | "name":"procVolume",
62 | "host":{
63 | "sourcePath":"/proc"
64 | }
65 | },
66 | {
67 | "name":"sysVolume",
68 | "host":{
69 | "sourcePath":"/sys"
70 | }
71 | }
72 | ],
73 | "requiresCompatibilities":[
74 | "EC2"
75 | ],
76 | "cpu":"256",
77 | "memory":"256"
78 | }
79 |
--------------------------------------------------------------------------------
/deploy-prometheus/prometheus.yaml.template:
--------------------------------------------------------------------------------
1 | global:
2 | evaluation_interval: 1m
3 | scrape_interval: 30s
4 | scrape_timeout: 10s
5 | remote_write:
6 | - url: http://localhost:8080/workspaces/WORKSPACE/api/v1/remote_write
7 | scrape_configs:
8 | - job_name: ecs_services
9 | file_sd_configs:
10 | - files:
11 | - /etc/config/ecs-services.json
12 | refresh_interval: 30s
--------------------------------------------------------------------------------
/deploy-prometheus/prometheusTaskDefinition.json.template:
--------------------------------------------------------------------------------
1 | {
2 | "family":"PrometheusTask",
3 | "taskRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Prometheus-Task-Role",
4 | "executionRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Task-Execution-Role",
5 | "networkMode":"awsvpc",
6 | "containerDefinitions":[
7 | {
8 | "name":"config-reloader",
9 | "image":"public.ecr.aws/awsvijisarathy/prometheus-sdconfig-reloader:1.0",
10 | "user":"root",
11 | "cpu": 128,
12 | "memory": 128,
13 | "environment":[
14 | {
15 | "name":"CONFIG_FILE_DIR",
16 | "value":"/etc/config"
17 | },
18 | {
19 | "name":"CONFIG_RELOAD_FREQUENCY",
20 | "value":"60"
21 | },
22 | {
23 | "name":"PROMETHEUS_CONFIG_PARAMETER_NAME",
24 | "value":"ECS-Prometheus-Configuration"
25 | },
26 | {
27 | "name":"DISCOVERY_NAMESPACES_PARAMETER_NAME",
28 | "value":"ECS-ServiceDiscovery-Namespaces"
29 | }
30 | ],
31 | "mountPoints":[
32 | {
33 | "sourceVolume":"configVolume",
34 | "containerPath":"/etc/config",
35 | "readOnly":false
36 | }
37 | ],
38 | "logConfiguration":{
39 | "logDriver":"awslogs",
40 | "options":{
41 | "awslogs-group":"/ecs/Prometheus",
42 | "awslogs-create-group":"true",
43 | "awslogs-region":"REGION",
44 | "awslogs-stream-prefix":"reloader"
45 | }
46 | },
47 | "essential":true
48 | },
49 | {
50 | "name":"aws-iamproxy",
51 | "image":"public.ecr.aws/aws-observability/aws-sigv4-proxy:1.0",
52 | "cpu": 256,
53 | "memory": 256,
54 | "portMappings":[
55 | {
56 | "containerPort":8080,
57 | "protocol":"tcp"
58 | }
59 | ],
60 | "command":[
61 | "--name",
62 | "aps",
63 | "--region",
64 | "REGION",
65 | "--host",
66 | "aps-workspaces.REGION.amazonaws.com"
67 | ],
68 | "logConfiguration":{
69 | "logDriver":"awslogs",
70 | "options":{
71 | "awslogs-group":"/ecs/Prometheus",
72 | "awslogs-create-group":"true",
73 | "awslogs-region":"REGION",
74 | "awslogs-stream-prefix":"iamproxy"
75 | }
76 | },
77 | "essential":true
78 | },
79 | {
80 | "name":"prometheus-server",
81 | "image":"quay.io/prometheus/prometheus:v2.24.0",
82 | "user":"root",
83 | "cpu": 512,
84 | "memory": 512,
85 | "portMappings":[
86 | {
87 | "containerPort":9090,
88 | "protocol":"tcp"
89 | }
90 | ],
91 | "command":[
92 | "--storage.tsdb.retention.time=15d",
93 | "--config.file=/etc/config/prometheus.yaml",
94 | "--storage.tsdb.path=/data",
95 | "--web.console.libraries=/etc/prometheus/console_libraries",
96 | "--web.console.templates=/etc/prometheus/consoles",
97 | "--web.enable-lifecycle"
98 | ],
99 | "logConfiguration":{
100 | "logDriver":"awslogs",
101 | "options":{
102 | "awslogs-group":"/ecs/Prometheus",
103 | "awslogs-create-group":"true",
104 | "awslogs-region":"REGION",
105 | "awslogs-stream-prefix":"server"
106 | }
107 | },
108 | "mountPoints":[
109 | {
110 | "sourceVolume":"configVolume",
111 | "containerPath":"/etc/config",
112 | "readOnly":false
113 | },
114 | {
115 | "sourceVolume":"logsVolume",
116 | "containerPath":"/data"
117 | }
118 | ],
119 | "healthCheck":{
120 | "command":[
121 | "CMD-SHELL",
122 | "wget http://localhost:9090/-/healthy -O /dev/null|| exit 1"
123 | ],
124 | "interval":10,
125 | "timeout":2,
126 | "retries":2,
127 | "startPeriod":10
128 | },
129 | "dependsOn": [
130 | {
131 | "containerName": "config-reloader",
132 | "condition": "START"
133 | },
134 | {
135 | "containerName": "aws-iamproxy",
136 | "condition": "START"
137 | }
138 | ],
139 | "essential":true
140 | }
141 | ],
142 | "volumes":[
143 | {
144 | "name":"configVolume",
145 | "host":{}
146 | },
147 | {
148 | "name":"logsVolume",
149 | "host":{}
150 | }
151 | ],
152 | "requiresCompatibilities":[
153 | "EC2"
154 | ],
155 | "cpu":"1000",
156 | "memory":"1024"
157 | }
158 |
--------------------------------------------------------------------------------
/deploy-prometheus/services.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #
4 | # NodeExporter Service: one task per container instance (DAEMON), registered
5 | # in Cloud Map with SRV records since it uses host networking on port 9100.
6 | #
7 | SERVICE_NAME=NodeExporterService
8 | TASK_DEFINITION=$NODEEXPORTER_TASK_DEFINITION
9 | CLOUDMAP_SERVICE_ARN=$CLOUDMAP_NODE_EXPORTER_SERVICE_ARN
10 | aws ecs create-service --service-name $SERVICE_NAME \
11 |     --cluster $CLUSTER_NAME \
12 |     --task-definition $TASK_DEFINITION \
13 |     --service-registries "containerName=prometheus-node-exporter,containerPort=9100,registryArn=$CLOUDMAP_SERVICE_ARN" \
14 |     --scheduling-strategy DAEMON \
15 |     --launch-type EC2
16 | 
17 | #
18 | # WebApp Service: two replicas in the private subnets, registered in Cloud Map
19 | # with A records (awsvpc networking gives each task its own ENI/IP).
20 | #
21 | SERVICE_NAME=WebAppService
22 | TASK_DEFINITION=$WEBAPP_TASK_DEFINITION
23 | CLOUDMAP_SERVICE_ARN=$CLOUDMAP_WEBAPP_SERVICE_ARN
24 | aws ecs create-service --service-name $SERVICE_NAME \
25 |     --cluster $CLUSTER_NAME \
26 |     --task-definition $TASK_DEFINITION \
27 |     --desired-count 2 \
28 |     --service-registries "registryArn=$CLOUDMAP_SERVICE_ARN" \
29 |     --network-configuration "awsvpcConfiguration={subnets=$PRIVATE_SUBNET_IDS,securityGroups=[$SECURITY_GROUP_ID],assignPublicIp=DISABLED}" \
30 |     --scheduling-strategy REPLICA \
31 |     --launch-type EC2
32 | 
33 | #
34 | # Prometheus Service: a single replica; it discovers scrape targets through
35 | # Cloud Map, so it needs no service registry of its own.
36 | #
37 | SERVICE_NAME=PrometheusService
38 | TASK_DEFINITION=$PROMETHEUS_TASK_DEFINITION
39 | aws ecs create-service --service-name $SERVICE_NAME \
40 |     --cluster $CLUSTER_NAME \
41 |     --task-definition $TASK_DEFINITION \
42 |     --desired-count 1 \
43 |     --network-configuration "awsvpcConfiguration={subnets=$PRIVATE_SUBNET_IDS,securityGroups=[$SECURITY_GROUP_ID],assignPublicIp=DISABLED}" \
44 |     --scheduling-strategy REPLICA \
45 |     --launch-type EC2
--------------------------------------------------------------------------------
/deploy-prometheus/task-definitions.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#
# Task Definitions
#
# Each task definition is produced by substituting the AWS account ID and
# region into its JSON template and registering the result with ECS.
# The resulting task definition ARNs are exported for use by services.sh.
#

# register_task_definition TEMPLATE
# Renders TEMPLATE (replacing the ACCOUNT and REGION placeholders) into a
# JSON file alongside it, registers that file with ECS, and prints the new
# task definition ARN on stdout.
register_task_definition() {
    local template=$1
    local json=${template%.template}
    sed -e "s/ACCOUNT/$ACCOUNT_ID/g" \
        -e "s/REGION/$AWS_REGION/g" \
        < "$template" \
        > "$json"
    aws ecs register-task-definition \
        --cli-input-json "file://$json" \
        --region "$AWS_REGION" \
        --query "taskDefinition.taskDefinitionArn" --output text
}

WEBAPP_TASK_DEFINITION=$(register_task_definition webappTaskDefinition.json.template)
PROMETHEUS_TASK_DEFINITION=$(register_task_definition prometheusTaskDefinition.json.template)
NODEEXPORTER_TASK_DEFINITION=$(register_task_definition nodeExporterTaskDefinition.json.template)

export WEBAPP_TASK_DEFINITION
export PROMETHEUS_TASK_DEFINITION
export NODEEXPORTER_TASK_DEFINITION
--------------------------------------------------------------------------------
/deploy-prometheus/webappTaskDefinition.json.template:
--------------------------------------------------------------------------------
1 | {
2 | "family":"WebAppTask",
3 | "taskRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Generic-Task-Role",
4 | "executionRoleArn":"arn:aws:iam::ACCOUNT:role/ECS-Task-Execution-Role",
5 | "networkMode":"awsvpc",
6 | "containerDefinitions":[
7 | {
8 | "name":"webapp",
9 | "image":"public.ecr.aws/awsvijisarathy/prometheus-webapp:latest",
10 | "portMappings" : [
11 | {
12 | "containerPort": 3000,
13 | "protocol": "tcp"
14 | }
15 | ],
16 | "logConfiguration":{
17 | "logDriver":"awslogs",
18 | "options":{
19 | "awslogs-group":"/ecs/Webapp",
20 | "awslogs-create-group":"true",
21 | "awslogs-region":"REGION"
22 | }
23 | },
24 | "essential":true
25 | }
26 | ],
27 | "requiresCompatibilities":[
28 | "EC2"
29 | ],
30 | "cpu":"128",
31 | "memory":"128"
32 | }
33 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/aws-samples/prometheus-for-ecs
2 |
3 | go 1.15
4 |
5 | require github.com/aws/aws-sdk-go v1.37.19
6 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/aws/aws-sdk-go v1.37.19 h1:/xKHoSsYfH9qe16pJAHIjqTVpMM2DRSsEt8Ok1bzYiw=
2 | github.com/aws/aws-sdk-go v1.37.19/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro=
3 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
4 | github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
5 | github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
6 | github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
7 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
9 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
10 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
11 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
12 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
13 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
14 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
15 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
16 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
17 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
18 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
19 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
20 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
21 | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
22 |
--------------------------------------------------------------------------------
/images/Deployment-Architecture-ADOT.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/prometheus-for-ecs/a9aaa21232f32d8e21281e8aebdd873dff9852c7/images/Deployment-Architecture-ADOT.png
--------------------------------------------------------------------------------
/images/Deployment-Architecture-Prometheus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/prometheus-for-ecs/a9aaa21232f32d8e21281e8aebdd873dff9852c7/images/Deployment-Architecture-Prometheus.png
--------------------------------------------------------------------------------
/pkg/aws/cloudmap.go:
--------------------------------------------------------------------------------
1 | package aws
2 |
3 | import (
4 | "encoding/json"
5 | "errors"
6 | "fmt"
7 | "log"
8 |
9 | "github.com/aws/aws-sdk-go/aws"
10 | "github.com/aws/aws-sdk-go/service/servicediscovery"
11 | )
12 |
// Well-known Cloud Map instance attribute keys and service tag keys used to
// build the Prometheus scrape configuration.
const (
	// Default attributes assigned to ECS task instances registered in Cloud Map.
	IpAddressAttribute      = "AWS_INSTANCE_IPV4"
	PortNumberAttribute     = "AWS_INSTANCE_PORT"
	ClusterNameAttribute    = "ECS_CLUSTER_NAME"
	ServiceNameAttribute    = "ECS_SERVICE_NAME"
	TaskDefinitionAttribute = "ECS_TASK_DEFINITION_FAMILY"
	// Service tags naming the port/path of the application metrics endpoint.
	MetricsPortTag = "METRICS_PORT"
	MetricsPathTag = "METRICS_PATH"
	// Service tags naming the port/path of the infrastructure (ECS/Docker
	// stats) metrics endpoint exposed by a side-car container.
	EcsMetricsPortTag = "ECS_METRICS_PORT"
	EcsMetricsPathTag = "ECS_METRICS_PATH"
)
24 |
// CloudMapClient is a thin wrapper around the AWS Cloud Map (Service
// Discovery) API client.
type CloudMapClient struct {
	service *servicediscovery.ServiceDiscovery
}

// ServiceDiscoveryInstance captures one instance registered under a Cloud
// Map service: the owning service name, the instance ID (surfaced as the
// 'taskid' label), and the instance attributes.
type ServiceDiscoveryInstance struct {
	service    *string
	instanceId *string
	attributes map[string]*string
}

// InstanceScrapeConfig is one element of the JSON document handed to
// Prometheus: the scrape target addresses and the labels attached to the
// metrics collected from them.
type InstanceScrapeConfig struct {
	Targets []string          `json:"targets,omitempty"`
	Labels  map[string]string `json:"labels,omitempty"`
}
39 |
40 | //
41 | // Retrieve a JSON object that provides a list of ECS targets to be scraped for Prometheus metrics
42 | //
43 | func GetPrometheusScrapeConfig(selectedNamespaces []string) *string {
44 | client := &CloudMapClient{service: servicediscovery.New(sharedSession)}
45 |
46 | sdNamespaces, _ := client.getNamespaces()
47 | sdServicesMap, _ := client.getServices(selectedNamespaces, sdNamespaces)
48 | scrapeConfigurations := make([]*InstanceScrapeConfig, 0)
49 | for sdNamespace, sdServices := range sdServicesMap {
50 | for _, service := range sdServices {
51 | serviceTags := client.getServiceTags(service)
52 | sdInstances, _ := client.getInstances(service)
53 | for _, instance := range sdInstances {
54 | appScrapeConfig, _ := client.getInstanceScrapeConfigurationApplication(instance, serviceTags, &sdNamespace)
55 | infraScrapeConfig, _ := client.getInstanceScrapeConfigurationInfrastructure(instance, serviceTags, &sdNamespace)
56 | if appScrapeConfig != nil {
57 | scrapeConfigurations = append(scrapeConfigurations, appScrapeConfig)
58 | }
59 | if infraScrapeConfig != nil {
60 | scrapeConfigurations = append(scrapeConfigurations, infraScrapeConfig)
61 | }
62 | }
63 | }
64 | }
65 |
66 | jsonBytes, err := json.MarshalIndent(scrapeConfigurations, "", " ")
67 | if err != nil {
68 | log.Println(err)
69 | return aws.String("")
70 | }
71 | jsonString := string(jsonBytes)
72 | return &jsonString
73 | }
74 |
75 | //
76 | // Get a list of all ServiceDiscovery namespaces and their respective IDs available under Cloud Map
77 | //
78 | func (c *CloudMapClient) getNamespaces() (map[string]string, error) {
79 | filterType := aws.String("TYPE")
80 | filterCondition := aws.String("EQ")
81 | filterValues := []*string{aws.String("DNS_PRIVATE")}
82 | namespaceFilter := servicediscovery.NamespaceFilter{
83 | Name: filterType,
84 | Values: filterValues,
85 | Condition: filterCondition}
86 | listNamespacesOutput, err := c.service.ListNamespaces(&servicediscovery.ListNamespacesInput{Filters: []*servicediscovery.NamespaceFilter{&namespaceFilter}})
87 | if err != nil {
88 | log.Println(err)
89 | return nil, err
90 | }
91 | sdNamespaces := make(map[string]string)
92 | for _, namespaceSummary := range listNamespacesOutput.Namespaces {
93 | sdNamespaces[*namespaceSummary.Name] = *namespaceSummary.Id
94 | }
95 | return sdNamespaces, nil
96 | }
97 |
98 | //
99 | // Cycle through each ServiceDiscovery namespace and find the list of ServiceDiscovery services
100 | //
101 | func (c *CloudMapClient) getServices(selectedNamespaces []string, sdNamespaces map[string]string) (map[string][]*servicediscovery.ServiceSummary, error) {
102 | sdServicesMap := make(map[string][]*servicediscovery.ServiceSummary)
103 | sdServicesCount := 0
104 | for _, name := range selectedNamespaces {
105 | sdServices := make([]*servicediscovery.ServiceSummary, 0)
106 | if id, present := sdNamespaces[name]; present {
107 | fmt.Printf("Discovering scraping targets in the namespace '%s'\n", name)
108 | filterType := aws.String("NAMESPACE_ID")
109 | filterCondition := aws.String("EQ")
110 | filterValues := []*string{&id}
111 | serviceFilter := servicediscovery.ServiceFilter{
112 | Name: filterType,
113 | Values: filterValues,
114 | Condition: filterCondition}
115 | listServiceOutput, err := c.service.ListServices(&servicediscovery.ListServicesInput{Filters: []*servicediscovery.ServiceFilter{&serviceFilter}})
116 | if err != nil {
117 | log.Println(err)
118 | return nil, err
119 | }
120 | for _, serviceSummary := range listServiceOutput.Services {
121 | sdServices = append(sdServices, serviceSummary)
122 | sdServicesCount++
123 | }
124 | sdServicesMap[name] = sdServices
125 | }
126 | }
127 | fmt.Printf("No.of services discovered for scraping = %d\n", sdServicesCount)
128 | return sdServicesMap, nil
129 | }
130 |
131 | //
132 | // Retrieve the list of tags associated with each ServiceDiscovery service.
133 | // Tags are used to specify the URL path and port for endpoint where metrics are scraped from
134 | // We are resorting to using tags because ServiceDiscovery API does not yet support adding custom attributes
135 | //
136 | func (c *CloudMapClient) getServiceTags(summary *servicediscovery.ServiceSummary) map[string]*string {
137 | tags := make(map[string]*string)
138 | getListTagsForResourceOutput, _ := c.service.ListTagsForResource(&servicediscovery.ListTagsForResourceInput{ResourceARN: summary.Arn})
139 | for _, serviceTag := range getListTagsForResourceOutput.Tags {
140 | tags[*serviceTag.Key] = serviceTag.Value
141 | }
142 | return tags
143 | }
144 |
145 | //
146 | // Retrieve the list of ServiceDiscovery instances associated with each ServiceDiscovery service
147 | // For each ServiceDiscovery instance, retrieve the default ECS attributes
148 | //
149 | func (c *CloudMapClient) getInstances(serviceSummary *servicediscovery.ServiceSummary) ([]*ServiceDiscoveryInstance, error) {
150 | sdInstaces := make([]*ServiceDiscoveryInstance, 0)
151 | getListInstancesOutput, err := c.service.ListInstances(&servicediscovery.ListInstancesInput{ServiceId: serviceSummary.Id})
152 | if err != nil {
153 | log.Println(err)
154 | return nil, err
155 | }
156 | for _, instanceSummary := range getListInstancesOutput.Instances {
157 | sdInstance := ServiceDiscoveryInstance{service: serviceSummary.Name, instanceId: instanceSummary.Id, attributes: instanceSummary.Attributes}
158 | sdInstaces = append(sdInstaces, &sdInstance)
159 | }
160 | fmt.Printf("No.of instances discovered for scraping in service '%s' = %d\n", *serviceSummary.Name, len(sdInstaces))
161 | return sdInstaces, nil
162 | }
163 |
164 | //
165 | // Construct Prometheus scrape configuration for each ServiceDiscovery instance based on its attributes and the associated ServiceDiscovery service tags
166 | //
167 | func (c *CloudMapClient) getInstanceScrapeConfigurationApplication(sdInstance *ServiceDiscoveryInstance, serviceTags map[string]*string, sdNamespace *string) (*InstanceScrapeConfig, error) {
168 | // Path for application metrics endpoint is expected as a resource tag with the key 'METRICS_PATH'
169 | metricsPath, present := serviceTags[MetricsPathTag]
170 | if !present {
171 | return nil, nil
172 | }
173 |
174 | // This is relevant for ECS tasks using bridge networking mode that are using host->container port mapping.
175 | // Port number of the resource is available, by default, as an attribute with the key 'AWS_INSTANCE_PORT'
176 | defaultPort, present := sdInstance.attributes[PortNumberAttribute]
177 | if !present {
178 | defaultPort = aws.String("80")
179 | }
180 |
181 | // Application metrics port is expected as a resource tag with the key 'METRICS_PORT'
182 | metricsPort, present := serviceTags[MetricsPortTag]
183 | if !present {
184 | metricsPort = defaultPort
185 | }
186 |
187 | return c.getInstanceScrapeConfiguration(sdInstance, metricsPort, metricsPath, sdNamespace)
188 | }
189 |
190 | //
191 | // This is relevant when the application is deployed along with a side-car container which exposes Docker stats as Prometheus metrics
192 | // The Docker stats are available at the Task metadata endpoint ${ECS_CONTAINER_METADATA_URI_V4}/stats
193 | // https://github.com/prometheus-community/ecs_exporter provides an implementation of a side-car that exposes Docker stats as Prometheus metrics
194 | //
195 | func (c *CloudMapClient) getInstanceScrapeConfigurationInfrastructure(sdInstance *ServiceDiscoveryInstance, serviceTags map[string]*string, sdNamespace *string) (*InstanceScrapeConfig, error) {
196 | // Path for infrastructure metrics endpoint is expected as a resource tag with the key 'ECS_METRICS_PATH'
197 | metricsPath, present := serviceTags[EcsMetricsPathTag]
198 | if !present {
199 | return nil, nil
200 | }
201 |
202 | // Infrastructure Metrics port is expected as a resource tag with the key 'ECS_METRICS_PORT'
203 | metricsPort, present := serviceTags[EcsMetricsPortTag]
204 | if !present {
205 | return nil, nil
206 | }
207 |
208 | return c.getInstanceScrapeConfiguration(sdInstance, metricsPort, metricsPath, sdNamespace)
209 | }
210 |
211 | func (c *CloudMapClient) getInstanceScrapeConfiguration(sdInstance *ServiceDiscoveryInstance, metricsPort *string, metricsPath *string, sdNamespace *string) (*InstanceScrapeConfig, error) {
212 | labels := make(map[string]string)
213 | targets := make([]string, 0)
214 |
215 | // IP address of the resource is available, by default, as an attribute with the key 'AWS_INSTANCE_IPV4'
216 | address, present := sdInstance.attributes[IpAddressAttribute]
217 | if !present {
218 | return nil, errors.New(fmt.Sprintf("Cannot find IP address for instance in service %v", sdInstance.service))
219 | }
220 | targets = append(targets, fmt.Sprintf("%s:%s", *address, *metricsPort))
221 |
222 | //
223 | // ECS Task instances registered in Cloud Map are assigned the following default attributes
224 | // ECS_CLUSTER_NAME, ECS_SERVICE_NAME, ECS_TASK_DEFINITION_FAMILY
225 | // Add these attributes as labels to be attached to the Prometheus metric
226 | //
227 | cluster, present := sdInstance.attributes[ClusterNameAttribute]
228 | if present {
229 | labels["cluster"] = *cluster
230 | }
231 | service, present := sdInstance.attributes[ServiceNameAttribute]
232 | if present {
233 | labels["service"] = *service
234 | }
235 | taskdefinition, present := sdInstance.attributes[TaskDefinitionAttribute]
236 | if present {
237 | labels["taskdefinition"] = *taskdefinition
238 | }
239 | labels["namespace"] = *sdNamespace
240 | labels["taskid"] = *sdInstance.instanceId
241 | labels["instance"] = *address
242 | labels["__metrics_path__"] = *metricsPath
243 |
244 | return &InstanceScrapeConfig{Targets: targets, Labels: labels}, nil
245 | }
246 |
--------------------------------------------------------------------------------
/pkg/aws/session.go:
--------------------------------------------------------------------------------
1 | package aws
2 |
3 | import (
4 | "log"
5 | "os"
6 |
7 | "github.com/aws/aws-sdk-go/aws"
8 | "github.com/aws/aws-sdk-go/aws/session"
9 | )
10 |
// sharedSession is the AWS client session shared by all service clients in
// this package; it is populated by InitializeAWSSession.
var sharedSession *session.Session = nil
12 |
13 | func InitializeAWSSession() {
14 | region := os.Getenv("AWS_REGION")
15 | if region == "" {
16 | region = "us-east-1"
17 | }
18 | sharedSession, _ = session.NewSession(&aws.Config{Region: aws.String(region)})
19 | if sharedSession == nil {
20 | log.Fatalf("Unable to create a new AWS client session")
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/pkg/aws/ssm.go:
--------------------------------------------------------------------------------
1 | package aws
2 |
3 | import (
4 | "log"
5 |
6 | "github.com/aws/aws-sdk-go/aws"
7 | "github.com/aws/aws-sdk-go/service/ssm"
8 | )
9 |
10 | func getSSMClient() *ssm.SSM {
11 | service := ssm.New(sharedSession)
12 | return service
13 | }
14 |
15 | //
16 | // Retrive the value for a given SSM Parameter name
17 | //
18 | func GetParameter(parameterName string) *string {
19 | ssmService := ssm.New(sharedSession)
20 | getParameterOutput, err := ssmService.GetParameter(&ssm.GetParameterInput{Name: ¶meterName})
21 | if err != nil {
22 | log.Println(err)
23 | return aws.String("")
24 | }
25 | return getParameterOutput.Parameter.Value
26 | }
27 |
--------------------------------------------------------------------------------